Put document metadata in ES/Infinity (#12826)

### What problem does this PR solve?

Put document metadata in ES/Infinity.

Index name for document metadata: `ragflow_doc_meta_{tenant_id}`
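
As a rough illustration (not code from this PR), the per-tenant metadata index name described above could be derived like this; the helper name and `tenant_id` are placeholders:

```python
# Illustrative sketch only: builds the per-tenant metadata index name
# following the ragflow_doc_meta_{tenant_id} pattern stated above.
def doc_meta_index_name(tenant_id: str) -> str:
    return f"ragflow_doc_meta_{tenant_id}"
```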

### Type of change

- [x] Refactoring
Author: qinling0210 (committed by GitHub)
Date: 2026-01-28 13:29:34 +08:00
Parent: fd11aca8e5
Commit: 9a5208976c
24 changed files with 1529 additions and 304 deletions


@@ -24,6 +24,7 @@ from abc import abstractmethod
from elasticsearch import NotFoundError
from elasticsearch_dsl import Index
from elastic_transport import ConnectionTimeout
from elasticsearch.client import IndicesClient
from common.file_utils import get_project_base_directory
from common.misc_utils import convert_bytes
from common.doc_store.doc_store_base import DocStoreConnection, OrderByExpr, MatchExpr
@@ -128,13 +129,34 @@ class ESConnectionBase(DocStoreConnection):
        if self.index_exist(index_name, dataset_id):
            return True
        try:
            from elasticsearch.client import IndicesClient
            return IndicesClient(self.es).create(index=index_name,
                                                 settings=self.mapping["settings"],
                                                 mappings=self.mapping["mappings"])
        except Exception:
            self.logger.exception("ESConnection.createIndex error %s" % index_name)

    def create_doc_meta_idx(self, index_name: str):
        """
        Create a document metadata index.
        Index name pattern: ragflow_doc_meta_{tenant_id}
        - Per-tenant metadata index for storing document metadata fields
        """
        if self.index_exist(index_name, ""):
            return True
        try:
            fp_mapping = os.path.join(get_project_base_directory(), "conf", "doc_meta_es_mapping.json")
            if not os.path.exists(fp_mapping):
                self.logger.error(f"Document metadata mapping file not found at {fp_mapping}")
                return False
            doc_meta_mapping = json.load(open(fp_mapping, "r"))
            return IndicesClient(self.es).create(index=index_name,
                                                 settings=doc_meta_mapping["settings"],
                                                 mappings=doc_meta_mapping["mappings"])
        except Exception as e:
            self.logger.exception(f"Error creating document metadata index {index_name}: {e}")

    def delete_idx(self, index_name: str, dataset_id: str):
        if len(dataset_id) > 0:
            # The index needs to stay alive after any kb deletion, since all kbs under this tenant share one index.
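
A hedged usage sketch for the ES side of this change: the metadata index is tenant-scoped, so it is probed with an empty `dataset_id` and created once per tenant. The `es_conn` instance and tenant id below are placeholders, not part of the diff:

```python
# Illustrative only; assumes an ESConnectionBase-style instance `es_conn`.
tenant_id = "tenant_0001"                      # placeholder tenant id
meta_index = f"ragflow_doc_meta_{tenant_id}"   # naming pattern from the PR description

# Mirrors the index_exist(index_name, "") guard inside create_doc_meta_idx():
# the metadata index has no per-kb suffix, so dataset_id is passed as "".
if not es_conn.index_exist(meta_index, ""):
    es_conn.create_doc_meta_idx(meta_index)
```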


@@ -285,8 +285,65 @@ class InfinityConnectionBase(DocStoreConnection):
        self.logger.info(f"INFINITY created table {table_name}, vector size {vector_size}")
        return True

    def create_doc_meta_idx(self, index_name: str):
        """
        Create a document metadata table.
        Table name pattern: ragflow_doc_meta_{tenant_id}
        - Per-tenant metadata table for storing document metadata fields
        """
        table_name = index_name
        inf_conn = self.connPool.get_conn()
        inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
        try:
            fp_mapping = os.path.join(get_project_base_directory(), "conf", "doc_meta_infinity_mapping.json")
            if not os.path.exists(fp_mapping):
                self.logger.error(f"Document metadata mapping file not found at {fp_mapping}")
                return False
            schema = json.load(open(fp_mapping))
            inf_db.create_table(
                table_name,
                schema,
                ConflictType.Ignore,
            )
            # Create secondary indexes on id and kb_id for better query performance
            inf_table = inf_db.get_table(table_name)
            try:
                inf_table.create_index(
                    f"idx_{table_name}_id",
                    IndexInfo("id", IndexType.Secondary),
                    ConflictType.Ignore,
                )
                self.logger.debug(f"INFINITY created secondary index on id for table {table_name}")
            except Exception as e:
                self.logger.warning(f"Failed to create index on id for {table_name}: {e}")
            try:
                inf_table.create_index(
                    f"idx_{table_name}_kb_id",
                    IndexInfo("kb_id", IndexType.Secondary),
                    ConflictType.Ignore,
                )
                self.logger.debug(f"INFINITY created secondary index on kb_id for table {table_name}")
            except Exception as e:
                self.logger.warning(f"Failed to create index on kb_id for {table_name}: {e}")
            self.connPool.release_conn(inf_conn)
            self.logger.debug(f"INFINITY created document metadata table {table_name} with secondary indexes")
            return True
        except Exception as e:
            self.connPool.release_conn(inf_conn)
            self.logger.exception(f"Error creating document metadata table {table_name}: {e}")
            return False

    def delete_idx(self, index_name: str, dataset_id: str):
        table_name = f"{index_name}_{dataset_id}"
        if index_name.startswith("ragflow_doc_meta_"):
            table_name = index_name
        else:
            table_name = f"{index_name}_{dataset_id}"
        inf_conn = self.connPool.get_conn()
        db_instance = inf_conn.get_database(self.dbName)
        db_instance.drop_table(table_name, ConflictType.Ignore)
@@ -294,7 +351,10 @@ class InfinityConnectionBase(DocStoreConnection):
        self.logger.info(f"INFINITY dropped table {table_name}")

    def index_exist(self, index_name: str, dataset_id: str) -> bool:
        table_name = f"{index_name}_{dataset_id}"
        if index_name.startswith("ragflow_doc_meta_"):
            table_name = index_name
        else:
            table_name = f"{index_name}_{dataset_id}"
        try:
            inf_conn = self.connPool.get_conn()
            db_instance = inf_conn.get_database(self.dbName)
@@ -341,7 +401,10 @@ class InfinityConnectionBase(DocStoreConnection):
    def delete(self, condition: dict, index_name: str, dataset_id: str) -> int:
        inf_conn = self.connPool.get_conn()
        db_instance = inf_conn.get_database(self.dbName)
        table_name = f"{index_name}_{dataset_id}"
        if index_name.startswith("ragflow_doc_meta_"):
            table_name = index_name
        else:
            table_name = f"{index_name}_{dataset_id}"
        try:
            table_instance = db_instance.get_table(table_name)
        except Exception:
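
For readability, the table-name resolution that `delete_idx`, `index_exist`, and `delete` each repeat above can be summarized as a small helper. This helper is not part of the PR; it only restates the branching shown in the diff:

```python
DOC_META_PREFIX = "ragflow_doc_meta_"

def resolve_table_name(index_name: str, dataset_id: str) -> str:
    """Restates the branching used in delete_idx/index_exist/delete above."""
    if index_name.startswith(DOC_META_PREFIX):
        # Per-tenant metadata tables keep the index name as the table name.
        return index_name
    # Regular chunk tables stay scoped to an (index, dataset) pair.
    return f"{index_name}_{dataset_id}"

# Examples (placeholder names):
#   resolve_table_name("ragflow_doc_meta_tenant_0001", "")  -> "ragflow_doc_meta_tenant_0001"
#   resolve_table_name("some_chunk_index", "kb_42")         -> "some_chunk_index_kb_42"
```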