Mirror of https://github.com/infiniflow/ragflow.git (synced 2026-01-31 23:55:06 +08:00)
Put document metadata in ES/Infinity (#12826)
### What problem does this PR solve?

Put document metadata in ES/Infinity.

Metadata index name: `ragflow_doc_meta_{tenant_id}`

### Type of change

- [x] Refactoring
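For reference, the naming rule above can be expressed as a one-line helper. This is a minimal illustrative sketch, not code from this PR; the helper name and the example tenant id are made up, only the `ragflow_doc_meta_` prefix comes from the description.

```python
# Illustrative only: derives the per-tenant metadata index/table name used in this PR.
# The function name and example tenant_id are hypothetical; the prefix is from the PR description.
def doc_meta_index_name(tenant_id: str) -> str:
    return f"ragflow_doc_meta_{tenant_id}"


print(doc_meta_index_name("t_0001"))  # -> ragflow_doc_meta_t_0001
```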
@@ -24,6 +24,7 @@ from abc import abstractmethod
from elasticsearch import NotFoundError
from elasticsearch_dsl import Index
from elastic_transport import ConnectionTimeout
from elasticsearch.client import IndicesClient
from common.file_utils import get_project_base_directory
from common.misc_utils import convert_bytes
from common.doc_store.doc_store_base import DocStoreConnection, OrderByExpr, MatchExpr
@@ -128,13 +129,34 @@ class ESConnectionBase(DocStoreConnection):
        if self.index_exist(index_name, dataset_id):
            return True
        try:
            from elasticsearch.client import IndicesClient
            return IndicesClient(self.es).create(index=index_name,
                                                 settings=self.mapping["settings"],
                                                 mappings=self.mapping["mappings"])
        except Exception:
            self.logger.exception("ESConnection.createIndex error %s" % index_name)

    def create_doc_meta_idx(self, index_name: str):
        """
        Create a document metadata index.

        Index name pattern: ragflow_doc_meta_{tenant_id}
        - Per-tenant metadata index for storing document metadata fields
        """
        if self.index_exist(index_name, ""):
            return True
        try:
            fp_mapping = os.path.join(get_project_base_directory(), "conf", "doc_meta_es_mapping.json")
            if not os.path.exists(fp_mapping):
                self.logger.error(f"Document metadata mapping file not found at {fp_mapping}")
                return False

            doc_meta_mapping = json.load(open(fp_mapping, "r"))
            return IndicesClient(self.es).create(index=index_name,
                                                 settings=doc_meta_mapping["settings"],
                                                 mappings=doc_meta_mapping["mappings"])
        except Exception as e:
            self.logger.exception(f"Error creating document metadata index {index_name}: {e}")

    def delete_idx(self, index_name: str, dataset_id: str):
        if len(dataset_id) > 0:
            # The index needs to stay alive after any kb deletion, since all kbs under this tenant share one index.
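The ES variant reads its settings and mappings from `conf/doc_meta_es_mapping.json`, which is not included in this diff. Purely as an illustration of the shape the code above expects (a top-level object with `settings` and `mappings` keys passed straight to `IndicesClient.create`), the file might look like the dict below. The `id` and `kb_id` field names are taken from the Infinity secondary indexes later in this PR; their ES types and the remaining fields are assumptions, not the real file contents.

```python
# Hypothetical shape of conf/doc_meta_es_mapping.json, written as the Python dict
# that json.load() would return. Only the top-level "settings"/"mappings" split is
# implied by the diff; every field below is an illustrative guess.
doc_meta_mapping = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},      # document id; name taken from the Infinity index below
            "kb_id": {"type": "keyword"},   # knowledge-base id; name taken from the Infinity index below
            "meta_fields": {"type": "object", "enabled": True},  # assumed free-form metadata payload
        }
    },
}
```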
@@ -285,8 +285,65 @@ class InfinityConnectionBase(DocStoreConnection):
        self.logger.info(f"INFINITY created table {table_name}, vector size {vector_size}")
        return True

    def create_doc_meta_idx(self, index_name: str):
        """
        Create a document metadata table.

        Table name pattern: ragflow_doc_meta_{tenant_id}
        - Per-tenant metadata table for storing document metadata fields
        """
        table_name = index_name
        inf_conn = self.connPool.get_conn()
        inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
        try:
            fp_mapping = os.path.join(get_project_base_directory(), "conf", "doc_meta_infinity_mapping.json")
            if not os.path.exists(fp_mapping):
                self.logger.error(f"Document metadata mapping file not found at {fp_mapping}")
                return False
            schema = json.load(open(fp_mapping))
            inf_db.create_table(
                table_name,
                schema,
                ConflictType.Ignore,
            )

            # Create secondary indexes on id and kb_id for better query performance
            inf_table = inf_db.get_table(table_name)

            try:
                inf_table.create_index(
                    f"idx_{table_name}_id",
                    IndexInfo("id", IndexType.Secondary),
                    ConflictType.Ignore,
                )
                self.logger.debug(f"INFINITY created secondary index on id for table {table_name}")
            except Exception as e:
                self.logger.warning(f"Failed to create index on id for {table_name}: {e}")

            try:
                inf_table.create_index(
                    f"idx_{table_name}_kb_id",
                    IndexInfo("kb_id", IndexType.Secondary),
                    ConflictType.Ignore,
                )
                self.logger.debug(f"INFINITY created secondary index on kb_id for table {table_name}")
            except Exception as e:
                self.logger.warning(f"Failed to create index on kb_id for {table_name}: {e}")

            self.connPool.release_conn(inf_conn)
            self.logger.debug(f"INFINITY created document metadata table {table_name} with secondary indexes")
            return True

        except Exception as e:
            self.connPool.release_conn(inf_conn)
            self.logger.exception(f"Error creating document metadata table {table_name}: {e}")
            return False

    def delete_idx(self, index_name: str, dataset_id: str):
        if index_name.startswith("ragflow_doc_meta_"):
            table_name = index_name
        else:
            table_name = f"{index_name}_{dataset_id}"
        inf_conn = self.connPool.get_conn()
        db_instance = inf_conn.get_database(self.dbName)
        db_instance.drop_table(table_name, ConflictType.Ignore)
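The `ragflow_doc_meta_` branch above is repeated verbatim in `delete_idx`, and the same pattern appears again in `index_exist` and `delete` in the next two hunks. One possible follow-up, not part of this PR, would be to factor the routing into a small helper, roughly like the sketch below; the helper and constant names are hypothetical.

```python
# Hypothetical refactor sketch, not part of this PR: centralizes the table-name
# routing that the diff repeats in delete_idx, index_exist and delete.
DOC_META_PREFIX = "ragflow_doc_meta_"


def resolve_table_name(index_name: str, dataset_id: str) -> str:
    # Per-tenant metadata tables are named directly after the index name;
    # regular chunk tables are suffixed with the dataset (kb) id.
    if index_name.startswith(DOC_META_PREFIX):
        return index_name
    return f"{index_name}_{dataset_id}"


assert resolve_table_name("ragflow_doc_meta_t1", "kb9") == "ragflow_doc_meta_t1"
assert resolve_table_name("ragflow_t1", "kb9") == "ragflow_t1_kb9"
```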
@@ -294,7 +351,10 @@ class InfinityConnectionBase(DocStoreConnection):
        self.logger.info(f"INFINITY dropped table {table_name}")

    def index_exist(self, index_name: str, dataset_id: str) -> bool:
        if index_name.startswith("ragflow_doc_meta_"):
            table_name = index_name
        else:
            table_name = f"{index_name}_{dataset_id}"
        try:
            inf_conn = self.connPool.get_conn()
            db_instance = inf_conn.get_database(self.dbName)
@@ -341,7 +401,10 @@ class InfinityConnectionBase(DocStoreConnection):
    def delete(self, condition: dict, index_name: str, dataset_id: str) -> int:
        inf_conn = self.connPool.get_conn()
        db_instance = inf_conn.get_database(self.dbName)
        if index_name.startswith("ragflow_doc_meta_"):
            table_name = index_name
        else:
            table_name = f"{index_name}_{dataset_id}"
        try:
            table_instance = db_instance.get_table(table_name)
        except Exception:
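Taken together, both backends expose the same `create_doc_meta_idx(index_name)` entry point. A caller-side sketch, purely illustrative: the helper below and the way the connection is obtained are assumptions, only `create_doc_meta_idx` and the naming prefix come from this PR.

```python
# Hypothetical caller-side helper. `conn` stands for whichever DocStoreConnection
# subclass is configured (ESConnectionBase or InfinityConnectionBase); how it is
# constructed is outside this diff, so it is passed in here.
def ensure_doc_meta_storage(conn, tenant_id: str) -> bool:
    index_name = f"ragflow_doc_meta_{tenant_id}"   # naming pattern from the PR description
    return bool(conn.create_doc_meta_idx(index_name))
```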