Put document metadata in ES/Infinity (#12826)

### What problem does this PR solve?

Store document metadata in ES/Infinity, so it is read and written through the doc engine instead of the document's meta_fields column.

Index name for the metadata: ragflow_doc_meta_{tenant_id}
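
For illustration only, a minimal sketch of how a document's metadata could be upserted into such a per-tenant index with the Elasticsearch Python client. The `doc_id`/`meta_fields` payload shape and the use of the document id as the ES `_id` are assumptions for this sketch, not the schema defined by this PR:

```python
# Illustrative sketch, not the PR's actual write path or mapping.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

def put_doc_metadata(tenant_id: str, doc_id: str, meta: dict) -> None:
    index = f"ragflow_doc_meta_{tenant_id}"          # per-tenant metadata index
    # Using the document id as the ES _id so repeated writes overwrite the same record.
    es.index(index=index, id=doc_id, document={"doc_id": doc_id, "meta_fields": meta})

put_doc_metadata("tenant_42", "doc_001", {"author": "alice", "year": 2024})
```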

### Type of change

- [x] Refactoring
Commit 9a5208976c (parent fd11aca8e5)
Author: qinling0210
Committed by GitHub on 2026-01-28 13:29:34 +08:00
24 changed files with 1529 additions and 304 deletions


@@ -98,6 +98,7 @@ def message_fit_in(msg, max_length=4000):
 def kb_prompt(kbinfos, max_tokens, hash_id=False):
     from api.db.services.document_service import DocumentService
+    from api.db.services.doc_metadata_service import DocMetadataService
     knowledges = [get_value(ck, "content", "content_with_weight") for ck in kbinfos["chunks"]]
     kwlg_len = len(knowledges)
@@ -114,7 +115,12 @@ def kb_prompt(kbinfos, max_tokens, hash_id=False):
             break
     docs = DocumentService.get_by_ids([get_value(ck, "doc_id", "document_id") for ck in kbinfos["chunks"][:chunks_num]])
-    docs = {d.id: d.meta_fields for d in docs}
+    docs_with_meta = {}
+    for d in docs:
+        meta = DocMetadataService.get_document_metadata(d.id)
+        docs_with_meta[d.id] = meta if meta else {}
+    docs = docs_with_meta
     def draw_node(k, line):
         if line is not None and not isinstance(line, str):
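
The hunk above swaps the `meta_fields` dict comprehension for per-document lookups through `DocMetadataService`. The service itself is not part of this excerpt, so the sketch below is only a guess at what such a lookup could look like, querying the per-tenant metadata index directly and falling back to an empty dict, mirroring the `meta if meta else {}` guard in the new code. The function body, index access, and field names are assumptions:

```python
# Hypothetical sketch, not the PR's DocMetadataService implementation.
import json
from elasticsearch import Elasticsearch, NotFoundError

es = Elasticsearch("http://localhost:9200")

def get_document_metadata(tenant_id: str, doc_id: str) -> dict:
    try:
        hit = es.get(index=f"ragflow_doc_meta_{tenant_id}", id=doc_id)
    except NotFoundError:
        # Missing index or missing document both mean "no metadata yet".
        return {}
    meta = hit["_source"].get("meta_fields", {})
    # meta_fields may be stored as a JSON string (see the Infinity insert path below).
    return json.loads(meta) if isinstance(meta, str) else meta
```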


@@ -61,6 +61,7 @@ import numpy as np
 from peewee import DoesNotExist
 from common.constants import LLMType, ParserType, PipelineTaskType
 from api.db.services.document_service import DocumentService
+from api.db.services.doc_metadata_service import DocMetadataService
 from api.db.services.llm_service import LLMBundle
 from api.db.services.task_service import TaskService, has_canceled, CANVAS_DEBUG_DOC_ID, GRAPH_RAPTOR_FAKE_DOC_ID
 from api.db.services.file2document_service import File2DocumentService
@@ -438,12 +439,10 @@ async def build_chunks(task, progress_callback):
                 metadata = update_metadata_to(metadata, doc["metadata_obj"])
                 del doc["metadata_obj"]
         if metadata:
-            e, doc = DocumentService.get_by_id(task["doc_id"])
-            if e:
-                if isinstance(doc.meta_fields, str):
-                    doc.meta_fields = json.loads(doc.meta_fields)
-                metadata = update_metadata_to(metadata, doc.meta_fields)
-                DocumentService.update_by_id(task["doc_id"], {"meta_fields": metadata})
+            existing_meta = DocMetadataService.get_document_metadata(task["doc_id"])
+            existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
+            metadata = update_metadata_to(metadata, existing_meta)
+            DocMetadataService.update_document_metadata(task["doc_id"], metadata)
         progress_callback(msg="Question generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st))
     if task["kb_parser_config"].get("tag_kb_ids", []):
@@ -735,12 +734,10 @@ async def run_dataflow(task: dict):
                 del ck["positions"]
     if metadata:
-        e, doc = DocumentService.get_by_id(doc_id)
-        if e:
-            if isinstance(doc.meta_fields, str):
-                doc.meta_fields = json.loads(doc.meta_fields)
-            metadata = update_metadata_to(metadata, doc.meta_fields)
-            DocumentService.update_by_id(doc_id, {"meta_fields": metadata})
+        existing_meta = DocMetadataService.get_document_metadata(doc_id)
+        existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
+        metadata = update_metadata_to(metadata, existing_meta)
+        DocMetadataService.update_document_metadata(doc_id, metadata)
     start_ts = timer()
     set_progress(task_id, prog=0.82, msg="[DOC Engine]:\nStart to index...")
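
Both call sites follow the same pattern: read whatever metadata already exists for the document, merge the newly extracted fields into it, then write the merged result back through the service. A minimal standalone sketch of that flow is below; `update_metadata_to` is stood in by a plain dict merge, which may not match the real helper's precedence rules, and the two service calls are the ones visible in the diff:

```python
# Sketch of the merge-then-persist pattern used in both hunks above.
from api.db.services.doc_metadata_service import DocMetadataService

def persist_extracted_metadata(doc_id: str, extracted: dict) -> None:
    existing = DocMetadataService.get_document_metadata(doc_id)
    existing = existing if isinstance(existing, dict) else {}
    merged = {**existing, **extracted}  # stand-in for update_metadata_to(extracted, existing)
    DocMetadataService.update_document_metadata(doc_id, merged)
```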


@@ -162,7 +162,11 @@ class ESConnection(ESConnectionBase):
                 self._connect()
                 continue
             except Exception as e:
-                self.logger.exception(f"ESConnection.search {str(index_names)} query: " + str(q) + str(e))
+                # Only log at debug level for NotFoundError (expected when the metadata index doesn't exist yet)
+                if 'NotFound' in str(e):
+                    self.logger.debug(f"ESConnection.search {str(index_names)} query: " + str(q) + " - " + str(e))
+                else:
+                    self.logger.exception(f"ESConnection.search {str(index_names)} query: " + str(q) + str(e))
                 raise e
         self.logger.error(f"ESConnection.search timeout for {ATTEMPT_TIME} times!")
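
The case this branch downgrades is the one where a tenant's metadata index has not been created yet, so a search against it raises a not-found error rather than returning empty hits. A focused sketch of that behavior, assuming the elasticsearch-py client and a keyword `doc_id` field (neither is shown in this diff):

```python
# Illustration of the tolerated case: searching a per-tenant metadata index
# that does not exist yet raises NotFoundError, which can be treated as
# "no metadata" instead of an error worth a stack trace.
from elasticsearch import Elasticsearch, NotFoundError

es = Elasticsearch("http://localhost:9200")

def search_metadata(tenant_id: str, doc_ids: list[str]) -> list[dict]:
    try:
        res = es.search(
            index=f"ragflow_doc_meta_{tenant_id}",
            query={"terms": {"doc_id": doc_ids}},
        )
    except NotFoundError:
        return []  # index not created yet: nothing indexed for this tenant
    return [hit["_source"] for hit in res["hits"]["hits"]]
```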


@@ -149,8 +149,11 @@ class InfinityConnection(InfinityConnectionBase):
         if condition:
             table_found = False
             for indexName in index_names:
-                for kb_id in knowledgebase_ids:
-                    table_name = f"{indexName}_{kb_id}"
+                if indexName.startswith("ragflow_doc_meta_"):
+                    table_names_to_search = [indexName]
+                else:
+                    table_names_to_search = [f"{indexName}_{kb_id}" for kb_id in knowledgebase_ids]
+                for table_name in table_names_to_search:
                     try:
                         filter_cond = self.equivalent_condition_to_str(condition, db_instance.get_table(table_name))
                         table_found = True
@@ -221,8 +224,11 @@
         total_hits_count = 0
         # Scatter search tables and gather the results
         for indexName in index_names:
-            for knowledgebaseId in knowledgebase_ids:
-                table_name = f"{indexName}_{knowledgebaseId}"
+            if indexName.startswith("ragflow_doc_meta_"):
+                table_names_to_search = [indexName]
+            else:
+                table_names_to_search = [f"{indexName}_{kb_id}" for kb_id in knowledgebase_ids]
+            for table_name in table_names_to_search:
                 try:
                     table_instance = db_instance.get_table(table_name)
                 except Exception:
@@ -276,8 +282,11 @@
         df_list = list()
         assert isinstance(knowledgebase_ids, list)
         table_list = list()
-        for knowledgebaseId in knowledgebase_ids:
-            table_name = f"{index_name}_{knowledgebaseId}"
+        if index_name.startswith("ragflow_doc_meta_"):
+            table_names_to_search = [index_name]
+        else:
+            table_names_to_search = [f"{index_name}_{kb_id}" for kb_id in knowledgebase_ids]
+        for table_name in table_names_to_search:
             table_list.append(table_name)
             try:
                 table_instance = db_instance.get_table(table_name)
@@ -301,7 +310,10 @@
     def insert(self, documents: list[dict], index_name: str, knowledgebase_id: str = None) -> list[str]:
         inf_conn = self.connPool.get_conn()
         db_instance = inf_conn.get_database(self.dbName)
-        table_name = f"{index_name}_{knowledgebase_id}"
+        if index_name.startswith("ragflow_doc_meta_"):
+            table_name = index_name
+        else:
+            table_name = f"{index_name}_{knowledgebase_id}"
         try:
             table_instance = db_instance.get_table(table_name)
         except InfinityException as e:
@@ -405,6 +417,11 @@
             elif k in ["page_num_int", "top_int"]:
                 assert isinstance(v, list)
                 d[k] = "_".join(f"{num:08x}" for num in v)
+            elif k == "meta_fields":
+                if isinstance(v, dict):
+                    d[k] = json.dumps(v, ensure_ascii=False)
+                else:
+                    d[k] = v if v else "{}"
             else:
                 d[k] = v
         for k in ["docnm_kwd", "title_tks", "title_sm_tks", "important_kwd", "important_tks", "content_with_weight",
@@ -434,7 +451,10 @@
         # logger.info(f"update position_int: {newValue['position_int']}")
         inf_conn = self.connPool.get_conn()
         db_instance = inf_conn.get_database(self.dbName)
-        table_name = f"{index_name}_{knowledgebase_id}"
+        if index_name.startswith("ragflow_doc_meta_"):
+            table_name = index_name
+        else:
+            table_name = f"{index_name}_{knowledgebase_id}"
         table_instance = db_instance.get_table(table_name)
         # if "exists" in condition:
         #     del condition["exists"]
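
The same routing rule is repeated in the Infinity search, get, insert, and update paths above: a regular index maps to one table per knowledge base, while the per-tenant metadata index maps to a single table. A small helper capturing that rule (a refactoring sketch, not part of this PR) could look like:

```python
# Refactoring sketch only: centralize the table-name routing repeated in the hunks above.
def resolve_table_names(index_name: str, knowledgebase_ids: list[str]) -> list[str]:
    # The per-tenant metadata index is a single table; everything else is
    # sharded into one table per knowledge base.
    if index_name.startswith("ragflow_doc_meta_"):
        return [index_name]
    return [f"{index_name}_{kb_id}" for kb_id in knowledgebase_ids]

# Example:
# resolve_table_names("ragflow_doc_meta_t1", ["kb_a", "kb_b"]) -> ["ragflow_doc_meta_t1"]
# resolve_table_names("ragflow_t1", ["kb_a", "kb_b"])          -> ["ragflow_t1_kb_a", "ragflow_t1_kb_b"]
```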