Mirror of https://github.com/infiniflow/ragflow.git
Put document metadata in ES/Infinity (#12826)
### What problem does this PR solve?
Put document metadata in ES/Infinity.
Metadata index name: `ragflow_doc_meta_{tenant_id}`
### Type of change
- [x] Refactoring
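
For orientation, below is a minimal sketch of what a tenant-level metadata store on Elasticsearch could look like. Only the index naming `ragflow_doc_meta_{tenant_id}` and the `meta_fields` field name come from this PR; the client usage (elasticsearch-py 8.x), the `doc_id` id scheme, and the function names are illustrative assumptions, not the actual `DocMetadataService` implementation.

```python
# Hypothetical sketch only -- not the actual DocMetadataService implementation.
import json

from elasticsearch import Elasticsearch, NotFoundError

es = Elasticsearch("http://localhost:9200")  # assumed endpoint


def meta_index(tenant_id: str) -> str:
    # Index naming introduced by this PR: one metadata index per tenant.
    return f"ragflow_doc_meta_{tenant_id}"


def get_document_metadata(tenant_id: str, doc_id: str) -> dict:
    # Return {} when the index or the document does not exist yet.
    try:
        hit = es.get(index=meta_index(tenant_id), id=doc_id)
    except NotFoundError:
        return {}
    meta = hit["_source"].get("meta_fields", {})
    return json.loads(meta) if isinstance(meta, str) else meta


def update_document_metadata(tenant_id: str, doc_id: str, metadata: dict) -> None:
    # Upsert keyed by document id so re-parsing a document overwrites its metadata.
    es.index(index=meta_index(tenant_id), id=doc_id,
             document={"doc_id": doc_id, "meta_fields": metadata})
```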
```diff
@@ -98,6 +98,7 @@ def message_fit_in(msg, max_length=4000):
 
 def kb_prompt(kbinfos, max_tokens, hash_id=False):
     from api.db.services.document_service import DocumentService
+    from api.db.services.doc_metadata_service import DocMetadataService
 
     knowledges = [get_value(ck, "content", "content_with_weight") for ck in kbinfos["chunks"]]
     kwlg_len = len(knowledges)
@@ -114,7 +115,12 @@ def kb_prompt(kbinfos, max_tokens, hash_id=False):
             break
 
     docs = DocumentService.get_by_ids([get_value(ck, "doc_id", "document_id") for ck in kbinfos["chunks"][:chunks_num]])
-    docs = {d.id: d.meta_fields for d in docs}
+
+    docs_with_meta = {}
+    for d in docs:
+        meta = DocMetadataService.get_document_metadata(d.id)
+        docs_with_meta[d.id] = meta if meta else {}
+    docs = docs_with_meta
 
 
 def draw_node(k, line):
     if line is not None and not isinstance(line, str):
```
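The old line read metadata straight from `Document.meta_fields`, which could arrive as a JSON string (see the removed `isinstance` handling in the task-executor hunks below); the new loop assumes `DocMetadataService.get_document_metadata` returns a dict or a falsy value. A defensive normalizer with that contract, as a hedged sketch (the helper name is hypothetical), could look like:

```python
import json


def metadata_as_dict(raw) -> dict:
    """Normalize whatever the metadata store returns into a plain dict."""
    if not raw:
        return {}
    if isinstance(raw, str):
        # Tolerate legacy values persisted as JSON strings.
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return {}
    return raw if isinstance(raw, dict) else {}
```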
```diff
@@ -61,6 +61,7 @@ import numpy as np
 from peewee import DoesNotExist
 from common.constants import LLMType, ParserType, PipelineTaskType
 from api.db.services.document_service import DocumentService
+from api.db.services.doc_metadata_service import DocMetadataService
 from api.db.services.llm_service import LLMBundle
 from api.db.services.task_service import TaskService, has_canceled, CANVAS_DEBUG_DOC_ID, GRAPH_RAPTOR_FAKE_DOC_ID
 from api.db.services.file2document_service import File2DocumentService
@@ -438,12 +439,10 @@ async def build_chunks(task, progress_callback):
             metadata = update_metadata_to(metadata, doc["metadata_obj"])
             del doc["metadata_obj"]
         if metadata:
-            e, doc = DocumentService.get_by_id(task["doc_id"])
-            if e:
-                if isinstance(doc.meta_fields, str):
-                    doc.meta_fields = json.loads(doc.meta_fields)
-                metadata = update_metadata_to(metadata, doc.meta_fields)
-                DocumentService.update_by_id(task["doc_id"], {"meta_fields": metadata})
+            existing_meta = DocMetadataService.get_document_metadata(task["doc_id"])
+            existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
+            metadata = update_metadata_to(metadata, existing_meta)
+            DocMetadataService.update_document_metadata(task["doc_id"], metadata)
         progress_callback(msg="Question generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st))
 
     if task["kb_parser_config"].get("tag_kb_ids", []):
@@ -735,12 +734,10 @@ async def run_dataflow(task: dict):
             del ck["positions"]
 
     if metadata:
-        e, doc = DocumentService.get_by_id(doc_id)
-        if e:
-            if isinstance(doc.meta_fields, str):
-                doc.meta_fields = json.loads(doc.meta_fields)
-            metadata = update_metadata_to(metadata, doc.meta_fields)
-            DocumentService.update_by_id(doc_id, {"meta_fields": metadata})
+        existing_meta = DocMetadataService.get_document_metadata(doc_id)
+        existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
+        metadata = update_metadata_to(metadata, existing_meta)
+        DocMetadataService.update_document_metadata(doc_id, metadata)
 
     start_ts = timer()
     set_progress(task_id, prog=0.82, msg="[DOC Engine]:\nStart to index...")
```
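Both call sites above now follow the same fetch, merge, write pattern against the metadata service instead of the `meta_fields` column on the document row. A minimal sketch of that pattern, treating `update_metadata_to` as a shallow dict merge (an assumption; its exact semantics are not shown in this diff):

```python
# Hedged sketch of the fetch-merge-write pattern used in build_chunks and run_dataflow.
def update_metadata_to(new_meta: dict, existing: dict) -> dict:
    # Assumed semantics: start from what is already stored, let new values win.
    merged = dict(existing)
    merged.update(new_meta)
    return merged


def persist_metadata(doc_id: str, extracted: dict, service) -> None:
    existing = service.get_document_metadata(doc_id)
    existing = existing if isinstance(existing, dict) else {}
    merged = update_metadata_to(extracted, existing)
    service.update_document_metadata(doc_id, merged)
```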
```diff
@@ -162,7 +162,11 @@ class ESConnection(ESConnectionBase):
                 self._connect()
                 continue
             except Exception as e:
-                self.logger.exception(f"ESConnection.search {str(index_names)} query: " + str(q) + str(e))
+                # Only log debug for NotFoundError(accepted when metadata index doesn't exist)
+                if 'NotFound' in str(e):
+                    self.logger.debug(f"ESConnection.search {str(index_names)} query: " + str(q) + " - " + str(e))
+                else:
+                    self.logger.exception(f"ESConnection.search {str(index_names)} query: " + str(q) + str(e))
                 raise e
 
         self.logger.error(f"ESConnection.search timeout for {ATTEMPT_TIME} times!")
```
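The message-text check keeps the existing broad `except Exception` retry loop intact while silencing the expected miss when a tenant's metadata index has not been created yet. An equivalent sketch keyed on the client's typed exception (assuming elasticsearch-py, where a missing index raises `NotFoundError`) rather than on the message string:

```python
# Hedged alternative sketch; the PR itself matches on the error message instead.
import logging

from elasticsearch import Elasticsearch, NotFoundError

logger = logging.getLogger(__name__)


def search_quiet_on_missing_index(es: Elasticsearch, index_names, query: dict):
    """Run a search, downgrading 'index not found' to a debug log."""
    try:
        return es.search(index=index_names, query=query)
    except NotFoundError as e:
        # Expected while ragflow_doc_meta_{tenant_id} does not exist yet.
        logger.debug("search %s: index missing: %s", index_names, e)
        raise
    except Exception as e:
        logger.exception("search %s with query %s failed: %s", index_names, query, e)
        raise
```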
```diff
@@ -149,8 +149,11 @@ class InfinityConnection(InfinityConnectionBase):
         if condition:
             table_found = False
             for indexName in index_names:
-                for kb_id in knowledgebase_ids:
-                    table_name = f"{indexName}_{kb_id}"
+                if indexName.startswith("ragflow_doc_meta_"):
+                    table_names_to_search = [indexName]
+                else:
+                    table_names_to_search = [f"{indexName}_{kb_id}" for kb_id in knowledgebase_ids]
+                for table_name in table_names_to_search:
                     try:
                         filter_cond = self.equivalent_condition_to_str(condition, db_instance.get_table(table_name))
                         table_found = True
```
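The same branching recurs in the search, get, insert, and update paths below: a metadata index maps to a single tenant-level table, while a regular index expands to one table per knowledge base. A hypothetical helper consolidating that rule (not part of this PR) would read:

```python
# Hypothetical consolidation of the repeated table-name branching (not in this PR).
DOC_META_PREFIX = "ragflow_doc_meta_"


def resolve_table_names(index_name: str, knowledgebase_ids: list[str]) -> list[str]:
    if index_name.startswith(DOC_META_PREFIX):
        # Document metadata lives in one tenant-level table, not in per-KB tables.
        return [index_name]
    return [f"{index_name}_{kb_id}" for kb_id in knowledgebase_ids]


# resolve_table_names("ragflow_doc_meta_t1", ["kb_a", "kb_b"]) -> ["ragflow_doc_meta_t1"]
# resolve_table_names("ragflow_t1", ["kb_a", "kb_b"]) -> ["ragflow_t1_kb_a", "ragflow_t1_kb_b"]
```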
```diff
@@ -221,8 +224,11 @@ class InfinityConnection(InfinityConnectionBase):
         total_hits_count = 0
         # Scatter search tables and gather the results
         for indexName in index_names:
-            for knowledgebaseId in knowledgebase_ids:
-                table_name = f"{indexName}_{knowledgebaseId}"
+            if indexName.startswith("ragflow_doc_meta_"):
+                table_names_to_search = [indexName]
+            else:
+                table_names_to_search = [f"{indexName}_{kb_id}" for kb_id in knowledgebase_ids]
+            for table_name in table_names_to_search:
                 try:
                     table_instance = db_instance.get_table(table_name)
                 except Exception:
@@ -276,8 +282,11 @@ class InfinityConnection(InfinityConnectionBase):
         df_list = list()
         assert isinstance(knowledgebase_ids, list)
         table_list = list()
-        for knowledgebaseId in knowledgebase_ids:
-            table_name = f"{index_name}_{knowledgebaseId}"
+        if index_name.startswith("ragflow_doc_meta_"):
+            table_names_to_search = [index_name]
+        else:
+            table_names_to_search = [f"{index_name}_{kb_id}" for kb_id in knowledgebase_ids]
+        for table_name in table_names_to_search:
             table_list.append(table_name)
             try:
                 table_instance = db_instance.get_table(table_name)
@@ -301,7 +310,10 @@ class InfinityConnection(InfinityConnectionBase):
     def insert(self, documents: list[dict], index_name: str, knowledgebase_id: str = None) -> list[str]:
         inf_conn = self.connPool.get_conn()
         db_instance = inf_conn.get_database(self.dbName)
-        table_name = f"{index_name}_{knowledgebase_id}"
+        if index_name.startswith("ragflow_doc_meta_"):
+            table_name = index_name
+        else:
+            table_name = f"{index_name}_{knowledgebase_id}"
         try:
             table_instance = db_instance.get_table(table_name)
         except InfinityException as e:
@@ -405,6 +417,11 @@ class InfinityConnection(InfinityConnectionBase):
             elif k in ["page_num_int", "top_int"]:
                 assert isinstance(v, list)
                 d[k] = "_".join(f"{num:08x}" for num in v)
+            elif k == "meta_fields":
+                if isinstance(v, dict):
+                    d[k] = json.dumps(v, ensure_ascii=False)
+                else:
+                    d[k] = v if v else "{}"
             else:
                 d[k] = v
         for k in ["docnm_kwd", "title_tks", "title_sm_tks", "important_kwd", "important_tks", "content_with_weight",
```
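The insert path above serializes `meta_fields` dicts to JSON strings (falling back to `"{}"` for empty values) before writing to Infinity, so the read side has to parse them back. A small round-trip sketch; the decode side is an assumption, since it is not shown in this hunk:

```python
import json


def encode_meta_fields(v) -> str:
    # Write side, mirroring the branch above: dicts become JSON, empty values become "{}".
    if isinstance(v, dict):
        return json.dumps(v, ensure_ascii=False)
    return v if v else "{}"


def decode_meta_fields(s) -> dict:
    # Read side (assumed): turn the stored string back into a dict, defaulting to {}.
    if isinstance(s, dict):
        return s
    try:
        return json.loads(s) if s else {}
    except (TypeError, json.JSONDecodeError):
        return {}
```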
```diff
@@ -434,7 +451,10 @@ class InfinityConnection(InfinityConnectionBase):
         # logger.info(f"update position_int: {newValue['position_int']}")
         inf_conn = self.connPool.get_conn()
         db_instance = inf_conn.get_database(self.dbName)
-        table_name = f"{index_name}_{knowledgebase_id}"
+        if index_name.startswith("ragflow_doc_meta_"):
+            table_name = index_name
+        else:
+            table_name = f"{index_name}_{knowledgebase_id}"
         table_instance = db_instance.get_table(table_name)
         # if "exists" in condition:
         #     del condition["exists"]
```