Mirror of https://github.com/infiniflow/ragflow.git
Put document metadata in ES/Infinity (#12826)
### What problem does this PR solve?
Store document metadata in ES/Infinity instead of the `meta_fields` column on the `document` table.
Metadata index name: `ragflow_doc_meta_{tenant_id}` (one index/table per tenant).
### Type of change
- [x] Refactoring
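For reference, a hedged sketch of the record shape implied by this change (field names come from `conf/doc_meta_es_mapping.json` below; the actual write path lives in the new `api/db/services/doc_metadata_service.py`, whose diff is suppressed):

```python
# Illustrative only: one metadata record per document, kept in the
# per-tenant index/table "ragflow_doc_meta_{tenant_id}".
tenant_id = "tenant_0"  # hypothetical tenant id
index_name = f"ragflow_doc_meta_{tenant_id}"

doc_meta_record = {
    "id": "doc_1",       # document id (keyword)
    "kb_id": "kb_1",     # knowledge base / dataset id (keyword)
    "meta_fields": {     # free-form metadata (dynamic object / json)
        "character": "Zhuge Liang",
        "era": "Three Kingdoms",
    },
}
```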
@ -21,7 +21,7 @@ import re
|
||||
from abc import ABC
|
||||
from agent.tools.base import ToolParamBase, ToolBase, ToolMeta
|
||||
from common.constants import LLMType
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from common.metadata_utils import apply_meta_data_filter
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
@ -125,7 +125,7 @@ class Retrieval(ToolBase, ABC):
|
||||
|
||||
doc_ids = []
|
||||
if self._param.meta_data_filter != {}:
|
||||
metas = DocumentService.get_meta_by_kbs(kb_ids)
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
|
||||
|
||||
def _resolve_manual_filter(flt: dict) -> dict:
|
||||
pat = re.compile(self.variable_ref_patt)
|
||||
|
||||
@ -22,6 +22,7 @@ import xxhash
|
||||
from quart import request
|
||||
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from common.metadata_utils import apply_meta_data_filter
|
||||
@ -381,7 +382,7 @@ async def retrieval_test():
|
||||
chat_mdl = LLMBundle(user_id, LLMType.CHAT)
|
||||
|
||||
if meta_data_filter:
|
||||
metas = DocumentService.get_meta_by_kbs(kb_ids)
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
|
||||
local_doc_ids = await apply_meta_data_filter(meta_data_filter, metas, question, chat_mdl, local_doc_ids)
|
||||
|
||||
tenants = UserTenantService.query(user_id=user_id)
|
||||
|
||||
@ -26,6 +26,7 @@ from api.db import VALID_FILE_TYPES, FileType
|
||||
from api.db.db_models import Task
|
||||
from api.db.services import duplicate_name
|
||||
from api.db.services.document_service import DocumentService, doc_upload_and_parse
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from common.metadata_utils import meta_filter, convert_conditions, turn2jsonschema
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
@ -281,7 +282,7 @@ async def list_docs():
|
||||
doc_ids_filter = None
|
||||
metas = None
|
||||
if metadata_condition or metadata:
|
||||
metas = DocumentService.get_flatted_meta_by_kbs([kb_id])
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs([kb_id])
|
||||
|
||||
if metadata_condition:
|
||||
doc_ids_filter = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
|
||||
@ -401,7 +402,11 @@ async def doc_infos():
|
||||
if not DocumentService.accessible(doc_id, current_user.id):
|
||||
return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)
|
||||
docs = DocumentService.get_by_ids(doc_ids)
|
||||
return get_json_result(data=list(docs.dicts()))
|
||||
docs_list = list(docs.dicts())
|
||||
# Add meta_fields for each document
|
||||
for doc in docs_list:
|
||||
doc["meta_fields"] = DocMetadataService.get_document_metadata(doc["id"])
|
||||
return get_json_result(data=docs_list)
|
||||
|
||||
|
||||
@manager.route("/metadata/summary", methods=["POST"]) # noqa: F821
|
||||
@ -421,7 +426,7 @@ async def metadata_summary():
|
||||
return get_json_result(data=False, message="Only owner of dataset authorized for this operation.", code=RetCode.OPERATING_ERROR)
|
||||
|
||||
try:
|
||||
summary = DocumentService.get_metadata_summary(kb_id, doc_ids)
|
||||
summary = DocMetadataService.get_metadata_summary(kb_id, doc_ids)
|
||||
return get_json_result(data={"summary": summary})
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
@ -432,10 +437,14 @@ async def metadata_summary():
|
||||
@validate_request("doc_ids")
|
||||
async def metadata_update():
|
||||
req = await get_request_json()
|
||||
kb_id = req.get("kb_id")
|
||||
document_ids = req.get("doc_ids")
|
||||
updates = req.get("updates", []) or []
|
||||
deletes = req.get("deletes", []) or []
|
||||
|
||||
if not kb_id:
|
||||
return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)
|
||||
|
||||
if not isinstance(updates, list) or not isinstance(deletes, list):
|
||||
return get_json_result(data=False, message="updates and deletes must be lists.", code=RetCode.ARGUMENT_ERROR)
|
||||
|
||||
@ -446,8 +455,8 @@ async def metadata_update():
|
||||
if not isinstance(d, dict) or not d.get("key"):
|
||||
return get_json_result(data=False, message="Each delete requires key.", code=RetCode.ARGUMENT_ERROR)
|
||||
|
||||
updated = DocumentService.batch_update_metadata(None, document_ids, updates, deletes)
|
||||
return get_json_result(data={"updated": updated})
|
||||
updated = DocMetadataService.batch_update_metadata(kb_id, document_ids, updates, deletes)
|
||||
return get_json_result(data={"updated": updated, "matched_docs": len(document_ids)})
|
||||
|
||||
|
||||
@manager.route("/update_metadata_setting", methods=["POST"]) # noqa: F821
|
||||
@ -905,7 +914,7 @@ async def set_meta():
|
||||
if not e:
|
||||
return get_data_error_result(message="Document not found!")
|
||||
|
||||
if not DocumentService.update_by_id(req["doc_id"], {"meta_fields": meta}):
|
||||
if not DocMetadataService.update_document_metadata(req["doc_id"], meta):
|
||||
return get_data_error_result(message="Database error (meta updates)!")
|
||||
|
||||
return get_json_result(data=True)
|
||||
|
||||
@ -25,6 +25,7 @@ import numpy as np
|
||||
from api.db.services.connector_service import Connector2KbService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.db.services.document_service import DocumentService, queue_raptor_o_graphrag_tasks
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
|
||||
@ -467,7 +468,7 @@ def get_meta():
|
||||
message='No authorization.',
|
||||
code=RetCode.AUTHENTICATION_ERROR
|
||||
)
|
||||
return get_json_result(data=DocumentService.get_meta_by_kbs(kb_ids))
|
||||
return get_json_result(data=DocMetadataService.get_flatted_meta_by_kbs(kb_ids))
|
||||
|
||||
|
||||
@manager.route("/basic_info", methods=["GET"]) # noqa: F821
|
||||
|
||||
@ -18,6 +18,7 @@ import logging
|
||||
from quart import jsonify
|
||||
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from common.metadata_utils import meta_filter, convert_conditions
|
||||
@ -121,7 +122,7 @@ async def retrieval(tenant_id):
|
||||
similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0))
|
||||
top = int(retrieval_setting.get("top_k", 1024))
|
||||
metadata_condition = req.get("metadata_condition", {}) or {}
|
||||
metas = DocumentService.get_meta_by_kbs([kb_id])
|
||||
metas = DocMetadataService.get_meta_by_kbs([kb_id])
|
||||
|
||||
doc_ids = []
|
||||
try:
|
||||
|
||||
@ -29,6 +29,7 @@ from api.constants import FILE_NAME_LEN_LIMIT
|
||||
from api.db import FileType
|
||||
from api.db.db_models import File, Task
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
@ -255,7 +256,8 @@ async def update_doc(tenant_id, dataset_id, document_id):
|
||||
if "meta_fields" in req:
|
||||
if not isinstance(req["meta_fields"], dict):
|
||||
return get_error_data_result(message="meta_fields must be a dictionary")
|
||||
DocumentService.update_meta_fields(document_id, req["meta_fields"])
|
||||
if not DocMetadataService.update_document_metadata(document_id, req["meta_fields"]):
|
||||
return get_error_data_result(message="Failed to update metadata")
|
||||
|
||||
if "name" in req and req["name"] != doc.name:
|
||||
if len(req["name"].encode("utf-8")) > FILE_NAME_LEN_LIMIT:
|
||||
@ -568,7 +570,7 @@ def list_docs(dataset_id, tenant_id):
|
||||
|
||||
doc_ids_filter = None
|
||||
if metadata_condition:
|
||||
metas = DocumentService.get_flatted_meta_by_kbs([dataset_id])
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id])
|
||||
doc_ids_filter = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
|
||||
if metadata_condition.get("conditions") and not doc_ids_filter:
|
||||
return get_result(data={"total": 0, "docs": []})
|
||||
@ -611,7 +613,7 @@ async def metadata_summary(dataset_id, tenant_id):
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
|
||||
req = await get_request_json()
|
||||
try:
|
||||
summary = DocumentService.get_metadata_summary(dataset_id, req.get("doc_ids"))
|
||||
summary = DocMetadataService.get_metadata_summary(dataset_id, req.get("doc_ids"))
|
||||
return get_result(data={"summary": summary})
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
@ -657,14 +659,14 @@ async def metadata_batch_update(dataset_id, tenant_id):
|
||||
target_doc_ids = set(document_ids)
|
||||
|
||||
if metadata_condition:
|
||||
metas = DocumentService.get_flatted_meta_by_kbs([dataset_id])
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id])
|
||||
filtered_ids = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
|
||||
target_doc_ids = target_doc_ids & filtered_ids
|
||||
if metadata_condition.get("conditions") and not target_doc_ids:
|
||||
return get_result(data={"updated": 0, "matched_docs": 0})
|
||||
|
||||
target_doc_ids = list(target_doc_ids)
|
||||
updated = DocumentService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
|
||||
updated = DocMetadataService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
|
||||
return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)})
|
||||
|
||||
@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821
|
||||
@ -1534,7 +1536,7 @@ async def retrieval_test(tenant_id):
|
||||
if not doc_ids:
|
||||
metadata_condition = req.get("metadata_condition")
|
||||
if metadata_condition:
|
||||
metas = DocumentService.get_meta_by_kbs(kb_ids)
|
||||
metas = DocMetadataService.get_meta_by_kbs(kb_ids)
|
||||
doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
|
||||
# If metadata_condition has conditions but no docs match, return empty result
|
||||
if not doc_ids and metadata_condition.get("conditions"):
|
||||
|
||||
@ -35,7 +35,7 @@ from api.db.services.conversation_service import ConversationService
|
||||
from api.db.services.conversation_service import async_iframe_completion as iframe_completion
|
||||
from api.db.services.conversation_service import async_completion as rag_completion
|
||||
from api.db.services.dialog_service import DialogService, async_ask, async_chat, gen_mindmap
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from common.metadata_utils import apply_meta_data_filter, convert_conditions, meta_filter
|
||||
@ -147,7 +147,7 @@ async def chat_completion(tenant_id, chat_id):
|
||||
return get_error_data_result(message="metadata_condition must be an object.")
|
||||
|
||||
if metadata_condition and req.get("question"):
|
||||
metas = DocumentService.get_meta_by_kbs(dia.kb_ids or [])
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(dia.kb_ids or [])
|
||||
filtered_doc_ids = meta_filter(
|
||||
metas,
|
||||
convert_conditions(metadata_condition),
|
||||
@ -279,7 +279,7 @@ async def chat_completion_openai_like(tenant_id, chat_id):
|
||||
|
||||
doc_ids_str = None
|
||||
if metadata_condition:
|
||||
metas = DocumentService.get_meta_by_kbs(dia.kb_ids or [])
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(dia.kb_ids or [])
|
||||
filtered_doc_ids = meta_filter(
|
||||
metas,
|
||||
convert_conditions(metadata_condition),
|
||||
@ -1084,7 +1084,7 @@ async def retrieval_test_embedded():
|
||||
chat_mdl = LLMBundle(tenant_id, LLMType.CHAT)
|
||||
|
||||
if meta_data_filter:
|
||||
metas = DocumentService.get_meta_by_kbs(kb_ids)
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
|
||||
local_doc_ids = await apply_meta_data_filter(meta_data_filter, metas, _question, chat_mdl, local_doc_ids)
|
||||
|
||||
tenants = UserTenantService.query(user_id=tenant_id)
|
||||
|
||||
@ -795,7 +795,6 @@ class Document(DataBaseModel):
|
||||
progress_msg = TextField(null=True, help_text="process message", default="")
|
||||
process_begin_at = DateTimeField(null=True, index=True)
|
||||
process_duration = FloatField(default=0)
|
||||
meta_fields = JSONField(null=True, default={})
|
||||
suffix = CharField(max_length=32, null=False, help_text="The real file extension suffix", index=True)
|
||||
|
||||
run = CharField(max_length=1, null=True, help_text="start to run processing or cancel.(1: run it; 2: cancel)", default="0", index=True)
|
||||
@ -1267,7 +1266,6 @@ def migrate_db():
|
||||
alter_db_add_column(migrator, "task", "digest", TextField(null=True, help_text="task digest", default=""))
|
||||
alter_db_add_column(migrator, "task", "chunk_ids", LongTextField(null=True, help_text="chunk ids", default=""))
|
||||
alter_db_add_column(migrator, "conversation", "user_id", CharField(max_length=255, null=True, help_text="user_id", index=True))
|
||||
alter_db_add_column(migrator, "document", "meta_fields", JSONField(null=True, default={}))
|
||||
alter_db_add_column(migrator, "task", "task_type", CharField(max_length=32, null=False, default=""))
|
||||
alter_db_add_column(migrator, "task", "priority", IntegerField(default=0))
|
||||
alter_db_add_column(migrator, "user_canvas", "permission", CharField(max_length=16, null=False, help_text="me|team", default="me", index=True))
|
||||
|
||||
@ -23,6 +23,7 @@ from api.db.services.canvas_service import UserCanvasService
|
||||
from api.db.services.conversation_service import ConversationService
|
||||
from api.db.services.dialog_service import DialogService
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.langfuse_service import TenantLangfuseService
|
||||
@ -107,6 +108,11 @@ def create_new_user(user_info: dict) -> dict:
|
||||
except Exception as create_error:
|
||||
logging.exception(create_error)
|
||||
# rollback
|
||||
try:
|
||||
metadata_index_name = DocMetadataService._get_doc_meta_index_name(user_id)
|
||||
settings.docStoreConn.delete_idx(metadata_index_name, "")
|
||||
except Exception as e:
|
||||
logging.exception(e)
|
||||
try:
|
||||
TenantService.delete_by_id(user_id)
|
||||
except Exception as e:
|
||||
@ -165,6 +171,12 @@ def delete_user_data(user_id: str) -> dict:
|
||||
# step1.1.2 delete file and document info in db
|
||||
doc_ids = DocumentService.get_all_doc_ids_by_kb_ids(kb_ids)
|
||||
if doc_ids:
|
||||
for doc in doc_ids:
|
||||
try:
|
||||
DocMetadataService.delete_document_metadata(doc["id"], skip_empty_check=True)
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to delete metadata for document {doc['id']}: {e}")
|
||||
|
||||
doc_delete_res = DocumentService.delete_by_ids([i["id"] for i in doc_ids])
|
||||
done_msg += f"- Deleted {doc_delete_res} document records.\n"
|
||||
task_delete_res = TaskService.delete_by_doc_ids([i["id"] for i in doc_ids])
|
||||
@ -202,6 +214,13 @@ def delete_user_data(user_id: str) -> dict:
|
||||
done_msg += f"- Deleted {llm_delete_res} tenant-LLM records.\n"
|
||||
langfuse_delete_res = TenantLangfuseService.delete_ty_tenant_id(tenant_id)
|
||||
done_msg += f"- Deleted {langfuse_delete_res} langfuse records.\n"
|
||||
try:
|
||||
metadata_index_name = DocMetadataService._get_doc_meta_index_name(tenant_id)
|
||||
settings.docStoreConn.delete_idx(metadata_index_name, "")
|
||||
done_msg += f"- Deleted metadata table {metadata_index_name}.\n"
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to delete metadata table for tenant {tenant_id}: {e}")
|
||||
done_msg += "- Warning: Failed to delete metadata table (continuing).\n"
|
||||
# step1.3 delete memory and messages
|
||||
user_memory = MemoryService.get_by_tenant_id(tenant_id)
|
||||
if user_memory:
|
||||
@ -269,6 +288,11 @@ def delete_user_data(user_id: str) -> dict:
|
||||
# step2.1.5 delete document record
|
||||
doc_delete_res = DocumentService.delete_by_ids([d['id'] for d in created_documents])
|
||||
done_msg += f"- Deleted {doc_delete_res} documents.\n"
|
||||
for doc in created_documents:
|
||||
try:
|
||||
DocMetadataService.delete_document_metadata(doc['id'])
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to delete metadata for document {doc['id']}: {e}")
|
||||
# step2.1.6 update dataset doc&chunk&token cnt
|
||||
for kb_id, doc_num in kb_doc_info.items():
|
||||
KnowledgebaseService.decrease_document_num_in_delete(kb_id, doc_num)
|
||||
|
||||
@ -25,6 +25,7 @@ from api.db import InputType
|
||||
from api.db.db_models import Connector, SyncLogs, Connector2Kb, Knowledgebase
|
||||
from api.db.services.common_service import CommonService
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.document_service import DocMetadataService
|
||||
from common.misc_utils import get_uuid
|
||||
from common.constants import TaskStatus
|
||||
from common.time_utils import current_timestamp, timestamp_to_date
|
||||
@ -227,7 +228,7 @@ class SyncLogsService(CommonService):
|
||||
|
||||
# Set metadata if available for this document
|
||||
if doc["name"] in metadata_map:
|
||||
DocumentService.update_by_id(doc["id"], {"meta_fields": metadata_map[doc["name"]]})
|
||||
DocMetadataService.update_document_metadata(doc["id"], metadata_map[doc["name"]])
|
||||
|
||||
if not auto_parse or auto_parse == "0":
|
||||
continue
|
||||
|
||||
@ -28,7 +28,7 @@ from api.db.services.file_service import FileService
|
||||
from common.constants import LLMType, ParserType, StatusEnum
|
||||
from api.db.db_models import DB, Dialog
|
||||
from api.db.services.common_service import CommonService
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.langfuse_service import TenantLangfuseService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
@ -355,7 +355,7 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
|
||||
questions = [await cross_languages(dialog.tenant_id, dialog.llm_id, questions[0], prompt_config["cross_languages"])]
|
||||
|
||||
if dialog.meta_data_filter:
|
||||
metas = DocumentService.get_meta_by_kbs(dialog.kb_ids)
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(dialog.kb_ids)
|
||||
attachments = await apply_meta_data_filter(
|
||||
dialog.meta_data_filter,
|
||||
metas,
|
||||
@ -1048,7 +1048,7 @@ async def async_ask(question, kb_ids, tenant_id, chat_llm_name=None, search_conf
|
||||
tenant_ids = list(set([kb.tenant_id for kb in kbs]))
|
||||
|
||||
if meta_data_filter:
|
||||
metas = DocumentService.get_meta_by_kbs(kb_ids)
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
|
||||
doc_ids = await apply_meta_data_filter(meta_data_filter, metas, question, chat_mdl, doc_ids)
|
||||
|
||||
kbinfos = await retriever.retrieval(
|
||||
@ -1124,7 +1124,7 @@ async def gen_mindmap(question, kb_ids, tenant_id, search_config={}):
|
||||
rerank_mdl = LLMBundle(tenant_id, LLMType.RERANK, rerank_id)
|
||||
|
||||
if meta_data_filter:
|
||||
metas = DocumentService.get_meta_by_kbs(kb_ids)
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
|
||||
doc_ids = await apply_meta_data_filter(meta_data_filter, metas, question, chat_mdl, doc_ids)
|
||||
|
||||
ranks = await settings.retriever.retrieval(
|
||||
|
||||
1073 api/db/services/doc_metadata_service.py (new file)
File diff suppressed because it is too large
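Since that file's diff is suppressed, here is a hedged sketch of the DocMetadataService surface as reconstructed from its call sites elsewhere in this commit; the method names appear in the diff, but the signatures and return shapes are assumptions:

```python
# Sketch only — reconstructed from call sites in this PR, not the actual file.
class DocMetadataService:
    @staticmethod
    def _get_doc_meta_index_name(tenant_id: str) -> str:
        # Per-tenant metadata index/table, per the PR description.
        return f"ragflow_doc_meta_{tenant_id}"

    # Read side (retrieval, chat and listing endpoints):
    #   get_document_metadata(doc_id) -> dict
    #   get_metadata_for_documents(doc_ids, kb_id) -> {doc_id: meta_fields}
    #   get_meta_by_kbs(kb_ids) / get_flatted_meta_by_kbs(kb_ids)
    #       -> {key: {value: [doc_id, ...]}} (the flattened variant expands lists)
    #   get_metadata_summary(kb_id, doc_ids) -> {key: {"type": ..., "values": [[v, n], ...]}}

    # Write side (document, sync and task pipelines):
    #   update_document_metadata(doc_id, meta_fields) -> bool
    #   batch_update_metadata(kb_id, doc_ids, updates, deletes) -> int
    #   delete_document_metadata(doc_id, skip_empty_check=False)
```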
@ -33,7 +33,7 @@ from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTena
|
||||
from api.db.db_utils import bulk_insert_into_db
|
||||
from api.db.services.common_service import CommonService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from common.metadata_utils import dedupe_list
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from common.misc_utils import get_uuid
|
||||
from common.time_utils import current_timestamp, get_format_time
|
||||
from common.constants import LLMType, ParserType, StatusEnum, TaskStatus, SVR_CONSUMER_GROUP_NAME
|
||||
@ -67,7 +67,6 @@ class DocumentService(CommonService):
|
||||
cls.model.progress_msg,
|
||||
cls.model.process_begin_at,
|
||||
cls.model.process_duration,
|
||||
cls.model.meta_fields,
|
||||
cls.model.suffix,
|
||||
cls.model.run,
|
||||
cls.model.status,
|
||||
@ -154,8 +153,11 @@ class DocumentService(CommonService):
|
||||
docs = docs.where(cls.model.type.in_(types))
|
||||
if suffix:
|
||||
docs = docs.where(cls.model.suffix.in_(suffix))
|
||||
if return_empty_metadata:
|
||||
docs = docs.where(fn.COALESCE(fn.JSON_LENGTH(cls.model.meta_fields), 0) == 0)
|
||||
|
||||
metadata_map = DocMetadataService.get_metadata_for_documents(None, kb_id)
|
||||
doc_ids_with_metadata = set(metadata_map.keys())
|
||||
if return_empty_metadata and doc_ids_with_metadata:
|
||||
docs = docs.where(cls.model.id.not_in(doc_ids_with_metadata))
|
||||
|
||||
count = docs.count()
|
||||
if desc:
|
||||
@ -166,7 +168,14 @@ class DocumentService(CommonService):
|
||||
if page_number and items_per_page:
|
||||
docs = docs.paginate(page_number, items_per_page)
|
||||
|
||||
return list(docs.dicts()), count
|
||||
docs_list = list(docs.dicts())
|
||||
if return_empty_metadata:
|
||||
for doc in docs_list:
|
||||
doc["meta_fields"] = {}
|
||||
else:
|
||||
for doc in docs_list:
|
||||
doc["meta_fields"] = metadata_map.get(doc["id"], {})
|
||||
return docs_list, count
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
@ -212,7 +221,7 @@ class DocumentService(CommonService):
|
||||
if suffix:
|
||||
query = query.where(cls.model.suffix.in_(suffix))
|
||||
|
||||
rows = query.select(cls.model.run, cls.model.suffix, cls.model.meta_fields)
|
||||
rows = query.select(cls.model.run, cls.model.suffix, cls.model.id)
|
||||
total = rows.count()
|
||||
|
||||
suffix_counter = {}
|
||||
@ -220,10 +229,18 @@ class DocumentService(CommonService):
|
||||
metadata_counter = {}
|
||||
empty_metadata_count = 0
|
||||
|
||||
doc_ids = [row.id for row in rows]
|
||||
metadata = {}
|
||||
if doc_ids:
|
||||
try:
|
||||
metadata = DocMetadataService.get_metadata_for_documents(doc_ids, kb_id)
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to fetch metadata from ES/Infinity: {e}")
|
||||
|
||||
for row in rows:
|
||||
suffix_counter[row.suffix] = suffix_counter.get(row.suffix, 0) + 1
|
||||
run_status_counter[str(row.run)] = run_status_counter.get(str(row.run), 0) + 1
|
||||
meta_fields = row.meta_fields or {}
|
||||
meta_fields = metadata.get(row.id, {})
|
||||
if not meta_fields:
|
||||
empty_metadata_count += 1
|
||||
continue
|
||||
@ -374,6 +391,12 @@ class DocumentService(CommonService):
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to delete chunks from doc store for document {doc.id}: {e}")
|
||||
|
||||
# Delete document metadata (non-critical, log and continue)
|
||||
try:
|
||||
DocMetadataService.delete_document_metadata(doc.id)
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to delete metadata for document {doc.id}: {e}")
|
||||
|
||||
# Cleanup knowledge graph references (non-critical, log and continue)
|
||||
try:
|
||||
graph_source = settings.docStoreConn.get_fields(
|
||||
@ -707,246 +730,6 @@ class DocumentService(CommonService):
|
||||
|
||||
cls.update_by_id(doc_id, info)
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def update_meta_fields(cls, doc_id, meta_fields):
|
||||
return cls.update_by_id(doc_id, {"meta_fields": meta_fields})
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_meta_by_kbs(cls, kb_ids):
|
||||
"""
|
||||
Legacy metadata aggregator (backward-compatible).
|
||||
- Does NOT expand list values and a list is kept as one string key.
|
||||
Example: {"tags": ["foo","bar"]} -> meta["tags"]["['foo', 'bar']"] = [doc_id]
|
||||
- Expects meta_fields is a dict.
|
||||
Use when existing callers rely on the old list-as-string semantics.
|
||||
"""
|
||||
fields = [
|
||||
cls.model.id,
|
||||
cls.model.meta_fields,
|
||||
]
|
||||
meta = {}
|
||||
for r in cls.model.select(*fields).where(cls.model.kb_id.in_(kb_ids)):
|
||||
doc_id = r.id
|
||||
for k,v in r.meta_fields.items():
|
||||
if k not in meta:
|
||||
meta[k] = {}
|
||||
if not isinstance(v, list):
|
||||
v = [v]
|
||||
for vv in v:
|
||||
if vv not in meta[k]:
|
||||
if isinstance(vv, list) or isinstance(vv, dict):
|
||||
continue
|
||||
meta[k][vv] = []
|
||||
meta[k][vv].append(doc_id)
|
||||
return meta
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_flatted_meta_by_kbs(cls, kb_ids):
|
||||
"""
|
||||
- Parses stringified JSON meta_fields when possible and skips non-dict or unparsable values.
|
||||
- Expands list values into individual entries.
|
||||
Example: {"tags": ["foo","bar"], "author": "alice"} ->
|
||||
meta["tags"]["foo"] = [doc_id], meta["tags"]["bar"] = [doc_id], meta["author"]["alice"] = [doc_id]
|
||||
Prefer for metadata_condition filtering and scenarios that must respect list semantics.
|
||||
"""
|
||||
fields = [
|
||||
cls.model.id,
|
||||
cls.model.meta_fields,
|
||||
]
|
||||
meta = {}
|
||||
for r in cls.model.select(*fields).where(cls.model.kb_id.in_(kb_ids)):
|
||||
doc_id = r.id
|
||||
meta_fields = r.meta_fields or {}
|
||||
if isinstance(meta_fields, str):
|
||||
try:
|
||||
meta_fields = json.loads(meta_fields)
|
||||
except Exception:
|
||||
continue
|
||||
if not isinstance(meta_fields, dict):
|
||||
continue
|
||||
for k, v in meta_fields.items():
|
||||
if k not in meta:
|
||||
meta[k] = {}
|
||||
values = v if isinstance(v, list) else [v]
|
||||
for vv in values:
|
||||
if vv is None:
|
||||
continue
|
||||
sv = str(vv)
|
||||
if sv not in meta[k]:
|
||||
meta[k][sv] = []
|
||||
meta[k][sv].append(doc_id)
|
||||
return meta
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_metadata_summary(cls, kb_id, document_ids=None):
|
||||
def _meta_value_type(value):
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, list):
|
||||
return "list"
|
||||
if isinstance(value, bool):
|
||||
return "string"
|
||||
if isinstance(value, (int, float)):
|
||||
return "number"
|
||||
if re.match(r"\d{4}\-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}", str(value)):
|
||||
return "time"
|
||||
return "string"
|
||||
|
||||
fields = [cls.model.id, cls.model.meta_fields]
|
||||
summary = {}
|
||||
type_counter = {}
|
||||
query = cls.model.select(*fields).where(cls.model.kb_id == kb_id)
|
||||
if document_ids:
|
||||
query = query.where(cls.model.id.in_(document_ids))
|
||||
for r in query:
|
||||
meta_fields = r.meta_fields or {}
|
||||
if isinstance(meta_fields, str):
|
||||
try:
|
||||
meta_fields = json.loads(meta_fields)
|
||||
except Exception:
|
||||
continue
|
||||
if not isinstance(meta_fields, dict):
|
||||
continue
|
||||
for k, v in meta_fields.items():
|
||||
value_type = _meta_value_type(v)
|
||||
if value_type:
|
||||
if k not in type_counter:
|
||||
type_counter[k] = {}
|
||||
type_counter[k][value_type] = type_counter[k].get(value_type, 0) + 1
|
||||
values = v if isinstance(v, list) else [v]
|
||||
for vv in values:
|
||||
if not vv:
|
||||
continue
|
||||
sv = str(vv)
|
||||
if k not in summary:
|
||||
summary[k] = {}
|
||||
summary[k][sv] = summary[k].get(sv, 0) + 1
|
||||
result = {}
|
||||
for k, v in summary.items():
|
||||
values = sorted([(val, cnt) for val, cnt in v.items()], key=lambda x: x[1], reverse=True)
|
||||
type_counts = type_counter.get(k, {})
|
||||
value_type = "string"
|
||||
if type_counts:
|
||||
value_type = max(type_counts.items(), key=lambda item: item[1])[0]
|
||||
result[k] = {"type": value_type, "values": values}
|
||||
return result
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def batch_update_metadata(cls, kb_id, doc_ids, updates=None, deletes=None, adds=None):
|
||||
updates = updates or []
|
||||
deletes = deletes or []
|
||||
if not doc_ids:
|
||||
return 0
|
||||
|
||||
def _normalize_meta(meta):
|
||||
if isinstance(meta, str):
|
||||
try:
|
||||
meta = json.loads(meta)
|
||||
except Exception:
|
||||
return {}
|
||||
if not isinstance(meta, dict):
|
||||
return {}
|
||||
return deepcopy(meta)
|
||||
|
||||
def _str_equal(a, b):
|
||||
return str(a) == str(b)
|
||||
|
||||
def _apply_updates(meta):
|
||||
changed = False
|
||||
for upd in updates:
|
||||
key = upd.get("key")
|
||||
if not key:
|
||||
continue
|
||||
|
||||
new_value = upd.get("value")
|
||||
match_provided = upd.get("match")
|
||||
if key not in meta:
|
||||
if match_provided:
|
||||
continue
|
||||
meta[key] = dedupe_list(new_value) if isinstance(new_value, list) else new_value
|
||||
changed = True
|
||||
continue
|
||||
|
||||
if isinstance(meta[key], list):
|
||||
if not match_provided:
|
||||
if isinstance(new_value, list):
|
||||
meta[key] = dedupe_list(new_value)
|
||||
else:
|
||||
meta[key].append(new_value)
|
||||
changed = True
|
||||
else:
|
||||
match_value = upd.get("match")
|
||||
replaced = False
|
||||
new_list = []
|
||||
for item in meta[key]:
|
||||
if _str_equal(item, match_value):
|
||||
new_list.append(new_value)
|
||||
replaced = True
|
||||
else:
|
||||
new_list.append(item)
|
||||
if replaced:
|
||||
meta[key] = dedupe_list(new_list)
|
||||
changed = True
|
||||
else:
|
||||
if not match_provided:
|
||||
meta[key] = new_value
|
||||
changed = True
|
||||
else:
|
||||
match_value = upd.get("match")
|
||||
if _str_equal(meta[key], match_value):
|
||||
meta[key] = new_value
|
||||
changed = True
|
||||
return changed
|
||||
|
||||
def _apply_deletes(meta):
|
||||
changed = False
|
||||
for d in deletes:
|
||||
key = d.get("key")
|
||||
if not key or key not in meta:
|
||||
continue
|
||||
value = d.get("value", None)
|
||||
if isinstance(meta[key], list):
|
||||
if value is None:
|
||||
del meta[key]
|
||||
changed = True
|
||||
continue
|
||||
new_list = [item for item in meta[key] if not _str_equal(item, value)]
|
||||
if len(new_list) != len(meta[key]):
|
||||
if new_list:
|
||||
meta[key] = new_list
|
||||
else:
|
||||
del meta[key]
|
||||
changed = True
|
||||
else:
|
||||
if value is None or _str_equal(meta[key], value):
|
||||
del meta[key]
|
||||
changed = True
|
||||
return changed
|
||||
|
||||
updated_docs = 0
|
||||
with DB.atomic():
|
||||
rows = cls.model.select(cls.model.id, cls.model.meta_fields).where(
|
||||
cls.model.id.in_(doc_ids)
|
||||
)
|
||||
for r in rows:
|
||||
meta = _normalize_meta(r.meta_fields or {})
|
||||
original_meta = deepcopy(meta)
|
||||
changed = _apply_updates(meta)
|
||||
changed = _apply_deletes(meta) or changed
|
||||
if changed and meta != original_meta:
|
||||
cls.model.update(
|
||||
meta_fields=meta,
|
||||
update_time=current_timestamp(),
|
||||
update_date=get_format_time()
|
||||
).where(cls.model.id == r.id).execute()
|
||||
updated_docs += 1
|
||||
return updated_docs
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def update_progress(cls):
|
||||
|
||||
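The update/delete semantics removed from DocumentService above are presumably what DocMetadataService.batch_update_metadata now implements; a small hedged example of the payload shapes, taken from the removed `_apply_updates` / `_apply_deletes` logic (the expected outcome assumes the new service behaves the same way):

```python
# Illustrative payloads mirroring the removed DocumentService logic.
updates = [
    # No "match": set or replace the key outright (lists are deduplicated).
    {"key": "era", "value": "Three Kingdoms"},
    # With "match": only entries whose current value equals "match" are replaced.
    {"key": "character", "match": "Zhu Geliang", "value": "Zhuge Liang"},
]
deletes = [
    # No "value": drop the key entirely; with "value": remove only that entry.
    {"key": "draft"},
    {"key": "achievements", "value": "obsolete item"},
]

# updated = DocMetadataService.batch_update_metadata(kb_id, doc_ids, updates, deletes)
# -> number of documents whose metadata actually changed
```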
@ -24,6 +24,7 @@ from abc import abstractmethod
|
||||
from elasticsearch import NotFoundError
|
||||
from elasticsearch_dsl import Index
|
||||
from elastic_transport import ConnectionTimeout
|
||||
from elasticsearch.client import IndicesClient
|
||||
from common.file_utils import get_project_base_directory
|
||||
from common.misc_utils import convert_bytes
|
||||
from common.doc_store.doc_store_base import DocStoreConnection, OrderByExpr, MatchExpr
|
||||
@ -128,13 +129,34 @@ class ESConnectionBase(DocStoreConnection):
|
||||
if self.index_exist(index_name, dataset_id):
|
||||
return True
|
||||
try:
|
||||
from elasticsearch.client import IndicesClient
|
||||
return IndicesClient(self.es).create(index=index_name,
|
||||
settings=self.mapping["settings"],
|
||||
mappings=self.mapping["mappings"])
|
||||
except Exception:
|
||||
self.logger.exception("ESConnection.createIndex error %s" % index_name)
|
||||
|
||||
def create_doc_meta_idx(self, index_name: str):
|
||||
"""
|
||||
Create a document metadata index.
|
||||
|
||||
Index name pattern: ragflow_doc_meta_{tenant_id}
|
||||
- Per-tenant metadata index for storing document metadata fields
|
||||
"""
|
||||
if self.index_exist(index_name, ""):
|
||||
return True
|
||||
try:
|
||||
fp_mapping = os.path.join(get_project_base_directory(), "conf", "doc_meta_es_mapping.json")
|
||||
if not os.path.exists(fp_mapping):
|
||||
self.logger.error(f"Document metadata mapping file not found at {fp_mapping}")
|
||||
return False
|
||||
|
||||
doc_meta_mapping = json.load(open(fp_mapping, "r"))
|
||||
return IndicesClient(self.es).create(index=index_name,
|
||||
settings=doc_meta_mapping["settings"],
|
||||
mappings=doc_meta_mapping["mappings"])
|
||||
except Exception as e:
|
||||
self.logger.exception(f"Error creating document metadata index {index_name}: {e}")
|
||||
|
||||
def delete_idx(self, index_name: str, dataset_id: str):
|
||||
if len(dataset_id) > 0:
|
||||
# The index needs to be alive after any kb deletion since all kbs under this tenant are in one index.
|
||||
|
||||
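For orientation, a hedged sketch of how a record in this per-tenant index might be written and read back with elasticsearch-py, consistent with the id/kb_id/meta_fields mapping added in `conf/doc_meta_es_mapping.json`; the real read/write paths live in doc_metadata_service.py and may differ:

```python
from elasticsearch import Elasticsearch

# Assumptions: a reachable ES node and a sample tenant id; not the service's actual code.
es = Elasticsearch("http://localhost:9200")
index_name = "ragflow_doc_meta_tenant_0"

# Upsert one metadata record, keyed by the document id.
es.index(index=index_name, id="doc_1", document={
    "id": "doc_1",
    "kb_id": "kb_1",
    "meta_fields": {"character": "Zhuge Liang", "era": "Three Kingdoms"},
})

# Fetch all metadata records for one knowledge base.
resp = es.search(index=index_name, query={"term": {"kb_id": "kb_1"}}, size=1000)
metas = {hit["_source"]["id"]: hit["_source"]["meta_fields"]
         for hit in resp["hits"]["hits"]}
```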
@ -285,8 +285,65 @@ class InfinityConnectionBase(DocStoreConnection):
|
||||
self.logger.info(f"INFINITY created table {table_name}, vector size {vector_size}")
|
||||
return True
|
||||
|
||||
def create_doc_meta_idx(self, index_name: str):
|
||||
"""
|
||||
Create a document metadata table.
|
||||
|
||||
Table name pattern: ragflow_doc_meta_{tenant_id}
|
||||
- Per-tenant metadata table for storing document metadata fields
|
||||
"""
|
||||
table_name = index_name
|
||||
inf_conn = self.connPool.get_conn()
|
||||
inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
|
||||
try:
|
||||
fp_mapping = os.path.join(get_project_base_directory(), "conf", "doc_meta_infinity_mapping.json")
|
||||
if not os.path.exists(fp_mapping):
|
||||
self.logger.error(f"Document metadata mapping file not found at {fp_mapping}")
|
||||
return False
|
||||
schema = json.load(open(fp_mapping))
|
||||
inf_db.create_table(
|
||||
table_name,
|
||||
schema,
|
||||
ConflictType.Ignore,
|
||||
)
|
||||
|
||||
# Create secondary indexes on id and kb_id for better query performance
|
||||
inf_table = inf_db.get_table(table_name)
|
||||
|
||||
try:
|
||||
inf_table.create_index(
|
||||
f"idx_{table_name}_id",
|
||||
IndexInfo("id", IndexType.Secondary),
|
||||
ConflictType.Ignore,
|
||||
)
|
||||
self.logger.debug(f"INFINITY created secondary index on id for table {table_name}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to create index on id for {table_name}: {e}")
|
||||
|
||||
try:
|
||||
inf_table.create_index(
|
||||
f"idx_{table_name}_kb_id",
|
||||
IndexInfo("kb_id", IndexType.Secondary),
|
||||
ConflictType.Ignore,
|
||||
)
|
||||
self.logger.debug(f"INFINITY created secondary index on kb_id for table {table_name}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to create index on kb_id for {table_name}: {e}")
|
||||
|
||||
self.connPool.release_conn(inf_conn)
|
||||
self.logger.debug(f"INFINITY created document metadata table {table_name} with secondary indexes")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.connPool.release_conn(inf_conn)
|
||||
self.logger.exception(f"Error creating document metadata table {table_name}: {e}")
|
||||
return False
|
||||
|
||||
def delete_idx(self, index_name: str, dataset_id: str):
|
||||
table_name = f"{index_name}_{dataset_id}"
|
||||
if index_name.startswith("ragflow_doc_meta_"):
|
||||
table_name = index_name
|
||||
else:
|
||||
table_name = f"{index_name}_{dataset_id}"
|
||||
inf_conn = self.connPool.get_conn()
|
||||
db_instance = inf_conn.get_database(self.dbName)
|
||||
db_instance.drop_table(table_name, ConflictType.Ignore)
|
||||
@ -294,7 +351,10 @@ class InfinityConnectionBase(DocStoreConnection):
|
||||
self.logger.info(f"INFINITY dropped table {table_name}")
|
||||
|
||||
def index_exist(self, index_name: str, dataset_id: str) -> bool:
|
||||
table_name = f"{index_name}_{dataset_id}"
|
||||
if index_name.startswith("ragflow_doc_meta_"):
|
||||
table_name = index_name
|
||||
else:
|
||||
table_name = f"{index_name}_{dataset_id}"
|
||||
try:
|
||||
inf_conn = self.connPool.get_conn()
|
||||
db_instance = inf_conn.get_database(self.dbName)
|
||||
@ -341,7 +401,10 @@ class InfinityConnectionBase(DocStoreConnection):
|
||||
def delete(self, condition: dict, index_name: str, dataset_id: str) -> int:
|
||||
inf_conn = self.connPool.get_conn()
|
||||
db_instance = inf_conn.get_database(self.dbName)
|
||||
table_name = f"{index_name}_{dataset_id}"
|
||||
if index_name.startswith("ragflow_doc_meta_"):
|
||||
table_name = index_name
|
||||
else:
|
||||
table_name = f"{index_name}_{dataset_id}"
|
||||
try:
|
||||
table_instance = db_instance.get_table(table_name)
|
||||
except Exception:
|
||||
|
||||
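The Infinity changes above repeat the same prefix check in delete_idx, index_exist, delete, insert and update; a hedged refactoring sketch (the helper name is hypothetical and not part of this commit):

```python
def _resolve_table_name(index_name: str, dataset_id: str) -> str:
    # Chunk tables are per-KB ("{index}_{kb_id}"); the per-tenant metadata
    # table keeps its bare name ("ragflow_doc_meta_{tenant_id}").
    if index_name.startswith("ragflow_doc_meta_"):
        return index_name
    return f"{index_name}_{dataset_id}"

# e.g. table_name = _resolve_table_name(index_name, dataset_id)
```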
@ -106,10 +106,10 @@ def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
|
||||
else:
|
||||
if logic == "and":
|
||||
doc_ids = doc_ids & set(ids)
|
||||
if not doc_ids:
|
||||
return []
|
||||
else:
|
||||
doc_ids = doc_ids | set(ids)
|
||||
if not doc_ids:
|
||||
return []
|
||||
return list(doc_ids)
|
||||
|
||||
|
||||
|
||||
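For reference, a hedged example of how callers in this PR combine the flattened metadata with convert_conditions and meta_filter; the call pattern mirrors list_docs and the retrieval endpoints above, while the sample values are invented:

```python
from common.metadata_utils import meta_filter, convert_conditions

# Flattened shape produced by DocMetadataService.get_flatted_meta_by_kbs:
# {key: {stringified_value: [doc_id, ...]}}
metas = {
    "character": {"Zhuge Liang": ["doc_1"], "Cao Cao": ["doc_2"]},
    "era": {"Three Kingdoms": ["doc_1"], "Late Eastern Han": ["doc_2"]},
}

metadata_condition = {
    "logic": "and",
    "conditions": [
        {"name": "character", "comparison_operator": "is", "value": "Zhuge Liang"},
    ],
}

doc_ids = meta_filter(
    metas,
    convert_conditions(metadata_condition),
    metadata_condition.get("logic", "and"),
)
# expected: ["doc_1"]
```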
29 conf/doc_meta_es_mapping.json (new file)
@ -0,0 +1,29 @@
{
  "settings": {
    "index": {
      "number_of_shards": 2,
      "number_of_replicas": 0,
      "refresh_interval": "1000ms"
    }
  },
  "mappings": {
    "_source": {
      "enabled": true
    },
    "dynamic": "runtime",
    "properties": {
      "id": {
        "type": "keyword",
        "store": true
      },
      "kb_id": {
        "type": "keyword",
        "store": true
      },
      "meta_fields": {
        "type": "object",
        "dynamic": true
      }
    }
  }
}
5 conf/doc_meta_infinity_mapping.json (new file)
@ -0,0 +1,5 @@
{
  "id": {"type": "varchar", "default": ""},
  "kb_id": {"type": "varchar", "default": ""},
  "meta_fields": {"type": "json", "default": "{}"}
}
@ -98,6 +98,7 @@ def message_fit_in(msg, max_length=4000):
|
||||
|
||||
def kb_prompt(kbinfos, max_tokens, hash_id=False):
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
|
||||
knowledges = [get_value(ck, "content", "content_with_weight") for ck in kbinfos["chunks"]]
|
||||
kwlg_len = len(knowledges)
|
||||
@ -114,7 +115,12 @@ def kb_prompt(kbinfos, max_tokens, hash_id=False):
|
||||
break
|
||||
|
||||
docs = DocumentService.get_by_ids([get_value(ck, "doc_id", "document_id") for ck in kbinfos["chunks"][:chunks_num]])
|
||||
docs = {d.id: d.meta_fields for d in docs}
|
||||
|
||||
docs_with_meta = {}
|
||||
for d in docs:
|
||||
meta = DocMetadataService.get_document_metadata(d.id)
|
||||
docs_with_meta[d.id] = meta if meta else {}
|
||||
docs = docs_with_meta
|
||||
|
||||
def draw_node(k, line):
|
||||
if line is not None and not isinstance(line, str):
|
||||
|
||||
@ -61,6 +61,7 @@ import numpy as np
|
||||
from peewee import DoesNotExist
|
||||
from common.constants import LLMType, ParserType, PipelineTaskType
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.db.services.task_service import TaskService, has_canceled, CANVAS_DEBUG_DOC_ID, GRAPH_RAPTOR_FAKE_DOC_ID
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
@ -438,12 +439,10 @@ async def build_chunks(task, progress_callback):
|
||||
metadata = update_metadata_to(metadata, doc["metadata_obj"])
|
||||
del doc["metadata_obj"]
|
||||
if metadata:
|
||||
e, doc = DocumentService.get_by_id(task["doc_id"])
|
||||
if e:
|
||||
if isinstance(doc.meta_fields, str):
|
||||
doc.meta_fields = json.loads(doc.meta_fields)
|
||||
metadata = update_metadata_to(metadata, doc.meta_fields)
|
||||
DocumentService.update_by_id(task["doc_id"], {"meta_fields": metadata})
|
||||
existing_meta = DocMetadataService.get_document_metadata(task["doc_id"])
|
||||
existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
|
||||
metadata = update_metadata_to(metadata, existing_meta)
|
||||
DocMetadataService.update_document_metadata(task["doc_id"], metadata)
|
||||
progress_callback(msg="Question generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st))
|
||||
|
||||
if task["kb_parser_config"].get("tag_kb_ids", []):
|
||||
@ -735,12 +734,10 @@ async def run_dataflow(task: dict):
|
||||
del ck["positions"]
|
||||
|
||||
if metadata:
|
||||
e, doc = DocumentService.get_by_id(doc_id)
|
||||
if e:
|
||||
if isinstance(doc.meta_fields, str):
|
||||
doc.meta_fields = json.loads(doc.meta_fields)
|
||||
metadata = update_metadata_to(metadata, doc.meta_fields)
|
||||
DocumentService.update_by_id(doc_id, {"meta_fields": metadata})
|
||||
existing_meta = DocMetadataService.get_document_metadata(doc_id)
|
||||
existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
|
||||
metadata = update_metadata_to(metadata, existing_meta)
|
||||
DocMetadataService.update_document_metadata(doc_id, metadata)
|
||||
|
||||
start_ts = timer()
|
||||
set_progress(task_id, prog=0.82, msg="[DOC Engine]:\nStart to index...")
|
||||
|
||||
@ -162,7 +162,11 @@ class ESConnection(ESConnectionBase):
|
||||
self._connect()
|
||||
continue
|
||||
except Exception as e:
|
||||
self.logger.exception(f"ESConnection.search {str(index_names)} query: " + str(q) + str(e))
|
||||
# Only log at debug level for NotFoundError (expected when the metadata index doesn't exist)
|
||||
if 'NotFound' in str(e):
|
||||
self.logger.debug(f"ESConnection.search {str(index_names)} query: " + str(q) + " - " + str(e))
|
||||
else:
|
||||
self.logger.exception(f"ESConnection.search {str(index_names)} query: " + str(q) + str(e))
|
||||
raise e
|
||||
|
||||
self.logger.error(f"ESConnection.search timeout for {ATTEMPT_TIME} times!")
|
||||
|
||||
@ -149,8 +149,11 @@ class InfinityConnection(InfinityConnectionBase):
|
||||
if condition:
|
||||
table_found = False
|
||||
for indexName in index_names:
|
||||
for kb_id in knowledgebase_ids:
|
||||
table_name = f"{indexName}_{kb_id}"
|
||||
if indexName.startswith("ragflow_doc_meta_"):
|
||||
table_names_to_search = [indexName]
|
||||
else:
|
||||
table_names_to_search = [f"{indexName}_{kb_id}" for kb_id in knowledgebase_ids]
|
||||
for table_name in table_names_to_search:
|
||||
try:
|
||||
filter_cond = self.equivalent_condition_to_str(condition, db_instance.get_table(table_name))
|
||||
table_found = True
|
||||
@ -221,8 +224,11 @@ class InfinityConnection(InfinityConnectionBase):
|
||||
total_hits_count = 0
|
||||
# Scatter search tables and gather the results
|
||||
for indexName in index_names:
|
||||
for knowledgebaseId in knowledgebase_ids:
|
||||
table_name = f"{indexName}_{knowledgebaseId}"
|
||||
if indexName.startswith("ragflow_doc_meta_"):
|
||||
table_names_to_search = [indexName]
|
||||
else:
|
||||
table_names_to_search = [f"{indexName}_{kb_id}" for kb_id in knowledgebase_ids]
|
||||
for table_name in table_names_to_search:
|
||||
try:
|
||||
table_instance = db_instance.get_table(table_name)
|
||||
except Exception:
|
||||
@ -276,8 +282,11 @@ class InfinityConnection(InfinityConnectionBase):
|
||||
df_list = list()
|
||||
assert isinstance(knowledgebase_ids, list)
|
||||
table_list = list()
|
||||
for knowledgebaseId in knowledgebase_ids:
|
||||
table_name = f"{index_name}_{knowledgebaseId}"
|
||||
if index_name.startswith("ragflow_doc_meta_"):
|
||||
table_names_to_search = [index_name]
|
||||
else:
|
||||
table_names_to_search = [f"{index_name}_{kb_id}" for kb_id in knowledgebase_ids]
|
||||
for table_name in table_names_to_search:
|
||||
table_list.append(table_name)
|
||||
try:
|
||||
table_instance = db_instance.get_table(table_name)
|
||||
@ -301,7 +310,10 @@ class InfinityConnection(InfinityConnectionBase):
|
||||
def insert(self, documents: list[dict], index_name: str, knowledgebase_id: str = None) -> list[str]:
|
||||
inf_conn = self.connPool.get_conn()
|
||||
db_instance = inf_conn.get_database(self.dbName)
|
||||
table_name = f"{index_name}_{knowledgebase_id}"
|
||||
if index_name.startswith("ragflow_doc_meta_"):
|
||||
table_name = index_name
|
||||
else:
|
||||
table_name = f"{index_name}_{knowledgebase_id}"
|
||||
try:
|
||||
table_instance = db_instance.get_table(table_name)
|
||||
except InfinityException as e:
|
||||
@ -405,6 +417,11 @@ class InfinityConnection(InfinityConnectionBase):
|
||||
elif k in ["page_num_int", "top_int"]:
|
||||
assert isinstance(v, list)
|
||||
d[k] = "_".join(f"{num:08x}" for num in v)
|
||||
elif k == "meta_fields":
|
||||
if isinstance(v, dict):
|
||||
d[k] = json.dumps(v, ensure_ascii=False)
|
||||
else:
|
||||
d[k] = v if v else "{}"
|
||||
else:
|
||||
d[k] = v
|
||||
for k in ["docnm_kwd", "title_tks", "title_sm_tks", "important_kwd", "important_tks", "content_with_weight",
|
||||
@ -434,7 +451,10 @@ class InfinityConnection(InfinityConnectionBase):
|
||||
# logger.info(f"update position_int: {newValue['position_int']}")
|
||||
inf_conn = self.connPool.get_conn()
|
||||
db_instance = inf_conn.get_database(self.dbName)
|
||||
table_name = f"{index_name}_{knowledgebase_id}"
|
||||
if index_name.startswith("ragflow_doc_meta_"):
|
||||
table_name = index_name
|
||||
else:
|
||||
table_name = f"{index_name}_{knowledgebase_id}"
|
||||
table_instance = db_instance.get_table(table_name)
|
||||
# if "exists" in condition:
|
||||
# del condition["exists"]
|
||||
|
||||
@ -0,0 +1,184 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
"""
|
||||
End-to-end test for metadata filtering during retrieval.
|
||||
|
||||
Tests that chunks are only retrieved from documents matching the metadata condition.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import logging
|
||||
from common import (
|
||||
create_dataset,
|
||||
delete_datasets,
|
||||
list_documents,
|
||||
update_document,
|
||||
)
|
||||
from utils import wait_for
|
||||
|
||||
|
||||
@wait_for(30, 1, "Document parsing timeout")
|
||||
def _condition_parsing_complete(_auth, dataset_id):
|
||||
res = list_documents(_auth, dataset_id)
|
||||
if res["code"] != 0:
|
||||
return False
|
||||
|
||||
for doc in res["data"]["docs"]:
|
||||
status = doc.get("run", "UNKNOWN")
|
||||
if status == "FAILED":
|
||||
pytest.fail(f"Document parsing failed: {doc}")
|
||||
return False
|
||||
if status != "DONE":
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def add_dataset_with_metadata(HttpApiAuth):
|
||||
# First create the dataset
|
||||
res = create_dataset(HttpApiAuth, {
|
||||
"name": f"test_metadata_{int(__import__('time').time())}",
|
||||
"chunk_method": "naive"
|
||||
})
|
||||
|
||||
assert res["code"] == 0, f"Failed to create dataset: {res}"
|
||||
dataset_id = res["data"]["id"]
|
||||
|
||||
# Then configure metadata via the update_metadata_setting endpoint
|
||||
import requests
|
||||
from configs import HOST_ADDRESS, VERSION
|
||||
|
||||
metadata_config = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"character": {
|
||||
"description": "Historical figure name",
|
||||
"type": "string"
|
||||
},
|
||||
"era": {
|
||||
"description": "Historical era",
|
||||
"type": "string"
|
||||
},
|
||||
"achievements": {
|
||||
"description": "Major achievements",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
res = requests.post(
|
||||
url=f"{HOST_ADDRESS}/{VERSION}/kb/update_metadata_setting",
|
||||
headers={"Content-Type": "application/json"},
|
||||
auth=HttpApiAuth,
|
||||
json={
|
||||
"kb_id": dataset_id,
|
||||
"metadata": metadata_config,
|
||||
"enable_metadata": False
|
||||
}
|
||||
).json()
|
||||
|
||||
assert res["code"] == 0, f"Failed to configure metadata: {res}"
|
||||
|
||||
yield dataset_id
|
||||
|
||||
# Cleanup
|
||||
delete_datasets(HttpApiAuth, {"ids": [dataset_id]})
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
class TestMetadataWithRetrieval:
|
||||
"""Test retrieval with metadata filtering."""
|
||||
|
||||
def test_retrieval_with_metadata_filter(self, HttpApiAuth, add_dataset_with_metadata, tmp_path):
|
||||
"""
|
||||
Test that retrieval respects metadata filters.
|
||||
|
||||
Verifies that chunks are only retrieved from documents matching the metadata condition.
|
||||
"""
|
||||
from common import upload_documents, parse_documents, retrieval_chunks
|
||||
|
||||
dataset_id = add_dataset_with_metadata
|
||||
|
||||
# Create two documents with different metadata
|
||||
content_doc1 = "Document about Zhuge Liang who lived in Three Kingdoms period."
|
||||
content_doc2 = "Document about Cao Cao who lived in Late Eastern Han Dynasty."
|
||||
|
||||
fp1 = tmp_path / "doc1_zhuge_liang.txt"
|
||||
fp2 = tmp_path / "doc2_cao_cao.txt"
|
||||
|
||||
with open(fp1, "w", encoding="utf-8") as f:
|
||||
f.write(content_doc1)
|
||||
with open(fp2, "w", encoding="utf-8") as f:
|
||||
f.write(content_doc2)
|
||||
|
||||
# Upload both documents
|
||||
res = upload_documents(HttpApiAuth, dataset_id, [fp1, fp2])
|
||||
assert res["code"] == 0, f"Failed to upload documents: {res}"
|
||||
|
||||
doc1_id = res["data"][0]["id"]
|
||||
doc2_id = res["data"][1]["id"]
|
||||
|
||||
# Add different metadata to each document
|
||||
res = update_document(HttpApiAuth, dataset_id, doc1_id, {
|
||||
"meta_fields": {"character": "Zhuge Liang", "era": "Three Kingdoms"}
|
||||
})
|
||||
assert res["code"] == 0, f"Failed to update doc1 metadata: {res}"
|
||||
|
||||
res = update_document(HttpApiAuth, dataset_id, doc2_id, {
|
||||
"meta_fields": {"character": "Cao Cao", "era": "Late Eastern Han"}
|
||||
})
|
||||
assert res["code"] == 0, f"Failed to update doc2 metadata: {res}"
|
||||
|
||||
# Parse both documents
|
||||
res = parse_documents(HttpApiAuth, dataset_id, {"document_ids": [doc1_id, doc2_id]})
|
||||
assert res["code"] == 0, f"Failed to trigger parsing: {res}"
|
||||
|
||||
# Wait for parsing to complete
|
||||
assert _condition_parsing_complete(HttpApiAuth, dataset_id), "Parsing timeout"
|
||||
|
||||
# Test retrieval WITH metadata filter for "Zhuge Liang"
|
||||
res = retrieval_chunks(HttpApiAuth, {
|
||||
"question": "Zhuge Liang",
|
||||
"dataset_ids": [dataset_id],
|
||||
"metadata_condition": {
|
||||
"logic": "and",
|
||||
"conditions": [
|
||||
{
|
||||
"name": "character",
|
||||
"comparison_operator": "is",
|
||||
"value": "Zhuge Liang"
|
||||
}
|
||||
]
|
||||
}
|
||||
})
|
||||
assert res["code"] == 0, f"Retrieval with metadata filter failed: {res}"
|
||||
|
||||
chunks_with_filter = res["data"]["chunks"]
|
||||
doc_ids_with_filter = set(chunk.get("document_id", "") for chunk in chunks_with_filter)
|
||||
|
||||
logging.info(f"✓ Retrieved {len(chunks_with_filter)} chunks from documents: {doc_ids_with_filter}")
|
||||
|
||||
# Verify that filtered results only contain doc1 (Zhuge Liang)
|
||||
if len(chunks_with_filter) > 0:
|
||||
assert doc1_id in doc_ids_with_filter, f"Filtered results should contain doc1 (Zhuge Liang), but got: {doc_ids_with_filter}"
|
||||
assert doc2_id not in doc_ids_with_filter, f"Filtered results should NOT contain doc2 (Cao Cao), but got: {doc_ids_with_filter}"
|
||||
logging.info("Metadata filter correctly excluded chunks from non-matching documents")
|
||||
else:
|
||||
logging.warning("No chunks retrieved with filter - this might be due to embedding model not configured")
|
||||
@ -22,7 +22,9 @@
|
||||
|
||||
def _summary_to_counts(summary):
|
||||
counts = {}
|
||||
for key, pairs in summary.items():
|
||||
for key, field_data in summary.items():
|
||||
# New format: {key: {"type": "...", "values": [[value, count], ...]}}
|
||||
pairs = field_data["values"]
|
||||
counts[key] = {str(k): v for k, v in pairs}
|
||||
return counts
|
||||
|
||||
|