Put document metadata in ES/Infinity (#12826)

### What problem does this PR solve?

Put document metadata in ES/Infinity.

Name of the metadata index: `ragflow_doc_meta_{tenant_id}`
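
A rough sketch of what one record in that per-tenant index/table holds, based on the mapping files added in this commit (the IDs and field values below are made up for illustration):

```python
# Index/table: f"ragflow_doc_meta_{tenant_id}" -- one record per document that has metadata.
doc_meta_record = {
    "id": "d1f2...",       # document id (illustrative)
    "kb_id": "kb42...",    # dataset / knowledge base id (illustrative)
    "meta_fields": {        # free-form metadata: dynamic object in ES, JSON column in Infinity
        "character": "Zhuge Liang",
        "era": "Three Kingdoms",
    },
}
```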

### Type of change

- [x] Refactoring
qinling0210 committed 2026-01-28 13:29:34 +08:00 (committed by GitHub)
parent fd11aca8e5
commit 9a5208976c
24 changed files with 1529 additions and 304 deletions

View File

@@ -21,7 +21,7 @@ import re
from abc import ABC
from agent.tools.base import ToolParamBase, ToolBase, ToolMeta
from common.constants import LLMType
- from api.db.services.document_service import DocumentService
+ from api.db.services.doc_metadata_service import DocMetadataService
from common.metadata_utils import apply_meta_data_filter
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
@@ -125,7 +125,7 @@ class Retrieval(ToolBase, ABC):
doc_ids = []
if self._param.meta_data_filter != {}:
- metas = DocumentService.get_meta_by_kbs(kb_ids)
+ metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
def _resolve_manual_filter(flt: dict) -> dict:
pat = re.compile(self.variable_ref_patt)

View File

@@ -22,6 +22,7 @@ import xxhash
from quart import request
from api.db.services.document_service import DocumentService
+ from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from common.metadata_utils import apply_meta_data_filter
@@ -381,7 +382,7 @@ async def retrieval_test():
chat_mdl = LLMBundle(user_id, LLMType.CHAT)
if meta_data_filter:
- metas = DocumentService.get_meta_by_kbs(kb_ids)
+ metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
local_doc_ids = await apply_meta_data_filter(meta_data_filter, metas, question, chat_mdl, local_doc_ids)
tenants = UserTenantService.query(user_id=user_id)

View File

@@ -26,6 +26,7 @@ from api.db import VALID_FILE_TYPES, FileType
from api.db.db_models import Task
from api.db.services import duplicate_name
from api.db.services.document_service import DocumentService, doc_upload_and_parse
+ from api.db.services.doc_metadata_service import DocMetadataService
from common.metadata_utils import meta_filter, convert_conditions, turn2jsonschema
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
@@ -281,7 +282,7 @@ async def list_docs():
doc_ids_filter = None
metas = None
if metadata_condition or metadata:
- metas = DocumentService.get_flatted_meta_by_kbs([kb_id])
+ metas = DocMetadataService.get_flatted_meta_by_kbs([kb_id])
if metadata_condition:
doc_ids_filter = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
@@ -401,7 +402,11 @@ async def doc_infos():
if not DocumentService.accessible(doc_id, current_user.id):
return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)
docs = DocumentService.get_by_ids(doc_ids)
- return get_json_result(data=list(docs.dicts()))
+ docs_list = list(docs.dicts())
+ # Add meta_fields for each document
+ for doc in docs_list:
+ doc["meta_fields"] = DocMetadataService.get_document_metadata(doc["id"])
+ return get_json_result(data=docs_list)
@manager.route("/metadata/summary", methods=["POST"]) # noqa: F821
@@ -421,7 +426,7 @@ async def metadata_summary():
return get_json_result(data=False, message="Only owner of dataset authorized for this operation.", code=RetCode.OPERATING_ERROR)
try:
- summary = DocumentService.get_metadata_summary(kb_id, doc_ids)
+ summary = DocMetadataService.get_metadata_summary(kb_id, doc_ids)
return get_json_result(data={"summary": summary})
except Exception as e:
return server_error_response(e)
@@ -432,10 +437,14 @@ async def metadata_summary():
@validate_request("doc_ids")
async def metadata_update():
req = await get_request_json()
+ kb_id = req.get("kb_id")
document_ids = req.get("doc_ids")
updates = req.get("updates", []) or []
deletes = req.get("deletes", []) or []
+ if not kb_id:
+ return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)
if not isinstance(updates, list) or not isinstance(deletes, list):
return get_json_result(data=False, message="updates and deletes must be lists.", code=RetCode.ARGUMENT_ERROR)
@@ -446,8 +455,8 @@ async def metadata_update():
if not isinstance(d, dict) or not d.get("key"):
return get_json_result(data=False, message="Each delete requires key.", code=RetCode.ARGUMENT_ERROR)
- updated = DocumentService.batch_update_metadata(None, document_ids, updates, deletes)
- return get_json_result(data={"updated": updated})
+ updated = DocMetadataService.batch_update_metadata(kb_id, document_ids, updates, deletes)
+ return get_json_result(data={"updated": updated, "matched_docs": len(document_ids)})
@manager.route("/update_metadata_setting", methods=["POST"]) # noqa: F821
@@ -905,7 +914,7 @@ async def set_meta():
if not e:
return get_data_error_result(message="Document not found!")
- if not DocumentService.update_by_id(req["doc_id"], {"meta_fields": meta}):
+ if not DocMetadataService.update_document_metadata(req["doc_id"], meta):
return get_data_error_result(message="Database error (meta updates)!")
return get_json_result(data=True)
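
For reference, a hedged sketch of the request body the reworked `metadata_update` route now expects (`kb_id` is newly required; key names and values below are illustrative, and `match`/`value` are optional, mirroring the validation above):

```python
payload = {
    "kb_id": "kb42...",                 # now mandatory: identifies the dataset whose metadata index is updated
    "doc_ids": ["d1f2...", "d3c4..."],
    "updates": [
        {"key": "era", "value": "Three Kingdoms"},                          # set or replace a field
        {"key": "character", "match": "Cao Cao", "value": "Zhuge Liang"},   # replace only matching values
    ],
    "deletes": [
        {"key": "obsolete_tag"},                                            # drop a key entirely
    ],
}
# Response shape per the diff above: {"updated": <n>, "matched_docs": len(doc_ids)}
```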

View File

@@ -25,6 +25,7 @@ import numpy as np
from api.db.services.connector_service import Connector2KbService
from api.db.services.llm_service import LLMBundle
from api.db.services.document_service import DocumentService, queue_raptor_o_graphrag_tasks
+ from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
@@ -467,7 +468,7 @@ def get_meta():
message='No authorization.',
code=RetCode.AUTHENTICATION_ERROR
)
- return get_json_result(data=DocumentService.get_meta_by_kbs(kb_ids))
+ return get_json_result(data=DocMetadataService.get_flatted_meta_by_kbs(kb_ids))
@manager.route("/basic_info", methods=["GET"]) # noqa: F821

View File

@@ -18,6 +18,7 @@ import logging
from quart import jsonify
from api.db.services.document_service import DocumentService
+ from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from common.metadata_utils import meta_filter, convert_conditions
@@ -121,7 +122,7 @@ async def retrieval(tenant_id):
similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0))
top = int(retrieval_setting.get("top_k", 1024))
metadata_condition = req.get("metadata_condition", {}) or {}
- metas = DocumentService.get_meta_by_kbs([kb_id])
+ metas = DocMetadataService.get_meta_by_kbs([kb_id])
doc_ids = []
try:

View File

@@ -29,6 +29,7 @@ from api.constants import FILE_NAME_LEN_LIMIT
from api.db import FileType
from api.db.db_models import File, Task
from api.db.services.document_service import DocumentService
+ from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
@@ -255,7 +256,8 @@ async def update_doc(tenant_id, dataset_id, document_id):
if "meta_fields" in req:
if not isinstance(req["meta_fields"], dict):
return get_error_data_result(message="meta_fields must be a dictionary")
- DocumentService.update_meta_fields(document_id, req["meta_fields"])
+ if not DocMetadataService.update_document_metadata(document_id, req["meta_fields"]):
+ return get_error_data_result(message="Failed to update metadata")
if "name" in req and req["name"] != doc.name:
if len(req["name"].encode("utf-8")) > FILE_NAME_LEN_LIMIT:
@@ -568,7 +570,7 @@ def list_docs(dataset_id, tenant_id):
doc_ids_filter = None
if metadata_condition:
- metas = DocumentService.get_flatted_meta_by_kbs([dataset_id])
+ metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id])
doc_ids_filter = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
if metadata_condition.get("conditions") and not doc_ids_filter:
return get_result(data={"total": 0, "docs": []})
@@ -611,7 +613,7 @@ async def metadata_summary(dataset_id, tenant_id):
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
req = await get_request_json()
try:
- summary = DocumentService.get_metadata_summary(dataset_id, req.get("doc_ids"))
+ summary = DocMetadataService.get_metadata_summary(dataset_id, req.get("doc_ids"))
return get_result(data={"summary": summary})
except Exception as e:
return server_error_response(e)
@@ -657,14 +659,14 @@ async def metadata_batch_update(dataset_id, tenant_id):
target_doc_ids = set(document_ids)
if metadata_condition:
- metas = DocumentService.get_flatted_meta_by_kbs([dataset_id])
+ metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id])
filtered_ids = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
target_doc_ids = target_doc_ids & filtered_ids
if metadata_condition.get("conditions") and not target_doc_ids:
return get_result(data={"updated": 0, "matched_docs": 0})
target_doc_ids = list(target_doc_ids)
- updated = DocumentService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
+ updated = DocMetadataService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)})
@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821
@@ -1534,7 +1536,7 @@ async def retrieval_test(tenant_id):
if not doc_ids:
metadata_condition = req.get("metadata_condition")
if metadata_condition:
- metas = DocumentService.get_meta_by_kbs(kb_ids)
+ metas = DocMetadataService.get_meta_by_kbs(kb_ids)
doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
# If metadata_condition has conditions but no docs match, return empty result
if not doc_ids and metadata_condition.get("conditions"):

View File

@@ -35,7 +35,7 @@ from api.db.services.conversation_service import ConversationService
from api.db.services.conversation_service import async_iframe_completion as iframe_completion
from api.db.services.conversation_service import async_completion as rag_completion
from api.db.services.dialog_service import DialogService, async_ask, async_chat, gen_mindmap
- from api.db.services.document_service import DocumentService
+ from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from common.metadata_utils import apply_meta_data_filter, convert_conditions, meta_filter
@@ -147,7 +147,7 @@ async def chat_completion(tenant_id, chat_id):
return get_error_data_result(message="metadata_condition must be an object.")
if metadata_condition and req.get("question"):
- metas = DocumentService.get_meta_by_kbs(dia.kb_ids or [])
+ metas = DocMetadataService.get_flatted_meta_by_kbs(dia.kb_ids or [])
filtered_doc_ids = meta_filter(
metas,
convert_conditions(metadata_condition),
@@ -279,7 +279,7 @@ async def chat_completion_openai_like(tenant_id, chat_id):
doc_ids_str = None
if metadata_condition:
- metas = DocumentService.get_meta_by_kbs(dia.kb_ids or [])
+ metas = DocMetadataService.get_flatted_meta_by_kbs(dia.kb_ids or [])
filtered_doc_ids = meta_filter(
metas,
convert_conditions(metadata_condition),
@@ -1084,7 +1084,7 @@ async def retrieval_test_embedded():
chat_mdl = LLMBundle(tenant_id, LLMType.CHAT)
if meta_data_filter:
- metas = DocumentService.get_meta_by_kbs(kb_ids)
+ metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
local_doc_ids = await apply_meta_data_filter(meta_data_filter, metas, _question, chat_mdl, local_doc_ids)
tenants = UserTenantService.query(user_id=tenant_id)

View File

@@ -795,7 +795,6 @@ class Document(DataBaseModel):
progress_msg = TextField(null=True, help_text="process message", default="")
process_begin_at = DateTimeField(null=True, index=True)
process_duration = FloatField(default=0)
- meta_fields = JSONField(null=True, default={})
suffix = CharField(max_length=32, null=False, help_text="The real file extension suffix", index=True)
run = CharField(max_length=1, null=True, help_text="start to run processing or cancel.(1: run it; 2: cancel)", default="0", index=True)
@@ -1267,7 +1266,6 @@ def migrate_db():
alter_db_add_column(migrator, "task", "digest", TextField(null=True, help_text="task digest", default=""))
alter_db_add_column(migrator, "task", "chunk_ids", LongTextField(null=True, help_text="chunk ids", default=""))
alter_db_add_column(migrator, "conversation", "user_id", CharField(max_length=255, null=True, help_text="user_id", index=True))
- alter_db_add_column(migrator, "document", "meta_fields", JSONField(null=True, default={}))
alter_db_add_column(migrator, "task", "task_type", CharField(max_length=32, null=False, default=""))
alter_db_add_column(migrator, "task", "priority", IntegerField(default=0))
alter_db_add_column(migrator, "user_canvas", "permission", CharField(max_length=16, null=False, help_text="me|team", default="me", index=True))

View File

@@ -23,6 +23,7 @@ from api.db.services.canvas_service import UserCanvasService
from api.db.services.conversation_service import ConversationService
from api.db.services.dialog_service import DialogService
from api.db.services.document_service import DocumentService
+ from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.langfuse_service import TenantLangfuseService
@@ -107,6 +108,11 @@ def create_new_user(user_info: dict) -> dict:
except Exception as create_error:
logging.exception(create_error)
# rollback
+ try:
+ metadata_index_name = DocMetadataService._get_doc_meta_index_name(user_id)
+ settings.docStoreConn.delete_idx(metadata_index_name, "")
+ except Exception as e:
+ logging.exception(e)
try:
TenantService.delete_by_id(user_id)
except Exception as e:
@@ -165,6 +171,12 @@ def delete_user_data(user_id: str) -> dict:
# step1.1.2 delete file and document info in db
doc_ids = DocumentService.get_all_doc_ids_by_kb_ids(kb_ids)
if doc_ids:
+ for doc in doc_ids:
+ try:
+ DocMetadataService.delete_document_metadata(doc["id"], skip_empty_check=True)
+ except Exception as e:
+ logging.warning(f"Failed to delete metadata for document {doc['id']}: {e}")
doc_delete_res = DocumentService.delete_by_ids([i["id"] for i in doc_ids])
done_msg += f"- Deleted {doc_delete_res} document records.\n"
task_delete_res = TaskService.delete_by_doc_ids([i["id"] for i in doc_ids])
@@ -202,6 +214,13 @@ def delete_user_data(user_id: str) -> dict:
done_msg += f"- Deleted {llm_delete_res} tenant-LLM records.\n"
langfuse_delete_res = TenantLangfuseService.delete_ty_tenant_id(tenant_id)
done_msg += f"- Deleted {langfuse_delete_res} langfuse records.\n"
+ try:
+ metadata_index_name = DocMetadataService._get_doc_meta_index_name(tenant_id)
+ settings.docStoreConn.delete_idx(metadata_index_name, "")
+ done_msg += f"- Deleted metadata table {metadata_index_name}.\n"
+ except Exception as e:
+ logging.warning(f"Failed to delete metadata table for tenant {tenant_id}: {e}")
+ done_msg += "- Warning: Failed to delete metadata table (continuing).\n"
# step1.3 delete memory and messages
user_memory = MemoryService.get_by_tenant_id(tenant_id)
if user_memory:
@@ -269,6 +288,11 @@ def delete_user_data(user_id: str) -> dict:
# step2.1.5 delete document record
doc_delete_res = DocumentService.delete_by_ids([d['id'] for d in created_documents])
done_msg += f"- Deleted {doc_delete_res} documents.\n"
+ for doc in created_documents:
+ try:
+ DocMetadataService.delete_document_metadata(doc['id'])
+ except Exception as e:
+ logging.warning(f"Failed to delete metadata for document {doc['id']}: {e}")
# step2.1.6 update dataset doc&chunk&token cnt
for kb_id, doc_num in kb_doc_info.items():
KnowledgebaseService.decrease_document_num_in_delete(kb_id, doc_num)

View File

@@ -25,6 +25,7 @@ from api.db import InputType
from api.db.db_models import Connector, SyncLogs, Connector2Kb, Knowledgebase
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
+ from api.db.services.document_service import DocMetadataService
from common.misc_utils import get_uuid
from common.constants import TaskStatus
from common.time_utils import current_timestamp, timestamp_to_date
@@ -227,7 +228,7 @@ class SyncLogsService(CommonService):
# Set metadata if available for this document
if doc["name"] in metadata_map:
- DocumentService.update_by_id(doc["id"], {"meta_fields": metadata_map[doc["name"]]})
+ DocMetadataService.update_document_metadata(doc["id"], metadata_map[doc["name"]])
if not auto_parse or auto_parse == "0":
continue

View File

@@ -28,7 +28,7 @@ from api.db.services.file_service import FileService
from common.constants import LLMType, ParserType, StatusEnum
from api.db.db_models import DB, Dialog
from api.db.services.common_service import CommonService
- from api.db.services.document_service import DocumentService
+ from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.langfuse_service import TenantLangfuseService
from api.db.services.llm_service import LLMBundle
@@ -355,7 +355,7 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
questions = [await cross_languages(dialog.tenant_id, dialog.llm_id, questions[0], prompt_config["cross_languages"])]
if dialog.meta_data_filter:
- metas = DocumentService.get_meta_by_kbs(dialog.kb_ids)
+ metas = DocMetadataService.get_flatted_meta_by_kbs(dialog.kb_ids)
attachments = await apply_meta_data_filter(
dialog.meta_data_filter,
metas,
@@ -1048,7 +1048,7 @@ async def async_ask(question, kb_ids, tenant_id, chat_llm_name=None, search_conf
tenant_ids = list(set([kb.tenant_id for kb in kbs]))
if meta_data_filter:
- metas = DocumentService.get_meta_by_kbs(kb_ids)
+ metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
doc_ids = await apply_meta_data_filter(meta_data_filter, metas, question, chat_mdl, doc_ids)
kbinfos = await retriever.retrieval(
@@ -1124,7 +1124,7 @@ async def gen_mindmap(question, kb_ids, tenant_id, search_config={}):
rerank_mdl = LLMBundle(tenant_id, LLMType.RERANK, rerank_id)
if meta_data_filter:
- metas = DocumentService.get_meta_by_kbs(kb_ids)
+ metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
doc_ids = await apply_meta_data_filter(meta_data_filter, metas, question, chat_mdl, doc_ids)
ranks = await settings.retriever.retrieval(

File diff suppressed because it is too large.

View File

@@ -33,7 +33,7 @@ from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTena
from api.db.db_utils import bulk_insert_into_db
from api.db.services.common_service import CommonService
from api.db.services.knowledgebase_service import KnowledgebaseService
- from common.metadata_utils import dedupe_list
+ from api.db.services.doc_metadata_service import DocMetadataService
from common.misc_utils import get_uuid
from common.time_utils import current_timestamp, get_format_time
from common.constants import LLMType, ParserType, StatusEnum, TaskStatus, SVR_CONSUMER_GROUP_NAME
@@ -67,7 +67,6 @@ class DocumentService(CommonService):
cls.model.progress_msg,
cls.model.process_begin_at,
cls.model.process_duration,
- cls.model.meta_fields,
cls.model.suffix,
cls.model.run,
cls.model.status,
@@ -154,8 +153,11 @@ class DocumentService(CommonService):
docs = docs.where(cls.model.type.in_(types))
if suffix:
docs = docs.where(cls.model.suffix.in_(suffix))
- if return_empty_metadata:
- docs = docs.where(fn.COALESCE(fn.JSON_LENGTH(cls.model.meta_fields), 0) == 0)
+ metadata_map = DocMetadataService.get_metadata_for_documents(None, kb_id)
+ doc_ids_with_metadata = set(metadata_map.keys())
+ if return_empty_metadata and doc_ids_with_metadata:
+ docs = docs.where(cls.model.id.not_in(doc_ids_with_metadata))
count = docs.count()
if desc:
@@ -166,7 +168,14 @@ class DocumentService(CommonService):
if page_number and items_per_page:
docs = docs.paginate(page_number, items_per_page)
- return list(docs.dicts()), count
+ docs_list = list(docs.dicts())
+ if return_empty_metadata:
+ for doc in docs_list:
+ doc["meta_fields"] = {}
+ else:
+ for doc in docs_list:
+ doc["meta_fields"] = metadata_map.get(doc["id"], {})
+ return docs_list, count
@classmethod
@DB.connection_context()
@@ -212,7 +221,7 @@ class DocumentService(CommonService):
if suffix:
query = query.where(cls.model.suffix.in_(suffix))
- rows = query.select(cls.model.run, cls.model.suffix, cls.model.meta_fields)
+ rows = query.select(cls.model.run, cls.model.suffix, cls.model.id)
total = rows.count()
suffix_counter = {}
@@ -220,10 +229,18 @@ class DocumentService(CommonService):
metadata_counter = {}
empty_metadata_count = 0
+ doc_ids = [row.id for row in rows]
+ metadata = {}
+ if doc_ids:
+ try:
+ metadata = DocMetadataService.get_metadata_for_documents(doc_ids, kb_id)
+ except Exception as e:
+ logging.warning(f"Failed to fetch metadata from ES/Infinity: {e}")
for row in rows:
suffix_counter[row.suffix] = suffix_counter.get(row.suffix, 0) + 1
run_status_counter[str(row.run)] = run_status_counter.get(str(row.run), 0) + 1
- meta_fields = row.meta_fields or {}
+ meta_fields = metadata.get(row.id, {})
if not meta_fields:
empty_metadata_count += 1
continue
@@ -374,6 +391,12 @@ class DocumentService(CommonService):
except Exception as e:
logging.error(f"Failed to delete chunks from doc store for document {doc.id}: {e}")
+ # Delete document metadata (non-critical, log and continue)
+ try:
+ DocMetadataService.delete_document_metadata(doc.id)
+ except Exception as e:
+ logging.warning(f"Failed to delete metadata for document {doc.id}: {e}")
# Cleanup knowledge graph references (non-critical, log and continue)
try:
graph_source = settings.docStoreConn.get_fields(
@@ -707,246 +730,6 @@ class DocumentService(CommonService):
cls.update_by_id(doc_id, info)
@classmethod
@DB.connection_context()
def update_meta_fields(cls, doc_id, meta_fields):
return cls.update_by_id(doc_id, {"meta_fields": meta_fields})
@classmethod
@DB.connection_context()
def get_meta_by_kbs(cls, kb_ids):
"""
Legacy metadata aggregator (backward-compatible).
- Does NOT expand list values and a list is kept as one string key.
Example: {"tags": ["foo","bar"]} -> meta["tags"]["['foo', 'bar']"] = [doc_id]
- Expects meta_fields is a dict.
Use when existing callers rely on the old list-as-string semantics.
"""
fields = [
cls.model.id,
cls.model.meta_fields,
]
meta = {}
for r in cls.model.select(*fields).where(cls.model.kb_id.in_(kb_ids)):
doc_id = r.id
for k,v in r.meta_fields.items():
if k not in meta:
meta[k] = {}
if not isinstance(v, list):
v = [v]
for vv in v:
if vv not in meta[k]:
if isinstance(vv, list) or isinstance(vv, dict):
continue
meta[k][vv] = []
meta[k][vv].append(doc_id)
return meta
@classmethod
@DB.connection_context()
def get_flatted_meta_by_kbs(cls, kb_ids):
"""
- Parses stringified JSON meta_fields when possible and skips non-dict or unparsable values.
- Expands list values into individual entries.
Example: {"tags": ["foo","bar"], "author": "alice"} ->
meta["tags"]["foo"] = [doc_id], meta["tags"]["bar"] = [doc_id], meta["author"]["alice"] = [doc_id]
Prefer for metadata_condition filtering and scenarios that must respect list semantics.
"""
fields = [
cls.model.id,
cls.model.meta_fields,
]
meta = {}
for r in cls.model.select(*fields).where(cls.model.kb_id.in_(kb_ids)):
doc_id = r.id
meta_fields = r.meta_fields or {}
if isinstance(meta_fields, str):
try:
meta_fields = json.loads(meta_fields)
except Exception:
continue
if not isinstance(meta_fields, dict):
continue
for k, v in meta_fields.items():
if k not in meta:
meta[k] = {}
values = v if isinstance(v, list) else [v]
for vv in values:
if vv is None:
continue
sv = str(vv)
if sv not in meta[k]:
meta[k][sv] = []
meta[k][sv].append(doc_id)
return meta
@classmethod
@DB.connection_context()
def get_metadata_summary(cls, kb_id, document_ids=None):
def _meta_value_type(value):
if value is None:
return None
if isinstance(value, list):
return "list"
if isinstance(value, bool):
return "string"
if isinstance(value, (int, float)):
return "number"
if re.match(r"\d{4}\-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}", str(value)):
return "time"
return "string"
fields = [cls.model.id, cls.model.meta_fields]
summary = {}
type_counter = {}
query = cls.model.select(*fields).where(cls.model.kb_id == kb_id)
if document_ids:
query = query.where(cls.model.id.in_(document_ids))
for r in query:
meta_fields = r.meta_fields or {}
if isinstance(meta_fields, str):
try:
meta_fields = json.loads(meta_fields)
except Exception:
continue
if not isinstance(meta_fields, dict):
continue
for k, v in meta_fields.items():
value_type = _meta_value_type(v)
if value_type:
if k not in type_counter:
type_counter[k] = {}
type_counter[k][value_type] = type_counter[k].get(value_type, 0) + 1
values = v if isinstance(v, list) else [v]
for vv in values:
if not vv:
continue
sv = str(vv)
if k not in summary:
summary[k] = {}
summary[k][sv] = summary[k].get(sv, 0) + 1
result = {}
for k, v in summary.items():
values = sorted([(val, cnt) for val, cnt in v.items()], key=lambda x: x[1], reverse=True)
type_counts = type_counter.get(k, {})
value_type = "string"
if type_counts:
value_type = max(type_counts.items(), key=lambda item: item[1])[0]
result[k] = {"type": value_type, "values": values}
return result
@classmethod
@DB.connection_context()
def batch_update_metadata(cls, kb_id, doc_ids, updates=None, deletes=None, adds=None):
updates = updates or []
deletes = deletes or []
if not doc_ids:
return 0
def _normalize_meta(meta):
if isinstance(meta, str):
try:
meta = json.loads(meta)
except Exception:
return {}
if not isinstance(meta, dict):
return {}
return deepcopy(meta)
def _str_equal(a, b):
return str(a) == str(b)
def _apply_updates(meta):
changed = False
for upd in updates:
key = upd.get("key")
if not key:
continue
new_value = upd.get("value")
match_provided = upd.get("match")
if key not in meta:
if match_provided:
continue
meta[key] = dedupe_list(new_value) if isinstance(new_value, list) else new_value
changed = True
continue
if isinstance(meta[key], list):
if not match_provided:
if isinstance(new_value, list):
meta[key] = dedupe_list(new_value)
else:
meta[key].append(new_value)
changed = True
else:
match_value = upd.get("match")
replaced = False
new_list = []
for item in meta[key]:
if _str_equal(item, match_value):
new_list.append(new_value)
replaced = True
else:
new_list.append(item)
if replaced:
meta[key] = dedupe_list(new_list)
changed = True
else:
if not match_provided:
meta[key] = new_value
changed = True
else:
match_value = upd.get("match")
if _str_equal(meta[key], match_value):
meta[key] = new_value
changed = True
return changed
def _apply_deletes(meta):
changed = False
for d in deletes:
key = d.get("key")
if not key or key not in meta:
continue
value = d.get("value", None)
if isinstance(meta[key], list):
if value is None:
del meta[key]
changed = True
continue
new_list = [item for item in meta[key] if not _str_equal(item, value)]
if len(new_list) != len(meta[key]):
if new_list:
meta[key] = new_list
else:
del meta[key]
changed = True
else:
if value is None or _str_equal(meta[key], value):
del meta[key]
changed = True
return changed
updated_docs = 0
with DB.atomic():
rows = cls.model.select(cls.model.id, cls.model.meta_fields).where(
cls.model.id.in_(doc_ids)
)
for r in rows:
meta = _normalize_meta(r.meta_fields or {})
original_meta = deepcopy(meta)
changed = _apply_updates(meta)
changed = _apply_deletes(meta) or changed
if changed and meta != original_meta:
cls.model.update(
meta_fields=meta,
update_time=current_timestamp(),
update_date=get_format_time()
).where(cls.model.id == r.id).execute()
updated_docs += 1
return updated_docs
@classmethod
@DB.connection_context()
def update_progress(cls):

View File

@@ -24,6 +24,7 @@ from abc import abstractmethod
from elasticsearch import NotFoundError
from elasticsearch_dsl import Index
from elastic_transport import ConnectionTimeout
+ from elasticsearch.client import IndicesClient
from common.file_utils import get_project_base_directory
from common.misc_utils import convert_bytes
from common.doc_store.doc_store_base import DocStoreConnection, OrderByExpr, MatchExpr
@@ -128,13 +129,34 @@ class ESConnectionBase(DocStoreConnection):
if self.index_exist(index_name, dataset_id):
return True
try:
- from elasticsearch.client import IndicesClient
return IndicesClient(self.es).create(index=index_name,
settings=self.mapping["settings"],
mappings=self.mapping["mappings"])
except Exception:
self.logger.exception("ESConnection.createIndex error %s" % index_name)
def create_doc_meta_idx(self, index_name: str):
"""
Create a document metadata index.
Index name pattern: ragflow_doc_meta_{tenant_id}
- Per-tenant metadata index for storing document metadata fields
"""
if self.index_exist(index_name, ""):
return True
try:
fp_mapping = os.path.join(get_project_base_directory(), "conf", "doc_meta_es_mapping.json")
if not os.path.exists(fp_mapping):
self.logger.error(f"Document metadata mapping file not found at {fp_mapping}")
return False
doc_meta_mapping = json.load(open(fp_mapping, "r"))
return IndicesClient(self.es).create(index=index_name,
settings=doc_meta_mapping["settings"],
mappings=doc_meta_mapping["mappings"])
except Exception as e:
self.logger.exception(f"Error creating document metadata index {index_name}: {e}")
def delete_idx(self, index_name: str, dataset_id: str):
if len(dataset_id) > 0:
# The index need to be alive after any kb deletion since all kb under this tenant are in one index.

View File

@@ -285,8 +285,65 @@ class InfinityConnectionBase(DocStoreConnection):
self.logger.info(f"INFINITY created table {table_name}, vector size {vector_size}")
return True
def create_doc_meta_idx(self, index_name: str):
"""
Create a document metadata table.
Table name pattern: ragflow_doc_meta_{tenant_id}
- Per-tenant metadata table for storing document metadata fields
"""
table_name = index_name
inf_conn = self.connPool.get_conn()
inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
try:
fp_mapping = os.path.join(get_project_base_directory(), "conf", "doc_meta_infinity_mapping.json")
if not os.path.exists(fp_mapping):
self.logger.error(f"Document metadata mapping file not found at {fp_mapping}")
return False
schema = json.load(open(fp_mapping))
inf_db.create_table(
table_name,
schema,
ConflictType.Ignore,
)
# Create secondary indexes on id and kb_id for better query performance
inf_table = inf_db.get_table(table_name)
try:
inf_table.create_index(
f"idx_{table_name}_id",
IndexInfo("id", IndexType.Secondary),
ConflictType.Ignore,
)
self.logger.debug(f"INFINITY created secondary index on id for table {table_name}")
except Exception as e:
self.logger.warning(f"Failed to create index on id for {table_name}: {e}")
try:
inf_table.create_index(
f"idx_{table_name}_kb_id",
IndexInfo("kb_id", IndexType.Secondary),
ConflictType.Ignore,
)
self.logger.debug(f"INFINITY created secondary index on kb_id for table {table_name}")
except Exception as e:
self.logger.warning(f"Failed to create index on kb_id for {table_name}: {e}")
self.connPool.release_conn(inf_conn)
self.logger.debug(f"INFINITY created document metadata table {table_name} with secondary indexes")
return True
except Exception as e:
self.connPool.release_conn(inf_conn)
self.logger.exception(f"Error creating document metadata table {table_name}: {e}")
return False
def delete_idx(self, index_name: str, dataset_id: str):
- table_name = f"{index_name}_{dataset_id}"
+ if index_name.startswith("ragflow_doc_meta_"):
+ table_name = index_name
+ else:
+ table_name = f"{index_name}_{dataset_id}"
inf_conn = self.connPool.get_conn()
db_instance = inf_conn.get_database(self.dbName)
db_instance.drop_table(table_name, ConflictType.Ignore)
@@ -294,7 +351,10 @@ class InfinityConnectionBase(DocStoreConnection):
self.logger.info(f"INFINITY dropped table {table_name}")
def index_exist(self, index_name: str, dataset_id: str) -> bool:
- table_name = f"{index_name}_{dataset_id}"
+ if index_name.startswith("ragflow_doc_meta_"):
+ table_name = index_name
+ else:
+ table_name = f"{index_name}_{dataset_id}"
try:
inf_conn = self.connPool.get_conn()
db_instance = inf_conn.get_database(self.dbName)
@@ -341,7 +401,10 @@ class InfinityConnectionBase(DocStoreConnection):
def delete(self, condition: dict, index_name: str, dataset_id: str) -> int:
inf_conn = self.connPool.get_conn()
db_instance = inf_conn.get_database(self.dbName)
- table_name = f"{index_name}_{dataset_id}"
+ if index_name.startswith("ragflow_doc_meta_"):
+ table_name = index_name
+ else:
+ table_name = f"{index_name}_{dataset_id}"
try:
table_instance = db_instance.get_table(table_name)
except Exception:

View File

@@ -106,10 +106,10 @@ def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
else:
if logic == "and":
doc_ids = doc_ids & set(ids)
- if not doc_ids:
- return []
else:
doc_ids = doc_ids | set(ids)
+ if not doc_ids:
+ return []
return list(doc_ids)

View File

@@ -0,0 +1,29 @@
{
"settings": {
"index": {
"number_of_shards": 2,
"number_of_replicas": 0,
"refresh_interval": "1000ms"
}
},
"mappings": {
"_source": {
"enabled": true
},
"dynamic": "runtime",
"properties": {
"id": {
"type": "keyword",
"store": true
},
"kb_id": {
"type": "keyword",
"store": true
},
"meta_fields": {
"type": "object",
"dynamic": true
}
}
}
}

View File

@@ -0,0 +1,5 @@
{
"id": {"type": "varchar", "default": ""},
"kb_id": {"type": "varchar", "default": ""},
"meta_fields": {"type": "json", "default": "{}"}
}
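
A minimal sketch of how the same metadata record lands in each engine, based on the two mapping files above and the `insert()` change later in this diff (IDs and field values are illustrative):

```python
import json

doc_id, kb_id = "d1f2...", "kb42..."  # illustrative identifiers
meta_fields = {"character": "Zhuge Liang", "era": "Three Kingdoms"}

# Elasticsearch: meta_fields is a dynamic object, stored as-is in the document source.
es_doc = {"id": doc_id, "kb_id": kb_id, "meta_fields": meta_fields}

# Infinity: the meta_fields column is typed "json", so the dict is serialized to a string on insert.
infinity_row = {"id": doc_id, "kb_id": kb_id, "meta_fields": json.dumps(meta_fields, ensure_ascii=False)}
```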

View File

@@ -98,6 +98,7 @@ def message_fit_in(msg, max_length=4000):
def kb_prompt(kbinfos, max_tokens, hash_id=False):
from api.db.services.document_service import DocumentService
+ from api.db.services.doc_metadata_service import DocMetadataService
knowledges = [get_value(ck, "content", "content_with_weight") for ck in kbinfos["chunks"]]
kwlg_len = len(knowledges)
@@ -114,7 +115,12 @@ def kb_prompt(kbinfos, max_tokens, hash_id=False):
break
docs = DocumentService.get_by_ids([get_value(ck, "doc_id", "document_id") for ck in kbinfos["chunks"][:chunks_num]])
- docs = {d.id: d.meta_fields for d in docs}
+ docs_with_meta = {}
+ for d in docs:
+ meta = DocMetadataService.get_document_metadata(d.id)
+ docs_with_meta[d.id] = meta if meta else {}
+ docs = docs_with_meta
def draw_node(k, line):
if line is not None and not isinstance(line, str):

View File

@@ -61,6 +61,7 @@ import numpy as np
from peewee import DoesNotExist
from common.constants import LLMType, ParserType, PipelineTaskType
from api.db.services.document_service import DocumentService
+ from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.llm_service import LLMBundle
from api.db.services.task_service import TaskService, has_canceled, CANVAS_DEBUG_DOC_ID, GRAPH_RAPTOR_FAKE_DOC_ID
from api.db.services.file2document_service import File2DocumentService
@@ -438,12 +439,10 @@ async def build_chunks(task, progress_callback):
metadata = update_metadata_to(metadata, doc["metadata_obj"])
del doc["metadata_obj"]
if metadata:
- e, doc = DocumentService.get_by_id(task["doc_id"])
- if e:
- if isinstance(doc.meta_fields, str):
- doc.meta_fields = json.loads(doc.meta_fields)
- metadata = update_metadata_to(metadata, doc.meta_fields)
- DocumentService.update_by_id(task["doc_id"], {"meta_fields": metadata})
+ existing_meta = DocMetadataService.get_document_metadata(task["doc_id"])
+ existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
+ metadata = update_metadata_to(metadata, existing_meta)
+ DocMetadataService.update_document_metadata(task["doc_id"], metadata)
progress_callback(msg="Question generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st))
if task["kb_parser_config"].get("tag_kb_ids", []):
@@ -735,12 +734,10 @@ async def run_dataflow(task: dict):
del ck["positions"]
if metadata:
- e, doc = DocumentService.get_by_id(doc_id)
- if e:
- if isinstance(doc.meta_fields, str):
- doc.meta_fields = json.loads(doc.meta_fields)
- metadata = update_metadata_to(metadata, doc.meta_fields)
- DocumentService.update_by_id(doc_id, {"meta_fields": metadata})
+ existing_meta = DocMetadataService.get_document_metadata(doc_id)
+ existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
+ metadata = update_metadata_to(metadata, existing_meta)
+ DocMetadataService.update_document_metadata(doc_id, metadata)
start_ts = timer()
set_progress(task_id, prog=0.82, msg="[DOC Engine]:\nStart to index...")

View File

@@ -162,7 +162,11 @@ class ESConnection(ESConnectionBase):
self._connect()
continue
except Exception as e:
- self.logger.exception(f"ESConnection.search {str(index_names)} query: " + str(q) + str(e))
+ # Only log debug for NotFoundError(accepted when metadata index doesn't exist)
+ if 'NotFound' in str(e):
+ self.logger.debug(f"ESConnection.search {str(index_names)} query: " + str(q) + " - " + str(e))
+ else:
+ self.logger.exception(f"ESConnection.search {str(index_names)} query: " + str(q) + str(e))
raise e
self.logger.error(f"ESConnection.search timeout for {ATTEMPT_TIME} times!")

View File

@@ -149,8 +149,11 @@ class InfinityConnection(InfinityConnectionBase):
if condition:
table_found = False
for indexName in index_names:
- for kb_id in knowledgebase_ids:
- table_name = f"{indexName}_{kb_id}"
+ if indexName.startswith("ragflow_doc_meta_"):
+ table_names_to_search = [indexName]
+ else:
+ table_names_to_search = [f"{indexName}_{kb_id}" for kb_id in knowledgebase_ids]
+ for table_name in table_names_to_search:
try:
filter_cond = self.equivalent_condition_to_str(condition, db_instance.get_table(table_name))
table_found = True
@@ -221,8 +224,11 @@ class InfinityConnection(InfinityConnectionBase):
total_hits_count = 0
# Scatter search tables and gather the results
for indexName in index_names:
- for knowledgebaseId in knowledgebase_ids:
- table_name = f"{indexName}_{knowledgebaseId}"
+ if indexName.startswith("ragflow_doc_meta_"):
+ table_names_to_search = [indexName]
+ else:
+ table_names_to_search = [f"{indexName}_{kb_id}" for kb_id in knowledgebase_ids]
+ for table_name in table_names_to_search:
try:
table_instance = db_instance.get_table(table_name)
except Exception:
@@ -276,8 +282,11 @@ class InfinityConnection(InfinityConnectionBase):
df_list = list()
assert isinstance(knowledgebase_ids, list)
table_list = list()
- for knowledgebaseId in knowledgebase_ids:
- table_name = f"{index_name}_{knowledgebaseId}"
+ if index_name.startswith("ragflow_doc_meta_"):
+ table_names_to_search = [index_name]
+ else:
+ table_names_to_search = [f"{index_name}_{kb_id}" for kb_id in knowledgebase_ids]
+ for table_name in table_names_to_search:
table_list.append(table_name)
try:
table_instance = db_instance.get_table(table_name)
@@ -301,7 +310,10 @@ class InfinityConnection(InfinityConnectionBase):
def insert(self, documents: list[dict], index_name: str, knowledgebase_id: str = None) -> list[str]:
inf_conn = self.connPool.get_conn()
db_instance = inf_conn.get_database(self.dbName)
- table_name = f"{index_name}_{knowledgebase_id}"
+ if index_name.startswith("ragflow_doc_meta_"):
+ table_name = index_name
+ else:
+ table_name = f"{index_name}_{knowledgebase_id}"
try:
table_instance = db_instance.get_table(table_name)
except InfinityException as e:
@@ -405,6 +417,11 @@ class InfinityConnection(InfinityConnectionBase):
elif k in ["page_num_int", "top_int"]:
assert isinstance(v, list)
d[k] = "_".join(f"{num:08x}" for num in v)
+ elif k == "meta_fields":
+ if isinstance(v, dict):
+ d[k] = json.dumps(v, ensure_ascii=False)
+ else:
+ d[k] = v if v else "{}"
else:
d[k] = v
for k in ["docnm_kwd", "title_tks", "title_sm_tks", "important_kwd", "important_tks", "content_with_weight",
@@ -434,7 +451,10 @@ class InfinityConnection(InfinityConnectionBase):
# logger.info(f"update position_int: {newValue['position_int']}")
inf_conn = self.connPool.get_conn()
db_instance = inf_conn.get_database(self.dbName)
- table_name = f"{index_name}_{knowledgebase_id}"
+ if index_name.startswith("ragflow_doc_meta_"):
+ table_name = index_name
+ else:
+ table_name = f"{index_name}_{knowledgebase_id}"
table_instance = db_instance.get_table(table_name)
# if "exists" in condition:
# del condition["exists"]

View File

@@ -0,0 +1,184 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
End-to-end test for metadata filtering during retrieval.
Tests that chunks are only retrieved from documents matching the metadata condition.
"""
import pytest
import logging
from common import (
create_dataset,
delete_datasets,
list_documents,
update_document,
)
from utils import wait_for
@wait_for(30, 1, "Document parsing timeout")
def _condition_parsing_complete(_auth, dataset_id):
res = list_documents(_auth, dataset_id)
if res["code"] != 0:
return False
for doc in res["data"]["docs"]:
status = doc.get("run", "UNKNOWN")
if status == "FAILED":
pytest.fail(f"Document parsing failed: {doc}")
return False
if status != "DONE":
return False
return True
@pytest.fixture(scope="function")
def add_dataset_with_metadata(HttpApiAuth):
# First create the dataset
res = create_dataset(HttpApiAuth, {
"name": f"test_metadata_{int(__import__('time').time())}",
"chunk_method": "naive"
})
assert res["code"] == 0, f"Failed to create dataset: {res}"
dataset_id = res["data"]["id"]
# Then configure metadata via the update_metadata_setting endpoint
import requests
from configs import HOST_ADDRESS, VERSION
metadata_config = {
"type": "object",
"properties": {
"character": {
"description": "Historical figure name",
"type": "string"
},
"era": {
"description": "Historical era",
"type": "string"
},
"achievements": {
"description": "Major achievements",
"type": "array",
"items": {
"type": "string"
}
}
}
}
res = requests.post(
url=f"{HOST_ADDRESS}/{VERSION}/kb/update_metadata_setting",
headers={"Content-Type": "application/json"},
auth=HttpApiAuth,
json={
"kb_id": dataset_id,
"metadata": metadata_config,
"enable_metadata": False
}
).json()
assert res["code"] == 0, f"Failed to configure metadata: {res}"
yield dataset_id
# Cleanup
delete_datasets(HttpApiAuth, {"ids": [dataset_id]})
@pytest.mark.p2
class TestMetadataWithRetrieval:
"""Test retrieval with metadata filtering."""
def test_retrieval_with_metadata_filter(self, HttpApiAuth, add_dataset_with_metadata, tmp_path):
"""
Test that retrieval respects metadata filters.
Verifies that chunks are only retrieved from documents matching the metadata condition.
"""
from common import upload_documents, parse_documents, retrieval_chunks
dataset_id = add_dataset_with_metadata
# Create two documents with different metadata
content_doc1 = "Document about Zhuge Liang who lived in Three Kingdoms period."
content_doc2 = "Document about Cao Cao who lived in Late Eastern Han Dynasty."
fp1 = tmp_path / "doc1_zhuge_liang.txt"
fp2 = tmp_path / "doc2_cao_cao.txt"
with open(fp1, "w", encoding="utf-8") as f:
f.write(content_doc1)
with open(fp2, "w", encoding="utf-8") as f:
f.write(content_doc2)
# Upload both documents
res = upload_documents(HttpApiAuth, dataset_id, [fp1, fp2])
assert res["code"] == 0, f"Failed to upload documents: {res}"
doc1_id = res["data"][0]["id"]
doc2_id = res["data"][1]["id"]
# Add different metadata to each document
res = update_document(HttpApiAuth, dataset_id, doc1_id, {
"meta_fields": {"character": "Zhuge Liang", "era": "Three Kingdoms"}
})
assert res["code"] == 0, f"Failed to update doc1 metadata: {res}"
res = update_document(HttpApiAuth, dataset_id, doc2_id, {
"meta_fields": {"character": "Cao Cao", "era": "Late Eastern Han"}
})
assert res["code"] == 0, f"Failed to update doc2 metadata: {res}"
# Parse both documents
res = parse_documents(HttpApiAuth, dataset_id, {"document_ids": [doc1_id, doc2_id]})
assert res["code"] == 0, f"Failed to trigger parsing: {res}"
# Wait for parsing to complete
assert _condition_parsing_complete(HttpApiAuth, dataset_id), "Parsing timeout"
# Test retrieval WITH metadata filter for "Zhuge Liang"
res = retrieval_chunks(HttpApiAuth, {
"question": "Zhuge Liang",
"dataset_ids": [dataset_id],
"metadata_condition": {
"logic": "and",
"conditions": [
{
"name": "character",
"comparison_operator": "is",
"value": "Zhuge Liang"
}
]
}
})
assert res["code"] == 0, f"Retrieval with metadata filter failed: {res}"
chunks_with_filter = res["data"]["chunks"]
doc_ids_with_filter = set(chunk.get("document_id", "") for chunk in chunks_with_filter)
logging.info(f"✓ Retrieved {len(chunks_with_filter)} chunks from documents: {doc_ids_with_filter}")
# Verify that filtered results only contain doc1 (Zhuge Liang)
if len(chunks_with_filter) > 0:
assert doc1_id in doc_ids_with_filter, f"Filtered results should contain doc1 (Zhuge Liang), but got: {doc_ids_with_filter}"
assert doc2_id not in doc_ids_with_filter, f"Filtered results should NOT contain doc2 (Cao Cao), but got: {doc_ids_with_filter}"
logging.info("Metadata filter correctly excluded chunks from non-matching documents")
else:
logging.warning("No chunks retrieved with filter - this might be due to embedding model not configured")

View File

@@ -22,7 +22,9 @@
def _summary_to_counts(summary):
counts = {}
- for key, pairs in summary.items():
+ for key, field_data in summary.items():
+ # New format: {key: {"type": "...", "values": [[value, count], ...]}}
+ pairs = field_data["values"]
counts[key] = {str(k): v for k, v in pairs}
return counts
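
For clarity, a small self-contained sketch of the new summary shape the helper above consumes (values are illustrative; the real payload comes from `DocMetadataService.get_metadata_summary`):

```python
# New per-key shape: {"type": <value type>, "values": [[value, count], ...]}
summary = {
    "era": {"type": "string", "values": [["Three Kingdoms", 2], ["Late Eastern Han", 1]]},
    "character": {"type": "string", "values": [["Zhuge Liang", 1], ["Cao Cao", 1]]},
}
counts = {key: {str(v): c for v, c in field_data["values"]} for key, field_data in summary.items()}
assert counts["era"]["Three Kingdoms"] == 2
```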
return counts return counts