mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-30 07:06:39 +08:00
Put document metadata in ES/Infinity (#12826)
### What problem does this PR solve?
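Store document metadata in ES/Infinity. Metadata reads and writes in the API handlers now go through `DocMetadataService` instead of `DocumentService`.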
Index name for the metadata: `ragflow_doc_meta_{tenant_id}`
### Type of change
- [x] Refactoring
This commit is contained in:
@ -22,6 +22,7 @@ import xxhash
|
||||
from quart import request
|
||||
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from common.metadata_utils import apply_meta_data_filter
|
||||
@ -381,7 +382,7 @@ async def retrieval_test():
|
||||
chat_mdl = LLMBundle(user_id, LLMType.CHAT)
|
||||
|
||||
if meta_data_filter:
|
||||
metas = DocumentService.get_meta_by_kbs(kb_ids)
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
|
||||
local_doc_ids = await apply_meta_data_filter(meta_data_filter, metas, question, chat_mdl, local_doc_ids)
|
||||
|
||||
tenants = UserTenantService.query(user_id=user_id)
|
||||
|
||||
@ -26,6 +26,7 @@ from api.db import VALID_FILE_TYPES, FileType
|
||||
from api.db.db_models import Task
|
||||
from api.db.services import duplicate_name
|
||||
from api.db.services.document_service import DocumentService, doc_upload_and_parse
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from common.metadata_utils import meta_filter, convert_conditions, turn2jsonschema
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
@ -281,7 +282,7 @@ async def list_docs():
|
||||
doc_ids_filter = None
|
||||
metas = None
|
||||
if metadata_condition or metadata:
|
||||
metas = DocumentService.get_flatted_meta_by_kbs([kb_id])
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs([kb_id])
|
||||
|
||||
if metadata_condition:
|
||||
doc_ids_filter = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
|
||||
@ -401,7 +402,11 @@ async def doc_infos():
|
||||
if not DocumentService.accessible(doc_id, current_user.id):
|
||||
return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)
|
||||
docs = DocumentService.get_by_ids(doc_ids)
|
||||
return get_json_result(data=list(docs.dicts()))
|
||||
docs_list = list(docs.dicts())
|
||||
# Add meta_fields for each document
|
||||
for doc in docs_list:
|
||||
doc["meta_fields"] = DocMetadataService.get_document_metadata(doc["id"])
|
||||
return get_json_result(data=docs_list)
|
||||
|
||||
|
||||
@manager.route("/metadata/summary", methods=["POST"]) # noqa: F821
|
||||
@ -421,7 +426,7 @@ async def metadata_summary():
|
||||
return get_json_result(data=False, message="Only owner of dataset authorized for this operation.", code=RetCode.OPERATING_ERROR)
|
||||
|
||||
try:
|
||||
summary = DocumentService.get_metadata_summary(kb_id, doc_ids)
|
||||
summary = DocMetadataService.get_metadata_summary(kb_id, doc_ids)
|
||||
return get_json_result(data={"summary": summary})
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
@ -432,10 +437,14 @@ async def metadata_summary():
|
||||
@validate_request("doc_ids")
|
||||
async def metadata_update():
|
||||
req = await get_request_json()
|
||||
kb_id = req.get("kb_id")
|
||||
document_ids = req.get("doc_ids")
|
||||
updates = req.get("updates", []) or []
|
||||
deletes = req.get("deletes", []) or []
|
||||
|
||||
if not kb_id:
|
||||
return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)
|
||||
|
||||
if not isinstance(updates, list) or not isinstance(deletes, list):
|
||||
return get_json_result(data=False, message="updates and deletes must be lists.", code=RetCode.ARGUMENT_ERROR)
|
||||
|
||||
@ -446,8 +455,8 @@ async def metadata_update():
|
||||
if not isinstance(d, dict) or not d.get("key"):
|
||||
return get_json_result(data=False, message="Each delete requires key.", code=RetCode.ARGUMENT_ERROR)
|
||||
|
||||
updated = DocumentService.batch_update_metadata(None, document_ids, updates, deletes)
|
||||
return get_json_result(data={"updated": updated})
|
||||
updated = DocMetadataService.batch_update_metadata(kb_id, document_ids, updates, deletes)
|
||||
return get_json_result(data={"updated": updated, "matched_docs": len(document_ids)})
|
||||
|
||||
|
||||
@manager.route("/update_metadata_setting", methods=["POST"]) # noqa: F821
|
||||
@ -905,7 +914,7 @@ async def set_meta():
|
||||
if not e:
|
||||
return get_data_error_result(message="Document not found!")
|
||||
|
||||
if not DocumentService.update_by_id(req["doc_id"], {"meta_fields": meta}):
|
||||
if not DocMetadataService.update_document_metadata(req["doc_id"], meta):
|
||||
return get_data_error_result(message="Database error (meta updates)!")
|
||||
|
||||
return get_json_result(data=True)
|
||||
|
||||
@ -25,6 +25,7 @@ import numpy as np
|
||||
from api.db.services.connector_service import Connector2KbService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.db.services.document_service import DocumentService, queue_raptor_o_graphrag_tasks
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
|
||||
@ -467,7 +468,7 @@ def get_meta():
|
||||
message='No authorization.',
|
||||
code=RetCode.AUTHENTICATION_ERROR
|
||||
)
|
||||
return get_json_result(data=DocumentService.get_meta_by_kbs(kb_ids))
|
||||
return get_json_result(data=DocMetadataService.get_flatted_meta_by_kbs(kb_ids))
|
||||
|
||||
|
||||
@manager.route("/basic_info", methods=["GET"]) # noqa: F821
|
||||
|
||||
@ -18,6 +18,7 @@ import logging
|
||||
from quart import jsonify
|
||||
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from common.metadata_utils import meta_filter, convert_conditions
|
||||
@ -121,7 +122,7 @@ async def retrieval(tenant_id):
|
||||
similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0))
|
||||
top = int(retrieval_setting.get("top_k", 1024))
|
||||
metadata_condition = req.get("metadata_condition", {}) or {}
|
||||
metas = DocumentService.get_meta_by_kbs([kb_id])
|
||||
metas = DocMetadataService.get_meta_by_kbs([kb_id])
|
||||
|
||||
doc_ids = []
|
||||
try:
|
||||
|
||||
@ -29,6 +29,7 @@ from api.constants import FILE_NAME_LEN_LIMIT
|
||||
from api.db import FileType
|
||||
from api.db.db_models import File, Task
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
@ -255,7 +256,8 @@ async def update_doc(tenant_id, dataset_id, document_id):
|
||||
if "meta_fields" in req:
|
||||
if not isinstance(req["meta_fields"], dict):
|
||||
return get_error_data_result(message="meta_fields must be a dictionary")
|
||||
DocumentService.update_meta_fields(document_id, req["meta_fields"])
|
||||
if not DocMetadataService.update_document_metadata(document_id, req["meta_fields"]):
|
||||
return get_error_data_result(message="Failed to update metadata")
|
||||
|
||||
if "name" in req and req["name"] != doc.name:
|
||||
if len(req["name"].encode("utf-8")) > FILE_NAME_LEN_LIMIT:
|
||||
@ -568,7 +570,7 @@ def list_docs(dataset_id, tenant_id):
|
||||
|
||||
doc_ids_filter = None
|
||||
if metadata_condition:
|
||||
metas = DocumentService.get_flatted_meta_by_kbs([dataset_id])
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id])
|
||||
doc_ids_filter = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
|
||||
if metadata_condition.get("conditions") and not doc_ids_filter:
|
||||
return get_result(data={"total": 0, "docs": []})
|
||||
@ -611,7 +613,7 @@ async def metadata_summary(dataset_id, tenant_id):
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
|
||||
req = await get_request_json()
|
||||
try:
|
||||
summary = DocumentService.get_metadata_summary(dataset_id, req.get("doc_ids"))
|
||||
summary = DocMetadataService.get_metadata_summary(dataset_id, req.get("doc_ids"))
|
||||
return get_result(data={"summary": summary})
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
@ -657,14 +659,14 @@ async def metadata_batch_update(dataset_id, tenant_id):
|
||||
target_doc_ids = set(document_ids)
|
||||
|
||||
if metadata_condition:
|
||||
metas = DocumentService.get_flatted_meta_by_kbs([dataset_id])
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id])
|
||||
filtered_ids = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
|
||||
target_doc_ids = target_doc_ids & filtered_ids
|
||||
if metadata_condition.get("conditions") and not target_doc_ids:
|
||||
return get_result(data={"updated": 0, "matched_docs": 0})
|
||||
|
||||
target_doc_ids = list(target_doc_ids)
|
||||
updated = DocumentService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
|
||||
updated = DocMetadataService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
|
||||
return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)})
|
||||
|
||||
@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821
|
||||
@ -1534,7 +1536,7 @@ async def retrieval_test(tenant_id):
|
||||
if not doc_ids:
|
||||
metadata_condition = req.get("metadata_condition")
|
||||
if metadata_condition:
|
||||
metas = DocumentService.get_meta_by_kbs(kb_ids)
|
||||
metas = DocMetadataService.get_meta_by_kbs(kb_ids)
|
||||
doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
|
||||
# If metadata_condition has conditions but no docs match, return empty result
|
||||
if not doc_ids and metadata_condition.get("conditions"):
|
||||
|
||||
@ -35,7 +35,7 @@ from api.db.services.conversation_service import ConversationService
|
||||
from api.db.services.conversation_service import async_iframe_completion as iframe_completion
|
||||
from api.db.services.conversation_service import async_completion as rag_completion
|
||||
from api.db.services.dialog_service import DialogService, async_ask, async_chat, gen_mindmap
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from common.metadata_utils import apply_meta_data_filter, convert_conditions, meta_filter
|
||||
@ -147,7 +147,7 @@ async def chat_completion(tenant_id, chat_id):
|
||||
return get_error_data_result(message="metadata_condition must be an object.")
|
||||
|
||||
if metadata_condition and req.get("question"):
|
||||
metas = DocumentService.get_meta_by_kbs(dia.kb_ids or [])
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(dia.kb_ids or [])
|
||||
filtered_doc_ids = meta_filter(
|
||||
metas,
|
||||
convert_conditions(metadata_condition),
|
||||
@ -279,7 +279,7 @@ async def chat_completion_openai_like(tenant_id, chat_id):
|
||||
|
||||
doc_ids_str = None
|
||||
if metadata_condition:
|
||||
metas = DocumentService.get_meta_by_kbs(dia.kb_ids or [])
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(dia.kb_ids or [])
|
||||
filtered_doc_ids = meta_filter(
|
||||
metas,
|
||||
convert_conditions(metadata_condition),
|
||||
@ -1084,7 +1084,7 @@ async def retrieval_test_embedded():
|
||||
chat_mdl = LLMBundle(tenant_id, LLMType.CHAT)
|
||||
|
||||
if meta_data_filter:
|
||||
metas = DocumentService.get_meta_by_kbs(kb_ids)
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
|
||||
local_doc_ids = await apply_meta_data_filter(meta_data_filter, metas, _question, chat_mdl, local_doc_ids)
|
||||
|
||||
tenants = UserTenantService.query(user_id=tenant_id)
|
||||
|
||||
Reference in New Issue
Block a user