Mirror of https://github.com/infiniflow/ragflow.git, synced 2026-01-23 03:26:53 +08:00
Feat: enhance metadata arranging. (#12745)
### What problem does this PR solve?

#11564

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
@@ -26,7 +26,7 @@ from api.db import VALID_FILE_TYPES, FileType
 from api.db.db_models import Task
 from api.db.services import duplicate_name
 from api.db.services.document_service import DocumentService, doc_upload_and_parse
-from common.metadata_utils import meta_filter, convert_conditions
+from common.metadata_utils import meta_filter, convert_conditions, turn2jsonschema
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
@@ -226,6 +226,7 @@ async def list_docs():
     kb_id = request.args.get("kb_id")
     if not kb_id:
         return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)

     tenants = UserTenantService.query(user_id=current_user.id)
     for tenant in tenants:
         if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id):
@@ -345,6 +346,8 @@ async def list_docs():
                 doc_item["thumbnail"] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
             if doc_item.get("source_type"):
                 doc_item["source_type"] = doc_item["source_type"].split("/")[0]
+            if doc_item["parser_config"].get("metadata"):
+                doc_item["parser_config"]["metadata"] = turn2jsonschema(doc_item["parser_config"]["metadata"])

         return get_json_result(data={"total": tol, "docs": docs})
     except Exception as e:
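The two added lines run the stored `parser_config["metadata"]` definition through `turn2jsonschema` before the document list is returned. The real `turn2jsonschema` lives in `common/metadata_utils.py` and is not part of this diff, so the sketch below is only a hypothetical stand-in to illustrate the kind of conversion the name suggests; the actual stored format and output may differ.

```python
# Hypothetical stand-in for common.metadata_utils.turn2jsonschema.
# Neither the real implementation nor the stored metadata format is shown in this diff;
# this only illustrates a flat {field_name: type_name} -> JSON-Schema-style conversion.
def to_jsonschema_sketch(metadata_def: dict) -> dict:
    type_map = {
        "string": {"type": "string"},
        "number": {"type": "number"},
        "list": {"type": "array", "items": {"type": "string"}},
    }
    return {
        "type": "object",
        "properties": {name: dict(type_map.get(t, {"type": "string"}))
                       for name, t in metadata_def.items()},
    }

# to_jsonschema_sketch({"author": "string", "year": "number"})
# -> {"type": "object", "properties": {"author": {"type": "string"}, "year": {"type": "number"}}}
```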
@@ -406,6 +409,7 @@ async def doc_infos():
 async def metadata_summary():
     req = await get_request_json()
     kb_id = req.get("kb_id")
+    doc_ids = req.get("doc_ids")
     if not kb_id:
         return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)

@@ -417,7 +421,7 @@ async def metadata_summary():
         return get_json_result(data=False, message="Only owner of dataset authorized for this operation.", code=RetCode.OPERATING_ERROR)

     try:
-        summary = DocumentService.get_metadata_summary(kb_id)
+        summary = DocumentService.get_metadata_summary(kb_id, doc_ids)
         return get_json_result(data={"summary": summary})
     except Exception as e:
         return server_error_response(e)
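For reference, a minimal client-side sketch of how the extended summary handler could be exercised. Only the JSON body fields (`kb_id`, and the new optional `doc_ids`) and the `/v1/document` prefix (visible in the thumbnail URL above) come from the diff; the exact route name, host, auth header and response envelope are assumptions.

```python
# Sketch only: route name, host, auth and response envelope are assumptions;
# the body fields kb_id / doc_ids come from the handler above.
import requests

session = requests.Session()
session.headers.update({"Authorization": "<session token>"})  # placeholder auth

resp = session.post(
    "http://localhost:9380/v1/document/metadata/summary",  # assumed route
    json={
        "kb_id": "<dataset id>",
        "doc_ids": ["<doc id 1>", "<doc id 2>"],  # new optional filter added in this PR
    },
)
print(resp.json().get("data", {}).get("summary"))
```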
@@ -425,36 +429,16 @@ async def metadata_summary():

 @manager.route("/metadata/update", methods=["POST"])  # noqa: F821
 @login_required
+@validate_request("doc_ids")
 async def metadata_update():
     req = await get_request_json()
-    kb_id = req.get("kb_id")
-    if not kb_id:
-        return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)
-
-    tenants = UserTenantService.query(user_id=current_user.id)
-    for tenant in tenants:
-        if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id):
-            break
-    else:
-        return get_json_result(data=False, message="Only owner of dataset authorized for this operation.", code=RetCode.OPERATING_ERROR)
-
-    selector = req.get("selector", {}) or {}
+    document_ids = req.get("doc_ids")
     updates = req.get("updates", []) or []
     deletes = req.get("deletes", []) or []

-    if not isinstance(selector, dict):
-        return get_json_result(data=False, message="selector must be an object.", code=RetCode.ARGUMENT_ERROR)
     if not isinstance(updates, list) or not isinstance(deletes, list):
         return get_json_result(data=False, message="updates and deletes must be lists.", code=RetCode.ARGUMENT_ERROR)

-    metadata_condition = selector.get("metadata_condition", {}) or {}
-    if metadata_condition and not isinstance(metadata_condition, dict):
-        return get_json_result(data=False, message="metadata_condition must be an object.", code=RetCode.ARGUMENT_ERROR)
-
-    document_ids = selector.get("document_ids", []) or []
-    if document_ids and not isinstance(document_ids, list):
-        return get_json_result(data=False, message="document_ids must be a list.", code=RetCode.ARGUMENT_ERROR)
-
     for upd in updates:
         if not isinstance(upd, dict) or not upd.get("key") or "value" not in upd:
             return get_json_result(data=False, message="Each update requires key and value.", code=RetCode.ARGUMENT_ERROR)
@@ -462,24 +446,8 @@ async def metadata_update():
         if not isinstance(d, dict) or not d.get("key"):
             return get_json_result(data=False, message="Each delete requires key.", code=RetCode.ARGUMENT_ERROR)

-    kb_doc_ids = KnowledgebaseService.list_documents_by_ids([kb_id])
-    target_doc_ids = set(kb_doc_ids)
-    if document_ids:
-        invalid_ids = set(document_ids) - set(kb_doc_ids)
-        if invalid_ids:
-            return get_json_result(data=False, message=f"These documents do not belong to dataset {kb_id}: {', '.join(invalid_ids)}", code=RetCode.ARGUMENT_ERROR)
-        target_doc_ids = set(document_ids)
-
-    if metadata_condition:
-        metas = DocumentService.get_flatted_meta_by_kbs([kb_id])
-        filtered_ids = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
-        target_doc_ids = target_doc_ids & filtered_ids
-        if metadata_condition.get("conditions") and not target_doc_ids:
-            return get_json_result(data={"updated": 0, "matched_docs": 0})
-
-    target_doc_ids = list(target_doc_ids)
-    updated = DocumentService.batch_update_metadata(kb_id, target_doc_ids, updates, deletes)
-    return get_json_result(data={"updated": updated, "matched_docs": len(target_doc_ids)})
+    updated = DocumentService.batch_update_metadata(None, document_ids, updates, deletes)
+    return get_json_result(data={"updated": updated})


 @manager.route("/update_metadata_setting", methods=["POST"])  # noqa: F821
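With the kb_id/selector handling dropped, the handler now validates `doc_ids` up front and forwards `updates`/`deletes` straight to `DocumentService.batch_update_metadata`. A sketch of the request body it appears to accept after this change (host, prefix and auth are assumptions; the field names come from the validation code above, and `match` from `batch_update_metadata` further down):

```python
# Field names (doc_ids, updates[].key/value/match, deletes[].key) are taken from the diff;
# the URL, auth header and response envelope are assumptions.
import requests

payload = {
    "doc_ids": ["<doc id 1>", "<doc id 2>"],                  # required (@validate_request("doc_ids"))
    "updates": [
        {"key": "author", "value": "Alice"},                   # set or overwrite a metadata key
        {"key": "status", "value": "done", "match": "draft"},  # optional 'match', see batch_update_metadata
    ],
    "deletes": [
        {"key": "obsolete_field"},                             # each delete only needs a key
    ],
}

resp = requests.post(
    "http://localhost:9380/v1/document/metadata/update",  # assumed host/prefix
    json=payload,
    headers={"Authorization": "<session token>"},         # placeholder auth
)
print(resp.json())  # handler returns {"updated": <count>} inside the usual envelope
```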
@@ -18,6 +18,7 @@ import logging
 import random
 import re

+from common.metadata_utils import turn2jsonschema
 from quart import request
 import numpy as np

@@ -218,6 +219,8 @@ def detail():
                                    message="Can't find this dataset!")
         kb["size"] = DocumentService.get_total_size_by_kb_id(kb_id=kb["id"], keywords="", run_status=[], types=[])
         kb["connectors"] = Connector2KbService.list_connectors(kb_id)
+        if kb["parser_config"].get("metadata"):
+            kb["parser_config"]["metadata"] = turn2jsonschema(kb["parser_config"]["metadata"])

         for key in ["graphrag_task_finish_at", "raptor_task_finish_at", "mindmap_task_finish_at"]:
             if finish_at := kb.get(key):
@@ -606,12 +606,12 @@ def list_docs(dataset_id, tenant_id):

 @manager.route("/datasets/<dataset_id>/metadata/summary", methods=["GET"])  # noqa: F821
 @token_required
-def metadata_summary(dataset_id, tenant_id):
+async def metadata_summary(dataset_id, tenant_id):
     if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
         return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")

+    req = await get_request_json()
     try:
-        summary = DocumentService.get_metadata_summary(dataset_id)
+        summary = DocumentService.get_metadata_summary(dataset_id, req.get("doc_ids"))
         return get_result(data={"summary": summary})
     except Exception as e:
         return server_error_response(e)
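On the HTTP-API side, `metadata_summary` becomes async and forwards an optional `doc_ids` list from the request body. A usage sketch under the usual assumptions (base URL and `Bearer` API-key header are not part of this diff); note the route is registered as GET yet reads a JSON body:

```python
# Route path, doc_ids field and the {"summary": ...} payload come from the diff;
# base URL, auth header and response envelope are assumptions.
import requests

dataset_id = "<dataset id>"
resp = requests.get(
    f"http://localhost:9380/api/v1/datasets/{dataset_id}/metadata/summary",
    headers={"Authorization": "Bearer <api key>"},
    json={"doc_ids": ["<doc id>"]},  # optional; omit to summarize the whole dataset
)
print(resp.json().get("data", {}).get("summary"))
```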
@@ -377,7 +377,7 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
     logging.debug("Proceeding with retrieval")
     tenant_ids = list(set([kb.tenant_id for kb in kbs]))
     knowledges = []
-    if prompt_config.get("reasoning", False):
+    if prompt_config.get("reasoning", False) or kwargs.get("reasoning"):
         reasoner = DeepResearcher(
             chat_mdl,
             prompt_config,
@@ -776,10 +776,25 @@ class DocumentService(CommonService):

     @classmethod
     @DB.connection_context()
-    def get_metadata_summary(cls, kb_id):
+    def get_metadata_summary(cls, kb_id, document_ids=None):
+        def _meta_value_type(value):
+            if value is None:
+                return None
+            if isinstance(value, list):
+                return "list"
+            if isinstance(value, bool):
+                return "string"
+            if isinstance(value, (int, float)):
+                return "number"
+            return "string"
+
         fields = [cls.model.id, cls.model.meta_fields]
         summary = {}
-        for r in cls.model.select(*fields).where(cls.model.kb_id == kb_id):
+        type_counter = {}
+        query = cls.model.select(*fields).where(cls.model.kb_id == kb_id)
+        if document_ids:
+            query = query.where(cls.model.id.in_(document_ids))
+        for r in query:
             meta_fields = r.meta_fields or {}
             if isinstance(meta_fields, str):
                 try:
@@ -789,6 +804,11 @@ class DocumentService(CommonService):
             if not isinstance(meta_fields, dict):
                 continue
             for k, v in meta_fields.items():
+                value_type = _meta_value_type(v)
+                if value_type:
+                    if k not in type_counter:
+                        type_counter[k] = {}
+                    type_counter[k][value_type] = type_counter[k].get(value_type, 0) + 1
                 values = v if isinstance(v, list) else [v]
                 for vv in values:
                     if not vv:
@@ -797,11 +817,19 @@ class DocumentService(CommonService):
                     if k not in summary:
                         summary[k] = {}
                     summary[k][sv] = summary[k].get(sv, 0) + 1
-        return {k: sorted([(val, cnt) for val, cnt in v.items()], key=lambda x: x[1], reverse=True) for k, v in summary.items()}
+        result = {}
+        for k, v in summary.items():
+            values = sorted([(val, cnt) for val, cnt in v.items()], key=lambda x: x[1], reverse=True)
+            type_counts = type_counter.get(k, {})
+            value_type = "string"
+            if type_counts:
+                value_type = max(type_counts.items(), key=lambda item: item[1])[0]
+            result[k] = {"type": value_type, "values": values}
+        return result

     @classmethod
     @DB.connection_context()
-    def batch_update_metadata(cls, kb_id, doc_ids, updates=None, deletes=None):
+    def batch_update_metadata(cls, kb_id, doc_ids, updates=None, deletes=None, adds=None):
         updates = updates or []
         deletes = deletes or []
         if not doc_ids:
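Taken together, `get_metadata_summary` no longer returns a bare `{key: [(value, count), ...]}` mapping: each key now also carries the dominant value type counted via `_meta_value_type`, and the scan can be restricted to `document_ids`. A standalone sketch of the resulting shape (the sample `meta_fields` are invented, and the per-value string normalization is abbreviated to `str(vv)`):

```python
# Standalone illustration of the new summary shape; sample data is invented.
docs_meta = [
    {"author": "Alice", "year": 2023, "tags": ["ml", "nlp"]},
    {"author": "Bob",   "year": 2024, "tags": ["nlp"]},
    {"author": "Alice"},
]

summary, type_counter = {}, {}
for meta in docs_meta:
    for k, v in meta.items():
        if isinstance(v, list):
            vt = "list"
        elif isinstance(v, bool):
            vt = "string"
        elif isinstance(v, (int, float)):
            vt = "number"
        else:
            vt = "string"
        type_counter.setdefault(k, {})
        type_counter[k][vt] = type_counter[k].get(vt, 0) + 1
        for vv in (v if isinstance(v, list) else [v]):
            summary.setdefault(k, {})
            summary[k][str(vv)] = summary[k].get(str(vv), 0) + 1

result = {
    k: {
        "type": max(type_counter[k].items(), key=lambda it: it[1])[0],
        "values": sorted(vals.items(), key=lambda x: x[1], reverse=True),
    }
    for k, vals in summary.items()
}
# result["author"] == {"type": "string", "values": [("Alice", 2), ("Bob", 1)]}
```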
@@ -826,6 +854,8 @@ class DocumentService(CommonService):
             key = upd.get("key")
             if not key:
                 continue
+            if key not in meta:
+                meta[key] = upd.get("value")

             new_value = upd.get("value")
             match_provided = "match" in upd
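The two added lines give updates upsert semantics: a key that is not yet present in a document's `meta_fields` is created with the provided value before the match/overwrite logic that follows. A minimal, self-contained illustration of just that step (the subsequent `match` handling is deliberately left out):

```python
# Minimal illustration of the upsert step added above; the 'match'/overwrite
# logic that follows in batch_update_metadata is intentionally omitted.
def apply_upsert(meta: dict, updates: list) -> dict:
    for upd in updates:
        key = upd.get("key")
        if not key:
            continue
        if key not in meta:             # new in this PR: missing keys are created
            meta[key] = upd.get("value")
    return meta

print(apply_upsert({"author": "Alice"}, [{"key": "status", "value": "draft"}]))
# -> {'author': 'Alice', 'status': 'draft'}
```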
@@ -895,7 +925,7 @@ class DocumentService(CommonService):
         updated_docs = 0
         with DB.atomic():
             rows = cls.model.select(cls.model.id, cls.model.meta_fields).where(
-                (cls.model.id.in_(doc_ids)) & (cls.model.kb_id == kb_id)
+                cls.model.id.in_(doc_ids)
             )
             for r in rows:
                 meta = _normalize_meta(r.meta_fields or {})