Put document metadata in ES/Infinity (#12826)

### What problem does this PR solve?

Store document metadata in ES/Infinity instead of the `meta_fields` JSON column on the `document` table, behind a new `DocMetadataService`.

Metadata index name: `ragflow_doc_meta_{tenant_id}` (one index per tenant).
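
A minimal sketch of how the per-tenant index name is derived and torn down, matching the calls to `DocMetadataService._get_doc_meta_index_name` and `settings.docStoreConn.delete_idx` in the diff below (the helper body is an assumption, not copied from the commit):

```python
# Sketch only: the naming pattern from this PR, plus the teardown call used
# in delete_user_data below. The helper body is assumed, not the committed code.
def get_doc_meta_index_name(tenant_id: str) -> str:
    return f"ragflow_doc_meta_{tenant_id}"  # one metadata index per tenant

# During tenant cleanup the whole index is dropped:
#   settings.docStoreConn.delete_idx(get_doc_meta_index_name(tenant_id), "")
```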

### Type of change

- [x] Refactoring
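
For reviewers: the two aggregation helpers that replace the old `DocumentService` ones differ in how list values are keyed. The shapes below follow the docstrings of the legacy implementations removed further down in this diff; the doc ids are illustrative:

```python
# A document "doc_1" carrying {"tags": ["foo", "bar"], "author": "alice"}:

# get_meta_by_kbs: legacy semantics per its docstring, a list is kept
# as one stringified key.
legacy = {"tags": {"['foo', 'bar']": ["doc_1"]}, "author": {"alice": ["doc_1"]}}

# get_flatted_meta_by_kbs: list values expanded into individual entries.
flatted = {
    "tags": {"foo": ["doc_1"], "bar": ["doc_1"]},
    "author": {"alice": ["doc_1"]},
}
```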
Author: qinling0210
Date: 2026-01-28 13:29:34 +08:00 (committed by GitHub)
Parent: fd11aca8e5
Commit: 9a5208976c
24 changed files with 1529 additions and 304 deletions

View File

@@ -22,6 +22,7 @@ import xxhash
from quart import request
from api.db.services.document_service import DocumentService
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from common.metadata_utils import apply_meta_data_filter
@@ -381,7 +382,7 @@ async def retrieval_test():
chat_mdl = LLMBundle(user_id, LLMType.CHAT)
if meta_data_filter:
metas = DocumentService.get_meta_by_kbs(kb_ids)
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
local_doc_ids = await apply_meta_data_filter(meta_data_filter, metas, question, chat_mdl, local_doc_ids)
tenants = UserTenantService.query(user_id=user_id)

View File

@@ -26,6 +26,7 @@ from api.db import VALID_FILE_TYPES, FileType
from api.db.db_models import Task
from api.db.services import duplicate_name
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.db.services.doc_metadata_service import DocMetadataService
from common.metadata_utils import meta_filter, convert_conditions, turn2jsonschema
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
@@ -281,7 +282,7 @@ async def list_docs():
doc_ids_filter = None
metas = None
if metadata_condition or metadata:
metas = DocumentService.get_flatted_meta_by_kbs([kb_id])
metas = DocMetadataService.get_flatted_meta_by_kbs([kb_id])
if metadata_condition:
doc_ids_filter = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
@@ -401,7 +402,11 @@ async def doc_infos():
if not DocumentService.accessible(doc_id, current_user.id):
return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)
docs = DocumentService.get_by_ids(doc_ids)
return get_json_result(data=list(docs.dicts()))
docs_list = list(docs.dicts())
# Add meta_fields for each document
for doc in docs_list:
doc["meta_fields"] = DocMetadataService.get_document_metadata(doc["id"])
return get_json_result(data=docs_list)
@manager.route("/metadata/summary", methods=["POST"]) # noqa: F821
@@ -421,7 +426,7 @@ async def metadata_summary():
return get_json_result(data=False, message="Only owner of dataset authorized for this operation.", code=RetCode.OPERATING_ERROR)
try:
summary = DocumentService.get_metadata_summary(kb_id, doc_ids)
summary = DocMetadataService.get_metadata_summary(kb_id, doc_ids)
return get_json_result(data={"summary": summary})
except Exception as e:
return server_error_response(e)
@@ -432,10 +437,14 @@ async def metadata_summary():
@validate_request("doc_ids")
async def metadata_update():
req = await get_request_json()
kb_id = req.get("kb_id")
document_ids = req.get("doc_ids")
updates = req.get("updates", []) or []
deletes = req.get("deletes", []) or []
if not kb_id:
return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)
if not isinstance(updates, list) or not isinstance(deletes, list):
return get_json_result(data=False, message="updates and deletes must be lists.", code=RetCode.ARGUMENT_ERROR)
@@ -446,8 +455,8 @@ async def metadata_update():
if not isinstance(d, dict) or not d.get("key"):
return get_json_result(data=False, message="Each delete requires key.", code=RetCode.ARGUMENT_ERROR)
updated = DocumentService.batch_update_metadata(None, document_ids, updates, deletes)
return get_json_result(data={"updated": updated})
updated = DocMetadataService.batch_update_metadata(kb_id, document_ids, updates, deletes)
return get_json_result(data={"updated": updated, "matched_docs": len(document_ids)})
@manager.route("/update_metadata_setting", methods=["POST"]) # noqa: F821
@@ -905,7 +914,7 @@ async def set_meta():
if not e:
return get_data_error_result(message="Document not found!")
if not DocumentService.update_by_id(req["doc_id"], {"meta_fields": meta}):
if not DocMetadataService.update_document_metadata(req["doc_id"], meta):
return get_data_error_result(message="Database error (meta updates)!")
return get_json_result(data=True)

View File

@@ -25,6 +25,7 @@ import numpy as np
from api.db.services.connector_service import Connector2KbService
from api.db.services.llm_service import LLMBundle
from api.db.services.document_service import DocumentService, queue_raptor_o_graphrag_tasks
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
@@ -467,7 +468,7 @@ def get_meta():
message='No authorization.',
code=RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=DocumentService.get_meta_by_kbs(kb_ids))
return get_json_result(data=DocMetadataService.get_flatted_meta_by_kbs(kb_ids))
@manager.route("/basic_info", methods=["GET"]) # noqa: F821

View File

@@ -18,6 +18,7 @@ import logging
from quart import jsonify
from api.db.services.document_service import DocumentService
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from common.metadata_utils import meta_filter, convert_conditions
@@ -121,7 +122,7 @@ async def retrieval(tenant_id):
similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0))
top = int(retrieval_setting.get("top_k", 1024))
metadata_condition = req.get("metadata_condition", {}) or {}
metas = DocumentService.get_meta_by_kbs([kb_id])
metas = DocMetadataService.get_meta_by_kbs([kb_id])
doc_ids = []
try:

View File

@@ -29,6 +29,7 @@ from api.constants import FILE_NAME_LEN_LIMIT
from api.db import FileType
from api.db.db_models import File, Task
from api.db.services.document_service import DocumentService
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
@@ -255,7 +256,8 @@ async def update_doc(tenant_id, dataset_id, document_id):
if "meta_fields" in req:
if not isinstance(req["meta_fields"], dict):
return get_error_data_result(message="meta_fields must be a dictionary")
DocumentService.update_meta_fields(document_id, req["meta_fields"])
if not DocMetadataService.update_document_metadata(document_id, req["meta_fields"]):
return get_error_data_result(message="Failed to update metadata")
if "name" in req and req["name"] != doc.name:
if len(req["name"].encode("utf-8")) > FILE_NAME_LEN_LIMIT:
@@ -568,7 +570,7 @@ def list_docs(dataset_id, tenant_id):
doc_ids_filter = None
if metadata_condition:
metas = DocumentService.get_flatted_meta_by_kbs([dataset_id])
metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id])
doc_ids_filter = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
if metadata_condition.get("conditions") and not doc_ids_filter:
return get_result(data={"total": 0, "docs": []})
@@ -611,7 +613,7 @@ async def metadata_summary(dataset_id, tenant_id):
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
req = await get_request_json()
try:
summary = DocumentService.get_metadata_summary(dataset_id, req.get("doc_ids"))
summary = DocMetadataService.get_metadata_summary(dataset_id, req.get("doc_ids"))
return get_result(data={"summary": summary})
except Exception as e:
return server_error_response(e)
@@ -657,14 +659,14 @@ async def metadata_batch_update(dataset_id, tenant_id):
target_doc_ids = set(document_ids)
if metadata_condition:
metas = DocumentService.get_flatted_meta_by_kbs([dataset_id])
metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id])
filtered_ids = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
target_doc_ids = target_doc_ids & filtered_ids
if metadata_condition.get("conditions") and not target_doc_ids:
return get_result(data={"updated": 0, "matched_docs": 0})
target_doc_ids = list(target_doc_ids)
updated = DocumentService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
updated = DocMetadataService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)})
@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821
@@ -1534,7 +1536,7 @@ async def retrieval_test(tenant_id):
if not doc_ids:
metadata_condition = req.get("metadata_condition")
if metadata_condition:
metas = DocumentService.get_meta_by_kbs(kb_ids)
metas = DocMetadataService.get_meta_by_kbs(kb_ids)
doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
# If metadata_condition has conditions but no docs match, return empty result
if not doc_ids and metadata_condition.get("conditions"):

View File

@@ -35,7 +35,7 @@ from api.db.services.conversation_service import ConversationService
from api.db.services.conversation_service import async_iframe_completion as iframe_completion
from api.db.services.conversation_service import async_completion as rag_completion
from api.db.services.dialog_service import DialogService, async_ask, async_chat, gen_mindmap
from api.db.services.document_service import DocumentService
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from common.metadata_utils import apply_meta_data_filter, convert_conditions, meta_filter
@@ -147,7 +147,7 @@ async def chat_completion(tenant_id, chat_id):
return get_error_data_result(message="metadata_condition must be an object.")
if metadata_condition and req.get("question"):
metas = DocumentService.get_meta_by_kbs(dia.kb_ids or [])
metas = DocMetadataService.get_flatted_meta_by_kbs(dia.kb_ids or [])
filtered_doc_ids = meta_filter(
metas,
convert_conditions(metadata_condition),
@@ -279,7 +279,7 @@ async def chat_completion_openai_like(tenant_id, chat_id):
doc_ids_str = None
if metadata_condition:
metas = DocumentService.get_meta_by_kbs(dia.kb_ids or [])
metas = DocMetadataService.get_flatted_meta_by_kbs(dia.kb_ids or [])
filtered_doc_ids = meta_filter(
metas,
convert_conditions(metadata_condition),
@@ -1084,7 +1084,7 @@ async def retrieval_test_embedded():
chat_mdl = LLMBundle(tenant_id, LLMType.CHAT)
if meta_data_filter:
metas = DocumentService.get_meta_by_kbs(kb_ids)
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
local_doc_ids = await apply_meta_data_filter(meta_data_filter, metas, _question, chat_mdl, local_doc_ids)
tenants = UserTenantService.query(user_id=tenant_id)

View File

@@ -795,7 +795,6 @@ class Document(DataBaseModel):
progress_msg = TextField(null=True, help_text="process message", default="")
process_begin_at = DateTimeField(null=True, index=True)
process_duration = FloatField(default=0)
meta_fields = JSONField(null=True, default={})
suffix = CharField(max_length=32, null=False, help_text="The real file extension suffix", index=True)
run = CharField(max_length=1, null=True, help_text="start to run processing or cancel.(1: run it; 2: cancel)", default="0", index=True)
@@ -1267,7 +1266,6 @@ def migrate_db():
alter_db_add_column(migrator, "task", "digest", TextField(null=True, help_text="task digest", default=""))
alter_db_add_column(migrator, "task", "chunk_ids", LongTextField(null=True, help_text="chunk ids", default=""))
alter_db_add_column(migrator, "conversation", "user_id", CharField(max_length=255, null=True, help_text="user_id", index=True))
alter_db_add_column(migrator, "document", "meta_fields", JSONField(null=True, default={}))
alter_db_add_column(migrator, "task", "task_type", CharField(max_length=32, null=False, default=""))
alter_db_add_column(migrator, "task", "priority", IntegerField(default=0))
alter_db_add_column(migrator, "user_canvas", "permission", CharField(max_length=16, null=False, help_text="me|team", default="me", index=True))

View File

@@ -23,6 +23,7 @@ from api.db.services.canvas_service import UserCanvasService
from api.db.services.conversation_service import ConversationService
from api.db.services.dialog_service import DialogService
from api.db.services.document_service import DocumentService
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.langfuse_service import TenantLangfuseService
@@ -107,6 +108,11 @@ def create_new_user(user_info: dict) -> dict:
except Exception as create_error:
logging.exception(create_error)
# rollback
try:
metadata_index_name = DocMetadataService._get_doc_meta_index_name(user_id)
settings.docStoreConn.delete_idx(metadata_index_name, "")
except Exception as e:
logging.exception(e)
try:
TenantService.delete_by_id(user_id)
except Exception as e:
@@ -165,6 +171,12 @@ def delete_user_data(user_id: str) -> dict:
# step1.1.2 delete file and document info in db
doc_ids = DocumentService.get_all_doc_ids_by_kb_ids(kb_ids)
if doc_ids:
for doc in doc_ids:
try:
DocMetadataService.delete_document_metadata(doc["id"], skip_empty_check=True)
except Exception as e:
logging.warning(f"Failed to delete metadata for document {doc['id']}: {e}")
doc_delete_res = DocumentService.delete_by_ids([i["id"] for i in doc_ids])
done_msg += f"- Deleted {doc_delete_res} document records.\n"
task_delete_res = TaskService.delete_by_doc_ids([i["id"] for i in doc_ids])
@@ -202,6 +214,13 @@ def delete_user_data(user_id: str) -> dict:
done_msg += f"- Deleted {llm_delete_res} tenant-LLM records.\n"
langfuse_delete_res = TenantLangfuseService.delete_ty_tenant_id(tenant_id)
done_msg += f"- Deleted {langfuse_delete_res} langfuse records.\n"
try:
metadata_index_name = DocMetadataService._get_doc_meta_index_name(tenant_id)
settings.docStoreConn.delete_idx(metadata_index_name, "")
done_msg += f"- Deleted metadata table {metadata_index_name}.\n"
except Exception as e:
logging.warning(f"Failed to delete metadata table for tenant {tenant_id}: {e}")
done_msg += "- Warning: Failed to delete metadata table (continuing).\n"
# step1.3 delete memory and messages
user_memory = MemoryService.get_by_tenant_id(tenant_id)
if user_memory:
@@ -269,6 +288,11 @@ def delete_user_data(user_id: str) -> dict:
# step2.1.5 delete document record
doc_delete_res = DocumentService.delete_by_ids([d['id'] for d in created_documents])
done_msg += f"- Deleted {doc_delete_res} documents.\n"
for doc in created_documents:
try:
DocMetadataService.delete_document_metadata(doc['id'])
except Exception as e:
logging.warning(f"Failed to delete metadata for document {doc['id']}: {e}")
# step2.1.6 update dataset doc&chunk&token cnt
for kb_id, doc_num in kb_doc_info.items():
KnowledgebaseService.decrease_document_num_in_delete(kb_id, doc_num)

View File

@@ -25,6 +25,7 @@ from api.db import InputType
from api.db.db_models import Connector, SyncLogs, Connector2Kb, Knowledgebase
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.db.services.doc_metadata_service import DocMetadataService
from common.misc_utils import get_uuid
from common.constants import TaskStatus
from common.time_utils import current_timestamp, timestamp_to_date
@@ -227,7 +228,7 @@ class SyncLogsService(CommonService):
# Set metadata if available for this document
if doc["name"] in metadata_map:
DocumentService.update_by_id(doc["id"], {"meta_fields": metadata_map[doc["name"]]})
DocMetadataService.update_document_metadata(doc["id"], metadata_map[doc["name"]])
if not auto_parse or auto_parse == "0":
continue

View File

@@ -28,7 +28,7 @@ from api.db.services.file_service import FileService
from common.constants import LLMType, ParserType, StatusEnum
from api.db.db_models import DB, Dialog
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.langfuse_service import TenantLangfuseService
from api.db.services.llm_service import LLMBundle
@@ -355,7 +355,7 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
questions = [await cross_languages(dialog.tenant_id, dialog.llm_id, questions[0], prompt_config["cross_languages"])]
if dialog.meta_data_filter:
metas = DocumentService.get_meta_by_kbs(dialog.kb_ids)
metas = DocMetadataService.get_flatted_meta_by_kbs(dialog.kb_ids)
attachments = await apply_meta_data_filter(
dialog.meta_data_filter,
metas,
@@ -1048,7 +1048,7 @@ async def async_ask(question, kb_ids, tenant_id, chat_llm_name=None, search_conf
tenant_ids = list(set([kb.tenant_id for kb in kbs]))
if meta_data_filter:
metas = DocumentService.get_meta_by_kbs(kb_ids)
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
doc_ids = await apply_meta_data_filter(meta_data_filter, metas, question, chat_mdl, doc_ids)
kbinfos = await retriever.retrieval(
@@ -1124,7 +1124,7 @@ async def gen_mindmap(question, kb_ids, tenant_id, search_config={}):
rerank_mdl = LLMBundle(tenant_id, LLMType.RERANK, rerank_id)
if meta_data_filter:
metas = DocumentService.get_meta_by_kbs(kb_ids)
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
doc_ids = await apply_meta_data_filter(meta_data_filter, metas, question, chat_mdl, doc_ids)
ranks = await settings.retriever.retrieval(

File diff suppressed because it is too large.
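
Judging by file ordering, the suppressed file is most likely the new `api/db/services/doc_metadata_service.py`. A rough skeleton of its surface, inferred solely from the call sites elsewhere in this diff; the method bodies are illustrative stubs, not the committed code:

```python
# Hypothetical skeleton of DocMetadataService, reconstructed from call sites
# in this diff; all bodies are stubs, not the committed implementation.
class DocMetadataService:
    @staticmethod
    def _get_doc_meta_index_name(tenant_id: str): ...  # "ragflow_doc_meta_{tenant_id}"

    # Aggregations over one or more knowledge bases.
    @classmethod
    def get_meta_by_kbs(cls, kb_ids): ...          # legacy list-as-string semantics
    @classmethod
    def get_flatted_meta_by_kbs(cls, kb_ids): ...  # expands list values per entry
    @classmethod
    def get_metadata_summary(cls, kb_id, doc_ids=None): ...

    # Per-document reads and writes against the metadata index.
    @classmethod
    def get_document_metadata(cls, doc_id): ...
    @classmethod
    def get_metadata_for_documents(cls, doc_ids, kb_id): ...
    @classmethod
    def update_document_metadata(cls, doc_id, meta_fields): ...  # truthy on success
    @classmethod
    def batch_update_metadata(cls, kb_id, doc_ids, updates, deletes): ...  # updated count
    @classmethod
    def delete_document_metadata(cls, doc_id, skip_empty_check=False): ...
```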

View File

@@ -33,7 +33,7 @@ from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTena
from api.db.db_utils import bulk_insert_into_db
from api.db.services.common_service import CommonService
from api.db.services.knowledgebase_service import KnowledgebaseService
from common.metadata_utils import dedupe_list
from api.db.services.doc_metadata_service import DocMetadataService
from common.misc_utils import get_uuid
from common.time_utils import current_timestamp, get_format_time
from common.constants import LLMType, ParserType, StatusEnum, TaskStatus, SVR_CONSUMER_GROUP_NAME
@@ -67,7 +67,6 @@ class DocumentService(CommonService):
cls.model.progress_msg,
cls.model.process_begin_at,
cls.model.process_duration,
cls.model.meta_fields,
cls.model.suffix,
cls.model.run,
cls.model.status,
@@ -154,8 +153,11 @@ class DocumentService(CommonService):
docs = docs.where(cls.model.type.in_(types))
if suffix:
docs = docs.where(cls.model.suffix.in_(suffix))
if return_empty_metadata:
docs = docs.where(fn.COALESCE(fn.JSON_LENGTH(cls.model.meta_fields), 0) == 0)
metadata_map = DocMetadataService.get_metadata_for_documents(None, kb_id)
doc_ids_with_metadata = set(metadata_map.keys())
if return_empty_metadata and doc_ids_with_metadata:
docs = docs.where(cls.model.id.not_in(doc_ids_with_metadata))
count = docs.count()
if desc:
@@ -166,7 +168,14 @@ class DocumentService(CommonService):
if page_number and items_per_page:
docs = docs.paginate(page_number, items_per_page)
return list(docs.dicts()), count
docs_list = list(docs.dicts())
if return_empty_metadata:
for doc in docs_list:
doc["meta_fields"] = {}
else:
for doc in docs_list:
doc["meta_fields"] = metadata_map.get(doc["id"], {})
return docs_list, count
@classmethod
@DB.connection_context()
@@ -212,7 +221,7 @@ class DocumentService(CommonService):
if suffix:
query = query.where(cls.model.suffix.in_(suffix))
rows = query.select(cls.model.run, cls.model.suffix, cls.model.meta_fields)
rows = query.select(cls.model.run, cls.model.suffix, cls.model.id)
total = rows.count()
suffix_counter = {}
@@ -220,10 +229,18 @@ class DocumentService(CommonService):
metadata_counter = {}
empty_metadata_count = 0
doc_ids = [row.id for row in rows]
metadata = {}
if doc_ids:
try:
metadata = DocMetadataService.get_metadata_for_documents(doc_ids, kb_id)
except Exception as e:
logging.warning(f"Failed to fetch metadata from ES/Infinity: {e}")
for row in rows:
suffix_counter[row.suffix] = suffix_counter.get(row.suffix, 0) + 1
run_status_counter[str(row.run)] = run_status_counter.get(str(row.run), 0) + 1
meta_fields = row.meta_fields or {}
meta_fields = metadata.get(row.id, {})
if not meta_fields:
empty_metadata_count += 1
continue
@@ -374,6 +391,12 @@ class DocumentService(CommonService):
except Exception as e:
logging.error(f"Failed to delete chunks from doc store for document {doc.id}: {e}")
# Delete document metadata (non-critical, log and continue)
try:
DocMetadataService.delete_document_metadata(doc.id)
except Exception as e:
logging.warning(f"Failed to delete metadata for document {doc.id}: {e}")
# Cleanup knowledge graph references (non-critical, log and continue)
try:
graph_source = settings.docStoreConn.get_fields(
@@ -707,246 +730,6 @@ class DocumentService(CommonService):
cls.update_by_id(doc_id, info)
@classmethod
@DB.connection_context()
def update_meta_fields(cls, doc_id, meta_fields):
return cls.update_by_id(doc_id, {"meta_fields": meta_fields})
@classmethod
@DB.connection_context()
def get_meta_by_kbs(cls, kb_ids):
"""
Legacy metadata aggregator (backward-compatible).
- Does NOT expand list values and a list is kept as one string key.
Example: {"tags": ["foo","bar"]} -> meta["tags"]["['foo', 'bar']"] = [doc_id]
- Expects meta_fields is a dict.
Use when existing callers rely on the old list-as-string semantics.
"""
fields = [
cls.model.id,
cls.model.meta_fields,
]
meta = {}
for r in cls.model.select(*fields).where(cls.model.kb_id.in_(kb_ids)):
doc_id = r.id
for k,v in r.meta_fields.items():
if k not in meta:
meta[k] = {}
if not isinstance(v, list):
v = [v]
for vv in v:
if vv not in meta[k]:
if isinstance(vv, list) or isinstance(vv, dict):
continue
meta[k][vv] = []
meta[k][vv].append(doc_id)
return meta
@classmethod
@DB.connection_context()
def get_flatted_meta_by_kbs(cls, kb_ids):
"""
- Parses stringified JSON meta_fields when possible and skips non-dict or unparsable values.
- Expands list values into individual entries.
Example: {"tags": ["foo","bar"], "author": "alice"} ->
meta["tags"]["foo"] = [doc_id], meta["tags"]["bar"] = [doc_id], meta["author"]["alice"] = [doc_id]
Prefer for metadata_condition filtering and scenarios that must respect list semantics.
"""
fields = [
cls.model.id,
cls.model.meta_fields,
]
meta = {}
for r in cls.model.select(*fields).where(cls.model.kb_id.in_(kb_ids)):
doc_id = r.id
meta_fields = r.meta_fields or {}
if isinstance(meta_fields, str):
try:
meta_fields = json.loads(meta_fields)
except Exception:
continue
if not isinstance(meta_fields, dict):
continue
for k, v in meta_fields.items():
if k not in meta:
meta[k] = {}
values = v if isinstance(v, list) else [v]
for vv in values:
if vv is None:
continue
sv = str(vv)
if sv not in meta[k]:
meta[k][sv] = []
meta[k][sv].append(doc_id)
return meta
@classmethod
@DB.connection_context()
def get_metadata_summary(cls, kb_id, document_ids=None):
def _meta_value_type(value):
if value is None:
return None
if isinstance(value, list):
return "list"
if isinstance(value, bool):
return "string"
if isinstance(value, (int, float)):
return "number"
if re.match(r"\d{4}\-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}", str(value)):
return "time"
return "string"
fields = [cls.model.id, cls.model.meta_fields]
summary = {}
type_counter = {}
query = cls.model.select(*fields).where(cls.model.kb_id == kb_id)
if document_ids:
query = query.where(cls.model.id.in_(document_ids))
for r in query:
meta_fields = r.meta_fields or {}
if isinstance(meta_fields, str):
try:
meta_fields = json.loads(meta_fields)
except Exception:
continue
if not isinstance(meta_fields, dict):
continue
for k, v in meta_fields.items():
value_type = _meta_value_type(v)
if value_type:
if k not in type_counter:
type_counter[k] = {}
type_counter[k][value_type] = type_counter[k].get(value_type, 0) + 1
values = v if isinstance(v, list) else [v]
for vv in values:
if not vv:
continue
sv = str(vv)
if k not in summary:
summary[k] = {}
summary[k][sv] = summary[k].get(sv, 0) + 1
result = {}
for k, v in summary.items():
values = sorted([(val, cnt) for val, cnt in v.items()], key=lambda x: x[1], reverse=True)
type_counts = type_counter.get(k, {})
value_type = "string"
if type_counts:
value_type = max(type_counts.items(), key=lambda item: item[1])[0]
result[k] = {"type": value_type, "values": values}
return result
@classmethod
@DB.connection_context()
def batch_update_metadata(cls, kb_id, doc_ids, updates=None, deletes=None, adds=None):
updates = updates or []
deletes = deletes or []
if not doc_ids:
return 0
def _normalize_meta(meta):
if isinstance(meta, str):
try:
meta = json.loads(meta)
except Exception:
return {}
if not isinstance(meta, dict):
return {}
return deepcopy(meta)
def _str_equal(a, b):
return str(a) == str(b)
def _apply_updates(meta):
changed = False
for upd in updates:
key = upd.get("key")
if not key:
continue
new_value = upd.get("value")
match_provided = upd.get("match")
if key not in meta:
if match_provided:
continue
meta[key] = dedupe_list(new_value) if isinstance(new_value, list) else new_value
changed = True
continue
if isinstance(meta[key], list):
if not match_provided:
if isinstance(new_value, list):
meta[key] = dedupe_list(new_value)
else:
meta[key].append(new_value)
changed = True
else:
match_value = upd.get("match")
replaced = False
new_list = []
for item in meta[key]:
if _str_equal(item, match_value):
new_list.append(new_value)
replaced = True
else:
new_list.append(item)
if replaced:
meta[key] = dedupe_list(new_list)
changed = True
else:
if not match_provided:
meta[key] = new_value
changed = True
else:
match_value = upd.get("match")
if _str_equal(meta[key], match_value):
meta[key] = new_value
changed = True
return changed
def _apply_deletes(meta):
changed = False
for d in deletes:
key = d.get("key")
if not key or key not in meta:
continue
value = d.get("value", None)
if isinstance(meta[key], list):
if value is None:
del meta[key]
changed = True
continue
new_list = [item for item in meta[key] if not _str_equal(item, value)]
if len(new_list) != len(meta[key]):
if new_list:
meta[key] = new_list
else:
del meta[key]
changed = True
else:
if value is None or _str_equal(meta[key], value):
del meta[key]
changed = True
return changed
updated_docs = 0
with DB.atomic():
rows = cls.model.select(cls.model.id, cls.model.meta_fields).where(
cls.model.id.in_(doc_ids)
)
for r in rows:
meta = _normalize_meta(r.meta_fields or {})
original_meta = deepcopy(meta)
changed = _apply_updates(meta)
changed = _apply_deletes(meta) or changed
if changed and meta != original_meta:
cls.model.update(
meta_fields=meta,
update_time=current_timestamp(),
update_date=get_format_time()
).where(cls.model.id == r.id).execute()
updated_docs += 1
return updated_docs
@classmethod
@DB.connection_context()
def update_progress(cls):