mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-30 15:16:45 +08:00
Put document metadata in ES/Infinity (#12826)
### What problem does this PR solve?
Put document metadata in ES/Infinity.
Index name of metadata: ragflow_doc_meta_{tenant_id}
### Type of change
- [x] Refactoring
This commit is contained in:
@ -25,6 +25,7 @@ from api.db import InputType
|
||||
from api.db.db_models import Connector, SyncLogs, Connector2Kb, Knowledgebase
|
||||
from api.db.services.common_service import CommonService
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.document_service import DocMetadataService
|
||||
from common.misc_utils import get_uuid
|
||||
from common.constants import TaskStatus
|
||||
from common.time_utils import current_timestamp, timestamp_to_date
|
||||
@ -227,7 +228,7 @@ class SyncLogsService(CommonService):
|
||||
|
||||
# Set metadata if available for this document
|
||||
if doc["name"] in metadata_map:
|
||||
DocumentService.update_by_id(doc["id"], {"meta_fields": metadata_map[doc["name"]]})
|
||||
DocMetadataService.update_document_metadata(doc["id"], metadata_map[doc["name"]])
|
||||
|
||||
if not auto_parse or auto_parse == "0":
|
||||
continue
|
||||
|
||||
@ -28,7 +28,7 @@ from api.db.services.file_service import FileService
|
||||
from common.constants import LLMType, ParserType, StatusEnum
|
||||
from api.db.db_models import DB, Dialog
|
||||
from api.db.services.common_service import CommonService
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.langfuse_service import TenantLangfuseService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
@ -355,7 +355,7 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
|
||||
questions = [await cross_languages(dialog.tenant_id, dialog.llm_id, questions[0], prompt_config["cross_languages"])]
|
||||
|
||||
if dialog.meta_data_filter:
|
||||
metas = DocumentService.get_meta_by_kbs(dialog.kb_ids)
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(dialog.kb_ids)
|
||||
attachments = await apply_meta_data_filter(
|
||||
dialog.meta_data_filter,
|
||||
metas,
|
||||
@ -1048,7 +1048,7 @@ async def async_ask(question, kb_ids, tenant_id, chat_llm_name=None, search_conf
|
||||
tenant_ids = list(set([kb.tenant_id for kb in kbs]))
|
||||
|
||||
if meta_data_filter:
|
||||
metas = DocumentService.get_meta_by_kbs(kb_ids)
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
|
||||
doc_ids = await apply_meta_data_filter(meta_data_filter, metas, question, chat_mdl, doc_ids)
|
||||
|
||||
kbinfos = await retriever.retrieval(
|
||||
@ -1124,7 +1124,7 @@ async def gen_mindmap(question, kb_ids, tenant_id, search_config={}):
|
||||
rerank_mdl = LLMBundle(tenant_id, LLMType.RERANK, rerank_id)
|
||||
|
||||
if meta_data_filter:
|
||||
metas = DocumentService.get_meta_by_kbs(kb_ids)
|
||||
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
|
||||
doc_ids = await apply_meta_data_filter(meta_data_filter, metas, question, chat_mdl, doc_ids)
|
||||
|
||||
ranks = await settings.retriever.retrieval(
|
||||
|
||||
1073
api/db/services/doc_metadata_service.py
Normal file
1073
api/db/services/doc_metadata_service.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -33,7 +33,7 @@ from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTena
|
||||
from api.db.db_utils import bulk_insert_into_db
|
||||
from api.db.services.common_service import CommonService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from common.metadata_utils import dedupe_list
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from common.misc_utils import get_uuid
|
||||
from common.time_utils import current_timestamp, get_format_time
|
||||
from common.constants import LLMType, ParserType, StatusEnum, TaskStatus, SVR_CONSUMER_GROUP_NAME
|
||||
@ -67,7 +67,6 @@ class DocumentService(CommonService):
|
||||
cls.model.progress_msg,
|
||||
cls.model.process_begin_at,
|
||||
cls.model.process_duration,
|
||||
cls.model.meta_fields,
|
||||
cls.model.suffix,
|
||||
cls.model.run,
|
||||
cls.model.status,
|
||||
@ -154,8 +153,11 @@ class DocumentService(CommonService):
|
||||
docs = docs.where(cls.model.type.in_(types))
|
||||
if suffix:
|
||||
docs = docs.where(cls.model.suffix.in_(suffix))
|
||||
if return_empty_metadata:
|
||||
docs = docs.where(fn.COALESCE(fn.JSON_LENGTH(cls.model.meta_fields), 0) == 0)
|
||||
|
||||
metadata_map = DocMetadataService.get_metadata_for_documents(None, kb_id)
|
||||
doc_ids_with_metadata = set(metadata_map.keys())
|
||||
if return_empty_metadata and doc_ids_with_metadata:
|
||||
docs = docs.where(cls.model.id.not_in(doc_ids_with_metadata))
|
||||
|
||||
count = docs.count()
|
||||
if desc:
|
||||
@ -166,7 +168,14 @@ class DocumentService(CommonService):
|
||||
if page_number and items_per_page:
|
||||
docs = docs.paginate(page_number, items_per_page)
|
||||
|
||||
return list(docs.dicts()), count
|
||||
docs_list = list(docs.dicts())
|
||||
if return_empty_metadata:
|
||||
for doc in docs_list:
|
||||
doc["meta_fields"] = {}
|
||||
else:
|
||||
for doc in docs_list:
|
||||
doc["meta_fields"] = metadata_map.get(doc["id"], {})
|
||||
return docs_list, count
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
@ -212,7 +221,7 @@ class DocumentService(CommonService):
|
||||
if suffix:
|
||||
query = query.where(cls.model.suffix.in_(suffix))
|
||||
|
||||
rows = query.select(cls.model.run, cls.model.suffix, cls.model.meta_fields)
|
||||
rows = query.select(cls.model.run, cls.model.suffix, cls.model.id)
|
||||
total = rows.count()
|
||||
|
||||
suffix_counter = {}
|
||||
@ -220,10 +229,18 @@ class DocumentService(CommonService):
|
||||
metadata_counter = {}
|
||||
empty_metadata_count = 0
|
||||
|
||||
doc_ids = [row.id for row in rows]
|
||||
metadata = {}
|
||||
if doc_ids:
|
||||
try:
|
||||
metadata = DocMetadataService.get_metadata_for_documents(doc_ids, kb_id)
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to fetch metadata from ES/Infinity: {e}")
|
||||
|
||||
for row in rows:
|
||||
suffix_counter[row.suffix] = suffix_counter.get(row.suffix, 0) + 1
|
||||
run_status_counter[str(row.run)] = run_status_counter.get(str(row.run), 0) + 1
|
||||
meta_fields = row.meta_fields or {}
|
||||
meta_fields = metadata.get(row.id, {})
|
||||
if not meta_fields:
|
||||
empty_metadata_count += 1
|
||||
continue
|
||||
@ -374,6 +391,12 @@ class DocumentService(CommonService):
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to delete chunks from doc store for document {doc.id}: {e}")
|
||||
|
||||
# Delete document metadata (non-critical, log and continue)
|
||||
try:
|
||||
DocMetadataService.delete_document_metadata(doc.id)
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to delete metadata for document {doc.id}: {e}")
|
||||
|
||||
# Cleanup knowledge graph references (non-critical, log and continue)
|
||||
try:
|
||||
graph_source = settings.docStoreConn.get_fields(
|
||||
@ -707,246 +730,6 @@ class DocumentService(CommonService):
|
||||
|
||||
cls.update_by_id(doc_id, info)
|
||||
|
||||
@classmethod
@DB.connection_context()
def update_meta_fields(cls, doc_id, meta_fields):
    """Replace the document's ``meta_fields`` column with *meta_fields*.

    Thin convenience wrapper over ``update_by_id`` kept for API
    compatibility; returns whatever ``update_by_id`` returns.
    """
    payload = {"meta_fields": meta_fields}
    return cls.update_by_id(doc_id, payload)
|
||||
|
||||
@classmethod
@DB.connection_context()
def get_meta_by_kbs(cls, kb_ids):
    """
    Legacy metadata aggregator (backward-compatible).

    Builds ``{meta_key: {meta_value: [doc_id, ...]}}`` over all documents
    whose ``kb_id`` is in *kb_ids*.

    - List values are iterated element by element; each hashable element
      becomes its own key. Unhashable elements (nested list/dict) are
      skipped.
    - ``meta_fields`` rows that are NULL or not a dict are skipped.

    Prefer ``get_flatted_meta_by_kbs`` for metadata_condition filtering;
    keep this only for callers relying on the old raw-value keys.
    """
    fields = [
        cls.model.id,
        cls.model.meta_fields,
    ]
    meta = {}
    for r in cls.model.select(*fields).where(cls.model.kb_id.in_(kb_ids)):
        doc_id = r.id
        # Guard: column may be NULL; the previous version crashed with
        # AttributeError on ``None.items()``.
        meta_fields = r.meta_fields or {}
        if not isinstance(meta_fields, dict):
            continue
        for k, v in meta_fields.items():
            bucket = meta.setdefault(k, {})
            values = v if isinstance(v, list) else [v]
            for vv in values:
                # Skip unhashable values BEFORE the dict lookup. The
                # previous version evaluated ``vv not in bucket`` first,
                # which raises TypeError for list/dict values instead of
                # skipping them as intended.
                if isinstance(vv, (list, dict)):
                    continue
                bucket.setdefault(vv, []).append(doc_id)
    return meta
|
||||
|
||||
@classmethod
@DB.connection_context()
def get_flatted_meta_by_kbs(cls, kb_ids):
    """
    Aggregate document metadata across knowledge bases.

    Returns ``{meta_key: {str(value): [doc_id, ...]}}``.

    - Stringified JSON ``meta_fields`` are parsed when possible;
      unparsable or non-dict values are skipped.
    - List values are expanded into individual entries, e.g.
      ``{"tags": ["foo","bar"], "author": "alice"}`` yields
      ``meta["tags"]["foo"] = [doc_id]``, ``meta["tags"]["bar"] = [doc_id]``,
      ``meta["author"]["alice"] = [doc_id]``.
    - ``None`` values are dropped; every other value is keyed by ``str()``.

    Prefer this for metadata_condition filtering and anywhere list
    semantics must be respected.
    """
    columns = [
        cls.model.id,
        cls.model.meta_fields,
    ]
    aggregated = {}
    query = cls.model.select(*columns).where(cls.model.kb_id.in_(kb_ids))
    for record in query:
        raw = record.meta_fields or {}
        if isinstance(raw, str):
            # Tolerate stringified JSON; skip rows that cannot be parsed.
            try:
                raw = json.loads(raw)
            except Exception:
                continue
        if not isinstance(raw, dict):
            continue
        for key, value in raw.items():
            bucket = aggregated.setdefault(key, {})
            items = value if isinstance(value, list) else [value]
            for item in items:
                if item is None:
                    continue
                bucket.setdefault(str(item), []).append(record.id)
    return aggregated
|
||||
|
||||
@classmethod
@DB.connection_context()
def get_metadata_summary(cls, kb_id, document_ids=None):
    """Summarize metadata values for one knowledge base.

    Returns ``{key: {"type": t, "values": [(value, count), ...]}}`` where
    ``t`` is the most frequent inferred type among ``"string"``,
    ``"number"``, ``"time"`` and ``"list"``, and ``values`` is sorted by
    occurrence count, descending. Falsy values (``0``, ``""``, ``False``)
    are counted for type inference but excluded from ``values``.
    """

    def _classify(value):
        # Ordering matters: bool is a subclass of int, so test it before
        # the numeric branch; it is reported as "string".
        if value is None:
            return None
        if isinstance(value, list):
            return "list"
        if isinstance(value, bool):
            return "string"
        if isinstance(value, (int, float)):
            return "number"
        if re.match(r"\d{4}\-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}", str(value)):
            return "time"
        return "string"

    value_counts = {}
    kind_counts = {}
    query = cls.model.select(cls.model.id, cls.model.meta_fields).where(cls.model.kb_id == kb_id)
    if document_ids:
        query = query.where(cls.model.id.in_(document_ids))
    for record in query:
        fields = record.meta_fields or {}
        if isinstance(fields, str):
            # Tolerate stringified JSON; skip unparsable rows.
            try:
                fields = json.loads(fields)
            except Exception:
                continue
        if not isinstance(fields, dict):
            continue
        for key, value in fields.items():
            kind = _classify(value)
            if kind:
                per_key = kind_counts.setdefault(key, {})
                per_key[kind] = per_key.get(kind, 0) + 1
            items = value if isinstance(value, list) else [value]
            for item in items:
                if not item:
                    continue
                text = str(item)
                counter = value_counts.setdefault(key, {})
                counter[text] = counter.get(text, 0) + 1

    result = {}
    for key, counter in value_counts.items():
        # Stable sort keeps first-seen order among equal counts.
        ordered = sorted(counter.items(), key=lambda kv: kv[1], reverse=True)
        kinds = kind_counts.get(key, {})
        # max() over keys returns the first key with the highest count,
        # matching the original tie-breaking.
        dominant = max(kinds, key=kinds.get) if kinds else "string"
        result[key] = {"type": dominant, "values": ordered}
    return result
|
||||
|
||||
@classmethod
@DB.connection_context()
def batch_update_metadata(cls, kb_id, doc_ids, updates=None, deletes=None, adds=None):
    """Apply metadata updates/deletes to a batch of documents in one transaction.

    Args:
        kb_id: knowledge base id. NOTE(review): not used in this body —
            presumably kept for interface symmetry; confirm against callers.
        doc_ids: ids of documents to touch; returns 0 immediately when empty.
        updates: list of ``{"key": ..., "value": ..., "match": ...}`` dicts.
        deletes: list of ``{"key": ..., "value": ...}`` dicts.
        adds: NOTE(review): accepted but never applied in this body —
            confirm whether this is an unfinished feature or dead parameter.

    Returns:
        Number of documents whose metadata actually changed and was written.
    """
    updates = updates or []
    deletes = deletes or []
    if not doc_ids:
        return 0

    def _normalize_meta(meta):
        # Coerce a stored value into a plain dict; stringified JSON is
        # parsed, anything else non-dict collapses to {}. A deep copy is
        # returned so helpers can mutate freely.
        if isinstance(meta, str):
            try:
                meta = json.loads(meta)
            except Exception:
                return {}
        if not isinstance(meta, dict):
            return {}
        return deepcopy(meta)

    def _str_equal(a, b):
        # Values are compared by string form so e.g. 1 matches "1".
        return str(a) == str(b)

    def _apply_updates(meta):
        # Apply each update spec in order; returns True if anything changed.
        changed = False
        for upd in updates:
            key = upd.get("key")
            if not key:
                continue

            new_value = upd.get("value")
            # NOTE(review): truthiness test — a falsy "match" ("" or 0) is
            # treated the same as no match given; confirm that is intended.
            match_provided = upd.get("match")
            if key not in meta:
                # Matched updates only replace existing values; without a
                # match, a missing key is created.
                if match_provided:
                    continue
                meta[key] = dedupe_list(new_value) if isinstance(new_value, list) else new_value
                changed = True
                continue

            if isinstance(meta[key], list):
                if not match_provided:
                    # No match: a list value replaces wholesale (deduped),
                    # a scalar is appended.
                    if isinstance(new_value, list):
                        meta[key] = dedupe_list(new_value)
                    else:
                        meta[key].append(new_value)
                    changed = True
                else:
                    # With a match: replace every matching element, then
                    # dedupe the resulting list.
                    match_value = upd.get("match")
                    replaced = False
                    new_list = []
                    for item in meta[key]:
                        if _str_equal(item, match_value):
                            new_list.append(new_value)
                            replaced = True
                        else:
                            new_list.append(item)
                    if replaced:
                        meta[key] = dedupe_list(new_list)
                        changed = True
            else:
                if not match_provided:
                    # Scalar value, no match: unconditional overwrite.
                    meta[key] = new_value
                    changed = True
                else:
                    # Scalar value with match: overwrite only on match.
                    match_value = upd.get("match")
                    if _str_equal(meta[key], match_value):
                        meta[key] = new_value
                        changed = True
        return changed

    def _apply_deletes(meta):
        # Apply each delete spec in order; returns True if anything changed.
        changed = False
        for d in deletes:
            key = d.get("key")
            if not key or key not in meta:
                continue
            value = d.get("value", None)
            if isinstance(meta[key], list):
                if value is None:
                    # No value: drop the whole key.
                    del meta[key]
                    changed = True
                    continue
                # With a value: remove matching elements; drop the key if
                # the list becomes empty.
                new_list = [item for item in meta[key] if not _str_equal(item, value)]
                if len(new_list) != len(meta[key]):
                    if new_list:
                        meta[key] = new_list
                    else:
                        del meta[key]
                    changed = True
            else:
                # Scalar: delete when no value given or when it matches.
                if value is None or _str_equal(meta[key], value):
                    del meta[key]
                    changed = True
        return changed

    updated_docs = 0
    # All per-document writes happen inside one transaction.
    with DB.atomic():
        rows = cls.model.select(cls.model.id, cls.model.meta_fields).where(
            cls.model.id.in_(doc_ids)
        )
        for r in rows:
            meta = _normalize_meta(r.meta_fields or {})
            original_meta = deepcopy(meta)
            # Updates run before deletes; a delete can undo an update
            # applied in the same call.
            changed = _apply_updates(meta)
            changed = _apply_deletes(meta) or changed
            # Double-check equality so no-op rewrites (e.g. replace with
            # an identical value) don't bump timestamps.
            if changed and meta != original_meta:
                cls.model.update(
                    meta_fields=meta,
                    update_time=current_timestamp(),
                    update_date=get_format_time()
                ).where(cls.model.id == r.id).execute()
                updated_docs += 1
    return updated_docs
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def update_progress(cls):
|
||||
|
||||
Reference in New Issue
Block a user