From c987d336493edf888e47c61907ac80b36f99a032 Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Wed, 24 Dec 2025 09:32:55 +0800 Subject: [PATCH] Feat: deduplicate metadata lists during updates (#12125) ### What problem does this PR solve? Deduplicate metadata lists during updates. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- api/db/services/document_service.py | 8 ++++++-- common/metadata_utils.py | 17 ++++++++++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 2f9e143f9..81bd160e3 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -33,6 +33,7 @@ from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTena from api.db.db_utils import bulk_insert_into_db from api.db.services.common_service import CommonService from api.db.services.knowledgebase_service import KnowledgebaseService +from common.metadata_utils import dedupe_list from common.misc_utils import get_uuid from common.time_utils import current_timestamp, get_format_time from common.constants import LLMType, ParserType, StatusEnum, TaskStatus, SVR_CONSUMER_GROUP_NAME @@ -799,7 +800,10 @@ class DocumentService(CommonService): match_provided = "match" in upd if isinstance(meta[key], list): if not match_provided: - meta[key] = new_value + if isinstance(new_value, list): + meta[key] = dedupe_list(new_value) + else: + meta[key] = new_value changed = True else: match_value = upd.get("match") @@ -812,7 +816,7 @@ class DocumentService(CommonService): else: new_list.append(item) if replaced: - meta[key] = new_list + meta[key] = dedupe_list(new_list) changed = True else: if not match_provided: diff --git a/common/metadata_utils.py b/common/metadata_utils.py index 4d4cceb4a..d7bb3c818 100644 --- a/common/metadata_utils.py +++ b/common/metadata_utils.py @@ -151,6 +151,18 @@ async def apply_meta_data_filter( return doc_ids +def dedupe_list(values: list) -> list: + seen = set() + deduped = [] + for item in values: + key = str(item) + if key in seen: + continue + seen.add(key) + deduped.append(item) + return deduped + + def update_metadata_to(metadata, meta): if not meta: return metadata @@ -162,11 +174,13 @@ def update_metadata_to(metadata, meta): return metadata if not isinstance(meta, dict): return metadata + for k, v in meta.items(): if isinstance(v, list): v = [vv for vv in v if isinstance(vv, str)] if not v: continue + v = dedupe_list(v) if not isinstance(v, list) and not isinstance(v, str): continue if k not in metadata: @@ -177,6 +191,7 @@ def update_metadata_to(metadata, meta): metadata[k].extend(v) else: metadata[k].append(v) + metadata[k] = dedupe_list(metadata[k]) else: metadata[k] = v @@ -208,4 +223,4 @@ def metadata_schema(metadata: list|None) -> Dict[str, Any]: } json_schema["additionalProperties"] = False - return json_schema \ No newline at end of file + return json_schema