mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-26 17:16:52 +08:00
Feat: deduplicate metadata lists during updates (#12125)
### What problem does this PR solve?

Deduplicate metadata lists during updates.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -33,6 +33,7 @@ from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTena
|
|||||||
from api.db.db_utils import bulk_insert_into_db
|
from api.db.db_utils import bulk_insert_into_db
|
||||||
from api.db.services.common_service import CommonService
|
from api.db.services.common_service import CommonService
|
||||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||||
|
from common.metadata_utils import dedupe_list
|
||||||
from common.misc_utils import get_uuid
|
from common.misc_utils import get_uuid
|
||||||
from common.time_utils import current_timestamp, get_format_time
|
from common.time_utils import current_timestamp, get_format_time
|
||||||
from common.constants import LLMType, ParserType, StatusEnum, TaskStatus, SVR_CONSUMER_GROUP_NAME
|
from common.constants import LLMType, ParserType, StatusEnum, TaskStatus, SVR_CONSUMER_GROUP_NAME
|
||||||
@ -799,7 +800,10 @@ class DocumentService(CommonService):
|
|||||||
match_provided = "match" in upd
|
match_provided = "match" in upd
|
||||||
if isinstance(meta[key], list):
|
if isinstance(meta[key], list):
|
||||||
if not match_provided:
|
if not match_provided:
|
||||||
meta[key] = new_value
|
if isinstance(new_value, list):
|
||||||
|
meta[key] = dedupe_list(new_value)
|
||||||
|
else:
|
||||||
|
meta[key] = new_value
|
||||||
changed = True
|
changed = True
|
||||||
else:
|
else:
|
||||||
match_value = upd.get("match")
|
match_value = upd.get("match")
|
||||||
@ -812,7 +816,7 @@ class DocumentService(CommonService):
|
|||||||
else:
|
else:
|
||||||
new_list.append(item)
|
new_list.append(item)
|
||||||
if replaced:
|
if replaced:
|
||||||
meta[key] = new_list
|
meta[key] = dedupe_list(new_list)
|
||||||
changed = True
|
changed = True
|
||||||
else:
|
else:
|
||||||
if not match_provided:
|
if not match_provided:
|
||||||
|
|||||||
@ -151,6 +151,18 @@ async def apply_meta_data_filter(
|
|||||||
return doc_ids
|
return doc_ids
|
||||||
|
|
||||||
|
|
||||||
|
def dedupe_list(values: list) -> list:
    """Return a new list with duplicates removed, keeping first occurrences.

    Two items count as duplicates when their ``str()`` representations are
    equal. Comparing by string form means unhashable items (dicts, nested
    lists) are accepted, and also that values such as ``1`` and ``"1"``
    collapse to whichever appeared first.

    Args:
        values: Items to deduplicate; the input list is not modified.

    Returns:
        A fresh list holding the first occurrence of each distinct string
        form, in the original order.
    """
    # Dicts keep insertion order, and setdefault never overwrites an existing
    # entry, so each string form maps to the first item that produced it.
    first_by_text = {}
    for element in values:
        first_by_text.setdefault(str(element), element)
    return list(first_by_text.values())
|
||||||
|
|
||||||
|
|
||||||
def update_metadata_to(metadata, meta):
|
def update_metadata_to(metadata, meta):
|
||||||
if not meta:
|
if not meta:
|
||||||
return metadata
|
return metadata
|
||||||
@ -162,11 +174,13 @@ def update_metadata_to(metadata, meta):
|
|||||||
return metadata
|
return metadata
|
||||||
if not isinstance(meta, dict):
|
if not isinstance(meta, dict):
|
||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
for k, v in meta.items():
|
for k, v in meta.items():
|
||||||
if isinstance(v, list):
|
if isinstance(v, list):
|
||||||
v = [vv for vv in v if isinstance(vv, str)]
|
v = [vv for vv in v if isinstance(vv, str)]
|
||||||
if not v:
|
if not v:
|
||||||
continue
|
continue
|
||||||
|
v = dedupe_list(v)
|
||||||
if not isinstance(v, list) and not isinstance(v, str):
|
if not isinstance(v, list) and not isinstance(v, str):
|
||||||
continue
|
continue
|
||||||
if k not in metadata:
|
if k not in metadata:
|
||||||
@ -177,6 +191,7 @@ def update_metadata_to(metadata, meta):
|
|||||||
metadata[k].extend(v)
|
metadata[k].extend(v)
|
||||||
else:
|
else:
|
||||||
metadata[k].append(v)
|
metadata[k].append(v)
|
||||||
|
metadata[k] = dedupe_list(metadata[k])
|
||||||
else:
|
else:
|
||||||
metadata[k] = v
|
metadata[k] = v
|
||||||
|
|
||||||
@ -208,4 +223,4 @@ def metadata_schema(metadata: list|None) -> Dict[str, Any]:
|
|||||||
}
|
}
|
||||||
|
|
||||||
json_schema["additionalProperties"] = False
|
json_schema["additionalProperties"] = False
|
||||||
return json_schema
|
return json_schema
|
||||||
|
|||||||
Reference in New Issue
Block a user