From b40a7b2e7d7a9500a3993c6bcbe344ba29bbb8f4 Mon Sep 17 00:00:00 2001
From: Magicbook1108
Date: Thu, 15 Jan 2026 14:02:15 +0800
Subject: [PATCH] Feat: Hash doc id to avoid duplicate name. (#12573)

### What problem does this PR solve?

Feat: Hash doc id to avoid duplicate name. Each document pulled from a data source now carries a stable id (the xxh128 hash of its source id, 32 hex characters), so re-syncing the same source document updates the existing record and blob in place instead of uploading a second copy under a deduplicated name.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 api/db/services/connector_service.py |  4 ++--
 api/db/services/file_service.py      | 10 +++++++++-
 api/utils/common.py                  |  5 +++++
 rag/svr/sync_data_source.py          |  3 ++-
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/api/db/services/connector_service.py b/api/db/services/connector_service.py
index 660530c82..0a7b5cb71 100644
--- a/api/db/services/connector_service.py
+++ b/api/db/services/connector_service.py
@@ -29,7 +29,6 @@ from common.misc_utils import get_uuid
 from common.constants import TaskStatus
 from common.time_utils import current_timestamp, timestamp_to_date
 
-
 class ConnectorService(CommonService):
     model = Connector
 
@@ -202,6 +201,7 @@ class SyncLogsService(CommonService):
             return None
 
         class FileObj(BaseModel):
+            id: str
             filename: str
             blob: bytes
 
@@ -209,7 +209,7 @@
                 return self.blob
 
         errs = []
-        files = [FileObj(filename=d["semantic_identifier"]+(f"{d['extension']}" if d["semantic_identifier"][::-1].find(d['extension'][::-1])<0 else ""), blob=d["blob"]) for d in docs]
+        files = [FileObj(id=d["id"], filename=d["semantic_identifier"]+(f"{d['extension']}" if d["semantic_identifier"][::-1].find(d['extension'][::-1])<0 else ""), blob=d["blob"]) for d in docs]
         doc_ids = []
         err, doc_blob_pairs = FileService.upload_document(kb, files, tenant_id, src)
         errs.extend(err)
diff --git a/api/db/services/file_service.py b/api/db/services/file_service.py
index d6a157b2d..eba59a3cf 100644
--- a/api/db/services/file_service.py
+++ b/api/db/services/file_service.py
@@ -439,6 +439,15 @@
 
         err, files = [], []
         for file in file_objs:
+            doc_id = file.id if hasattr(file, "id") else get_uuid()
+            e, doc = DocumentService.get_by_id(doc_id)
+            if e:
+                blob = file.read()
+                settings.STORAGE_IMPL.put(kb.id, doc.location, blob, kb.tenant_id)
+                doc.size = len(blob)
+                doc = doc.to_dict()
+                DocumentService.update_by_id(doc["id"], doc)
+                continue
             try:
                 DocumentService.check_doc_health(kb.tenant_id, file.filename)
                 filename = duplicate_name(DocumentService.query, name=file.filename, kb_id=kb.id)
@@ -455,7 +464,6 @@
                     blob = read_potential_broken_pdf(blob)
                 settings.STORAGE_IMPL.put(kb.id, location, blob)
 
-                doc_id = get_uuid()
 
                 img = thumbnail_img(filename, blob)
                 thumbnail_location = ""
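To make the `upload_document` change above concrete, here is a minimal, self-contained sketch of the upsert branch it now takes when a `FileObj` arrives with a precomputed id: if a document with that id already exists, the stored blob is overwritten and the size refreshed, and the duplicate-name path is skipped. The in-memory `documents`/`storage` dicts and the `upload` helper below are illustrative stand-ins, not RAGFlow APIs.

```python
from dataclasses import dataclass

# Illustrative stand-ins for DocumentService and STORAGE_IMPL (not RAGFlow APIs).
documents: dict[str, dict] = {}   # doc id -> document metadata row
storage: dict[str, bytes] = {}    # storage location -> blob


@dataclass
class FileObj:
    id: str            # stable id supplied by the connector (hash of the source id)
    filename: str
    blob: bytes

    def read(self) -> bytes:
        return self.blob


def upload(file: FileObj) -> str:
    """Create the document on first sync; overwrite blob and size on re-sync."""
    existing = documents.get(file.id)
    if existing:
        blob = file.read()
        storage[existing["location"]] = blob   # replace the stored blob in place
        existing["size"] = len(blob)           # refresh metadata, keep the same id and name
        return "updated"
    documents[file.id] = {"location": file.filename, "size": len(file.blob)}
    storage[file.filename] = file.blob
    return "created"


if __name__ == "__main__":
    f = FileObj(id="0123456789abcdef0123456789abcdef", filename="report.pdf", blob=b"v1")
    print(upload(f))        # created
    f.blob = b"v2, same source document re-synced"
    print(upload(f))        # updated; no second "report(1).pdf" is created
```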
diff --git a/api/utils/common.py b/api/utils/common.py
index 958cf20ff..4d38c40d2 100644
--- a/api/utils/common.py
+++ b/api/utils/common.py
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import xxhash
+
 
 def string_to_bytes(string):
     return string if isinstance(
@@ -22,3 +24,6 @@ def string_to_bytes(string):
 
 def bytes_to_string(byte):
     return byte.decode(encoding="utf-8")
+# 128 bit = 32 character
+def hash128(data: str) -> str:
+    return xxhash.xxh128(data).hexdigest()
diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py
index 764bee830..aae977891 100644
--- a/rag/svr/sync_data_source.py
+++ b/rag/svr/sync_data_source.py
@@ -36,6 +36,7 @@ from typing import Any
 
 from flask import json
 
+from api.utils.common import hash128
 from api.db.services.connector_service import ConnectorService, SyncLogsService
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from common import settings
@@ -126,7 +127,7 @@ class SyncBase:
         docs = []
         for doc in document_batch:
             d = {
-                "id": doc.id,
+                "id": hash128(doc.id),
                 "connector_id": task["connector_id"],
                 "source": self.SOURCE_NAME,
                 "semantic_identifier": doc.semantic_identifier,
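For reference, a quick sketch of why hashing the source id avoids the duplicate-name path, assuming the `xxhash` package is installed: `xxh128` is deterministic, so the same connector document id always maps to the same 32-character hex digest (as the new comment in `api/utils/common.py` notes, 128 bits render as 32 hex characters). The sample source id below is made up.

```python
# Sketch: deterministic 128-bit ids for connector documents.
# Requires the xxhash package (pip install xxhash); the sample source id is made up.
import xxhash


def hash128(data: str) -> str:
    # Same one-liner as the helper added to api/utils/common.py.
    return xxhash.xxh128(data).hexdigest()


source_doc_id = "notion-page-8a1c2f"        # hypothetical id coming from a data source
first_sync = hash128(source_doc_id)
second_sync = hash128(source_doc_id)

assert first_sync == second_sync            # re-syncing maps to the same document id
assert len(first_sync) == 32                # 128 bits -> 32 hex characters
print(first_sync)
```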