Feat: Hash doc id to avoid duplicate name. (#12573)
### What problem does this PR solve?

Feat: Hash doc id to avoid duplicate name.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
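In short, the commit replaces the per-upload random UUID with a deterministic 128-bit hash of the connector-side document id, so re-syncing the same source document resolves to the existing record instead of creating another one under a `duplicate_name`-suffixed filename. A minimal sketch of the idea (the source id below is made up; the hash call mirrors the `hash128` helper added in this commit):

```python
import uuid

import xxhash

source_doc_id = "confluence:page:12345"  # hypothetical id assigned by the upstream connector

# Before: every sync run minted a fresh UUID, so the same upstream document
# could be ingested again and again as separate, suffixed records.
old_style_id = uuid.uuid4().hex  # different on every run

# After: the doc id is a deterministic hash of the source id, so the same
# upstream document always maps to the same RAGFlow document.
new_style_id = xxhash.xxh128(source_doc_id).hexdigest()
assert new_style_id == xxhash.xxh128(source_doc_id).hexdigest()
```

The hunks below wire this through the connector service (`FileObj` gains an `id` field), `FileService.upload_document` (update in place when the id already exists), a new `hash128` helper, and the `SyncBase` document loop.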
@@ -29,7 +29,6 @@ from common.misc_utils import get_uuid
 from common.constants import TaskStatus
 from common.time_utils import current_timestamp, timestamp_to_date
 
 
 class ConnectorService(CommonService):
     model = Connector
-
@@ -202,6 +201,7 @@ class SyncLogsService(CommonService):
             return None
 
         class FileObj(BaseModel):
+            id: str
             filename: str
             blob: bytes
 
@@ -209,7 +209,7 @@ class SyncLogsService(CommonService):
                 return self.blob
 
         errs = []
-        files = [FileObj(filename=d["semantic_identifier"]+(f"{d['extension']}" if d["semantic_identifier"][::-1].find(d['extension'][::-1])<0 else ""), blob=d["blob"]) for d in docs]
+        files = [FileObj(id=d["id"], filename=d["semantic_identifier"]+(f"{d['extension']}" if d["semantic_identifier"][::-1].find(d['extension'][::-1])<0 else ""), blob=d["blob"]) for d in docs]
         doc_ids = []
         err, doc_blob_pairs = FileService.upload_document(kb, files, tenant_id, src)
         errs.extend(err)
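One non-obvious detail in the `files = [...]` comprehension above: since the `< 0` test only checks whether the reversed extension occurs at all, reversing both strings changes nothing, and the expression simply appends the extension when it does not already appear in the semantic identifier. A standalone illustration with made-up values (`build_filename` exists only for this sketch):

```python
def build_filename(semantic_identifier: str, extension: str) -> str:
    # Same expression as the comprehension uses; reversing both operands has no
    # effect on a containment check, so this is equivalent to
    # `extension not in semantic_identifier`.
    needs_ext = semantic_identifier[::-1].find(extension[::-1]) < 0
    return semantic_identifier + (f"{extension}" if needs_ext else "")


print(build_filename("2024-roadmap.pdf", ".pdf"))  # 2024-roadmap.pdf (unchanged)
print(build_filename("2024-roadmap", ".pdf"))      # 2024-roadmap.pdf (extension appended)
```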
@@ -439,6 +439,15 @@ class FileService(CommonService):
 
         err, files = [], []
         for file in file_objs:
+            doc_id = file.id if hasattr(file, "id") else get_uuid()
+            e, doc = DocumentService.get_by_id(doc_id)
+            if e:
+                blob = file.read()
+                settings.STORAGE_IMPL.put(kb.id, doc.location, blob, kb.tenant_id)
+                doc.size = len(blob)
+                doc = doc.to_dict()
+                DocumentService.update_by_id(doc["id"], doc)
+                continue
             try:
                 DocumentService.check_doc_health(kb.tenant_id, file.filename)
                 filename = duplicate_name(DocumentService.query, name=file.filename, kb_id=kb.id)
@@ -455,7 +464,6 @@ class FileService(CommonService):
                 blob = read_potential_broken_pdf(blob)
                 settings.STORAGE_IMPL.put(kb.id, location, blob)
 
-                doc_id = get_uuid()
 
                 img = thumbnail_img(filename, blob)
                 thumbnail_location = ""
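The added branch in `upload_document` is where deduplication actually happens: when the incoming `FileObj` carries an id and `DocumentService.get_by_id` finds a matching document, the stored blob is overwritten, the size refreshed, and the loop skips the create-new-document path (where the now-removed `get_uuid()` call used to live). A condensed, self-contained sketch of that control flow, with the document table and blob storage replaced by in-memory stand-ins (purely illustrative, not the actual services):

```python
import uuid
from dataclasses import dataclass, field


@dataclass
class FakeKB:
    """In-memory stand-in for the document table and blob storage."""
    docs: dict = field(default_factory=dict)    # doc_id -> {"location": ..., "size": ...}
    blobs: dict = field(default_factory=dict)   # location -> bytes


def upload(kb: FakeKB, doc_id: str | None, filename: str, blob: bytes) -> str:
    # Mirrors the new branch: if the caller supplied an id (the hashed source id)
    # and the document already exists, update it in place and stop.
    doc_id = doc_id or uuid.uuid4().hex
    existing = kb.docs.get(doc_id)
    if existing:
        kb.blobs[existing["location"]] = blob   # overwrite the stored blob
        existing["size"] = len(blob)            # refresh metadata
        return doc_id
    # Otherwise fall through to the original create-new-document path.
    kb.docs[doc_id] = {"location": filename, "size": len(blob)}
    kb.blobs[filename] = blob
    return doc_id


kb = FakeKB()
first = upload(kb, "0123456789abcdef0123456789abcdef", "notes.pdf", b"v1")
second = upload(kb, "0123456789abcdef0123456789abcdef", "notes.pdf", b"version 2")
assert first == second and len(kb.docs) == 1 and kb.docs[first]["size"] == 9
```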
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import xxhash
+
 
 def string_to_bytes(string):
     return string if isinstance(
@@ -22,3 +24,6 @@ def string_to_bytes(string):
 def bytes_to_string(byte):
     return byte.decode(encoding="utf-8")
 
+# 128 bit = 32 character
+def hash128(data: str) -> str:
+    return xxhash.xxh128(data).hexdigest()
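As the `# 128 bit = 32 character` comment hints, the helper's output has the same 32-character hex width as the UUID hex ids that `get_uuid` produces, which is presumably why the hash can stand in for a document id directly. A quick check (the input string is just an example):

```python
import xxhash

digest = xxhash.xxh128("jira:ISSUE-4821").hexdigest()  # hypothetical connector-side doc id
assert len(digest) == 32
```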
@@ -36,6 +36,7 @@ from typing import Any
 
 from flask import json
 
+from api.utils.common import hash128
 from api.db.services.connector_service import ConnectorService, SyncLogsService
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from common import settings
@@ -126,7 +127,7 @@ class SyncBase:
         docs = []
         for doc in document_batch:
             d = {
-                "id": doc.id,
+                "id": hash128(doc.id),
                 "connector_id": task["connector_id"],
                 "source": self.SOURCE_NAME,
                 "semantic_identifier": doc.semantic_identifier,