Replaced md5 with xxhash64 for chunk id (#4009)

### What problem does this PR solve?

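Replaced MD5 with xxHash64 for computing chunk IDs: the ID is now the `xxhash.xxh64` hex digest of the chunk content concatenated with the document ID, instead of the `hashlib.md5` hex digest of the same input.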

### Type of change

- [x] Refactoring
This commit is contained in:
Zhichang Yu
2024-12-12 17:47:39 +08:00
committed by GitHub
parent 301f95837c
commit c8b1a564aa
5 changed files with 17 additions and 34 deletions

View File

@ -31,7 +31,7 @@ from api.utils.api_utils import server_error_response, get_data_error_result, va
from api.db.services.document_service import DocumentService
from api import settings
from api.utils.api_utils import get_json_result
import hashlib
import xxhash
import re
@ -208,9 +208,7 @@ def rm():
@validate_request("doc_id", "content_with_weight")
def create():
req = request.json
md5 = hashlib.md5()
md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
chunck_id = md5.hexdigest()
chunck_id = xxhash.xxh64((req["content_with_weight"] + req["doc_id"]).encode("utf-8")).hexdigest()
d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
"content_with_weight": req["content_with_weight"]}
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])