Mirror of https://github.com/infiniflow/ragflow.git · synced 2025-12-08 20:42:30 +08:00
Replaced md5 with xxhash64 for chunk id (#4009)
### What problem does this PR solve?

Replaced md5 with xxhash64 for chunk id.

### Type of change

- [x] Refactoring
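In both touched handlers the chunk id is derived by hashing the chunk content concatenated with the document id; only the hash function changes. A minimal side-by-side sketch of the before/after (function names are illustrative, not from the PR; assumes the pip-installable `xxhash` package):

```python
import hashlib

import xxhash


def chunk_id_md5(content: str, doc_id: str) -> str:
    # Old scheme: MD5 over content + doc_id -> 128-bit digest, 32 hex chars.
    md5 = hashlib.md5()
    md5.update((content + doc_id).encode("utf-8"))
    return md5.hexdigest()


def chunk_id_xxh64(content: str, doc_id: str) -> str:
    # New scheme: xxHash64 over the same bytes -> 64-bit digest, 16 hex chars.
    # xxHash is a fast non-cryptographic hash; the id only needs to be
    # deterministic and well-distributed, not tamper-resistant.
    return xxhash.xxh64((content + doc_id).encode("utf-8")).hexdigest()


print(len(chunk_id_md5("some chunk text", "doc_42")))    # 32
print(len(chunk_id_xxh64("some chunk text", "doc_42")))  # 16
```

One side effect worth noting: the new ids are 16 hex characters instead of 32, so anything that assumed md5-length chunk ids would see a format change.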
```diff
@@ -31,7 +31,7 @@ from api.utils.api_utils import server_error_response, get_data_error_result, va
 from api.db.services.document_service import DocumentService
 from api import settings
 from api.utils.api_utils import get_json_result
-import hashlib
+import xxhash
 import re
```
```diff
@@ -208,9 +208,7 @@ def rm():
 @validate_request("doc_id", "content_with_weight")
 def create():
     req = request.json
-    md5 = hashlib.md5()
-    md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
-    chunck_id = md5.hexdigest()
+    chunck_id = xxhash.xxh64((req["content_with_weight"] + req["doc_id"]).encode("utf-8")).hexdigest()
     d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
          "content_with_weight": req["content_with_weight"]}
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
```
```diff
@@ -22,7 +22,7 @@ from rag.nlp import rag_tokenizer
 from api.db import LLMType, ParserType
 from api.db.services.llm_service import TenantLLMService
 from api import settings
-import hashlib
+import xxhash
 import re
 from api.utils.api_utils import token_required
 from api.db.db_models import Task
```
```diff
@@ -984,10 +984,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
         return get_error_data_result(
             "`questions` is required to be a list"
         )
-    md5 = hashlib.md5()
-    md5.update((req["content"] + document_id).encode("utf-8"))
-
-    chunk_id = md5.hexdigest()
+    chunk_id = xxhash.xxh64((req["content"] + document_id).encode("utf-8")).hexdigest()
     d = {
         "id": chunk_id,
         "content_ltks": rag_tokenizer.tokenize(req["content"]),
```