Replaced md5 with xxhash64 for chunk id (#4009)

### What problem does this PR solve?

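Replaced MD5 with xxHash64 (`xxhash.xxh64`) for computing chunk IDs. This change also rewrites `trim_header_by_lines` and lowers the progress-message cap passed to it from 10000 to 1000 characters.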

### Type of change

- [x] Refactoring
This commit is contained in:
Zhichang Yu
2024-12-12 17:47:39 +08:00
committed by GitHub
parent 301f95837c
commit c8b1a564aa
5 changed files with 17 additions and 34 deletions

View File

@ -14,7 +14,7 @@
# limitations under the License.
#
import logging
import hashlib
import xxhash
import json
import random
import re
@ -508,10 +508,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
for ck in th.result():
d = deepcopy(doc)
d.update(ck)
md5 = hashlib.md5()
md5.update((ck["content_with_weight"] +
str(d["doc_id"])).encode("utf-8"))
d["id"] = md5.hexdigest()
d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.now().timestamp()
if not d.get("image"):

View File

@ -35,17 +35,13 @@ from api import settings
from rag.nlp import search
def trim_header_by_lines(text: str, max_length: int) -> str:
    """Drop whole leading lines from *text* until the rest fits *max_length*.

    The text is only ever cut immediately after a newline, so the result
    always starts at a line boundary. The shortest possible prefix is
    removed, i.e. as many trailing lines as fit are kept.

    Args:
        text: The (possibly multi-line) text to trim.
        max_length: Maximum number of characters the result should have.

    Returns:
        ``text`` unchanged when it already fits. Otherwise the longest
        suffix of ``text`` that starts right after a newline and has at
        most ``max_length`` characters. When no such suffix exists (for
        example the last line alone is longer than ``max_length``), the
        text is returned unchanged rather than cut in the middle of a line.
    """
    len_text = len(text)
    if len_text <= max_length:
        return text
    # Cutting after a newline at index i keeps len_text - i - 1 characters,
    # so the earliest acceptable newline is at index len_text - max_length - 1.
    # That start is >= 0 here because len_text > max_length.
    cut = text.find("\n", len_text - max_length - 1)
    if cut == -1:
        # No line boundary yields a short enough tail; keep everything.
        return text
    return text[cut + 1:]
class TaskService(CommonService):
model = Task
@ -183,7 +179,7 @@ class TaskService(CommonService):
if os.environ.get("MACOS"):
if info["progress_msg"]:
task = cls.model.get_by_id(id)
progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
if "progress" in info:
cls.model.update(progress=info["progress"]).where(
@ -194,7 +190,7 @@ class TaskService(CommonService):
with DB.lock("update_progress", -1):
if info["progress_msg"]:
task = cls.model.get_by_id(id)
progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
if "progress" in info:
cls.model.update(progress=info["progress"]).where(