Replaced md5 with xxhash64 for chunk id (#4009)

### What problem does this PR solve?

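Replaced md5 with xxhash64 for chunk id. The id is still derived from the chunk content concatenated with the document id (UTF-8 encoded); only the hash function changes, from `hashlib.md5(...).hexdigest()` to `xxhash.xxh64(...).hexdigest()`, in both `build_chunks` and `run_raptor`.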

### Type of change

- [x] Refactoring
This commit is contained in:
Zhichang Yu
2024-12-12 17:47:39 +08:00
committed by GitHub
parent 301f95837c
commit c8b1a564aa
5 changed files with 17 additions and 34 deletions

View File

@@ -27,7 +27,7 @@ import logging
import os
from datetime import datetime
import json
import hashlib
import xxhash
import copy
import re
import time
@@ -226,10 +226,7 @@ def build_chunks(task, progress_callback):
for ck in cks:
d = copy.deepcopy(doc)
d.update(ck)
md5 = hashlib.md5()
md5.update((ck["content_with_weight"] +
str(d["doc_id"])).encode("utf-8"))
d["id"] = md5.hexdigest()
d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.now().timestamp()
if not d.get("image"):
@@ -368,9 +365,7 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None):
tk_count = 0
for content, vctr in chunks[original_length:]:
d = copy.deepcopy(doc)
md5 = hashlib.md5()
md5.update((content + str(d["doc_id"])).encode("utf-8"))
d["id"] = md5.hexdigest()
d["id"] = xxhash.xxh64((content + str(d["doc_id"])).encode("utf-8")).hexdigest()
d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.now().timestamp()
d[vctr_nm] = vctr.tolist()