mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
FIX: If chunk["content_with_weight"] contains one or more unpaired surrogate characters (such as incomplete emoji or other special characters), then calling .encode("utf-8") directly will raise a UnicodeEncodeError. (#9246)
FIX: If chunk["content_with_weight"] contains one or more unpaired
surrogate characters (such as incomplete emoji or other special
characters), then calling .encode("utf-8") directly will raise a
UnicodeEncodeError.
### What problem does this PR solve?
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -284,7 +284,7 @@ async def build_chunks(task, progress_callback):
|
||||
try:
|
||||
d = copy.deepcopy(document)
|
||||
d.update(chunk)
|
||||
d["id"] = xxhash.xxh64((chunk["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
|
||||
d["id"] = xxhash.xxh64((chunk["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest()
|
||||
d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
|
||||
d["create_timestamp_flt"] = datetime.now().timestamp()
|
||||
if not d.get("image"):
|
||||
|
||||
Reference in New Issue
Block a user