mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-03 09:05:07 +08:00
Fix: Parent chunking fails on DOCX files (#12822)
### What problem does this PR solve?

Fixes parent chunking failing on DOCX files.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -330,7 +330,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=
|
||||
def doc_tokenize_chunks_with_images(chunks, doc, eng, child_delimiters_pattern=None, batch_size=10):
|
||||
res = []
|
||||
for ii, ck in enumerate(chunks):
|
||||
text = ck.get('context_above', "") + ck.get('text') + ck.get('context_below', "")
|
||||
text = ck.get("context_above", "") + ck.get("text") + ck.get("context_below", "")
|
||||
if len(text.strip()) == 0:
|
||||
continue
|
||||
logging.debug("-- {}".format(ck))
|
||||
@ -341,7 +341,7 @@ def doc_tokenize_chunks_with_images(chunks, doc, eng, child_delimiters_pattern=N
|
||||
|
||||
if ck.get("ck_type") == "text":
|
||||
if child_delimiters_pattern:
|
||||
d["mom_with_weight"] = ck
|
||||
d["mom_with_weight"] = text
|
||||
res.extend(split_with_pattern(d, child_delimiters_pattern, text, eng))
|
||||
continue
|
||||
elif ck.get("ck_type") == "image":
|
||||
|
||||
Reference in New Issue
Block a user