Fix: Parent chunking fails on DOCX files (#12822)

### What problem does this PR solve?

Fixes parent chunking failing on DOCX files. In `doc_tokenize_chunks_with_images`, the whole chunk dict (`ck`) was assigned to `d["mom_with_weight"]` instead of the chunk's assembled text, which broke parent-child chunking for DOCX input.
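A minimal sketch of the failure mode, with illustrative chunk data. Only the field names `context_above`, `text`, `context_below`, and `mom_with_weight` come from the diff below; the contents and the downstream failure shown are assumptions:

```python
# Illustrative repro of the bug; chunk contents are made up, field names are from the diff.
ck = {
    "ck_type": "text",
    "context_above": "Section heading. ",
    "text": "Body sentence one. Body sentence two.",
    "context_below": "",
}
d = {}
text = ck.get("context_above", "") + ck.get("text") + ck.get("context_below", "")

d["mom_with_weight"] = ck    # before the fix: a dict ends up where a string is expected
# Anything downstream that treats the parent chunk as text then fails, e.g.:
# d["mom_with_weight"].strip()  # AttributeError: 'dict' object has no attribute 'strip'

d["mom_with_weight"] = text  # after the fix: the parent content is the assembled string
assert isinstance(d["mom_with_weight"], str)
```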

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Author: Yongteng Lei (committed by GitHub)
Date: 2026-01-26 17:55:09 +08:00
Parent: e04cd99ae2
Commit: 13076bb87b


```diff
@@ -330,7 +330,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=
 def doc_tokenize_chunks_with_images(chunks, doc, eng, child_delimiters_pattern=None, batch_size=10):
     res = []
     for ii, ck in enumerate(chunks):
-        text = ck.get('context_above', "") + ck.get('text') + ck.get('context_below', "")
+        text = ck.get("context_above", "") + ck.get("text") + ck.get("context_below", "")
         if len(text.strip()) == 0:
             continue
         logging.debug("-- {}".format(ck))
@@ -341,7 +341,7 @@ def doc_tokenize_chunks_with_images(chunks, doc, eng, child_delimiters_pattern=N
         if ck.get("ck_type") == "text":
             if child_delimiters_pattern:
-                d["mom_with_weight"] = ck
+                d["mom_with_weight"] = text
                 res.extend(split_with_pattern(d, child_delimiters_pattern, text, eng))
                 continue
         elif ck.get("ck_type") == "image":
```
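`split_with_pattern` itself is outside the diff; as a rough picture of what delimiter-based child splitting does, here is a simplified stand-in (the function name `split_children` and the example regex are illustrative, not the project's API):

```python
import re

def split_children(parent_text: str, delimiters_pattern: str) -> list[str]:
    # Split a parent chunk's text into child chunks on a delimiter regex,
    # dropping empty fragments. Simplified stand-in for split_with_pattern.
    parts = re.split(delimiters_pattern, parent_text)
    return [p.strip() for p in parts if p.strip()]

# Example: split on ASCII and CJK sentence-ending punctuation.
print(split_children("First sentence. 第二句。Third one!", r"[.。!?!?]"))
# ['First sentence', '第二句', 'Third one']
```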
```diff
@@ -1367,7 +1367,7 @@ def _merge_cks(cks, chunk_token_num):
     merged = []
     image_idxs = []
     prev_text_ck = -1
     for i in range(len(cks)):
         ck_type = cks[i]["ck_type"]
@@ -1376,8 +1376,8 @@ def _merge_cks(cks, chunk_token_num):
         if ck_type == "image":
             image_idxs.append(len(merged) - 1)
             continue
         if prev_text_ck<0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num:
             merged.append(cks[i])
             prev_text_ck = len(merged) - 1
```
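Only fragments of `_merge_cks` appear in the diff. Below is a simplified sketch of the merging strategy those fragments imply; the else branch and the `tk_nums` bookkeeping are assumptions filled in for completeness, not the actual implementation:

```python
def merge_small_chunks(cks: list[dict], chunk_token_num: int) -> tuple[list[dict], list[int]]:
    # Greedy merge of small text chunks up to a token budget, mirroring the
    # visible _merge_cks logic. Assumes every chunk dict carries "ck_type",
    # "text", and a precomputed token count "tk_nums".
    merged: list[dict] = []
    image_idxs: list[int] = []
    prev_text_ck = -1
    for ck in cks:
        if ck["ck_type"] == "image":
            image_idxs.append(len(merged) - 1)  # record where this image sits in the merged output
            continue
        if prev_text_ck < 0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num:
            merged.append(dict(ck))             # previous chunk is full: start a new one
            prev_text_ck = len(merged) - 1
        else:                                   # assumption: fold into the previous text chunk
            merged[prev_text_ck]["text"] += ck["text"]
            merged[prev_text_ck]["tk_nums"] += ck["tk_nums"]
    return merged, image_idxs
```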
```diff
@@ -1390,25 +1390,25 @@ def _merge_cks(cks, chunk_token_num):
 def naive_merge_docx(
     sections,
     chunk_token_num = 128,
     delimiter="\n。;!?",
     table_context_size=0,
     image_context_size=0,):
     if not sections:
         return [], []
     cks, tables, images = _build_cks(sections, delimiter)
     if table_context_size > 0:
         for i in tables:
             _add_context(cks, i, table_context_size)
     if image_context_size > 0:
         for i in images:
             _add_context(cks, i, image_context_size)
     merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num)
     return merged_cks, merged_image_idx
```
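`_add_context` is also outside the diff. From its call sites above and the `context_above`/`context_below` fields read in the first hunk, it plausibly copies neighboring text onto table and image chunks; the implementation below is a guess for illustration only:

```python
def add_context(cks: list[dict], i: int, context_size: int) -> None:
    # Hypothetical stand-in for _add_context: copy up to context_size
    # characters from the neighboring text chunks onto chunk i, into the
    # context_above/context_below fields that doc_tokenize_chunks_with_images
    # reads when assembling a chunk's text.
    if i > 0 and cks[i - 1].get("ck_type") == "text":
        cks[i]["context_above"] = cks[i - 1]["text"][-context_size:]
    if i + 1 < len(cks) and cks[i + 1].get("ck_type") == "text":
        cks[i]["context_below"] = cks[i + 1]["text"][:context_size]
```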