Fix: make docx parser output consistent (#12965)

### What problem does this PR solve?

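Fix: make the docx parser output consistent. `docx_parser(name, binary=blob)` returns a single sequence of `(text, image, html)` items, but `_word` unpacked the result into two values (`sections, tbls`), which raised: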

> File "/home/bxy/ragflow/rag/flow/parser/parser.py", line 506, in _word
>     sections, tbls = docx_parser(name, binary=blob)
>     ^^^^^^^^^^^^^^
> ValueError: too many values to unpack (expected 2)
> 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Magicbook1108
2026-02-03 15:36:58 +08:00
committed by GitHub
parent deeae8dba4
commit f11ca54e0e
3 changed files with 12 additions and 4 deletions

View File

@ -896,7 +896,7 @@ Success:
"vector_similarity_weight": 0.3 "vector_similarity_weight": 0.3
} }
], ],
"total": 1 "total_datasets": 1
} }
``` ```

View File

@ -503,7 +503,13 @@ class Parser(ProcessBase):
docx_parser = Docx() docx_parser = Docx()
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
sections, tbls = docx_parser(name, binary=blob) main_sections = docx_parser(name, binary=blob)
sections = []
tbls = []
for text, image, html in main_sections:
sections.append((text, image))
tbls.append(((None, html), ""))
sections = [{"text": section[0], "image": section[1]} for section in sections if section] sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls]) sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])

View File

@ -1168,6 +1168,8 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
cks, result_images, tk_nums = [], [], [] cks, result_images, tk_nums = [], [], []
for text, image in zip(texts, images): for text, image in zip(texts, images):
text_str = text[0] if isinstance(text, tuple) else text text_str = text[0] if isinstance(text, tuple) else text
if text_str is None:
text_str = ""
text_pos = text[1] if isinstance(text, tuple) and len(text) > 1 else "" text_pos = text[1] if isinstance(text, tuple) and len(text) > 1 else ""
split_sec = re.split(r"(%s)" % custom_pattern, text_str) split_sec = re.split(r"(%s)" % custom_pattern, text_str)
for sub_sec in split_sec: for sub_sec in split_sec:
@ -1187,11 +1189,11 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
for text, image in zip(texts, images): for text, image in zip(texts, images):
# if text is tuple, unpack it # if text is tuple, unpack it
if isinstance(text, tuple): if isinstance(text, tuple):
text_str = text[0] text_str = text[0] if text[0] is not None else ""
text_pos = text[1] if len(text) > 1 else "" text_pos = text[1] if len(text) > 1 else ""
add_chunk("\n" + text_str, image, text_pos) add_chunk("\n" + text_str, image, text_pos)
else: else:
add_chunk("\n" + text, image) add_chunk("\n" + (text or ""), image)
return cks, result_images return cks, result_images