From f11ca54e0ecca759c5c403e7521d306aba7c1e0c Mon Sep 17 00:00:00 2001 From: Magicbook1108 Date: Tue, 3 Feb 2026 15:36:58 +0800 Subject: [PATCH] Fix: docx parser output consistent (#12965) ### What problem does this PR solve? Fix: docx parser output consistent > File "/home/bxy/ragflow/rag/flow/parser/parser.py", line 506, in _word > sections, tbls = docx_parser(name, binary=blob) > ^^^^^^^^^^^^^^ > ValueError: too many values to unpack (expected 2) > ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- docs/references/http_api_reference.md | 2 +- rag/flow/parser/parser.py | 8 +++++++- rag/nlp/__init__.py | 6 ++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index 1c7b0a171..374967e76 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -896,7 +896,7 @@ Success: "vector_similarity_weight": 0.3 } ], - "total": 1 + "total_datasets": 1 } ``` diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index b2cc15c4f..7fcdde860 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -503,7 +503,13 @@ class Parser(ProcessBase): docx_parser = Docx() if conf.get("output_format") == "json": - sections, tbls = docx_parser(name, binary=blob) + main_sections = docx_parser(name, binary=blob) + sections = [] + tbls = [] + for text, image, html in main_sections: + sections.append((text, image)) + tbls.append(((None, html), "")) + sections = [{"text": section[0], "image": section[1]} for section in sections if section] sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls]) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 2cd725197..d94d6301e 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -1168,6 +1168,8 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。 cks, result_images, tk_nums = [], [], [] for text, image in zip(texts, images): text_str = text[0] if isinstance(text, tuple) else text + if text_str is None: + text_str = "" text_pos = text[1] if isinstance(text, tuple) and len(text) > 1 else "" split_sec = re.split(r"(%s)" % custom_pattern, text_str) for sub_sec in split_sec: @@ -1187,11 +1189,11 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。 for text, image in zip(texts, images): # if text is tuple, unpack it if isinstance(text, tuple): - text_str = text[0] + text_str = text[0] if text[0] is not None else "" text_pos = text[1] if len(text) > 1 else "" add_chunk("\n" + text_str, image, text_pos) else: - add_chunk("\n" + text, image) + add_chunk("\n" + (text or ""), image) return cks, result_images