From f11ca54e0ecca759c5c403e7521d306aba7c1e0c Mon Sep 17 00:00:00 2001
From: Magicbook1108 <newyorkupperbay@gmail.com>
Date: Tue, 3 Feb 2026 15:36:58 +0800
Subject: [PATCH] Fix: docx parser output consistent (#12965)

### What problem does this PR solve?

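Fix: make handling of the docx parser's output consistent. The JSON output path in `_word` (`rag/flow/parser/parser.py`) unpacked the parser result as `sections, tbls`, but the result is iterated as a single list of `(text, image, html)` items, which raised the error below. This change also makes `naive_merge_with_images` treat a `None` text as an empty string.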

> File "/home/bxy/ragflow/rag/flow/parser/parser.py", line 506, in _word
>     sections, tbls = docx_parser(name, binary=blob)
>     ^^^^^^^^^^^^^^
> ValueError: too many values to unpack (expected 2)
>

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 docs/references/http_api_reference.md | 2 +-
 rag/flow/parser/parser.py             | 8 +++++++-
 rag/nlp/__init__.py                   | 6 ++++--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md
index 1c7b0a171..374967e76 100644
--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@@ -896,7 +896,7 @@ Success:
             "vector_similarity_weight": 0.3
         }
     ],
-    "total": 1
+    "total_datasets": 1
 }
 ```
 
diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py
index b2cc15c4f..7fcdde860 100644
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -503,7 +503,13 @@ class Parser(ProcessBase):
         docx_parser = Docx()
 
         if conf.get("output_format") == "json":
-            sections, tbls = docx_parser(name, binary=blob)
+            main_sections = docx_parser(name, binary=blob)
+            sections = []
+            tbls = []
+            for text, image, html in main_sections:
+                sections.append((text, image))
+                tbls.append(((None, html), ""))
+
             sections = [{"text": section[0], "image": section[1]} for section in sections if section]
             sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
 
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 2cd725197..d94d6301e 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -1168,6 +1168,8 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
         cks, result_images, tk_nums = [], [], []
         for text, image in zip(texts, images):
             text_str = text[0] if isinstance(text, tuple) else text
+            if text_str is None:
+                text_str = ""
             text_pos = text[1] if isinstance(text, tuple) and len(text) > 1 else ""
             split_sec = re.split(r"(%s)" % custom_pattern, text_str)
             for sub_sec in split_sec:
@@ -1187,11 +1189,11 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
     for text, image in zip(texts, images):
         # if text is tuple, unpack it
         if isinstance(text, tuple):
-            text_str = text[0]
+            text_str = text[0] if text[0] is not None else ""
             text_pos = text[1] if len(text) > 1 else ""
             add_chunk("\n" + text_str, image, text_pos)
         else:
-            add_chunk("\n" + text, image)
+            add_chunk("\n" + (text or ""), image)
 
     return cks, result_images