Fix: debug hierarchical merging... (#10337)

### What problem does this PR solve?

Debugs hierarchical merging and its surrounding pipeline: fixes sibling-path leakage in the merger's DFS, a `cks.append(cks)` self-append, and the missing `chunks` output; filters empty chunks in the Splitter; accepts HTML payloads in the Tokenizer; indexes the Excel parser's HTML output correctly; and tidies Redis trace logging in the Pipeline.
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Kevin Hu committed 2025-09-29 09:29:33 +08:00 (committed by GitHub)
parent 664bc0b961 · commit 71f69cdb75
13 changed files with 113 additions and 46 deletions


```diff
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
 import random
 import re
 from copy import deepcopy
```

```diff
@@ -68,9 +68,10 @@ class HierarchicalMerger(ProcessBase):
             lines = [ln for ln in payload.split("\n") if ln]
         else:
-            lines = [o.get("text", "") for o in from_upstream.json_result]
+            arr = from_upstream.chunks if from_upstream.output_format == "chunks" else from_upstream.json_result
+            lines = [o.get("text", "") for o in arr]
         sections, section_images = [], []
-        for o in from_upstream.json_result or []:
+        for o in arr or []:
             sections.append((o.get("text", ""), o.get("position_tag", "")))
             section_images.append(o.get("img_id"))
```
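
The merger now accepts pre-chunked upstream output: when `output_format == "chunks"` it reads `from_upstream.chunks`, otherwise it falls back to `json_result`, and both shapes feed the same extraction. A minimal sketch of the normalization (the record dicts and function name are illustrative):

```python
def extract_lines(output_format, chunks, json_result):
    # Pick whichever payload the upstream actually produced, then
    # normalize to a list of text strings, tolerating a missing list.
    arr = chunks if output_format == "chunks" else json_result
    return [o.get("text", "") for o in arr or []]

print(extract_lines("chunks", [{"text": "a"}], None))    # ['a']
print(extract_lines("json", None, [{"text": "b"}, {}]))  # ['b', '']
```
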
```diff
@@ -128,21 +129,26 @@ class HierarchicalMerger(ProcessBase):
         all_pathes = []
         def dfs(n, path, depth):
             nonlocal all_pathes
-            if depth < self._param.hierarchy:
-                path = deepcopy(path)
+            if not n["children"] and path:
+                all_pathes.append(path)
             for nn in n["children"]:
-                path.extend([nn["index"], *nn["texts"]])
-                dfs(nn, path, depth+1)
+                if depth < self._param.hierarchy:
+                    _path = deepcopy(path)
+                else:
+                    _path = path
+                _path.extend([nn["index"], *nn["texts"]])
+                dfs(nn, _path, depth+1)
-            if depth == self._param.hierarchy:
-                all_pathes.append(path)
+                if depth == self._param.hierarchy:
+                    all_pathes.append(_path)
+        for i in range(len(lines)):
+            print(i, lines[i])
         dfs(root, [], 0)
+        print("sSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS", json.dumps(root, ensure_ascii=False, indent=2))
         if root["texts"]:
             all_pathes.insert(0, root["texts"])
         if from_upstream.output_format in ["markdown", "text", "html"]:
             cks = []
             for path in all_pathes:
```
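
The DFS rewrite is the heart of the fix. The old code deep-copied `path` once per call and then let every sibling extend the same list, so sibling branches leaked into one another, and paths at childless nodes below the configured depth were never collected. The new code copies per child (up to `self._param.hierarchy`) and records paths at childless nodes; the hunk also leaves two debug `print` calls in, matching the PR title. A self-contained sketch of the aliasing bug and the per-child copy (function and field names are illustrative, not the component's API):

```python
from copy import deepcopy

def collect_buggy(node, path, out):
    # Old shape: every sibling extends the SAME list object,
    # so earlier siblings' texts leak into later paths.
    for child in node["children"]:
        path.extend(child["texts"])
        collect_buggy(child, path, out)
    if not node["children"]:
        out.append(path)

def collect_fixed(node, path, out):
    # New shape: record paths at childless nodes, and give each
    # child its own copy of the path before extending it.
    if not node["children"] and path:
        out.append(path)
    for child in node["children"]:
        branch = deepcopy(path)
        branch.extend(child["texts"])
        collect_fixed(child, branch, out)

root = {"texts": [], "children": [
    {"texts": ["A"], "children": []},
    {"texts": ["B"], "children": []},
]}
buggy, fixed = [], []
collect_buggy(root, [], buggy)
collect_fixed(root, [], fixed)
print(buggy)  # [['A', 'B'], ['A', 'B']] -- one shared, polluted list, twice
print(fixed)  # [['A'], ['B']]
```
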
```diff
@@ -161,7 +167,7 @@ class HierarchicalMerger(ProcessBase):
             for i in path:
                 txt += lines[i] + "\n"
                 concat_img(img, id2image(section_images[i], partial(STORAGE_IMPL.get)))
-            cks.append(cks)
+            cks.append(txt)
             images.append(img)
         cks = [
```
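
The one-token change above fixes a self-append: `cks.append(cks)` stored the list inside itself instead of the assembled chunk text, yielding a self-referential list that JSON serialization rejects:

```python
import json

cks = []
cks.append(cks)   # the old bug: the list now contains itself
print(cks)        # [[...]] -- Python's repr marks the cycle
try:
    json.dumps(cks)
except ValueError as e:
    print(e)      # Circular reference detected
```
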
```diff
@@ -175,5 +181,6 @@ class HierarchicalMerger(ProcessBase):
         async with trio.open_nursery() as nursery:
             for d in cks:
                 nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
+        self.set_output("chunks", cks)
         self.callback(1, "Done.")
```
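
Without the added `set_output("chunks", cks)`, the merger computed its chunks but never published them, so downstream components saw no output. A toy sketch of the assumed output contract (the dict-backed `set_output` here is an assumption about `ProcessBase` internals, not its actual code):

```python
class ProcessSketch:
    """Stand-in for ProcessBase; assumes outputs live in a keyed dict."""
    def __init__(self):
        self._outputs = {}

    def set_output(self, key, value):
        # Assumed contract: downstream components read these keyed outputs.
        self._outputs[key] = value

p = ProcessSketch()
p.set_output("chunks", [{"text": "merged section"}])
print(p._outputs["chunks"])  # [{'text': 'merged section'}]
```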


```diff
@@ -235,8 +235,8 @@ class Parser(ProcessBase):
         self.set_output("output_format", conf["output_format"])
         spreadsheet_parser = ExcelParser()
         if conf.get("output_format") == "html":
-            html = spreadsheet_parser.html(blob, 1000000000)
-            self.set_output("html", html)
+            htmls = spreadsheet_parser.html(blob, 1000000000)
+            self.set_output("html", htmls[0])
         elif conf.get("output_format") == "json":
             self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
         elif conf.get("output_format") == "markdown":
```


```diff
@@ -75,7 +75,6 @@ class Pipeline(Graph):
                    "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}],
                }
            ]
-           REDIS_CONN.set_obj(log_key, obj, 60 * 30)
            if component_name != "END" and self._doc_id and self.task_id:
                percentage = 1.0 / len(self.components.items())
                finished = 0.0
```

```diff
@@ -94,6 +93,10 @@ class Pipeline(Graph):
                    t = obj[-1]["trace"][-1]
                    msg += "%s: %s\n" % (t["datetime"], t["message"])
                    TaskService.update_progress(self.task_id, {"progress": finished, "progress_msg": msg})
+           elif component_name == "END" and not self._doc_id:
+               obj[-1]["trace"][-1]["dsl"] = json.loads(str(self))
+           REDIS_CONN.set_obj(log_key, obj, 60 * 30)
        except Exception as e:
            logging.exception(e)
```
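
Taken together, the two Pipeline hunks move the Redis write from before the trace bookkeeping to after it, and standalone runs (no `_doc_id`) now snapshot the pipeline DSL onto the final trace entry before persisting. A sketch of the write-after-update ordering, assuming `set_obj(key, obj, ttl)` stores a JSON-serializable object with a TTL in seconds:

```python
import json

class FakeRedis:
    """Stand-in for REDIS_CONN; assumes set_obj(key, obj, ttl_seconds)."""
    def __init__(self):
        self.store = {}

    def set_obj(self, key, obj, ttl):
        self.store[key] = json.dumps(obj)  # TTL ignored in this sketch

redis = FakeRedis()
log_key = "pipeline-log-123"  # illustrative key
obj = [{"component_name": "END", "trace": [{"message": "Done."}]}]

# New ordering: enrich the last trace entry first, persist once afterwards.
obj[-1]["trace"][-1]["dsl"] = {"components": {}}  # stand-in for json.loads(str(self))
redis.set_obj(log_key, obj, 60 * 30)              # 30-minute TTL
print(redis.store[log_key])
```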


```diff
@@ -102,7 +102,7 @@ class Splitter(ProcessBase):
                "image": img,
                "positions": [[pos[0][-1], *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
            }
-           for c, img in zip(chunks, images)
+           for c, img in zip(chunks, images) if c.strip()
        ]
        async with trio.open_nursery() as nursery:
            for d in cks:
```
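
The Splitter change is a comprehension-level filter: zipped `(chunk, image)` pairs whose text is empty or whitespace-only are dropped before position extraction, so blank chunks never reach storage. A minimal sketch:

```python
chunks = ["Intro text", "   ", "", "Body text"]
images = ["img0", "img1", "img2", "img3"]

# Keep a (chunk, image) pair only when the chunk has real content.
kept = [(c, img) for c, img in zip(chunks, images) if c.strip()]
print(kept)  # [('Intro text', 'img0'), ('Body text', 'img3')]
```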


```diff
@@ -40,12 +40,14 @@ class TokenizerFromUpstream(BaseModel):
         if self.chunks:
             return self
-        if self.output_format in {"markdown", "text"}:
+        if self.output_format in {"markdown", "text", "html"}:
             if self.output_format == "markdown" and not self.markdown_result:
                 raise ValueError("output_format=markdown requires a markdown payload (field: 'markdown' or 'markdown_result').")
             if self.output_format == "text" and not self.text_result:
                 raise ValueError("output_format=text requires a text payload (field: 'text' or 'text_result').")
+            if self.output_format == "html" and not self.html_result:
+                raise ValueError("output_format=html requires an html payload (field: 'html' or 'html_result').")
         else:
-            if not self.json_result:
+            if not self.json_result and not self.chunks:
                 raise ValueError("When no chunks are provided and output_format is not markdown/text, a JSON list payload is required (field: 'json' or 'json_result').")
         return self
```
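
The validator now treats `html` like `markdown` and `text`, and the fallback JSON branch also tolerates pre-chunked input. A self-contained sketch of the same per-format check, assuming a pydantic v2 `model_validator(mode="after")` as the class's `return self` pattern suggests:

```python
from typing import Optional
from pydantic import BaseModel, model_validator

class UpstreamSketch(BaseModel):
    output_format: str = ""
    chunks: list = []
    markdown_result: Optional[str] = None
    text_result: Optional[str] = None
    html_result: Optional[str] = None
    json_result: Optional[list] = None

    @model_validator(mode="after")
    def check_payload(self):
        if self.chunks:
            return self  # pre-chunked input needs no further payload
        required = {
            "markdown": self.markdown_result,
            "text": self.text_result,
            "html": self.html_result,
        }
        if self.output_format in required:
            if not required[self.output_format]:
                raise ValueError(f"output_format={self.output_format} requires a matching payload.")
        elif not self.json_result:
            raise ValueError("A JSON list payload is required for non-markdown/text/html formats.")
        return self

UpstreamSketch(output_format="html", html_result="<p>ok</p>")  # validates
```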


```diff
@@ -137,7 +137,7 @@ class Tokenizer(ProcessBase):
            payload = from_upstream.markdown_result
        elif from_upstream.output_format == "text":
            payload = from_upstream.text_result
-       else: # == "html"
+       else:
            payload = from_upstream.html_result
        if not payload:
```