Feat: add splitter (#10161)

### What problem does this PR solve? ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com>
2026-02-05 18:15:06 +08:00 · 2025-09-19 10:15:19 +08:00
parent f9c7404bee
commit a1b947ffd6
81 changed files with 3083 additions and 799 deletions
--- a/rag/flow/hierarchical_merger/hierarchical_merger.py
+++ b/rag/flow/hierarchical_merger/hierarchical_merger.py
@ -0,0 +1,178 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import json
+import random
+import re
+from copy import deepcopy
+from functools import partial
+
+import trio
+
+from api.utils import get_uuid
+from api.utils.base64_image import id2image, image2id
+from deepdoc.parser.pdf_parser import RAGFlowPdfParser
+from rag.flow.base import ProcessBase, ProcessParamBase
+from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream
+from rag.nlp import concat_img
+from rag.utils.storage_factory import STORAGE_IMPL
+
+
+class HierarchicalMergerParam(ProcessParamBase):
+    """Parameters of the HierarchicalMerger component.
+
+    Attributes are documented inline in ``__init__``.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # One entry per heading level, outermost first; each entry is an
+        # iterable of regex patterns tried with ``re.search`` against a line.
+        self.levels = []
+        # Tree depth that the merger compares against while collecting chunks.
+        self.hierarchy = None
+
+    def check(self):
+        """Validate that both required parameters have been supplied."""
+        required = (
+            (self.levels, "Hierarchical setups."),
+            (self.hierarchy, "Hierarchy number."),
+        )
+        for value, label in required:
+            self.check_empty(value, label)
+
+    def get_input_form(self) -> dict[str, dict]:
+        """Return the input form description; this component exposes none."""
+        return dict()
+
+
+class HierarchicalMerger(ProcessBase):
+    """Merge upstream lines into chunks that follow the heading hierarchy.
+
+    Every input line is classified against ``self._param.levels`` (a list of
+    regex lists, one list per heading level).  Lines that match level ``k``
+    become heading nodes of level ``k``; lines that match nothing are body
+    text attached to the most recent heading.  The resulting tree is then cut
+    at depth ``self._param.hierarchy`` and each cut produces one chunk.
+    """
+
+    component_name = "HierarchicalMerger"
+
+    @staticmethod
+    def _new_node(level, index):
+        """Create a tree node for the heading found at line ``index``."""
+        return {"level": level, "index": index, "texts": [], "children": []}
+
+    @staticmethod
+    def _classify(lines, levels):
+        """Return, for each line, the first level whose patterns match it.
+
+        Lines matching no pattern get ``len(levels)``, which marks body text.
+        """
+        text_level = len(levels)
+        return [
+            next(
+                (
+                    lvl
+                    for lvl, regs in enumerate(levels)
+                    if any(re.search(reg, txt) for reg in regs)
+                ),
+                text_level,
+            )
+            for txt in lines
+        ]
+
+    @classmethod
+    def _build_tree(cls, matches, n_levels):
+        """Build the heading tree from per-line level numbers.
+
+        ``matches[i]`` is the level of line ``i``; ``n_levels`` marks body
+        text.  Nodes store line indices, not the text itself.
+        """
+        root = cls._new_node(-1, -1)
+        for i, lvl in enumerate(matches):
+            if lvl == 0:
+                # Top-level headings always hang directly off the root.
+                root["children"].append(cls._new_node(lvl, i))
+                continue
+
+            cur = root
+            if lvl == n_levels:
+                # Body text belongs to the deepest, most recent heading.
+                while cur["children"]:
+                    cur = cur["children"][-1]
+                cur["texts"].append(i)
+            else:
+                # Walk down the latest branch until we reach the direct
+                # parent level, or run out of children to descend into.
+                while cur["children"] and lvl != cur["level"] + 1:
+                    cur = cur["children"][-1]
+                cur["children"].append(cls._new_node(lvl, i))
+        return root
+
+    @staticmethod
+    def _subtree_indices(node):
+        """Return line indices of all descendants of ``node`` in document order."""
+        indices = []
+        stack = list(reversed(node["children"]))
+        while stack:
+            cur = stack.pop()
+            indices.append(cur["index"])
+            indices.extend(cur["texts"])
+            stack.extend(reversed(cur["children"]))
+        return indices
+
+    @classmethod
+    def _collect_paths(cls, root, hierarchy):
+        """Cut the tree at depth ``hierarchy`` and return one index list per chunk.
+
+        Each path holds the ancestor headings (with their body text), the
+        node itself, and - for nodes at the cut depth - the whole subtree.
+        Every path is an independent list: sharing one list between siblings
+        would make the chunks cumulative and duplicated.
+        """
+        paths = []
+        if root["texts"]:
+            # Lines that precede the first heading would otherwise be lost.
+            paths.append(list(root["texts"]))
+
+        def walk(node, prefix, depth):
+            for child in node["children"]:
+                path = [*prefix, child["index"], *child["texts"]]
+                if depth + 1 >= hierarchy or not child["children"]:
+                    # Cut depth reached (or a leaf above it): the node and
+                    # everything below it form a single chunk.
+                    path.extend(cls._subtree_indices(child))
+                    paths.append(path)
+                else:
+                    walk(child, path, depth + 1)
+
+        walk(root, [], 0)
+        return paths
+
+    async def _invoke(self, **kwargs):
+        """Validate upstream input, merge it hierarchically and emit ``chunks``.
+
+        On invalid input the ``_ERROR`` output is set and nothing else is
+        produced.  For markdown/text/html input each chunk is ``{"text": ...}``;
+        otherwise chunks also carry ``image`` and ``positions``.
+        """
+        try:
+            from_upstream = HierarchicalMergerFromUpstream.model_validate(kwargs)
+        except Exception as e:
+            self.set_output("_ERROR", f"Input error: {str(e)}")
+            return
+
+        self.callback(random.randint(1, 5) / 100.0, "Start to merge hierarchically.")
+        is_textual = from_upstream.output_format in ["markdown", "text", "html"]
+        if is_textual:
+            if from_upstream.output_format == "markdown":
+                payload = from_upstream.markdown_result
+            elif from_upstream.output_format == "text":
+                payload = from_upstream.text_result
+            else:  # == "html"
+                payload = from_upstream.html_result
+
+            lines = [ln for ln in (payload or "").split("\n") if ln]
+            section_images = []
+        else:
+            # json_result may be missing; treat that as an empty document.
+            records = from_upstream.json_result or []
+            lines = [o.get("text") or "" for o in records]
+            section_images = [o.get("img_id") for o in records]
+            # NOTE(review): upstream records also carry "position_tag", but it
+            # is not appended to the text here, so extract_positions() below
+            # only sees tags already embedded in "text" - confirm intent.
+
+        matches = self._classify(lines, self._param.levels)
+        root = self._build_tree(matches, len(self._param.levels))
+        all_pathes = self._collect_paths(root, self._param.hierarchy)
+
+        if is_textual:
+            cks = ["".join(lines[i] + "\n" for i in path) for path in all_pathes]
+            self.set_output("chunks", [{"text": c} for c in cks if c])
+        else:
+            cks = []
+            for path in all_pathes:
+                txt = "".join(lines[i] + "\n" for i in path)
+                img = None
+                for i in path:
+                    # Keep the returned image: img starts as None, so ignoring
+                    # the return value would leave every chunk without one.
+                    img = concat_img(img, id2image(section_images[i], partial(STORAGE_IMPL.get)))
+                cks.append(
+                    {
+                        "text": RAGFlowPdfParser.remove_tag(txt),
+                        "image": img,
+                        "positions": RAGFlowPdfParser.extract_positions(txt),
+                    }
+                )
+
+            # Presumably image2id persists each chunk's image and updates the
+            # dict in place - verify against api.utils.base64_image.
+            async with trio.open_nursery() as nursery:
+                for d in cks:
+                    nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), "_image_temps", get_uuid())
+
+            self.set_output("chunks", cks)
+
+        self.callback(1, "Done.")