# # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import random import re from copy import deepcopy from functools import partial import trio from common.misc_utils import get_uuid from rag.utils.base64_image import id2image, image2id from deepdoc.parser.pdf_parser import RAGFlowPdfParser from rag.flow.base import ProcessBase, ProcessParamBase from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream from rag.nlp import concat_img from rag.utils.storage_factory import STORAGE_IMPL class HierarchicalMergerParam(ProcessParamBase): def __init__(self): super().__init__() self.levels = [] self.hierarchy = None def check(self): self.check_empty(self.levels, "Hierarchical setups.") self.check_empty(self.hierarchy, "Hierarchy number.") def get_input_form(self) -> dict[str, dict]: return {} class HierarchicalMerger(ProcessBase): component_name = "HierarchicalMerger" async def _invoke(self, **kwargs): try: from_upstream = HierarchicalMergerFromUpstream.model_validate(kwargs) except Exception as e: self.set_output("_ERROR", f"Input error: {str(e)}") return self.set_output("output_format", "chunks") self.callback(random.randint(1, 5) / 100.0, "Start to merge hierarchically.") if from_upstream.output_format in ["markdown", "text", "html"]: if from_upstream.output_format == "markdown": payload = from_upstream.markdown_result elif from_upstream.output_format == "text": payload = from_upstream.text_result else: # == "html" payload = from_upstream.html_result if not payload: payload = "" lines = [ln for ln in payload.split("\n") if ln] else: arr = from_upstream.chunks if from_upstream.output_format == "chunks" else from_upstream.json_result lines = [o.get("text", "") for o in arr] sections, section_images = [], [] for o in arr or []: sections.append((o.get("text", ""), o.get("position_tag", ""))) section_images.append(o.get("img_id")) matches = [] for txt in lines: good = False for lvl, regs in enumerate(self._param.levels): for reg in regs: if re.search(reg, txt): matches.append(lvl) good = True break if good: break if not good: matches.append(len(self._param.levels)) assert len(matches) == len(lines), f"{len(matches)} vs. {len(lines)}" root = { "level": -1, "index": -1, "texts": [], "children": [] } for i, m in enumerate(matches): if m == 0: root["children"].append({ "level": m, "index": i, "texts": [], "children": [] }) elif m == len(self._param.levels): def dfs(b): if not b["children"]: b["texts"].append(i) else: dfs(b["children"][-1]) dfs(root) else: def dfs(b): nonlocal m, i if not b["children"] or m == b["level"] + 1: b["children"].append({ "level": m, "index": i, "texts": [], "children": [] }) return dfs(b["children"][-1]) dfs(root) all_pathes = [] def dfs(n, path, depth): nonlocal all_pathes if not n["children"] and path: all_pathes.append(path) for nn in n["children"]: if depth < self._param.hierarchy: _path = deepcopy(path) else: _path = path _path.extend([nn["index"], *nn["texts"]]) dfs(nn, _path, depth+1) if depth == self._param.hierarchy: all_pathes.append(_path) for i in range(len(lines)): print(i, lines[i]) dfs(root, [], 0) if root["texts"]: all_pathes.insert(0, root["texts"]) if from_upstream.output_format in ["markdown", "text", "html"]: cks = [] for path in all_pathes: txt = "" for i in path: txt += lines[i] + "\n" cks.append(txt) self.set_output("chunks", [{"text": c} for c in cks if c]) else: cks = [] images = [] for path in all_pathes: txt = "" img = None for i in path: txt += lines[i] + "\n" concat_img(img, id2image(section_images[i], partial(STORAGE_IMPL.get, tenant_id=self._canvas._tenant_id))) cks.append(txt) images.append(img) cks = [ { "text": RAGFlowPdfParser.remove_tag(c), "image": img, "positions": RAGFlowPdfParser.extract_positions(c), } for c, img in zip(cks, images) ] async with trio.open_nursery() as nursery: for d in cks: nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid()) self.set_output("chunks", cks) self.callback(1, "Done.")