mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: add splitter (#10161)
### What problem does this PR solve? ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com>
This commit is contained in:
178
rag/flow/hierarchical_merger/hierarchical_merger.py
Normal file
178
rag/flow/hierarchical_merger/hierarchical_merger.py
Normal file
@ -0,0 +1,178 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from functools import partial
|
||||
|
||||
import trio
|
||||
|
||||
from api.utils import get_uuid
|
||||
from api.utils.base64_image import id2image, image2id
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream
|
||||
from rag.nlp import concat_img
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
|
||||
|
||||
class HierarchicalMergerParam(ProcessParamBase):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.levels = []
|
||||
self.hierarchy = None
|
||||
|
||||
def check(self):
|
||||
self.check_empty(self.levels, "Hierarchical setups.")
|
||||
self.check_empty(self.hierarchy, "Hierarchy number.")
|
||||
|
||||
def get_input_form(self) -> dict[str, dict]:
|
||||
return {}
|
||||
|
||||
|
||||
class HierarchicalMerger(ProcessBase):
|
||||
component_name = "HierarchicalMerger"
|
||||
|
||||
async def _invoke(self, **kwargs):
|
||||
try:
|
||||
from_upstream = HierarchicalMergerFromUpstream.model_validate(kwargs)
|
||||
except Exception as e:
|
||||
self.set_output("_ERROR", f"Input error: {str(e)}")
|
||||
return
|
||||
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to merge hierarchically.")
|
||||
if from_upstream.output_format in ["markdown", "text", "html"]:
|
||||
if from_upstream.output_format == "markdown":
|
||||
payload = from_upstream.markdown_result
|
||||
elif from_upstream.output_format == "text":
|
||||
payload = from_upstream.text_result
|
||||
else: # == "html"
|
||||
payload = from_upstream.html_result
|
||||
|
||||
if not payload:
|
||||
payload = ""
|
||||
|
||||
lines = [ln for ln in payload.split("\n") if ln]
|
||||
else:
|
||||
lines = [o.get("text", "") for o in from_upstream.json_result]
|
||||
sections, section_images = [], []
|
||||
for o in from_upstream.json_result or []:
|
||||
sections.append((o.get("text", ""), o.get("position_tag", "")))
|
||||
section_images.append(o.get("img_id"))
|
||||
|
||||
matches = []
|
||||
for txt in lines:
|
||||
good = False
|
||||
for lvl, regs in enumerate(self._param.levels):
|
||||
for reg in regs:
|
||||
if re.search(reg, txt):
|
||||
matches.append(lvl)
|
||||
good = True
|
||||
break
|
||||
if good:
|
||||
break
|
||||
if not good:
|
||||
matches.append(len(self._param.levels))
|
||||
assert len(matches) == len(lines), f"{len(matches)} vs. {len(lines)}"
|
||||
|
||||
root = {
|
||||
"level": -1,
|
||||
"index": -1,
|
||||
"texts": [],
|
||||
"children": []
|
||||
}
|
||||
for i, m in enumerate(matches):
|
||||
if m == 0:
|
||||
root["children"].append({
|
||||
"level": m,
|
||||
"index": i,
|
||||
"texts": [],
|
||||
"children": []
|
||||
})
|
||||
elif m == len(self._param.levels):
|
||||
def dfs(b):
|
||||
if not b["children"]:
|
||||
b["texts"].append(i)
|
||||
else:
|
||||
dfs(b["children"][-1])
|
||||
dfs(root)
|
||||
else:
|
||||
def dfs(b):
|
||||
nonlocal m, i
|
||||
if not b["children"] or m == b["level"] + 1:
|
||||
b["children"].append({
|
||||
"level": m,
|
||||
"index": i,
|
||||
"texts": [],
|
||||
"children": []
|
||||
})
|
||||
return
|
||||
dfs(b["children"][-1])
|
||||
|
||||
dfs(root)
|
||||
|
||||
all_pathes = []
|
||||
def dfs(n, path, depth):
|
||||
nonlocal all_pathes
|
||||
if depth < self._param.hierarchy:
|
||||
path = deepcopy(path)
|
||||
|
||||
for nn in n["children"]:
|
||||
path.extend([nn["index"], *nn["texts"]])
|
||||
dfs(nn, path, depth+1)
|
||||
|
||||
if depth == self._param.hierarchy:
|
||||
all_pathes.append(path)
|
||||
|
||||
for i in range(len(lines)):
|
||||
print(i, lines[i])
|
||||
dfs(root, [], 0)
|
||||
print("sSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS", json.dumps(root, ensure_ascii=False, indent=2))
|
||||
|
||||
if from_upstream.output_format in ["markdown", "text", "html"]:
|
||||
cks = []
|
||||
for path in all_pathes:
|
||||
txt = ""
|
||||
for i in path:
|
||||
txt += lines[i] + "\n"
|
||||
cks.append(txt)
|
||||
|
||||
self.set_output("chunks", [{"text": c} for c in cks if c])
|
||||
else:
|
||||
cks = []
|
||||
images = []
|
||||
for path in all_pathes:
|
||||
txt = ""
|
||||
img = None
|
||||
for i in path:
|
||||
txt += lines[i] + "\n"
|
||||
concat_img(img, id2image(section_images[i], partial(STORAGE_IMPL.get)))
|
||||
cks.append(cks)
|
||||
images.append(img)
|
||||
|
||||
cks = [
|
||||
{
|
||||
"text": RAGFlowPdfParser.remove_tag(c),
|
||||
"image": img,
|
||||
"positions": RAGFlowPdfParser.extract_positions(c),
|
||||
}
|
||||
for c, img in zip(cks, images)
|
||||
]
|
||||
async with trio.open_nursery() as nursery:
|
||||
for d in cks:
|
||||
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), "_image_temps", get_uuid())
|
||||
|
||||
self.callback(1, "Done.")
|
||||
Reference in New Issue
Block a user