mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-20 04:39:00 +08:00
Feat: add splitter (#10161)
### What problem does this PR solve? ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com>
This commit is contained in:
@ -13,7 +13,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from functools import partial
|
||||
@ -44,17 +43,17 @@ class ProcessBase(ComponentBase):
|
||||
self.set_output("_created_time", time.perf_counter())
|
||||
for k, v in kwargs.items():
|
||||
self.set_output(k, v)
|
||||
try:
|
||||
with trio.fail_after(self._param.timeout):
|
||||
await self._invoke(**kwargs)
|
||||
self.callback(1, "Done")
|
||||
except Exception as e:
|
||||
if self.get_exception_default_value():
|
||||
self.set_exception_default_value()
|
||||
else:
|
||||
self.set_output("_ERROR", str(e))
|
||||
logging.exception(e)
|
||||
self.callback(-1, str(e))
|
||||
#try:
|
||||
with trio.fail_after(self._param.timeout):
|
||||
await self._invoke(**kwargs)
|
||||
self.callback(1, "Done")
|
||||
#except Exception as e:
|
||||
# if self.get_exception_default_value():
|
||||
# self.set_exception_default_value()
|
||||
# else:
|
||||
# self.set_output("_ERROR", str(e))
|
||||
# logging.exception(e)
|
||||
# self.callback(-1, str(e))
|
||||
self.set_output("_elapsed_time", time.perf_counter() - self.output("_created_time"))
|
||||
return self.output()
|
||||
|
||||
|
||||
@ -12,18 +12,19 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import json
|
||||
import random
|
||||
|
||||
import trio
|
||||
|
||||
from api.db import LLMType
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
from graphrag.utils import chat_limiter, get_llm_cache, set_llm_cache
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.flow.chunker.schema import ChunkerFromUpstream
|
||||
from rag.nlp import naive_merge, naive_merge_with_images
|
||||
from rag.prompts.prompts import keyword_extraction, question_proposal
|
||||
from rag.nlp import naive_merge, naive_merge_with_images, concat_img
|
||||
from rag.prompts.prompts import keyword_extraction, question_proposal, detect_table_of_contents, \
|
||||
table_of_contents_index, toc_transformer
|
||||
from rag.utils import num_tokens_from_string
|
||||
|
||||
|
||||
class ChunkerParam(ProcessParamBase):
|
||||
@ -43,6 +44,7 @@ class ChunkerParam(ProcessParamBase):
|
||||
"paper",
|
||||
"laws",
|
||||
"presentation",
|
||||
"toc" # table of contents
|
||||
# Other
|
||||
# "Tag" # TODO: Other method
|
||||
]
|
||||
@ -54,7 +56,7 @@ class ChunkerParam(ProcessParamBase):
|
||||
self.auto_keywords = 0
|
||||
self.auto_questions = 0
|
||||
self.tag_sets = []
|
||||
self.llm_setting = {"llm_name": "", "lang": "Chinese"}
|
||||
self.llm_setting = {"llm_id": "", "lang": "Chinese"}
|
||||
|
||||
def check(self):
|
||||
self.check_valid_value(self.method.lower(), "Chunk method abnormal.", self.method_options)
|
||||
@ -142,6 +144,91 @@ class Chunker(ProcessBase):
|
||||
def _one(self, from_upstream: ChunkerFromUpstream):
|
||||
pass
|
||||
|
||||
def _toc(self, from_upstream: ChunkerFromUpstream):
    """Chunk a parsed document along its table of contents (ToC).

    The upstream JSON sections are concatenated into text windows (a new
    window is started once the running token count exceeds 1024) and passed
    to ``detect_table_of_contents``. When a ToC is found it is transformed
    with ``toc_transformer``, linked to section indices with
    ``table_of_contents_index``, and every linked entry becomes one chunk
    made of the sections it covers.

    Args:
        from_upstream: Validated upstream payload. Only ``output_format``
            and ``json_result`` are read here.

    Returns:
        A list of ``{"text", "image", "positions"}`` dicts, one per ToC
        entry that covers at least one section. ``None`` when the upstream
        format is markdown/text/html or when no ToC is detected.
    """
    import logging

    self.callback(random.randint(1, 5) / 100.0, "Start to chunk via `ToC`.")
    if from_upstream.output_format in ["markdown", "text", "html"]:
        # Only the JSON output of the parser is handled by this method.
        return

    # json
    sections, section_images = [], []
    windows, window_tokens = [""], 0
    for o in from_upstream.json_result or []:
        txt = o.get("text", "")
        windows[-1] += "\n" + txt
        window_tokens += num_tokens_from_string(txt)
        if window_tokens > 1024:
            # Current window is full: open a new one for the next section.
            windows.append("")
            window_tokens = 0
        sections.append((txt, o.get("position_tag", "")))
        section_images.append(o.get("image"))

    llm_setting = self._param.llm_setting
    chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_id"], lang=llm_setting["lang"])
    self.callback(random.randint(5, 15) / 100.0, "Start to detect table of contents...")
    toc_secs = detect_table_of_contents(windows, chat_mdl)
    if not toc_secs:
        self.callback(message="No table of contents detected.")
        return

    self.callback(random.randint(25, 35) / 100.0, "Start to extract table of contents...")
    toc_arr = toc_transformer(toc_secs, chat_mdl)
    toc_arr = [it for it in toc_arr if it.get("structure")]
    logging.debug("Extracted table of contents: %s", json.dumps(toc_arr, ensure_ascii=False))

    self.callback(random.randint(35, 75) / 100.0, "Start to link table of contents...")
    toc_arr = table_of_contents_index(toc_arr, [t for t, _ in sections], chat_mdl)

    # Sections lying between two consecutive linked entries are assigned to
    # the earlier entry, so no section in the middle of the document is lost.
    linked = [it for it in toc_arr if it.get("indices")]
    for cur, nxt in zip(linked, linked[1:]):
        cur["indices"].extend(range(cur["indices"][-1] + 1, nxt["indices"][0]))

    # Put all sections after the last linked entry into that entry.
    # Entries whose last index is 0 are skipped, exactly as before.
    for it in reversed(toc_arr):
        if it.get("indices") and it["indices"][-1]:
            it["indices"] = list(range(it["indices"][0], len(sections)))
            break
    logging.debug("Linked table of contents: %s", json.dumps(toc_arr, ensure_ascii=False))

    chunks, images = [], []
    total = len(sections)
    for it in toc_arr:
        txt, img = "", None
        for idx in it.get("indices") or []:
            if not 0 <= idx < total:
                # Indices are produced by table_of_contents_index; ignore any
                # that do not point at an existing section instead of crashing.
                continue
            txt += "\n" + sections[idx][0] + "\t" + sections[idx][1]
            pic = section_images[idx]
            if img and pic:
                img = concat_img(img, pic)
            elif pic:
                img = pic
        if not txt:
            continue
        chunks.append(txt)
        images.append(img)

    self.callback(1, "Done")
    return [
        {
            "text": RAGFlowPdfParser.remove_tag(c),
            "image": img,
            "positions": RAGFlowPdfParser.extract_positions(c),
        }
        for c, img in zip(chunks, images)
    ]
|
||||
|
||||
|
||||
async def _invoke(self, **kwargs):
|
||||
function_map = {
|
||||
"general": self._general,
|
||||
@ -154,6 +241,7 @@ class Chunker(ProcessBase):
|
||||
"laws": self._laws,
|
||||
"presentation": self._presentation,
|
||||
"one": self._one,
|
||||
"toc": self._toc,
|
||||
}
|
||||
|
||||
try:
|
||||
@ -167,7 +255,7 @@ class Chunker(ProcessBase):
|
||||
|
||||
async def auto_keywords():
|
||||
nonlocal chunks, llm_setting
|
||||
chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_name"], lang=llm_setting["lang"])
|
||||
chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_id"], lang=llm_setting["lang"])
|
||||
|
||||
async def doc_keyword_extraction(chat_mdl, ck, topn):
|
||||
cached = get_llm_cache(chat_mdl.llm_name, ck["text"], "keywords", {"topn": topn})
|
||||
@ -184,7 +272,7 @@ class Chunker(ProcessBase):
|
||||
|
||||
async def auto_questions():
|
||||
nonlocal chunks, llm_setting
|
||||
chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_name"], lang=llm_setting["lang"])
|
||||
chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_id"], lang=llm_setting["lang"])
|
||||
|
||||
async def doc_question_proposal(chat_mdl, d, topn):
|
||||
cached = get_llm_cache(chat_mdl.llm_name, ck["text"], "question", {"topn": topn})
|
||||
|
||||
@ -22,7 +22,7 @@ class ChunkerFromUpstream(BaseModel):
|
||||
elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
|
||||
|
||||
name: str
|
||||
blob: bytes
|
||||
file: dict | None = Field(default=None)
|
||||
|
||||
output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)
|
||||
|
||||
|
||||
@ -14,10 +14,7 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
|
||||
|
||||
class FileParam(ProcessParamBase):
|
||||
@ -41,10 +38,13 @@ class File(ProcessBase):
|
||||
self.set_output("_ERROR", f"Document({self._canvas._doc_id}) not found!")
|
||||
return
|
||||
|
||||
b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
|
||||
self.set_output("blob", STORAGE_IMPL.get(b, n))
|
||||
#b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
|
||||
#self.set_output("blob", STORAGE_IMPL.get(b, n))
|
||||
self.set_output("name", doc.name)
|
||||
else:
|
||||
file = kwargs.get("file")
|
||||
self.set_output("name", file["name"])
|
||||
self.set_output("blob", FileService.get_blob(file["created_by"], file["id"]))
|
||||
self.set_output("file", file)
|
||||
#self.set_output("blob", FileService.get_blob(file["created_by"], file["id"]))
|
||||
|
||||
self.callback(1, "File fetched.")
|
||||
|
||||
15
rag/flow/hierarchical_merger/__init__.py
Normal file
15
rag/flow/hierarchical_merger/__init__.py
Normal file
@ -0,0 +1,15 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
178
rag/flow/hierarchical_merger/hierarchical_merger.py
Normal file
178
rag/flow/hierarchical_merger/hierarchical_merger.py
Normal file
@ -0,0 +1,178 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from functools import partial
|
||||
|
||||
import trio
|
||||
|
||||
from api.utils import get_uuid
|
||||
from api.utils.base64_image import id2image, image2id
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream
|
||||
from rag.nlp import concat_img
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
|
||||
|
||||
class HierarchicalMergerParam(ProcessParamBase):
    """Parameters of the ``HierarchicalMerger`` component.

    ``levels`` and ``hierarchy`` both start out empty and must be supplied
    by the caller; ``check`` rejects the configuration otherwise.
    """

    def __init__(self):
        super().__init__()
        # One entry per heading level; HierarchicalMerger iterates each entry
        # as a collection of regular expressions.
        self.levels = []
        # Depth value that HierarchicalMerger compares against tree depth.
        self.hierarchy = None

    def check(self):
        """Validate that both settings have been provided."""
        required = (
            (self.levels, "Hierarchical setups."),
            (self.hierarchy, "Hierarchy number."),
        )
        for value, label in required:
            self.check_empty(value, label)

    def get_input_form(self) -> dict[str, dict]:
        """Return the input form description; this component exposes none."""
        return dict()
|
||||
|
||||
|
||||
class HierarchicalMerger(ProcessBase):
    """Merge upstream lines into chunks that follow the heading hierarchy.

    Every input line is classified against ``self._param.levels`` (one list
    of regular expressions per heading level). Headings are arranged in a
    tree, plain lines are attached to the most recent heading, and one chunk
    is emitted per branch of that tree cut at depth ``self._param.hierarchy``.
    """

    component_name = "HierarchicalMerger"

    @staticmethod
    def _match_levels(lines, levels):
        """Return the heading level of every line.

        A line gets the index of the first level that has a pattern matching
        it (``re.search``), or ``len(levels)`` when no pattern matches, which
        marks it as plain body text.
        """
        compiled = [[re.compile(reg) for reg in regs] for regs in levels]
        body_level = len(compiled)
        matches = []
        for txt in lines:
            found = body_level
            for lvl, regs in enumerate(compiled):
                if any(reg.search(txt) for reg in regs):
                    found = lvl
                    break
            matches.append(found)
        return matches

    @staticmethod
    def _build_tree(matches, body_level):
        """Arrange line indices into a heading tree.

        Each node is ``{"level", "index", "texts", "children"}`` where
        ``index`` is the line index of the heading and ``texts`` the indices
        of the body lines directly below it. The synthetic root has level and
        index ``-1``; its ``texts`` hold body lines seen before any heading.
        """
        root = {"level": -1, "index": -1, "texts": [], "children": []}
        for i, m in enumerate(matches):
            node = root
            if m != 0 and m == body_level:
                # Body text belongs to the most recently added heading, i.e.
                # the last leaf reached by always following the last child.
                while node["children"]:
                    node = node["children"][-1]
                node["texts"].append(i)
                continue
            # Heading: walk down the most recent branch until reaching its
            # direct parent level or a node without children.
            # NOTE(review): a heading whose parent level is missing is nested
            # under the previous childless heading even if that heading has
            # the same level; kept as-is, confirm whether this is intended.
            while node["children"] and m != node["level"] + 1:
                node = node["children"][-1]
            node["children"].append({"level": m, "index": i, "texts": [], "children": []})
        return root

    @staticmethod
    def _collect_paths(root, hierarchy):
        """Return one list of line indices per chunk.

        The root's children are at depth 1. A branch is followed down to
        depth ``hierarchy``; the node reached there contributes its whole
        subtree to the chunk, prefixed by the lines of all its ancestors.
        Branches that end before reaching that depth form a chunk too, and
        body lines that precede the first heading become the first chunk.
        Every chunk owns its own list, so chunks never share state.
        """
        paths = []

        def flatten(node, out):
            # Pre-order walk of the descendants; iterative so that very deep
            # trees cannot exhaust the recursion limit.
            stack = list(reversed(node["children"]))
            while stack:
                cur = stack.pop()
                out.append(cur["index"])
                out.extend(cur["texts"])
                stack.extend(reversed(cur["children"]))

        def walk(node, path, depth):
            if depth >= hierarchy or not node["children"]:
                flatten(node, path)
                if path:
                    paths.append(path)
                return
            for child in node["children"]:
                # ``path + [...]`` builds a fresh list for every child.
                walk(child, path + [child["index"], *child["texts"]], depth + 1)

        walk(root, [], 0)
        if root["texts"]:
            paths.insert(0, list(root["texts"]))
        return paths

    async def _invoke(self, **kwargs):
        """Validate the upstream output, merge it and publish ``chunks``.

        For markdown/text/html input the ``chunks`` output is a list of
        ``{"text"}`` dicts. Otherwise the JSON sections are merged into
        ``{"text", "image", "positions"}`` dicts and each one is handed to
        ``image2id`` before being published. On invalid input only the
        ``_ERROR`` output is set.
        """
        try:
            from_upstream = HierarchicalMergerFromUpstream.model_validate(kwargs)
        except Exception as e:
            self.set_output("_ERROR", f"Input error: {str(e)}")
            return

        self.callback(random.randint(1, 5) / 100.0, "Start to merge hierarchically.")
        is_plain = from_upstream.output_format in ["markdown", "text", "html"]
        section_images = []
        if is_plain:
            if from_upstream.output_format == "markdown":
                payload = from_upstream.markdown_result
            elif from_upstream.output_format == "text":
                payload = from_upstream.text_result
            else:  # == "html"
                payload = from_upstream.html_result

            if not payload:
                payload = ""
            if isinstance(payload, (list, tuple)):
                # The schema declares the html result as a list of strings.
                payload = "\n".join(payload)

            lines = [ln for ln in payload.split("\n") if ln]
        else:
            records = from_upstream.json_result or []
            lines = [o.get("text") or "" for o in records]
            section_images = [o.get("img_id") for o in records]

        levels = self._param.levels
        matches = self._match_levels(lines, levels)
        root = self._build_tree(matches, len(levels))
        all_pathes = self._collect_paths(root, int(self._param.hierarchy))

        if is_plain:
            cks = ["".join(lines[i] + "\n" for i in path) for path in all_pathes]
            self.set_output("chunks", [{"text": c} for c in cks if c])
        else:
            load_image = partial(STORAGE_IMPL.get)
            cks = []
            for path in all_pathes:
                txt = "".join(lines[i] + "\n" for i in path)
                img = None
                for i in path:
                    if not section_images[i]:
                        continue
                    piece = id2image(section_images[i], load_image)
                    if piece is None:
                        continue
                    img = piece if img is None else concat_img(img, piece)
                # NOTE(review): the upstream "position_tag" is not appended to
                # the text, so positions come only from tags already embedded
                # in the section text. Confirm this is intended.
                cks.append(
                    {
                        "text": RAGFlowPdfParser.remove_tag(txt),
                        "image": img,
                        "positions": RAGFlowPdfParser.extract_positions(txt),
                    }
                )

            async with trio.open_nursery() as nursery:
                for d in cks:
                    nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), "_image_temps", get_uuid())
            self.set_output("chunks", cks)

        self.callback(1, "Done.")
|
||||
37
rag/flow/hierarchical_merger/schema.py
Normal file
37
rag/flow/hierarchical_merger/schema.py
Normal file
@ -0,0 +1,37 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
|
||||
class HierarchicalMergerFromUpstream(BaseModel):
    """Validated view of the upstream output consumed by the hierarchical merger.

    Fields can be populated either by attribute name or by alias, and
    unknown keys are rejected.
    """

    model_config = ConfigDict(populate_by_name=True, extra="forbid")

    # Timing values, supplied under their underscore-prefixed aliases.
    created_time: float | None = Field(None, alias="_created_time")
    elapsed_time: float | None = Field(None, alias="_elapsed_time")

    # The only required field.
    name: str
    file: dict | None = None
    chunks: list[dict[str, Any]] | None = None

    # Declared format plus one optional payload slot per format, each
    # supplied under the alias equal to the format name.
    output_format: Literal["json", "markdown", "text", "html"] | None = None
    json_result: list[dict[str, Any]] | None = Field(None, alias="json")
    markdown_result: str | None = Field(None, alias="markdown")
    text_result: str | None = Field(None, alias="text")
    html_result: list[str] | None = Field(None, alias="html")
|
||||
|
||||
# def to_dict(self, *, exclude_none: bool = True) -> dict:
|
||||
# return self.model_dump(by_alias=True, exclude_none=exclude_none)
|
||||
@ -12,18 +12,27 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import io
|
||||
import logging
|
||||
import random
|
||||
from functools import partial
|
||||
|
||||
import trio
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from api.db import LLMType
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.utils import get_uuid
|
||||
from api.utils.base64_image import image2id
|
||||
from deepdoc.parser import ExcelParser
|
||||
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.flow.parser.schema import ParserFromUpstream
|
||||
from rag.llm.cv_model import Base as VLM
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
|
||||
|
||||
class ParserParam(ProcessParamBase):
|
||||
@ -43,17 +52,24 @@ class ParserParam(ProcessParamBase):
|
||||
"json",
|
||||
],
|
||||
"ppt": [],
|
||||
"image": [],
|
||||
"image": [
|
||||
"text"
|
||||
],
|
||||
"email": [],
|
||||
"text": [],
|
||||
"audio": [],
|
||||
"text": [
|
||||
"text",
|
||||
"json"
|
||||
],
|
||||
"audio": [
|
||||
"json"
|
||||
],
|
||||
"video": [],
|
||||
}
|
||||
|
||||
self.setups = {
|
||||
"pdf": {
|
||||
"parse_method": "deepdoc", # deepdoc/plain_text/vlm
|
||||
"vlm_name": "",
|
||||
"llm_id": "",
|
||||
"lang": "Chinese",
|
||||
"suffix": [
|
||||
"pdf",
|
||||
@ -76,16 +92,46 @@ class ParserParam(ProcessParamBase):
|
||||
"output_format": "json",
|
||||
},
|
||||
"markdown": {
|
||||
"suffix": ["md", "markdown"],
|
||||
"suffix": ["md", "markdown", "mdx"],
|
||||
"output_format": "json",
|
||||
},
|
||||
"ppt": {},
|
||||
"image": {
|
||||
"parse_method": "ocr",
|
||||
"llm_id": "",
|
||||
"lang": "Chinese",
|
||||
"suffix": ["jpg", "jpeg", "png", "gif"],
|
||||
"output_format": "json",
|
||||
},
|
||||
"email": {
|
||||
"fields": []
|
||||
},
|
||||
"text": {
|
||||
"suffix": [
|
||||
"txt"
|
||||
],
|
||||
"output_format": "json",
|
||||
},
|
||||
"audio": {
|
||||
"suffix":[
|
||||
"da",
|
||||
"wave",
|
||||
"wav",
|
||||
"mp3",
|
||||
"aac",
|
||||
"flac",
|
||||
"ogg",
|
||||
"aiff",
|
||||
"au",
|
||||
"midi",
|
||||
"wma",
|
||||
"realaudio",
|
||||
"vqf",
|
||||
"oggvorbis",
|
||||
"ape"
|
||||
],
|
||||
"output_format": "json",
|
||||
},
|
||||
"email": {},
|
||||
"text": {},
|
||||
"audio": {},
|
||||
"video": {},
|
||||
}
|
||||
|
||||
@ -96,7 +142,7 @@ class ParserParam(ProcessParamBase):
|
||||
self.check_valid_value(pdf_parse_method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"])
|
||||
|
||||
if pdf_parse_method not in ["deepdoc", "plain_text"]:
|
||||
self.check_empty(pdf_config.get("vlm_name"), "VLM")
|
||||
self.check_empty(pdf_config.get("llm_id"), "VLM")
|
||||
|
||||
pdf_language = pdf_config.get("lang", "")
|
||||
self.check_empty(pdf_language, "Language")
|
||||
@ -117,7 +163,23 @@ class ParserParam(ProcessParamBase):
|
||||
image_config = self.setups.get("image", "")
|
||||
if image_config:
|
||||
image_parse_method = image_config.get("parse_method", "")
|
||||
self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr"])
|
||||
self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr", "vlm"])
|
||||
if image_parse_method not in ["ocr"]:
|
||||
self.check_empty(image_config.get("llm_id"), "VLM")
|
||||
|
||||
image_language = image_config.get("lang", "")
|
||||
self.check_empty(image_language, "Language")
|
||||
|
||||
text_config = self.setups.get("text", "")
|
||||
if text_config:
|
||||
text_output_format = text_config.get("output_format", "")
|
||||
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])
|
||||
|
||||
audio_config = self.setups.get("audio", "")
|
||||
if audio_config:
|
||||
self.check_empty(audio_config.get("llm_id"), "VLM")
|
||||
audio_language = audio_config.get("lang", "")
|
||||
self.check_empty(audio_language, "Language")
|
||||
|
||||
def get_input_form(self) -> dict[str, dict]:
|
||||
return {}
|
||||
@ -126,10 +188,8 @@ class ParserParam(ProcessParamBase):
|
||||
class Parser(ProcessBase):
|
||||
component_name = "Parser"
|
||||
|
||||
def _pdf(self, from_upstream: ParserFromUpstream):
|
||||
def _pdf(self, name, blob):
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
|
||||
|
||||
blob = from_upstream.blob
|
||||
conf = self._param.setups["pdf"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
@ -139,8 +199,8 @@ class Parser(ProcessBase):
|
||||
lines, _ = PlainParser()(blob)
|
||||
bboxes = [{"text": t} for t, _ in lines]
|
||||
else:
|
||||
assert conf.get("vlm_name")
|
||||
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("vlm_name"), lang=self._param.setups["pdf"].get("lang"))
|
||||
assert conf.get("llm_id")
|
||||
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("llm_id"), lang=self._param.setups["pdf"].get("lang"))
|
||||
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
|
||||
bboxes = []
|
||||
for t, poss in lines:
|
||||
@ -149,6 +209,7 @@ class Parser(ProcessBase):
|
||||
|
||||
if conf.get("output_format") == "json":
|
||||
self.set_output("json", bboxes)
|
||||
|
||||
if conf.get("output_format") == "markdown":
|
||||
mkdn = ""
|
||||
for b in bboxes:
|
||||
@ -160,14 +221,10 @@ class Parser(ProcessBase):
|
||||
mkdn += b.get("text", "") + "\n"
|
||||
self.set_output("markdown", mkdn)
|
||||
|
||||
def _spreadsheet(self, from_upstream: ParserFromUpstream):
|
||||
def _spreadsheet(self, name, blob):
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
|
||||
|
||||
blob = from_upstream.blob
|
||||
conf = self._param.setups["spreadsheet"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
print("spreadsheet {conf=}", flush=True)
|
||||
spreadsheet_parser = ExcelParser()
|
||||
if conf.get("output_format") == "html":
|
||||
html = spreadsheet_parser.html(blob, 1000000000)
|
||||
@ -177,19 +234,13 @@ class Parser(ProcessBase):
|
||||
elif conf.get("output_format") == "markdown":
|
||||
self.set_output("markdown", spreadsheet_parser.markdown(blob))
|
||||
|
||||
def _word(self, from_upstream: ParserFromUpstream):
|
||||
def _word(self, name, blob):
|
||||
from tika import parser as word_parser
|
||||
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
|
||||
|
||||
blob = from_upstream.blob
|
||||
name = from_upstream.name
|
||||
conf = self._param.setups["word"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
print("word {conf=}", flush=True)
|
||||
doc_parsed = word_parser.from_buffer(blob)
|
||||
|
||||
sections = []
|
||||
if doc_parsed.get("content"):
|
||||
sections = doc_parsed["content"].split("\n")
|
||||
@ -202,26 +253,18 @@ class Parser(ProcessBase):
|
||||
if conf.get("output_format") == "json":
|
||||
self.set_output("json", sections)
|
||||
|
||||
def _markdown(self, from_upstream: ParserFromUpstream):
|
||||
def _markdown(self, name, blob):
|
||||
from functools import reduce
|
||||
|
||||
from rag.app.naive import Markdown as naive_markdown_parser
|
||||
from rag.nlp import concat_img
|
||||
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
|
||||
|
||||
blob = from_upstream.blob
|
||||
name = from_upstream.name
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
|
||||
conf = self._param.setups["markdown"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
print("markdown {conf=}", flush=True)
|
||||
|
||||
markdown_parser = naive_markdown_parser()
|
||||
sections, tables = markdown_parser(name, blob, separate_tables=False)
|
||||
|
||||
# json
|
||||
assert conf.get("output_format") == "json", "have to be json for doc"
|
||||
if conf.get("output_format") == "json":
|
||||
json_results = []
|
||||
|
||||
@ -239,14 +282,86 @@ class Parser(ProcessBase):
|
||||
json_results.append(json_result)
|
||||
|
||||
self.set_output("json", json_results)
|
||||
else:
|
||||
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
|
||||
|
||||
def _text(self, name, blob):
    """Decode a plain-text file and publish it as ``json`` or ``text`` output.

    Args:
        name: File name, forwarded to ``get_text`` together with the bytes.
        blob: Raw file content.
    """
    from deepdoc.parser.utils import get_text

    self.callback(random.randint(1, 5) / 100.0, "Start to work on a text.")
    output_format = self._param.setups["text"]["output_format"]
    self.set_output("output_format", output_format)

    # parse binary to text
    content = get_text(name, binary=blob)

    if output_format != "json":
        self.set_output("text", content)
        return
    # JSON output wraps the whole document in a single record.
    self.set_output("json", [{"text": content}])
|
||||
|
||||
def _image(self, name, blob):
    """Turn an image into text, either by OCR or by a vision model.

    The signature matches the other parsers (``_pdf``, ``_text``, ...) so
    that ``_invoke`` can dispatch to it with ``(name, blob)``.

    Args:
        name: File name; not used here, accepted for a uniform signature.
        blob: Raw image bytes.
    """
    from deepdoc.vision import OCR

    self.callback(random.randint(1, 5) / 100.0, "Start to work on an image.")

    conf = self._param.setups["image"]
    self.set_output("output_format", conf["output_format"])

    img = Image.open(io.BytesIO(blob)).convert("RGB")

    if conf["parse_method"] == "ocr":
        # use ocr, recognize chars only
        ocr = OCR()
        bxs = ocr(np.array(img))  # return boxes and recognize result
        txt = "\n".join([t[0] for _, t in bxs if t[0]])
    else:
        # use VLM to describe the picture; only this branch needs a language
        cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"], lang=conf["lang"])
        img_binary = io.BytesIO()
        img.save(img_binary, format="JPEG")
        txt = cv_model.describe(img_binary.getvalue())

    # NOTE(review): the result is always published under "text", whatever
    # "output_format" says. Confirm downstream components expect that.
    self.set_output("text", txt)
|
||||
|
||||
def _audio(self, name, blob):
    """Transcribe an audio file with the tenant's speech-to-text model.

    The signature matches the other parsers (``_pdf``, ``_text``, ...) so
    that ``_invoke`` can dispatch to it with ``(name, blob)``.

    Args:
        name: File name; only its extension is used, as the suffix of the
            temporary file handed to the model.
        blob: Raw audio bytes.
    """
    import os
    import tempfile

    self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")

    conf = self._param.setups["audio"]
    self.set_output("output_format", conf["output_format"])

    lang = conf["lang"]
    _, ext = os.path.splitext(name)
    with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
        tmpf.write(blob)
        tmpf.flush()
        tmp_path = os.path.abspath(tmpf.name)

        # Transcribe while the context is open: the temporary file is
        # removed as soon as the ``with`` block exits.
        # NOTE(review): the configured "llm_id" is validated in check() but
        # not passed here. Confirm whether it should select the model.
        seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, lang=lang)
        txt = seq2txt_mdl.transcription(tmp_path)

    self.set_output("text", txt)
|
||||
|
||||
async def _invoke(self, **kwargs):
|
||||
function_map = {
|
||||
"pdf": self._pdf,
|
||||
"markdown": self._markdown,
|
||||
"spreadsheet": self._spreadsheet,
|
||||
"word": self._word
|
||||
"word": self._word,
|
||||
"text": self._text,
|
||||
"image": self._image,
|
||||
"audio": self._audio,
|
||||
}
|
||||
try:
|
||||
from_upstream = ParserFromUpstream.model_validate(kwargs)
|
||||
@ -254,8 +369,20 @@ class Parser(ProcessBase):
|
||||
self.set_output("_ERROR", f"Input error: {str(e)}")
|
||||
return
|
||||
|
||||
name = from_upstream.name
|
||||
if self._canvas._doc_id:
|
||||
b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
|
||||
blob = STORAGE_IMPL.get(b, n)
|
||||
else:
|
||||
blob = FileService.get_blob(from_upstream.file["created_by"], from_upstream.file["id"])
|
||||
|
||||
for p_type, conf in self._param.setups.items():
|
||||
if from_upstream.name.split(".")[-1].lower() not in conf.get("suffix", []):
|
||||
continue
|
||||
await trio.to_thread.run_sync(function_map[p_type], from_upstream)
|
||||
await trio.to_thread.run_sync(function_map[p_type], name, blob)
|
||||
break
|
||||
|
||||
outs = self.output()
|
||||
async with trio.open_nursery() as nursery:
|
||||
for d in outs.get("json", []):
|
||||
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), "_image_temps", get_uuid())
|
||||
|
||||
@ -20,6 +20,5 @@ class ParserFromUpstream(BaseModel):
|
||||
elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
|
||||
|
||||
name: str
|
||||
blob: bytes
|
||||
|
||||
file: dict | None = Field(default=None)
|
||||
model_config = ConfigDict(populate_by_name=True, extra="forbid")
|
||||
|
||||
@ -48,7 +48,24 @@ class Pipeline(Graph):
|
||||
obj.append({"component_name": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]})
|
||||
else:
|
||||
obj = [{"component_name": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]}]
|
||||
REDIS_CONN.set_obj(log_key, obj, 60 * 10)
|
||||
REDIS_CONN.set_obj(log_key, obj, 60 * 30)
|
||||
if self._doc_id:
|
||||
percentage = 1./len(self.components.items())
|
||||
msg = ""
|
||||
finished = 0.
|
||||
for o in obj:
|
||||
if o['component_name'] == "END":
|
||||
continue
|
||||
msg += f"\n[{o['component_name']}]:\n"
|
||||
for t in o["trace"]:
|
||||
msg += "%s: %s\n"%(t["datetime"], t["message"])
|
||||
if t["progress"] < 0:
|
||||
finished = -1
|
||||
break
|
||||
if finished < 0:
|
||||
break
|
||||
finished += o["trace"][-1]["progress"] * percentage
|
||||
DocumentService.update_by_id(self._doc_id, {"progress": finished, "progress_msg": msg})
|
||||
except Exception as e:
|
||||
logging.exception(e)
|
||||
|
||||
@ -108,5 +125,11 @@ class Pipeline(Graph):
|
||||
idx += 1
|
||||
self.path.extend(cpn_obj.get_downstream())
|
||||
|
||||
self.callback("END", 1, json.dumps(self.get_component_obj(self.path[-1]).output(), ensure_ascii=False))
|
||||
|
||||
if self._doc_id:
|
||||
DocumentService.update_by_id(self._doc_id, {"progress": 1 if not self.error else -1, "progress_msg": "Pipeline finished...\n" + self.error, "process_duration": time.perf_counter() - st})
|
||||
DocumentService.update_by_id(self._doc_id,{
|
||||
"progress": 1 if not self.error else -1,
|
||||
"progress_msg": "Pipeline finished...\n" + self.error,
|
||||
"process_duration": time.perf_counter() - st
|
||||
})
|
||||
|
||||
15
rag/flow/splitter/__init__.py
Normal file
15
rag/flow/splitter/__init__.py
Normal file
@ -0,0 +1,15 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
38
rag/flow/splitter/schema.py
Normal file
38
rag/flow/splitter/schema.py
Normal file
@ -0,0 +1,38 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
|
||||
class SplitterFromUpstream(BaseModel):
    """Validated view of the upstream outputs handed to the Splitter.

    Several fields are populated through aliases (``_created_time``,
    ``_elapsed_time``, ``json``, ``markdown``, ``text``, ``html``) so the
    upstream output keys can be passed straight into ``model_validate``.
    ``populate_by_name`` additionally accepts the Python field names, and
    ``extra="forbid"`` rejects any key that is not declared here.
    """

    model_config = ConfigDict(populate_by_name=True, extra="forbid")

    # Timing values forwarded from the upstream component.
    created_time: float | None = Field(None, alias="_created_time")
    elapsed_time: float | None = Field(None, alias="_elapsed_time")

    # Name of the item being processed, plus optional file metadata.
    name: str
    file: dict | None = None
    chunks: list[dict[str, Any]] | None = None

    # Tells the consumer which of the ``*_result`` fields carries the payload.
    output_format: Literal["json", "markdown", "text", "html"] | None = None

    # One optional payload field per supported output format.
    json_result: list[dict[str, Any]] | None = Field(None, alias="json")
    markdown_result: str | None = Field(None, alias="markdown")
    text_result: str | None = Field(None, alias="text")
    html_result: list[str] | None = Field(None, alias="html")
|
||||
112
rag/flow/splitter/splitter.py
Normal file
112
rag/flow/splitter/splitter.py
Normal file
@ -0,0 +1,112 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import json
|
||||
import random
|
||||
from functools import partial
|
||||
|
||||
import trio
|
||||
|
||||
from api.utils import get_uuid
|
||||
from api.utils.base64_image import id2image, image2id
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.flow.splitter.schema import SplitterFromUpstream
|
||||
from rag.nlp import naive_merge, naive_merge_with_images
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
|
||||
|
||||
class SplitterParam(ProcessParamBase):
    """Parameters of the Splitter component.

    Attributes:
        chunk_token_size: chunk size, in tokens, that the Splitter passes to
            the merge helpers (default 512).
        delimiters: list of delimiter strings (default ``["\\n"]``).
        overlapped_percent: overlap ratio passed to the merge helpers
            (default 0).
    """

    def __init__(self):
        super().__init__()
        self.chunk_token_size = 512
        self.delimiters = ["\n"]
        self.overlapped_percent = 0

    def check(self):
        """Validate the parameters using the base-class checkers, in order."""
        validations = (
            (self.check_empty, self.delimiters, "Delimiters."),
            (self.check_positive_integer, self.chunk_token_size, "Chunk token size."),
            (self.check_decimal_float, self.overlapped_percent, "Overlapped percentage: [0, 1)"),
        )
        for validate, value, label in validations:
            validate(value, label)

    def get_input_form(self) -> dict[str, dict]:
        """Return the input form description; this component declares none."""
        return {}
|
||||
|
||||
|
||||
class Splitter(ProcessBase):
    """Pipeline component that splits parsed content into chunks.

    Upstream output is validated against ``SplitterFromUpstream``. Depending
    on ``output_format`` the payload is either plain content (markdown, text,
    html), merged with ``naive_merge``, or a list of JSON sections, merged
    with ``naive_merge_with_images``. The result is published as the
    ``chunks`` output.
    """

    component_name = "Splitter"

    async def _invoke(self, **kwargs):
        """Split the upstream payload and set the ``chunks`` output.

        Args:
            **kwargs: upstream outputs; they must validate against
                ``SplitterFromUpstream``.

        On a validation failure the ``_ERROR`` output is set and the method
        returns without producing chunks.
        """
        try:
            from_upstream = SplitterFromUpstream.model_validate(kwargs)
        except Exception as e:
            self.set_output("_ERROR", f"Input error: {str(e)}")
            return

        # Multi-character delimiters are wrapped in backticks; single
        # characters are appended as-is.
        # NOTE(review): presumably this is the delimiter syntax the merge
        # helpers expect -- confirm against rag.nlp.
        deli = "".join(f"`{d}`" if len(d) > 1 else d for d in self._param.delimiters)

        self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.")

        plain_payloads = {
            "markdown": from_upstream.markdown_result,
            "text": from_upstream.text_result,
            "html": from_upstream.html_result,
        }
        if from_upstream.output_format in plain_payloads:
            # A missing or empty payload is normalised to an empty string.
            payload = plain_payloads[from_upstream.output_format] or ""
            cks = naive_merge(
                payload,
                self._param.chunk_token_size,
                deli,
                self._param.overlapped_percent,
            )
            self.set_output("chunks", [{"text": c} for c in cks])

            self.callback(1, "Done.")
            return

        # JSON output (also reached when ``output_format`` is None): each
        # section carries text, an optional position tag and an optional
        # image id.
        sections, section_images = [], []
        for o in from_upstream.json_result or []:
            sections.append((o.get("text", ""), o.get("position_tag", "")))
            # NOTE(review): id2image is called synchronously here; if it hits
            # storage it may block the event loop -- confirm.
            section_images.append(id2image(o.get("img_id"), partial(STORAGE_IMPL.get)))

        chunks, images = naive_merge_with_images(
            sections,
            section_images,
            self._param.chunk_token_size,
            deli,
            self._param.overlapped_percent,
        )
        cks = [
            {
                "text": RAGFlowPdfParser.remove_tag(c),
                "image": img,
                "positions": RAGFlowPdfParser.extract_positions(c),
            }
            for c, img in zip(chunks, images)
        ]
        # Run image2id for all chunks concurrently; the nursery waits for
        # every task before continuing.
        async with trio.open_nursery() as nursery:
            for d in cks:
                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), "_image_temps", get_uuid())

        self.set_output("chunks", cks)
        self.callback(1, "Done.")
|
||||
@ -44,20 +44,58 @@
|
||||
"markdown"
|
||||
],
|
||||
"output_format": "json"
|
||||
},
|
||||
"text": {
|
||||
"suffix": ["txt"],
|
||||
"output_format": "json"
|
||||
},
|
||||
"image": {
|
||||
"parse_method": "vlm",
|
||||
"llm_id":"glm-4.5v",
|
||||
"lang": "Chinese",
|
||||
"suffix": [
|
||||
"jpg",
|
||||
"jpeg",
|
||||
"png",
|
||||
"gif"
|
||||
],
|
||||
"output_format": "text"
|
||||
},
|
||||
"audio": {
|
||||
"suffix": [
|
||||
"da",
|
||||
"wave",
|
||||
"wav",
|
||||
"mp3",
|
||||
"aac",
|
||||
"flac",
|
||||
"ogg",
|
||||
"aiff",
|
||||
"au",
|
||||
"midi",
|
||||
"wma",
|
||||
"realaudio",
|
||||
"vqf",
|
||||
"oggvorbis",
|
||||
"ape"
|
||||
],
|
||||
"lang": "Chinese",
|
||||
"llm_id": "SenseVoiceSmall",
|
||||
"output_format": "json"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"downstream": ["Chunker:0"],
|
||||
"downstream": ["Splitter:0"],
|
||||
"upstream": ["Begin"]
|
||||
},
|
||||
"Chunker:0": {
|
||||
"Splitter:0": {
|
||||
"obj": {
|
||||
"component_name": "Chunker",
|
||||
"component_name": "Splitter",
|
||||
"params": {
|
||||
"method": "general",
|
||||
"auto_keywords": 5
|
||||
"chunk_token_size": 512,
|
||||
"delimiters": ["\n"],
|
||||
"overlapped_percent": 0
|
||||
}
|
||||
},
|
||||
"downstream": ["Tokenizer:0"],
|
||||
|
||||
84
rag/flow/tests/dsl_examples/hierarchical_merger.json
Normal file
84
rag/flow/tests/dsl_examples/hierarchical_merger.json
Normal file
@ -0,0 +1,84 @@
|
||||
{
|
||||
"components": {
|
||||
"File": {
|
||||
"obj":{
|
||||
"component_name": "File",
|
||||
"params": {
|
||||
}
|
||||
},
|
||||
"downstream": ["Parser:0"],
|
||||
"upstream": []
|
||||
},
|
||||
"Parser:0": {
|
||||
"obj": {
|
||||
"component_name": "Parser",
|
||||
"params": {
|
||||
"setups": {
|
||||
"pdf": {
|
||||
"parse_method": "deepdoc",
|
||||
"vlm_name": "",
|
||||
"lang": "Chinese",
|
||||
"suffix": [
|
||||
"pdf"
|
||||
],
|
||||
"output_format": "json"
|
||||
},
|
||||
"spreadsheet": {
|
||||
"suffix": [
|
||||
"xls",
|
||||
"xlsx",
|
||||
"csv"
|
||||
],
|
||||
"output_format": "html"
|
||||
},
|
||||
"word": {
|
||||
"suffix": [
|
||||
"doc",
|
||||
"docx"
|
||||
],
|
||||
"output_format": "json"
|
||||
},
|
||||
"markdown": {
|
||||
"suffix": [
|
||||
"md",
|
||||
"markdown"
|
||||
],
|
||||
"output_format": "text"
|
||||
},
|
||||
"text": {
|
||||
"suffix": ["txt"],
|
||||
"output_format": "json"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"downstream": ["Splitter:0"],
|
||||
"upstream": ["File"]
|
||||
},
|
||||
"Splitter:0": {
|
||||
"obj": {
|
||||
"component_name": "Splitter",
|
||||
"params": {
|
||||
"chunk_token_size": 512,
|
||||
"delimiters": ["\r\n"],
|
||||
"overlapped_percent": 0
|
||||
}
|
||||
},
|
||||
"downstream": ["HierarchicalMerger:0"],
|
||||
"upstream": ["Parser:0"]
|
||||
},
|
||||
"HierarchicalMerger:0": {
|
||||
"obj": {
|
||||
"component_name": "HierarchicalMerger",
|
||||
"params": {
|
||||
"levels": [["^#[^#]"], ["^##[^#]"], ["^###[^#]"], ["^####[^#]"]],
|
||||
"hierarchy": 2
|
||||
}
|
||||
},
|
||||
"downstream": [],
|
||||
"upstream": ["Splitter:0"]
|
||||
}
|
||||
},
|
||||
"path": []
|
||||
}
|
||||
|
||||
@ -22,7 +22,7 @@ class TokenizerFromUpstream(BaseModel):
|
||||
elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
|
||||
|
||||
name: str = ""
|
||||
blob: bytes
|
||||
file: dict | None = Field(default=None)
|
||||
|
||||
output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)
|
||||
|
||||
|
||||
@ -37,6 +37,7 @@ class TokenizerParam(ProcessParamBase):
|
||||
super().__init__()
|
||||
self.search_method = ["full_text", "embedding"]
|
||||
self.filename_embd_weight = 0.1
|
||||
self.fields = ["text"]
|
||||
|
||||
def check(self):
|
||||
for v in self.search_method:
|
||||
@ -61,10 +62,14 @@ class Tokenizer(ProcessBase):
|
||||
embedding_model = LLMBundle(self._canvas._tenant_id, LLMType.EMBEDDING, llm_name=embedding_id)
|
||||
texts = []
|
||||
for c in chunks:
|
||||
if c.get("questions"):
|
||||
texts.append("\n".join(c["questions"]))
|
||||
else:
|
||||
texts.append(re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c["text"]))
|
||||
txt = ""
|
||||
for f in self._param.fields:
|
||||
f = c.get(f)
|
||||
if isinstance(f, str):
|
||||
txt += f
|
||||
elif isinstance(f, list):
|
||||
txt += "\n".join(f)
|
||||
texts.append(re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt))
|
||||
vts, c = embedding_model.encode([name])
|
||||
token_count += c
|
||||
tts = np.concatenate([vts[0] for _ in range(len(texts))], axis=0)
|
||||
|
||||
Reference in New Issue
Block a user