Feat: refine dataflow and initialize dataflow app (#9952)

### What problem does this PR solve?

Refine dataflow and initialize dataflow app.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-09-05 18:50:46 +08:00
committed by GitHub
parent 9aa8cfb73a
commit 45f52e85d7
21 changed files with 959 additions and 256 deletions

View File

@ -14,36 +14,45 @@
# limitations under the License.
#
import os
import importlib
import inspect
import pkgutil
from pathlib import Path
from types import ModuleType
from typing import Dict, Type
_package_path = os.path.dirname(__file__)
__all_classes: Dict[str, Type] = {}
def _import_submodules() -> None:
for filename in os.listdir(_package_path): # noqa: F821
if filename.startswith("__") or not filename.endswith(".py") or filename.startswith("base"):
continue
module_name = filename[:-3]
_pkg_dir = Path(__file__).resolve().parent
_pkg_name = __name__
def _should_skip_module(mod_name: str) -> bool:
leaf = mod_name.rsplit(".", 1)[-1]
return leaf in {"__init__"} or leaf.startswith("__") or leaf.startswith("_") or leaf.startswith("base")
def _import_submodules() -> None:
for modinfo in pkgutil.walk_packages([str(_pkg_dir)], prefix=_pkg_name + "."): # noqa: F821
mod_name = modinfo.name
if _should_skip_module(mod_name): # noqa: F821
continue
try:
module = importlib.import_module(f".{module_name}", package=__name__)
module = importlib.import_module(mod_name)
_extract_classes_from_module(module) # noqa: F821
except ImportError as e:
print(f"Warning: Failed to import module {module_name}: {str(e)}")
print(f"Warning: Failed to import module {mod_name}: {e}")
def _extract_classes_from_module(module: ModuleType) -> None:
for name, obj in inspect.getmembers(module):
if (inspect.isclass(obj) and
obj.__module__ == module.__name__ and not name.startswith("_")):
if inspect.isclass(obj) and obj.__module__ == module.__name__ and not name.startswith("_"):
__all_classes[name] = obj
globals()[name] = obj
_import_submodules()
__all__ = list(__all_classes.keys()) + ["__all_classes"]
del _package_path, _import_submodules, _extract_classes_from_module
del _pkg_dir, _pkg_name, _import_submodules, _extract_classes_from_module

View File

@ -1,5 +1,5 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -13,13 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import os
import logging
import os
import time
from functools import partial
from typing import Any
import trio
from agent.component.base import ComponentParamBase, ComponentBase
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.api_utils import timeout
@ -31,14 +33,16 @@ class ProcessParamBase(ComponentParamBase):
class ProcessBase(ComponentBase):
def __init__(self, pipeline, id, param: ProcessParamBase):
super().__init__(pipeline, id, param)
self.callback = partial(self._canvas.callback, self.component_name)
if hasattr(self._canvas, "callback"):
self.callback = partial(self._canvas.callback, self.component_name)
else:
self.callback = partial(lambda *args, **kwargs: None, self.component_name)
async def invoke(self, **kwargs) -> dict[str, Any]:
self.set_output("_created_time", time.perf_counter())
for k,v in kwargs.items():
for k, v in kwargs.items():
self.set_output(k, v)
try:
with trio.fail_after(self._param.timeout):
@ -54,6 +58,6 @@ class ProcessBase(ComponentBase):
self.set_output("_elapsed_time", time.perf_counter() - self.output("_created_time"))
return self.output()
@timeout(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60))
@timeout(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10 * 60))
async def _invoke(self, **kwargs):
raise NotImplementedError()

View File

@ -0,0 +1,15 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -1,5 +1,5 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -13,12 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import trio
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
from graphrag.utils import get_llm_cache, chat_limiter, set_llm_cache
from graphrag.utils import chat_limiter, get_llm_cache, set_llm_cache
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.chunker.schema import ChunkerFromUpstream
from rag.nlp import naive_merge, naive_merge_with_images
from rag.prompts.prompts import keyword_extraction, question_proposal
@ -26,7 +29,23 @@ from rag.prompts.prompts import keyword_extraction, question_proposal
class ChunkerParam(ProcessParamBase):
def __init__(self):
super().__init__()
self.method_options = ["general", "q&a", "resume", "manual", "table", "paper", "book", "laws", "presentation", "one"]
self.method_options = [
# General
"general",
"onetable",
# Customer Service
"q&a",
"manual",
# Recruitment
"resume",
# Education & Research
"book",
"paper",
"laws",
"presentation",
# Other
# "Tag" # TODO: Other method
]
self.method = "general"
self.chunk_token_size = 512
self.delimiter = "\n"
@ -35,10 +54,7 @@ class ChunkerParam(ProcessParamBase):
self.auto_keywords = 0
self.auto_questions = 0
self.tag_sets = []
self.llm_setting = {
"llm_name": "",
"lang": "Chinese"
}
self.llm_setting = {"llm_name": "", "lang": "Chinese"}
def check(self):
self.check_valid_value(self.method.lower(), "Chunk method abnormal.", self.method_options)
@ -48,53 +64,79 @@ class ChunkerParam(ProcessParamBase):
self.check_nonnegative_number(self.auto_questions, "Auto-question value: (0, 10]")
self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)")
def get_input_form(self) -> dict[str, dict]:
return {}
class Chunker(ProcessBase):
component_name = "Chunker"
def _general(self, **kwargs):
self.callback(random.randint(1,5)/100., "Start to chunk via `General`.")
if kwargs.get("output_format") in ["markdown", "text"]:
cks = naive_merge(kwargs.get(kwargs["output_format"]), self._param.chunk_token_size, self._param.delimiter, self._param.overlapped_percent)
def _general(self, from_upstream: ChunkerFromUpstream):
self.callback(random.randint(1, 5) / 100.0, "Start to chunk via `General`.")
if from_upstream.output_format in ["markdown", "text"]:
if from_upstream.output_format == "markdown":
payload = from_upstream.markdown_result
else: # == "text"
payload = from_upstream.text_result
if not payload:
payload = ""
cks = naive_merge(
payload,
self._param.chunk_token_size,
self._param.delimiter,
self._param.overlapped_percent,
)
return [{"text": c} for c in cks]
sections, section_images = [], []
for o in kwargs["json"]:
sections.append((o["text"], o.get("position_tag","")))
for o in from_upstream.json_result or []:
sections.append((o.get("text", ""), o.get("position_tag", "")))
section_images.append(o.get("image"))
chunks, images = naive_merge_with_images(sections, section_images,self._param.chunk_token_size, self._param.delimiter, self._param.overlapped_percent)
return [{
"text": RAGFlowPdfParser.remove_tag(c),
"image": img,
"positions": RAGFlowPdfParser.extract_positions(c)
} for c,img in zip(chunks,images)]
chunks, images = naive_merge_with_images(
sections,
section_images,
self._param.chunk_token_size,
self._param.delimiter,
self._param.overlapped_percent,
)
def _q_and_a(self, **kwargs):
return [
{
"text": RAGFlowPdfParser.remove_tag(c),
"image": img,
"positions": RAGFlowPdfParser.extract_positions(c),
}
for c, img in zip(chunks, images)
]
def _q_and_a(self, from_upstream: ChunkerFromUpstream):
pass
def _resume(self, **kwargs):
def _resume(self, from_upstream: ChunkerFromUpstream):
pass
def _manual(self, **kwargs):
def _manual(self, from_upstream: ChunkerFromUpstream):
pass
def _table(self, **kwargs):
def _table(self, from_upstream: ChunkerFromUpstream):
pass
def _paper(self, **kwargs):
def _paper(self, from_upstream: ChunkerFromUpstream):
pass
def _book(self, **kwargs):
def _book(self, from_upstream: ChunkerFromUpstream):
pass
def _laws(self, **kwargs):
def _laws(self, from_upstream: ChunkerFromUpstream):
pass
def _presentation(self, **kwargs):
def _presentation(self, from_upstream: ChunkerFromUpstream):
pass
def _one(self, **kwargs):
def _one(self, from_upstream: ChunkerFromUpstream):
pass
async def _invoke(self, **kwargs):
@ -110,7 +152,14 @@ class Chunker(ProcessBase):
"presentation": self._presentation,
"one": self._one,
}
chunks = function_map[self._param.method](**kwargs)
try:
from_upstream = ChunkerFromUpstream.model_validate(kwargs)
except Exception as e:
self.set_output("_ERROR", f"Input error: {str(e)}")
return
chunks = function_map[self._param.method](from_upstream)
llm_setting = self._param.llm_setting
async def auto_keywords():

View File

@ -0,0 +1,37 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field
class ChunkerFromUpstream(BaseModel):
    # Validated view of the keyword arguments handed to the Chunker by the
    # upstream component. Unknown keys are rejected (extra="forbid"), and every
    # aliased field may be supplied either under its alias or its field name
    # (populate_by_name=True).
    model_config = ConfigDict(populate_by_name=True, extra="forbid")

    # Timing bookkeeping; arrives under the underscore-prefixed aliases.
    created_time: float | None = Field(default=None, alias="_created_time")
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")

    # Required: file name and raw file content.
    name: str
    blob: bytes

    # Which of the payload fields below the upstream component filled in.
    output_format: Literal["json", "markdown", "text", "html"] | None = None

    # One optional payload per output format, keyed by the format name as alias.
    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
    markdown_result: str | None = Field(default=None, alias="markdown")
    text_result: str | None = Field(default=None, alias="text")
    html_result: str | None = Field(default=None, alias="html")

    # NOTE: a `to_dict(exclude_none=True)` helper wrapping
    # `model_dump(by_alias=True, exclude_none=...)` was sketched here but is
    # not enabled.

View File

@ -1,5 +1,5 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -27,6 +27,9 @@ class FileParam(ProcessParamBase):
def check(self):
pass
def get_input_form(self) -> dict[str, dict]:
return {}
class File(ProcessBase):
component_name = "File"

View File

@ -1,107 +0,0 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import trio
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.parser.pdf_parser import RAGFlowPdfParser, PlainParser, VisionParser
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.llm.cv_model import Base as VLM
from deepdoc.parser import ExcelParser
class ParserParam(ProcessParamBase):
def __init__(self):
super().__init__()
self.setups = {
"pdf": {
"parse_method": "deepdoc", # deepdoc/plain_text/vlm
"vlm_name": "",
"lang": "Chinese",
"suffix": ["pdf"],
"output_format": "json"
},
"excel": {
"output_format": "html"
},
"ppt": {},
"image": {
"parse_method": "ocr"
},
"email": {},
"text": {},
"audio": {},
"video": {},
}
def check(self):
if self.setups["pdf"].get("parse_method") not in ["deepdoc", "plain_text"]:
assert self.setups["pdf"].get("vlm_name"), "No VLM specified."
assert self.setups["pdf"].get("lang"), "No language specified."
class Parser(ProcessBase):
component_name = "Parser"
def _pdf(self, blob):
self.callback(random.randint(1,5)/100., "Start to work on a PDF.")
conf = self._param.setups["pdf"]
self.set_output("output_format", conf["output_format"])
if conf.get("parse_method") == "deepdoc":
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
elif conf.get("parse_method") == "plain_text":
lines,_ = PlainParser()(blob)
bboxes = [{"text": t} for t,_ in lines]
else:
assert conf.get("vlm_name")
vision_model = LLMBundle(self._canvas.tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("vlm_name"), lang=self.setups["pdf"].get("lang"))
lines, _ = VisionParser(vision_model=vision_model)(bin, callback=self.callback)
bboxes = []
for t, poss in lines:
pn, x0, x1, top, bott = poss.split(" ")
bboxes.append({"page_number": int(pn), "x0": int(x0), "x1": int(x1), "top": int(top), "bottom": int(bott), "text": t})
self.set_output("json", bboxes)
mkdn = ""
for b in bboxes:
if b.get("layout_type", "") == "title":
mkdn += "\n## "
if b.get("layout_type", "") == "figure":
mkdn += "\n![Image]({})".format(VLM.image2base64(b["image"]))
continue
mkdn += b.get("text", "") + "\n"
self.set_output("markdown", mkdn)
def _excel(self, blob):
self.callback(random.randint(1,5)/100., "Start to work on a Excel.")
conf = self._param.setups["excel"]
excel_parser = ExcelParser()
if conf.get("output_format") == "html":
html = excel_parser.html(blob,1000000000)
self.set_output("html", html)
elif conf.get("output_format") == "json":
self.set_output("json", [{"text": txt} for txt in excel_parser(blob) if txt])
elif conf.get("output_format") == "markdown":
self.set_output("markdown", excel_parser.markdown(blob))
async def _invoke(self, **kwargs):
function_map = {
"pdf": self._pdf,
}
for p_type, conf in self._param.setups.items():
if kwargs.get("name", "").split(".")[-1].lower() not in conf.get("suffix", []):
continue
await trio.to_thread.run_sync(function_map[p_type], kwargs["blob"])
break

View File

@ -0,0 +1,14 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

154
rag/flow/parser/parser.py Normal file
View File

@ -0,0 +1,154 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import trio
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.parser import ExcelParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream
from rag.llm.cv_model import Base as VLM
class ParserParam(ProcessParamBase):
    """Configuration for the Parser component.

    `setups` holds one settings dict per file family; `allowed_output_format`
    lists the output formats that `check` accepts for each family.
    """

    def __init__(self):
        super().__init__()

        # Only pdf and excel declare output formats; the remaining families
        # start with an empty list each.
        self.allowed_output_format = {
            "pdf": ["json", "markdown"],
            "excel": ["json", "markdown", "html"],
        }
        for family in ("ppt", "image", "email", "text", "audio", "video"):
            self.allowed_output_format[family] = []

        pdf_defaults = {
            "parse_method": "deepdoc",  # deepdoc/plain_text/vlm
            "vlm_name": "",
            "lang": "Chinese",
            "suffix": ["pdf"],
            "output_format": "json",
        }
        excel_defaults = {
            "output_format": "html",
            "suffix": ["xls", "xlsx", "csv"],
        }
        image_defaults = {
            "parse_method": "ocr",
        }
        self.setups = {
            "pdf": pdf_defaults,
            "excel": excel_defaults,
            "ppt": {},
            "image": image_defaults,
            "email": {},
            "text": {},
            "audio": {},
            "video": {},
        }

    def check(self):
        """Validate the pdf, excel and image setups; empty setups are skipped."""
        pdf_setup = self.setups.get("pdf", {})
        if pdf_setup:
            method = pdf_setup.get("parse_method", "")
            self.check_valid_value(method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"])

            # Anything other than the two built-in methods needs a VLM name.
            needs_vlm = method not in ["deepdoc", "plain_text"]
            if needs_vlm:
                self.check_empty(pdf_setup.get("vlm_name"), "VLM")

            self.check_empty(pdf_setup.get("lang", ""), "Language")
            self.check_valid_value(
                pdf_setup.get("output_format", ""),
                "PDF output format abnormal.",
                self.allowed_output_format["pdf"],
            )

        excel_setup = self.setups.get("excel", "")
        if excel_setup:
            self.check_valid_value(
                excel_setup.get("output_format", ""),
                "Excel output format abnormal.",
                self.allowed_output_format["excel"],
            )

        image_setup = self.setups.get("image", "")
        if image_setup:
            method = image_setup.get("parse_method", "")
            self.check_valid_value(method.lower(), "Parse method abnormal.", ["ocr"])

    def get_input_form(self) -> dict[str, dict]:
        """Return the input form description; this component declares none."""
        return {}
class Parser(ProcessBase):
    """Pipeline component that parses an uploaded file blob.

    The file family is picked by matching the file-name suffix against the
    `suffix` list of each entry in `self._param.setups`; the matching handler
    publishes `output_format` plus one payload output (`json`, `markdown` or
    `html`) via `set_output`.
    """

    component_name = "Parser"

    def _pdf(self, blob):
        """Parse a PDF blob and publish it in the configured output format.

        Args:
            blob: Raw PDF content.

        The parse method comes from `setups["pdf"]["parse_method"]`:
        "deepdoc" and "plain_text" use the corresponding deepdoc parsers; any
        other value is treated as the VLM path and requires `vlm_name`.
        """
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
        conf = self._param.setups["pdf"]
        self.set_output("output_format", conf["output_format"])

        parse_method = conf.get("parse_method")
        if parse_method == "deepdoc":
            bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
        elif parse_method == "plain_text":
            lines, _ = PlainParser()(blob)
            bboxes = [{"text": t} for t, _ in lines]
        else:
            assert conf.get("vlm_name")
            vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("vlm_name"), lang=conf.get("lang"))
            lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
            bboxes = []
            for t, poss in lines:
                # Each position string is five space-separated numbers:
                # page number, x0, x1, top, bottom.
                pn, x0, x1, top, bott = poss.split(" ")
                bboxes.append({"page_number": int(pn), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})

        output_format = conf.get("output_format")
        if output_format == "json":
            self.set_output("json", bboxes)
        if output_format == "markdown":
            # Collect fragments and join once instead of growing a string
            # with `+=` for every box.
            fragments = []
            for box in bboxes:
                layout_type = box.get("layout_type", "")
                if layout_type == "title":
                    fragments.append("\n## ")
                if layout_type == "figure":
                    # Figures are emitted as an inline image and contribute no text.
                    fragments.append("\n![Image]({})".format(VLM.image2base64(box["image"])))
                    continue
                fragments.append(box.get("text", "") + "\n")
            self.set_output("markdown", "".join(fragments))

    def _excel(self, blob):
        """Parse a spreadsheet blob and publish it as html, json or markdown.

        Args:
            blob: Raw spreadsheet content.
        """
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Excel.")
        conf = self._param.setups["excel"]
        self.set_output("output_format", conf["output_format"])
        excel_parser = ExcelParser()

        output_format = conf.get("output_format")
        if output_format == "html":
            html = excel_parser.html(blob, 1000000000)
            self.set_output("html", html)
        elif output_format == "json":
            # Empty entries produced by the parser are dropped.
            self.set_output("json", [{"text": txt} for txt in excel_parser(blob) if txt])
        elif output_format == "markdown":
            self.set_output("markdown", excel_parser.markdown(blob))

    async def _invoke(self, **kwargs):
        """Validate the upstream payload and run the first matching parser.

        Invalid input, or a matching file family without an implemented
        handler, is reported through the `_ERROR` output instead of raising.
        When no setup lists the file's suffix, nothing is parsed and no
        error is set.
        """
        function_map = {
            "pdf": self._pdf,
            "excel": self._excel,
        }
        try:
            from_upstream = ParserFromUpstream.model_validate(kwargs)
        except Exception as e:
            self.set_output("_ERROR", f"Input error: {str(e)}")
            return

        suffix = from_upstream.name.split(".")[-1].lower()
        for p_type, conf in self._param.setups.items():
            if suffix not in conf.get("suffix", []):
                continue
            handler = function_map.get(p_type)
            if handler is None:
                # A setup may declare suffixes for a family that has no
                # handler yet; report it rather than failing with KeyError.
                self.set_output("_ERROR", f"No parser is implemented for file type `{p_type}`.")
                return
            # Parsing is blocking work, so it runs in a worker thread.
            await trio.to_thread.run_sync(handler, from_upstream.blob)
            break

25
rag/flow/parser/schema.py Normal file
View File

@ -0,0 +1,25 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pydantic import BaseModel, ConfigDict, Field
class ParserFromUpstream(BaseModel):
    # Validated view of the keyword arguments handed to the Parser by the
    # upstream component. Unknown keys are rejected (extra="forbid"); aliased
    # fields may also be supplied under their field names
    # (populate_by_name=True).
    model_config = ConfigDict(populate_by_name=True, extra="forbid")

    # Timing bookkeeping; arrives under the underscore-prefixed aliases.
    created_time: float | None = Field(default=None, alias="_created_time")
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")

    # Required: file name (its suffix selects the parser) and raw content.
    name: str
    blob: bytes

View File

@ -1,5 +1,5 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -18,14 +18,15 @@ import json
import logging
import random
import time
import trio
from agent.canvas import Graph
from api.db.services.document_service import DocumentService
from rag.utils.redis_conn import REDIS_CONN
class Pipeline(Graph):
def __init__(self, dsl: str, tenant_id=None, doc_id=None, task_id=None, flow_id=None):
super().__init__(dsl, tenant_id, task_id)
self._doc_id = doc_id
@ -35,7 +36,7 @@ class Pipeline(Graph):
self._kb_id = DocumentService.get_knowledgebase_id(doc_id)
assert self._kb_id, f"Can't find KB of this document: {doc_id}"
def callback(self, component_name: str, progress: float|int|None=None, message: str = "") -> None:
def callback(self, component_name: str, progress: float | int | None = None, message: str = "") -> None:
log_key = f"{self._flow_id}-{self.task_id}-logs"
try:
bin = REDIS_CONN.get(log_key)
@ -44,16 +45,10 @@ class Pipeline(Graph):
if obj[-1]["component_name"] == component_name:
obj[-1]["trace"].append({"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")})
else:
obj.append({
"component_name": component_name,
"trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]
})
obj.append({"component_name": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]})
else:
obj = [{
"component_name": component_name,
"trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]
}]
REDIS_CONN.set_obj(log_key, obj, 60*10)
obj = [{"component_name": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]}]
REDIS_CONN.set_obj(log_key, obj, 60 * 10)
except Exception as e:
logging.exception(e)
@ -71,21 +66,19 @@ class Pipeline(Graph):
super().reset()
log_key = f"{self._flow_id}-{self.task_id}-logs"
try:
REDIS_CONN.set_obj(log_key, [], 60*10)
REDIS_CONN.set_obj(log_key, [], 60 * 10)
except Exception as e:
logging.exception(e)
async def run(self, **kwargs):
st = time.perf_counter()
if not self.path:
self.path.append("begin")
self.path.append("File")
if self._doc_id:
DocumentService.update_by_id(self._doc_id, {
"progress": random.randint(0,5)/100.,
"progress_msg": "Start the pipeline...",
"process_begin_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
})
DocumentService.update_by_id(
self._doc_id, {"progress": random.randint(0, 5) / 100.0, "progress_msg": "Start the pipeline...", "process_begin_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
)
self.error = ""
idx = len(self.path) - 1
@ -99,23 +92,21 @@ class Pipeline(Graph):
self.path.extend(cpn_obj.get_downstream())
while idx < len(self.path) and not self.error:
last_cpn = self.get_component_obj(self.path[idx-1])
last_cpn = self.get_component_obj(self.path[idx - 1])
cpn_obj = self.get_component_obj(self.path[idx])
async def invoke():
nonlocal last_cpn, cpn_obj
await cpn_obj.invoke(**last_cpn.output())
async with trio.open_nursery() as nursery:
nursery.start_soon(invoke)
if cpn_obj.error():
self.error = "[ERROR]" + cpn_obj.error()
self.callback(cpn_obj.component_name, -1, self.error)
break
idx += 1
self.path.extend(cpn_obj.get_downstream())
if self._doc_id:
DocumentService.update_by_id(self._doc_id, {
"progress": 1 if not self.error else -1,
"progress_msg": "Pipeline finished...\n" + self.error,
"process_duration": time.perf_counter() - st
})
DocumentService.update_by_id(self._doc_id, {"progress": 1 if not self.error else -1, "progress_msg": "Pipeline finished...\n" + self.error, "process_duration": time.perf_counter() - st})

View File

@ -1,5 +1,5 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -18,12 +18,14 @@ import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
import trio
from api import settings
from rag.flow.pipeline import Pipeline
def print_logs(pipeline):
def print_logs(pipeline: Pipeline):
last_logs = "[]"
while True:
time.sleep(5)
@ -34,16 +36,16 @@ def print_logs(pipeline):
last_logs = logs_str
if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
dsl_default_path = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
"dsl_examples",
"general_pdf_all.json",
)
parser.add_argument('-s', '--dsl', default=dsl_default_path, help="input dsl", action='store', required=True)
parser.add_argument('-d', '--doc_id', default=False, help="Document ID", action='store', required=True)
parser.add_argument('-t', '--tenant_id', default=False, help="Tenant ID", action='store', required=True)
parser.add_argument("-s", "--dsl", default=dsl_default_path, help="input dsl", action="store", required=False)
parser.add_argument("-d", "--doc_id", default=False, help="Document ID", action="store", required=True)
parser.add_argument("-t", "--tenant_id", default=False, help="Tenant ID", action="store", required=True)
args = parser.parse_args()
settings.init_settings()
@ -53,5 +55,7 @@ if __name__ == '__main__':
exe = ThreadPoolExecutor(max_workers=5)
thr = exe.submit(print_logs, pipeline)
# queue_dataflow(dsl=open(args.dsl, "r").read(), tenant_id=args.tenant_id, doc_id=args.doc_id, task_id="xxxx", flow_id="xxx", priority=0)
trio.run(pipeline.run)
thr.result()
thr.result()

View File

@ -1,15 +1,15 @@
{
"components": {
"begin": {
"File": {
"obj":{
"component_name": "File",
"params": {
}
},
"downstream": ["parser:0"],
"downstream": ["Parser:0"],
"upstream": []
},
"parser:0": {
"Parser:0": {
"obj": {
"component_name": "Parser",
"params": {
@ -22,14 +22,22 @@
"pdf"
],
"output_format": "json"
},
"excel": {
"output_format": "html",
"suffix": [
"xls",
"xlsx",
"csv"
]
}
}
}
},
"downstream": ["chunker:0"],
"upstream": ["begin"]
"downstream": ["Chunker:0"],
"upstream": ["Begin"]
},
"chunker:0": {
"Chunker:0": {
"obj": {
"component_name": "Chunker",
"params": {
@ -37,18 +45,19 @@
"auto_keywords": 5
}
},
"downstream": ["tokenizer:0"],
"upstream": ["chunker:0"]
"downstream": ["Tokenizer:0"],
"upstream": ["Parser:0"]
},
"tokenizer:0": {
"Tokenizer:0": {
"obj": {
"component_name": "Tokenizer",
"params": {
}
},
"downstream": [],
"upstream": ["chunker:0"]
"upstream": ["Chunker:0"]
}
},
"path": []
}
}

View File

@ -0,0 +1,14 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,51 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field, model_validator
class TokenizerFromUpstream(BaseModel):
    # Validated view of the keyword arguments handed to the Tokenizer by the
    # upstream component. Unknown keys are rejected (extra="forbid"); aliased
    # fields may also be supplied under their field names
    # (populate_by_name=True).
    model_config = ConfigDict(populate_by_name=True, extra="forbid")

    # Timing bookkeeping; arrives under the underscore-prefixed aliases.
    created_time: float | None = Field(default=None, alias="_created_time")
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")

    name: str = ""
    blob: bytes

    output_format: Literal["json", "markdown", "text", "html"] | None = None

    # Either ready-made chunks, or one raw payload matching `output_format`.
    chunks: list[dict[str, Any]] | None = None

    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
    markdown_result: str | None = Field(default=None, alias="markdown")
    text_result: str | None = Field(default=None, alias="text")
    html_result: str | None = Field(default=None, alias="html")

    @model_validator(mode="after")
    def _check_payloads(self) -> "TokenizerFromUpstream":
        # Non-empty chunks make every other payload optional.
        if self.chunks:
            return self

        fmt = self.output_format
        if fmt == "markdown":
            if not self.markdown_result:
                raise ValueError("output_format=markdown requires a markdown payload (field: 'markdown' or 'markdown_result').")
        elif fmt == "text":
            if not self.text_result:
                raise ValueError("output_format=text requires a text payload (field: 'text' or 'text_result').")
        elif not self.json_result:
            # Every remaining format (including None) falls back to the JSON list.
            raise ValueError("When no chunks are provided and output_format is not markdown/text, a JSON list payload is required (field: 'json' or 'json_result').")
        return self

View File

@ -1,5 +1,5 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import random
import re
@ -24,6 +25,7 @@ from api.db.services.llm_service import LLMBundle
from api.db.services.user_service import TenantService
from api.utils.api_utils import timeout
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.tokenizer.schema import TokenizerFromUpstream
from rag.nlp import rag_tokenizer
from rag.settings import EMBEDDING_BATCH_SIZE
from rag.svr.task_executor import embed_limiter
@ -40,6 +42,9 @@ class TokenizerParam(ProcessParamBase):
for v in self.search_method:
self.check_valid_value(v.lower(), "Chunk method abnormal.", ["full_text", "embedding"])
def get_input_form(self) -> dict[str, dict]:
    """Return the input-form description for this parameter set.

    No form fields are declared here, so the result is always a new,
    empty mapping.
    """
    form: dict[str, dict] = {}
    return form
class Tokenizer(ProcessBase):
component_name = "Tokenizer"
@ -67,19 +72,19 @@ class Tokenizer(ProcessBase):
@timeout(60)
def batch_encode(txts):
nonlocal embedding_model
return embedding_model.encode([truncate(c, embedding_model.max_length-10) for c in txts])
return embedding_model.encode([truncate(c, embedding_model.max_length - 10) for c in txts])
cnts_ = np.array([])
for i in range(0, len(texts), EMBEDDING_BATCH_SIZE):
async with embed_limiter:
vts, c = await trio.to_thread.run_sync(lambda: batch_encode(texts[i: i + EMBEDDING_BATCH_SIZE]))
vts, c = await trio.to_thread.run_sync(lambda: batch_encode(texts[i : i + EMBEDDING_BATCH_SIZE]))
if len(cnts_) == 0:
cnts_ = vts
else:
cnts_ = np.concatenate((cnts_, vts), axis=0)
token_count += c
if i % 33 == 32:
self.callback(i*1./len(texts)/parts/EMBEDDING_BATCH_SIZE + 0.5*(parts-1))
self.callback(i * 1.0 / len(texts) / parts / EMBEDDING_BATCH_SIZE + 0.5 * (parts - 1))
cnts = cnts_
title_w = float(self._param.filename_embd_weight)
@ -92,11 +97,17 @@ class Tokenizer(ProcessBase):
return chunks, token_count
async def _invoke(self, **kwargs):
try:
from_upstream = TokenizerFromUpstream.model_validate(kwargs)
except Exception as e:
self.set_output("_ERROR", f"Input error: {str(e)}")
return
parts = sum(["full_text" in self._param.search_method, "embedding" in self._param.search_method])
if "full_text" in self._param.search_method:
self.callback(random.randint(1,5)/100., "Start to tokenize.")
if kwargs.get("chunks"):
chunks = kwargs["chunks"]
self.callback(random.randint(1, 5) / 100.0, "Start to tokenize.")
if from_upstream.chunks:
chunks = from_upstream.chunks
for i, ck in enumerate(chunks):
if ck.get("questions"):
ck["question_tks"] = rag_tokenizer.tokenize("\n".join(ck["questions"]))
@ -105,30 +116,40 @@ class Tokenizer(ProcessBase):
ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
if i % 100 == 99:
self.callback(i*1./len(chunks)/parts)
elif kwargs.get("output_format") in ["markdown", "text"]:
ck = {
"text": kwargs.get(kwargs["output_format"], "")
}
if "full_text" in self._param.search_method:
self.callback(i * 1.0 / len(chunks) / parts)
elif from_upstream.output_format in ["markdown", "text"]:
if from_upstream.output_format == "markdown":
payload = from_upstream.markdown_result
else: # == "text"
payload = from_upstream.text_result
if not payload:
return ""
ck = {"text": payload}
if "full_text" in self._param.search_method:
ck["content_ltks"] = rag_tokenizer.tokenize(kwargs.get(kwargs["output_format"], ""))
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
chunks = [ck]
else:
chunks = kwargs["json"]
chunks = from_upstream.json_result
for i, ck in enumerate(chunks):
ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
if i % 100 == 99:
self.callback(i*1./len(chunks)/parts)
self.callback(i * 1.0 / len(chunks) / parts)
self.callback(1./parts, "Finish tokenizing.")
self.callback(1.0 / parts, "Finish tokenizing.")
if "embedding" in self._param.search_method:
self.callback(random.randint(1,5)/100. + 0.5*(parts-1), "Start embedding inference.")
chunks, token_count = await self._embedding(kwargs.get("name", ""), chunks)
self.callback(random.randint(1, 5) / 100.0 + 0.5 * (parts - 1), "Start embedding inference.")
if from_upstream.name.strip() == "":
logging.warning("Tokenizer: empty name provided from upstream, embedding may be not accurate.")
chunks, token_count = await self._embedding(from_upstream.name, chunks)
self.set_output("embedding_token_consumption", token_count)
self.callback(1., "Finish embedding.")
self.callback(1.0, "Finish embedding.")
self.set_output("chunks", chunks)

View File

@ -21,10 +21,12 @@ import sys
import threading
import time
from api.utils import get_uuid
from api.utils.api_utils import timeout
from api.utils.log_utils import init_root_logger, get_project_base_directory
from graphrag.general.index import run_graphrag
from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache
from rag.flow.pipeline import Pipeline
from rag.prompts import keyword_extraction, question_proposal, content_tagging
import logging
@ -223,7 +225,14 @@ async def collect():
logging.warning(f"collect task {msg['id']} {state}")
redis_msg.ack()
return None, None
task["task_type"] = msg.get("task_type", "")
task_type = msg.get("task_type", "")
task["task_type"] = task_type
if task_type == "dataflow":
task["tenant_id"]=msg.get("tenant_id", "")
task["dsl"] = msg.get("dsl", "")
task["dataflow_id"] = msg.get("dataflow_id", get_uuid())
task["kb_id"] = msg.get("kb_id", "")
return redis_msg, task
@ -473,6 +482,15 @@ async def embedding(docs, mdl, parser_config=None, callback=None):
return tk_count, vector_size
async def run_dataflow(dsl: str, tenant_id: str, doc_id: str, task_id: str, flow_id: str, callback=None):
    """Build a ``Pipeline`` from ``dsl`` and run it to completion.

    Args:
        dsl: Pipeline definition handed to ``Pipeline`` as-is.
        tenant_id: Passed through to ``Pipeline``.
        doc_id: Passed through to ``Pipeline``.
        task_id: Passed through to ``Pipeline``.
        flow_id: Passed through to ``Pipeline``.
        callback: Accepted for signature compatibility; currently unused.

    Returns:
        None. The result of ``Pipeline.run()`` is awaited but not returned.
    """
    # The callback is deliberately ignored here.
    _ = callback

    flow = Pipeline(
        dsl=dsl,
        tenant_id=tenant_id,
        doc_id=doc_id,
        task_id=task_id,
        flow_id=flow_id,
    )
    # Reset the pipeline before running it.
    flow.reset()
    await flow.run()
@timeout(3600)
async def run_raptor(row, chat_mdl, embd_mdl, vector_size, callback=None):
chunks = []
@ -558,15 +576,20 @@ async def do_handle_task(task):
init_kb(task, vector_size)
# Either using RAPTOR or Standard chunking methods
if task.get("task_type", "") == "raptor":
task_type = task.get("task_type", "")
if task_type == "dataflow":
task_dataflow_dsl = task["dsl"]
task_dataflow_id = task["dataflow_id"]
await run_dataflow(dsl=task_dataflow_dsl, tenant_id=task_tenant_id, doc_id=task_doc_id, task_id=task_id, flow_id=task_dataflow_id, callback=None)
return
elif task_type == "raptor":
# bind LLM for raptor
chat_model = LLMBundle(task_tenant_id, LLMType.CHAT, llm_name=task_llm_id, lang=task_language)
# run RAPTOR
async with kg_limiter:
chunks, token_count = await run_raptor(task, chat_model, embedding_model, vector_size, progress_callback)
# Either using graphrag or Standard chunking methods
elif task.get("task_type", "") == "graphrag":
elif task_type == "graphrag":
if not task_parser_config.get("graphrag", {}).get("use_graphrag", False):
progress_callback(prog=-1.0, msg="Internal configuration error.")
return