Feat: support dataflow run. (#10182)

### What problem does this PR solve?


### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Kevin Hu
2025-09-22 09:36:21 +08:00
committed by GitHub
parent 028c2d83e9
commit d050ef568d
7 changed files with 50 additions and 78 deletions

View File

@@ -35,9 +35,9 @@ class ProcessBase(ComponentBase):
def __init__(self, pipeline, id, param: ProcessParamBase):
super().__init__(pipeline, id, param)
if hasattr(self._canvas, "callback"):
self.callback = partial(self._canvas.callback, self.component_name)
self.callback = partial(self._canvas.callback, id)
else:
self.callback = partial(lambda *args, **kwargs: None, self.component_name)
self.callback = partial(lambda *args, **kwargs: None, id)
async def invoke(self, **kwargs) -> dict[str, Any]:
self.set_output("_created_time", time.perf_counter())

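The first hunk swaps the class-level `self.component_name` for the instance `id` when binding the progress callback, so two nodes of the same component class no longer report into one shared log bucket. A minimal sketch of the `functools.partial` pattern (illustrative names, not the actual RAGFlow classes):

```python
from functools import partial


class DemoCanvas:
    def callback(self, component_id: str, progress: float | None = None, message: str = "") -> None:
        # A real canvas would persist this trace somewhere; here it is just printed.
        print(f"[{component_id}] progress={progress} message={message}")


class DemoComponent:
    def __init__(self, canvas, component_id: str):
        if hasattr(canvas, "callback"):
            # Every call through self.callback is now tagged with this instance's
            # id instead of its class-level component name.
            self.callback = partial(canvas.callback, component_id)
        else:
            self.callback = partial(lambda *args, **kwargs: None, component_id)


DemoComponent(DemoCanvas(), "parser_0").callback(0.25, "started")
```
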
View File

@@ -76,7 +76,6 @@ class ParserParam(ProcessParamBase):
self.setups = {
"pdf": {
"parse_method": "deepdoc", # deepdoc/plain_text/vlm
"llm_id": "",
"lang": "Chinese",
"suffix": [
"pdf",
@@ -98,8 +97,8 @@
],
"output_format": "json",
},
"markdown": {
"suffix": ["md", "markdown", "mdx"],
"text&markdown": {
"suffix": ["md", "markdown", "mdx", "txt"],
"output_format": "json",
},
"slides": {
@@ -156,13 +155,10 @@
pdf_config = self.setups.get("pdf", {})
if pdf_config:
pdf_parse_method = pdf_config.get("parse_method", "")
self.check_valid_value(pdf_parse_method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"])
self.check_empty(pdf_parse_method, "Parse method abnormal.")
if pdf_parse_method not in ["deepdoc", "plain_text"]:
self.check_empty(pdf_config.get("llm_id"), "VLM")
pdf_language = pdf_config.get("lang", "")
self.check_empty(pdf_language, "Language")
if pdf_parse_method.lower() not in ["deepdoc", "plain_text"]:
self.check_empty(pdf_config.get("lang", ""), "Language")
pdf_output_format = pdf_config.get("output_format", "")
self.check_valid_value(pdf_output_format, "PDF output format abnormal.", self.allowed_output_format["pdf"])
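This hunk drops the separate `llm_id` field: `parse_method` itself now carries either "deepdoc", "plain_text", or a VLM model name, and the language check only applies in the VLM case. A hedged sketch of the resulting validation logic, using plain exceptions instead of the real `check_empty`/`check_valid_value` helpers:

```python
def check_pdf_setup(pdf_config: dict, allowed_output_formats: list[str]) -> None:
    parse_method = pdf_config.get("parse_method", "")
    if not parse_method:
        raise ValueError("Parse method abnormal.")
    # Anything other than "deepdoc"/"plain_text" is treated as a VLM model name,
    # and only then must the OCR language be provided.
    if parse_method.lower() not in ["deepdoc", "plain_text"] and not pdf_config.get("lang", ""):
        raise ValueError("Language")
    if pdf_config.get("output_format", "") not in allowed_output_formats:
        raise ValueError("PDF output format abnormal.")


# Allowed formats are passed in here for illustration; the real list lives in
# ParserParam.allowed_output_format.
check_pdf_setup({"parse_method": "deepdoc", "output_format": "json"}, ["json", "markdown"])
```
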
@@ -226,8 +222,7 @@ class Parser(ProcessBase):
lines, _ = PlainParser()(blob)
bboxes = [{"text": t} for t, _ in lines]
else:
assert conf.get("llm_id")
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("llm_id"), lang=self._param.setups["pdf"].get("lang"))
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
bboxes = []
for t, poss in lines:
@@ -236,6 +231,7 @@
if conf.get("output_format") == "json":
self.set_output("json", bboxes)
if conf.get("output_format") == "markdown":
mkdn = ""
for b in bboxes:
@@ -299,7 +295,6 @@
def _markdown(self, name, blob):
from functools import reduce
from rag.app.naive import Markdown as naive_markdown_parser
from rag.nlp import concat_img
@@ -330,22 +325,6 @@
else:
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
def _text(self, name, blob):
from deepdoc.parser.utils import get_text
self.callback(random.randint(1, 5) / 100.0, "Start to work on a text.")
conf = self._param.setups["text"]
self.set_output("output_format", conf["output_format"])
# parse binary to text
text_content = get_text(name, binary=blob)
if conf.get("output_format") == "json":
result = [{"text": text_content}]
self.set_output("json", result)
else:
result = text_content
self.set_output("text", result)
def _image(self, from_upstream: ParserFromUpstream):
from deepdoc.vision import OCR
@@ -367,7 +346,7 @@
else:
# use VLM to describe the picture
cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"], lang=lang)
cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"],lang=lang)
img_binary = io.BytesIO()
img.save(img_binary, format="JPEG")
img_binary.seek(0)
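For the VLM branch of `_image`, the picture is re-encoded to JPEG bytes in memory before being handed to the image-to-text model. A small standalone sketch of just that serialisation step, assuming Pillow is available; the `LLMBundle` call itself is omitted:

```python
import io

from PIL import Image


def image_to_jpeg_bytes(img: Image.Image) -> bytes:
    buf = io.BytesIO()
    # JPEG cannot carry an alpha channel, so normalise to RGB first (a defensive
    # step added here; the hunk above saves the image as-is).
    img.convert("RGB").save(buf, format="JPEG")
    buf.seek(0)
    return buf.read()


jpeg_bytes = image_to_jpeg_bytes(Image.new("RGB", (8, 8), "white"))
```
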
@@ -519,11 +498,10 @@
async def _invoke(self, **kwargs):
function_map = {
"pdf": self._pdf,
"markdown": self._markdown,
"text&markdown": self._markdown,
"spreadsheet": self._spreadsheet,
"slides": self._slides,
"word": self._word,
"text": self._text,
"image": self._image,
"audio": self._audio,
"email": self._email,

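`_invoke` routes each document category through a dispatch table; this change folds plain text into the markdown handler under the merged `"text&markdown"` key and removes the dedicated `_text` method. A simplified sketch of the dispatch pattern, with placeholder handler bodies rather than the real `Parser` methods:

```python
from typing import Any, Callable


def handle_pdf(name: str, blob: bytes) -> dict[str, Any]:
    return {"json": [{"text": f"parsed {name}"}]}


def handle_text_markdown(name: str, blob: bytes) -> dict[str, Any]:
    # .md/.markdown/.mdx and .txt now share one handler.
    return {"json": [{"text": blob.decode("utf-8", errors="ignore")}]}


FUNCTION_MAP: dict[str, Callable[[str, bytes], dict[str, Any]]] = {
    "pdf": handle_pdf,
    "text&markdown": handle_text_markdown,
}


def dispatch(category: str, name: str, blob: bytes) -> dict[str, Any]:
    try:
        return FUNCTION_MAP[category](name, blob)
    except KeyError:
        raise ValueError(f"Unsupported file category: {category}") from None


print(dispatch("text&markdown", "notes.txt", b"hello"))
```
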
View File

@@ -18,7 +18,7 @@ import json
import logging
import random
import time
from timeit import default_timer as timer
import trio
from agent.canvas import Graph
@@ -38,25 +38,26 @@ class Pipeline(Graph):
def callback(self, component_name: str, progress: float | int | None = None, message: str = "") -> None:
log_key = f"{self._flow_id}-{self.task_id}-logs"
timestamp = timer()
try:
bin = REDIS_CONN.get(log_key)
obj = json.loads(bin.encode("utf-8"))
if obj:
if obj[-1]["component_name"] == component_name:
obj[-1]["trace"].append({"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")})
if obj[-1]["component_id"] == component_name:
obj[-1]["trace"].append({"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": timestamp-obj[-1]["trace"][-1]["timestamp"]})
else:
obj.append({"component_name": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]})
obj.append({"component_id": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}]})
else:
obj = [{"component_name": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]}]
obj = [{"component_id": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}]}]
REDIS_CONN.set_obj(log_key, obj, 60 * 30)
if self._doc_id:
percentage = 1./len(self.components.items())
msg = ""
finished = 0.
for o in obj:
if o['component_name'] == "END":
if o['component_id'] == "END":
continue
msg += f"\n[{o['component_name']}]:\n"
msg += f"\n[{o['component_id']}]:\n"
for t in o["trace"]:
msg += "%s: %s\n"%(t["datetime"], t["message"])
if t["progress"] < 0:

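The pipeline log now keys entries by `component_id` and records a monotonic `timestamp` plus the `elapsed_time` since the previous trace event, using `timeit.default_timer`. A rough, Redis-free sketch of the record shape (a plain dict stands in for `REDIS_CONN`; field names are taken from the diff):

```python
import datetime
import json
from timeit import default_timer as timer


def append_trace(store: dict, log_key: str, component_id: str, progress: float, message: str) -> None:
    timestamp = timer()
    entry = {
        "progress": progress,
        "message": message,
        "datetime": datetime.datetime.now().strftime("%H:%M:%S"),
        "timestamp": timestamp,
        "elapsed_time": 0,
    }
    obj = json.loads(store.get(log_key, "[]"))
    if obj and obj[-1]["component_id"] == component_id:
        # Same component as the last record: extend its trace and measure the
        # time since its previous event.
        entry["elapsed_time"] = timestamp - obj[-1]["trace"][-1]["timestamp"]
        obj[-1]["trace"].append(entry)
    else:
        obj.append({"component_id": component_id, "trace": [entry]})
    store[log_key] = json.dumps(obj)


logs: dict = {}
append_trace(logs, "flow-task-logs", "parser_0", 0.1, "started")
append_trace(logs, "flow-task-logs", "parser_0", 0.5, "half way")
```
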
View File

@@ -30,7 +30,7 @@ def print_logs(pipeline: Pipeline):
while True:
time.sleep(5)
logs = pipeline.fetch_logs()
logs_str = json.dumps(logs)
logs_str = json.dumps(logs, ensure_ascii=False)
if logs_str != last_logs:
print(logs_str)
last_logs = logs_str
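
`ensure_ascii=False` keeps non-ASCII progress messages readable when the polled logs are printed; without it every CJK character is escaped to a `\uXXXX` sequence:

```python
import json

msg = {"message": "解析完成"}
print(json.dumps(msg))                      # {"message": "\u89e3\u6790\u5b8c\u6210"}
print(json.dumps(msg, ensure_ascii=False))  # {"message": "解析完成"}
```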