Feat: dataflow supports text (#10058)

### What problem does this PR solve?

The dataflow parser now supports plain-text (`txt`) files, with `text` or `json` as the output format.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-02-02 00:25:06 +08:00 · 2025-09-11 19:03:51 +08:00
parent aa30f20730
commit 65571e5254
2 changed files with 41 additions and 7 deletions
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -45,7 +45,10 @@ class ParserParam(ProcessParamBase):
            "ppt": [],
            "image": [],
            "email": [],
-            "text": [],
+            "text": [
+                "text",
+                "json"
+            ],
            "audio": [],
            "video": [],
        }
@ -84,7 +87,12 @@ class ParserParam(ProcessParamBase):
                "parse_method": "ocr",
            },
            "email": {},
-            "text": {},
+            "text": {
+                "suffix": [
+                    "txt"
+                ],
+                "output_format": "json",
+            },
            "audio": {},
            "video": {},
        }
@ -119,6 +127,11 @@ class ParserParam(ProcessParamBase):
            image_parse_method = image_config.get("parse_method", "")
            self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr"])

+        text_config = self.setups.get("text", "")
+        if text_config:
+            text_output_format = text_config.get("output_format", "")
+            self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])
+
    def get_input_form(self) -> dict[str, dict]:
        return {}

@ -208,15 +221,13 @@ class Parser(ProcessBase):
        from rag.app.naive import Markdown as naive_markdown_parser
        from rag.nlp import concat_img

-        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")

        blob = from_upstream.blob
        name = from_upstream.name
        conf = self._param.setups["markdown"]
        self.set_output("output_format", conf["output_format"])

-        print("markdown {conf=}", flush=True)
-
        markdown_parser = naive_markdown_parser()
        sections, tables = markdown_parser(name, blob, separate_tables=False)

@ -240,13 +251,33 @@ class Parser(ProcessBase):

            self.set_output("json", json_results)

+    def _text(self, from_upstream: ParserFromUpstream):
+        from deepdoc.parser.utils import get_text
+
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a text.")
+
+        blob = from_upstream.blob
+        name = from_upstream.name
+        conf = self._param.setups["text"]
+        self.set_output("output_format", conf["output_format"])
+
+        # parse binary to text
+        text_content = get_text(name, binary=blob)
+
+        if conf.get("output_format") == "json":
+            result = [{"text": text_content}]
+            self.set_output("json", result)
+        else:
+            result = text_content
+            self.set_output("text", result)

    async def _invoke(self, **kwargs):
        function_map = {
            "pdf": self._pdf,
            "markdown": self._markdown,
            "spreadsheet": self._spreadsheet,
-            "word": self._word
+            "word": self._word,
+            "text": self._text,
        }
        try:
            from_upstream = ParserFromUpstream.model_validate(kwargs)
--- a/rag/flow/tests/dsl_examples/general_pdf_all.json
+++ b/rag/flow/tests/dsl_examples/general_pdf_all.json
@ -44,9 +44,12 @@
                    "markdown"
                  ],
                  "output_format": "json"
+                },
+                "text": {
+                  "suffix": ["txt"],
+                  "output_format": "json"
                }
              }
-            }
          }
        },
        "downstream": ["Chunker:0"],