Feat: dataflow supports markdown (#10003)

### What problem does this PR solve?

Dataflow supports markdown.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2026-02-02 00:25:06 +08:00 · 2025-09-10 13:31:02 +08:00
parent 0d9c1f1c3c
commit 41cdba19ba
2 changed files with 53 additions and 1 deletions
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -75,6 +75,10 @@ class ParserParam(ProcessParamBase):
                ],
                "output_format": "json",
            },
            "markdown": {
                "suffix": ["md", "markdown"],
                "output_format": "json",
            },
            "ppt": {},
            "image": {
                "parse_method": "ocr",
@ -198,11 +202,51 @@ class Parser(ProcessBase):
        if conf.get("output_format") == "json":
            self.set_output("json", sections)
    def _markdown(self, from_upstream: ParserFromUpstream):
        """Parse an upstream Markdown document and publish it as JSON sections.

        The naive Markdown parser is run over the upstream blob with
        ``separate_tables=False``. Each parsed section becomes a dict with a
        ``"text"`` key; when ``get_pictures`` returns images for a section,
        they are combined with ``concat_img`` and stored under ``"image"``.

        Args:
            from_upstream: Upstream payload; only ``name`` and ``blob`` are
                read here.

        Outputs set on this component:
            ``output_format``: the format configured in the "markdown" setup.
            ``json``: the list of section dicts (only for the "json" format).

        Raises:
            AssertionError: if the configured output format is not "json".
        """
        import logging
        from functools import reduce

        from rag.app.naive import Markdown as naive_markdown_parser
        from rag.nlp import concat_img

        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Markdown document")

        blob = from_upstream.blob
        name = from_upstream.name
        conf = self._param.setups["markdown"]
        self.set_output("output_format", conf["output_format"])
        # Was a bare print() missing its f-prefix, so it emitted the literal
        # text "markdown {conf=}"; log the real value at debug level instead.
        logging.debug("markdown conf=%s", conf)

        markdown_parser = naive_markdown_parser()
        # The second return value (tables) is not used by this output path.
        sections, _tables = markdown_parser(name, blob, separate_tables=False)

        # json
        assert conf.get("output_format") == "json", "have to be json for markdown"
        # Asserts are stripped under ``python -O``; keep an explicit guard so a
        # non-json format never produces a "json" output in that mode.
        if conf.get("output_format") != "json":
            return

        json_results = []
        for section_text, _ in sections:
            json_result = {
                "text": section_text,
            }
            images = markdown_parser.get_pictures(section_text) if section_text else None
            if images:
                # reduce() returns a lone element unchanged, so one code path
                # covers both the single-image and multi-image cases.
                json_result["image"] = reduce(concat_img, images)
            json_results.append(json_result)
        self.set_output("json", json_results)
    async def _invoke(self, **kwargs):
        function_map = {
            "pdf": self._pdf,
            "markdown": self._markdown,
            "spreadsheet": self._spreadsheet,
-            "word": self._word,
+            "word": self._word
        }
        try:
            from_upstream = ParserFromUpstream.model_validate(kwargs)
--- a/rag/flow/tests/dsl_examples/general_pdf_all.json
+++ b/rag/flow/tests/dsl_examples/general_pdf_all.json
@ -37,6 +37,14 @@
                    "docx"
                  ],
                  "output_format": "json"
                },
                "markdown": {
                  "suffix": [
                    "md",
                    "markdown"
                  ],
                  "output_format": "json"
                }
              }
            }
          }