Feat/parse audio (#10133)

### What problem does this PR solve? Add audio support to the dataflow parser, and fix GiteeAI's sequence2text model. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality)
2026-02-02 00:25:06 +08:00 · 2025-09-18 09:31:32 +08:00
parent f3738b06f1
commit d353f7f7f8
4 changed files with 79 additions and 5 deletions
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@ -74,10 +74,10 @@ class RAGFlowPdfParser:
            recognizer_domain = "layout"
        if layout_recognizer_type == "ascend":
-            logging.debug("Using Ascend LayoutRecognizer", flush=True)
+            logging.debug("Using Ascend LayoutRecognizer")
            self.layouter = AscendLayoutRecognizer(recognizer_domain)
        else:  # onnx
-            logging.debug("Using Onnx LayoutRecognizer", flush=True)
+            logging.debug("Using Onnx LayoutRecognizer")
            self.layouter = LayoutRecognizer(recognizer_domain)
        self.tbl_det = TableStructureRecognizer()
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -54,7 +54,9 @@ class ParserParam(ProcessParamBase):
                "text",
                "json"
            ],
-            "audio": [],
+            "audio": [
                "json"
            ],
            "video": [],
        }
@ -102,7 +104,26 @@ class ParserParam(ProcessParamBase):
                ],
                "output_format": "json",
            },
-            "audio": {},
+            "audio": {
                "suffix":[
                    "da",
                    "wave",
                    "wav",
                    "mp3",
                    "aac",
                    "flac",
                    "ogg",
                    "aiff",
                    "au",
                    "midi",
                    "wma",
                    "realaudio",
                    "vqf",
                    "oggvorbis",
                    "ape"
                ],
                "output_format": "json",
            },
            "video": {},
        }
@ -146,6 +167,12 @@ class ParserParam(ProcessParamBase):
            text_output_format = text_config.get("output_format", "")
            self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])
        audio_config = self.setups.get("audio", "")
        if audio_config:
            self.check_empty(audio_config.get("llm_id"), "VLM")
            audio_language = audio_config.get("lang", "")
            self.check_empty(audio_language, "Language")
    def get_input_form(self) -> dict[str, dict]:
        """Return the input-form description; this implementation always returns an empty dict."""
        form: dict[str, dict] = {}
        return form
@ -313,6 +340,30 @@ class Parser(ProcessBase):
        self.set_output("text", txt)
    def _audio(self, from_upstream: ParserFromUpstream):
        """Transcribe an uploaded audio blob to text with a speech-to-text model.

        The blob is written to a temporary file that keeps the original file
        extension, and the file's absolute path is handed to the model's
        ``transcription`` method. The transcript is published on the ``text``
        output; the configured ``output_format`` is echoed on the
        ``output_format`` output.

        Args:
            from_upstream: Upstream payload; its ``blob`` and ``name``
                attributes are read.

        Raises:
            KeyError: If the ``audio`` setup lacks ``output_format``, ``lang``
                or ``llm_id``.
        """
        import os
        import tempfile

        self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")

        blob = from_upstream.blob
        name = from_upstream.name
        conf = self._param.setups["audio"]
        self.set_output("output_format", conf["output_format"])

        lang = conf["lang"]
        # The parameter check requires ``llm_id`` for audio, so honour it here
        # instead of silently falling back to whatever model the bundle would
        # pick on its own.
        # NOTE(review): assumes LLMBundle accepts the model via ``llm_name`` — confirm.
        llm_id = conf["llm_id"]

        _, ext = os.path.splitext(name)
        # With the default ``delete=True`` the file is removed when the ``with``
        # block exits, so the transcription has to happen inside it.
        with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
            tmpf.write(blob)
            # Flush so the bytes are on disk before the path is handed off.
            tmpf.flush()
            tmp_path = os.path.abspath(tmpf.name)

            seq2txt_mdl = LLMBundle(
                self._canvas.get_tenant_id(),
                LLMType.SPEECH2TEXT,
                llm_name=llm_id,
                lang=lang,
            )
            txt = seq2txt_mdl.transcription(tmp_path)

            self.set_output("text", txt)
    async def _invoke(self, **kwargs):
        function_map = {
            "pdf": self._pdf,
@ -321,6 +372,7 @@ class Parser(ProcessBase):
            "word": self._word,
            "text": self._text,
            "image": self._image,
            "audio": self._audio,
        }
        try:
            from_upstream = ParserFromUpstream.model_validate(kwargs)
--- a/rag/flow/tests/dsl_examples/general_pdf_all.json
+++ b/rag/flow/tests/dsl_examples/general_pdf_all.json
@ -60,6 +60,28 @@
                    "gif"
                  ],
                  "output_format": "text"
                },
                "audio": {
                  "suffix": [
                    "da",
                    "wave",
                    "wav",
                    "mp3",
                    "aac",
                    "flac",
                    "ogg",
                    "aiff",
                    "au",
                    "midi",
                    "wma",
                    "realaudio",
                    "vqf",
                    "oggvorbis",
                    "ape"
                  ],
                  "lang": "Chinese",
                  "llm_id": "SenseVoiceSmall",
                  "output_format": "json"
                }
              }
          }
--- a/rag/llm/sequence2txt_model.py
+++ b/rag/llm/sequence2txt_model.py
@ -218,7 +218,7 @@ class GPUStackSeq2txt(Base):
 class GiteeSeq2txt(Base):
    _FACTORY_NAME = "GiteeAI"
-    def __init__(self, key, model_name="whisper-1", base_url="https://ai.gitee.com/v1/"):
+    def __init__(self, key, model_name="whisper-1", base_url="https://ai.gitee.com/v1/", **kwargs):
        if not base_url:
            base_url = "https://ai.gitee.com/v1/"
        self.client = OpenAI(api_key=key, base_url=base_url)