From d353f7f7f8d30170072ead615857715d09aa8fbe Mon Sep 17 00:00:00 2001 From: Lynn Date: Thu, 18 Sep 2025 09:31:32 +0800 Subject: [PATCH] Feat/parse audio (#10133) ### What problem does this PR solve? Add audio support to the dataflow parser, and fix GiteeAI's sequence2txt model. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) --- deepdoc/parser/pdf_parser.py | 4 +- rag/flow/parser/parser.py | 56 ++++++++++++++++++- .../tests/dsl_examples/general_pdf_all.json | 22 ++++++++ rag/llm/sequence2txt_model.py | 2 +- 4 files changed, 79 insertions(+), 5 deletions(-) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 25d2b0a7c..dea0a93ef 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -74,10 +74,10 @@ class RAGFlowPdfParser: recognizer_domain = "layout" if layout_recognizer_type == "ascend": - logging.debug("Using Ascend LayoutRecognizer", flush=True) + logging.debug("Using Ascend LayoutRecognizer") self.layouter = AscendLayoutRecognizer(recognizer_domain) else: # onnx - logging.debug("Using Onnx LayoutRecognizer", flush=True) + logging.debug("Using Onnx LayoutRecognizer") self.layouter = LayoutRecognizer(recognizer_domain) self.tbl_det = TableStructureRecognizer() diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 87b3731f5..8c1e8c1f0 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -54,7 +54,9 @@ class ParserParam(ProcessParamBase): "text", "json" ], - "audio": [], + "audio": [ + "json" + ], "video": [], } @@ -102,7 +104,26 @@ class ParserParam(ProcessParamBase): ], "output_format": "json", }, - "audio": {}, + "audio": { + "suffix":[ + "da", + "wave", + "wav", + "mp3", + "aac", + "flac", + "ogg", + "aiff", + "au", + "midi", + "wma", + "realaudio", + "vqf", + "oggvorbis", + "ape" + ], + "output_format": "json", + }, "video": {}, } @@ -146,6 +167,12 @@ class ParserParam(ProcessParamBase): 
text_output_format = text_config.get("output_format", "") self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"]) + audio_config = self.setups.get("audio", "") + if audio_config: + self.check_empty(audio_config.get("llm_id"), "VLM") + audio_language = audio_config.get("lang", "") + self.check_empty(audio_language, "Language") + def get_input_form(self) -> dict[str, dict]: return {} @@ -313,6 +340,30 @@ class Parser(ProcessBase): self.set_output("text", txt) + def _audio(self, from_upstream: ParserFromUpstream): + import os + import tempfile + + self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.") + + blob = from_upstream.blob + name = from_upstream.name + conf = self._param.setups["audio"] + self.set_output("output_format", conf["output_format"]) + + lang = conf["lang"] + _, ext = os.path.splitext(name) + tmp_path = "" + with tempfile.NamedTemporaryFile(suffix=ext) as tmpf: + tmpf.write(blob) + tmpf.flush() + tmp_path = os.path.abspath(tmpf.name) + + seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, lang=lang) + txt = seq2txt_mdl.transcription(tmp_path) + + self.set_output("text", txt) + async def _invoke(self, **kwargs): function_map = { "pdf": self._pdf, @@ -321,6 +372,7 @@ class Parser(ProcessBase): "word": self._word, "text": self._text, "image": self._image, + "audio": self._audio, } try: from_upstream = ParserFromUpstream.model_validate(kwargs) diff --git a/rag/flow/tests/dsl_examples/general_pdf_all.json b/rag/flow/tests/dsl_examples/general_pdf_all.json index dd7cc5f29..352dc847d 100644 --- a/rag/flow/tests/dsl_examples/general_pdf_all.json +++ b/rag/flow/tests/dsl_examples/general_pdf_all.json @@ -60,6 +60,28 @@ "gif" ], "output_format": "text" + }, + "audio": { + "suffix": [ + "da", + "wave", + "wav", + "mp3", + "aac", + "flac", + "ogg", + "aiff", + "au", + "midi", + "wma", + "realaudio", + "vqf", + "oggvorbis", + "ape" + ], + "lang": "Chinese", + 
"llm_id": "SenseVoiceSmall", + "output_format": "json" } } } diff --git a/rag/llm/sequence2txt_model.py b/rag/llm/sequence2txt_model.py index 95203cace..66c3bbf10 100644 --- a/rag/llm/sequence2txt_model.py +++ b/rag/llm/sequence2txt_model.py @@ -218,7 +218,7 @@ class GPUStackSeq2txt(Base): class GiteeSeq2txt(Base): _FACTORY_NAME = "GiteeAI" - def __init__(self, key, model_name="whisper-1", base_url="https://ai.gitee.com/v1/"): + def __init__(self, key, model_name="whisper-1", base_url="https://ai.gitee.com/v1/", **kwargs): if not base_url: base_url = "https://ai.gitee.com/v1/" self.client = OpenAI(api_key=key, base_url=base_url)