Feat:new api /sequence2txt and update QWenSeq2txt (#11643)

### What problem does this PR solve? change: new api /sequence2txt, update QWenSeq2txt and ZhipuSeq2txt ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-01-31 23:55:06 +08:00 · 2025-12-02 11:17:31 +08:00
parent d1e172171f
commit b8c0fb4572
7 changed files with 3630 additions and 3408 deletions
--- a/rag/llm/sequence2txt_model.py
+++ b/rag/llm/sequence2txt_model.py
@ -19,6 +19,7 @@ import json
 import os
 import re
 from abc import ABC
+import tempfile

 import requests
 from openai import OpenAI
@ -68,32 +69,80 @@ class QWenSeq2txt(Base):
        self.model_name = model_name

    def transcription(self, audio_path):
-        if "paraformer" in self.model_name or "sensevoice" in self.model_name:
-            return f"**ERROR**: model {self.model_name} is not suppported yet.", 0
+        import dashscope

-        from dashscope import MultiModalConversation
+        if audio_path.startswith("http"):
+            audio_input = audio_path
+        else:
+            audio_input = f"file://{audio_path}"

-        audio_path = f"file://{audio_path}"
        messages = [
+            {
+                "role": "system",
+                "content": [{"text": ""}]
+            },
            {
                "role": "user",
-                "content": [{"audio": audio_path}],
+                "content": [{"audio": audio_input}]
            }
        ]

-        response = None
-        full_content = ""
-        try:
-            response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages, result_format="message", stream=True)
-            for response in response:
-                try:
-                    full_content += response["output"]["choices"][0]["message"].content[0]["text"]
-                except Exception:
-                    pass
-            return full_content, num_tokens_from_string(full_content)
-        except Exception as e:
-            return "**ERROR**: " + str(e), 0
+        resp = dashscope.MultiModalConversation.call(
+            model=self.model_name,
+            messages=messages,
+            result_format="message",
+            asr_options={
+                "enable_lid": True,
+                "enable_itn": False
+            }
+        )

+        try:
+            text = resp["output"]["choices"][0]["message"].content[0]["text"]
+        except Exception as e:
+            text = "**ERROR**: " + str(e)
+        return text, num_tokens_from_string(text)
+
+    def stream_transcription(self, audio_path):
+        import dashscope
+
+        if audio_path.startswith("http"):
+            audio_input = audio_path
+        else:
+            audio_input = f"file://{audio_path}"
+
+        messages = [
+            {
+                "role": "system",
+                "content": [{"text": ""}]
+            },
+            {
+                "role": "user",
+                "content": [{"audio": audio_input}]
+            }
+        ]
+
+        stream = dashscope.MultiModalConversation.call(
+            model=self.model_name,
+            messages=messages,
+            result_format="message",
+            stream=True,
+            asr_options={
+                "enable_lid": True,
+                "enable_itn": False
+            }
+        )
+
+        full = ""
+        for chunk in stream:
+            try:
+                piece = chunk["output"]["choices"][0]["message"].content[0]["text"]
+                full = piece
+                yield {"event": "delta", "text": piece}
+            except Exception as e:
+                yield {"event": "error", "text": str(e)}
+
+        yield {"event": "final", "text": full}

 class AzureSeq2txt(Base):
    _FACTORY_NAME = "Azure-OpenAI"
@ -268,6 +317,27 @@ class ZhipuSeq2txt(Base):
        self.gen_conf = kwargs.get("gen_conf", {})
        self.stream = kwargs.get("stream", False)

+    def _convert_to_wav(self, input_path):
+        ext = os.path.splitext(input_path)[1].lower()
+        if ext in [".wav", ".mp3"]:
+            return input_path
+        fd, out_path = tempfile.mkstemp(suffix=".wav")
+        os.close(fd)
+        try:
+            import ffmpeg
+            import imageio_ffmpeg as ffmpeg_exe
+            ffmpeg_path = ffmpeg_exe.get_ffmpeg_exe()
+            (
+                ffmpeg
+                .input(input_path)
+                .output(out_path, ar=16000, ac=1)
+                .overwrite_output()
+                .run(cmd=ffmpeg_path,quiet=True)
+            )
+            return out_path
+        except Exception as e:
+            raise RuntimeError(f"audio convert failed: {e}")
+
    def transcription(self, audio_path):
        payload = {
            "model": self.model_name,
@ -276,7 +346,9 @@ class ZhipuSeq2txt(Base):
        }

        headers = {"Authorization": f"Bearer {self.api_key}"}
-        with open(audio_path, "rb") as audio_file:
+        converted = self._convert_to_wav(audio_path)
+
+        with open(converted, "rb") as audio_file:
            files = {"file": audio_file}

            try: