Feat: new api /sequence2txt and update QWenSeq2txt (#11643)

### What problem does this PR solve?

Changes:
- Add a new API endpoint, `/sequence2txt`, supporting both non-streaming transcription and SSE streaming.
- Update QWenSeq2txt (switch to the qwen3-asr models, add streaming) and ZhipuSeq2txt (convert unsupported audio formats to WAV via ffmpeg).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
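
A hedged client sketch, not part of this PR: the base URL, route prefix, and auth header below are assumptions (the endpoint itself is session-authenticated via `@login_required`).

```python
# Hypothetical client for the new /sequence2txt endpoint.
import json
import requests

BASE = "http://localhost:9380/v1/conversation"        # assumed host and route prefix
HEADERS = {"Authorization": "Bearer <access-token>"}   # assumed auth scheme

# Non-streaming: one JSON payload with the full transcript.
with open("sample.wav", "rb") as f:
    r = requests.post(f"{BASE}/sequence2txt", headers=HEADERS,
                      files={"file": ("sample.wav", f)}, data={"stream": "false"})
    print(r.json())

# Streaming: server-sent events, one "data: {...}" frame per event.
with open("sample.wav", "rb") as f:
    r = requests.post(f"{BASE}/sequence2txt", headers=HEADERS,
                      files={"file": ("sample.wav", f)}, data={"stream": "true"},
                      stream=True)
    for line in r.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            evt = json.loads(line[len("data: "):])
            print(evt.get("event"), evt.get("text", ""))
```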
buua436 authored 2025-12-02 11:17:31 +08:00, committed by GitHub
parent d1e172171f
commit b8c0fb4572
7 changed files with 3630 additions and 3408 deletions


@@ -41,6 +41,7 @@ class MessageParam(ComponentParamBase):
         self.content = []
         self.stream = True
         self.output_format = None  # default output format
+        self.auto_play = False
         self.outputs = {
             "content": {
                 "type": "str"


@@ -14,9 +14,11 @@
 #  limitations under the License.
 #
 import json
+import os
 import re
 import logging
 from copy import deepcopy
+import tempfile
 from quart import Response, request
 from api.apps import current_user, login_required
 from api.db.db_models import APIToken
@@ -248,6 +250,64 @@ async def completion():
     except Exception as e:
         return server_error_response(e)
 
+
+@manager.route("/sequence2txt", methods=["POST"])  # noqa: F821
+@login_required
+async def sequence2txt():
+    req = await request.form
+    stream_mode = req.get("stream", "false").lower() == "true"
+
+    files = await request.files
+    if "file" not in files:
+        return get_data_error_result(message="Missing 'file' in multipart form-data")
+    uploaded = files["file"]
+
+    ALLOWED_EXTS = {
+        ".wav", ".mp3", ".m4a", ".aac",
+        ".flac", ".ogg", ".webm",
+        ".opus", ".wma"
+    }
+    filename = uploaded.filename or ""
+    suffix = os.path.splitext(filename)[-1].lower()
+    if suffix not in ALLOWED_EXTS:
+        return get_data_error_result(
+            message=f"Unsupported audio format: {suffix}. "
+                    f"Allowed: {', '.join(sorted(ALLOWED_EXTS))}"
+        )
+
+    fd, temp_audio_path = tempfile.mkstemp(suffix=suffix)
+    os.close(fd)
+    await uploaded.save(temp_audio_path)
+
+    tenants = TenantService.get_info_by(current_user.id)
+    if not tenants:
+        return get_data_error_result(message="Tenant not found!")
+    asr_id = tenants[0]["asr_id"]
+    if not asr_id:
+        return get_data_error_result(message="No default ASR model is set")
+    asr_mdl = LLMBundle(tenants[0]["tenant_id"], LLMType.SPEECH2TEXT, asr_id)
+
+    if not stream_mode:
+        text = asr_mdl.transcription(temp_audio_path)
+        try:
+            os.remove(temp_audio_path)
+        except Exception as e:
+            logging.error(f"Failed to remove temp audio file: {str(e)}")
+        return get_json_result(data={"text": text})
+
+    async def event_stream():
+        try:
+            for evt in asr_mdl.stream_transcription(temp_audio_path):
+                yield f"data: {json.dumps(evt, ensure_ascii=False)}\n\n"
+        except Exception as e:
+            err = {"event": "error", "text": str(e)}
+            yield f"data: {json.dumps(err, ensure_ascii=False)}\n\n"
+        finally:
+            try:
+                os.remove(temp_audio_path)
+            except Exception as e:
+                logging.error(f"Failed to remove temp audio file: {str(e)}")
+
+    return Response(event_stream(), content_type="text/event-stream")
+
+
 @manager.route("/tts", methods=["POST"])  # noqa: F821
 @login_required
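
For reference, the JSON objects carried on each `data:` line of the SSE response take these shapes (collected from the generators in this PR):

```python
# Event shapes emitted over SSE by /sequence2txt, as produced by the
# stream_transcription generators in this diff:
EXAMPLE_EVENTS = [
    {"event": "delta", "text": "partial transcript"},                   # streamed piece
    {"event": "final", "text": "full transcript"},                      # end of a streamed run
    {"event": "final", "text": "full transcript", "streaming": False},  # non-stream fallback in LLMBundle
    {"event": "error", "text": "exception message"},                    # surfaced failures
]
```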


@@ -185,6 +185,66 @@ class LLMBundle(LLM4Tenant):
         return txt
 
+    def stream_transcription(self, audio):
+        mdl = self.mdl
+        supports_stream = hasattr(mdl, "stream_transcription") and callable(getattr(mdl, "stream_transcription"))
+        if supports_stream:
+            if self.langfuse:
+                generation = self.langfuse.start_generation(
+                    trace_context=self.trace_context,
+                    name="stream_transcription",
+                    metadata={"model": self.llm_name}
+                )
+
+            final_text = ""
+            used_tokens = 0
+            try:
+                for evt in mdl.stream_transcription(audio):
+                    if evt.get("event") == "final":
+                        final_text = evt.get("text", "")
+                    yield evt
+            except Exception as e:
+                err = {"event": "error", "text": str(e)}
+                yield err
+                final_text = final_text or ""
+            finally:
+                if final_text:
+                    used_tokens = num_tokens_from_string(final_text)
+                    TenantLLMService.increase_usage(self.tenant_id, self.llm_type, used_tokens)
+                if self.langfuse:
+                    generation.update(
+                        output={"output": final_text},
+                        usage_details={"total_tokens": used_tokens}
+                    )
+                    generation.end()
+            return
+
+        if self.langfuse:
+            generation = self.langfuse.start_generation(trace_context=self.trace_context, name="stream_transcription", metadata={"model": self.llm_name})
+
+        full_text, used_tokens = mdl.transcription(audio)
+        if not TenantLLMService.increase_usage(self.tenant_id, self.llm_type, used_tokens):
+            logging.error(
+                f"LLMBundle.stream_transcription can't update token usage for {self.tenant_id}/SEQUENCE2TXT used_tokens: {used_tokens}"
+            )
+
+        if self.langfuse:
+            generation.update(
+                output={"output": full_text},
+                usage_details={"total_tokens": used_tokens}
+            )
+            generation.end()
+
+        yield {
+            "event": "final",
+            "text": full_text,
+            "streaming": False
+        }
+
     def tts(self, text: str) -> Generator[bytes, None, None]:
         if self.langfuse:
             generation = self.langfuse.start_generation(trace_context=self.trace_context, name="tts", input={"text": text})
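
A minimal sketch of driving this generator directly from server-side code, assuming the import paths below (they are not shown in this diff):

```python
# Hedged sketch, not part of this PR: import locations are assumptions.
from api.db import LLMType                          # assumed location of LLMType
from api.db.services.llm_service import LLMBundle   # assumed location of LLMBundle

def transcribe_streaming(tenant_id: str, asr_id: str, audio_path: str) -> str:
    bundle = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, asr_id)
    final_text = ""
    for evt in bundle.stream_transcription(audio_path):
        if evt.get("event") == "delta":
            print(evt["text"])                 # partial hypothesis as it arrives
        elif evt.get("event") == "final":
            final_text = evt.get("text", "")
        elif evt.get("event") == "error":
            raise RuntimeError(evt["text"])
    return final_text
```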


@@ -714,19 +714,13 @@
             "model_type": "rerank"
         },
         {
-            "llm_name": "qwen-audio-asr",
+            "llm_name": "qwen3-asr-flash",
             "tags": "SPEECH2TEXT,8k",
             "max_tokens": 8000,
             "model_type": "speech2text"
         },
         {
-            "llm_name": "qwen-audio-asr-latest",
-            "tags": "SPEECH2TEXT,8k",
-            "max_tokens": 8000,
-            "model_type": "speech2text"
-        },
-        {
-            "llm_name": "qwen-audio-asr-1204",
+            "llm_name": "qwen3-asr-flash-2025-09-08",
             "tags": "SPEECH2TEXT,8k",
             "max_tokens": 8000,
             "model_type": "speech2text"


@@ -152,7 +152,9 @@ dependencies = [
     "moodlepy>=0.23.0",
     "pypandoc>=1.16",
     "pyobvector==0.2.18",
-    "exceptiongroup>=1.3.0,<2.0.0"
+    "exceptiongroup>=1.3.0,<2.0.0",
+    "ffmpeg-python>=0.2.0",
+    "imageio-ffmpeg>=0.6.0",
 ]
 
 [dependency-groups]

@@ -168,6 +170,9 @@ test = [
     "requests-toolbelt>=1.0.0",
 ]
 
+[[tool.uv.index]]
+url = "https://mirrors.aliyun.com/pypi/simple"
+
 [[tool.uv.index]]
 url = "https://pypi.tuna.tsinghua.edu.cn/simple"


@@ -19,6 +19,7 @@ import json
 import os
 import re
 from abc import ABC
+import tempfile
 
 import requests
 from openai import OpenAI
@@ -68,32 +69,80 @@ class QWenSeq2txt(Base):
         self.model_name = model_name
 
     def transcription(self, audio_path):
-        if "paraformer" in self.model_name or "sensevoice" in self.model_name:
-            return f"**ERROR**: model {self.model_name} is not suppported yet.", 0
-
-        from dashscope import MultiModalConversation
-        audio_path = f"file://{audio_path}"
+        import dashscope
+
+        if audio_path.startswith("http"):
+            audio_input = audio_path
+        else:
+            audio_input = f"file://{audio_path}"
+
         messages = [
+            {
+                "role": "system",
+                "content": [{"text": ""}]
+            },
             {
                 "role": "user",
-                "content": [{"audio": audio_path}],
+                "content": [{"audio": audio_input}]
             }
         ]
-        response = None
-        full_content = ""
-        try:
-            response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages, result_format="message", stream=True)
-            for response in response:
-                try:
-                    full_content += response["output"]["choices"][0]["message"].content[0]["text"]
-                except Exception:
-                    pass
-            return full_content, num_tokens_from_string(full_content)
-        except Exception as e:
-            return "**ERROR**: " + str(e), 0
+        resp = dashscope.MultiModalConversation.call(
+            model=self.model_name,
+            messages=messages,
+            result_format="message",
+            asr_options={
+                "enable_lid": True,
+                "enable_itn": False
+            }
+        )
+
+        try:
+            text = resp["output"]["choices"][0]["message"].content[0]["text"]
+        except Exception as e:
+            text = "**ERROR**: " + str(e)
+
+        return text, num_tokens_from_string(text)
+
+    def stream_transcription(self, audio_path):
+        import dashscope
+
+        if audio_path.startswith("http"):
+            audio_input = audio_path
+        else:
+            audio_input = f"file://{audio_path}"
+
+        messages = [
+            {
+                "role": "system",
+                "content": [{"text": ""}]
+            },
+            {
+                "role": "user",
+                "content": [{"audio": audio_input}]
+            }
+        ]
+
+        stream = dashscope.MultiModalConversation.call(
+            model=self.model_name,
+            messages=messages,
+            result_format="message",
+            stream=True,
+            asr_options={
+                "enable_lid": True,
+                "enable_itn": False
+            }
+        )
+
+        full = ""
+        for chunk in stream:
+            try:
+                # Each streamed chunk carries the cumulative transcript so far,
+                # so the latest piece replaces (not appends to) the running text.
+                piece = chunk["output"]["choices"][0]["message"].content[0]["text"]
+                full = piece
+                yield {"event": "delta", "text": piece}
+            except Exception as e:
+                yield {"event": "error", "text": str(e)}
+
+        yield {"event": "final", "text": full}
 
 
 class AzureSeq2txt(Base):
     _FACTORY_NAME = "Azure-OpenAI"
@@ -268,6 +317,27 @@ class ZhipuSeq2txt(Base):
         self.gen_conf = kwargs.get("gen_conf", {})
         self.stream = kwargs.get("stream", False)
 
+    def _convert_to_wav(self, input_path):
+        ext = os.path.splitext(input_path)[1].lower()
+        if ext in [".wav", ".mp3"]:
+            return input_path
+
+        fd, out_path = tempfile.mkstemp(suffix=".wav")
+        os.close(fd)
+        try:
+            import ffmpeg
+            import imageio_ffmpeg as ffmpeg_exe
+
+            ffmpeg_path = ffmpeg_exe.get_ffmpeg_exe()
+            (
+                ffmpeg
+                .input(input_path)
+                .output(out_path, ar=16000, ac=1)
+                .overwrite_output()
+                .run(cmd=ffmpeg_path, quiet=True)
+            )
+            return out_path
+        except Exception as e:
+            raise RuntimeError(f"audio convert failed: {e}")
+
     def transcription(self, audio_path):
         payload = {
             "model": self.model_name,

@@ -276,7 +346,9 @@ class ZhipuSeq2txt(Base):
         }
         headers = {"Authorization": f"Bearer {self.api_key}"}
 
-        with open(audio_path, "rb") as audio_file:
+        converted = self._convert_to_wav(audio_path)
+
+        with open(converted, "rb") as audio_file:
             files = {"file": audio_file}
             try:
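
One caveat worth noting: `_convert_to_wav` creates a temporary `.wav` for formats other than `.wav`/`.mp3`, and `transcription` never removes it. A hedged cleanup sketch, assuming access to a `ZhipuSeq2txt` instance (`mdl` below is hypothetical):

```python
import os

def transcribe_and_cleanup(mdl, audio_path):
    # Convert up front so we know which temp file (if any) to delete later;
    # .wav/.mp3 inputs are returned unchanged by _convert_to_wav.
    converted = mdl._convert_to_wav(audio_path)
    try:
        # Passing the already-converted .wav keeps the internal conversion a no-op.
        return mdl.transcription(converted)
    finally:
        if converted != audio_path and os.path.exists(converted):
            os.remove(converted)
```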

uv.lock (generated, 6792 lines changed): file diff suppressed because it is too large.