Feat: new API /sequence2txt and update QWenSeq2txt (#11643)
### What problem does this PR solve?

Change: add a new API `/sequence2txt`, and update `QWenSeq2txt` and `ZhipuSeq2txt`.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
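A minimal usage sketch for the new endpoint (the `/sequence2txt` route and the `file`/`stream` form fields come from the diff below; the host, port, `/v1/conversation` URL prefix, and cookie-based auth are assumptions, not part of this change):

```python
# Hedged sketch: one-shot (non-streaming) transcription via the new endpoint.
# Host/port, URL prefix, and session auth are assumptions; the "file" and
# "stream" multipart fields match the handler added in this PR.
import requests

with open("meeting.m4a", "rb") as f:
    resp = requests.post(
        "http://localhost:9380/v1/conversation/sequence2txt",  # prefix inferred
        files={"file": f},
        data={"stream": "false"},
        cookies={"session": "<your-session-cookie>"},  # assumption: logged-in session
    )
print(resp.json())  # expected shape: {"code": 0, "data": {"text": "..."}} per get_json_result
```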
@@ -41,6 +41,7 @@ class MessageParam(ComponentParamBase):
        self.content = []
        self.stream = True
        self.output_format = None  # default output format
        self.auto_play = False
        self.outputs = {
            "content": {
                "type": "str"
@@ -14,9 +14,11 @@
# limitations under the License.
#
import json
import os
import re
import logging
from copy import deepcopy
import tempfile
from quart import Response, request
from api.apps import current_user, login_required
from api.db.db_models import APIToken
@@ -248,6 +250,64 @@ async def completion():
    except Exception as e:
        return server_error_response(e)


+@manager.route("/sequence2txt", methods=["POST"])  # noqa: F821
+@login_required
+async def sequence2txt():
+    req = await request.form
+    stream_mode = req.get("stream", "false").lower() == "true"
+    files = await request.files
+    if "file" not in files:
+        return get_data_error_result(message="Missing 'file' in multipart form-data")
+
+    uploaded = files["file"]
+
+    ALLOWED_EXTS = {
+        ".wav", ".mp3", ".m4a", ".aac",
+        ".flac", ".ogg", ".webm",
+        ".opus", ".wma"
+    }
+
+    filename = uploaded.filename or ""
+    suffix = os.path.splitext(filename)[-1].lower()
+    if suffix not in ALLOWED_EXTS:
+        return get_data_error_result(
+            message=f"Unsupported audio format: {suffix}. "
+                    f"Allowed: {', '.join(sorted(ALLOWED_EXTS))}"
+        )
+    fd, temp_audio_path = tempfile.mkstemp(suffix=suffix)
+    os.close(fd)
+    await uploaded.save(temp_audio_path)
+
+    tenants = TenantService.get_info_by(current_user.id)
+    if not tenants:
+        return get_data_error_result(message="Tenant not found!")
+
+    asr_id = tenants[0]["asr_id"]
+    if not asr_id:
+        return get_data_error_result(message="No default ASR model is set")
+
+    asr_mdl = LLMBundle(tenants[0]["tenant_id"], LLMType.SPEECH2TEXT, asr_id)
+    if not stream_mode:
+        text = asr_mdl.transcription(temp_audio_path)
+        try:
+            os.remove(temp_audio_path)
+        except Exception as e:
+            logging.error(f"Failed to remove temp audio file: {str(e)}")
+        return get_json_result(data={"text": text})
+
+    async def event_stream():
+        try:
+            for evt in asr_mdl.stream_transcription(temp_audio_path):
+                yield f"data: {json.dumps(evt, ensure_ascii=False)}\n\n"
+        except Exception as e:
+            err = {"event": "error", "text": str(e)}
+            yield f"data: {json.dumps(err, ensure_ascii=False)}\n\n"
+        finally:
+            try:
+                os.remove(temp_audio_path)
+            except Exception as e:
+                logging.error(f"Failed to remove temp audio file: {str(e)}")
+
+    return Response(event_stream(), content_type="text/event-stream")


@manager.route("/tts", methods=["POST"])  # noqa: F821
@login_required
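With `stream=true`, the handler above returns `text/event-stream`, one `data: {...}` line per event, where each event carries `event`/`text` keys (`delta`, `final`, or `error`). A minimal SSE consumer sketch, under the same host/prefix/auth assumptions as the example above:

```python
# Hedged sketch: consume the SSE stream produced by event_stream() above.
import json
import requests

with open("meeting.wav", "rb") as f, requests.post(
    "http://localhost:9380/v1/conversation/sequence2txt",  # prefix inferred
    files={"file": f},
    data={"stream": "true"},
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            evt = json.loads(line[len("data: "):])
            if evt.get("event") == "delta":
                print(evt["text"], end="", flush=True)   # partial transcript
            elif evt.get("event") == "final":
                print("\n[final]", evt["text"])          # full transcript
            elif evt.get("event") == "error":
                print("\n[error]", evt["text"])
```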
@@ -185,6 +185,66 @@ class LLMBundle(LLM4Tenant):

        return txt

+    def stream_transcription(self, audio):
+        mdl = self.mdl
+        supports_stream = hasattr(mdl, "stream_transcription") and callable(getattr(mdl, "stream_transcription"))
+        if supports_stream:
+            if self.langfuse:
+                generation = self.langfuse.start_generation(
+                    trace_context=self.trace_context,
+                    name="stream_transcription",
+                    metadata={"model": self.llm_name}
+                )
+            final_text = ""
+            used_tokens = 0
+
+            try:
+                for evt in mdl.stream_transcription(audio):
+                    if evt.get("event") == "final":
+                        final_text = evt.get("text", "")
+                    yield evt
+            except Exception as e:
+                err = {"event": "error", "text": str(e)}
+                yield err
+                final_text = final_text or ""
+            finally:
+                if final_text:
+                    used_tokens = num_tokens_from_string(final_text)
+                    TenantLLMService.increase_usage(self.tenant_id, self.llm_type, used_tokens)
+
+                if self.langfuse:
+                    generation.update(
+                        output={"output": final_text},
+                        usage_details={"total_tokens": used_tokens}
+                    )
+                    generation.end()
+
+            return
+
+        if self.langfuse:
+            generation = self.langfuse.start_generation(trace_context=self.trace_context, name="stream_transcription", metadata={"model": self.llm_name})
+        full_text, used_tokens = mdl.transcription(audio)
+        if not TenantLLMService.increase_usage(
+            self.tenant_id, self.llm_type, used_tokens
+        ):
+            logging.error(
+                f"LLMBundle.stream_transcription can't update token usage for {self.tenant_id}/SEQUENCE2TXT used_tokens: {used_tokens}"
+            )
+        if self.langfuse:
+            generation.update(
+                output={"output": full_text},
+                usage_details={"total_tokens": used_tokens}
+            )
+            generation.end()
+
+        yield {
+            "event": "final",
+            "text": full_text,
+            "streaming": False
+        }
+
    def tts(self, text: str) -> Generator[bytes, None, None]:
        if self.langfuse:
            generation = self.langfuse.start_generation(trace_context=self.trace_context, name="tts", input={"text": text})
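`LLMBundle.stream_transcription` prefers the underlying model's native streaming and otherwise falls back to one-shot `transcription`, yielding a single `final` event with `"streaming": False`. A caller sketch (constructor arguments mirror the `sequence2txt()` handler above; import paths and the audio path are assumptions):

```python
# Hedged sketch: consume LLMBundle.stream_transcription server-side.
from api.db import LLMType                           # import paths assumed
from api.db.services.llm_service import LLMBundle

tenant_id, asr_id = "<tenant-id>", "<asr-model-id>"  # placeholders
bundle = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, asr_id)
for evt in bundle.stream_transcription("/tmp/sample.wav"):  # example path
    if evt.get("event") == "delta":
        pass  # forward partial text to the client
    elif evt.get("event") == "final":
        full_text = evt["text"]  # token usage is recorded inside the bundle
```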
@@ -714,19 +714,13 @@
            "model_type": "rerank"
        },
        {
-            "llm_name": "qwen-audio-asr",
+            "llm_name": "qwen3-asr-flash",
            "tags": "SPEECH2TEXT,8k",
            "max_tokens": 8000,
            "model_type": "speech2text"
        },
-        {
-            "llm_name": "qwen-audio-asr-latest",
-            "tags": "SPEECH2TEXT,8k",
-            "max_tokens": 8000,
-            "model_type": "speech2text"
-        },
        {
-            "llm_name": "qwen-audio-asr-1204",
+            "llm_name": "qwen3-asr-flash-2025-09-08",
            "tags": "SPEECH2TEXT,8k",
            "max_tokens": 8000,
            "model_type": "speech2text"
@@ -152,7 +152,9 @@ dependencies = [
    "moodlepy>=0.23.0",
    "pypandoc>=1.16",
    "pyobvector==0.2.18",
-    "exceptiongroup>=1.3.0,<2.0.0"
+    "exceptiongroup>=1.3.0,<2.0.0",
+    "ffmpeg-python>=0.2.0",
+    "imageio-ffmpeg>=0.6.0",
]

[dependency-groups]
@@ -168,6 +170,9 @@ test = [
    "requests-toolbelt>=1.0.0",
]

[[tool.uv.index]]
url = "https://mirrors.aliyun.com/pypi/simple"

[[tool.uv.index]]
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
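The two new runtime dependencies back the WAV conversion added to `ZhipuSeq2txt` further down: `ffmpeg-python` builds the ffmpeg command line, and `imageio-ffmpeg` ships a prebuilt ffmpeg binary, so no system-wide ffmpeg install is required. A quick sanity check after syncing dependencies:

```python
# Hedged sketch: confirm the bundled ffmpeg binary resolves (run after `uv sync`).
import imageio_ffmpeg

print(imageio_ffmpeg.get_ffmpeg_exe())  # absolute path of the bundled ffmpeg executable
```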
@@ -19,6 +19,7 @@ import json
import os
import re
from abc import ABC
+import tempfile

import requests
from openai import OpenAI
@@ -68,32 +69,80 @@ class QWenSeq2txt(Base):
        self.model_name = model_name

    def transcription(self, audio_path):
+        if "paraformer" in self.model_name or "sensevoice" in self.model_name:
+            return f"**ERROR**: model {self.model_name} is not supported yet.", 0
+        import dashscope
-        from dashscope import MultiModalConversation
+
+        if audio_path.startswith("http"):
+            audio_input = audio_path
+        else:
+            audio_input = f"file://{audio_path}"
+
-        audio_path = f"file://{audio_path}"
        messages = [
            {
                "role": "system",
                "content": [{"text": ""}]
            },
            {
                "role": "user",
-                "content": [{"audio": audio_path}],
+                "content": [{"audio": audio_input}]
            }
        ]

-        response = None
-        full_content = ""
-        try:
-            response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages, result_format="message", stream=True)
-            for response in response:
-                try:
-                    full_content += response["output"]["choices"][0]["message"].content[0]["text"]
-                except Exception:
-                    pass
-            return full_content, num_tokens_from_string(full_content)
-        except Exception as e:
-            return "**ERROR**: " + str(e), 0
+        resp = dashscope.MultiModalConversation.call(
+            model=self.model_name,
+            messages=messages,
+            result_format="message",
+            asr_options={
+                "enable_lid": True,
+                "enable_itn": False
+            }
+        )
+
+        try:
+            text = resp["output"]["choices"][0]["message"].content[0]["text"]
+        except Exception as e:
+            text = "**ERROR**: " + str(e)
+        return text, num_tokens_from_string(text)
+
+    def stream_transcription(self, audio_path):
+        import dashscope
+
+        if audio_path.startswith("http"):
+            audio_input = audio_path
+        else:
+            audio_input = f"file://{audio_path}"
+
+        messages = [
+            {
+                "role": "system",
+                "content": [{"text": ""}]
+            },
+            {
+                "role": "user",
+                "content": [{"audio": audio_input}]
+            }
+        ]
+
+        stream = dashscope.MultiModalConversation.call(
+            model=self.model_name,
+            messages=messages,
+            result_format="message",
+            stream=True,
+            asr_options={
+                "enable_lid": True,
+                "enable_itn": False
+            }
+        )
+
+        full = ""
+        for chunk in stream:
+            try:
+                piece = chunk["output"]["choices"][0]["message"].content[0]["text"]
+                full = piece
+                yield {"event": "delta", "text": piece}
+            except Exception as e:
+                yield {"event": "error", "text": str(e)}
+
+        yield {"event": "final", "text": full}

class AzureSeq2txt(Base):
    _FACTORY_NAME = "Azure-OpenAI"
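Note that the streaming loop overwrites `full = piece` rather than concatenating, i.e. the code treats each DashScope chunk as the cumulative transcript so far. A hedged usage sketch for the new streaming path (the constructor signature and model name are assumptions, not shown in this hunk):

```python
# Hedged sketch: stream transcription straight from the model wrapper.
import os

mdl = QWenSeq2txt(key=os.environ["DASHSCOPE_API_KEY"], model_name="qwen3-asr-flash")  # signature assumed
for evt in mdl.stream_transcription("/tmp/sample.wav"):  # example path
    print(evt)  # {"event": "delta"|"error", "text": ...}, then a trailing {"event": "final", ...}
```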
@@ -268,6 +317,27 @@ class ZhipuSeq2txt(Base):
        self.gen_conf = kwargs.get("gen_conf", {})
        self.stream = kwargs.get("stream", False)

+    def _convert_to_wav(self, input_path):
+        ext = os.path.splitext(input_path)[1].lower()
+        if ext in [".wav", ".mp3"]:
+            return input_path
+        fd, out_path = tempfile.mkstemp(suffix=".wav")
+        os.close(fd)
+        try:
+            import ffmpeg
+            import imageio_ffmpeg as ffmpeg_exe
+            ffmpeg_path = ffmpeg_exe.get_ffmpeg_exe()
+            (
+                ffmpeg
+                .input(input_path)
+                .output(out_path, ar=16000, ac=1)
+                .overwrite_output()
+                .run(cmd=ffmpeg_path, quiet=True)
+            )
+            return out_path
+        except Exception as e:
+            raise RuntimeError(f"audio convert failed: {e}")
+
    def transcription(self, audio_path):
        payload = {
            "model": self.model_name,
@@ -276,7 +346,9 @@ class ZhipuSeq2txt(Base):
        }

        headers = {"Authorization": f"Bearer {self.api_key}"}
-        with open(audio_path, "rb") as audio_file:
+        converted = self._convert_to_wav(audio_path)
+
+        with open(converted, "rb") as audio_file:
            files = {"file": audio_file}

            try:
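For reference, the `_convert_to_wav` chain in `ZhipuSeq2txt` is roughly equivalent to `ffmpeg -i <input> -ar 16000 -ac 1 -y <out>.wav`, i.e. resampling to 16 kHz mono, a common ASR input format; `.wav` and `.mp3` inputs are passed through untouched. A hedged usage sketch (constructor arguments and model name are assumptions):

```python
# Hedged sketch: non-wav/mp3 uploads are now converted before the Zhipu HTTP call.
mdl = ZhipuSeq2txt(key="<zhipu-api-key>", model_name="glm-asr")  # args assumed
result = mdl.transcription("/tmp/voice.m4a")  # .m4a is converted to 16 kHz mono WAV first
```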