diff --git a/api/apps/dialog_app.py b/api/apps/dialog_app.py
index 85778bb90..ae8ac56f5 100644
--- a/api/apps/dialog_app.py
+++ b/api/apps/dialog_app.py
@@ -41,6 +41,9 @@ def set_dialog():
         return get_data_error_result(message="Dialog name can't be empty.")
     if len(name.encode("utf-8")) > 255:
         return get_data_error_result(message=f"Dialog name length is {len(name)} which is larger than 255")
+
+    if DialogService.get_or_none(tenant_id=current_user.id, name=name):
+        return get_data_error_result(message=f"Duplicated Dialog name {name}.")
     description = req.get("description", "A helpful dialog")
     icon = req.get("icon", "")
     top_n = req.get("top_n", 6)
diff --git a/conf/llm_factories.json b/conf/llm_factories.json
index 072185a9c..1022ee257 100644
--- a/conf/llm_factories.json
+++ b/conf/llm_factories.json
@@ -505,6 +505,24 @@
                     "tags": "RE-RANK,4k",
                     "max_tokens": 4000,
                     "model_type": "rerank"
+                },
+                {
+                    "llm_name": "qwen-audio-asr",
+                    "tags": "SPEECH2TEXT,8k",
+                    "max_tokens": 8000,
+                    "model_type": "speech2text"
+                },
+                {
+                    "llm_name": "qwen-audio-asr-latest",
+                    "tags": "SPEECH2TEXT,8k",
+                    "max_tokens": 8000,
+                    "model_type": "speech2text"
+                },
+                {
+                    "llm_name": "qwen-audio-asr-1204",
+                    "tags": "SPEECH2TEXT,8k",
+                    "max_tokens": 8000,
+                    "model_type": "speech2text"
                 }
             ]
         },
diff --git a/rag/app/audio.py b/rag/app/audio.py
index 95e552ae9..e8a1d2655 100644
--- a/rag/app/audio.py
+++ b/rag/app/audio.py
@@ -14,31 +14,48 @@
 # limitations under the License.
 #
 
+import os
 import re
+import tempfile
 
 from api.db import LLMType
-from rag.nlp import rag_tokenizer
 from api.db.services.llm_service import LLMBundle
-from rag.nlp import tokenize
+from rag.nlp import rag_tokenizer, tokenize
 
 
 def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
-    doc = {
-        "docnm_kwd": filename,
-        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
-    }
+    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
 
     # is it English
     eng = lang.lower() == "english"  # is_english(sections)
     try:
+        _, ext = os.path.splitext(filename)
+        if not ext:
+            raise RuntimeError("No extension detected.")
+
+        if ext not in [".da", ".wave", ".wav", ".mp3", ".wav", ".aac", ".flac", ".ogg", ".aiff", ".au", ".midi", ".wma", ".realaudio", ".vqf", ".oggvorbis", ".aac", ".ape"]:
+            raise RuntimeError(f"Extension {ext} is not supported yet.")
+
+        tmp_path = ""
+        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmpf:
+            tmpf.write(binary)
+            tmpf.flush()
+            tmp_path = os.path.abspath(tmpf.name)
+
         callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
         seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
-        ans = seq2txt_mdl.transcription(binary)
+        ans = seq2txt_mdl.transcription(tmp_path)
         callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
+
         tokenize(doc, ans, eng)
         return [doc]
     except Exception as e:
         callback(prog=-1, msg=str(e))
-
+    finally:
+        if tmp_path and os.path.exists(tmp_path):
+            try:
+                os.unlink(tmp_path)
+            except Exception:
+                pass
     return []
diff --git a/rag/llm/sequence2txt_model.py b/rag/llm/sequence2txt_model.py
index 27b83425f..95203cace 100644
--- a/rag/llm/sequence2txt_model.py
+++ b/rag/llm/sequence2txt_model.py
@@ -35,8 +35,9 @@ class Base(ABC):
         """
         pass
 
-    def transcription(self, audio, **kwargs):
-        transcription = self.client.audio.transcriptions.create(model=self.model_name, file=audio, response_format="text")
+    def transcription(self, audio_path, **kwargs):
+        audio_file = open(audio_path, "rb")
+        transcription = self.client.audio.transcriptions.create(model=self.model_name, file=audio_file)
         return transcription.text.strip(), num_tokens_from_string(transcription.text.strip())
 
     def audio2base64(self, audio):
@@ -50,7 +51,7 @@ class Base(ABC):
 class GPTSeq2txt(Base):
     _FACTORY_NAME = "OpenAI"
 
-    def __init__(self, key, model_name="whisper-1", base_url="https://api.openai.com/v1"):
+    def __init__(self, key, model_name="whisper-1", base_url="https://api.openai.com/v1", **kwargs):
         if not base_url:
             base_url = "https://api.openai.com/v1"
         self.client = OpenAI(api_key=key, base_url=base_url)
@@ -60,27 +61,38 @@ class GPTSeq2txt(Base):
 class QWenSeq2txt(Base):
     _FACTORY_NAME = "Tongyi-Qianwen"
 
-    def __init__(self, key, model_name="paraformer-realtime-8k-v1", **kwargs):
+    def __init__(self, key, model_name="qwen-audio-asr", **kwargs):
         import dashscope
 
         dashscope.api_key = key
         self.model_name = model_name
 
-    def transcription(self, audio, format):
-        from http import HTTPStatus
+    def transcription(self, audio_path):
+        if "paraformer" in self.model_name or "sensevoice" in self.model_name:
+            return f"**ERROR**: model {self.model_name} is not suppported yet.", 0
 
-        from dashscope.audio.asr import Recognition
+        from dashscope import MultiModalConversation
 
-        recognition = Recognition(model=self.model_name, format=format, sample_rate=16000, callback=None)
-        result = recognition.call(audio)
+        audio_path = f"file://{audio_path}"
+        messages = [
+            {
+                "role": "user",
+                "content": [{"audio": audio_path}],
+            }
+        ]
 
-        ans = ""
-        if result.status_code == HTTPStatus.OK:
-            for sentence in result.get_sentence():
-                ans += sentence.text.decode("utf-8") + "\n"
-            return ans, num_tokens_from_string(ans)
-
-        return "**ERROR**: " + result.message, 0
+        response = None
+        full_content = ""
+        try:
+            response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages, result_format="message", stream=True)
+            for response in response:
+                try:
+                    full_content += response["output"]["choices"][0]["message"].content[0]["text"]
+                except Exception:
+                    pass
+            return full_content, num_tokens_from_string(full_content)
+        except Exception as e:
+            return "**ERROR**: " + str(e), 0
 
 
 class AzureSeq2txt(Base):
@@ -212,6 +224,7 @@ class GiteeSeq2txt(Base):
         self.client = OpenAI(api_key=key, base_url=base_url)
         self.model_name = model_name
 
+
 class DeepInfraSeq2txt(Base):
     _FACTORY_NAME = "DeepInfra"
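
Illustration only, not part of the patch: a minimal sketch of how the reworked speech-to-text path is expected to be driven after this change, mirroring the temp-file handling added in rag/app/audio.py and the new path-based transcription() signature in rag/llm/sequence2txt_model.py. The API key, audio bytes, helper name transcribe_bytes, and file extension below are placeholder assumptions, not code from this PR.

    # Hypothetical usage sketch: exercise the new path-based transcription API.
    import os
    import tempfile

    from rag.llm.sequence2txt_model import QWenSeq2txt


    def transcribe_bytes(api_key: str, binary: bytes, ext: str = ".mp3") -> str:
        # Mirrors rag/app/audio.py: persist the upload to a temp file, then pass
        # the file *path* (not the raw bytes) to the seq2txt model.
        mdl = QWenSeq2txt(key=api_key, model_name="qwen-audio-asr")  # new default model
        tmp_path = ""
        try:
            with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmpf:
                tmpf.write(binary)
                tmpf.flush()
                tmp_path = tmpf.name
            text, _num_tokens = mdl.transcription(tmp_path)  # returns (text, token_count)
            return text
        finally:
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)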