diff --git a/api/apps/dialog_app.py b/api/apps/dialog_app.py
index 85778bb90..ae8ac56f5 100644
--- a/api/apps/dialog_app.py
+++ b/api/apps/dialog_app.py
@@ -41,6 +41,9 @@ def set_dialog():
         return get_data_error_result(message="Dialog name can't be empty.")
     if len(name.encode("utf-8")) > 255:
         return get_data_error_result(message=f"Dialog name length is {len(name)} which is larger than 255")
+
+    if DialogService.get_or_none(tenant_id=current_user.id, name=name):
+        return get_data_error_result(message=f"Duplicated Dialog name {name}.")
     description = req.get("description", "A helpful dialog")
     icon = req.get("icon", "")
     top_n = req.get("top_n", 6)
diff --git a/conf/llm_factories.json b/conf/llm_factories.json
index 072185a9c..1022ee257 100644
--- a/conf/llm_factories.json
+++ b/conf/llm_factories.json
@@ -505,6 +505,24 @@
                     "tags": "RE-RANK,4k",
                     "max_tokens": 4000,
                     "model_type": "rerank"
+                },
+                {
+                    "llm_name": "qwen-audio-asr",
+                    "tags": "SPEECH2TEXT,8k",
+                    "max_tokens": 8000,
+                    "model_type": "speech2text"
+                },
+                {
+                    "llm_name": "qwen-audio-asr-latest",
+                    "tags": "SPEECH2TEXT,8k",
+                    "max_tokens": 8000,
+                    "model_type": "speech2text"
+                },
+                {
+                    "llm_name": "qwen-audio-asr-1204",
+                    "tags": "SPEECH2TEXT,8k",
+                    "max_tokens": 8000,
+                    "model_type": "speech2text"
                 }
             ]
         },
diff --git a/rag/app/audio.py b/rag/app/audio.py
index 95e552ae9..e8a1d2655 100644
--- a/rag/app/audio.py
+++ b/rag/app/audio.py
@@ -14,31 +14,48 @@
 # limitations under the License.
 #
 
+import os
 import re
+import tempfile
 
 from api.db import LLMType
-from rag.nlp import rag_tokenizer
 from api.db.services.llm_service import LLMBundle
-from rag.nlp import tokenize
+from rag.nlp import rag_tokenizer, tokenize
 
 
 def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
-    doc = {
-        "docnm_kwd": filename,
-        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
-    }
+    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
 
     # is it English
     eng = lang.lower() == "english"  # is_english(sections)
     try:
+        _, ext = os.path.splitext(filename)
+        if not ext:
+            raise RuntimeError("No extension detected.")
+
+        if ext not in [".da", ".wave", ".wav", ".mp3", ".wav", ".aac", ".flac", ".ogg", ".aiff", ".au", ".midi", ".wma", ".realaudio", ".vqf", ".oggvorbis", ".aac", ".ape"]:
+            raise RuntimeError(f"Extension {ext} is not supported yet.")
+
+        tmp_path = ""
+        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmpf:
+            tmpf.write(binary)
+            tmpf.flush()
+            tmp_path = os.path.abspath(tmpf.name)
+
         callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
         seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
-        ans = seq2txt_mdl.transcription(binary)
+        ans = seq2txt_mdl.transcription(tmp_path)
         callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
+
         tokenize(doc, ans, eng)
         return [doc]
     except Exception as e:
         callback(prog=-1, msg=str(e))
-
+    finally:
+        if tmp_path and os.path.exists(tmp_path):
+            try:
+                os.unlink(tmp_path)
+            except Exception:
+                pass
     return []
diff --git a/rag/llm/sequence2txt_model.py b/rag/llm/sequence2txt_model.py
index 27b83425f..95203cace 100644
--- a/rag/llm/sequence2txt_model.py
+++ b/rag/llm/sequence2txt_model.py
@@ -35,8 +35,9 @@ class Base(ABC):
         """
         pass
 
-    def transcription(self, audio, **kwargs):
-        transcription = self.client.audio.transcriptions.create(model=self.model_name, file=audio, response_format="text")
+    def transcription(self, audio_path, **kwargs):
+        audio_file = open(audio_path, "rb")
+        transcription = self.client.audio.transcriptions.create(model=self.model_name, file=audio_file)
         return transcription.text.strip(), num_tokens_from_string(transcription.text.strip())
 
     def audio2base64(self, audio):
@@ -50,7 +51,7 @@ class Base(ABC):
 class GPTSeq2txt(Base):
     _FACTORY_NAME = "OpenAI"
 
-    def __init__(self, key, model_name="whisper-1", base_url="https://api.openai.com/v1"):
+    def __init__(self, key, model_name="whisper-1", base_url="https://api.openai.com/v1", **kwargs):
         if not base_url:
             base_url = "https://api.openai.com/v1"
         self.client = OpenAI(api_key=key, base_url=base_url)
@@ -60,27 +61,38 @@ class GPTSeq2txt(Base):
 class QWenSeq2txt(Base):
     _FACTORY_NAME = "Tongyi-Qianwen"
 
-    def __init__(self, key, model_name="paraformer-realtime-8k-v1", **kwargs):
+    def __init__(self, key, model_name="qwen-audio-asr", **kwargs):
         import dashscope
 
         dashscope.api_key = key
         self.model_name = model_name
 
-    def transcription(self, audio, format):
-        from http import HTTPStatus
+    def transcription(self, audio_path):
+        if "paraformer" in self.model_name or "sensevoice" in self.model_name:
+            return f"**ERROR**: model {self.model_name} is not suppported yet.", 0
 
-        from dashscope.audio.asr import Recognition
+        from dashscope import MultiModalConversation
 
-        recognition = Recognition(model=self.model_name, format=format, sample_rate=16000, callback=None)
-        result = recognition.call(audio)
+        audio_path = f"file://{audio_path}"
+        messages = [
+            {
+                "role": "user",
+                "content": [{"audio": audio_path}],
+            }
+        ]
 
-        ans = ""
-        if result.status_code == HTTPStatus.OK:
-            for sentence in result.get_sentence():
-                ans += sentence.text.decode("utf-8") + "\n"
-            return ans, num_tokens_from_string(ans)
-
-        return "**ERROR**: " + result.message, 0
+        response = None
+        full_content = ""
+        try:
+            response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages, result_format="message", stream=True)
+            for response in response:
+                try:
+                    full_content += response["output"]["choices"][0]["message"].content[0]["text"]
+                except Exception:
+                    pass
+            return full_content, num_tokens_from_string(full_content)
+        except Exception as e:
+            return "**ERROR**: " + str(e), 0
 
 
 class AzureSeq2txt(Base):
@@ -212,6 +224,7 @@ class GiteeSeq2txt(Base):
         self.client = OpenAI(api_key=key, base_url=base_url)
         self.model_name = model_name
 
+
 class DeepInfraSeq2txt(Base):
     _FACTORY_NAME = "DeepInfra"
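
Illustration only, not part of the patch: a minimal sketch of how the reworked speech-to-text path is expected to be driven after this change, mirroring the temp-file handling added in rag/app/audio.py and the new path-based transcription() signature in rag/llm/sequence2txt_model.py. The API key, audio bytes, helper name transcribe_bytes, and file extension below are placeholder assumptions, not code from this PR.

    # Hypothetical usage sketch: exercise the new path-based transcription API.
    import os
    import tempfile

    from rag.llm.sequence2txt_model import QWenSeq2txt


    def transcribe_bytes(api_key: str, binary: bytes, ext: str = ".mp3") -> str:
        # Mirrors rag/app/audio.py: persist the upload to a temp file, then pass
        # the file *path* (not the raw bytes) to the seq2txt model.
        mdl = QWenSeq2txt(key=api_key, model_name="qwen-audio-asr")  # new default model
        tmp_path = ""
        try:
            with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmpf:
                tmpf.write(binary)
                tmpf.flush()
                tmp_path = tmpf.name
            text, _num_tokens = mdl.transcription(tmp_path)  # returns (text, token_count)
            return text
        finally:
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)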