Add ParserType Audio (#1637)

### What problem does this PR solve?

#1514

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-02-01 08:05:07 +08:00 · 2024-07-22 19:17:30 +08:00
parent 9f109adf28
commit ac7a0d4fbf
10 changed files with 80 additions and 8 deletions
--- a/rag/app/audio.py
+++ b/rag/app/audio.py
@ -0,0 +1,42 @@
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import io
+import re
+import numpy as np
+
+from api.db import LLMType
+from rag.nlp import rag_tokenizer
+from api.db.services.llm_service import LLMBundle
+from rag.nlp import tokenize
+
+
+def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
+    """Transcribe an audio file and return it as a single tokenized chunk.
+
+    Args:
+        filename: Name of the audio file. Stored as ``docnm_kwd``; with a
+            trailing alphabetic extension stripped it is tokenized into the
+            title fields.
+        binary: Audio content, handed unchanged to the speech-to-text model.
+        tenant_id: Passed to ``LLMBundle`` together with
+            ``LLMType.SPEECH2TEXT`` to obtain the transcription model.
+        lang: Language name. Forwarded to ``LLMBundle``; the value
+            "english" (case-insensitive) sets the flag given to ``tokenize``.
+        callback: Optional progress reporter, invoked both positionally as
+            ``callback(prog, msg)`` and by keyword as
+            ``callback(prog=..., msg=...)``. When omitted, progress reports
+            are silently dropped.
+        **kwargs: Accepted but not used by this parser.
+
+    Returns:
+        ``[doc]`` on success. On any exception the error text is reported
+        through ``callback(prog=-1, msg=...)`` and an empty list is returned.
+    """
+    if callback is None:
+        # The signature declares the callback optional, so a call without
+        # one must not fail with "'NoneType' object is not callable".
+        def callback(prog=None, msg=""):
+            pass
+
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+
+    # The English flag is derived from the requested language name only.
+    eng = lang.lower() == "english"
+    try:
+        callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
+        seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
+        ans = seq2txt_mdl.transcription(binary)
+        callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
+        tokenize(doc, ans, eng)
+        return [doc]
+    except Exception as e:
+        # Best-effort boundary: report the failure as progress -1 instead of
+        # propagating it, then fall through to the empty result.
+        callback(prog=-1, msg=str(e))
+
+    return []
--- a/rag/app/picture.py
+++ b/rag/app/picture.py
@ -42,7 +42,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
        callback(0.4, "Use CV LLM to describe the picture.")
        cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
        ans = cv_mdl.describe(binary)
-        callback(0.8, "CV LLM respoond: %s ..." % ans[:32])
+        callback(0.8, "CV LLM respond: %s ..." % ans[:32])
        txt += "\n" + ans
        tokenize(doc, txt, eng)
        return [doc]
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
 from io import BytesIO
 import pandas as pd

-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio

 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@ -68,6 +68,7 @@ FACTORY = {
    ParserType.RESUME.value: resume,
    ParserType.PICTURE.value: picture,
    ParserType.ONE.value: one,
+    ParserType.AUDIO.value: audio
 }