Add ParserType Audio (#1637)

### What problem does this PR solve?

#1514

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-01-30 07:06:39 +08:00 · 2024-07-22 19:17:30 +08:00
parent 9f109adf28
commit ac7a0d4fbf
10 changed files with 80 additions and 8 deletions
--- a/api/apps/api_app.py
+++ b/api/apps/api_app.py
@ -335,6 +335,8 @@ def upload():
                doc["parser_id"] = request.form.get("parser_id").strip()
        if doc["type"] == FileType.VISUAL:
            doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
        if re.search(r"\.(ppt|pptx|pages)$", filename):
            doc["parser_id"] = ParserType.PRESENTATION.value

@ -581,4 +583,4 @@ def completion_faq():
        return response

    except Exception as e:
-        return server_error_response(e)
+        return server_error_response(e)
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@ -377,6 +377,8 @@ def upload_documents(dataset_id):
            }
            if doc["type"] == FileType.VISUAL:
                doc["parser_id"] = ParserType.PICTURE.value
+            if doc["type"] == FileType.AURAL:
+                doc["parser_id"] = ParserType.AUDIO.value
            if re.search(r"\.(ppt|pptx|pages)$", filename):
                doc["parser_id"] = ParserType.PRESENTATION.value
            DocumentService.insert(doc)
@ -648,6 +650,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
            resume.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
        case "table":
            table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "audio":
+            audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
        case _:
            return False

--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@ -105,6 +105,8 @@ def upload():
            }
            if doc["type"] == FileType.VISUAL:
                doc["parser_id"] = ParserType.PICTURE.value
+            if doc["type"] == FileType.AURAL:
+                doc["parser_id"] = ParserType.AUDIO.value
            if re.search(r"\.(ppt|pptx|pages)$", filename):
                doc["parser_id"] = ParserType.PRESENTATION.value
            DocumentService.insert(doc)
@ -171,6 +173,8 @@ def web_crawl():
        }
        if doc["type"] == FileType.VISUAL:
            doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
        if re.search(r"\.(ppt|pptx|pages)$", filename):
            doc["parser_id"] = ParserType.PRESENTATION.value
        DocumentService.insert(doc)
--- a/api/db/init.py
+++ b/api/db/init.py
@ -84,6 +84,7 @@ class ParserType(StrEnum):
    NAIVE = "naive"
    PICTURE = "picture"
    ONE = "one"
+    AUDIO = "audio"


 class FileSource(StrEnum):
@ -96,4 +97,4 @@ class CanvasType(StrEnum):
    ChatBot = "chatbot"
    DocBot = "docbot"

-KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
+KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
--- a/api/db/init_data.py
+++ b/api/db/init_data.py
@ -121,6 +121,8 @@ def init_llm_factory():
    LLMFactoriesService.filter_delete([LLMFactoriesService.model.name == "QAnything"])
    LLMService.filter_delete([LLMService.model.fid == "QAnything"])
    TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
+    TenantService.filter_update([1 == 1], {
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio"})
    ## insert openai two embedding models to the current openai user.
    print("Start to insert 2 OpenAI embedding models...")
    tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
@ -143,7 +145,7 @@ def init_llm_factory():
    """
    drop table llm;
    drop table llm_factories;
-    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
+    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio';
    alter table knowledgebase modify avatar longtext;
    alter table user modify avatar longtext;
    alter table dialog modify icon longtext;
--- a/api/db/services/llm_service.py
+++ b/api/db/services/llm_service.py
@ -15,7 +15,7 @@
 #
 from api.db.services.user_service import TenantService
 from api.settings import database_logger
-from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel
+from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel
 from api.db import LLMType
 from api.db.db_models import DB, UserTenant
 from api.db.db_models import LLMFactories, LLM, TenantLLM
@ -120,6 +120,14 @@ class TenantLLMService(CommonService):
            return ChatModel[model_config["llm_factory"]](
                model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])

+        if llm_type == LLMType.SPEECH2TEXT:
+            if model_config["llm_factory"] not in Seq2txtModel:
+                return
+            return Seq2txtModel[model_config["llm_factory"]](
+                model_config["api_key"], model_config["llm_name"], lang,
+                base_url=model_config["api_base"]
+            )
+
    @classmethod
    @DB.connection_context()
    def increase_usage(cls, tenant_id, llm_type, used_tokens, llm_name=None):
@ -207,6 +215,14 @@ class LLMBundle(object):
                "Can't update token usage for {}/IMAGE2TEXT".format(self.tenant_id))
        return txt

+    def transcription(self, audio):
+        txt, used_tokens = self.mdl.transcription(audio)
+        if not TenantLLMService.increase_usage(
+                self.tenant_id, self.llm_type, used_tokens):
+            database_logger.error(
+                "Can't update token usage for {}/SEQUENCE2TXT".format(self.tenant_id))
+        return txt
+
    def chat(self, system, history, gen_conf):
        txt, used_tokens = self.mdl.chat(system, history, gen_conf)
        if not TenantLLMService.increase_usage(
--- a/api/settings.py
+++ b/api/settings.py
@ -131,7 +131,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 API_KEY = LLM.get("api_key", "")
 PARSERS = LLM.get(
    "parsers",
-    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One")
+    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio")

 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)