Add ParsertType Audio (#1637)

### What problem does this PR solve?

#1514 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
H
2024-07-22 19:17:30 +08:00
committed by GitHub
parent 9f109adf28
commit ac7a0d4fbf
10 changed files with 80 additions and 8 deletions

View File

@ -84,6 +84,7 @@ class ParserType(StrEnum):
NAIVE = "naive"
PICTURE = "picture"
ONE = "one"
AUDIO = "audio"
class FileSource(StrEnum):
@ -96,4 +97,4 @@ class CanvasType(StrEnum):
ChatBot = "chatbot"
DocBot = "docbot"
KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"

View File

@ -121,6 +121,8 @@ def init_llm_factory():
LLMFactoriesService.filter_delete([LLMFactoriesService.model.name == "QAnything"])
LLMService.filter_delete([LLMService.model.fid == "QAnything"])
TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
TenantService.filter_update([1 == 1], {
"parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio"})
## insert openai two embedding models to the current openai user.
print("Start to insert 2 OpenAI embedding models...")
tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
@ -143,7 +145,7 @@ def init_llm_factory():
"""
drop table llm;
drop table llm_factories;
update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio';
alter table knowledgebase modify avatar longtext;
alter table user modify avatar longtext;
alter table dialog modify icon longtext;

View File

@ -15,7 +15,7 @@
#
from api.db.services.user_service import TenantService
from api.settings import database_logger
from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel
from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel
from api.db import LLMType
from api.db.db_models import DB, UserTenant
from api.db.db_models import LLMFactories, LLM, TenantLLM
@ -120,6 +120,14 @@ class TenantLLMService(CommonService):
return ChatModel[model_config["llm_factory"]](
model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])
if llm_type == LLMType.SPEECH2TEXT:
if model_config["llm_factory"] not in Seq2txtModel:
return
return Seq2txtModel[model_config["llm_factory"]](
model_config["api_key"], model_config["llm_name"], lang,
base_url=model_config["api_base"]
)
@classmethod
@DB.connection_context()
def increase_usage(cls, tenant_id, llm_type, used_tokens, llm_name=None):
@ -207,6 +215,14 @@ class LLMBundle(object):
"Can't update token usage for {}/IMAGE2TEXT".format(self.tenant_id))
return txt
def transcription(self, audio):
txt, used_tokens = self.mdl.transcription(audio)
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens):
database_logger.error(
"Can't update token usage for {}/SEQUENCE2TXT".format(self.tenant_id))
return txt
def chat(self, system, history, gen_conf):
txt, used_tokens = self.mdl.chat(system, history, gen_conf)
if not TenantLLMService.increase_usage(