Add ParserType Audio (#1637)

### What problem does this PR solve?

#1514 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
H
2024-07-22 19:17:30 +08:00
committed by GitHub
parent 9f109adf28
commit ac7a0d4fbf
10 changed files with 80 additions and 8 deletions

42
rag/app/audio.py Normal file
View File

@ -0,0 +1,42 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import io
import re
import numpy as np
from api.db import LLMType
from rag.nlp import rag_tokenizer
from api.db.services.llm_service import LLMBundle
from rag.nlp import tokenize
def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    """Transcribe an audio file and return the transcript as a single chunk.

    Args:
        filename: Name of the audio file. Stored under ``docnm_kwd``; with a
            trailing alphabetic extension removed it is also tokenized into
            the title fields.
        binary: Audio payload, handed unchanged to the speech-to-text model.
        tenant_id: Passed to ``LLMBundle`` together with
            ``LLMType.SPEECH2TEXT`` to obtain the transcription model.
        lang: Language name. Forwarded to ``LLMBundle`` and compared
            case-insensitively with ``"english"`` to set the English flag
            given to ``tokenize``.
        callback: Optional progress reporter, invoked positionally as
            ``callback(progress, message)`` and, on failure, as
            ``callback(prog=-1, msg=...)``. When omitted, progress reports
            are discarded.
        **kwargs: Accepted and ignored.

    Returns:
        ``[doc]`` with the populated document dict on success, or ``[]`` when
        transcription or tokenization raises.
    """
    if callback is None:
        # The signature advertises callback as optional, so fall back to a
        # reporter that accepts both call styles used below and does nothing.
        def callback(prog=None, msg=""):
            return None

    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

    # The English flag comes from the declared language, not from the content.
    eng = lang.lower() == "english"
    try:
        callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
        seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
        ans = seq2txt_mdl.transcription(binary)
        callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
        # tokenize's return value is not used; presumably it fills `doc` in
        # place with the transcript fields -- confirm against rag.nlp.tokenize.
        tokenize(doc, ans, eng)
        return [doc]
    except Exception as e:
        # Best-effort boundary: report the failure through the progress
        # channel (prog=-1) and yield no chunks instead of propagating.
        callback(prog=-1, msg=str(e))

    return []

View File

@ -42,7 +42,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
callback(0.4, "Use CV LLM to describe the picture.")
cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
ans = cv_mdl.describe(binary)
callback(0.8, "CV LLM respoond: %s ..." % ans[:32])
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans
tokenize(doc, txt, eng)
return [doc]

View File

@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
from io import BytesIO
import pandas as pd
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio
from api.db import LLMType, ParserType
from api.db.services.document_service import DocumentService
@ -68,6 +68,7 @@ FACTORY = {
ParserType.RESUME.value: resume,
ParserType.PICTURE.value: picture,
ParserType.ONE.value: one,
ParserType.AUDIO.value: audio
}