mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat/parse audio (#10133)
### What problem does this PR solve?
Adds audio parsing support to the dataflow parser, and fixes GiteeAI's sequence2text (speech-to-text) model.
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -74,10 +74,10 @@ class RAGFlowPdfParser:
|
|||||||
recognizer_domain = "layout"
|
recognizer_domain = "layout"
|
||||||
|
|
||||||
if layout_recognizer_type == "ascend":
|
if layout_recognizer_type == "ascend":
|
||||||
logging.debug("Using Ascend LayoutRecognizer", flush=True)
|
logging.debug("Using Ascend LayoutRecognizer")
|
||||||
self.layouter = AscendLayoutRecognizer(recognizer_domain)
|
self.layouter = AscendLayoutRecognizer(recognizer_domain)
|
||||||
else: # onnx
|
else: # onnx
|
||||||
logging.debug("Using Onnx LayoutRecognizer", flush=True)
|
logging.debug("Using Onnx LayoutRecognizer")
|
||||||
self.layouter = LayoutRecognizer(recognizer_domain)
|
self.layouter = LayoutRecognizer(recognizer_domain)
|
||||||
self.tbl_det = TableStructureRecognizer()
|
self.tbl_det = TableStructureRecognizer()
|
||||||
|
|
||||||
|
|||||||
@ -54,7 +54,9 @@ class ParserParam(ProcessParamBase):
|
|||||||
"text",
|
"text",
|
||||||
"json"
|
"json"
|
||||||
],
|
],
|
||||||
"audio": [],
|
"audio": [
|
||||||
|
"json"
|
||||||
|
],
|
||||||
"video": [],
|
"video": [],
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -102,7 +104,26 @@ class ParserParam(ProcessParamBase):
|
|||||||
],
|
],
|
||||||
"output_format": "json",
|
"output_format": "json",
|
||||||
},
|
},
|
||||||
"audio": {},
|
"audio": {
|
||||||
|
"suffix":[
|
||||||
|
"da",
|
||||||
|
"wave",
|
||||||
|
"wav",
|
||||||
|
"mp3",
|
||||||
|
"aac",
|
||||||
|
"flac",
|
||||||
|
"ogg",
|
||||||
|
"aiff",
|
||||||
|
"au",
|
||||||
|
"midi",
|
||||||
|
"wma",
|
||||||
|
"realaudio",
|
||||||
|
"vqf",
|
||||||
|
"oggvorbis",
|
||||||
|
"ape"
|
||||||
|
],
|
||||||
|
"output_format": "json",
|
||||||
|
},
|
||||||
"video": {},
|
"video": {},
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -146,6 +167,12 @@ class ParserParam(ProcessParamBase):
|
|||||||
text_output_format = text_config.get("output_format", "")
|
text_output_format = text_config.get("output_format", "")
|
||||||
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])
|
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])
|
||||||
|
|
||||||
|
audio_config = self.setups.get("audio", "")
|
||||||
|
if audio_config:
|
||||||
|
self.check_empty(audio_config.get("llm_id"), "VLM")
|
||||||
|
audio_language = audio_config.get("lang", "")
|
||||||
|
self.check_empty(audio_language, "Language")
|
||||||
|
|
||||||
def get_input_form(self) -> dict[str, dict]:
|
def get_input_form(self) -> dict[str, dict]:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
@ -313,6 +340,30 @@ class Parser(ProcessBase):
|
|||||||
|
|
||||||
self.set_output("text", txt)
|
self.set_output("text", txt)
|
||||||
|
|
||||||
|
def _audio(self, from_upstream: ParserFromUpstream):
    """Transcribe an upstream audio blob to text with a speech-to-text model.

    The audio bytes are spooled to a temporary file (keeping the original
    file extension) because the model's ``transcription`` call is given a
    filesystem path rather than the bytes themselves.

    Args:
        from_upstream: Upstream payload. Its ``blob`` (bytes-like audio
            content) and ``name`` (file name, used only for its extension)
            are read.

    Side effects:
        Reports progress through ``self.callback`` and publishes the
        ``output_format`` and ``text`` outputs through ``self.set_output``.

    Raises:
        KeyError: If the ``audio`` setup lacks ``output_format`` or ``lang``.
    """
    import os
    import tempfile

    self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")

    blob = from_upstream.blob
    name = from_upstream.name
    conf = self._param.setups["audio"]
    self.set_output("output_format", conf["output_format"])

    lang = conf["lang"]
    _, ext = os.path.splitext(name)
    with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
        tmpf.write(blob)
        # Flush so the bytes are on disk before the model opens the path.
        tmpf.flush()
        tmp_path = os.path.abspath(tmpf.name)

        # NamedTemporaryFile removes the file when the context exits, so the
        # transcription must run while the file is still open.
        seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, lang=lang)
        txt = seq2txt_mdl.transcription(tmp_path)

    self.set_output("text", txt)
|
||||||
|
|
||||||
async def _invoke(self, **kwargs):
|
async def _invoke(self, **kwargs):
|
||||||
function_map = {
|
function_map = {
|
||||||
"pdf": self._pdf,
|
"pdf": self._pdf,
|
||||||
@ -321,6 +372,7 @@ class Parser(ProcessBase):
|
|||||||
"word": self._word,
|
"word": self._word,
|
||||||
"text": self._text,
|
"text": self._text,
|
||||||
"image": self._image,
|
"image": self._image,
|
||||||
|
"audio": self._audio,
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
from_upstream = ParserFromUpstream.model_validate(kwargs)
|
from_upstream = ParserFromUpstream.model_validate(kwargs)
|
||||||
|
|||||||
@ -60,6 +60,28 @@
|
|||||||
"gif"
|
"gif"
|
||||||
],
|
],
|
||||||
"output_format": "text"
|
"output_format": "text"
|
||||||
|
},
|
||||||
|
"audio": {
|
||||||
|
"suffix": [
|
||||||
|
"da",
|
||||||
|
"wave",
|
||||||
|
"wav",
|
||||||
|
"mp3",
|
||||||
|
"aac",
|
||||||
|
"flac",
|
||||||
|
"ogg",
|
||||||
|
"aiff",
|
||||||
|
"au",
|
||||||
|
"midi",
|
||||||
|
"wma",
|
||||||
|
"realaudio",
|
||||||
|
"vqf",
|
||||||
|
"oggvorbis",
|
||||||
|
"ape"
|
||||||
|
],
|
||||||
|
"lang": "Chinese",
|
||||||
|
"llm_id": "SenseVoiceSmall",
|
||||||
|
"output_format": "json"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -218,7 +218,7 @@ class GPUStackSeq2txt(Base):
|
|||||||
class GiteeSeq2txt(Base):
|
class GiteeSeq2txt(Base):
|
||||||
_FACTORY_NAME = "GiteeAI"
|
_FACTORY_NAME = "GiteeAI"
|
||||||
|
|
||||||
def __init__(self, key, model_name="whisper-1", base_url="https://ai.gitee.com/v1/"):
|
def __init__(self, key, model_name="whisper-1", base_url="https://ai.gitee.com/v1/", **kwargs):
|
||||||
if not base_url:
|
if not base_url:
|
||||||
base_url = "https://ai.gitee.com/v1/"
|
base_url = "https://ai.gitee.com/v1/"
|
||||||
self.client = OpenAI(api_key=key, base_url=base_url)
|
self.client = OpenAI(api_key=key, base_url=base_url)
|
||||||
|
|||||||
Reference in New Issue
Block a user