Feat/parse audio (#10133)

### What problem does this PR solve?

Dataflow support audio.  And fix giteeAI's sequence2text model. 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Lynn
2025-09-18 09:31:32 +08:00
committed by GitHub
parent f3738b06f1
commit d353f7f7f8
4 changed files with 79 additions and 5 deletions

View File

@ -74,10 +74,10 @@ class RAGFlowPdfParser:
recognizer_domain = "layout"
if layout_recognizer_type == "ascend":
logging.debug("Using Ascend LayoutRecognizer", flush=True)
logging.debug("Using Ascend LayoutRecognizer")
self.layouter = AscendLayoutRecognizer(recognizer_domain)
else: # onnx
logging.debug("Using Onnx LayoutRecognizer", flush=True)
logging.debug("Using Onnx LayoutRecognizer")
self.layouter = LayoutRecognizer(recognizer_domain)
self.tbl_det = TableStructureRecognizer()

View File

@ -54,7 +54,9 @@ class ParserParam(ProcessParamBase):
"text",
"json"
],
"audio": [],
"audio": [
"json"
],
"video": [],
}
@ -102,7 +104,26 @@ class ParserParam(ProcessParamBase):
],
"output_format": "json",
},
"audio": {},
"audio": {
"suffix":[
"da",
"wave",
"wav",
"mp3",
"aac",
"flac",
"ogg",
"aiff",
"au",
"midi",
"wma",
"realaudio",
"vqf",
"oggvorbis",
"ape"
],
"output_format": "json",
},
"video": {},
}
@ -146,6 +167,12 @@ class ParserParam(ProcessParamBase):
text_output_format = text_config.get("output_format", "")
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])
audio_config = self.setups.get("audio", "")
if audio_config:
self.check_empty(audio_config.get("llm_id"), "VLM")
audio_language = audio_config.get("lang", "")
self.check_empty(audio_language, "Language")
def get_input_form(self) -> dict[str, dict]:
return {}
@ -313,6 +340,30 @@ class Parser(ProcessBase):
self.set_output("text", txt)
def _audio(self, from_upstream: ParserFromUpstream):
import os
import tempfile
self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")
blob = from_upstream.blob
name = from_upstream.name
conf = self._param.setups["audio"]
self.set_output("output_format", conf["output_format"])
lang = conf["lang"]
_, ext = os.path.splitext(name)
tmp_path = ""
with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
tmpf.write(blob)
tmpf.flush()
tmp_path = os.path.abspath(tmpf.name)
seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, lang=lang)
txt = seq2txt_mdl.transcription(tmp_path)
self.set_output("text", txt)
async def _invoke(self, **kwargs):
function_map = {
"pdf": self._pdf,
@ -321,6 +372,7 @@ class Parser(ProcessBase):
"word": self._word,
"text": self._text,
"image": self._image,
"audio": self._audio,
}
try:
from_upstream = ParserFromUpstream.model_validate(kwargs)

View File

@ -60,6 +60,28 @@
"gif"
],
"output_format": "text"
},
"audio": {
"suffix": [
"da",
"wave",
"wav",
"mp3",
"aac",
"flac",
"ogg",
"aiff",
"au",
"midi",
"wma",
"realaudio",
"vqf",
"oggvorbis",
"ape"
],
"lang": "Chinese",
"llm_id": "SenseVoiceSmall",
"output_format": "json"
}
}
}

View File

@ -218,7 +218,7 @@ class GPUStackSeq2txt(Base):
class GiteeSeq2txt(Base):
_FACTORY_NAME = "GiteeAI"
def __init__(self, key, model_name="whisper-1", base_url="https://ai.gitee.com/v1/"):
def __init__(self, key, model_name="whisper-1", base_url="https://ai.gitee.com/v1/", **kwargs):
if not base_url:
base_url = "https://ai.gitee.com/v1/"
self.client = OpenAI(api_key=key, base_url=base_url)