From de4f75dcd8236640ca6c3dfbe954b1dda7f8aadc Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Thu, 23 Oct 2025 09:24:16 +0800 Subject: [PATCH] Fix: add video parser (#10735) ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/kb_app.py | 1 + rag/flow/parser/parser.py | 28 +++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py index e168f9d1f..07c860e07 100644 --- a/api/apps/kb_app.py +++ b/api/apps/kb_app.py @@ -70,6 +70,7 @@ def create(): e, t = TenantService.get_by_id(current_user.id) if not e: return get_data_error_result(message="Tenant not found.") + req["embd_id"] = t.embd_id req["parser_config"] = { "layout_recognize": "DeepDOC", "chunk_token_num": 512, diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 0784ce6fa..1cbca83e4 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -140,7 +140,13 @@ class ParserParam(ProcessParamBase): ], "output_format": "json", }, - "video": {}, + "video": { + "suffix":[ + "mp4", + "avi" + ], + "output_format": "json", + }, } def check(self): @@ -185,6 +191,10 @@ class ParserParam(ProcessParamBase): if audio_config: self.check_empty(audio_config.get("llm_id"), "Audio VLM") + video_config = self.setups.get("video", "") + if video_config: + self.check_empty(video_config.get("llm_id"), "Video VLM") + email_config = self.setups.get("email", "") if email_config: email_output_format = email_config.get("output_format", "") @@ -212,8 +222,8 @@ class Parser(ProcessBase): lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback) bboxes = [] for t, poss in lines: - pn, x0, x1, top, bott = poss.split(" ") - bboxes.append({"page_number": int(pn), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t}) + for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss): + bboxes.append({"page_number": int(pn[0]), "x0": float(x0), 
"x1": float(x1), "top": float(top), "bottom": float(bott), "text": t}) if conf.get("output_format") == "json": self.set_output("json", bboxes) @@ -357,6 +367,17 @@ class Parser(ProcessBase): self.set_output("text", txt) + def _video(self, name, blob): + self.callback(random.randint(1, 5) / 100.0, "Start to work on a video.") + + conf = self._param.setups["video"] + self.set_output("output_format", conf["output_format"]) + + cv_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT) + txt = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=blob, filename=name) + + self.set_output("text", txt) + def _email(self, name, blob): self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.") @@ -483,6 +504,7 @@ class Parser(ProcessBase): "word": self._word, "image": self._image, "audio": self._audio, + "video": self._video, "email": self._email, } try: