From de4f75dcd8236640ca6c3dfbe954b1dda7f8aadc Mon Sep 17 00:00:00 2001
From: Kevin Hu <kevinhu.sh@gmail.com>
Date: Thu, 23 Oct 2025 09:24:16 +0800
Subject: [PATCH] Fix: add video parser (#10735)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 api/apps/kb_app.py        |  1 +
 rag/flow/parser/parser.py | 28 +++++++++++++++++++++++++---
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py
index e168f9d1f..07c860e07 100644
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@@ -70,6 +70,7 @@ def create():
         e, t = TenantService.get_by_id(current_user.id)
         if not e:
             return get_data_error_result(message="Tenant not found.")
+        req["embd_id"] = t.embd_id
         req["parser_config"] = {
             "layout_recognize": "DeepDOC",
             "chunk_token_num": 512,
diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py
index 0784ce6fa..1cbca83e4 100644
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -140,7 +140,13 @@ class ParserParam(ProcessParamBase):
                 ],
                 "output_format": "json",
             },
-            "video": {},
+            "video": {
+                "suffix":[
+                    "mp4",
+                    "av"
+                ],
+                "output_format": "json",
+            },
         }
 
     def check(self):
@@ -185,6 +191,10 @@ class ParserParam(ProcessParamBase):
         if audio_config:
             self.check_empty(audio_config.get("llm_id"), "Audio VLM")
 
+        video_config = self.setups.get("video", "")
+        if video_config:
+            self.check_empty(video_config.get("llm_id"), "Video VLM")
+
         email_config = self.setups.get("email", "")
         if email_config:
             email_output_format = email_config.get("output_format", "")
@@ -212,8 +222,8 @@ class Parser(ProcessBase):
             lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
             bboxes = []
             for t, poss in lines:
-                pn, x0, x1, top, bott = poss.split(" ")
-                bboxes.append({"page_number": int(pn), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
+                for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
+                    bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
 
         if conf.get("output_format") == "json":
             self.set_output("json", bboxes)
@@ -357,6 +367,17 @@ class Parser(ProcessBase):
 
             self.set_output("text", txt)
 
+    def _video(self, name, blob):
+        """Send the raw video bytes to the canvas tenant's IMAGE2TEXT model and publish the reply as the "text" output."""
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a video.")
+        conf = self._param.setups["video"]
+        self.set_output("output_format", conf["output_format"])
+        # NOTE(review): check() requires setups["video"]["llm_id"], but it is not used here - confirm which model is intended.
+        cv_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT)
+        txt = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=blob, filename=name)
+        # NOTE(review): only "text" is set, although the default output_format for video is "json" - confirm downstream.
+        self.set_output("text", txt)
+
     def _email(self, name, blob):
         self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.")
 
@@ -483,6 +504,7 @@ class Parser(ProcessBase):
             "word": self._word,
             "image": self._image,
             "audio": self._audio,
+            "video": self._video,
             "email": self._email,
         }
         try: