Feat: Qwen-VL series supports video parsing (#10676)

### What problem does this PR solve? Qwen-VL series supports video parsing. #10617. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-01-30 15:16:45 +08:00 · 2025-10-21 09:36:13 +08:00
parent 5b2e5dd334
commit aaa4776657
1 changed files with 59 additions and 2 deletions
--- a/rag/llm/cv_model.py
+++ b/rag/llm/cv_model.py
@ -13,13 +13,16 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+
 import base64
 import json
 import os
+import tempfile
 import logging
 from abc import ABC
 from copy import deepcopy
 from io import BytesIO
+from pathlib import Path
 from urllib.parse import urljoin
 import requests
 from openai import OpenAI
@ -171,6 +174,7 @@ class GptV4(Base):
    def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese", base_url="https://api.openai.com/v1", **kwargs):
        if not base_url:
            base_url = "https://api.openai.com/v1"
+        self.api_key = key
        self.client = OpenAI(api_key=key, base_url=base_url)
        self.model_name = model_name
        self.lang = lang
@ -224,6 +228,61 @@ class QWenCV(GptV4):
            base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
        super().__init__(key, model_name, lang=lang, base_url=base_url, **kwargs)

+    def chat(self, system, history, gen_conf, images=[], video_bytes=None, filename=""):
+        if video_bytes:
+            try:
+                summary, summary_num_tokens = self._process_video(video_bytes, filename)
+                return summary, summary_num_tokens
+            except Exception as e:
+                return "**ERROR**: " + str(e), 0
+
+        return "**ERROR**: Method chat not supported yet.", 0
+
+    def _process_video(self, video_bytes, filename):
+        from dashscope import MultiModalConversation
+
+        video_suffix = Path(filename).suffix or ".mp4"
+        with tempfile.NamedTemporaryFile(delete=False, suffix=video_suffix) as tmp:
+            tmp.write(video_bytes)
+            tmp_path = tmp.name
+
+        video_path = f"file://{tmp_path}"
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "video": video_path,
+                        "fps": 2,
+                    },
+                    {
+                        "text": "Please summarize this video in proper sentences.",
+                    },
+                ],
+            }
+        ]
+
+        def call_api():
+            response = MultiModalConversation.call(
+                api_key=self.api_key,
+                model=self.model_name,
+                messages=messages,
+            )
+            summary = response["output"]["choices"][0]["message"].content[0]["text"]
+            return summary, num_tokens_from_string(summary)
+
+        try:
+            return call_api()
+        except Exception as e1:
+            import dashscope
+
+            dashscope.base_http_api_url = "https://dashscope-intl.aliyuncs.com/api/v1"
+            try:
+                return call_api()
+            except Exception as e2:
+                raise RuntimeError(f"Both default and intl endpoint failed.\nFirst error: {e1}\nSecond error: {e2}")
+
+

 class HunyuanCV(GptV4):
    _FACTORY_NAME = "Tencent Hunyuan"
@ -616,8 +675,6 @@ class GeminiCV(Base):
    def _process_video(self, video_bytes, filename):
        from google import genai
        from google.genai import types
-        import tempfile
-        from pathlib import Path

        video_size_mb = len(video_bytes) / (1024 * 1024)
        client = genai.Client(api_key=self.api_key)