Feat: Qwen-VL series supports video parsing (#10676)

### What problem does this PR solve? Qwen-VL series supports video parsing. #10617. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-02-03 00:55:10 +08:00 · 2025-10-21 09:36:13 +08:00
parent 5b2e5dd334
commit aaa4776657
1 changed files with 59 additions and 2 deletions
--- a/rag/llm/cv_model.py
+++ b/rag/llm/cv_model.py
@ -13,13 +13,16 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
 import base64
 import json
 import os
 import tempfile
 import logging
 from abc import ABC
 from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from urllib.parse import urljoin
 import requests
 from openai import OpenAI
@ -171,6 +174,7 @@ class GptV4(Base):
    def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese", base_url="https://api.openai.com/v1", **kwargs):
        if not base_url:
            base_url = "https://api.openai.com/v1"
        self.api_key = key
        self.client = OpenAI(api_key=key, base_url=base_url)
        self.model_name = model_name
        self.lang = lang
@ -224,6 +228,61 @@ class QWenCV(GptV4):
            base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
        super().__init__(key, model_name, lang=lang, base_url=base_url, **kwargs)
    def chat(self, system, history, gen_conf, images=[], video_bytes=None, filename=""):
        if video_bytes:
            try:
                summary, summary_num_tokens = self._process_video(video_bytes, filename)
                return summary, summary_num_tokens
            except Exception as e:
                return "**ERROR**: " + str(e), 0
        return "**ERROR**: Method chat not supported yet.", 0
    def _process_video(self, video_bytes, filename):
        from dashscope import MultiModalConversation
        video_suffix = Path(filename).suffix or ".mp4"
        with tempfile.NamedTemporaryFile(delete=False, suffix=video_suffix) as tmp:
            tmp.write(video_bytes)
            tmp_path = tmp.name
        video_path = f"file://{tmp_path}"
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "video": video_path,
                        "fps": 2,
                    },
                    {
                        "text": "Please summarize this video in proper sentences.",
                    },
                ],
            }
        ]
        def call_api():
            response = MultiModalConversation.call(
                api_key=self.api_key,
                model=self.model_name,
                messages=messages,
            )
            summary = response["output"]["choices"][0]["message"].content[0]["text"]
            return summary, num_tokens_from_string(summary)
        try:
            return call_api()
        except Exception as e1:
            import dashscope
            dashscope.base_http_api_url = "https://dashscope-intl.aliyuncs.com/api/v1"
            try:
                return call_api()
            except Exception as e2:
                raise RuntimeError(f"Both default and intl endpoint failed.\nFirst error: {e1}\nSecond error: {e2}")
 class HunyuanCV(GptV4):
    _FACTORY_NAME = "Tencent Hunyuan"
@ -616,8 +675,6 @@ class GeminiCV(Base):
    def _process_video(self, video_bytes, filename):
        from google import genai
        from google.genai import types
        import tempfile
        from pathlib import Path
        video_size_mb = len(video_bytes) / (1024 * 1024)
        client = genai.Client(api_key=self.api_key)