Mirror of https://github.com/infiniflow/ragflow.git, synced 2025-12-08 20:42:30 +08:00
Feat: Qwen-VL series supports video parsing (#10676)
### What problem does this PR solve?

Qwen-VL series supports video parsing. #10617.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
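For orientation, here is a minimal sketch of how the new video path could be exercised once this PR is merged. The module path, API key, and model name are illustrative assumptions, not taken from this diff; only the `chat()` signature and return shape come from the code below.

```python
# Illustrative sketch only: the module path and model name are assumptions.
from rag.llm.cv_model import QWenCV  # assumed location of the class in this repo

cv = QWenCV(key="YOUR_DASHSCOPE_API_KEY", model_name="qwen-vl-max", lang="English")

with open("demo.mp4", "rb") as f:
    video_bytes = f.read()

# When video_bytes is provided, chat() delegates to _process_video(), which
# sends the clip to DashScope and returns (summary_text, token_estimate).
summary, num_tokens = cv.chat(
    system="",
    history=[],
    gen_conf={},
    video_bytes=video_bytes,
    filename="demo.mp4",
)
print(summary)
```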
@@ -13,13 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

import base64
import json
import os
import tempfile
import logging
from abc import ABC
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from urllib.parse import urljoin

import requests
from openai import OpenAI
@@ -171,6 +174,7 @@ class GptV4(Base):
    def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese", base_url="https://api.openai.com/v1", **kwargs):
        if not base_url:
            base_url = "https://api.openai.com/v1"
        self.api_key = key
        self.client = OpenAI(api_key=key, base_url=base_url)
        self.model_name = model_name
        self.lang = lang
@@ -224,6 +228,61 @@ class QWenCV(GptV4):
        base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
        super().__init__(key, model_name, lang=lang, base_url=base_url, **kwargs)

    def chat(self, system, history, gen_conf, images=[], video_bytes=None, filename=""):
        if video_bytes:
            try:
                summary, summary_num_tokens = self._process_video(video_bytes, filename)
                return summary, summary_num_tokens
            except Exception as e:
                return "**ERROR**: " + str(e), 0

        return "**ERROR**: Method chat not supported yet.", 0

    def _process_video(self, video_bytes, filename):
        from dashscope import MultiModalConversation

        video_suffix = Path(filename).suffix or ".mp4"
        with tempfile.NamedTemporaryFile(delete=False, suffix=video_suffix) as tmp:
            tmp.write(video_bytes)
            tmp_path = tmp.name

        video_path = f"file://{tmp_path}"
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "video": video_path,
                        "fps": 2,
                    },
                    {
                        "text": "Please summarize this video in proper sentences.",
                    },
                ],
            }
        ]

        def call_api():
            response = MultiModalConversation.call(
                api_key=self.api_key,
                model=self.model_name,
                messages=messages,
            )
            summary = response["output"]["choices"][0]["message"].content[0]["text"]
            return summary, num_tokens_from_string(summary)

        try:
            return call_api()
        except Exception as e1:
            import dashscope

            dashscope.base_http_api_url = "https://dashscope-intl.aliyuncs.com/api/v1"
            try:
                return call_api()
            except Exception as e2:
                raise RuntimeError(f"Both default and intl endpoint failed.\nFirst error: {e1}\nSecond error: {e2}")


class HunyuanCV(GptV4):
    _FACTORY_NAME = "Tencent Hunyuan"
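For readers unfamiliar with the DashScope SDK, a standalone sketch of the call that `_process_video` wraps is shown below. It assumes `dashscope` is installed and a valid API key is available; the model name is illustrative. The message shape (local `file://` URI, an `fps` hint, and a text prompt) and the response access pattern mirror the hunk above.

```python
# Standalone sketch of the DashScope call wrapped by _process_video above.
# Assumes `pip install dashscope`; the model name is an illustrative choice.
from dashscope import MultiModalConversation

messages = [
    {
        "role": "user",
        "content": [
            {"video": "file:///tmp/demo.mp4", "fps": 2},  # local file URI, sampled at 2 frames per second
            {"text": "Please summarize this video in proper sentences."},
        ],
    }
]

response = MultiModalConversation.call(
    api_key="YOUR_DASHSCOPE_API_KEY",
    model="qwen-vl-max",  # illustrative Qwen-VL model with video support
    messages=messages,
)

# The summary text is the first content item of the first choice,
# the same path the code above reads.
print(response["output"]["choices"][0]["message"].content[0]["text"])
```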
@@ -616,8 +675,6 @@ class GeminiCV(Base):
    def _process_video(self, video_bytes, filename):
        from google import genai
        from google.genai import types
        import tempfile
        from pathlib import Path

        video_size_mb = len(video_bytes) / (1024 * 1024)
        client = genai.Client(api_key=self.api_key)