diff --git a/api/db/services/llm_service.py b/api/db/services/llm_service.py index 120de2114..f84bda827 100644 --- a/api/db/services/llm_service.py +++ b/api/db/services/llm_service.py @@ -205,7 +205,7 @@ class LLMBundle(LLM4Tenant): return txt return txt[last_think_end + len("") :] - + @staticmethod def _clean_param(chat_partial, **kwargs): func = chat_partial.func @@ -222,15 +222,15 @@ class LLMBundle(LLM4Tenant): if not support_var_args: use_kwargs = {k: v for k, v in kwargs.items() if k in keyword_args} return use_kwargs - + def chat(self, system: str, history: list, gen_conf: dict = {}, **kwargs) -> str: if self.langfuse: generation = self.langfuse.start_generation(trace_context=self.trace_context, name="chat", model=self.llm_name, input={"system": system, "history": history}) - chat_partial = partial(self.mdl.chat, system, history, gen_conf) + chat_partial = partial(self.mdl.chat, system, history, gen_conf, **kwargs) if self.is_tools and self.mdl.is_tools: - chat_partial = partial(self.mdl.chat_with_tools, system, history, gen_conf) - + chat_partial = partial(self.mdl.chat_with_tools, system, history, gen_conf, **kwargs) + use_kwargs = self._clean_param(chat_partial, **kwargs) txt, used_tokens = chat_partial(**use_kwargs) txt = self._remove_reasoning_content(txt) diff --git a/conf/llm_factories.json b/conf/llm_factories.json index 08d3f6f5b..fee97eaf1 100644 --- a/conf/llm_factories.json +++ b/conf/llm_factories.json @@ -1345,35 +1345,35 @@ "llm_name": "gemini-2.5-flash", "tags": "LLM,CHAT,1024K,IMAGE2TEXT", "max_tokens": 1048576, - "model_type": "chat", + "model_type": "image2text", "is_tools": true }, { "llm_name": "gemini-2.5-pro", "tags": "LLM,CHAT,IMAGE2TEXT,1024K", "max_tokens": 1048576, - "model_type": "chat", + "model_type": "image2text", "is_tools": true }, { "llm_name": "gemini-2.5-flash-lite", "tags": "LLM,CHAT,1024K,IMAGE2TEXT", "max_tokens": 1048576, - "model_type": "chat", + "model_type": "image2text", "is_tools": true }, { "llm_name": 
def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    """Turn a picture or a video file into a single indexed chunk.

    Videos (file name ending in one of ``VIDEO_EXTS``) are summarized by the
    tenant's IMAGE2TEXT model. Pictures are OCR'ed first; only when the OCR
    text is short is the model also asked to describe the picture.

    Args:
        filename: Original file name; used for the title tokens and to decide
            between the video and the picture path.
        binary: Raw bytes of the file.
        tenant_id: Tenant whose IMAGE2TEXT model is used.
        lang: Language name; ``"english"`` (case-insensitive) selects
            English tokenization.
        callback: Optional progress reporter, called as
            ``callback(progress, message)`` or ``callback(prog=-1, msg=...)``
            on failure. A no-op is used when omitted.
        **kwargs: Accepted for interface compatibility; unused here.

    Returns:
        ``[doc]`` on success, ``[]`` when the model call failed.
    """
    if callback is None:
        # The parameter is optional, but every path below reports progress.
        def callback(*_args, **_kwargs):
            return None

    lowered = filename.lower()
    video_ext = next((ext for ext in VIDEO_EXTS if lowered.endswith(ext)), None)

    # The generic extension pattern only matches letters, so it cannot strip
    # ".mp4", ".3gp" or ".3gpp"; remove a recognised video suffix explicitly.
    if video_ext:
        title = filename[: -len(video_ext)]
    else:
        title = re.sub(r"\.[a-zA-Z]+$", "", filename)

    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(title),
    }
    eng = lang.lower() == "english"

    if video_ext:
        try:
            doc["doc_type_kwd"] = "video"
            cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang)
            ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
            # The model layer reports failures in-band as "**ERROR**: ...";
            # never index that text as if it were the video's content.
            if ans.lstrip().startswith("**ERROR**"):
                raise RuntimeError(ans)
            callback(0.8, "CV LLM respond: %s ..." % ans[:32])
            # Index the summary once (it used to be concatenated with itself).
            tokenize(doc, ans, eng)
            return [doc]
        except Exception as e:
            callback(prog=-1, msg=str(e))
        return []

    img = Image.open(io.BytesIO(binary)).convert("RGB")
    doc.update(
        {
            "image": img,
            "doc_type_kwd": "image",
        }
    )
    bxs = ocr(np.array(img))
    txt = "\n".join([t[0] for _, t in bxs if t[0]])
    callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
    # Enough OCR text means the picture is mostly text: skip the vision model.
    if (eng and len(txt.split()) > 32) or len(txt) > 32:
        tokenize(doc, txt, eng)
        callback(0.8, "OCR results is too long to use CV LLM.")
        return [doc]

    try:
        callback(0.4, "Use CV LLM to describe the picture.")
        cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
        img_binary = io.BytesIO()
        img.save(img_binary, format="JPEG")
        img_binary.seek(0)
        ans = cv_mdl.describe(img_binary.read())
        callback(0.8, "CV LLM respond: %s ..." % ans[:32])
        txt += "\n" + ans
        tokenize(doc, txt, eng)
        return [doc]
    except Exception as e:
        callback(prog=-1, msg=str(e))

    return []
def _process_video(self, video_bytes, filename):
    """Summarize a video with Gemini and return ``(summary, token_count)``.

    Videos up to 20 MB are sent inline; larger ones are written to a
    temporary file, uploaded through the Files API, and referenced from the
    request once the service has finished processing them.

    Args:
        video_bytes: Raw bytes of the video.
        filename: Original file name; its suffix selects the MIME type and
            the temporary file's suffix.

    Returns:
        A tuple of the summary text (``""`` when the response has no text)
        and ``num_tokens_from_string(summary)``.

    Raises:
        TimeoutError: The uploaded file was still processing after the wait.
        RuntimeError: The service reported that processing the upload failed.
        Exception: Any SDK error is logged and re-raised unchanged.
    """
    import tempfile
    import time
    from pathlib import Path

    from google import genai
    from google.genai import types

    # MIME types Gemini documents for video input, keyed by file suffix.
    mime_by_suffix = {
        ".mp4": "video/mp4",
        ".mov": "video/mov",
        ".avi": "video/avi",
        ".flv": "video/x-flv",
        ".mpeg": "video/mpeg",
        ".mpg": "video/mpg",
        ".webm": "video/webm",
        ".wmv": "video/wmv",
        ".3gp": "video/3gpp",
        ".3gpp": "video/3gpp",
    }
    video_suffix = Path(filename or "").suffix.lower() or ".mp4"
    mime_type = mime_by_suffix.get(video_suffix, "video/mp4")

    # Use the model this instance was configured with; fall back to the
    # previously hard-coded one if it is unset.
    model_name = getattr(self, "model_name", None) or "gemini-2.5-flash"

    video_size_mb = len(video_bytes) / (1024 * 1024)
    client = genai.Client(api_key=self.api_key)

    tmp_path = None
    try:
        if video_size_mb <= 20:
            response = client.models.generate_content(
                model=model_name,
                contents=types.Content(
                    parts=[
                        types.Part(inline_data=types.Blob(data=video_bytes, mime_type=mime_type)),
                        types.Part(text="Please summarize the video in proper sentences."),
                    ]
                ),
            )
        else:
            logging.info(f"Video size {video_size_mb:.2f}MB exceeds 20MB. Using Files API...")
            # delete=False: the file must outlive the `with` so the SDK can
            # reopen it by path; it is removed in `finally`.
            with tempfile.NamedTemporaryFile(delete=False, suffix=video_suffix) as tmp:
                tmp.write(video_bytes)
                tmp_path = Path(tmp.name)
            # Pass the MIME type explicitly instead of relying on the OS
            # mimetypes table, which may not know every video suffix.
            uploaded_file = client.files.upload(
                file=tmp_path,
                config=types.UploadFileConfig(mime_type=mime_type),
            )

            # Uploaded videos are processed asynchronously and cannot be
            # referenced until they leave the PROCESSING state.
            deadline = time.monotonic() + 300
            while uploaded_file.state and uploaded_file.state.name == "PROCESSING":
                if time.monotonic() > deadline:
                    raise TimeoutError("Timed out waiting for the uploaded video to be processed.")
                time.sleep(2)
                uploaded_file = client.files.get(name=uploaded_file.name)
            if uploaded_file.state and uploaded_file.state.name == "FAILED":
                raise RuntimeError("The uploaded video could not be processed.")

            response = client.models.generate_content(
                model=model_name,
                contents=[uploaded_file, "Please summarize this video in proper sentences."],
            )

        summary = response.text or ""
        logging.info(f"Video summarized: {summary[:32]}...")
        return summary, num_tokens_from_string(summary)
    except Exception as e:
        logging.error(f"Video processing failed: {e}")
        raise
    finally:
        if tmp_path is not None:
            tmp_path.unlink(missing_ok=True)