#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import io
import re

import numpy as np
from PIL import Image

from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.vision import OCR
from rag.nlp import rag_tokenizer, tokenize
from common.string_utils import clean_markdown_block

# Module-level OCR engine, shared across calls to chunk().
ocr = OCR()

# Gemini supported MIME types
VIDEO_EXTS = [".mp4", ".mov", ".avi", ".flv", ".mpeg", ".mpg", ".webm", ".wmv", ".3gp", ".3gpp", ".mkv"]


def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    """Build a searchable chunk from an image or video file.

    For video files (by extension), the raw bytes are sent to the tenant's
    image2text model for a textual description. For images, OCR is run first;
    if the OCR text is short, the image2text model is asked to describe the
    picture and its answer is appended to the OCR text.

    Args:
        filename: Original file name; its extension selects the video path
            and its stem becomes the chunk title.
        binary: Raw file bytes.
        tenant_id: Tenant whose image2text model binding is used.
        lang: Document language; "english" (case-insensitive) switches
            tokenization to English mode.
        callback: Optional progress callable ``(prog, msg)``; also invoked
            as ``callback(prog=-1, msg=...)`` on failure.
        **kwargs: Ignored; kept for parser-interface compatibility.

    Returns:
        A single-element list with the populated doc dict, or ``[]`` on error.
    """
    # Tolerate a missing callback: chunk() reports progress unconditionally,
    # so a no-op default avoids TypeError when no callback is supplied.
    callback = callback or (lambda prog=None, msg="": None)
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }
    eng = lang.lower() == "english"

    if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
        # Video path: hand the raw bytes to the multimodal model directly.
        try:
            doc.update({"doc_type_kwd": "video"})
            cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang)
            ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
            callback(0.8, "CV LLM respond: %s ..." % ans[:32])
            # BUGFIX: previously `ans += "\n" + ans` duplicated the model
            # answer before tokenizing (copy-paste of the image branch's
            # `txt += "\n" + ans`). Tokenize the answer once.
            tokenize(doc, ans, eng)
            return [doc]
        except Exception as e:
            callback(prog=-1, msg=str(e))
    else:
        # Image path: OCR first, CV LLM only as a fallback for sparse text.
        img = Image.open(io.BytesIO(binary)).convert("RGB")
        doc.update(
            {
                "image": img,
                "doc_type_kwd": "image",
            }
        )
        bxs = ocr(np.array(img))
        txt = "\n".join([t[0] for _, t in bxs if t[0]])
        callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
        # Enough OCR text (>32 words for English, >32 chars otherwise):
        # skip the (slower, costlier) CV LLM description.
        if (eng and len(txt.split()) > 32) or len(txt) > 32:
            tokenize(doc, txt, eng)
            callback(0.8, "OCR results is too long to use CV LLM.")
            return [doc]

        try:
            callback(0.4, "Use CV LLM to describe the picture.")
            cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
            # Re-encode as JPEG so the model receives a standard payload.
            img_binary = io.BytesIO()
            img.save(img_binary, format="JPEG")
            img_binary.seek(0)
            ans = cv_mdl.describe(img_binary.read())
            callback(0.8, "CV LLM respond: %s ..." % ans[:32])
            txt += "\n" + ans
            tokenize(doc, txt, eng)
            return [doc]
        except Exception as e:
            callback(prog=-1, msg=str(e))

    return []


def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
    """
    A simple wrapper to process image to markdown texts via VLM.

    Args:
        binary: A PIL Image object (despite the name — it must support
            ``.save(buf, format="JPEG")``).
        vision_model: Model exposing ``describe_with_prompt(bytes, prompt)``.
        prompt: Optional prompt forwarded to the model.
        callback: Optional progress callable ``(prog, msg)``; invoked with
            ``(-1, error)`` on failure.

    Returns:
        Simple markdown texts generated by VLM, or "" on error.
    """
    callback = callback or (lambda prog, msg: None)
    img = binary
    txt = ""

    try:
        with io.BytesIO() as img_binary:
            img.save(img_binary, format="JPEG")
            img_binary.seek(0)
            ans = clean_markdown_block(vision_model.describe_with_prompt(img_binary.read(), prompt))
            txt += "\n" + ans
            return txt
    except Exception as e:
        callback(-1, str(e))

    return ""