diff --git a/rag/llm/cv_model.py b/rag/llm/cv_model.py index 21f78a819..48d72ff40 100644 --- a/rag/llm/cv_model.py +++ b/rag/llm/cv_model.py @@ -114,6 +114,28 @@ class Base(ABC): yield tk_count + @staticmethod + def image2base64_rawvalue(self, image): + # Return a base64 string without data URL header + if isinstance(image, bytes): + b64 = base64.b64encode(image).decode("utf-8") + return b64 + if isinstance(image, BytesIO): + data = image.getvalue() + b64 = base64.b64encode(data).decode("utf-8") + return b64 + with BytesIO() as buffered: + try: + image.save(buffered, format="JPEG") + except Exception: + # reset buffer before saving PNG + buffered.seek(0) + buffered.truncate() + image.save(buffered, format="PNG") + data = buffered.getvalue() + b64 = base64.b64encode(data).decode("utf-8") + return b64 + @staticmethod def image2base64(image): # Return a data URL with the correct MIME to avoid provider mismatches @@ -614,23 +636,38 @@ class GeminiCV(Base): if self.lang.lower() == "chinese" else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out." ) - b64 = self.image2base64(image) - with BytesIO(base64.b64decode(b64)) as bio: - with open(bio) as img: - input = [prompt, img] - res = self.model.generate_content(input) - return res.text, total_token_count_from_response(res) + + if image is bytes: + with BytesIO(image) as bio: + with open(bio) as img: + input = [prompt, img] + res = self.model.generate_content(input) + return res.text, total_token_count_from_response(res) + else: + b64 = self.image2base64_rawvalue(image) + with BytesIO(base64.b64decode(b64)) as bio: + with open(bio) as img: + input = [prompt, img] + res = self.model.generate_content(input) + return res.text, total_token_count_from_response(res) def describe_with_prompt(self, image, prompt=None): from PIL.Image import open - - b64 = self.image2base64(image) vision_prompt = prompt if prompt else vision_llm_describe_prompt() - with BytesIO(base64.b64decode(b64)) as bio: - with open(bio) as img: - input = [vision_prompt, img] - res = self.model.generate_content(input) - return res.text, total_token_count_from_response(res) + + if image is bytes: + with BytesIO(image) as bio: + with open(bio) as img: + input = [vision_prompt, img] + res = self.model.generate_content(input) + return res.text, total_token_count_from_response(res) + else: + b64 = self.image2base64_rawvalue(image) + with BytesIO(base64.b64decode(b64)) as bio: + with open(bio) as img: + input = [vision_prompt, img] + res = self.model.generate_content(input) + return res.text, total_token_count_from_response(res) def chat(self, system, history, gen_conf, images=None, video_bytes=None, filename="", **kwargs):