Fix: cannot parse images (#11044)

### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/11043

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-01-04 03:25:30 +08:00 · 2025-11-10 09:31:19 +08:00
parent 4cdaa77545
commit 660386d3b5
1 changed files with 50 additions and 13 deletions
--- a/rag/llm/cv_model.py
+++ b/rag/llm/cv_model.py
@ -114,6 +114,28 @@ class Base(ABC):
        yield tk_count
    def image2base64_rawvalue(self, image):
        """Encode *image* as a bare base64 string (no ``data:`` URL header).

        This is a regular instance method: it was previously decorated with
        ``@staticmethod`` while still declaring ``self``, so the usual call
        ``self.image2base64_rawvalue(image)`` bound the image to ``self`` and
        raised ``TypeError`` for the missing ``image`` argument.

        Args:
            image: One of
                * ``bytes`` - encoded as-is;
                * ``BytesIO`` - its full buffer content is encoded;
                * any object exposing ``save(fp, format=...)`` (presumably a
                  PIL image - confirm against callers) - serialized as JPEG,
                  falling back to PNG when the JPEG save raises.

        Returns:
            The base64 text of the image bytes, decoded as UTF-8.
        """
        if isinstance(image, bytes):
            raw = image
        elif isinstance(image, BytesIO):
            raw = image.getvalue()
        else:
            with BytesIO() as buffered:
                try:
                    image.save(buffered, format="JPEG")
                except Exception:
                    # The failed JPEG attempt may have left partial data in the
                    # buffer; clear it before writing the PNG fallback.
                    buffered.seek(0)
                    buffered.truncate()
                    image.save(buffered, format="PNG")
                raw = buffered.getvalue()
        return base64.b64encode(raw).decode("utf-8")
    @staticmethod
    def image2base64(image):
        # Return a data URL with the correct MIME to avoid provider mismatches
@ -614,23 +636,38 @@ class GeminiCV(Base):
            if self.lang.lower() == "chinese"
            else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out."
        )
-        b64 = self.image2base64(image)
+
-        with BytesIO(base64.b64decode(b64)) as bio:
+        if image is bytes:
-            with open(bio) as img:
+            with BytesIO(image) as bio:
-                input = [prompt, img]
+                with open(bio) as img:
-                res = self.model.generate_content(input)
+                    input = [prompt, img]
-                return res.text, total_token_count_from_response(res)
+                    res = self.model.generate_content(input)
                    return res.text, total_token_count_from_response(res)
        else:
            b64 = self.image2base64_rawvalue(image)
            with BytesIO(base64.b64decode(b64)) as bio:
                with open(bio) as img:
                    input = [prompt, img]
                    res = self.model.generate_content(input)
                    return res.text, total_token_count_from_response(res)
    def describe_with_prompt(self, image, prompt=None):
        """Ask the model to describe *image*, using *prompt* or the default one.

        Args:
            image: Raw encoded image ``bytes``, or any other input accepted by
                ``image2base64_rawvalue`` (which turns it into encoded bytes).
            prompt: Instruction text sent with the image. When falsy, the
                result of ``vision_llm_describe_prompt()`` is used instead.

        Returns:
            A ``(text, token_count)`` tuple: ``res.text`` of the model response
            and ``total_token_count_from_response(res)``.
        """
        from PIL.Image import open

        vision_prompt = prompt if prompt else vision_llm_describe_prompt()

        # ``image is bytes`` compared the value with the *type object* and was
        # therefore always False; ``isinstance`` is the intended check.
        if isinstance(image, bytes):
            raw = image
        else:
            # Called through the class with an explicit first argument so the
            # call binds correctly whether the helper is declared as a
            # staticmethod taking ``(self, image)`` or as a plain method.
            raw = base64.b64decode(type(self).image2base64_rawvalue(self, image))

        with BytesIO(raw) as bio, open(bio) as img:
            res = self.model.generate_content([vision_prompt, img])
            return res.text, total_token_count_from_response(res)
    def chat(self, system, history, gen_conf, images=None, video_bytes=None, filename="", **kwargs):