From 762aa4b8c49d959bc3a81dadcedaa98d55bda669 Mon Sep 17 00:00:00 2001
From: RuyXu <139017514+RuyXu@users.noreply.github.com>
Date: Thu, 14 Aug 2025 17:00:56 +0800
Subject: [PATCH] fix: preserve correct MIME & unify data URL handling for
 vision inputs (relates #9248) (#9474)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix: preserve correct MIME & unify data URL handling for vision inputs
(relates #9248)

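- Updated image2base64() to return a full data URL
(data:image/<fmt>;base64,...) whose MIME type is derived from the image
itself: raw bytes/BytesIO are sniffed for the JPEG magic number (anything
else defaults to image/png), and PIL images use the format they were
actually saved in (JPEG, falling back to PNG)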
- Removed hardcoded image/jpeg in Base._image_prompt(); pass through
data URLs and default raw base64 to image/png
- Set AnthropicCV._image_prompt() raw base64 media_type default to
image/png
- Ensures MIME type matches actual image content, fixing “cannot process
base64 image” errors on vLLM/OpenAI-compatible backends

### What problem does this PR solve?

This PR fixes a compatibility issue where base64-encoded images sent to
vision models (e.g., vLLM/OpenAI-compatible backends) were rejected due
to mismatched MIME type or incorrect decoding.
Previously, the backend:
- Always converted raw base64 into data:image/jpeg;base64,... even if
the actual content was PNG.
- In some cases, base64 decoding was attempted on the full data URL
string instead of the pure base64 part.
This caused strict validators such as vLLM to reject the request with
errors like:
```
cannot process base64 image
failed to decode base64 string: illegal base64 data at input byte 0
```
With this fix, the MIME type in the request now matches the actual image
content, and data URLs are correctly handled or passed through, ensuring
vision models can decode and process images reliably.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 rag/llm/cv_model.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)
diff --git a/rag/llm/cv_model.py b/rag/llm/cv_model.py
index 4e92cb028..a04ded780 100644
--- a/rag/llm/cv_model.py
+++ b/rag/llm/cv_model.py
@@ -68,7 +68,7 @@ class Base(ABC):
             pmpt.append({
                 "type": "image_url",
                 "image_url": {
-                    "url": f"data:image/jpeg;base64,{img}" if img[:4] != "data" else img
+                    "url": img if isinstance(img, str) and img.startswith("data:") else f"data:image/png;base64,{img}"
                 }
             })
         return pmpt
@@ -109,16 +109,33 @@ class Base(ABC):
 
     @staticmethod
     def image2base64(image):
+        # Return a data URL with the correct MIME to avoid provider mismatches
         if isinstance(image, bytes):
-            return base64.b64encode(image).decode("utf-8")
+            # Best-effort magic number sniffing
+            mime = "image/png"
+            if len(image) >= 2 and image[0] == 0xFF and image[1] == 0xD8:
+                mime = "image/jpeg"
+            b64 = base64.b64encode(image).decode("utf-8")
+            return f"data:{mime};base64,{b64}"
         if isinstance(image, BytesIO):
-            return base64.b64encode(image.getvalue()).decode("utf-8")
+            data = image.getvalue()
+            mime = "image/png"
+            if len(data) >= 2 and data[0] == 0xFF and data[1] == 0xD8:
+                mime = "image/jpeg"
+            b64 = base64.b64encode(data).decode("utf-8")
+            return f"data:{mime};base64,{b64}"
         buffered = BytesIO()
+        fmt = "JPEG"
         try:
             image.save(buffered, format="JPEG")
         except Exception:
+            buffered = BytesIO()  # reset buffer before saving PNG
             image.save(buffered, format="PNG")
-        return base64.b64encode(buffered.getvalue()).decode("utf-8")
+            fmt = "PNG"
+        data = buffered.getvalue()
+        b64 = base64.b64encode(data).decode("utf-8")
+        mime = f"image/{fmt.lower()}"
+        return f"data:{mime};base64,{b64}"
 
     def prompt(self, b64):
         return [
@@ -674,8 +691,8 @@ class AnthropicCV(Base):
                         "type": "image",
                         "source": {
                             "type": "base64",
-                            "media_type": "image/jpeg" if img[:4] != "data" else img.split(":")[1].split(";")[0],
-                            "data": img if img[:4] != "data" else img.split(",")[1]
+                            "media_type": (img.split(":")[1].split(";")[0] if isinstance(img, str) and img[:4] == "data" else "image/png"),
+                            "data": (img.split(",")[1] if isinstance(img, str) and img[:4] == "data" else img)
                         },
                     }
             )