fix: preserve correct MIME & unify data URL handling for vision inputs (relates #9248) (#9474)

fix: preserve correct MIME & unify data URL handling for vision inputs (relates #9248)

- Updated image2base64() to return a full data URL (data:image/<fmt>;base64,...) with accurate MIME
- Removed hardcoded image/jpeg in Base._image_prompt(); pass through data URLs and default raw base64 to image/png
- Set AnthropicCV._image_prompt() raw base64 media_type default to image/png
- Ensures MIME type matches actual image content, fixing “cannot process base64 image” errors on vLLM/OpenAI-compatible backends

### What problem does this PR solve?

This PR fixes a compatibility issue where base64-encoded images sent to vision models (e.g., vLLM/OpenAI-compatible backends) were rejected due to a mismatched MIME type or incorrect decoding. Previously, the backend:

- Always converted raw base64 into data:image/jpeg;base64,... even if the actual content was PNG.
- In some cases, attempted base64 decoding on the full data URL string instead of the pure base64 part.

This caused strict validators such as vLLM to reject the request with errors like:

```
cannot process base64 image
failed to decode base64 string: illegal base64 data at input byte 0
```

With this fix, the MIME type in the request now matches the actual image content, and data URLs are correctly handled or passed through, ensuring vision models can decode and process images reliably.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-01-25 12:46:38 +08:00 · 2025-08-14 17:00:56 +08:00
parent 9cd09488ca
commit 762aa4b8c4
1 changed file with 23 additions and 6 deletions
--- a/rag/llm/cv_model.py
+++ b/rag/llm/cv_model.py
@@ -68,7 +68,7 @@ class Base(ABC):
            pmpt.append({
                "type": "image_url",
                "image_url": {
-                    "url": f"data:image/jpeg;base64,{img}" if img[:4] != "data" else img
+                    "url": img if isinstance(img, str) and img.startswith("data:") else f"data:image/png;base64,{img}"
                }
            })
        return pmpt
@@ -109,16 +109,33 @@ class Base(ABC):

    @staticmethod
    def image2base64(image):
+        # Return a data URL with the correct MIME to avoid provider mismatches
        if isinstance(image, bytes):
-            return base64.b64encode(image).decode("utf-8")
+            # Best-effort magic number sniffing
+            mime = "image/png"
+            if len(image) >= 2 and image[0] == 0xFF and image[1] == 0xD8:
+                mime = "image/jpeg"
+            b64 = base64.b64encode(image).decode("utf-8")
+            return f"data:{mime};base64,{b64}"
        if isinstance(image, BytesIO):
-            return base64.b64encode(image.getvalue()).decode("utf-8")
+            data = image.getvalue()
+            mime = "image/png"
+            if len(data) >= 2 and data[0] == 0xFF and data[1] == 0xD8:
+                mime = "image/jpeg"
+            b64 = base64.b64encode(data).decode("utf-8")
+            return f"data:{mime};base64,{b64}"
        buffered = BytesIO()
+        fmt = "JPEG"
        try:
            image.save(buffered, format="JPEG")
        except Exception:
+            buffered = BytesIO()  # reset buffer before saving PNG
            image.save(buffered, format="PNG")
-        return base64.b64encode(buffered.getvalue()).decode("utf-8")
+            fmt = "PNG"
+        data = buffered.getvalue()
+        b64 = base64.b64encode(data).decode("utf-8")
+        mime = f"image/{fmt.lower()}"
+        return f"data:{mime};base64,{b64}"

    def prompt(self, b64):
        return [
@@ -674,8 +691,8 @@ class AnthropicCV(Base):
                        "type": "image",
                        "source": {
                            "type": "base64",
-                            "media_type": "image/jpeg" if img[:4] != "data" else img.split(":")[1].split(";")[0],
-                            "data": img if img[:4] != "data" else img.split(",")[1]
+                            "media_type": (img.split(":")[1].split(";")[0] if isinstance(img, str) and img[:4] == "data" else "image/png"),
+                            "data": (img.split(",")[1] if isinstance(img, str) and img[:4] == "data" else img)
                        },
                    }
            )