fix: preserve correct MIME & unify data URL handling for vision inputs (relates #9248) (#9474)

fix: preserve correct MIME & unify data URL handling for vision inputs
(relates #9248)

- Updated image2base64() to return a full data URL
(data:image/<fmt>;base64,...) with accurate MIME
- Removed hardcoded image/jpeg in Base._image_prompt(); pass through
data URLs and default raw base64 to image/png
- Set AnthropicCV._image_prompt() raw base64 media_type default to
image/png
- Ensures MIME type matches actual image content, fixing “cannot process
base64 image” errors on vLLM/OpenAI-compatible backends

### What problem does this PR solve?

This PR fixes a compatibility issue where base64-encoded images sent to
vision models (e.g., vLLM/OpenAI-compatible backends) were rejected due
to mismatched MIME type or incorrect decoding.
Previously, the backend:
- Always converted raw base64 into data:image/jpeg;base64,... even if
the actual content was PNG.
- In some cases, attempted base64 decoding on the full data URL string
instead of on the pure base64 part.
As a result, strict validators such as vLLM rejected the request with
errors like:
```
cannot process base64 image
failed to decode base64 string: illegal base64 data at input byte 0
```
With this fix, the MIME type in the request now matches the actual image
content, and data URLs are correctly handled or passed through, ensuring
vision models can decode and process images reliably.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
RuyXu
2025-08-14 17:00:56 +08:00
committed by GitHub
parent 9cd09488ca
commit 762aa4b8c4

View File

@ -68,7 +68,7 @@ class Base(ABC):
pmpt.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{img}" if img[:4] != "data" else img
"url": img if isinstance(img, str) and img.startswith("data:") else f"data:image/png;base64,{img}"
}
})
return pmpt
@ -109,16 +109,33 @@ class Base(ABC):
@staticmethod
def image2base64(image):
# Return a data URL with the correct MIME to avoid provider mismatches
if isinstance(image, bytes):
return base64.b64encode(image).decode("utf-8")
# Best-effort magic number sniffing
mime = "image/png"
if len(image) >= 2 and image[0] == 0xFF and image[1] == 0xD8:
mime = "image/jpeg"
b64 = base64.b64encode(image).decode("utf-8")
return f"data:{mime};base64,{b64}"
if isinstance(image, BytesIO):
return base64.b64encode(image.getvalue()).decode("utf-8")
data = image.getvalue()
mime = "image/png"
if len(data) >= 2 and data[0] == 0xFF and data[1] == 0xD8:
mime = "image/jpeg"
b64 = base64.b64encode(data).decode("utf-8")
return f"data:{mime};base64,{b64}"
buffered = BytesIO()
fmt = "JPEG"
try:
image.save(buffered, format="JPEG")
except Exception:
buffered = BytesIO() # reset buffer before saving PNG
image.save(buffered, format="PNG")
return base64.b64encode(buffered.getvalue()).decode("utf-8")
fmt = "PNG"
data = buffered.getvalue()
b64 = base64.b64encode(data).decode("utf-8")
mime = f"image/{fmt.lower()}"
return f"data:{mime};base64,{b64}"
def prompt(self, b64):
return [
@ -674,8 +691,8 @@ class AnthropicCV(Base):
"type": "image",
"source": {
"type": "base64",
"media_type": "image/jpeg" if img[:4] != "data" else img.split(":")[1].split(";")[0],
"data": img if img[:4] != "data" else img.split(",")[1]
"media_type": (img.split(":")[1].split(";")[0] if isinstance(img, str) and img[:4] == "data" else "image/png"),
"data": (img.split(",")[1] if isinstance(img, str) and img[:4] == "data" else img)
},
}
)