From 762aa4b8c49d959bc3a81dadcedaa98d55bda669 Mon Sep 17 00:00:00 2001 From: RuyXu <139017514+RuyXu@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:00:56 +0800 Subject: [PATCH] fix: preserve correct MIME & unify data URL handling for vision inputs (relates #9248) (#9474) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: preserve correct MIME & unify data URL handling for vision inputs (relates #9248) - Updated image2base64() to return a full data URL (`data:image/<subtype>;base64,...`) with accurate MIME - Removed hardcoded image/jpeg in Base._image_prompt(); pass through data URLs and default raw base64 to image/png - Set AnthropicCV._image_prompt() raw base64 media_type default to image/png - Ensures MIME type matches actual image content, fixing “cannot process base64 image” errors on vLLM/OpenAI-compatible backends ### What problem does this PR solve? This PR fixes a compatibility issue where base64-encoded images sent to vision models (e.g., vLLM/OpenAI-compatible backends) were rejected due to mismatched MIME type or incorrect decoding. Previously, the backend: - Always converted raw base64 into data:image/jpeg;base64,... even if the actual content was PNG. - In some cases, attempted to base64-decode the full data URL string instead of the pure base64 part. This caused errors like: ``` cannot process base64 image failed to decode base64 string: illegal base64 data at input byte 0 ``` from strict validators such as vLLM. With this fix, the MIME type in the request now matches the actual image content, and data URLs are correctly handled or passed through, ensuring vision models can decode and process images reliably. 
### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/llm/cv_model.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/rag/llm/cv_model.py b/rag/llm/cv_model.py index 4e92cb028..a04ded780 100644 --- a/rag/llm/cv_model.py +++ b/rag/llm/cv_model.py @@ -68,7 +68,7 @@ class Base(ABC): pmpt.append({ "type": "image_url", "image_url": { - "url": f"data:image/jpeg;base64,{img}" if img[:4] != "data" else img + "url": img if isinstance(img, str) and img.startswith("data:") else f"data:image/png;base64,{img}" } }) return pmpt @@ -109,16 +109,33 @@ class Base(ABC): @staticmethod def image2base64(image): + # Return a data URL with the correct MIME to avoid provider mismatches if isinstance(image, bytes): - return base64.b64encode(image).decode("utf-8") + # Best-effort magic number sniffing + mime = "image/png" + if len(image) >= 2 and image[0] == 0xFF and image[1] == 0xD8: + mime = "image/jpeg" + b64 = base64.b64encode(image).decode("utf-8") + return f"data:{mime};base64,{b64}" if isinstance(image, BytesIO): - return base64.b64encode(image.getvalue()).decode("utf-8") + data = image.getvalue() + mime = "image/png" + if len(data) >= 2 and data[0] == 0xFF and data[1] == 0xD8: + mime = "image/jpeg" + b64 = base64.b64encode(data).decode("utf-8") + return f"data:{mime};base64,{b64}" buffered = BytesIO() + fmt = "JPEG" try: image.save(buffered, format="JPEG") except Exception: + buffered = BytesIO() # reset buffer before saving PNG image.save(buffered, format="PNG") - return base64.b64encode(buffered.getvalue()).decode("utf-8") + fmt = "PNG" + data = buffered.getvalue() + b64 = base64.b64encode(data).decode("utf-8") + mime = f"image/{fmt.lower()}" + return f"data:{mime};base64,{b64}" def prompt(self, b64): return [ @@ -674,8 +691,8 @@ class AnthropicCV(Base): "type": "image", "source": { "type": "base64", - "media_type": "image/jpeg" if img[:4] != "data" else img.split(":")[1].split(";")[0], - "data": img if 
img[:4] != "data" else img.split(",")[1] + "media_type": (img.split(":")[1].split(";")[0] if isinstance(img, str) and img[:4] == "data" else "image/png"), + "data": (img.split(",")[1] if isinstance(img, str) and img[:4] == "data" else img) }, } )