Fix: add describe_with_prompt for ZHIPU AI (#11317)

### What problem does this PR solve?

Fix: add `describe_with_prompt` for ZHIPU AI (#11289)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
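
With this change, `Zhipu4V` gains `describe_with_prompt` (and `describe` now delegates to it). A minimal usage sketch, assuming the module path and constructor keywords used by RAGFlow's sibling CV models; neither appears in this diff:

```python
from rag.llm.cv_model import Zhipu4V  # assumed module path

# Hypothetical credentials/model name, following the other CV model constructors.
cv = Zhipu4V(key="your-zhipu-api-key", model_name="glm-4v")

with open("sample.png", "rb") as f:
    image_bytes = f.read()

# With prompt=None the method falls back to "Describe this image."
text, tokens = cv.describe_with_prompt(image_bytes, prompt="List the objects in this image.")
print(text, tokens)
```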
Billy Bao
2025-11-18 13:09:39 +08:00
committed by GitHub
parent 701761d119
commit 0db00f70b2


@@ -14,6 +14,7 @@
# limitations under the License.
#
import re
import base64
import json
import os
@@ -32,7 +33,6 @@ from rag.nlp import is_english
from rag.prompts.generator import vision_llm_describe_prompt
from common.token_utils import num_tokens_from_string, total_token_count_from_response

class Base(ABC):
    def __init__(self, **kwargs):
        # Configure retry parameters
@@ -208,6 +208,7 @@ class GptV4(Base):
            model=self.model_name,
            messages=self.prompt(b64),
            extra_body=self.extra_body,
        )
        return res.choices[0].message.content.strip(), total_token_count_from_response(res)
@@ -324,6 +325,122 @@ class Zhipu4V(GptV4):
        Base.__init__(self, **kwargs)

    def _clean_conf(self, gen_conf):
        if "max_tokens" in gen_conf:
            del gen_conf["max_tokens"]
        gen_conf = self._clean_conf_penalty(gen_conf)
        return gen_conf

    def _clean_conf_penalty(self, gen_conf):
        if "presence_penalty" in gen_conf:
            del gen_conf["presence_penalty"]
        if "frequency_penalty" in gen_conf:
            del gen_conf["frequency_penalty"]
        return gen_conf
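
    # (max_tokens and the OpenAI-style penalty parameters are stripped before each
    # request, presumably because the Zhipu endpoint does not accept them; the diff
    # itself does not state the reason. The penalty cleanup lives in its own helper
    # so it can be reused independently.)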

    def _request(self, msg, stream, gen_conf=None):
        response = requests.post(
            self.base_url,
            json={
                "model": self.model_name,
                "messages": msg,
                "stream": stream,
                **(gen_conf or {}),
            },
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
        )
        return response.json()
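
    # (_request above bypasses self.client and POSTs the payload straight to the
    # REST endpoint, returning the parsed JSON body.)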

    def chat(self, system, history, gen_conf, images=None, stream=False, **kwargs):
        if system and history and history[0].get("role") != "system":
            history.insert(0, {"role": "system", "content": system})
        gen_conf = self._clean_conf(gen_conf)
        logging.info(json.dumps(history, ensure_ascii=False, indent=2))
        response = self.client.chat.completions.create(model=self.model_name, messages=self._form_history(system, history, images), stream=False, **gen_conf)
        content = response.choices[0].message.content.strip()
        cleaned = re.sub(r"<\|(begin_of_box|end_of_box)\|>", "", content).strip()
        return cleaned, total_token_count_from_response(response)

    def chat_streamly(self, system, history, gen_conf, images=None, **kwargs):
        from rag.llm.chat_model import LENGTH_NOTIFICATION_CN, LENGTH_NOTIFICATION_EN
        from rag.nlp import is_chinese

        if system and history and history[0].get("role") != "system":
            history.insert(0, {"role": "system", "content": system})
        gen_conf = self._clean_conf(gen_conf)
        ans = ""
        tk_count = 0
        try:
            logging.info(json.dumps(history, ensure_ascii=False, indent=2))
            response = self.client.chat.completions.create(model=self.model_name, messages=self._form_history(system, history, images), stream=True, **gen_conf)
            for resp in response:
                if not resp.choices[0].delta.content:
                    continue
                delta = resp.choices[0].delta.content
                ans = delta
                if resp.choices[0].finish_reason == "length":
                    if is_chinese(ans):
                        ans += LENGTH_NOTIFICATION_CN
                    else:
                        ans += LENGTH_NOTIFICATION_EN
                    tk_count = total_token_count_from_response(resp)
                if resp.choices[0].finish_reason == "stop":
                    tk_count = total_token_count_from_response(resp)
                yield ans
        except Exception as e:
            yield ans + "\n**ERROR**: " + str(e)
        yield tk_count
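
    # (chat_streamly is a generator: it yields text chunks as they arrive, appends a
    # length notice when generation stops at the token limit, and finally yields the
    # total token count as an int.)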

    def describe(self, image):
        return self.describe_with_prompt(image)

    def describe_with_prompt(self, image, prompt=None):
        b64 = self.image2base64(image)
        if prompt is None:
            prompt = "Describe this image."
        # Single-turn multimodal message: the image first, then the text prompt.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": b64},
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            }
        ]
        resp = self.client.chat.completions.create(
            model=self.model_name,
            messages=messages,
            stream=False,
        )
        content = resp.choices[0].message.content.strip()
        cleaned = re.sub(r"<\|(begin_of_box|end_of_box)\|>", "", content).strip()
        return cleaned, num_tokens_from_string(cleaned)

class StepFunCV(GptV4):
    _FACTORY_NAME = "StepFun"
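
Both `chat` and `describe_with_prompt` scrub GLM's `<|begin_of_box|>` / `<|end_of_box|>` markers from the reply before returning it. A small standalone check of that cleanup (the sample string is invented for illustration):

```python
import re

raw = "<|begin_of_box|>A red stop sign on a pole.<|end_of_box|>"
cleaned = re.sub(r"<\|(begin_of_box|end_of_box)\|>", "", raw).strip()
assert cleaned == "A red stop sign on a pole."
```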