Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 20:42:30 +08:00)
### What problem does this PR solve?

Google Cloud model does not work correctly with gemini-2.5 models.

Close #10408

### Type of change

- [X] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
@@ -145,7 +145,7 @@ class Base(ABC):
            logging.info(f"[INFO] {self.model_name} detected as reasoning model, using _chat_streamly")

            final_ans = ""
            tol_token = 0
            for delta, tol in self._chat_streamly(history, gen_conf, with_reasoning=False, **kwargs):
                if delta.startswith("<think>") or delta.endswith("</think>"):
                    continue
@@ -156,7 +156,7 @@ class Base(ABC):
                final_ans = "**ERROR**: Empty response from reasoning model"

            return final_ans.strip(), tol_token

        if self.model_name.lower().find("qwen3") >= 0:
            kwargs["extra_body"] = {"enable_thinking": False}
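As an aside on the `extra_body` hook used above: the OpenAI Python SDK merges any `extra_body` fields verbatim into the request payload, which is how a provider-specific flag such as Qwen3's `enable_thinking` reaches the server. A minimal sketch, assuming an OpenAI-compatible endpoint; the base URL, key, and model name are placeholders:

```python
from openai import OpenAI

# Hypothetical OpenAI-compatible endpoint; base_url and api_key are placeholders.
client = OpenAI(base_url="https://example.com/v1", api_key="sk-placeholder")

resp = client.chat.completions.create(
    model="qwen3-32b",  # placeholder model name
    messages=[{"role": "user", "content": "Hello"}],
    # extra_body adds non-standard JSON fields to the request body,
    # which is how the enable_thinking flag is delivered to the backend.
    extra_body={"enable_thinking": False},
)
print(resp.choices[0].message.content)
```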
@@ -1182,6 +1182,7 @@ class GoogleChat(Base):
        else:
            if "max_tokens" in gen_conf:
                gen_conf["max_output_tokens"] = gen_conf["max_tokens"]
                del gen_conf["max_tokens"]
            for k in list(gen_conf.keys()):
                if k not in ["temperature", "top_p", "max_output_tokens"]:
                    del gen_conf[k]
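This hunk is the config side of the gemini-2.5 fix: the OpenAI-style `max_tokens` is renamed to Gemini's `max_output_tokens`, and every key the Gemini generation config does not accept is dropped. The same idea as a standalone sketch; the helper name `clean_gemini_conf` is invented for illustration:

```python
def clean_gemini_conf(gen_conf: dict) -> dict:
    """Translate an OpenAI-style generation config into one Gemini accepts."""
    conf = dict(gen_conf)
    # Gemini calls the completion-length cap max_output_tokens, not max_tokens.
    if "max_tokens" in conf:
        conf["max_output_tokens"] = conf.pop("max_tokens")
    # Keep only the keys the Gemini generation config understands.
    allowed = {"temperature", "top_p", "max_output_tokens"}
    return {k: v for k, v in conf.items() if k in allowed}

# Example: {"temperature": 0.7, "max_tokens": 512, "presence_penalty": 0.5}
# becomes {"temperature": 0.7, "max_output_tokens": 512}.
```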
@@ -1189,6 +1190,7 @@ class GoogleChat(Base):

    def _chat(self, history, gen_conf={}, **kwargs):
        system = history[0]["content"] if history and history[0]["role"] == "system" else ""
        gen_conf = self._clean_conf(gen_conf)
        if "claude" in self.model_name:
            response = self.client.messages.create(
                model=self.model_name,
@@ -1250,9 +1252,12 @@ class GoogleChat(Base):

            yield total_tokens
        else:
            response = None
            total_tokens = 0
            self.client._system_instruction = system
            if "max_tokens" in gen_conf:
                gen_conf["max_output_tokens"] = gen_conf["max_tokens"]
                del gen_conf["max_tokens"]
            for k in list(gen_conf.keys()):
                if k not in ["temperature", "top_p", "max_output_tokens"]:
                    del gen_conf[k]
@@ -1260,18 +1265,23 @@ class GoogleChat(Base):
                if "role" in item and item["role"] == "assistant":
                    item["role"] = "model"
                if "content" in item:
-                   item["parts"] = item.pop("content")
+                   item["parts"] = [
+                       {
+                           "text": item.pop("content"),
+                       }
+                   ]
            ans = ""
            try:
-               response = self.model.generate_content(history, generation_config=gen_conf, stream=True)
+               response = self.client.generate_content(history, generation_config=gen_conf, stream=True)
                for resp in response:
                    ans = resp.text
                    total_tokens += num_tokens_from_string(ans)
                    yield ans
            except Exception as e:
                yield ans + "\n**ERROR**: " + str(e)

-           yield response._chunks[-1].usage_metadata.total_token_count
+           yield total_tokens


class GPUStackChat(Base):
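For reference, the `role`/`parts` rewrite in the hunk above matches the message shape the Gemini API expects: OpenAI's `assistant` role becomes `model`, and each turn carries a list of parts rather than a bare `content` string. A minimal sketch of a streaming call in that shape using the `google.generativeai` SDK; RAGFlow's GoogleChat wraps a Google Cloud client, so treat the client setup here (API key, model name) as placeholder illustration rather than the exact code path:

```python
import google.generativeai as genai

genai.configure(api_key="YOUR_API_KEY")  # placeholder key
model = genai.GenerativeModel("gemini-2.5-flash")  # placeholder model name

# Gemini-style history: "assistant" turns use the "model" role,
# and "content" strings become a list of parts.
history = [
    {"role": "user", "parts": [{"text": "Summarize RAG in one sentence."}]},
    {"role": "model", "parts": [{"text": "RAG grounds LLM answers in retrieved documents."}]},
    {"role": "user", "parts": [{"text": "Now in five words."}]},
]

response = model.generate_content(
    history,
    generation_config={"temperature": 0.7, "max_output_tokens": 256},
    stream=True,
)
for chunk in response:
    print(chunk.text, end="")
```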