Fix: Disable reasoning on Gemini 2.5 Flash by default (#10477)

### What problem does this PR solve?

Gemini 2.5 Flash models enable reasoning ("thinking") by default, and there is currently no way to disable this behaviour. This leads to very long response times (> 1 min). The default behaviour should be that reasoning is disabled, with the option to re-enable it via `thinking_budget` in `gen_conf`/`llm_setting`.

Fixes #10474
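
For context, a minimal sketch of how a caller could opt back into reasoning under this change. The key name `thinking_budget` comes from the diff below; the surrounding `gen_conf` values are illustrative assumptions, not part of this PR:

```python
# Illustrative gen_conf / llm_setting payload (only thinking_budget
# is introduced by this PR; other keys are assumed for illustration):
gen_conf = {
    "temperature": 0.2,
    "thinking_budget": 0,       # 0 disables reasoning (the new default)
    # "thinking_budget": 1024,  # 1-24576: manual token budget
    # "thinking_budget": -1,    # -1: auto (model decides)
}
```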

### Type of change

- [X] Bug Fix (non-breaking change which fixes an issue)
Author: Günter Lukas
Date: 2025-10-11 04:22:51 +02:00
Committed by: GitHub
Parent: b5ddc7ca05
Commit: fee757eb41

@@ -1188,8 +1188,36 @@ class GoogleChat(Base):
                 del gen_conf[k]
         return gen_conf

+    def _get_thinking_config(self, gen_conf):
+        """Extract and create a ThinkingConfig from gen_conf.
+
+        Default behavior for Vertex AI Generative Models: thinking_budget=0 (disabled)
+        unless explicitly specified by the user. This does not apply to Claude models.
+        Users can override by setting thinking_budget in gen_conf/llm_setting:
+        - 0: Disabled (default)
+        - 1-24576: Manual budget
+        - -1: Auto (model decides)
+        """
+        # Claude models don't support ThinkingConfig
+        if "claude" in self.model_name:
+            gen_conf.pop("thinking_budget", None)
+            return None
+
+        # For Vertex AI Generative Models, default to thinking disabled
+        thinking_budget = gen_conf.pop("thinking_budget", 0)
+        if thinking_budget is not None:
+            try:
+                import vertexai.generative_models as glm  # type: ignore
+
+                return glm.ThinkingConfig(thinking_budget=thinking_budget)
+            except Exception:
+                pass
+        return None
+
     def _chat(self, history, gen_conf={}, **kwargs):
         system = history[0]["content"] if history and history[0]["role"] == "system" else ""
+        thinking_config = self._get_thinking_config(gen_conf)
         gen_conf = self._clean_conf(gen_conf)
         if "claude" in self.model_name:
             response = self.client.messages.create(
@@ -1223,7 +1251,10 @@ class GoogleChat(Base):
             }
         ]
-        response = self.client.generate_content(hist, generation_config=gen_conf)
+        if thinking_config:
+            response = self.client.generate_content(hist, generation_config=gen_conf, thinking_config=thinking_config)
+        else:
+            response = self.client.generate_content(hist, generation_config=gen_conf)
         ans = response.text
         return ans, response.usage_metadata.total_token_count
@@ -1255,6 +1286,7 @@ class GoogleChat(Base):
         response = None
         total_tokens = 0
         self.client._system_instruction = system
+        thinking_config = self._get_thinking_config(gen_conf)
         if "max_tokens" in gen_conf:
             gen_conf["max_output_tokens"] = gen_conf["max_tokens"]
             del gen_conf["max_tokens"]
@@ -1272,7 +1304,10 @@ class GoogleChat(Base):
             ]
         ans = ""
         try:
-            response = self.client.generate_content(history, generation_config=gen_conf, stream=True)
+            if thinking_config:
+                response = self.client.generate_content(history, generation_config=gen_conf, thinking_config=thinking_config, stream=True)
+            else:
+                response = self.client.generate_content(history, generation_config=gen_conf, stream=True)
             for resp in response:
                 ans = resp.text
                 total_tokens += num_tokens_from_string(ans)
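
For readers skimming the diff, here is a self-contained sketch of the gating logic that `_get_thinking_config` implements. `ThinkingConfig` is stubbed with a dataclass because the real class lives in the Vertex AI SDK (import path taken from the diff, exact API assumed):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ThinkingConfig:
    """Stand-in for vertexai.generative_models.ThinkingConfig (assumed API)."""
    thinking_budget: int


def get_thinking_config(model_name: str, gen_conf: dict) -> Optional[ThinkingConfig]:
    # Claude models don't support a thinking config; drop the key and bail out.
    if "claude" in model_name:
        gen_conf.pop("thinking_budget", None)
        return None
    # Default to 0 (reasoning disabled) when the caller sets no budget.
    thinking_budget = gen_conf.pop("thinking_budget", 0)
    if thinking_budget is None:
        return None
    return ThinkingConfig(thinking_budget=thinking_budget)


# Reasoning stays off unless the caller opts in:
assert get_thinking_config("gemini-2.5-flash", {}).thinking_budget == 0
assert get_thinking_config("gemini-2.5-flash", {"thinking_budget": -1}).thinking_budget == -1
assert get_thinking_config("claude-3-5-sonnet", {"thinking_budget": 512}) is None
```

Note that `thinking_budget` is popped from `gen_conf` before `_clean_conf` runs, so the key never leaks into the Vertex AI generation config itself.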