Refa: change LLM chat output from full to delta (incremental) (#6534)

### What problem does this PR solve?

Change the LLM chat streaming contract from full to delta (incremental): model connectors now yield only the newly generated text chunk on each iteration, and `LLMBundle.chat_streamly` accumulates those deltas into the running answer before yielding it to callers.
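
For orientation, here is a minimal, self-contained sketch of the two contracts (the function names `model_chat_streamly` and `bundle_chat_streamly` are hypothetical, not the PR's code). Previously the model side yielded the full accumulated answer on every step; after this change it yields only the increment, and the bundle stitches the increments back together before yielding, as the hunk below shows:

```python
from typing import Iterator, Union


def model_chat_streamly() -> Iterator[Union[str, int]]:
    # Hypothetical connector under the new contract: yield only the newly
    # generated text (the delta), then an int token count as the end-of-stream
    # sentinel, mirroring the isinstance(txt, int) check in the diff below.
    for piece in ["Hel", "lo, ", "world!"]:
        yield piece
    yield 6


def bundle_chat_streamly() -> Iterator[str]:
    # Accumulating wrapper: rebuild the running answer from deltas so
    # downstream callers keep receiving the cumulative text.
    ans = ""
    for txt in model_chat_streamly():
        if isinstance(txt, int):  # sentinel: stop streaming
            return
        ans += txt
        yield ans


print(list(bundle_chat_streamly()))
# ['Hel', 'Hello, ', 'Hello, world!']
```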

### Type of change

- [x] Refactoring

Author: Yongteng Lei
Date: 2025-03-26 19:33:14 +08:00 (committed by GitHub)
Parent: 6599db1e99
Commit: df3890827d
3 changed files with 277 additions and 399 deletions


```diff
@@ -324,15 +324,18 @@ class LLMBundle:
         if self.langfuse:
             generation = self.trace.generation(name="chat_streamly", model=self.llm_name, input={"system": system, "history": history})
 
-        output = ""
+        ans = ""
         for txt in self.mdl.chat_streamly(system, history, gen_conf):
             if isinstance(txt, int):
                 if self.langfuse:
-                    generation.end(output={"output": output})
+                    generation.end(output={"output": ans})
                 if not TenantLLMService.increase_usage(self.tenant_id, self.llm_type, txt, self.llm_name):
                     logging.error("LLMBundle.chat_streamly can't update token usage for {}/CHAT llm_name: {}, content: {}".format(self.tenant_id, self.llm_name, txt))
 
-                return
+                return ans
 
-            output = txt
-            yield txt
+            if txt.endswith("</think>"):
+                ans = ans.rstrip("</think>")
+
+            ans += txt
+            yield ans
```
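
Note that per this hunk, callers of `LLMBundle.chat_streamly` still receive the cumulative answer on each iteration (`yield ans`); only the bundle-to-connector contract moved to deltas. A hedged consumption sketch, where `bundle`, `system`, `history`, and `gen_conf` are assumed placeholders not defined in this diff:

```python
# Hypothetical consumption loop; `bundle` stands in for an LLMBundle instance.
last = ""
for ans in bundle.chat_streamly(system, history, gen_conf):
    print(ans[len(last):], end="", flush=True)  # print only the new suffix
    last = ans
```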