Refa: change LLM chat output from full to delta (incremental) (#6534)

### What problem does this PR solve?

Change the LLM chat streaming contract from full to delta (incremental): model connectors now yield only the newly generated text chunk on each iteration, and `LLMBundle.chat_streamly` accumulates those deltas into the running answer before yielding it to callers.
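
For orientation, here is a minimal, self-contained sketch of the two contracts (the function names `model_chat_streamly` and `bundle_chat_streamly` are hypothetical, not the PR's code). Previously the model side yielded the full accumulated answer on every step; after this change it yields only the increment, and the bundle stitches the increments back together before yielding, as the hunk below shows:

```python
from typing import Iterator, Union


def model_chat_streamly() -> Iterator[Union[str, int]]:
    # Hypothetical connector under the new contract: yield only the newly
    # generated text (the delta), then an int token count as the end-of-stream
    # sentinel, mirroring the isinstance(txt, int) check in the diff below.
    for piece in ["Hel", "lo, ", "world!"]:
        yield piece
    yield 6


def bundle_chat_streamly() -> Iterator[str]:
    # Accumulating wrapper: rebuild the running answer from deltas so
    # downstream callers keep receiving the cumulative text.
    ans = ""
    for txt in model_chat_streamly():
        if isinstance(txt, int):  # sentinel: stop streaming
            return
        ans += txt
        yield ans


print(list(bundle_chat_streamly()))
# ['Hel', 'Hello, ', 'Hello, world!']
```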

### Type of change

- [x] Refactoring

Author: Yongteng Lei
Date: 2025-03-26 19:33:14 +08:00 (committed by GitHub)
Parent: 6599db1e99
Commit: df3890827d
3 changed files with 277 additions and 399 deletions


```diff
@@ -324,15 +324,18 @@ class LLMBundle:
         if self.langfuse:
             generation = self.trace.generation(name="chat_streamly", model=self.llm_name, input={"system": system, "history": history})
 
-        output = ""
+        ans = ""
         for txt in self.mdl.chat_streamly(system, history, gen_conf):
             if isinstance(txt, int):
                 if self.langfuse:
-                    generation.end(output={"output": output})
+                    generation.end(output={"output": ans})
                 if not TenantLLMService.increase_usage(self.tenant_id, self.llm_type, txt, self.llm_name):
                     logging.error("LLMBundle.chat_streamly can't update token usage for {}/CHAT llm_name: {}, content: {}".format(self.tenant_id, self.llm_name, txt))
 
-                return
+                return ans
 
-            output = txt
-            yield txt
+            if txt.endswith("</think>"):
+                ans = ans.rstrip("</think>")
+
+            ans += txt
+            yield ans
```
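
Note that per this hunk, callers of `LLMBundle.chat_streamly` still receive the cumulative answer on each iteration (`yield ans`); only the bundle-to-connector contract moved to deltas. A hedged consumption sketch, where `bundle`, `system`, `history`, and `gen_conf` are assumed placeholders not defined in this diff:

```python
# Hypothetical consumption loop; `bundle` stands in for an LLMBundle instance.
last = ""
for ans in bundle.chat_streamly(system, history, gen_conf):
    print(ans[len(last):], end="", flush=True)  # print only the new suffix
    last = ans
```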