Refactor: Enhance delta streaming in chat functions for improved reasoning and content handling (#12453)

### What problem does this PR solve?

Enhance delta streaming in the chat functions for improved reasoning and
content handling: instead of re-parsing each cumulative answer with a
`<think>...</think>` regex and diffing it against string caches, the
OpenAI-compatible stream now consumes explicit `start_to_think` /
`end_to_think` events from `async_chat` and forwards each delta directly,
tracking the final answer and reference separately, as sketched below.
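
The new per-event handling reduces to a small state machine. Here is a minimal, self-contained sketch of that flow; the `events` list is a hypothetical stand-in for what `async_chat` yields, and all values are invented for illustration:

```python
# Minimal sketch of the new delta handling, condensed from the diff.
# `events` is a hypothetical stand-in for what async_chat() yields.
events = [
    {"start_to_think": True},
    {"answer": "Let me check the docs."},
    {"end_to_think": True},
    {"answer": "The answer "},
    {"answer": "is 42."},
    {"final": True, "answer": "The answer is 42.", "reference": {}},
]

in_think = False
full_reasoning, full_content = "", ""
final_answer, final_reference = None, None

for ans in events:
    if ans.get("final"):  # terminal event: remember answer/reference for the last chunk
        final_answer = ans.get("answer") or full_content
        final_reference = ans.get("reference", {})
        continue
    if ans.get("start_to_think"):  # reasoning phase begins
        in_think = True
        continue
    if ans.get("end_to_think"):  # reasoning phase ends
        in_think = False
        continue
    delta = ans.get("answer") or ""
    if not delta:
        continue
    if in_think:
        full_reasoning += delta  # would stream as delta.reasoning_content
    else:
        full_content += delta    # would stream as delta.content

assert full_reasoning == "Let me check the docs."
assert final_answer == "The answer is 42."
```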

### Type of change


- [x] Refactoring
Commit 1996aa0dac by buua436, 2026-01-08 13:34:16 +08:00 (committed by GitHub)
Parent: f4e2783eb4
5 changed files with 325 additions and 123 deletions


```diff
@@ -304,9 +304,12 @@ async def chat_completion_openai_like(tenant_id, chat_id):
         # The choices field on the last chunk will always be an empty array [].
         async def streamed_response_generator(chat_id, dia, msg):
             token_used = 0
-            answer_cache = ""
-            reasoning_cache = ""
             last_ans = {}
+            full_content = ""
+            full_reasoning = ""
+            final_answer = None
+            final_reference = None
+            in_think = False
             response = {
                 "id": f"chatcmpl-{chat_id}",
                 "choices": [
@@ -336,47 +339,30 @@ async def chat_completion_openai_like(tenant_id, chat_id):
                     chat_kwargs["doc_ids"] = doc_ids_str
                 async for ans in async_chat(dia, msg, True, **chat_kwargs):
                     last_ans = ans
-                    answer = ans["answer"]
-                    reasoning_match = re.search(r"<think>(.*?)</think>", answer, flags=re.DOTALL)
-                    if reasoning_match:
-                        reasoning_part = reasoning_match.group(1)
-                        content_part = answer[reasoning_match.end() :]
-                    else:
-                        reasoning_part = ""
-                        content_part = answer
-                    reasoning_incremental = ""
-                    if reasoning_part:
-                        if reasoning_part.startswith(reasoning_cache):
-                            reasoning_incremental = reasoning_part.replace(reasoning_cache, "", 1)
-                        else:
-                            reasoning_incremental = reasoning_part
-                        reasoning_cache = reasoning_part
-                    content_incremental = ""
-                    if content_part:
-                        if content_part.startswith(answer_cache):
-                            content_incremental = content_part.replace(answer_cache, "", 1)
-                        else:
-                            content_incremental = content_part
-                        answer_cache = content_part
-                    token_used += len(reasoning_incremental) + len(content_incremental)
-                    if not any([reasoning_incremental, content_incremental]):
-                        continue
-                    if reasoning_incremental:
-                        response["choices"][0]["delta"]["reasoning_content"] = reasoning_incremental
-                    else:
-                        response["choices"][0]["delta"]["reasoning_content"] = None
-                    if content_incremental:
-                        response["choices"][0]["delta"]["content"] = content_incremental
-                    else:
-                        response["choices"][0]["delta"]["content"] = None
+                    if ans.get("final"):
+                        if ans.get("answer"):
+                            full_content = ans["answer"]
+                        final_answer = ans.get("answer") or full_content
+                        final_reference = ans.get("reference", {})
+                        continue
+                    if ans.get("start_to_think"):
+                        in_think = True
+                        continue
+                    if ans.get("end_to_think"):
+                        in_think = False
+                        continue
+                    delta = ans.get("answer") or ""
+                    if not delta:
+                        continue
+                    token_used += len(delta)
+                    if in_think:
+                        full_reasoning += delta
+                        response["choices"][0]["delta"]["reasoning_content"] = delta
+                        response["choices"][0]["delta"]["content"] = None
+                    else:
+                        full_content += delta
+                        response["choices"][0]["delta"]["content"] = delta
+                        response["choices"][0]["delta"]["reasoning_content"] = None
                     yield f"data:{json.dumps(response, ensure_ascii=False)}\n\n"
             except Exception as e:
                 response["choices"][0]["delta"]["content"] = "**ERROR**: " + str(e)
@@ -388,8 +374,9 @@ async def chat_completion_openai_like(tenant_id, chat_id):
             response["choices"][0]["finish_reason"] = "stop"
             response["usage"] = {"prompt_tokens": len(prompt), "completion_tokens": token_used, "total_tokens": len(prompt) + token_used}
             if need_reference:
-                response["choices"][0]["delta"]["reference"] = chunks_format(last_ans.get("reference", []))
-                response["choices"][0]["delta"]["final_content"] = last_ans.get("answer", "")
+                reference_payload = final_reference if final_reference is not None else last_ans.get("reference", [])
+                response["choices"][0]["delta"]["reference"] = chunks_format(reference_payload)
+                response["choices"][0]["delta"]["final_content"] = final_answer if final_answer is not None else full_content
             yield f"data:{json.dumps(response, ensure_ascii=False)}\n\n"
             yield "data:[DONE]\n\n"
```