Fix: tokenizer issue. (#11902)

#11786
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Kevin Hu
2025-12-11 17:38:17 +08:00
committed by GitHub
parent 22a51a3868
commit ea4a5cd665
17 changed files with 141 additions and 216 deletions


@@ -271,7 +271,7 @@ class Agent(LLM, ToolBase):
last_calling = ""
if len(hist) > 3:
st = timer()
user_request = await asyncio.to_thread(full_question, messages=history, chat_mdl=self.chat_mdl)
user_request = await full_question(messages=history, chat_mdl=self.chat_mdl)
self.callback("Multi-turn conversation optimization", {}, user_request, elapsed_time=timer()-st)
else:
user_request = history[-1]["content"]
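
The first hunk swaps a thread-offloaded call for a direct await: `full_question` is now a coroutine, so it no longer needs `asyncio.to_thread`. A minimal sketch of the pattern, assuming a simplified `full_question` built on the `async_chat(system, history, gen_conf)` call used elsewhere in this diff; the prompt text is illustrative only.

```python
async def full_question(messages, chat_mdl):
    # Hypothetical condensed version: fold a multi-turn history into one standalone question.
    prompt = "Rewrite this conversation as a single self-contained question."
    return await chat_mdl.async_chat(prompt, messages, {})

async def build_user_request(history, chat_mdl):
    # Before: user_request = await asyncio.to_thread(full_question, messages=history, chat_mdl=chat_mdl)
    # After: the helper is itself async, so it runs on the event loop without a worker thread.
    return await full_question(messages=history, chat_mdl=chat_mdl)
```
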
@@ -309,7 +309,7 @@ class Agent(LLM, ToolBase):
if len(hist) > 12:
_hist = [hist[0], hist[1], *hist[-10:]]
entire_txt = ""
async for delta_ans in self._generate_streamly_async(_hist):
async for delta_ans in self._generate_streamly(_hist):
if not need2cite or cited:
yield delta_ans, 0
entire_txt += delta_ans
@@ -397,7 +397,7 @@ Respond immediately with your final comprehensive answer.
retrievals = self._canvas.get_reference()
retrievals = {"chunks": list(retrievals["chunks"].values()), "doc_aggs": list(retrievals["doc_aggs"].values())}
formated_refer = kb_prompt(retrievals, self.chat_mdl.max_length, True)
async for delta_ans in self._generate_streamly_async([{"role": "system", "content": citation_plus("\n\n".join(formated_refer))},
async for delta_ans in self._generate_streamly([{"role": "system", "content": citation_plus("\n\n".join(formated_refer))},
{"role": "user", "content": text}
]):
yield delta_ans
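
The other two hunks in this file only rename the streaming helper: with the synchronous generator removed further down in this commit, the async variant drops its `_async` suffix and keeps the plain `_generate_streamly` name. A hypothetical caller, assuming `agent` is an instance of the class above:

```python
async def collect_answer(agent, hist):
    # Consumption is unchanged: iterate the async generator and accumulate the deltas.
    entire_txt = ""
    async for delta_ans in agent._generate_streamly(hist):
        entire_txt += delta_ans
    return entire_txt
```
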


@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import asyncio
import logging
import os
import re
@@ -97,7 +98,7 @@ class Categorize(LLM, ABC):
component_name = "Categorize"
@timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60)))
def _invoke(self, **kwargs):
async def _invoke_async(self, **kwargs):
if self.check_if_canceled("Categorize processing"):
return
@@ -121,7 +122,7 @@ class Categorize(LLM, ABC):
if self.check_if_canceled("Categorize processing"):
return
ans = chat_mdl.chat(self._param.sys_prompt, [{"role": "user", "content": user_prompt}], self._param.gen_conf())
ans = await chat_mdl.async_chat(self._param.sys_prompt, [{"role": "user", "content": user_prompt}], self._param.gen_conf())
logging.info(f"input: {user_prompt}, answer: {str(ans)}")
if ERROR_PREFIX in ans:
raise Exception(ans)
@@ -144,5 +145,9 @@ class Categorize(LLM, ABC):
self.set_output("category_name", max_category)
self.set_output("_next", cpn_ids)
@timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60)))
def _invoke(self, **kwargs):
return asyncio.run(self._invoke_async(**kwargs))
def thoughts(self) -> str:
return "Which should it falls into {}? ...".format(",".join([f"`{c}`" for c, _ in self._param.category_description.items()]))


@@ -18,9 +18,8 @@ import json
import logging
import os
import re
import threading
from copy import deepcopy
from typing import Any, Generator, AsyncGenerator
from typing import Any, AsyncGenerator
import json_repair
from functools import partial
from common.constants import LLMType
@@ -168,53 +167,12 @@ class LLM(ComponentBase):
sys_prompt = re.sub(rf"<{tag}>(.*?)</{tag}>", "", sys_prompt, flags=re.DOTALL|re.IGNORECASE)
return pts, sys_prompt
def _generate(self, msg:list[dict], **kwargs) -> str:
if not self.imgs:
return self.chat_mdl.chat(msg[0]["content"], msg[1:], self._param.gen_conf(), **kwargs)
return self.chat_mdl.chat(msg[0]["content"], msg[1:], self._param.gen_conf(), images=self.imgs, **kwargs)
async def _generate_async(self, msg: list[dict], **kwargs) -> str:
if not self.imgs and hasattr(self.chat_mdl, "async_chat"):
return await self.chat_mdl.async_chat(msg[0]["content"], msg[1:], self._param.gen_conf(), **kwargs)
if self.imgs and hasattr(self.chat_mdl, "async_chat"):
return await self.chat_mdl.async_chat(msg[0]["content"], msg[1:], self._param.gen_conf(), images=self.imgs, **kwargs)
return await asyncio.to_thread(self._generate, msg, **kwargs)
def _generate_streamly(self, msg:list[dict], **kwargs) -> Generator[str, None, None]:
ans = ""
last_idx = 0
endswith_think = False
def delta(txt):
nonlocal ans, last_idx, endswith_think
delta_ans = txt[last_idx:]
ans = txt
if delta_ans.find("<think>") == 0:
last_idx += len("<think>")
return "<think>"
elif delta_ans.find("<think>") > 0:
delta_ans = txt[last_idx:last_idx+delta_ans.find("<think>")]
last_idx += delta_ans.find("<think>")
return delta_ans
elif delta_ans.endswith("</think>"):
endswith_think = True
elif endswith_think:
endswith_think = False
return "</think>"
last_idx = len(ans)
if ans.endswith("</think>"):
last_idx -= len("</think>")
return re.sub(r"(<think>|</think>)", "", delta_ans)
if not self.imgs:
for txt in self.chat_mdl.chat_streamly(msg[0]["content"], msg[1:], self._param.gen_conf(), **kwargs):
yield delta(txt)
else:
for txt in self.chat_mdl.chat_streamly(msg[0]["content"], msg[1:], self._param.gen_conf(), images=self.imgs, **kwargs):
yield delta(txt)
return await self.chat_mdl.async_chat(msg[0]["content"], msg[1:], self._param.gen_conf(), **kwargs)
return await self.chat_mdl.async_chat(msg[0]["content"], msg[1:], self._param.gen_conf(), images=self.imgs, **kwargs)
async def _generate_streamly_async(self, msg: list[dict], **kwargs) -> AsyncGenerator[str, None]:
async def _generate_streamly(self, msg: list[dict], **kwargs) -> AsyncGenerator[str, None]:
async def delta_wrapper(txt_iter):
ans = ""
last_idx = 0
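
Both the deleted synchronous generator and the surviving `delta_wrapper` (continued in the next hunk) rely on the same trick: the chat backends yield the full answer accumulated so far on every step, and a closure converts those snapshots into incremental chunks while handling `<think>`/`</think>` markers separately. A stripped-down sketch of the snapshot-to-delta idea, without the think-tag handling:

```python
from typing import AsyncGenerator, AsyncIterator

async def snapshots_to_deltas(snapshots: AsyncIterator[str]) -> AsyncGenerator[str, None]:
    # Each item from the model is the whole answer so far; emit only what is new.
    last_idx = 0
    async for txt in snapshots:
        yield txt[last_idx:]
        last_idx = len(txt)
```
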
@@ -246,36 +204,13 @@ class LLM(ComponentBase):
async for t in txt_iter:
yield delta(t)
if not self.imgs and hasattr(self.chat_mdl, "async_chat_streamly"):
if not self.imgs:
async for t in delta_wrapper(self.chat_mdl.async_chat_streamly(msg[0]["content"], msg[1:], self._param.gen_conf(), **kwargs)):
yield t
return
if self.imgs and hasattr(self.chat_mdl, "async_chat_streamly"):
async for t in delta_wrapper(self.chat_mdl.async_chat_streamly(msg[0]["content"], msg[1:], self._param.gen_conf(), images=self.imgs, **kwargs)):
yield t
return
# fallback
loop = asyncio.get_running_loop()
queue: asyncio.Queue = asyncio.Queue()
def worker():
try:
for item in self._generate_streamly(msg, **kwargs):
loop.call_soon_threadsafe(queue.put_nowait, item)
except Exception as e:
loop.call_soon_threadsafe(queue.put_nowait, e)
finally:
loop.call_soon_threadsafe(queue.put_nowait, StopAsyncIteration)
threading.Thread(target=worker, daemon=True).start()
while True:
item = await queue.get()
if item is StopAsyncIteration:
break
if isinstance(item, Exception):
raise item
yield item
async for t in delta_wrapper(self.chat_mdl.async_chat_streamly(msg[0]["content"], msg[1:], self._param.gen_conf(), images=self.imgs, **kwargs)):
yield t
async def _stream_output_async(self, prompt, msg):
_, msg = message_fit_in([{"role": "system", "content": prompt}, *msg], int(self.chat_mdl.max_length * 0.97))
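
The deleted block was a fallback for chat models without `async_chat_streamly`; the new code assumes every model provides it. For reference, the removed fallback boils down to this generic bridge for driving a blocking generator from async code:

```python
import asyncio
import threading

async def iterate_in_thread(blocking_gen):
    # Drive a blocking generator on a daemon thread and relay its items through a queue.
    loop = asyncio.get_running_loop()
    queue: asyncio.Queue = asyncio.Queue()

    def worker():
        try:
            for item in blocking_gen:
                loop.call_soon_threadsafe(queue.put_nowait, item)
        except Exception as e:
            loop.call_soon_threadsafe(queue.put_nowait, e)
        finally:
            loop.call_soon_threadsafe(queue.put_nowait, StopAsyncIteration)

    threading.Thread(target=worker, daemon=True).start()
    while True:
        item = await queue.get()
        if item is StopAsyncIteration:
            break
        if isinstance(item, Exception):
            raise item
        yield item
```
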
@@ -407,8 +342,8 @@ class LLM(ComponentBase):
def _invoke(self, **kwargs):
return asyncio.run(self._invoke_async(**kwargs))
def add_memory(self, user:str, assist:str, func_name: str, params: dict, results: str, user_defined_prompt:dict={}):
summ = tool_call_summary(self.chat_mdl, func_name, params, results, user_defined_prompt)
async def add_memory(self, user:str, assist:str, func_name: str, params: dict, results: str, user_defined_prompt:dict={}):
summ = await tool_call_summary(self.chat_mdl, func_name, params, results, user_defined_prompt)
logging.info(f"[MEMORY]: {summ}")
self._canvas.add_memory(user, assist, summ)
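
Because `tool_call_summary` is awaited, `add_memory` itself becomes a coroutine, so every call site has to await it. A hypothetical call site:

```python
async def record_tool_call(agent, user_msg, assistant_msg, func_name, params, results):
    # add_memory is now a coroutine; calling it without await would only create a
    # coroutine object that never runs.
    await agent.add_memory(user_msg, assistant_msg, func_name, params, results)
```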