Feat: add primitive support for function calls (#6840)

### What problem does this PR solve?

This PR introduces **primitive support for function calls**,
enabling the system to handle basic function-call capabilities.
The feature is currently experimental and **not yet enabled
for general use**, as it is only supported by a subset of models,
namely Qwen and OpenAI models.
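For reference, both Qwen and OpenAI expose function calling through the same OpenAI-style tool schema, so a tool definition passed to this feature would look roughly like the following (the `get_weather` tool is illustrative, not part of this PR):

```python
# Illustrative OpenAI-style tool definition; the tool name and
# parameters here are examples, not taken from this PR.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Return the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "City name"}
                },
                "required": ["city"],
            },
        },
    }
]
```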

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Yongteng Lei
Date: 2025-04-08 16:09:03 +08:00
Committed by: GitHub
Parent: a20439bf81
Commit: dc2c74b249
5 changed files with 574 additions and 130 deletions

File: api/db/services/dialog_service.py

@@ -145,6 +145,9 @@ def chat(dialog, messages, stream=True, **kwargs):
         chat_mdl = LLMBundle(dialog.tenant_id, LLMType.IMAGE2TEXT, dialog.llm_id)
     else:
         chat_mdl = LLMBundle(dialog.tenant_id, LLMType.CHAT, dialog.llm_id)
 
+    toolcall_session, tools = kwargs.get("toolcall_session"), kwargs.get("tools")
+    if toolcall_session and tools:
+        chat_mdl.bind_tools(toolcall_session, tools)
 
     bind_llm_ts = timer()
@@ -338,7 +341,7 @@ def chat(dialog, messages, stream=True, **kwargs):
     langfuse_output = {"time_elapsed:": re.sub(r"\n", " \n", langfuse_output), "created_at": time.time()}
     # Add a condition check to call the end method only if langfuse_tracer exists
-    if langfuse_tracer and 'langfuse_generation' in locals():
+    if langfuse_tracer and "langfuse_generation" in locals():
         langfuse_generation.end(output=langfuse_output)
     return {"answer": think + answer, "reference": refs, "prompt": re.sub(r"\n", " \n", prompt), "created_at": time.time()}

File: api/db/services/llm_service.py

@@ -102,6 +102,9 @@ class TenantLLMService(CommonService):
         mdlnm, fid = TenantLLMService.split_model_name_and_factory(mdlnm)
         if model_config:
             model_config = model_config.to_dict()
+            llm = LLMService.query(llm_name=mdlnm) if not fid else LLMService.query(llm_name=mdlnm, fid=fid)
+            if llm:
+                model_config["is_tools"] = llm[0].is_tools
         if not model_config:
             if llm_type in [LLMType.EMBEDDING, LLMType.RERANK]:
                 llm = LLMService.query(llm_name=mdlnm) if not fid else LLMService.query(llm_name=mdlnm, fid=fid)
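With this change, `get_model_config` reports whether the underlying model supports tools; the returned dict gains an `is_tools` flag alongside existing fields such as `max_tokens`. Schematically (the shape and values here are illustrative, not taken from this PR):

```python
# Illustrative shape only; real entries come from the tenant's LLM rows.
model_config = {
    "llm_name": "qwen-max",  # example model name
    "max_tokens": 8192,      # read by LLMBundle.__init__ below
    "is_tools": True,        # newly copied from LLM.is_tools
}
```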
@@ -206,6 +209,8 @@ class LLMBundle:
         model_config = TenantLLMService.get_model_config(tenant_id, llm_type, llm_name)
         self.max_length = model_config.get("max_tokens", 8192)
+        self.is_tools = model_config.get("is_tools", False)
+
         langfuse_keys = TenantLangfuseService.filter_by_tenant(tenant_id=tenant_id)
         if langfuse_keys:
             langfuse = Langfuse(public_key=langfuse_keys.public_key, secret_key=langfuse_keys.secret_key, host=langfuse_keys.host)
@@ -215,6 +220,11 @@ class LLMBundle:
         else:
             self.langfuse = None
 
+    def bind_tools(self, toolcall_session, tools):
+        if not self.is_tools:
+            return
+        self.mdl.bind_tools(toolcall_session, tools)
+
     def encode(self, texts: list):
         if self.langfuse:
             generation = self.trace.generation(name="encode", model=self.llm_name, input={"texts": texts})
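The diff does not spell out the interface a `toolcall_session` must provide; assuming the bound model dispatches each tool call by name with parsed arguments (an assumption, not confirmed by this hunk), a minimal session could look like:

```python
import json


class EchoToolSession:
    """Hypothetical session: the method name and signature below are
    assumptions for illustration, not taken from this PR."""

    def tool_call(self, name: str, arguments: dict) -> str:
        # A real session would route to an actual tool implementation;
        # this one just echoes the request back as the tool result.
        return json.dumps({"tool": name, "arguments": arguments})
```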
@@ -307,11 +317,31 @@ class LLMBundle:
         if self.langfuse:
             span.end()
 
+    def _remove_reasoning_content(self, txt: str) -> str:
+        first_think_start = txt.find("<think>")
+        if first_think_start == -1:
+            return txt
+
+        last_think_end = txt.rfind("</think>")
+        if last_think_end == -1:
+            return txt
+
+        if last_think_end < first_think_start:
+            return txt
+
+        return txt[last_think_end + len("</think>") :]
+
     def chat(self, system, history, gen_conf):
         if self.langfuse:
             generation = self.trace.generation(name="chat", model=self.llm_name, input={"system": system, "history": history})
 
-        txt, used_tokens = self.mdl.chat(system, history, gen_conf)
+        chat = self.mdl.chat
+        if self.is_tools and self.mdl.is_tools:
+            chat = self.mdl.chat_with_tools
+
+        txt, used_tokens = chat(system, history, gen_conf)
+        txt = self._remove_reasoning_content(txt)
+
         if isinstance(txt, int) and not TenantLLMService.increase_usage(self.tenant_id, self.llm_type, used_tokens, self.llm_name):
             logging.error("LLMBundle.chat can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.llm_name, used_tokens))
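Note that `chat` now strips any `<think>...</think>` reasoning block before returning: the new helper keeps only the text after the last closing tag, provided the tags are sanely ordered. The same logic, checked standalone:

```python
def remove_reasoning_content(txt: str) -> str:
    # Mirrors LLMBundle._remove_reasoning_content above: if the text
    # contains a well-ordered <think>...</think> span, keep only what
    # follows the last </think>; otherwise return the text unchanged.
    first_think_start = txt.find("<think>")
    if first_think_start == -1:
        return txt
    last_think_end = txt.rfind("</think>")
    if last_think_end == -1 or last_think_end < first_think_start:
        return txt
    return txt[last_think_end + len("</think>"):]


assert remove_reasoning_content("<think>plan...</think>Final answer.") == "Final answer."
assert remove_reasoning_content("plain answer") == "plain answer"
```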
@@ -325,7 +355,12 @@ class LLMBundle:
             generation = self.trace.generation(name="chat_streamly", model=self.llm_name, input={"system": system, "history": history})
 
         ans = ""
-        for txt in self.mdl.chat_streamly(system, history, gen_conf):
+        chat_streamly = self.mdl.chat_streamly
+        if self.is_tools and self.mdl.is_tools:
+            chat_streamly = self.mdl.chat_streamly_with_tools
+
+        for txt in chat_streamly(system, history, gen_conf):
             if isinstance(txt, int):
                 if self.langfuse:
                     generation.end(output={"output": ans})
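As the `isinstance(txt, int)` check above suggests, the streaming generator yields text chunks and finishes with an integer token count. A consumer sketch under that assumption (`bundle` stands in for a constructed `LLMBundle`; whether chunks are deltas or cumulative text is wrapper-specific):

```python
# Sketch of draining the stream, assuming str chunks followed by a
# final int token count (as the isinstance check above implies).
latest, used_tokens = "", 0
for chunk in bundle.chat_streamly("You are a helpful assistant.", history, {"temperature": 0.7}):
    if isinstance(chunk, int):
        used_tokens = chunk  # final yield carries the token usage
        break
    latest = chunk           # most recent text from the model
print(latest, used_tokens)
```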