From 6361fc4b3317e35c5747f38eb222e03f8eebe3b5 Mon Sep 17 00:00:00 2001
From: Yongteng Lei
Date: Thu, 5 Feb 2026 12:47:04 +0800
Subject: [PATCH] Feat: update stepfun list (#12991)

### What problem does this PR solve?

Update the StepFun model list. Add TTS and sequence2text (ASR) functionalities.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 conf/llm_factories.json       | 81 +++++++++++++++++++++++++++++++----
 rag/llm/sequence2txt_model.py |  9 ++++
 rag/llm/tts_model.py          | 43 ++++++++++++++++++-
 3 files changed, 122 insertions(+), 11 deletions(-)

diff --git a/conf/llm_factories.json b/conf/llm_factories.json
index d31d70790..1face892a 100644
--- a/conf/llm_factories.json
+++ b/conf/llm_factories.json
@@ -1600,9 +1600,30 @@
         {
             "name": "StepFun",
             "logo": "",
-            "tags": "LLM",
+            "tags": "LLM,IMAGE2TEXT,SPEECH2TEXT,TTS",
             "status": "1",
             "llm": [
+                {
+                    "llm_name": "step-3",
+                    "tags": "LLM,CHAT,IMAGE2TEXT,64k",
+                    "max_tokens": 65536,
+                    "model_type": "image2text",
+                    "is_tools": true
+                },
+                {
+                    "llm_name": "step-2-mini",
+                    "tags": "LLM,CHAT,32k",
+                    "max_tokens": 32768,
+                    "model_type": "chat",
+                    "is_tools": true
+                },
+                {
+                    "llm_name": "step-2-16k",
+                    "tags": "LLM,CHAT,16k",
+                    "max_tokens": 16384,
+                    "model_type": "chat",
+                    "is_tools": true
+                },
                 {
                     "llm_name": "step-1-8k",
                     "tags": "LLM,CHAT,8k",
                     "max_tokens": 8192,
                     "model_type": "chat",
                     "is_tools": true
                 },
@@ -1617,13 +1638,6 @@
                     "model_type": "chat",
                     "is_tools": true
                 },
-                {
-                    "llm_name": "step-1-128k",
-                    "tags": "LLM,CHAT,128k",
-                    "max_tokens": 131072,
-                    "model_type": "chat",
-                    "is_tools": true
-                },
                 {
                     "llm_name": "step-1-256k",
                     "tags": "LLM,CHAT,256k",
                     "max_tokens": 262144,
                     "model_type": "chat",
                     "is_tools": true
                 },
@@ -1631,12 +1645,61 @@
                     "model_type": "chat",
                     "is_tools": true
                 },
+                {
+                    "llm_name": "step-r1-v-mini",
+                    "tags": "LLM,CHAT,IMAGE2TEXT,100k",
+                    "max_tokens": 102400,
+                    "model_type": "image2text",
+                    "is_tools": true
+                },
                 {
                     "llm_name": "step-1v-8k",
-                    "tags": "LLM,CHAT,IMAGE2TEXT",
+                    "tags": "LLM,CHAT,IMAGE2TEXT,8k",
                     "max_tokens": 8192,
                     "model_type": "image2text",
                     "is_tools": true
+                },
+                {
+                    "llm_name": "step-1v-32k",
+                    "tags": "LLM,CHAT,IMAGE2TEXT,32k",
+                    "max_tokens": 32768,
+                    "model_type": "image2text",
+                    "is_tools": true
+                },
+                {
+                    "llm_name": "step-1o-vision-32k",
+                    "tags": "LLM,CHAT,IMAGE2TEXT,32k",
+                    "max_tokens": 32768,
+                    "model_type": "image2text",
+                    "is_tools": true
+                },
+                {
+                    "llm_name": "step-1o-turbo-vision",
+                    "tags": "LLM,CHAT,IMAGE2TEXT,32k",
+                    "max_tokens": 32768,
+                    "model_type": "image2text",
+                    "is_tools": true
+                },
+                {
+                    "llm_name": "step-tts-mini",
+                    "tags": "TTS,1000c",
+                    "max_tokens": 1000,
+                    "model_type": "tts",
+                    "is_tools": false
+                },
+                {
+                    "llm_name": "step-tts-vivid",
+                    "tags": "TTS,1000c",
+                    "max_tokens": 1000,
+                    "model_type": "tts",
+                    "is_tools": false
+                },
+                {
+                    "llm_name": "step-asr",
+                    "tags": "SPEECH2TEXT,100MB",
+                    "max_tokens": 32768,
+                    "model_type": "speech2text",
+                    "is_tools": false
                 }
             ]
         },
diff --git a/rag/llm/sequence2txt_model.py b/rag/llm/sequence2txt_model.py
index e5839afd1..abbdb4de3 100644
--- a/rag/llm/sequence2txt_model.py
+++ b/rag/llm/sequence2txt_model.py
@@ -59,6 +59,15 @@ class GPTSeq2txt(Base):
         self.model_name = model_name
 
 
+class StepFunSeq2txt(GPTSeq2txt):
+    _FACTORY_NAME = "StepFun"
+
+    def __init__(self, key, model_name="step-asr", lang="Chinese", base_url="https://api.stepfun.com/v1", **kwargs):
+        if not base_url:
+            base_url = "https://api.stepfun.com/v1"
+        super().__init__(key, model_name=model_name, base_url=base_url, **kwargs)
+
+
 class QWenSeq2txt(Base):
     _FACTORY_NAME = "Tongyi-Qianwen"
 
diff --git a/rag/llm/tts_model.py b/rag/llm/tts_model.py
index de269320d..035d8412b 100644
--- a/rag/llm/tts_model.py
+++ b/rag/llm/tts_model.py
@@ -19,6 +19,7 @@ import base64
 import hashlib
 import hmac
 import json
+import os
 import queue
 import re
 import ssl
@@ -36,6 +37,7 @@ import requests
 import websocket
 from pydantic import BaseModel, conint
 
+from common.http_client import sync_request
 from common.token_utils import num_tokens_from_string
 
 
@@ -387,6 +389,7 @@ class SILICONFLOWTTS(Base):
             if chunk:
                 yield chunk
 
+
 class DeepInfraTTS(OpenAITTS):
     _FACTORY_NAME = "DeepInfra"
 
@@ -394,7 +397,8 @@ class DeepInfraTTS(OpenAITTS):
         if not base_url:
             base_url = "https://api.deepinfra.com/v1/openai"
         super().__init__(key, model_name, base_url, **kwargs)
-
+
+
 class CometAPITTS(OpenAITTS):
     _FACTORY_NAME = "CometAPI"
 
@@ -402,7 +406,8 @@ class CometAPITTS(OpenAITTS):
         if not base_url:
             base_url = "https://api.cometapi.com/v1"
         super().__init__(key, model_name, base_url, **kwargs)
-
+
+
 class DeerAPITTS(OpenAITTS):
     _FACTORY_NAME = "DeerAPI"
 
@@ -410,3 +415,37 @@ class DeerAPITTS(OpenAITTS):
         if not base_url:
             base_url = "https://api.deerapi.com/v1"
         super().__init__(key, model_name, base_url, **kwargs)
+
+
+class StepFunTTS(OpenAITTS):
+    _FACTORY_NAME = "StepFun"
+    _SUPPORTED_RESPONSE_FORMATS = {"wav", "mp3", "flac", "opus", "pcm"}
+
+    def __init__(self, key, model_name, base_url="https://api.stepfun.com/v1", **kwargs):
+        if not base_url:
+            base_url = "https://api.stepfun.com/v1"
+        self.default_voice = os.environ.get("STEPFUN_TTS_VOICE") or "cixingnansheng"
+        super().__init__(key, model_name, base_url, **kwargs)
+
+    def tts(self, text, voice=None, response_format: Literal["wav", "mp3", "flac", "opus", "pcm"] = "mp3"):
+        text = self.normalize_text(text)
+        if response_format not in self._SUPPORTED_RESPONSE_FORMATS:
+            raise ValueError(f"Unsupported response_format={response_format!r}. Supported: {sorted(self._SUPPORTED_RESPONSE_FORMATS)}")
+
+        payload = {
+            "model": self.model_name,
+            "voice": voice or self.default_voice,
+            "input": text,
+            "response_format": response_format,
+        }
+
+        response = sync_request("POST", f"{self.base_url}/audio/speech", headers=self.headers, json=payload)
+
+        if response.status_code != 200:
+            raise Exception(f"**Error**: {response.status_code}, {response.text}")
+
+        for chunk in response.iter_bytes():
+            if chunk:
+                yield chunk
+
+        yield num_tokens_from_string(text)
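For reviewers, here is a minimal usage sketch of the two new classes; it is not part of the patch. It assumes the API key is supplied via a hypothetical `STEPFUN_API_KEY` environment variable and the output file names are illustrative. The `StepFunTTS` part uses only the interface visible in this diff; the `transcription(...)` call and its return shape for `StepFunSeq2txt` are assumptions based on the inherited `GPTSeq2txt` base class, which is not shown here.

```python
import os

from rag.llm.sequence2txt_model import StepFunSeq2txt
from rag.llm.tts_model import StepFunTTS

# Assumption: the StepFun API key is provided through an environment variable.
api_key = os.environ["STEPFUN_API_KEY"]

# Text-to-speech: StepFunTTS.tts() is a generator that yields raw audio chunks
# and, as its final item, the token count of the input text.
tts = StepFunTTS(api_key, "step-tts-mini")
with open("demo.mp3", "wb") as audio_out:
    for chunk in tts.tts("Hello from the StepFun text-to-speech model.", response_format="mp3"):
        if isinstance(chunk, int):  # last item: number of tokens consumed
            print(f"TTS input was {chunk} tokens")
        else:
            audio_out.write(chunk)

# Speech-to-text: StepFunSeq2txt points the OpenAI-compatible GPTSeq2txt client
# at the StepFun endpoint. The transcription() call below and its
# (text, token_count) return value are assumed from the base class and are not
# part of this diff.
asr = StepFunSeq2txt(api_key, model_name="step-asr")
with open("demo.mp3", "rb") as audio_in:
    text, token_count = asr.transcription(audio_in)
print(text)
```

Note that the default TTS voice comes from the `STEPFUN_TTS_VOICE` environment variable (falling back to `cixingnansheng`), and `tts()` accepts a `voice` argument to override it per call.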