From 6361fc4b3317e35c5747f38eb222e03f8eebe3b5 Mon Sep 17 00:00:00 2001
From: Yongteng Lei
Date: Thu, 5 Feb 2026 12:47:04 +0800
Subject: [PATCH] Feat: update stepfun list (#12991)

### What problem does this PR solve?

Update the StepFun model list. Add TTS and sequence2text (ASR) functionalities.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 conf/llm_factories.json       | 81 +++++++++++++++++++++++++++++++----
 rag/llm/sequence2txt_model.py |  9 ++++
 rag/llm/tts_model.py          | 43 ++++++++++++++++++-
 3 files changed, 122 insertions(+), 11 deletions(-)

diff --git a/conf/llm_factories.json b/conf/llm_factories.json
index d31d70790..1face892a 100644
--- a/conf/llm_factories.json
+++ b/conf/llm_factories.json
@@ -1600,9 +1600,30 @@
         {
             "name": "StepFun",
             "logo": "",
-            "tags": "LLM",
+            "tags": "LLM,IMAGE2TEXT,SPEECH2TEXT,TTS",
             "status": "1",
             "llm": [
+                {
+                    "llm_name": "step-3",
+                    "tags": "LLM,CHAT,IMAGE2TEXT,64k",
+                    "max_tokens": 65536,
+                    "model_type": "image2text",
+                    "is_tools": true
+                },
+                {
+                    "llm_name": "step-2-mini",
+                    "tags": "LLM,CHAT,32k",
+                    "max_tokens": 32768,
+                    "model_type": "chat",
+                    "is_tools": true
+                },
+                {
+                    "llm_name": "step-2-16k",
+                    "tags": "LLM,CHAT,16k",
+                    "max_tokens": 16384,
+                    "model_type": "chat",
+                    "is_tools": true
+                },
                 {
                     "llm_name": "step-1-8k",
                     "tags": "LLM,CHAT,8k",
                     "max_tokens": 8192,
                     "model_type": "chat",
                     "is_tools": true
                 },
@@ -1617,13 +1638,6 @@
                     "model_type": "chat",
                     "is_tools": true
                 },
-                {
-                    "llm_name": "step-1-128k",
-                    "tags": "LLM,CHAT,128k",
-                    "max_tokens": 131072,
-                    "model_type": "chat",
-                    "is_tools": true
-                },
                 {
                     "llm_name": "step-1-256k",
                     "tags": "LLM,CHAT,256k",
                     "max_tokens": 262144,
                     "model_type": "chat",
                     "is_tools": true
                 },
@@ -1631,12 +1645,61 @@
                     "model_type": "chat",
                     "is_tools": true
                 },
+                {
+                    "llm_name": "step-r1-v-mini",
+                    "tags": "LLM,CHAT,IMAGE2TEXT,100k",
+                    "max_tokens": 102400,
+                    "model_type": "image2text",
+                    "is_tools": true
+                },
                 {
                     "llm_name": "step-1v-8k",
-                    "tags": "LLM,CHAT,IMAGE2TEXT",
+                    "tags": "LLM,CHAT,IMAGE2TEXT,8k",
                     "max_tokens": 8192,
                     "model_type": "image2text",
                     "is_tools": true
+                },
+                {
+                    "llm_name": "step-1v-32k",
+                    "tags": "LLM,CHAT,IMAGE2TEXT,32k",
+                    "max_tokens": 32768,
+                    "model_type": "image2text",
+                    "is_tools": true
+                },
+                {
+                    "llm_name": "step-1o-vision-32k",
+                    "tags": "LLM,CHAT,IMAGE2TEXT,32k",
+                    "max_tokens": 32768,
+                    "model_type": "image2text",
+                    "is_tools": true
+                },
+                {
+                    "llm_name": "step-1o-turbo-vision",
+                    "tags": "LLM,CHAT,IMAGE2TEXT,32k",
+                    "max_tokens": 32768,
+                    "model_type": "image2text",
+                    "is_tools": true
+                },
+                {
+                    "llm_name": "step-tts-mini",
+                    "tags": "TTS,1000c",
+                    "max_tokens": 1000,
+                    "model_type": "tts",
+                    "is_tools": false
+                },
+                {
+                    "llm_name": "step-tts-vivid",
+                    "tags": "TTS,1000c",
+                    "max_tokens": 1000,
+                    "model_type": "tts",
+                    "is_tools": false
+                },
+                {
+                    "llm_name": "step-asr",
+                    "tags": "SPEECH2TEXT,100MB",
+                    "max_tokens": 32768,
+                    "model_type": "speech2text",
+                    "is_tools": false
                 }
             ]
         },
diff --git a/rag/llm/sequence2txt_model.py b/rag/llm/sequence2txt_model.py
index e5839afd1..abbdb4de3 100644
--- a/rag/llm/sequence2txt_model.py
+++ b/rag/llm/sequence2txt_model.py
@@ -59,6 +59,15 @@ class GPTSeq2txt(Base):
         self.model_name = model_name
 
 
+class StepFunSeq2txt(GPTSeq2txt):
+    _FACTORY_NAME = "StepFun"
+
+    def __init__(self, key, model_name="step-asr", lang="Chinese", base_url="https://api.stepfun.com/v1", **kwargs):
+        if not base_url:
+            base_url = "https://api.stepfun.com/v1"
+        super().__init__(key, model_name=model_name, base_url=base_url, **kwargs)
+
+
 class QWenSeq2txt(Base):
     _FACTORY_NAME = "Tongyi-Qianwen"
 
diff --git a/rag/llm/tts_model.py b/rag/llm/tts_model.py
index de269320d..035d8412b 100644
--- a/rag/llm/tts_model.py
+++ b/rag/llm/tts_model.py
@@ -19,6 +19,7 @@ import base64
 import hashlib
 import hmac
 import json
+import os
 import queue
 import re
 import ssl
@@ -36,6 +37,7 @@ import requests
 import websocket
 from pydantic import BaseModel, conint
 
+from common.http_client import sync_request
 from common.token_utils import num_tokens_from_string
 
 
@@ -387,6 +389,7 @@ class SILICONFLOWTTS(Base):
             if chunk:
                 yield chunk
 
+
 class DeepInfraTTS(OpenAITTS):
     _FACTORY_NAME = "DeepInfra"
 
@@ -394,7 +397,8 @@ class DeepInfraTTS(OpenAITTS):
         if not base_url:
             base_url = "https://api.deepinfra.com/v1/openai"
         super().__init__(key, model_name, base_url, **kwargs)
-
+
+
 class CometAPITTS(OpenAITTS):
     _FACTORY_NAME = "CometAPI"
 
@@ -402,7 +406,8 @@ class CometAPITTS(OpenAITTS):
         if not base_url:
             base_url = "https://api.cometapi.com/v1"
         super().__init__(key, model_name, base_url, **kwargs)
-
+
+
 class DeerAPITTS(OpenAITTS):
     _FACTORY_NAME = "DeerAPI"
 
@@ -410,3 +415,37 @@ class DeerAPITTS(OpenAITTS):
         if not base_url:
             base_url = "https://api.deerapi.com/v1"
         super().__init__(key, model_name, base_url, **kwargs)
+
+
+class StepFunTTS(OpenAITTS):
+    _FACTORY_NAME = "StepFun"
+    _SUPPORTED_RESPONSE_FORMATS = {"wav", "mp3", "flac", "opus", "pcm"}
+
+    def __init__(self, key, model_name, base_url="https://api.stepfun.com/v1", **kwargs):
+        if not base_url:
+            base_url = "https://api.stepfun.com/v1"
+        self.default_voice = os.environ.get("STEPFUN_TTS_VOICE") or "cixingnansheng"
+        super().__init__(key, model_name, base_url, **kwargs)
+
+    def tts(self, text, voice=None, response_format: Literal["wav", "mp3", "flac", "opus", "pcm"] = "mp3"):
+        text = self.normalize_text(text)
+        if response_format not in self._SUPPORTED_RESPONSE_FORMATS:
+            raise ValueError(f"Unsupported response_format={response_format!r}. Supported: {sorted(self._SUPPORTED_RESPONSE_FORMATS)}")
+
+        payload = {
+            "model": self.model_name,
+            "voice": voice or self.default_voice,
+            "input": text,
+            "response_format": response_format,
+        }
+
+        response = sync_request("POST", f"{self.base_url}/audio/speech", headers=self.headers, json=payload)
+
+        if response.status_code != 200:
+            raise Exception(f"**Error**: {response.status_code}, {response.text}")
+
+        for chunk in response.iter_bytes():
+            if chunk:
+                yield chunk
+
+        yield num_tokens_from_string(text)
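For reviewers, here is a minimal usage sketch of the two new classes; it is not part of the patch. It assumes the API key is supplied via a hypothetical `STEPFUN_API_KEY` environment variable and the output file names are illustrative. The `StepFunTTS` part uses only the interface visible in this diff; the `transcription(...)` call and its return shape for `StepFunSeq2txt` are assumptions based on the inherited `GPTSeq2txt` base class, which is not shown here.

```python
import os

from rag.llm.sequence2txt_model import StepFunSeq2txt
from rag.llm.tts_model import StepFunTTS

# Assumption: the StepFun API key is provided through an environment variable.
api_key = os.environ["STEPFUN_API_KEY"]

# Text-to-speech: StepFunTTS.tts() is a generator that yields raw audio chunks
# and, as its final item, the token count of the input text.
tts = StepFunTTS(api_key, "step-tts-mini")
with open("demo.mp3", "wb") as audio_out:
    for chunk in tts.tts("Hello from the StepFun text-to-speech model.", response_format="mp3"):
        if isinstance(chunk, int):  # last item: number of tokens consumed
            print(f"TTS input was {chunk} tokens")
        else:
            audio_out.write(chunk)

# Speech-to-text: StepFunSeq2txt points the OpenAI-compatible GPTSeq2txt client
# at the StepFun endpoint. The transcription() call below and its
# (text, token_count) return value are assumed from the base class and are not
# part of this diff.
asr = StepFunSeq2txt(api_key, model_name="step-asr")
with open("demo.mp3", "rb") as audio_in:
    text, token_count = asr.transcription(audio_in)
print(text)
```

Note that the default TTS voice comes from the `STEPFUN_TTS_VOICE` environment variable (falling back to `cixingnansheng`), and `tts()` accepts a `voice` argument to override it per call.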