# Don't release full image (#10654)

### What problem does this PR solve?

- Introduced a `gpu` profile in `.env`
- Added `Dockerfile_tei`
- Fixed `datrie`
- Removed the `LIGHTEN` flag (embedding-model selection now follows the Compose profile; see the sketch below)
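With `LIGHTEN` gone, the full-image behavior is decided at runtime rather than at build time: `BuiltinEmbed.__init__` (in the diff below) only wires up the built-in model when the active Compose profile contains a `tei-` entry. A minimal sketch of that check; the profile name `tei-gpu` is a hypothetical example, and only the `"tei-" in COMPOSE_PROFILES` test comes from the diff:

```python
import os

def builtin_embedding_enabled() -> bool:
    # Mirrors the gate in BuiltinEmbed.__init__: the built-in model is only
    # initialized when a "tei-" profile is active in Docker Compose.
    return "tei-" in os.getenv("COMPOSE_PROFILES", "")

os.environ["COMPOSE_PROFILES"] = "tei-gpu"  # hypothetical profile name
assert builtin_embedding_enabled()
```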

### Type of change

- [x] Documentation Update
- [x] Refactoring
Authored by Zhichang Yu on 2025-10-23 23:02:27 +08:00, committed via GitHub.
Commit 73144e278b, parent 92739ea804. 67 changed files with 2792 additions and 3608 deletions.


```diff
@@ -14,9 +14,7 @@
 # limitations under the License.
 #
 import json
-import logging
 import os
-import re
 import threading
 from abc import ABC
 from urllib.parse import urljoin
@@ -25,15 +23,14 @@ import dashscope
 import google.generativeai as genai
 import numpy as np
 import requests
-from huggingface_hub import snapshot_download
 from ollama import Client
 from openai import OpenAI
 from zhipuai import ZhipuAI
 
-from api import settings
-from api.utils.file_utils import get_home_cache_dir
 from api.utils.log_utils import log_exception
 from rag.utils import num_tokens_from_string, truncate
+from api import settings
+import logging
 
 
 class Base(ABC):
```
```diff
@@ -63,71 +60,42 @@ class Base(ABC):
         return 0
 
 
-class DefaultEmbedding(Base):
-    _FACTORY_NAME = "BAAI"
+class BuiltinEmbed(Base):
+    _FACTORY_NAME = "Builtin"
+    MAX_TOKENS = {"Qwen/Qwen3-Embedding-0.6B": 30000, "BAAI/bge-m3": 8000, "BAAI/bge-small-en-v1.5": 500}
     _model = None
     _model_name = ""
+    _max_tokens = 500
     _model_lock = threading.Lock()
 
     def __init__(self, key, model_name, **kwargs):
-        """
-        If you have trouble downloading HuggingFace models, -_^ this might help!!
-        For Linux:
-        export HF_ENDPOINT=https://hf-mirror.com
-        For Windows:
-        Good luck
-        ^_-
-        """
-        if not settings.LIGHTEN:
-            input_cuda_visible_devices = None
-            with DefaultEmbedding._model_lock:
-                import torch
-                from FlagEmbedding import FlagModel
-
-                if "CUDA_VISIBLE_DEVICES" in os.environ:
-                    input_cuda_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
-                    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # handle some issues with multiple GPUs when initializing the model
-                if not DefaultEmbedding._model or model_name != DefaultEmbedding._model_name:
-                    try:
-                        DefaultEmbedding._model = FlagModel(
-                            os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z0-9]+/", "", model_name)),
-                            query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
-                            use_fp16=torch.cuda.is_available(),
-                        )
-                        DefaultEmbedding._model_name = model_name
-                    except Exception:
-                        model_dir = snapshot_download(
-                            repo_id="BAAI/bge-large-zh-v1.5", local_dir=os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z0-9]+/", "", model_name)), local_dir_use_symlinks=False
-                        )
-                        DefaultEmbedding._model = FlagModel(model_dir, query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:", use_fp16=torch.cuda.is_available())
-                    finally:
-                        if input_cuda_visible_devices:
-                            # restore CUDA_VISIBLE_DEVICES
-                            os.environ["CUDA_VISIBLE_DEVICES"] = input_cuda_visible_devices
-        self._model = DefaultEmbedding._model
-        self._model_name = DefaultEmbedding._model_name
+        logging.info(f"Initialize BuiltinEmbed according to settings.EMBEDDING_CFG: {settings.EMBEDDING_CFG}")
+        embedding_cfg = settings.EMBEDDING_CFG
+        if not BuiltinEmbed._model and "tei-" in os.getenv("COMPOSE_PROFILES", ""):
+            with BuiltinEmbed._model_lock:
+                BuiltinEmbed._model_name = settings.EMBEDDING_MDL
+                BuiltinEmbed._max_tokens = BuiltinEmbed.MAX_TOKENS.get(settings.EMBEDDING_MDL, 500)
+                BuiltinEmbed._model = HuggingFaceEmbed(embedding_cfg["api_key"], settings.EMBEDDING_MDL, base_url=embedding_cfg["base_url"])
+        self._model = BuiltinEmbed._model
+        self._model_name = BuiltinEmbed._model_name
+        self._max_tokens = BuiltinEmbed._max_tokens
 
     def encode(self, texts: list):
         batch_size = 16
-        texts = [truncate(t, 2048) for t in texts]
+        texts = [truncate(t, self._max_tokens) for t in texts]
         token_count = 0
-        for t in texts:
-            token_count += num_tokens_from_string(t)
         ress = None
         for i in range(0, len(texts), batch_size):
+            embeddings, token_count_delta = self._model.encode(texts[i : i + batch_size])
+            token_count += token_count_delta
             if ress is None:
-                ress = self._model.encode(texts[i : i + batch_size], convert_to_numpy=True)
+                ress = embeddings
             else:
-                ress = np.concatenate((ress, self._model.encode(texts[i : i + batch_size], convert_to_numpy=True)), axis=0)
+                ress = np.concatenate((ress, embeddings), axis=0)
         return ress, token_count
 
     def encode_queries(self, text: str):
-        token_count = num_tokens_from_string(text)
-        return self._model.encode_queries([text], convert_to_numpy=False)[0][0].cpu().numpy(), token_count
+        return self._model.encode_queries(text)
 
 
 class OpenAIEmbed(Base):
```
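The rewritten `encode` above no longer counts tokens up front; it trusts the per-batch token count returned by the delegate and stitches the batch results together with `np.concatenate`. A standalone sketch of that loop under a hypothetical stand-in for `HuggingFaceEmbed.encode` (only the batching and accumulation mirror the diff):

```python
import numpy as np

def encode_batched(texts: list[str], encode_batch, batch_size: int = 16):
    """Mirror of the new BuiltinEmbed.encode loop: call a delegate per batch,
    accumulate its reported token counts, and concatenate the embeddings."""
    token_count = 0
    ress = None
    for i in range(0, len(texts), batch_size):
        embeddings, token_count_delta = encode_batch(texts[i : i + batch_size])
        token_count += token_count_delta
        ress = embeddings if ress is None else np.concatenate((ress, embeddings), axis=0)
    return ress, token_count

# Hypothetical delegate standing in for HuggingFaceEmbed.encode:
def fake_encode(batch):
    return np.ones((len(batch), 4)), sum(len(t.split()) for t in batch)

vecs, n_tokens = encode_batched(["hello world", "foo"] * 20, fake_encode)
print(vecs.shape, n_tokens)  # (40, 4) 60
```

The delegate is expected to return `(embeddings, token_count)` per batch, which is exactly the contract the batched `HuggingFaceEmbed.encode` in the last hunk satisfies.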
```diff
@@ -326,51 +294,6 @@ class OllamaEmbed(Base):
             log_exception(_e, res)
 
 
-class FastEmbed(DefaultEmbedding):
-    _FACTORY_NAME = "FastEmbed"
-
-    def __init__(
-        self,
-        key: str | None = None,
-        model_name: str = "BAAI/bge-small-en-v1.5",
-        cache_dir: str | None = None,
-        threads: int | None = None,
-        **kwargs,
-    ):
-        if not settings.LIGHTEN:
-            with FastEmbed._model_lock:
-                from fastembed import TextEmbedding
-
-                if not DefaultEmbedding._model or model_name != DefaultEmbedding._model_name:
-                    try:
-                        DefaultEmbedding._model = TextEmbedding(model_name, cache_dir, threads, **kwargs)
-                        DefaultEmbedding._model_name = model_name
-                    except Exception:
-                        cache_dir = snapshot_download(
-                            repo_id="BAAI/bge-small-en-v1.5", local_dir=os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z0-9]+/", "", model_name)), local_dir_use_symlinks=False
-                        )
-                        DefaultEmbedding._model = TextEmbedding(model_name, cache_dir, threads, **kwargs)
-        self._model = DefaultEmbedding._model
-        self._model_name = model_name
-
-    def encode(self, texts: list):
-        # Using the internal tokenizer to encode the texts and get the total
-        # number of tokens
-        encodings = self._model.model.tokenizer.encode_batch(texts)
-        total_tokens = sum(len(e) for e in encodings)
-
-        embeddings = [e.tolist() for e in self._model.embed(texts, batch_size=16)]
-
-        return np.array(embeddings), total_tokens
-
-    def encode_queries(self, text: str):
-        # Using the internal tokenizer to encode the texts and get the total
-        # number of tokens
-        encoding = self._model.model.tokenizer.encode(text)
-        embedding = next(self._model.query_embed(text))
-
-        return np.array(embedding), len(encoding.ids)
-
-
 class XinferenceEmbed(Base):
     _FACTORY_NAME = "Xinference"
```
```diff
@@ -407,14 +330,7 @@ class YoudaoEmbed(Base):
     _client = None
 
     def __init__(self, key=None, model_name="maidalun1020/bce-embedding-base_v1", **kwargs):
-        if not settings.LIGHTEN and not YoudaoEmbed._client:
-            from BCEmbedding import EmbeddingModel as qanthing
-
-            try:
-                logging.info("LOADING BCE...")
-                YoudaoEmbed._client = qanthing(model_name_or_path=os.path.join(get_home_cache_dir(), "bce-embedding-base_v1"))
-            except Exception:
-                YoudaoEmbed._client = qanthing(model_name_or_path=model_name.replace("maidalun1020", "InfiniFlow"))
+        pass
 
     def encode(self, texts: list):
         batch_size = 10
```
```diff
@@ -885,21 +801,18 @@ class HuggingFaceEmbed(Base):
         self.base_url = base_url or "http://127.0.0.1:8080"
 
     def encode(self, texts: list):
-        embeddings = []
-        for text in texts:
-            response = requests.post(f"{self.base_url}/embed", json={"inputs": text}, headers={"Content-Type": "application/json"})
-            if response.status_code == 200:
-                embedding = response.json()
-                embeddings.append(embedding[0])
-            else:
-                raise Exception(f"Error: {response.status_code} - {response.text}")
+        response = requests.post(f"{self.base_url}/embed", json={"inputs": texts}, headers={"Content-Type": "application/json"})
+        if response.status_code == 200:
+            embeddings = response.json()
+        else:
+            raise Exception(f"Error: {response.status_code} - {response.text}")
         return np.array(embeddings), sum([num_tokens_from_string(text) for text in texts])
 
-    def encode_queries(self, text):
+    def encode_queries(self, text: str):
         response = requests.post(f"{self.base_url}/embed", json={"inputs": text}, headers={"Content-Type": "application/json"})
         if response.status_code == 200:
-            embedding = response.json()
-            return np.array(embedding[0]), num_tokens_from_string(text)
+            embedding = response.json()[0]
+            return np.array(embedding), num_tokens_from_string(text)
         else:
             raise Exception(f"Error: {response.status_code} - {response.text}")
```