From 3d39b96c6fb45cce67b9f490a86456ff71e1f551 Mon Sep 17 00:00:00 2001
From: Lynn
Date: Thu, 11 Sep 2025 12:02:12 +0800
Subject: [PATCH] Fix: token num exceeded (#10046)

### What problem does this PR solve?

Fix text input exceeding the token limit when using SiliconFlow's embedding
models BAAI/bge-large-zh-v1.5 and BAAI/bge-large-en-v1.5 by truncating the
text before it is sent.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 rag/llm/embedding_model.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/rag/llm/embedding_model.py b/rag/llm/embedding_model.py
index d39e0f0cc..3b147885b 100644
--- a/rag/llm/embedding_model.py
+++ b/rag/llm/embedding_model.py
@@ -751,7 +751,11 @@ class SILICONFLOWEmbed(Base):
         token_count = 0
         for i in range(0, len(texts), batch_size):
             texts_batch = texts[i : i + batch_size]
-            texts_batch = [" " if not text.strip() else text for text in texts_batch]
+            if self.model_name in ["BAAI/bge-large-zh-v1.5", "BAAI/bge-large-en-v1.5"]:
+                # These models accept at most 512 tokens; truncating to 340 leaves a safe margin.
+                texts_batch = [" " if not text.strip() else truncate(text, 340) for text in texts_batch]
+            else:
+                texts_batch = [" " if not text.strip() else text for text in texts_batch]
             payload = {
                 "model": self.model_name,
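
For context, the `truncate` call referenced in the hunk limits tokens rather than characters. Below is a minimal sketch of such a helper, assuming a tiktoken `cl100k_base` encoder; the repository's actual `truncate` implementation may use a different tokenizer and differ in detail:

```python
# Hypothetical token-aware truncation helper (assumption: tiktoken-based;
# not the repo's actual implementation).
import tiktoken

_encoder = tiktoken.get_encoding("cl100k_base")


def truncate(string: str, max_len: int) -> str:
    """Return the prefix of `string` that decodes to at most `max_len` tokens."""
    return _encoder.decode(_encoder.encode(string)[:max_len])


# The 340-token cap stays well under the models' 512-token limit, since the
# service-side tokenizer may count more tokens than the local one does.
assert len(_encoder.encode(truncate("hello world " * 500, 340))) <= 340
```

The wide margin (340 of 512) is deliberate: the client's token count is only an approximation of SiliconFlow's server-side count, so truncating conservatively avoids rejected requests even when the two tokenizers disagree.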