From 41cff3e09e03a2a141b96a1b1067de60222aa27f Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Mon, 1 Dec 2025 14:24:35 +0800 Subject: [PATCH] Fix: jina embedding issue (#11628) ### What problem does this PR solve? Fix: jina embedding issue #11614 Feat: Add jina embedding v4 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- conf/llm_factories.json | 6 +++++ rag/llm/embedding_model.py | 50 +++++++++++++++----------------------- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/conf/llm_factories.json b/conf/llm_factories.json index d3b2dcc1c..3c84bd03d 100644 --- a/conf/llm_factories.json +++ b/conf/llm_factories.json @@ -1194,6 +1194,12 @@ "tags": "TEXT EMBEDDING", "max_tokens": 8196, "model_type": "embedding" + }, + { + "llm_name": "jina-embeddings-v4", + "tags": "TEXT EMBEDDING", + "max_tokens": 32768, + "model_type": "embedding" } ] }, diff --git a/rag/llm/embedding_model.py b/rag/llm/embedding_model.py index 7f2f9ee7d..445ecab5a 100644 --- a/rag/llm/embedding_model.py +++ b/rag/llm/embedding_model.py @@ -349,35 +349,6 @@ class YoudaoEmbed(Base): return np.array(embds[0]), num_tokens_from_string(text) -class JinaEmbed(Base): - _FACTORY_NAME = "Jina" - - def __init__(self, key, model_name="jina-embeddings-v3", base_url="https://api.jina.ai/v1/embeddings"): - self.base_url = "https://api.jina.ai/v1/embeddings" - self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"} - self.model_name = model_name - - def encode(self, texts: list): - texts = [truncate(t, 8196) for t in texts] - batch_size = 16 - ress = [] - token_count = 0 - for i in range(0, len(texts), batch_size): - data = {"model": self.model_name, "input": texts[i : i + batch_size], "encoding_type": "float"} - response = requests.post(self.base_url, headers=self.headers, json=data) - try: - res = response.json() - ress.extend([d["embedding"] for d in res["data"]]) - token_count += self.total_token_count(res) - except Exception as _e: - log_exception(_e, response) - return np.array(ress), token_count - - def encode_queries(self, text): - embds, cnt = self.encode([text]) - return np.array(embds[0]), cnt - - class JinaMultiVecEmbed(Base): _FACTORY_NAME = "Jina" @@ -403,11 +374,28 @@ class JinaMultiVecEmbed(Base): img_b64s = base64.b64encode(text).decode('utf8') input.append({"image": img_b64s}) # base64 encoded image for i in range(0, len(texts), batch_size): - data = {"model": self.model_name, "task": task, "truncate": True, "return_multivector": True, "input": input[i : i + batch_size]} + data = {"model": self.model_name, "input": input[i : i + batch_size]} + if "v4" in self.model_name: + data["return_multivector"] = True + + if "v3" in self.model_name or "v4" in self.model_name: + data['task'] = task + data['truncate'] = True + response = requests.post(self.base_url, headers=self.headers, json=data) try: res = response.json() - ress.extend([d["embeddings"] for d in res["data"]]) + for d in res['data']: + if data.get("return_multivector", False): # v4 + token_embs = np.asarray(d['embeddings'], dtype=np.float32) + chunk_emb = token_embs.mean(axis=0) + + else: + # v2/v3 + chunk_emb = np.asarray(d['embedding'], dtype=np.float32) + + ress.append(chunk_emb) + token_count += self.total_token_count(res) except Exception as _e: log_exception(_e, response)