mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Don't rerank for infinity (#10579)
### What problem does this PR solve?

Reranking is not needed for Infinity, because Infinity normalizes the score of each retrieval path before fusion.

### Type of change

- [x] Refactoring
This commit is contained in:
@ -17,6 +17,7 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import math
|
import math
|
||||||
|
import os
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
@ -154,7 +155,7 @@ class Dealer:
|
|||||||
query_vector=q_vec,
|
query_vector=q_vec,
|
||||||
aggregation=aggs,
|
aggregation=aggs,
|
||||||
highlight=highlight,
|
highlight=highlight,
|
||||||
field=self.dataStore.getFields(res, src),
|
field=self.dataStore.getFields(res, src + ["_score"]),
|
||||||
keywords=keywords
|
keywords=keywords
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -354,10 +355,8 @@ class Dealer:
|
|||||||
if not question:
|
if not question:
|
||||||
return ranks
|
return ranks
|
||||||
|
|
||||||
RERANK_LIMIT = 64
|
# Ensure RERANK_LIMIT is multiple of page_size
|
||||||
RERANK_LIMIT = int(RERANK_LIMIT//page_size + ((RERANK_LIMIT%page_size)/(page_size*1.) + 0.5)) * page_size if page_size>1 else 1
|
RERANK_LIMIT = math.ceil(64/page_size) * page_size if page_size>1 else 1
|
||||||
if RERANK_LIMIT < 1: ## when page_size is very large the RERANK_LIMIT will be 0.
|
|
||||||
RERANK_LIMIT = 1
|
|
||||||
req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "page": math.ceil(page_size*page/RERANK_LIMIT), "size": RERANK_LIMIT,
|
req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "page": math.ceil(page_size*page/RERANK_LIMIT), "size": RERANK_LIMIT,
|
||||||
"question": question, "vector": True, "topk": top,
|
"question": question, "vector": True, "topk": top,
|
||||||
"similarity": similarity_threshold,
|
"similarity": similarity_threshold,
|
||||||
@ -376,15 +375,25 @@ class Dealer:
|
|||||||
vector_similarity_weight,
|
vector_similarity_weight,
|
||||||
rank_feature=rank_feature)
|
rank_feature=rank_feature)
|
||||||
else:
|
else:
|
||||||
|
lower_case_doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch')
|
||||||
|
if lower_case_doc_engine == "elasticsearch":
|
||||||
|
# ElasticSearch doesn't normalize each way score before fusion.
|
||||||
sim, tsim, vsim = self.rerank(
|
sim, tsim, vsim = self.rerank(
|
||||||
sres, question, 1 - vector_similarity_weight, vector_similarity_weight,
|
sres, question, 1 - vector_similarity_weight, vector_similarity_weight,
|
||||||
rank_feature=rank_feature)
|
rank_feature=rank_feature)
|
||||||
|
else:
|
||||||
|
# Don't need rerank here since Infinity normalizes each way score before fusion.
|
||||||
|
sim = [sres.field[id].get("_score", 0.0) for id in sres.ids]
|
||||||
|
tsim = sim
|
||||||
|
vsim = sim
|
||||||
# Already paginated in search function
|
# Already paginated in search function
|
||||||
idx = np.argsort(sim * -1)[(page - 1) * page_size:page * page_size]
|
begin = ((page % (RERANK_LIMIT//page_size)) - 1) * page_size
|
||||||
|
sim = sim[begin : begin + page_size]
|
||||||
|
sim_np = np.array(sim)
|
||||||
|
idx = np.argsort(sim_np * -1)
|
||||||
dim = len(sres.query_vector)
|
dim = len(sres.query_vector)
|
||||||
vector_column = f"q_{dim}_vec"
|
vector_column = f"q_{dim}_vec"
|
||||||
zero_vector = [0.0] * dim
|
zero_vector = [0.0] * dim
|
||||||
sim_np = np.array(sim)
|
|
||||||
filtered_count = (sim_np >= similarity_threshold).sum()
|
filtered_count = (sim_np >= similarity_threshold).sum()
|
||||||
ranks["total"] = int(filtered_count) # Convert from np.int64 to Python int otherwise JSON serializable error
|
ranks["total"] = int(filtered_count) # Convert from np.int64 to Python int otherwise JSON serializable error
|
||||||
for i in idx:
|
for i in idx:
|
||||||
|
|||||||
@ -445,8 +445,8 @@ class InfinityConnection(DocStoreConnection):
|
|||||||
self.connPool.release_conn(inf_conn)
|
self.connPool.release_conn(inf_conn)
|
||||||
res = concat_dataframes(df_list, output)
|
res = concat_dataframes(df_list, output)
|
||||||
if matchExprs:
|
if matchExprs:
|
||||||
res["Sum"] = res[score_column] + res[PAGERANK_FLD]
|
res["_score"] = res[score_column] + res[PAGERANK_FLD]
|
||||||
res = res.sort_values(by="Sum", ascending=False).reset_index(drop=True).drop(columns=["Sum"])
|
res = res.sort_values(by="_score", ascending=False).reset_index(drop=True)
|
||||||
res = res.head(limit)
|
res = res.head(limit)
|
||||||
logger.debug(f"INFINITY search final result: {str(res)}")
|
logger.debug(f"INFINITY search final result: {str(res)}")
|
||||||
return res, total_hits_count
|
return res, total_hits_count
|
||||||
|
|||||||
Reference in New Issue
Block a user