Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 20:42:30 +08:00)
Fix: incorrect retrieval total count with pagination enabled (#11400)
### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
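For context on the diff below: before this change, `retrieval` sliced the candidate similarity list down to the requested page before applying the similarity threshold and counting, so `ranks["total"]` could never exceed `page_size`. The fix filters and counts across all retrieved candidates first and only then slices out the requested page. The following standalone sketch illustrates the difference with made-up scores; it is a simplified illustration, not the actual `Dealer.retrieval` code, and the helper `paginate` is hypothetical.

```python
def paginate(sim, page, page_size, similarity_threshold, old_behavior=False):
    """Return (total, page_of_scores) for a list of similarity scores."""
    if old_behavior:
        # Old logic: slice to the current page first, then count above-threshold
        # scores, so "total" is capped at page_size.
        begin = (page - 1) * page_size
        window = sim[begin:begin + page_size]
        kept = [s for s in window if s >= similarity_threshold]
        return len(kept), kept
    # Fixed logic: filter and count across all candidates, then slice the page.
    kept = sorted((s for s in sim if s >= similarity_threshold), reverse=True)
    begin = (page - 1) * page_size
    return len(kept), kept[begin:begin + page_size]

scores = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.1]   # six scores >= 0.2
print(paginate(scores, page=1, page_size=2, similarity_threshold=0.2, old_behavior=True))
# -> (2, [0.9, 0.8])   total wrongly capped at the page size
print(paginate(scores, page=1, page_size=2, similarity_threshold=0.2))
# -> (6, [0.9, 0.8])   total correctly reflects all matching chunks
```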
```diff
@@ -355,75 +355,102 @@ class Dealer:
                                             rag_tokenizer.tokenize(ans).split(),
                                             rag_tokenizer.tokenize(inst).split())

-    def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2,
-                  vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True,
-                  rerank_mdl=None, highlight=False,
-                  rank_feature: dict | None = {PAGERANK_FLD: 10}):
+    def retrieval(
+        self,
+        question,
+        embd_mdl,
+        tenant_ids,
+        kb_ids,
+        page,
+        page_size,
+        similarity_threshold=0.2,
+        vector_similarity_weight=0.3,
+        top=1024,
+        doc_ids=None,
+        aggs=True,
+        rerank_mdl=None,
+        highlight=False,
+        rank_feature: dict | None = {PAGERANK_FLD: 10},
+    ):
         ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
         if not question:
             return ranks

         # Ensure RERANK_LIMIT is multiple of page_size
-        RERANK_LIMIT = math.ceil(64/page_size) * page_size if page_size>1 else 1
-        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "page": math.ceil(page_size*page/RERANK_LIMIT), "size": RERANK_LIMIT,
-               "question": question, "vector": True, "topk": top,
-               "similarity": similarity_threshold,
-               "available_int": 1}
+        RERANK_LIMIT = math.ceil(64 / page_size) * page_size if page_size > 1 else 1
+        req = {
+            "kb_ids": kb_ids,
+            "doc_ids": doc_ids,
+            "page": math.ceil(page_size * page / RERANK_LIMIT),
+            "size": RERANK_LIMIT,
+            "question": question,
+            "vector": True,
+            "topk": top,
+            "similarity": similarity_threshold,
+            "available_int": 1,
+        }

         if isinstance(tenant_ids, str):
             tenant_ids = tenant_ids.split(",")

-        sres = self.search(req, [index_name(tid) for tid in tenant_ids],
-                           kb_ids, embd_mdl, highlight, rank_feature=rank_feature)
+        sres = self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight, rank_feature=rank_feature)

         if rerank_mdl and sres.total > 0:
-            sim, tsim, vsim = self.rerank_by_model(rerank_mdl,
-                                                   sres, question, 1 - vector_similarity_weight,
-                                                   vector_similarity_weight,
-                                                   rank_feature=rank_feature)
+            sim, tsim, vsim = self.rerank_by_model(
+                rerank_mdl,
+                sres,
+                question,
+                1 - vector_similarity_weight,
+                vector_similarity_weight,
+                rank_feature=rank_feature,
+            )
         else:
-            lower_case_doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch')
-            if lower_case_doc_engine in ["elasticsearch","opensearch"]:
+            lower_case_doc_engine = os.getenv("DOC_ENGINE", "elasticsearch")
+            if lower_case_doc_engine in ["elasticsearch", "opensearch"]:
                 # ElasticSearch doesn't normalize each way score before fusion.
                 sim, tsim, vsim = self.rerank(
-                    sres, question, 1 - vector_similarity_weight, vector_similarity_weight,
-                    rank_feature=rank_feature)
+                    sres,
+                    question,
+                    1 - vector_similarity_weight,
+                    vector_similarity_weight,
+                    rank_feature=rank_feature,
+                )
             else:
                 # Don't need rerank here since Infinity normalizes each way score before fusion.
                 sim = [sres.field[id].get("_score", 0.0) for id in sres.ids]
-                sim = [s if s is not None else 0. for s in sim]
+                sim = [s if s is not None else 0.0 for s in sim]
                 tsim = sim
                 vsim = sim
-        # Already paginated in search function
-        max_pages = RERANK_LIMIT // page_size
-        page_index = (page % max_pages) - 1
-        begin = max(page_index * page_size, 0)
-        sim = sim[begin : begin + page_size]
         sim_np = np.array(sim, dtype=np.float64)
-        idx = np.argsort(sim_np * -1)
+        if sim_np.size == 0:
+            return ranks
+
+        sorted_idx = np.argsort(sim_np * -1)
+
+        valid_idx = [int(i) for i in sorted_idx if sim_np[i] >= similarity_threshold]
+        filtered_count = len(valid_idx)
+        ranks["total"] = int(filtered_count)
+
+        if filtered_count == 0:
+            return ranks
+
+        max_pages = max(RERANK_LIMIT // max(page_size, 1), 1)
+        page_index = (page - 1) % max_pages
+        begin = page_index * page_size
+        end = begin + page_size
+        page_idx = valid_idx[begin:end]
+
         dim = len(sres.query_vector)
         vector_column = f"q_{dim}_vec"
         zero_vector = [0.0] * dim
-        filtered_count = (sim_np >= similarity_threshold).sum()
-        ranks["total"] = int(filtered_count)  # Convert from np.int64 to Python int otherwise JSON serializable error
-        for i in idx:
-            if np.float64(sim[i]) < similarity_threshold:
-                break
-
+        for i in page_idx:
             id = sres.ids[i]
             chunk = sres.field[id]
             dnm = chunk.get("docnm_kwd", "")
             did = chunk.get("doc_id", "")
-
-            if len(ranks["chunks"]) >= page_size:
-                if aggs:
-                    if dnm not in ranks["doc_aggs"]:
-                        ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
-                    ranks["doc_aggs"][dnm]["count"] += 1
-                    continue
-                break
-
             position_int = chunk.get("position_int", [])
             d = {
                 "chunk_id": id,
```
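To make the window arithmetic in the hunk above concrete, here is a small standalone sketch of how a requested page is mapped first to a search window (`req["page"]`, with `RERANK_LIMIT` candidates per window) and then to an offset inside that window (`page_index`, `begin`, `end`). The numbers are made up and the helper `page_window` is illustrative only; it is not part of the commit.

```python
import math

def page_window(page, page_size):
    """Mirror the window arithmetic from the patch for a given page/page_size."""
    # RERANK_LIMIT is the smallest multiple of page_size that is >= 64.
    RERANK_LIMIT = math.ceil(64 / page_size) * page_size if page_size > 1 else 1
    search_page = math.ceil(page_size * page / RERANK_LIMIT)   # which window to fetch
    max_pages = max(RERANK_LIMIT // max(page_size, 1), 1)      # pages per window
    page_index = (page - 1) % max_pages                        # page offset inside the window
    begin = page_index * page_size
    end = begin + page_size
    return RERANK_LIMIT, search_page, begin, end

# page 7 with page_size=10: RERANK_LIMIT=70, fetch window 1, slice rows 60..70
print(page_window(7, 10))   # (70, 1, 60, 70)
# page 8 with page_size=10: fetch window 2, and the slice wraps back to rows 0..10
print(page_window(8, 10))   # (70, 2, 0, 10)
```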
```diff
@@ -434,12 +461,12 @@ class Dealer:
                 "kb_id": chunk["kb_id"],
                 "important_kwd": chunk.get("important_kwd", []),
                 "image_id": chunk.get("img_id", ""),
-                "similarity": sim[i],
-                "vector_similarity": vsim[i],
-                "term_similarity": tsim[i],
+                "similarity": float(sim_np[i]),
+                "vector_similarity": float(vsim[i]),
+                "term_similarity": float(tsim[i]),
                 "vector": chunk.get(vector_column, zero_vector),
                 "positions": position_int,
-                "doc_type_kwd": chunk.get("doc_type_kwd", "")
+                "doc_type_kwd": chunk.get("doc_type_kwd", ""),
             }
             if highlight and sres.highlight:
                 if id in sres.highlight:
```
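The `float(...)` casts in the hunk above keep the response payload made of plain Python numbers. Indexing a NumPy array returns NumPy scalar types, and some of them (notably `numpy.int64`, which the removed comment in the first hunk also mentioned) are rejected by the standard `json` encoder, which is presumably why the similarity fields are cast explicitly. A minimal sketch, assuming only NumPy and the standard library:

```python
import json
import numpy as np

sim_np = np.array([0.83, 0.41])

# A NumPy integer scalar is not a Python int, and json.dumps() rejects it.
try:
    json.dumps({"total": np.int64(2)})
except TypeError as e:
    print(e)  # Object of type int64 is not JSON serializable

# Casting to built-in types keeps the response serializable.
print(json.dumps({"total": int(np.int64(2)), "similarity": float(sim_np[0])}))
```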
```diff
@@ -447,15 +474,30 @@ class Dealer:
                 else:
                     d["highlight"] = d["content_with_weight"]
             ranks["chunks"].append(d)
-            if dnm not in ranks["doc_aggs"]:
-                ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
-            ranks["doc_aggs"][dnm]["count"] += 1
-        ranks["doc_aggs"] = [{"doc_name": k,
-                              "doc_id": v["doc_id"],
-                              "count": v["count"]} for k,
-                             v in sorted(ranks["doc_aggs"].items(),
-                                         key=lambda x: x[1]["count"] * -1)]
-        ranks["chunks"] = ranks["chunks"][:page_size]
+
+        if aggs:
+            for i in valid_idx:
+                id = sres.ids[i]
+                chunk = sres.field[id]
+                dnm = chunk.get("docnm_kwd", "")
+                did = chunk.get("doc_id", "")
+                if dnm not in ranks["doc_aggs"]:
+                    ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
+                ranks["doc_aggs"][dnm]["count"] += 1
+
+            ranks["doc_aggs"] = [
+                {
+                    "doc_name": k,
+                    "doc_id": v["doc_id"],
+                    "count": v["count"],
+                }
+                for k, v in sorted(
+                    ranks["doc_aggs"].items(),
+                    key=lambda x: x[1]["count"] * -1,
+                )
+            ]
+        else:
+            ranks["doc_aggs"] = []

         return ranks
```
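Finally, the hunk above rebuilds the per-document aggregation from every above-threshold candidate (`valid_idx`) rather than only the chunks on the current page, so `doc_aggs` stays consistent with the corrected `total`. A self-contained sketch of that aggregation pattern follows, with invented document names standing in for what the real code reads from `sres.field`:

```python
# Hypothetical data standing in for sres.field / valid_idx; only the
# aggregation pattern mirrors the patch.
chunks = [
    {"docnm_kwd": "report.pdf", "doc_id": "d1"},
    {"docnm_kwd": "report.pdf", "doc_id": "d1"},
    {"docnm_kwd": "faq.md", "doc_id": "d2"},
]

doc_aggs = {}
for chunk in chunks:                      # iterate every above-threshold chunk
    dnm, did = chunk["docnm_kwd"], chunk["doc_id"]
    if dnm not in doc_aggs:
        doc_aggs[dnm] = {"doc_id": did, "count": 0}
    doc_aggs[dnm]["count"] += 1

# Sort documents by descending chunk count, as the patch does.
doc_aggs = [
    {"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]}
    for k, v in sorted(doc_aggs.items(), key=lambda x: x[1]["count"] * -1)
]
print(doc_aggs)
# [{'doc_name': 'report.pdf', 'doc_id': 'd1', 'count': 2},
#  {'doc_name': 'faq.md', 'doc_id': 'd2', 'count': 1}]
```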