Feat: support parent-child in search procedure. (#11629)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2025-12-01 14:03:09 +08:00
committed by GitHub
parent 88a28212b3
commit 6ea4248bdc
3 changed files with 50 additions and 2 deletions

View File

@ -17,7 +17,7 @@ import json
import logging
import re
import math
from collections import OrderedDict
from collections import OrderedDict, defaultdict
from dataclasses import dataclass
from rag.prompts.generator import relevant_chunks_with_toc
@ -640,3 +640,50 @@ class Dealer:
chunks.append(d)
return sorted(chunks, key=lambda x:x["similarity"]*-1)[:topn]
def retrieval_by_children(self, chunks:list[dict], tenant_ids:list[str]):
if not chunks:
return []
idx_nms = [index_name(tid) for tid in tenant_ids]
mom_chunks = defaultdict([])
i = 0
while i < len(chunks):
ck = chunks[i]
if not ck.get("mom_id"):
i += 1
continue
mom_chunks[ck["mom_id"]].append(chunks.pop(i))
if not mom_chunks:
return chunks
if not chunks:
chunks = []
vector_size = 1024
for id, cks in mom_chunks.items():
chunk = self.dataStore.get(id, idx_nms, [ck["kb_id"] for ck in cks])
d = {
"chunk_id": id,
"content_ltks": " ".join([ck["content_ltks"] for ck in cks]),
"content_with_weight": chunk["content_with_weight"],
"doc_id": chunk["doc_id"],
"docnm_kwd": chunk.get("docnm_kwd", ""),
"kb_id": chunk["kb_id"],
"important_kwd": [kwd for ck in cks for kwd in ck.get("important_kwd", [])],
"image_id": chunk.get("img_id", ""),
"similarity": np.mean([ck["similarity"] for ck in cks]),
"vector_similarity": np.mean([ck["similarity"] for ck in cks]),
"term_similarity": np.mean([ck["similarity"] for ck in cks]),
"vector": [0.0] * vector_size,
"positions": chunk.get("position_int", []),
"doc_type_kwd": chunk.get("doc_type_kwd", "")
}
for k in cks[0].keys():
if k[-4:] == "_vec":
d["vector"] = cks[0][k]
vector_size = len(cks[0][k])
break
chunks.append(d)
return sorted(chunks, key=lambda x:x["similarity"]*-1)