From dd046be976293bfc47c696497a9a2d3ff00aab42 Mon Sep 17 00:00:00 2001 From: buua436 Date: Tue, 9 Dec 2025 09:34:01 +0800 Subject: [PATCH] Fix: parent-child chunking method (#11810) ### What problem does this PR solve? Fixes the parent-child chunking method: adds the `mom_id` field to the Infinity mapping, to the search field list, and to the chunk dict returned by search; skips chunks whose `mom_id` is not a non-blank string when grouping child chunks under their parent; sets `mom_id` on every child chunk at insert time (previously a child whose parent had already been seen was left without it); and keeps `docnm_kwd` on parent chunks. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- conf/infinity_mapping.json | 1 + rag/nlp/search.py | 6 ++++-- rag/svr/task_executor.py | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/conf/infinity_mapping.json b/conf/infinity_mapping.json index e68dd4f15..7a28d5754 100644 --- a/conf/infinity_mapping.json +++ b/conf/infinity_mapping.json @@ -2,6 +2,7 @@ "id": {"type": "varchar", "default": ""}, "doc_id": {"type": "varchar", "default": ""}, "kb_id": {"type": "varchar", "default": ""}, + "mom_id": {"type": "varchar", "default": ""}, "create_time": {"type": "varchar", "default": ""}, "create_timestamp_flt": {"type": "float", "default": 0.0}, "img_id": {"type": "varchar", "default": ""}, diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 1ca70f678..f5dd2d4de 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -91,7 +91,7 @@ class Dealer: ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int", "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", "question_kwd", "question_tks", "doc_type_kwd", - "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD]) + "available_int", "content_with_weight", "mom_id", PAGERANK_FLD, TAG_FLD]) kwds = set([]) qst = req.get("question", "") @@ -469,6 +469,7 @@ class Dealer: "vector": chunk.get(vector_column, zero_vector), "positions": position_int, "doc_type_kwd": chunk.get("doc_type_kwd", ""), + "mom_id": chunk.get("mom_id", ""), } if highlight and sres.highlight: if id in sres.highlight: @@ -650,7 +651,8 @@ class Dealer: i = 0 while i < len(chunks): ck = chunks[i] - if not ck.get("mom_id"): + mom_id = ck.get("mom_id") + if not isinstance(mom_id, 
str) or not mom_id.strip(): i += 1 continue mom_chunks[ck["mom_id"]].append(chunks.pop(i)) diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index b08aa7524..62693f24f 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -727,17 +727,17 @@ async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_c if not mom: continue id = xxhash.xxh64(mom.encode("utf-8")).hexdigest() + ck["mom_id"] = id if id in mother_ids: continue mother_ids.add(id) - ck["mom_id"] = id mom_ck = copy.deepcopy(ck) mom_ck["id"] = id mom_ck["content_with_weight"] = mom mom_ck["available_int"] = 0 flds = list(mom_ck.keys()) for fld in flds: - if fld not in ["id", "content_with_weight", "doc_id", "kb_id", "available_int", "position_int"]: + if fld not in ["id", "content_with_weight", "doc_id", "docnm_kwd", "kb_id", "available_int", "position_int"]: del mom_ck[fld] mothers.append(mom_ck)