Fix: parent-child chunking method (#11810)

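### What problem does this PR solve?

Change: parent-child chunking method. The `mom_id` field is added to the Infinity mapping, to the fields fetched during search, and to the returned chunk data; child chunks are grouped under a parent only when `mom_id` is a non-empty string; every child chunk now receives its `mom_id` even when its parent was already recorded; and parent chunks keep `docnm_kwd`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)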
2026-02-03 17:15:08 +08:00 · 2025-12-09 09:34:01 +08:00
parent 5c9672a265
commit dd046be976
3 changed files with 7 additions and 4 deletions
--- a/conf/infinity_mapping.json
+++ b/conf/infinity_mapping.json
@@ -2,6 +2,7 @@
 	"id": {"type": "varchar", "default": ""},
 	"doc_id": {"type": "varchar", "default": ""},
 	"kb_id": {"type": "varchar", "default": ""},
+	"mom_id": {"type": "varchar", "default": ""},
 	"create_time": {"type": "varchar", "default": ""},
 	"create_timestamp_flt": {"type": "float", "default": 0.0},
 	"img_id": {"type": "varchar", "default": ""},
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -91,7 +91,7 @@ class Dealer:
                      ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
                       "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
                       "question_kwd", "question_tks", "doc_type_kwd",
-                       "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD])
+                       "available_int", "content_with_weight", "mom_id", PAGERANK_FLD, TAG_FLD])
        kwds = set([])
        qst = req.get("question", "")
@@ -469,6 +469,7 @@ class Dealer:
                 "vector": chunk.get(vector_column, zero_vector),
                 "positions": position_int,
                 "doc_type_kwd": chunk.get("doc_type_kwd", ""),
+                "mom_id": chunk.get("mom_id", ""),
            }
            if highlight and sres.highlight:
                if id in sres.highlight:
@@ -650,7 +651,8 @@ class Dealer:
        i = 0
        while i < len(chunks):
            ck = chunks[i]
-            if not ck.get("mom_id"):
+            mom_id = ck.get("mom_id")
+            if not isinstance(mom_id, str) or not mom_id.strip():
                i += 1
                continue
            mom_chunks[ck["mom_id"]].append(chunks.pop(i))
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -727,17 +727,17 @@ async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_c
        if not mom:
            continue
        id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
+        ck["mom_id"] = id
        if id in mother_ids:
            continue
        mother_ids.add(id)
-        ck["mom_id"] = id
        mom_ck = copy.deepcopy(ck)
        mom_ck["id"] = id
        mom_ck["content_with_weight"] = mom
        mom_ck["available_int"] = 0
        flds = list(mom_ck.keys())
        for fld in flds:
-            if fld not in ["id", "content_with_weight", "doc_id", "kb_id", "available_int", "position_int"]:
+            if fld not in ["id", "content_with_weight", "doc_id", "docnm_kwd", "kb_id", "available_int", "position_int"]:
                del mom_ck[fld]
        mothers.append(mom_ck)