Fix: parent-child chunking method (#11810)

### What problem does this PR solve?

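Change: fix the parent-child chunking method.

- Add a `mom_id` varchar field (default `""`) to the chunk field mapping.
- Request `mom_id` in the field list used by `Dealer` search and copy it into each per-chunk result dict.
- When grouping child chunks under their parent, treat `mom_id` as set only if it is a non-blank string.
- In `insert_es`, assign `mom_id` to each child chunk before the duplicate-parent check, so children whose parent was already seen are still linked to it.
- Keep `docnm_kwd` on the generated parent chunks.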

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
buua436
2025-12-09 09:34:01 +08:00
committed by GitHub
parent 5c9672a265
commit dd046be976
3 changed files with 7 additions and 4 deletions

View File

@ -2,6 +2,7 @@
"id": {"type": "varchar", "default": ""}, "id": {"type": "varchar", "default": ""},
"doc_id": {"type": "varchar", "default": ""}, "doc_id": {"type": "varchar", "default": ""},
"kb_id": {"type": "varchar", "default": ""}, "kb_id": {"type": "varchar", "default": ""},
"mom_id": {"type": "varchar", "default": ""},
"create_time": {"type": "varchar", "default": ""}, "create_time": {"type": "varchar", "default": ""},
"create_timestamp_flt": {"type": "float", "default": 0.0}, "create_timestamp_flt": {"type": "float", "default": 0.0},
"img_id": {"type": "varchar", "default": ""}, "img_id": {"type": "varchar", "default": ""},

View File

@ -91,7 +91,7 @@ class Dealer:
["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
"doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
"question_kwd", "question_tks", "doc_type_kwd", "question_kwd", "question_tks", "doc_type_kwd",
"available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD]) "available_int", "content_with_weight", "mom_id", PAGERANK_FLD, TAG_FLD])
kwds = set([]) kwds = set([])
qst = req.get("question", "") qst = req.get("question", "")
@ -469,6 +469,7 @@ class Dealer:
"vector": chunk.get(vector_column, zero_vector), "vector": chunk.get(vector_column, zero_vector),
"positions": position_int, "positions": position_int,
"doc_type_kwd": chunk.get("doc_type_kwd", ""), "doc_type_kwd": chunk.get("doc_type_kwd", ""),
"mom_id": chunk.get("mom_id", ""),
} }
if highlight and sres.highlight: if highlight and sres.highlight:
if id in sres.highlight: if id in sres.highlight:
@ -650,7 +651,8 @@ class Dealer:
i = 0 i = 0
while i < len(chunks): while i < len(chunks):
ck = chunks[i] ck = chunks[i]
if not ck.get("mom_id"): mom_id = ck.get("mom_id")
if not isinstance(mom_id, str) or not mom_id.strip():
i += 1 i += 1
continue continue
mom_chunks[ck["mom_id"]].append(chunks.pop(i)) mom_chunks[ck["mom_id"]].append(chunks.pop(i))

View File

@ -727,17 +727,17 @@ async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_c
if not mom: if not mom:
continue continue
id = xxhash.xxh64(mom.encode("utf-8")).hexdigest() id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
ck["mom_id"] = id
if id in mother_ids: if id in mother_ids:
continue continue
mother_ids.add(id) mother_ids.add(id)
ck["mom_id"] = id
mom_ck = copy.deepcopy(ck) mom_ck = copy.deepcopy(ck)
mom_ck["id"] = id mom_ck["id"] = id
mom_ck["content_with_weight"] = mom mom_ck["content_with_weight"] = mom
mom_ck["available_int"] = 0 mom_ck["available_int"] = 0
flds = list(mom_ck.keys()) flds = list(mom_ck.keys())
for fld in flds: for fld in flds:
if fld not in ["id", "content_with_weight", "doc_id", "kb_id", "available_int", "position_int"]: if fld not in ["id", "content_with_weight", "doc_id", "docnm_kwd", "kb_id", "available_int", "position_int"]:
del mom_ck[fld] del mom_ck[fld]
mothers.append(mom_ck) mothers.append(mom_ck)