mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-19 12:06:42 +08:00
Fix: parent-child chunking method (#11810)
### What problem does this PR solve?

Change: parent-child chunking method

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -2,6 +2,7 @@
|
||||
"id": {"type": "varchar", "default": ""},
|
||||
"doc_id": {"type": "varchar", "default": ""},
|
||||
"kb_id": {"type": "varchar", "default": ""},
|
||||
"mom_id": {"type": "varchar", "default": ""},
|
||||
"create_time": {"type": "varchar", "default": ""},
|
||||
"create_timestamp_flt": {"type": "float", "default": 0.0},
|
||||
"img_id": {"type": "varchar", "default": ""},
|
||||
|
||||
@ -91,7 +91,7 @@ class Dealer:
|
||||
["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
|
||||
"doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
|
||||
"question_kwd", "question_tks", "doc_type_kwd",
|
||||
"available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD])
|
||||
"available_int", "content_with_weight", "mom_id", PAGERANK_FLD, TAG_FLD])
|
||||
kwds = set([])
|
||||
|
||||
qst = req.get("question", "")
|
||||
@ -469,6 +469,7 @@ class Dealer:
|
||||
"vector": chunk.get(vector_column, zero_vector),
|
||||
"positions": position_int,
|
||||
"doc_type_kwd": chunk.get("doc_type_kwd", ""),
|
||||
"mom_id": chunk.get("mom_id", ""),
|
||||
}
|
||||
if highlight and sres.highlight:
|
||||
if id in sres.highlight:
|
||||
@ -650,7 +651,8 @@ class Dealer:
|
||||
i = 0
|
||||
while i < len(chunks):
|
||||
ck = chunks[i]
|
||||
if not ck.get("mom_id"):
|
||||
mom_id = ck.get("mom_id")
|
||||
if not isinstance(mom_id, str) or not mom_id.strip():
|
||||
i += 1
|
||||
continue
|
||||
mom_chunks[ck["mom_id"]].append(chunks.pop(i))
|
||||
|
||||
@ -727,17 +727,17 @@ async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_c
|
||||
if not mom:
|
||||
continue
|
||||
id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
|
||||
ck["mom_id"] = id
|
||||
if id in mother_ids:
|
||||
continue
|
||||
mother_ids.add(id)
|
||||
ck["mom_id"] = id
|
||||
mom_ck = copy.deepcopy(ck)
|
||||
mom_ck["id"] = id
|
||||
mom_ck["content_with_weight"] = mom
|
||||
mom_ck["available_int"] = 0
|
||||
flds = list(mom_ck.keys())
|
||||
for fld in flds:
|
||||
if fld not in ["id", "content_with_weight", "doc_id", "kb_id", "available_int", "position_int"]:
|
||||
if fld not in ["id", "content_with_weight", "doc_id", "docnm_kwd", "kb_id", "available_int", "position_int"]:
|
||||
del mom_ck[fld]
|
||||
mothers.append(mom_ck)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user