Fix: parent-child chunking method (#11810)
### What problem does this PR solve?

Change: parent-child chunking method.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
@@ -2,6 +2,7 @@
   "id": {"type": "varchar", "default": ""},
   "doc_id": {"type": "varchar", "default": ""},
   "kb_id": {"type": "varchar", "default": ""},
+  "mom_id": {"type": "varchar", "default": ""},
   "create_time": {"type": "varchar", "default": ""},
   "create_timestamp_flt": {"type": "float", "default": 0.0},
   "img_id": {"type": "varchar", "default": ""},
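The mapping hunk above adds a `mom_id` column to the chunk schema so that a child chunk can reference the parent ("mother") chunk it was split from, while the parent itself is stored as a regular record with `available_int` set to 0 (see the `insert_es` hunk further down), presumably so it is not surfaced as an ordinary search hit. A minimal sketch of the two kinds of records, with illustrative values that are not taken from the PR:

```python
# Hypothetical illustration of the parent/child records this schema supports.
# The parent ("mother") chunk holds the full passage; children hold the small
# retrievable pieces and point back to the parent through "mom_id".
parent_chunk = {
    "id": "9f2c...",                 # xxhash of the parent text (see insert_es below)
    "doc_id": "doc-123",
    "kb_id": "kb-1",
    "content_with_weight": "Full parent passage ...",
    "available_int": 0,              # stored, but not meant as a normal hit
}

child_chunk = {
    "id": "a41b...",
    "doc_id": "doc-123",
    "kb_id": "kb-1",
    "mom_id": parent_chunk["id"],    # new column added by this PR
    "content_with_weight": "A small child span ...",
    "available_int": 1,              # illustrative: children stay retrievable
}
```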
@@ -91,7 +91,7 @@ class Dealer:
         ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
          "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
          "question_kwd", "question_tks", "doc_type_kwd",
-         "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD])
+         "available_int", "content_with_weight", "mom_id", PAGERANK_FLD, TAG_FLD])
         kwds = set([])

         qst = req.get("question", "")
@@ -469,6 +469,7 @@ class Dealer:
                 "vector": chunk.get(vector_column, zero_vector),
                 "positions": position_int,
                 "doc_type_kwd": chunk.get("doc_type_kwd", ""),
+                "mom_id": chunk.get("mom_id", ""),
             }
             if highlight and sres.highlight:
                 if id in sres.highlight:
@@ -650,7 +651,8 @@ class Dealer:
         i = 0
         while i < len(chunks):
             ck = chunks[i]
-            if not ck.get("mom_id"):
+            mom_id = ck.get("mom_id")
+            if not isinstance(mom_id, str) or not mom_id.strip():
                 i += 1
                 continue
             mom_chunks[ck["mom_id"]].append(chunks.pop(i))
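The grouping hunk above tightens the `mom_id` check in `Dealer`: instead of skipping only falsy values, it now requires a non-empty string, so whitespace-only or non-string values coming back from the store are treated as "no parent". A self-contained sketch of that grouping step, assuming a hypothetical `group_by_mom` helper that is not part of the PR:

```python
from collections import defaultdict

def group_by_mom(chunks):
    """Split retrieved chunks into ordinary chunks and children grouped by parent id.

    Sketch of the loop in Dealer: chunks whose mom_id is not a non-empty
    string stay in place; the rest are popped out and bucketed by mom_id.
    """
    mom_chunks = defaultdict(list)
    i = 0
    while i < len(chunks):
        mom_id = chunks[i].get("mom_id")
        if not isinstance(mom_id, str) or not mom_id.strip():
            i += 1                                    # keep ordinary chunks where they are
            continue
        mom_chunks[mom_id].append(chunks.pop(i))      # do not advance i after pop
    return chunks, mom_chunks

# Example: only the chunk with a real mom_id is grouped.
remaining, grouped = group_by_mom([
    {"id": "c1", "mom_id": "p1"},
    {"id": "c2", "mom_id": "  "},    # whitespace-only: now treated as "no parent"
    {"id": "c3"},                    # missing mom_id
])
assert [c["id"] for c in remaining] == ["c2", "c3"]
assert [c["id"] for c in grouped["p1"]] == ["c1"]
```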
@@ -727,17 +727,17 @@ async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_c
         if not mom:
             continue
         id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
+        ck["mom_id"] = id
         if id in mother_ids:
             continue
         mother_ids.add(id)
-        ck["mom_id"] = id
         mom_ck = copy.deepcopy(ck)
         mom_ck["id"] = id
         mom_ck["content_with_weight"] = mom
         mom_ck["available_int"] = 0
         flds = list(mom_ck.keys())
         for fld in flds:
-            if fld not in ["id", "content_with_weight", "doc_id", "kb_id", "available_int", "position_int"]:
+            if fld not in ["id", "content_with_weight", "doc_id", "docnm_kwd", "kb_id", "available_int", "position_int"]:
                 del mom_ck[fld]
         mothers.append(mom_ck)
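The `insert_es` hunk is the substantive fix: the parent id is derived deterministically from the parent text with xxhash, and `ck["mom_id"] = id` now runs before the `mother_ids` dedup check, so every child of an already-seen parent keeps its link (previously only the first child per parent got one); the mother chunk also retains `docnm_kwd`. A sketch of that flow, under the assumption that the parent text is attached to each child as `ck["mom"]` (the helper name `build_mothers` is hypothetical):

```python
import copy
import xxhash

# Fields the mother chunk keeps, mirroring the allow-list in the diff.
MOTHER_FIELDS = ["id", "content_with_weight", "doc_id", "docnm_kwd",
                 "kb_id", "available_int", "position_int"]

def build_mothers(chunks):
    """Attach mom_id to every child and emit one deduplicated mother per parent text.

    Mutates the child chunks in place and returns the list of mother chunks.
    """
    mothers, mother_ids = [], set()
    for ck in chunks:
        mom = ck.pop("mom", "")          # stand-in for however the parent text arrives
        if not mom:
            continue
        mid = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
        ck["mom_id"] = mid               # every child gets the link (the fix)
        if mid in mother_ids:
            continue                     # this parent was already emitted
        mother_ids.add(mid)

        mom_ck = copy.deepcopy(ck)
        mom_ck["id"] = mid
        mom_ck["content_with_weight"] = mom
        mom_ck["available_int"] = 0      # mother is stored but not a normal hit
        for fld in list(mom_ck.keys()):
            if fld not in MOTHER_FIELDS:
                del mom_ck[fld]
        mothers.append(mom_ck)
    return mothers
```

Indexing would then write both the original children (now all carrying `mom_id`) and the returned mothers, matching the `mom_id` column added to the mapping in the first hunk.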