Feat: add child parent chunking method in backend. (#11598)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2025-11-28 19:25:32 +08:00
committed by GitHub
parent d2915f6984
commit 14616cf845
10 changed files with 216 additions and 130 deletions

View File

@ -128,9 +128,6 @@ def signal_handler(sig, frame):
sys.exit(0)
def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
try:
if prog is not None and prog < 0:
@ -720,6 +717,34 @@ async def delete_image(kb_id, chunk_id):
async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_callback):
mothers = []
mother_ids = set([])
for ck in chunks:
mom = ck.get("mom") or ck.get("mom_with_weight") or ""
if not mom:
continue
id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
if id in mother_ids:
continue
mother_ids.add(id)
ck["mom_id"] = id
mom_ck = copy.deepcopy(ck)
mom_ck["id"] = id
mom_ck["content_with_weight"] = mom
mom_ck["available_int"] = 0
flds = list(mom_ck.keys())
for fld in flds:
if fld not in ["id", "content_with_weight", "doc_id", "kb_id", "available_int"]:
del mom_ck[fld]
mothers.append(mom_ck)
for b in range(0, len(mothers), settings.DOC_BULK_SIZE):
await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(mothers[b:b + settings.DOC_BULK_SIZE], search.index_name(task_tenant_id), task_dataset_id))
task_canceled = has_canceled(task_id)
if task_canceled:
progress_callback(-1, msg="Task has been canceled.")
return False
for b in range(0, len(chunks), settings.DOC_BULK_SIZE):
doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(chunks[b:b + settings.DOC_BULK_SIZE], search.index_name(task_tenant_id), task_dataset_id))
task_canceled = has_canceled(task_id)