mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: add child parent chunking method in backend. (#11598)
### What problem does this PR solve? #7996 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -128,9 +128,6 @@ def signal_handler(sig, frame):
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
|
||||
try:
|
||||
if prog is not None and prog < 0:
|
||||
@ -720,6 +717,34 @@ async def delete_image(kb_id, chunk_id):
|
||||
|
||||
|
||||
async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_callback):
|
||||
mothers = []
|
||||
mother_ids = set([])
|
||||
for ck in chunks:
|
||||
mom = ck.get("mom") or ck.get("mom_with_weight") or ""
|
||||
if not mom:
|
||||
continue
|
||||
id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
|
||||
if id in mother_ids:
|
||||
continue
|
||||
mother_ids.add(id)
|
||||
ck["mom_id"] = id
|
||||
mom_ck = copy.deepcopy(ck)
|
||||
mom_ck["id"] = id
|
||||
mom_ck["content_with_weight"] = mom
|
||||
mom_ck["available_int"] = 0
|
||||
flds = list(mom_ck.keys())
|
||||
for fld in flds:
|
||||
if fld not in ["id", "content_with_weight", "doc_id", "kb_id", "available_int"]:
|
||||
del mom_ck[fld]
|
||||
mothers.append(mom_ck)
|
||||
|
||||
for b in range(0, len(mothers), settings.DOC_BULK_SIZE):
|
||||
await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(mothers[b:b + settings.DOC_BULK_SIZE], search.index_name(task_tenant_id), task_dataset_id))
|
||||
task_canceled = has_canceled(task_id)
|
||||
if task_canceled:
|
||||
progress_callback(-1, msg="Task has been canceled.")
|
||||
return False
|
||||
|
||||
for b in range(0, len(chunks), settings.DOC_BULK_SIZE):
|
||||
doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(chunks[b:b + settings.DOC_BULK_SIZE], search.index_name(task_tenant_id), task_dataset_id))
|
||||
task_canceled = has_canceled(task_id)
|
||||
|
||||
Reference in New Issue
Block a user