Try to reuse existing chunks (#3983)

### What problem does this PR solve?

Try to reuse the chunks an earlier run of the same task has already produced, instead of rebuilding them from scratch. Closes #3793.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Zhichang Yu authored on 2024-12-12 16:38:03 +08:00 (committed by GitHub)
Parent: 835fd7abcd · Commit: 301f95837c
7 changed files with 242 additions and 85 deletions
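Of the seven changed files, only the task-executor diff survives below. It does two things: cancellation becomes a catchable `TaskCanceledException` instead of an `os._exit(0)`, and each task's chunk IDs are persisted through `TaskService.update_chunk_ids` so a later run can find them again. As orientation, here is a minimal sketch of the reuse idea this enables; the digest-based matching and the names `task_digest`, `maybe_reuse_chunks`, and `prev_task` are illustrative assumptions, not code from this PR (the actual lookup lives in the files not shown here):

```python
# Hypothetical sketch of chunk reuse, not the PR's code.
import hashlib
import json


def task_digest(doc_id, from_page, to_page, parser_config):
    """Fingerprint the inputs that determine what chunks a task produces."""
    payload = json.dumps(
        {"doc_id": doc_id, "from": from_page, "to": to_page, "cfg": parser_config},
        sort_keys=True,
    )
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()


def maybe_reuse_chunks(new_task, prev_task):
    """Adopt the previous task's chunk ids when the inputs are unchanged."""
    if prev_task and prev_task.get("digest") == new_task.get("digest"):
        ids = (prev_task.get("chunk_ids") or "").split()
        if ids:
            return ids  # skip re-parsing and re-embedding entirely
    return []
```

The hunks below supply the persistence half of that story; the lookup half presumably sits in the service-layer files omitted from this view.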


```diff
@@ -39,8 +39,9 @@ from timeit import default_timer as timer
 import tracemalloc
 import numpy as np
 from peewee import DoesNotExist
-from api.db import LLMType, ParserType
+from api.db import LLMType, ParserType, TaskStatus
 from api.db.services.dialog_service import keyword_extraction, question_proposal
 from api.db.services.document_service import DocumentService
 from api.db.services.llm_service import LLMBundle
```
```diff
@@ -89,12 +90,23 @@ DONE_TASKS = 0
 FAILED_TASKS = 0
 CURRENT_TASK = None
 
+class TaskCanceledException(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+
 
 def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
     global PAYLOAD
     if prog is not None and prog < 0:
         msg = "[ERROR]" + msg
-    cancel = TaskService.do_cancel(task_id)
+    try:
+        cancel = TaskService.do_cancel(task_id)
+    except DoesNotExist:
+        logging.warning(f"set_progress task {task_id} is unknown")
+        if PAYLOAD:
+            PAYLOAD.ack()
+            PAYLOAD = None
+        return
     if cancel:
         msg += " [Canceled]"
         prog = -1
```
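A pattern that recurs through these hunks: when the task row has vanished from the database (peewee raises `DoesNotExist`), the executor acks the queued payload and drops the work instead of crashing or letting the broker redeliver it forever. A self-contained sketch of that ack-and-drop shape, with hypothetical `TaskGone` and `FakePayload` stand-ins for peewee's exception and the real queue payload:

```python
import logging


class TaskGone(Exception):
    """Stand-in for peewee's DoesNotExist, so the demo is self-contained."""


class FakePayload:
    def ack(self):
        print("acked: message will not be redelivered")


def do_cancel(task_id):
    raise TaskGone(task_id)  # simulate a task row deleted mid-flight


def check_cancel(task_id, payload):
    """Return the cancel flag, or None after acking an unknown task."""
    try:
        return do_cancel(task_id)
    except TaskGone:
        logging.warning("task %s is unknown, dropping", task_id)
        if payload:
            payload.ack()  # acknowledge so the broker forgets the message
        return None


print(check_cancel("t-123", FakePayload()))  # prints the ack, then None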
```diff
@@ -105,18 +117,22 @@ def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing...
     d = {"progress_msg": msg}
     if prog is not None:
         d["progress"] = prog
-    try:
-        logging.info(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}")
-        TaskService.update_progress(task_id, d)
-    except Exception:
-        logging.exception(f"set_progress({task_id}) got exception")
-    close_connection()
-    if cancel:
+    logging.info(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}")
+    try:
+        TaskService.update_progress(task_id, d)
+    except DoesNotExist:
+        logging.warning(f"set_progress task {task_id} is unknown")
         if PAYLOAD:
             PAYLOAD.ack()
             PAYLOAD = None
-        os._exit(0)
+        return
+
+    close_connection()
+    if cancel and PAYLOAD:
+        PAYLOAD.ack()
+        PAYLOAD = None
+        raise TaskCanceledException(msg)
 
 
 def collect():
```
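The notable behavioral change here: a canceled task used to kill the whole executor with `os._exit(0)`; now `set_progress` raises `TaskCanceledException`, which unwinds only the current task. A toy illustration of why that matters for a long-lived worker (all names here are illustrative):

```python
class TaskCanceledException(Exception):
    def __init__(self, msg):
        super().__init__(msg)
        self.msg = msg


def report_progress(canceled):
    """Stands in for set_progress: raises instead of calling os._exit(0)."""
    if canceled:
        raise TaskCanceledException("Task has been canceled.")


def run_one_task(canceled):
    report_progress(canceled)
    print("task body runs")


for flag in (False, True, False):
    try:
        run_one_task(flag)
    except TaskCanceledException as e:
        print(f"skipped: {e.msg}")  # the worker survives and takes the next task
```

With `os._exit(0)`, the third task would never run; with the exception, the loop keeps serving.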
```diff
@@ -136,16 +152,22 @@ def collect():
     if not msg:
         return None
 
-    if TaskService.do_cancel(msg["id"]):
+    task = None
+    canceled = False
+    try:
+        task = TaskService.get_task(msg["id"])
+        if task:
+            _, doc = DocumentService.get_by_id(task["doc_id"])
+            canceled = doc.run == TaskStatus.CANCEL.value or doc.progress < 0
+    except DoesNotExist:
+        pass
+    except Exception:
+        logging.exception("collect get_task exception")
+    if not task or canceled:
+        state = "is unknown" if not task else "has been cancelled"
         with mt_lock:
             DONE_TASKS += 1
-        logging.info("Task {} has been canceled.".format(msg["id"]))
-        return None
-    task = TaskService.get_task(msg["id"])
-    if not task:
-        with mt_lock:
-            DONE_TASKS += 1
-        logging.warning("{} empty task!".format(msg["id"]))
+        logging.info(f"collect task {msg['id']} {state}")
         return None
 
     if msg.get("type", "") == "raptor":
```
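`collect()` no longer trusts the per-task cancel flag alone: it loads the owning document and treats either an explicit cancel state or negative progress (an upstream failure marker) as cancellation. The predicate in isolation, with a stub enum whose member value is assumed for the demo rather than copied from `api.db`:

```python
from enum import Enum


class TaskStatus(Enum):
    CANCEL = "2"  # assumed value for this stub, mirroring api.db.TaskStatus


def is_canceled(doc_run, doc_progress):
    """Canceled if the user hit cancel, or an earlier step already failed."""
    return doc_run == TaskStatus.CANCEL.value or doc_progress < 0


print(is_canceled("2", 0.5))    # True: explicit cancel
print(is_canceled("1", -1.0))   # True: a failure marked progress negative
print(is_canceled("1", 0.5))    # False: still running normally
```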
```diff
@@ -186,6 +208,8 @@ def build_chunks(task, progress_callback):
                              to_page=task["to_page"], lang=task["language"], callback=progress_callback,
                              kb_id=task["kb_id"], parser_config=task["parser_config"], tenant_id=task["tenant_id"])
         logging.info("Chunking({}) {}/{} done".format(timer() - st, task["location"], task["name"]))
+    except TaskCanceledException:
+        raise
     except Exception as e:
         progress_callback(-1, "Internal server error while chunking: %s" % str(e).replace("'", ""))
         logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"]))
```
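`build_chunks` already converts any `Exception` into a progress error, which would silently swallow cancellation; catching `TaskCanceledException` first and re-raising lets it escape past the broad handler. The ordering rule in miniature:

```python
class TaskCanceledException(Exception):
    pass


def chunk_step():
    raise TaskCanceledException("canceled")


try:
    try:
        chunk_step()
    except TaskCanceledException:
        raise  # let cancellation propagate past the generic handler below
    except Exception as e:
        print("reported as chunking error:", e)  # not reached for cancellation
except TaskCanceledException:
    print("cancellation reached the caller")
```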
```diff
@@ -358,6 +382,8 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None):
     return res, tk_count, vector_size
 
 
 def do_handle_task(task):
     task_id = task["id"]
     task_from_page = task["from_page"]
```
```diff
@@ -373,6 +399,16 @@ def do_handle_task(task):
     # prepare the progress callback function
     progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)
+
+    try:
+        task_canceled = TaskService.do_cancel(task_id)
+    except DoesNotExist:
+        logging.warning(f"task {task_id} is unknown")
+        return
+    if task_canceled:
+        progress_callback(-1, msg="Task has been canceled.")
+        return
+
     try:
         # bind embedding model
         embedding_model = LLMBundle(task_tenant_id, LLMType.EMBEDDING, llm_name=task_embedding_id, lang=task_language)
```
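Two things are worth noting in this hunk: the executor now bails out before binding any LLM if the task is already canceled or gone, and progress reporting runs through a `functools.partial` that pre-binds the task identity. A minimal sketch of that callback binding (the names mirror the diff; the print stands in for the database update):

```python
from functools import partial


def set_progress(task_id, from_page, to_page, prog=None, msg="Processing..."):
    print(f"[{task_id} p{from_page}-{to_page}] prog={prog} msg={msg}")


progress_callback = partial(set_progress, "task-42", 0, 11)
progress_callback(prog=0.5)                           # mid-task update
progress_callback(-1, msg="Task has been canceled.")  # error/cancel report
```

Downstream code never needs to know which task it is reporting on; it just calls the one-argument callback.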
```diff
@@ -390,6 +426,8 @@ def do_handle_task(task):
             # run RAPTOR
             chunks, token_count, vector_size = run_raptor(task, chat_model, embedding_model, progress_callback)
+    except TaskCanceledException:
+        raise
     except Exception as e:
         error_message = f'Fail to bind LLM used by RAPTOR: {str(e)}'
         progress_callback(-1, msg=error_message)
```
```diff
@@ -420,6 +458,7 @@ def do_handle_task(task):
     progress_message = "Embedding chunks ({:.2f}s)".format(timer() - start_ts)
     logging.info(progress_message)
     progress_callback(msg=progress_message)
+    # logging.info(f"task_executor init_kb index {search.index_name(task_tenant_id)} embedding_model {embedding_model.llm_name} vector length {vector_size}")
     init_kb(task, vector_size)
     chunk_count = len(set([chunk["id"] for chunk in chunks]))
```
```diff
@@ -430,23 +469,25 @@ def do_handle_task(task):
         doc_store_result = settings.docStoreConn.insert(chunks[b:b + es_bulk_size], search.index_name(task_tenant_id), task_dataset_id)
         if b % 128 == 0:
             progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
-
-    logging.info("Indexing {} elapsed: {:.2f}".format(task_document_name, timer() - start_ts))
-    if doc_store_result:
-        error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
-        progress_callback(-1, msg=error_message)
-        settings.docStoreConn.delete({"doc_id": task_doc_id}, search.index_name(task_tenant_id), task_dataset_id)
-        logging.error(error_message)
-        raise Exception(error_message)
-    if TaskService.do_cancel(task_id):
-        settings.docStoreConn.delete({"doc_id": task_doc_id}, search.index_name(task_tenant_id), task_dataset_id)
-        return
+        if doc_store_result:
+            error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
+            progress_callback(-1, msg=error_message)
+            raise Exception(error_message)
+        chunk_ids = [chunk["id"] for chunk in chunks[:b + es_bulk_size]]
+        chunk_ids_str = " ".join(chunk_ids)
+        try:
+            TaskService.update_chunk_ids(task["id"], chunk_ids_str)
+        except DoesNotExist:
+            logging.warning(f"do_handle_task update_chunk_ids failed since task {task['id']} is unknown.")
+            doc_store_result = settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(task_tenant_id), task_dataset_id)
+            return
+    logging.info("Indexing doc({}), page({}-{}), chunks({}), elapsed: {:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), timer() - start_ts))
 
     DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, token_count, chunk_count, 0)
 
     time_cost = timer() - start_ts
     progress_callback(prog=1.0, msg="Done ({:.2f}s)".format(time_cost))
-    logging.info("Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(task_id, token_count, len(chunks), time_cost))
+    logging.info("Chunk doc({}), page({}-{}), chunks({}), token({}), elapsed:{:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), token_count, time_cost))
 
 
 def handle_task():
```
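This hunk carries the PR's core bookkeeping: after every bulk insert the cumulative chunk-ID list is written back onto the task row, so a later run can locate those chunks for reuse, and if the task row has meanwhile disappeared the just-inserted chunks are deleted again as a compensating action. A stripped-down sketch of that loop, with in-memory stand-ins for the task table and the doc store:

```python
BULK = 4
task_table = {"t1": {"chunk_ids": ""}}  # stand-in for the task table
doc_store = {}                          # stand-in for Elasticsearch/Infinity


def index_chunks(task_id, chunks):
    for b in range(0, len(chunks), BULK):
        for chunk in chunks[b:b + BULK]:      # "bulk insert" one batch
            doc_store[chunk["id"]] = chunk
        if task_id not in task_table:         # task deleted mid-flight
            for c in chunks[:b + BULK]:
                doc_store.pop(c["id"], None)  # compensating delete
            return False
        # persist the *cumulative* id list so a rerun can adopt what exists
        task_table[task_id]["chunk_ids"] = " ".join(c["id"] for c in chunks[:b + BULK])
    return True


print(index_chunks("t1", [{"id": f"c{i}"} for i in range(10)]))  # True
print(task_table["t1"]["chunk_ids"])  # "c0 c1 ... c9"
```

Keeping the persisted list cumulative means a crash between batches leaves behind an accurate record of exactly which chunks already exist.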
```diff
@@ -462,6 +503,12 @@ def handle_task():
             DONE_TASKS += 1
             CURRENT_TASK = None
         logging.info(f"handle_task done for task {json.dumps(task)}")
+    except TaskCanceledException:
+        with mt_lock:
+            DONE_TASKS += 1
+            CURRENT_TASK = None
+        logging.info(f"handle_task got TaskCanceledException for task {json.dumps(task)}")
+        logging.debug("handle_task got TaskCanceledException", exc_info=True)
     except Exception:
         with mt_lock:
             FAILED_TASKS += 1
```
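Finally, `handle_task` counts a canceled task as done rather than failed, so user-initiated cancellations do not pollute the executor's failure metrics. The three-way accounting in a compact, runnable form (a sketch of the pattern, not the PR's code):

```python
import logging
import threading

DONE_TASKS = 0
FAILED_TASKS = 0
mt_lock = threading.Lock()


class TaskCanceledException(Exception):
    pass


def handle_one(task, body):
    global DONE_TASKS, FAILED_TASKS
    try:
        body(task)
        with mt_lock:
            DONE_TASKS += 1
    except TaskCanceledException:
        with mt_lock:
            DONE_TASKS += 1  # cancellation is a normal outcome, not a failure
        logging.info("task %s canceled", task)
    except Exception:
        with mt_lock:
            FAILED_TASKS += 1
        logging.exception("task %s failed", task)
```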