From da8802d01004f5ea490fc93c0f49feb964cbd9d2 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Wed, 14 Aug 2024 11:09:07 +0800 Subject: [PATCH] refine error log while chunking (#1937) ### What problem does this PR solve? ### Type of change - [x] Refactoring --- api/apps/document_app.py | 4 +++- rag/svr/task_executor.py | 33 +++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 5d7e0953c..7e4580f1a 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -501,7 +501,9 @@ def upload_and_parse(): "callback": dummy, "parser_config": parser_config, "from_page": 0, - "to_page": 100000 + "to_page": 100000, + "tenant_id": kb.tenant_id, + "lang": kb.language } threads.append(exe.submit(FACTORY.get(d["parser_id"], naive).chunk, d["name"], blob, **kwargs)) diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 800633b90..2391a3acd 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -146,27 +146,32 @@ def build(row): binary = get_minio_binary(bucket, name) cron_logger.info( "From minio({}) {}/{}".format(timer() - st, row["location"], row["name"])) + except TimeoutError as e: + callback(-1, f"Internal server error: Fetch file from minio timeout. Could you try it again.") + cron_logger.error( + "Minio {}/{}: Fetch file from minio timeout.".format(row["location"], row["name"])) + return + except Exception as e: + if re.search("(No such file|not found)", str(e)): + callback(-1, "Can not find file <%s> from minio. Could you try it again?" % row["name"]) + else: + callback(-1, f"Get file from minio: %s" % + str(e).replace("'", "")) + traceback.print_exc() + return + + try: cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"], to_page=row["to_page"], lang=row["language"], callback=callback, kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"]) cron_logger.info( - "Chunkking({}) {}/{}".format(timer() - st, row["location"], row["name"])) - except TimeoutError as e: - callback(-1, f"Internal server error: Fetch file timeout. Could you try it again.") - cron_logger.error( - "Chunkking {}/{}: Fetch file timeout.".format(row["location"], row["name"])) - return + "Chunking({}) {}/{}".format(timer() - st, row["location"], row["name"])) except Exception as e: - if re.search("(No such file|not found)", str(e)): - callback(-1, "Can not find file <%s>" % row["name"]) - else: - callback(-1, f"Internal server error: %s" % + callback(-1, f"Internal server error while chunking: %s" % str(e).replace("'", "")) - traceback.print_exc() - cron_logger.error( - "Chunkking {}/{}: {}".format(row["location"], row["name"], str(e))) - + "Chunking {}/{}: {}".format(row["location"], row["name"], str(e))) + traceback.print_exc() return docs = []