Mirror of https://github.com/infiniflow/ragflow.git

Use consistent log file names, introduced initLogger (#3403)
### What problem does this PR solve?

Use consistent log file names, introduced initLogger.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
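The diff replaces the custom `logger` object from `api.utils.log_utils` with the standard-library root logger, configured once per process by the new `initRootLogger`. The helper's body is not part of this diff; what follows is only a minimal sketch of what such an initializer might look like, assuming it names the log file after the entry-point script and attaches a rotating file handler (the `logs/` directory, rotation sizes, and format string are illustrative guesses, not the repository's actual values):

```python
import logging
import os
from logging.handlers import RotatingFileHandler


def initRootLogger(script_path: str, log_level: int = logging.INFO):
    """Hypothetical sketch: configure the root logger to write to a file
    named after the entry-point script (e.g. task_executor.log)."""
    log_name = os.path.splitext(os.path.basename(script_path))[0]
    log_dir = os.path.abspath("logs")  # assumed location
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, f"{log_name}.log")

    handler = RotatingFileHandler(log_file, maxBytes=10 * 1024 * 1024, backupCount=5)
    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s %(message)s"))

    root = logging.getLogger()
    root.setLevel(log_level)
    root.addHandler(handler)
```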
@@ -13,9 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
+import inspect
+from api.utils.log_utils import initRootLogger
+initRootLogger(inspect.getfile(inspect.currentframe()))
+for module in ["pdfminer"]:
+    module_logger = logging.getLogger(module)
+    module_logger.setLevel(logging.WARNING)
+for module in ["peewee"]:
+    module_logger = logging.getLogger(module)
+    module_logger.handlers.clear()
+    module_logger.propagate = True
+
 import datetime
 import json
-import logging
 import os
 import hashlib
 import copy
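Two details in this first hunk are worth spelling out. `inspect.getfile(inspect.currentframe())` evaluates to the path of the file being executed, so each entry point hands its own filename to `initRootLogger`, which is what yields consistent per-service log file names. And the `peewee` loop relies on standard `logging` propagation: a logger with no handlers of its own and `propagate = True` passes its records up to the root logger's handlers. A minimal standalone demonstration of that second behavior, using `logging.basicConfig` as a stand-in for `initRootLogger`:

```python
import logging

# Stand-in for initRootLogger: give the root logger a handler and a level.
logging.basicConfig(level=logging.INFO, format="%(name)s %(levelname)s %(message)s")

peewee_logger = logging.getLogger("peewee")
peewee_logger.handlers.clear()   # drop any handlers the library attached itself
peewee_logger.propagate = True   # let records bubble up to the root handlers

# Emitted through the root handler configured above:
peewee_logger.info("peewee records now share the root logger's output")
```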
@@ -42,7 +53,6 @@ from api.db.db_models import close_connection
 from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email
 from rag.nlp import search, rag_tokenizer
 from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
-from api.utils.log_utils import logger, LOG_FILE
 from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME
 from rag.utils import rmSpace, num_tokens_from_string
 from rag.utils.redis_conn import REDIS_CONN, Payload
@@ -90,7 +100,7 @@ def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing...
     try:
         TaskService.update_progress(task_id, d)
     except Exception:
-        logger.exception(f"set_progress({task_id}) got exception")
+        logging.exception(f"set_progress({task_id}) got exception")

     close_connection()
     if cancel:
@@ -110,7 +120,7 @@ def collect():
             time.sleep(1)
             return pd.DataFrame()
     except Exception:
-        logger.exception("Get task event from queue exception")
+        logging.exception("Get task event from queue exception")
         return pd.DataFrame()

     msg = PAYLOAD.get_message()
@@ -118,11 +128,11 @@ def collect():
         return pd.DataFrame()

     if TaskService.do_cancel(msg["id"]):
-        logger.info("Task {} has been canceled.".format(msg["id"]))
+        logging.info("Task {} has been canceled.".format(msg["id"]))
         return pd.DataFrame()
     tasks = TaskService.get_tasks(msg["id"])
     if not tasks:
-        logger.warning("{} empty task!".format(msg["id"]))
+        logging.warning("{} empty task!".format(msg["id"]))
         return []

     tasks = pd.DataFrame(tasks)
@@ -151,29 +161,29 @@ def build(row):
         st = timer()
         bucket, name = File2DocumentService.get_storage_address(doc_id=row["doc_id"])
         binary = get_storage_binary(bucket, name)
-        logger.info(
+        logging.info(
             "From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
     except TimeoutError:
         callback(-1, "Internal server error: Fetch file from minio timeout. Could you try it again.")
-        logger.exception("Minio {}/{} got timeout: Fetch file from minio timeout.".format(row["location"], row["name"]))
+        logging.exception("Minio {}/{} got timeout: Fetch file from minio timeout.".format(row["location"], row["name"]))
         return
     except Exception as e:
         if re.search("(No such file|not found)", str(e)):
             callback(-1, "Can not find file <%s> from minio. Could you try it again?" % row["name"])
         else:
             callback(-1, "Get file from minio: %s" % str(e).replace("'", ""))
-        logger.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
+        logging.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
         return

     try:
         cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
                             to_page=row["to_page"], lang=row["language"], callback=callback,
                             kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
-        logger.info("Chunking({}) {}/{} done".format(timer() - st, row["location"], row["name"]))
+        logging.info("Chunking({}) {}/{} done".format(timer() - st, row["location"], row["name"]))
     except Exception as e:
         callback(-1, "Internal server error while chunking: %s" %
                  str(e).replace("'", ""))
-        logger.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
+        logging.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
         return

     docs = []
@@ -210,12 +220,12 @@ def build(row):
                 STORAGE_IMPL.put(row["kb_id"], d["id"], output_buffer.getvalue())
                 el += timer() - st
             except Exception:
-                logger.exception("Saving image of chunk {}/{}/{} got exception".format(row["location"], row["name"], d["_id"]))
+                logging.exception("Saving image of chunk {}/{}/{} got exception".format(row["location"], row["name"], d["_id"]))

             d["img_id"] = "{}-{}".format(row["kb_id"], d["id"])
             del d["image"]
             docs.append(d)
-    logger.info("MINIO PUT({}):{}".format(row["name"], el))
+    logging.info("MINIO PUT({}):{}".format(row["name"], el))

     if row["parser_config"].get("auto_keywords", 0):
         st = timer()
@@ -345,7 +355,7 @@ def main():
             embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
         except Exception as e:
             callback(-1, msg=str(e))
-            logger.exception("LLMBundle got exception")
+            logging.exception("LLMBundle got exception")
             continue

         if r.get("task_type", "") == "raptor":
@@ -354,12 +364,12 @@ def main():
                 cks, tk_count, vector_size = run_raptor(r, chat_mdl, embd_mdl, callback)
             except Exception as e:
                 callback(-1, msg=str(e))
-                logger.exception("run_raptor got exception")
+                logging.exception("run_raptor got exception")
                 continue
         else:
             st = timer()
             cks = build(r)
-            logger.info("Build chunks({}): {}".format(r["name"], timer() - st))
+            logging.info("Build chunks({}): {}".format(r["name"], timer() - st))
             if cks is None:
                 continue
             if not cks:
@@ -375,12 +385,12 @@ def main():
             tk_count, vector_size = embedding(cks, embd_mdl, r["parser_config"], callback)
         except Exception as e:
             callback(-1, "Embedding error:{}".format(str(e)))
-            logger.exception("run_rembedding got exception")
+            logging.exception("run_rembedding got exception")
             tk_count = 0
-        logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
+        logging.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
         callback(msg="Finished embedding (in {:.2f}s)! Start to build index!".format(timer() - st))

-        # logger.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
+        # logging.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
         init_kb(r, vector_size)
         chunk_count = len(set([c["id"] for c in cks]))
         st = timer()
@@ -391,11 +401,11 @@ def main():
             if b % 128 == 0:
                 callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")

-        logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
+        logging.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
         if es_r:
             callback(-1, f"Insert chunk error, detail info please check {LOG_FILE}. Please also check ES status!")
             docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
-            logger.error('Insert chunk error: ' + str(es_r))
+            logging.error('Insert chunk error: ' + str(es_r))
         else:
             if TaskService.do_cancel(r["id"]):
                 docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
@@ -404,7 +414,7 @@ def main():
                 callback(1., "Done!")
                 DocumentService.increment_chunk_num(
                     r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
-                logger.info(
+                logging.info(
                     "Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
                         r["id"], tk_count, len(cks), timer() - st))

@@ -421,16 +431,11 @@ def report_status():
             obj[CONSUMER_NAME] = obj[CONSUMER_NAME][-60:]
             REDIS_CONN.set_obj("TASKEXE", obj, 60*2)
         except Exception:
-            logger.exception("report_status got exception")
+            logging.exception("report_status got exception")
         time.sleep(30)


 if __name__ == "__main__":
-    peewee_logger = logging.getLogger('peewee')
-    peewee_logger.propagate = False
-    peewee_logger.addHandler(logger.handlers[0])
-    peewee_logger.setLevel(logger.handlers[0].level)
-
     exe = ThreadPoolExecutor(max_workers=1)
     exe.submit(report_status)
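For comparison, the wiring removed from `__main__` pointed peewee's records at the first handler of the old custom `logger` and disabled propagation, so they bypassed the root logger entirely; the import-time loop in the first hunk achieves the equivalent effect for every handler on the root logger, not just the first one. A minimal sketch of the superseded pattern, assuming a pre-configured `logger` object like the one the old `api.utils.log_utils` provided (the names here are stand-ins):

```python
import logging

# Old pattern (removed by this commit): hand peewee one specific handler
# and stop propagation, so records never reach the root logger.
logger = logging.getLogger("ragflow")       # stand-in for the old custom logger
logger.addHandler(logging.StreamHandler())  # assume at least one handler exists

peewee_logger = logging.getLogger("peewee")
peewee_logger.propagate = False
peewee_logger.addHandler(logger.handlers[0])
peewee_logger.setLevel(logger.handlers[0].level)
```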