Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names and introduce an initLogger helper.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
Commit 30f6421760 (parent ab4384e011)
Zhichang Yu, committed via GitHub, 2024-11-14 17:13:48 +08:00
75 changed files with 396 additions and 402 deletions
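Across these diffs, the custom `api.utils.log_utils.logger` object is dropped in favor of the standard library's `logging` module, and a new `initLogger` helper configures the root logger once per service so that log file names stay consistent. The helper's actual implementation is not shown in this diff, so the sketch below is only a minimal illustration: the signature, format string, rotation settings, and file-naming scheme are all assumptions, not the PR's code.

```python
# Hypothetical sketch of an initLogger helper (assumed to live in
# api/utils/log_utils.py); signature and defaults are illustrative only.
import logging
import logging.handlers
import os


def initLogger(log_path: str, level: int = logging.INFO) -> None:
    """Configure the root logger once at service startup so that every
    module can simply `import logging` and call `logging.debug(...)`."""
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    formatter = logging.Formatter(
        "%(asctime)s %(levelname)s %(filename)s:%(lineno)d %(message)s")

    # One rotating file per service keeps the on-disk name consistent.
    file_handler = logging.handlers.RotatingFileHandler(
        log_path, maxBytes=10 * 1024 * 1024, backupCount=5)
    file_handler.setFormatter(formatter)

    # Mirror everything to the console for interactive runs.
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)

    root = logging.getLogger()
    root.setLevel(level)
    root.addHandler(file_handler)
    root.addHandler(console_handler)
```

A service entry point would then call something like `initLogger(os.path.join(LOG_DIR, "ragflow_server.log"))` once at startup (`LOG_DIR` and the file name are stand-ins, not names taken from the PR), after which module-level code logs through the root logger as the hunks below show.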

deepdoc/parser/pdf_parser.py

@@ -11,6 +11,7 @@
# limitations under the License.
#
+import logging
import os
import random
@@ -18,7 +19,6 @@ import xgboost as xgb
from io import BytesIO
import re
import pdfplumber
-import logging
from PIL import Image
import numpy as np
from timeit import default_timer as timer
@@ -26,15 +26,11 @@ from pypdf import PdfReader as pdf2_read
from api.settings import LIGHTEN
from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer
from copy import deepcopy
from huggingface_hub import snapshot_download
logging.getLogger("pdfminer").setLevel(logging.WARNING)
class RAGFlowPdfParser:
def __init__(self):
self.ocr = OCR()
@@ -51,7 +47,7 @@ class RAGFlowPdfParser:
if torch.cuda.is_available():
self.updown_cnt_mdl.set_param({"device": "cuda"})
except Exception:
logger.exception("RAGFlowPdfParser __init__")
logging.exception("RAGFlowPdfParser __init__")
try:
model_dir = os.path.join(
get_project_base_directory(),
@@ -188,7 +184,7 @@ class RAGFlowPdfParser:
return True
def _table_transformer_job(self, ZM):
logger.info("Table processing...")
logging.debug("Table processing...")
imgs, pos = [], []
tbcnt = [0]
MARGIN = 10
@@ -426,7 +422,7 @@
detach_feats = [b["x1"] < b_["x0"],
b["x0"] > b_["x1"]]
if (any(feats) and not any(concatting_feats)) or any(detach_feats):
logger.info("{} {} {} {}".format(
logging.debug("{} {} {} {}".format(
b["text"],
b_["text"],
any(feats),
@@ -727,14 +723,14 @@
# continue
if tv < fv and tk:
tables[tk].insert(0, c)
-logger.debug(
+logging.debug(
"TABLE:" +
self.boxes[i]["text"] +
"; Cap: " +
tk)
elif fk:
figures[fk].insert(0, c)
-logger.debug(
+logging.debug(
"FIGURE:" +
self.boxes[i]["text"] +
"; Cap: " +
@@ -761,7 +757,7 @@
if ii is not None:
b = louts[ii]
else:
-logger.warn(
+logging.warn(
f"Missing layout match: {pn + 1},%s" %
(bxs[0].get(
"layoutno", "")))
@@ -919,7 +915,7 @@
if usefull(boxes[0]):
dfs(boxes[0], 0)
else:
logger.debug("WASTE: " + boxes[0]["text"])
logging.debug("WASTE: " + boxes[0]["text"])
except Exception:
pass
boxes.pop(0)
@@ -928,7 +924,7 @@
res.append(
"\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
else:
logger.debug("REMOVED: " +
logging.debug("REMOVED: " +
"<<".join([c["text"] for c in lines]))
return "\n\n".join(res)
@@ -940,7 +936,7 @@
fnm) if not binary else pdfplumber.open(BytesIO(binary))
return len(pdf.pages)
except Exception:
logger.exception("total_page_number")
logging.exception("total_page_number")
def __images__(self, fnm, zoomin=3, page_from=0,
page_to=299, callback=None):
@@ -964,7 +960,7 @@
self.pdf.pages[page_from:page_to]]
self.total_page = len(self.pdf.pages)
except Exception:
logger.exception("RAGFlowPdfParser __images__")
logging.exception("RAGFlowPdfParser __images__")
self.outlines = []
try:
@@ -980,11 +976,11 @@
dfs(outlines, 0)
except Exception as e:
logger.warning(f"Outlines exception: {e}")
logging.warning(f"Outlines exception: {e}")
if not self.outlines:
logger.warning("Miss outlines")
logging.warning("Miss outlines")
logger.info("Images converted.")
logging.debug("Images converted.")
self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
range(len(self.page_chars))]
@@ -1024,7 +1020,7 @@
self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
"".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
logger.info("Is it English:", self.is_english)
logging.debug("Is it English:", self.is_english)
self.page_cum_height = np.cumsum(self.page_cum_height)
assert len(self.page_cum_height) == len(self.page_images) + 1
@@ -1164,9 +1160,9 @@ class PlainParser(object):
dfs(outlines, 0)
except Exception:
logger.exception("Outlines exception")
logging.exception("Outlines exception")
if not self.outlines:
logger.warning("Miss outlines")
logging.warning("Miss outlines")
return [(l, "") for l in lines], []

deepdoc/parser/resume/entities/corporations.py

@@ -11,13 +11,13 @@
# limitations under the License.
#
+import logging
import re
import json
import os
import pandas as pd
from rag.nlp import rag_tokenizer
from . import regions
-from api.utils.log_utils import logger
current_file_path = os.path.dirname(os.path.abspath(__file__))
@@ -71,7 +71,7 @@ GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
for c,v in CORP_TAG.items():
cc = corpNorm(rmNoise(c), False)
if not cc:
-logger.info(c)
+logging.debug(c)
CORP_TAG = {corpNorm(rmNoise(c), False):v for c,v in CORP_TAG.items()}
def is_good(nm):

deepdoc/parser/resume/step_two.py

@@ -10,7 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import logging
import re
import copy
import time
@@ -23,7 +23,6 @@ from deepdoc.parser.resume.entities import degrees, schools, corporations
from rag.nlp import rag_tokenizer, surname
from xpinyin import Pinyin
from contextlib import contextmanager
-from api.utils.log_utils import logger
class TimeoutException(Exception): pass
@@ -164,7 +163,7 @@ def forEdu(cv):
y, m, d = getYMD(edu_end_dt)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
logger.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
logging.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
if sch:
cv["school_name_kwd"] = sch
if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
@@ -276,7 +275,7 @@ def forWork(cv):
try:
duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
except Exception:
logger.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))
logging.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))
if n.get("scale"):
r = re.search(r"^([0-9]+)", str(n["scale"]))
@@ -333,7 +332,7 @@ def forWork(cv):
y, m, d = getYMD(work_st_tm)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
logger.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))
logging.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))
cv["job_num_int"] = 0
if duas:
@@ -464,7 +463,7 @@ def parse(cv):
cv[f"{t}_kwd"] = nms
cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
except Exception:
logger.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
logging.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
cv[k] = []
# tokenize fields
@@ -565,7 +564,7 @@ def parse(cv):
cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
except Exception as e:
logger.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
keys = list(cv.keys())
@@ -580,7 +579,7 @@ def parse(cv):
cv["tob_resume_id"] = str(cv["tob_resume_id"])
cv["id"] = cv["tob_resume_id"]
logger.info("CCCCCCCCCCCCCCC")
logging.debug("CCCCCCCCCCCCCCC")
return dealWithInt64(cv)
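
The per-module change repeated in every file above is mechanical; summarized as a before/after sketch (the messages are lifted from the hunks above, and note that chatty progress logs are also demoted from `info` to `debug` along the way):

```python
# Before this PR: every module imported a shared custom logger.
# from api.utils.log_utils import logger
# logger.info("Table processing...")

# After: modules call the stdlib root logger directly.
import logging

logging.debug("Table processing...")  # progress chatter, now DEBUG level

try:
    raise RuntimeError("simulated parser failure")  # stand-in for real work
except Exception:
    # logging.exception logs at ERROR level and appends the traceback,
    # matching the except blocks in the parser code above.
    logging.exception("RAGFlowPdfParser __init__")
```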