Rework logging (#3358)

Unified all log files into one.

### What problem does this PR solve?

Unified all log files into one.

### Type of change

- [x] Refactoring
Zhichang Yu authored 2024-11-12 17:35:13 +08:00, committed by GitHub
parent 567a7563e7 · commit a2a5631da4
75 changed files with 481 additions and 853 deletions
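The diffs below replace scattered `print(...)` calls and direct use of the root `logging` module with a single shared `logger` imported from `api.utils.log_utils`, so all output lands in one log file. A minimal sketch of what such a unified logger module could look like (the file path, logger name, and rotation settings here are illustrative assumptions, not the actual implementation):

```python
# api/utils/log_utils.py -- hypothetical sketch of a unified logger.
# All paths and settings below are assumptions for illustration.
import logging
import os
from logging.handlers import RotatingFileHandler

LOG_FILE = os.path.abspath("logs/ragflow.log")  # assumed single log file
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logger = logging.getLogger("ragflow")
logger.setLevel(logging.INFO)

# One rotating file handler plus console echo; every module then does
# `from api.utils.log_utils import logger` instead of print()/logging.*.
_fmt = logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")
for _h in (RotatingFileHandler(LOG_FILE, maxBytes=10 * 1024 * 1024, backupCount=5),
           logging.StreamHandler()):
    _h.setFormatter(_fmt)
    logger.addHandler(_h)
```

Call sites then switch from `print(...)` and `logging.error(str(e))` to `logger.info(...)` and `logger.exception(...)`; the latter records the full traceback when used inside an `except` block.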

View File

@@ -19,13 +19,14 @@ from io import BytesIO
import re
import pdfplumber
import logging
from PIL import Image, ImageDraw
from PIL import Image
import numpy as np
from timeit import default_timer as timer
from pypdf import PdfReader as pdf2_read
from api.settings import LIGHTEN
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer
from copy import deepcopy
@@ -49,15 +50,15 @@ class RAGFlowPdfParser:
import torch
if torch.cuda.is_available():
self.updown_cnt_mdl.set_param({"device": "cuda"})
except Exception as e:
logging.error(str(e))
except Exception:
logger.exception("RAGFlowPdfParser __init__")
try:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
self.updown_cnt_mdl.load_model(os.path.join(
model_dir, "updown_concat_xgb.model"))
except Exception as e:
except Exception:
model_dir = snapshot_download(
repo_id="InfiniFlow/text_concat_xgb_v1.0",
local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
@@ -187,7 +188,7 @@ class RAGFlowPdfParser:
return True
def _table_transformer_job(self, ZM):
logging.info("Table processing...")
logger.info("Table processing...")
imgs, pos = [], []
tbcnt = [0]
MARGIN = 10
@@ -425,12 +426,12 @@ class RAGFlowPdfParser:
detach_feats = [b["x1"] < b_["x0"],
b["x0"] > b_["x1"]]
if (any(feats) and not any(concatting_feats)) or any(detach_feats):
print(
logger.info("{} {} {} {}".format(
b["text"],
b_["text"],
any(feats),
any(concatting_feats),
any(detach_feats))
))
i += 1
continue
# merge up and down
@@ -726,14 +727,14 @@ class RAGFlowPdfParser:
# continue
if tv < fv and tk:
tables[tk].insert(0, c)
logging.debug(
logger.debug(
"TABLE:" +
self.boxes[i]["text"] +
"; Cap: " +
tk)
elif fk:
figures[fk].insert(0, c)
logging.debug(
logger.debug(
"FIGURE:" +
self.boxes[i]["text"] +
"; Cap: " +
@@ -760,7 +761,7 @@ class RAGFlowPdfParser:
if ii is not None:
b = louts[ii]
else:
logging.warn(
logger.warn(
f"Missing layout match: {pn + 1},%s" %
(bxs[0].get(
"layoutno", "")))
@@ -918,8 +919,8 @@ class RAGFlowPdfParser:
if usefull(boxes[0]):
dfs(boxes[0], 0)
else:
logging.debug("WASTE: " + boxes[0]["text"])
except Exception as e:
logger.debug("WASTE: " + boxes[0]["text"])
except Exception:
pass
boxes.pop(0)
mw = np.mean(widths)
@@ -927,7 +928,7 @@ class RAGFlowPdfParser:
res.append(
"\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
else:
logging.debug("REMOVED: " +
logger.debug("REMOVED: " +
"<<".join([c["text"] for c in lines]))
return "\n\n".join(res)
@@ -938,8 +939,8 @@ class RAGFlowPdfParser:
pdf = pdfplumber.open(
fnm) if not binary else pdfplumber.open(BytesIO(binary))
return len(pdf.pages)
except Exception as e:
logging.error(str(e))
except Exception:
logger.exception("total_page_number")
def __images__(self, fnm, zoomin=3, page_from=0,
page_to=299, callback=None):
@@ -962,8 +963,8 @@ class RAGFlowPdfParser:
self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
self.pdf.pages[page_from:page_to]]
self.total_page = len(self.pdf.pages)
except Exception as e:
logging.error(str(e))
except Exception:
logger.exception("RAGFlowPdfParser __images__")
self.outlines = []
try:
@@ -979,11 +980,11 @@ class RAGFlowPdfParser:
dfs(outlines, 0)
except Exception as e:
logging.warning(f"Outlines exception: {e}")
logger.warning(f"Outlines exception: {e}")
if not self.outlines:
logging.warning(f"Miss outlines")
logger.warning("Miss outlines")
logging.info("Images converted.")
logger.info("Images converted.")
self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
range(len(self.page_chars))]
@@ -1023,7 +1024,7 @@ class RAGFlowPdfParser:
self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
"".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
logging.info("Is it English:", self.is_english)
logger.info("Is it English:", self.is_english)
self.page_cum_height = np.cumsum(self.page_cum_height)
assert len(self.page_cum_height) == len(self.page_images) + 1
@@ -1162,10 +1163,10 @@ class PlainParser(object):
dfs(a, depth + 1)
dfs(outlines, 0)
except Exception as e:
logging.warning(f"Outlines exception: {e}")
except Exception:
logger.exception("Outlines exception")
if not self.outlines:
logging.warning(f"Miss outlines")
logger.warning("Miss outlines")
return [(l, "") for l in lines], []
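A recurring pattern in these diffs is replacing `except Exception as e: logging.error(str(e))` with a bare `except Exception:` plus `logger.exception(...)`. The sketch below, reconstructed from the `total_page_number` hunk above for illustration (the standalone `getLogger` line stands in for the real `from api.utils.log_utils import logger`), shows the point of the change: `logger.exception` logs the message at ERROR level together with the full traceback, which `logging.error(str(e))` discarded:

```python
import logging

logger = logging.getLogger("ragflow")  # stand-in for api.utils.log_utils.logger

def total_page_number(fnm, binary=None):
    try:
        from io import BytesIO
        import pdfplumber
        pdf = pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary))
        return len(pdf.pages)
    except Exception:
        # Logs "total_page_number" plus the traceback of the active exception.
        logger.exception("total_page_number")
```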

View File

@@ -11,10 +11,15 @@
# limitations under the License.
#
import re,json,os
import re
import json
import os
import pandas as pd
from rag.nlp import rag_tokenizer
from . import regions
from api.utils.log_utils import logger
current_file_path = os.path.dirname(os.path.abspath(__file__))
GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
GOODS["cid"] = GOODS["cid"].astype(str)
@@ -27,7 +32,7 @@ def baike(cid, default_v=0):
global GOODS
try:
return GOODS.loc[str(cid), "len"]
except Exception as e:
except Exception:
pass
return default_v
@@ -65,7 +70,8 @@ def rmNoise(n):
GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
for c,v in CORP_TAG.items():
cc = corpNorm(rmNoise(c), False)
if not cc: print (c)
if not cc:
logger.info(c)
CORP_TAG = {corpNorm(rmNoise(c), False):v for c,v in CORP_TAG.items()}
def is_good(nm):

View File

@@ -11,13 +11,19 @@
# limitations under the License.
#
import re, copy, time, datetime, demjson3, \
traceback, signal
import re
import copy
import time
import datetime
import demjson3
import traceback
import signal
import numpy as np
from deepdoc.parser.resume.entities import degrees, schools, corporations
from rag.nlp import rag_tokenizer, surname
from xpinyin import Pinyin
from contextlib import contextmanager
from api.utils.log_utils import logger
class TimeoutException(Exception): pass
@@ -79,7 +85,7 @@ def forEdu(cv):
y, m, d = getYMD(dt)
st_dt.append(str(y))
e["start_dt_kwd"] = str(y)
except Exception as e:
except Exception:
pass
r = schools.select(n.get("school_name", ""))
@@ -158,7 +164,7 @@ def forEdu(cv):
y, m, d = getYMD(edu_end_dt)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
print("EXCEPTION: ", e, edu_end_dt, cv.get("work_exp_flt"))
logger.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
if sch:
cv["school_name_kwd"] = sch
if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
@@ -233,7 +239,7 @@ def forWork(cv):
if type(n) == type(""):
try:
n = json_loads(n)
except Exception as e:
except Exception:
continue
if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm): work_st_tm = n["start_time"]
@@ -269,8 +275,8 @@ def forWork(cv):
try:
duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
except Exception as e:
print("kkkkkkkkkkkkkkkkkkkk", n.get("start_time"), n.get("end_time"))
except Exception:
logger.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))
if n.get("scale"):
r = re.search(r"^([0-9]+)", str(n["scale"]))
@@ -327,7 +333,7 @@ def forWork(cv):
y, m, d = getYMD(work_st_tm)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
print("EXCEPTION: ", e, work_st_tm, cv.get("work_exp_flt"))
logger.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))
cv["job_num_int"] = 0
if duas:
@@ -457,8 +463,8 @@ def parse(cv):
t = k[:-4]
cv[f"{t}_kwd"] = nms
cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
except Exception as e:
print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
except Exception:
logger.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
cv[k] = []
# tokenize fields
@@ -524,7 +530,7 @@ def parse(cv):
if not y: y = "2012"
if not m: m = "01"
if not d: d = "01"
cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
# long text tokenize
if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
@@ -556,10 +562,10 @@ def parse(cv):
cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
y, m, d = getYMD(str(cv["work_start_time"]))
cv["work_start_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
except Exception as e:
print("【EXCEPTION】", e, "==>", cv.get("work_start_time"))
logger.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
keys = list(cv.keys())
@@ -574,7 +580,7 @@ def parse(cv):
cv["tob_resume_id"] = str(cv["tob_resume_id"])
cv["id"] = cv["tob_resume_id"]
print("CCCCCCCCCCCCCCC")
logger.info("CCCCCCCCCCCCCCC")
return dealWithInt64(cv)
@@ -589,4 +595,3 @@ def dealWithInt64(d):
if isinstance(d, np.integer): d = int(d)
return d

View File

@@ -20,6 +20,7 @@ import cv2
import numpy as np
import math
from PIL import Image
from api.utils.log_utils import logger
class DecodeImage(object):
@@ -402,7 +403,7 @@ class DetResizeForTest(object):
return None, (None, None)
img = cv2.resize(img, (int(resize_w), int(resize_h)))
except BaseException:
print(img.shape, resize_w, resize_h)
logger.exception("{} {} {}".format(img.shape, resize_w, resize_h))
sys.exit(0)
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
@@ -452,7 +453,6 @@ class E2EResizeForTest(object):
return data
def resize_image_for_totaltext(self, im, max_side_len=512):
h, w, _ = im.shape
resize_w = w
resize_h = h

View File

@@ -19,6 +19,7 @@ from huggingface_hub import snapshot_download
from api.utils.file_utils import get_project_base_directory
from .operators import *
from api.utils.log_utils import logger
class Recognizer(object):
@@ -439,7 +440,7 @@ class Recognizer(object):
end_index = min((i + 1) * batch_size, len(imgs))
batch_image_list = imgs[start_index:end_index]
inputs = self.preprocess(batch_image_list)
print("preprocess")
logger.info("preprocess")
for ins in inputs:
bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names})[0], ins, thr)
res.append(bb)

View File

@@ -14,6 +14,7 @@
import os
import PIL
from PIL import ImageDraw
from api.utils.log_utils import logger
def save_results(image_list, results, labels, output_dir='output/', threshold=0.5):
@@ -24,7 +25,7 @@ def save_results(image_list, results, labels, output_dir='output/', threshold=0.
out_path = os.path.join(output_dir, f"{idx}.jpg")
im.save(out_path, quality=95)
print("save result to: " + out_path)
logger.info("save result to: " + out_path)
def draw_box(im, result, lables, threshold=0.5):

View File

@@ -10,7 +10,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os, sys
import os
import sys
from api.utils.log_utils import logger
sys.path.insert(
0,
os.path.abspath(
@@ -56,7 +59,7 @@ def main(args):
} for t in lyt]
img = draw_box(images[i], lyt, labels, float(args.threshold))
img.save(outputs[i], quality=95)
print("save result to: " + outputs[i])
logger.info("save result to: " + outputs[i])
def get_table_html(img, tb_cpns, ocr):