Mirror of https://github.com/infiniflow/ragflow.git
Rework logging (#3358)
### What problem does this PR solve?

Unified all log files into one.

### Type of change

- [x] Refactoring
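All of the hunks below funnel ad-hoc `logging.*` calls and leftover `print()` debugging through a single shared `logger` imported from `api.utils.log_utils`, so every component writes to the same log file. That module itself is not part of this excerpt; a minimal sketch of what such a unified file-plus-console logger could look like (the path, rotation policy and format below are illustrative assumptions, not the project's actual settings) is:

```python
# Hypothetical sketch of a unified logger module in the spirit of api/utils/log_utils.py.
# The real module is not shown in this diff; file path, rotation size and format are assumptions.
import logging
import os
from logging.handlers import RotatingFileHandler

LOG_FILE = os.path.abspath("ragflow_server.log")  # assumed single log file

logger = logging.getLogger("ragflow")
logger.setLevel(logging.INFO)

_fmt = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s:%(lineno)d %(message)s")

_file = RotatingFileHandler(LOG_FILE, maxBytes=10 * 1024 * 1024, backupCount=5)
_file.setFormatter(_fmt)
logger.addHandler(_file)

_console = logging.StreamHandler()
_console.setFormatter(_fmt)
logger.addHandler(_console)
```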
```diff
@@ -19,13 +19,14 @@ from io import BytesIO
 import re
 import pdfplumber
-import logging
-from PIL import Image, ImageDraw
+from PIL import Image
 import numpy as np
 from timeit import default_timer as timer
 from pypdf import PdfReader as pdf2_read
 
 from api.settings import LIGHTEN
 from api.utils.file_utils import get_project_base_directory
+from api.utils.log_utils import logger
 from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
 from rag.nlp import rag_tokenizer
 from copy import deepcopy
@@ -49,15 +50,15 @@ class RAGFlowPdfParser:
             import torch
             if torch.cuda.is_available():
                 self.updown_cnt_mdl.set_param({"device": "cuda"})
-        except Exception as e:
-            logging.error(str(e))
+        except Exception:
+            logger.exception("RAGFlowPdfParser __init__")
         try:
             model_dir = os.path.join(
                 get_project_base_directory(),
                 "rag/res/deepdoc")
             self.updown_cnt_mdl.load_model(os.path.join(
                 model_dir, "updown_concat_xgb.model"))
-        except Exception as e:
+        except Exception:
             model_dir = snapshot_download(
                 repo_id="InfiniFlow/text_concat_xgb_v1.0",
                 local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
```
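A pattern repeated throughout the diff: `logging.error(str(e))` records only the exception message, whereas `logger.exception(...)` records the message plus the full traceback of the exception currently being handled, which is why the `as e` binding can be dropped. A small, self-contained illustration (the names here are generic, not taken from the repository):

```python
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("demo")

def load_model():
    raise FileNotFoundError("updown_concat_xgb.model not found")

try:
    load_model()
except Exception as e:
    logger.error(str(e))                    # message only, no traceback

try:
    load_model()
except Exception:
    logger.exception("load_model failed")   # message plus full traceback
```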
```diff
@@ -187,7 +188,7 @@ class RAGFlowPdfParser:
         return True

     def _table_transformer_job(self, ZM):
-        logging.info("Table processing...")
+        logger.info("Table processing...")
         imgs, pos = [], []
         tbcnt = [0]
         MARGIN = 10
@@ -425,12 +426,12 @@ class RAGFlowPdfParser:
             detach_feats = [b["x1"] < b_["x0"],
                             b["x0"] > b_["x1"]]
             if (any(feats) and not any(concatting_feats)) or any(detach_feats):
-                print(
+                logger.info("{} {} {} {}".format(
                     b["text"],
                     b_["text"],
                     any(feats),
                     any(concatting_feats),
-                    any(detach_feats))
+                ))
                 i += 1
                 continue
             # merge up and down
@@ -726,14 +727,14 @@ class RAGFlowPdfParser:
             # continue
             if tv < fv and tk:
                 tables[tk].insert(0, c)
-                logging.debug(
+                logger.debug(
                     "TABLE:" +
                     self.boxes[i]["text"] +
                     "; Cap: " +
                     tk)
             elif fk:
                 figures[fk].insert(0, c)
-                logging.debug(
+                logger.debug(
                     "FIGURE:" +
                     self.boxes[i]["text"] +
                     "; Cap: " +
@@ -760,7 +761,7 @@ class RAGFlowPdfParser:
             if ii is not None:
                 b = louts[ii]
             else:
-                logging.warn(
+                logger.warn(
                     f"Missing layout match: {pn + 1},%s" %
                     (bxs[0].get(
                         "layoutno", "")))
```
```diff
@@ -918,8 +919,8 @@ class RAGFlowPdfParser:
                 if usefull(boxes[0]):
                     dfs(boxes[0], 0)
                 else:
-                    logging.debug("WASTE: " + boxes[0]["text"])
-            except Exception as e:
+                    logger.debug("WASTE: " + boxes[0]["text"])
+            except Exception:
                 pass
             boxes.pop(0)
             mw = np.mean(widths)
@@ -927,7 +928,7 @@ class RAGFlowPdfParser:
                 res.append(
                     "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
             else:
-                logging.debug("REMOVED: " +
+                logger.debug("REMOVED: " +
                               "<<".join([c["text"] for c in lines]))

         return "\n\n".join(res)
@@ -938,8 +939,8 @@ class RAGFlowPdfParser:
             pdf = pdfplumber.open(
                 fnm) if not binary else pdfplumber.open(BytesIO(binary))
             return len(pdf.pages)
-        except Exception as e:
-            logging.error(str(e))
+        except Exception:
+            logger.exception("total_page_number")

     def __images__(self, fnm, zoomin=3, page_from=0,
                    page_to=299, callback=None):
@@ -962,8 +963,8 @@ class RAGFlowPdfParser:
            self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
                               self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
-        except Exception as e:
-            logging.error(str(e))
+        except Exception:
+            logger.exception("RAGFlowPdfParser __images__")

         self.outlines = []
         try:
@@ -979,11 +980,11 @@ class RAGFlowPdfParser:

                 dfs(outlines, 0)
             except Exception as e:
-                logging.warning(f"Outlines exception: {e}")
+                logger.warning(f"Outlines exception: {e}")
             if not self.outlines:
-                logging.warning(f"Miss outlines")
+                logger.warning("Miss outlines")

-        logging.info("Images converted.")
+        logger.info("Images converted.")
         self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
             random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
             range(len(self.page_chars))]
@@ -1023,7 +1024,7 @@ class RAGFlowPdfParser:
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))

-        logging.info("Is it English:", self.is_english)
+        logger.info("Is it English:", self.is_english)

         self.page_cum_height = np.cumsum(self.page_cum_height)
         assert len(self.page_cum_height) == len(self.page_images) + 1
```
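Also worth noting: `logger.info("Is it English:", self.is_english)` carries a `print`-style second argument over to the logging API, where extra positional arguments are treated as `%`-format parameters; with no placeholder in the message, the record fails to format and the logging module reports a formatting error instead of the value. For example:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")

is_english = True
logger.info("Is it English:", is_english)     # no %s placeholder: logging reports a formatting error
logger.info("Is it English: %s", is_english)  # interpolating form that actually logs the value
```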
```diff
@@ -1162,10 +1163,10 @@ class PlainParser(object):
                 dfs(a, depth + 1)

             dfs(outlines, 0)
-        except Exception as e:
-            logging.warning(f"Outlines exception: {e}")
+        except Exception:
+            logger.exception("Outlines exception")
         if not self.outlines:
-            logging.warning(f"Miss outlines")
+            logger.warning("Miss outlines")

         return [(l, "") for l in lines], []
```

```diff
@@ -11,10 +11,15 @@
 # limitations under the License.
 #

-import re,json,os
+import re
+import json
+import os
 import pandas as pd
 from rag.nlp import rag_tokenizer
 from . import regions
+from api.utils.log_utils import logger


 current_file_path = os.path.dirname(os.path.abspath(__file__))
 GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
 GOODS["cid"] = GOODS["cid"].astype(str)
@@ -27,7 +32,7 @@ def baike(cid, default_v=0):
     global GOODS
     try:
         return GOODS.loc[str(cid), "len"]
-    except Exception as e:
+    except Exception:
         pass
     return default_v
```
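`baike()` is a label lookup with a silent default: any failure, such as an unknown company id, falls through to `default_v`. A minimal, self-contained sketch of the same pattern (the toy DataFrame stands in for the real `corp_baike_len.csv` data):

```python
import pandas as pd

# Toy stand-in for GOODS; the real frame is loaded from res/corp_baike_len.csv.
goods = pd.DataFrame({"len": [120, 45]}, index=["1001", "1002"])

def lookup_len(cid, default_v=0):
    try:
        return goods.loc[str(cid), "len"]   # raises KeyError for unknown ids
    except Exception:
        return default_v

print(lookup_len("1001"))  # 120
print(lookup_len(9999))    # 0 (fallback)
```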
```diff
@@ -65,7 +70,8 @@ def rmNoise(n):
 GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
 for c,v in CORP_TAG.items():
     cc = corpNorm(rmNoise(c), False)
-    if not cc: print (c)
+    if not cc:
+        logger.info(c)
 CORP_TAG = {corpNorm(rmNoise(c), False):v for c,v in CORP_TAG.items()}

 def is_good(nm):
```

```diff
@@ -11,13 +11,19 @@
 # limitations under the License.
 #

-import re, copy, time, datetime, demjson3, \
-    traceback, signal
+import re
+import copy
+import time
+import datetime
+import demjson3
+import traceback
+import signal
 import numpy as np
 from deepdoc.parser.resume.entities import degrees, schools, corporations
 from rag.nlp import rag_tokenizer, surname
 from xpinyin import Pinyin
 from contextlib import contextmanager
+from api.utils.log_utils import logger


 class TimeoutException(Exception): pass
```
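The imports above (`signal`, `contextmanager`) and the `TimeoutException` class back a signal-based time limit used later in this module; that helper is not part of this hunk, but the usual shape of such a guard is roughly the following (a sketch; it relies on `SIGALRM`, so it is Unix-only and main-thread-only):

```python
import signal
from contextlib import contextmanager

class TimeoutException(Exception):
    pass

@contextmanager
def time_limit(seconds):
    # Raise TimeoutException if the wrapped block runs longer than `seconds`.
    def handler(signum, frame):
        raise TimeoutException("Timed out!")
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)  # always cancel the pending alarm

# with time_limit(5):
#     ...  # e.g. a single resume-parsing step
```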
```diff
@@ -79,7 +85,7 @@ def forEdu(cv):
                 y, m, d = getYMD(dt)
                 st_dt.append(str(y))
                 e["start_dt_kwd"] = str(y)
-            except Exception as e:
+            except Exception:
                 pass

             r = schools.select(n.get("school_name", ""))
@@ -158,7 +164,7 @@ def forEdu(cv):
             y, m, d = getYMD(edu_end_dt)
             cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
         except Exception as e:
-            print("EXCEPTION: ", e, edu_end_dt, cv.get("work_exp_flt"))
+            logger.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
     if sch:
         cv["school_name_kwd"] = sch
     if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
@@ -233,7 +239,7 @@ def forWork(cv):
         if type(n) == type(""):
             try:
                 n = json_loads(n)
-            except Exception as e:
+            except Exception:
                 continue

         if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm): work_st_tm = n["start_time"]
@@ -269,8 +275,8 @@ def forWork(cv):

         try:
             duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
-        except Exception as e:
-            print("kkkkkkkkkkkkkkkkkkkk", n.get("start_time"), n.get("end_time"))
+        except Exception:
+            logger.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))

         if n.get("scale"):
             r = re.search(r"^([0-9]+)", str(n["scale"]))
@@ -327,7 +333,7 @@ def forWork(cv):
         y, m, d = getYMD(work_st_tm)
         cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
     except Exception as e:
-        print("EXCEPTION: ", e, work_st_tm, cv.get("work_exp_flt"))
+        logger.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))

     cv["job_num_int"] = 0
     if duas:
@@ -457,8 +463,8 @@ def parse(cv):
             t = k[:-4]
             cv[f"{t}_kwd"] = nms
             cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
-        except Exception as e:
-            print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
+        except Exception:
+            logger.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
             cv[k] = []

     # tokenize fields
@@ -524,7 +530,7 @@ def parse(cv):
         if not y: y = "2012"
         if not m: m = "01"
         if not d: d = "01"
-        cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
+        cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
     # long text tokenize

     if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
@@ -556,10 +562,10 @@ def parse(cv):
             cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
         elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
             y, m, d = getYMD(str(cv["work_start_time"]))
-            cv["work_start_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
+            cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
             cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
     except Exception as e:
-        print("【EXCEPTION】", e, "==>", cv.get("work_start_time"))
+        logger.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
     if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.

     keys = list(cv.keys())
```
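The two `updated_at_dt` / `work_start_dt` edits above are purely cosmetic: the strings contain `%` placeholders but no `{}` fields, so the `f` prefix did nothing except trip the f-string-without-interpolation lint warning. Both forms yield the same value:

```python
y, m, d = "2015", "9", "3"
s_old = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))  # the f prefix is a no-op here
s_new = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
assert s_old == s_new == "2015-09-03 00:00:00"
```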
```diff
@@ -574,7 +580,7 @@ def parse(cv):

     cv["tob_resume_id"] = str(cv["tob_resume_id"])
     cv["id"] = cv["tob_resume_id"]
-    print("CCCCCCCCCCCCCCC")
+    logger.info("CCCCCCCCCCCCCCC")

     return dealWithInt64(cv)

@@ -589,4 +595,3 @@ def dealWithInt64(d):

     if isinstance(d, np.integer): d = int(d)
     return d
-
```
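`dealWithInt64` downcasts NumPy integers before the parsed CV leaves `parse()`, presumably because consumers such as `json.dumps` cannot serialize `np.int64` directly. A quick illustration:

```python
import json
import numpy as np

record = {"work_exp_flt": 3.5, "job_num_int": np.int64(4)}
# json.dumps(record) would raise: TypeError: Object of type int64 is not JSON serializable
record["job_num_int"] = int(record["job_num_int"])
print(json.dumps(record))  # {"work_exp_flt": 3.5, "job_num_int": 4}
```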