Rework logging (#3358)

### What problem does this PR solve?

Unified all log files into one.
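
For reference, the hunks below import a shared `logger` from `api.utils.log_utils`. A minimal sketch of what a single-file logger module like that could look like (the log file name, format string, and rotation settings here are assumptions for illustration, not this PR's actual implementation):

```python
# Hypothetical sketch of a unified logger module (e.g. api/utils/log_utils.py).
# File name, format string, and rotation settings are assumed, not from the PR.
import logging
from logging.handlers import RotatingFileHandler

LOG_FILE = "ragflow.log"  # assumed: one log file shared by every component

logger = logging.getLogger("ragflow")
logger.setLevel(logging.INFO)

if not logger.handlers:  # guard against adding duplicate handlers on re-import
    handler = RotatingFileHandler(LOG_FILE, maxBytes=10 * 1024 * 1024, backupCount=5)
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)s %(name)s %(message)s"))
    logger.addHandler(handler)
```

Modules then drop their ad-hoc `print` calls in favor of `from api.utils.log_utils import logger`, which is the pattern every hunk below follows.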

### Type of change

- [x] Refactoring
Author: Zhichang Yu
Committed: 2024-11-12 17:35:13 +08:00 (committed by GitHub)
Commit: a2a5631da4 (parent 567a7563e7)
75 changed files with 481 additions and 853 deletions

```diff
@@ -17,6 +17,7 @@ from api.db import ParserType
 from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
 from deepdoc.parser import PdfParser, PlainParser
 import numpy as np
+from api.utils.log_utils import logger


 class Pdf(PdfParser):
@@ -40,7 +41,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.63, "Layout analysis finished")
-        print("layouts:", timer() - start)
+        logger.info(f"layouts cost: {timer() - start}s")
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
@@ -52,8 +53,8 @@ class Pdf(PdfParser):
         # clean mess
         if column_width < self.page_images[0].size[0] / zoomin / 2:
-            print("two_column...................", column_width,
-                  self.page_images[0].size[0] / zoomin / 2)
+            logger.info("two_column................... {} {}".format(column_width,
+                        self.page_images[0].size[0] / zoomin / 2))
             self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
         for b in self.boxes:
             b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
@@ -114,8 +115,8 @@ class Pdf(PdfParser):
                 from_page, min(
                     to_page, self.total_page)))
         for b in self.boxes:
-            print(b["text"], b.get("layoutno"))
-        print(tbls)
+            logger.info("{} {}".format(b["text"], b.get("layoutno")))
+        logger.info("{}".format(tbls))

         return {
             "title": title,
@@ -156,7 +157,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
     # is it English
     eng = lang.lower() == "english" # pdf_parser.is_english
-    print("It's English.....", eng)
+    logger.info("It's English.....{}".format(eng))

     res = tokenize_table(paper["tables"], doc, eng)
@@ -183,7 +184,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
             sid += 1
         sec_ids.append(sid)
-        print(lvl, sorted_sections[i][0], most_level, sid)
+        logger.info("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))

     chunks = []
     last_sid = -2
```
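
A side note on the migration pattern (an observation about Python's standard `logging` module, not a change made in this PR): both the f-string and `str.format()` calls above build the message string eagerly, even when the log level would discard it. The logging module's lazy `%`-style defers formatting until a handler actually emits the record:

```python
import logging

logger = logging.getLogger("ragflow")

# Lazy %-style formatting: the string is only rendered if INFO is enabled,
# whereas f-strings and str.format() always format eagerly.
elapsed = 1.234  # placeholder value for illustration
logger.info("layouts cost: %ss", elapsed)
```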