# Rework logging (#3358)

### What problem does this PR solve?

Unified all log files into one.

### Type of change

- [x] Refactoring
```diff
@@ -17,6 +17,7 @@ from api.db import ParserType
 from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
 from deepdoc.parser import PdfParser, PlainParser
 import numpy as np
+from api.utils.log_utils import logger


 class Pdf(PdfParser):
```
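The new import above is the whole interface each module sees. For context on the "unified into one file" goal, here is a minimal sketch of what a shared logger module in this spirit could look like — the real `api/utils/log_utils.py` is not shown in this diff, so the file name, env var, and rotation settings below are assumptions, not ragflow's actual implementation:

```python
# Hypothetical sketch only -- NOT the actual api/utils/log_utils.py.
import logging
import os
from logging.handlers import RotatingFileHandler

# Assumed: a single shared log file, overridable via an env var.
LOG_FILE = os.environ.get("RAGFLOW_LOG_FILE", "ragflow.log")

logger = logging.getLogger("ragflow")
logger.setLevel(logging.INFO)

if not logger.handlers:  # guard against duplicate handlers on re-import
    _handler = RotatingFileHandler(LOG_FILE, maxBytes=10 * 1024 * 1024,
                                   backupCount=5)
    _handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)s %(name)s %(message)s"))
    logger.addHandler(_handler)
```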
```diff
@@ -40,7 +41,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.63, "Layout analysis finished")
-        print("layouts:", timer() - start)
+        logger.info(f"layouts cost: {timer() - start}s")
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
```
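One note on the f-string above: it interpolates eagerly, even when INFO-level records are filtered out. Logging's own %-style arguments are deferred until a record is actually emitted, so on hot paths that form is cheaper. A small sketch of the alternative (not part of this PR; `timer` is assumed here to be `timeit.default_timer`):

```python
import logging
from timeit import default_timer as timer  # assumption for this sketch

logger = logging.getLogger("ragflow")
start = timer()

# Eager: the f-string is built even if INFO is disabled:
#     logger.info(f"layouts cost: {timer() - start}s")
# Deferred: the argument is formatted only if the record is emitted:
logger.info("layouts cost: %ss", timer() - start)
```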
```diff
@@ -52,8 +53,8 @@ class Pdf(PdfParser):

         # clean mess
         if column_width < self.page_images[0].size[0] / zoomin / 2:
-            print("two_column...................", column_width,
-                  self.page_images[0].size[0] / zoomin / 2)
+            logger.info("two_column................... {} {}".format(column_width,
+                  self.page_images[0].size[0] / zoomin / 2))
         self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
         for b in self.boxes:
             b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
```
```diff
@@ -114,8 +115,8 @@ class Pdf(PdfParser):
                          from_page, min(
                              to_page, self.total_page)))
         for b in self.boxes:
-            print(b["text"], b.get("layoutno"))
-        print(tbls)
+            logger.info("{} {}".format(b["text"], b.get("layoutno")))
+        logger.info("{}".format(tbls))

         return {
             "title": title,
```
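The per-box line above fires once per text box, so a large PDF can emit thousands of INFO records. If that volume ever matters, a common pattern is to demote such dumps to DEBUG and skip the loop entirely when DEBUG is off — a sketch of that alternative, not what this PR does:

```python
import logging

logger = logging.getLogger("ragflow")

# Stand-in data; the real boxes come from the PDF layout parser.
boxes = [{"text": "Abstract", "layoutno": 0},
         {"text": "1 Introduction", "layoutno": 1}]

if logger.isEnabledFor(logging.DEBUG):  # skips the loop entirely at INFO
    for b in boxes:
        logger.debug("%s %s", b["text"], b.get("layoutno"))
```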
```diff
@@ -156,7 +157,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
     # is it English
     eng = lang.lower() == "english"  # pdf_parser.is_english
-    print("It's English.....", eng)
+    logger.info("It's English.....{}".format(eng))

     res = tokenize_table(paper["tables"], doc, eng)

```
```diff
@@ -183,7 +184,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
             sid += 1
         sec_ids.append(sid)
-        print(lvl, sorted_sections[i][0], most_level, sid)
+        logger.info("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))

     chunks = []
     last_sid = -2
```