refine pdf parser, add time zone to userinfo (#112)

This commit is contained in:
KevinHuSh
2024-03-08 11:24:24 +08:00
committed by GitHub
parent 63e498ac79
commit 8f86ab9f7f
7 changed files with 28 additions and 23 deletions

View File

@ -354,6 +354,7 @@ class User(DataBaseModel, UserMixin):
avatar = TextField(null=True, help_text="avatar base64 string") avatar = TextField(null=True, help_text="avatar base64 string")
language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese") language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese")
color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark") color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark")
timezone = CharField(max_length=64, null=True, help_text="Timezone", default="UTC+8\tAsia/Shanghai")
last_login_time = DateTimeField(null=True) last_login_time = DateTimeField(null=True)
is_authenticated = CharField(max_length=1, null=False, default="1") is_authenticated = CharField(max_length=1, null=False, default="1")
is_active = CharField(max_length=1, null=False, default="1") is_active = CharField(max_length=1, null=False, default="1")

View File

@ -313,9 +313,19 @@ class HuParser:
while i < len(bxs) - 1: while i < len(bxs) - 1:
b = bxs[i] b = bxs[i]
b_ = bxs[i + 1] b_ = bxs[i + 1]
if b.get("layoutno", "0") != b_.get("layoutno", "1"): if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]:
i += 1 i += 1
continue continue
if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
# merge
bxs[i]["x1"] = b_["x1"]
bxs[i]["top"] = (b["top"] + b_["top"]) / 2
bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
bxs[i]["text"] += b_["text"]
bxs.pop(i + 1)
continue
i += 1
continue
dis_thr = 1 dis_thr = 1
dis = b["x1"] - b_["x0"] dis = b["x1"] - b_["x0"]
@ -642,9 +652,9 @@ class HuParser:
tk, tv = nearest(tables) tk, tv = nearest(tables)
fk, fv = nearest(figures) fk, fv = nearest(figures)
if min(tv, fv) > 2000: #if min(tv, fv) > 2000:
i += 1 # i += 1
continue # continue
if tv < fv: if tv < fv:
tables[tk].insert(0, c) tables[tk].insert(0, c)
logging.debug( logging.debug(
@ -711,12 +721,7 @@ class HuParser:
# crop figure out and add caption # crop figure out and add caption
for k, bxs in figures.items(): for k, bxs in figures.items():
txt = "\n".join( txt = "\n".join([b["text"] for b in bxs])
[b["text"] for b in bxs
if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
and len(b["text"].strip()) >= 4
]
)
if not txt: if not txt:
continue continue

View File

@ -96,7 +96,7 @@ class LayoutRecognizer(Recognizer):
continue continue
bxs[i]["layoutno"] = f"{ty}-{ii}" bxs[i]["layoutno"] = f"{ty}-{ii}"
bxs[i]["layout_type"] = lts_[ii]["type"] bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"]!="equation" else "figure"
i += 1 i += 1
for lt in ["footer", "header", "reference", "figure caption", for lt in ["footer", "header", "reference", "figure caption",
@ -105,7 +105,7 @@ class LayoutRecognizer(Recognizer):
# add box to figure layouts which has not text box # add box to figure layouts which has not text box
for i, lt in enumerate( for i, lt in enumerate(
[lt for lt in lts if lt["type"] == "figure"]): [lt for lt in lts if lt["type"] in ["figure","equation"]]):
if lt.get("visited"): if lt.get("visited"):
continue continue
lt = deepcopy(lt) lt = deepcopy(lt)

View File

@ -21,7 +21,6 @@ from .operators import *
import numpy as np import numpy as np
import onnxruntime as ort import onnxruntime as ort
from api.utils.file_utils import get_project_base_directory
from .postprocess import build_post_process from .postprocess import build_post_process
from rag.settings import cron_logger from rag.settings import cron_logger

View File

@ -276,18 +276,18 @@ class Recognizer(object):
def find_overlapped_with_threashold(box, boxes, thr=0.3): def find_overlapped_with_threashold(box, boxes, thr=0.3):
if not boxes: if not boxes:
return return
max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0 max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0
s, e = 0, len(boxes) s, e = 0, len(boxes)
for i in range(s, e): for i in range(s, e):
ov = Recognizer.overlapped_area(box, boxes[i]) ov = Recognizer.overlapped_area(box, boxes[i])
_ov = Recognizer.overlapped_area(boxes[i], box) _ov = Recognizer.overlapped_area(boxes[i], box)
if (ov, _ov) < (max_overlaped, _max_overlaped): if (ov, _ov) < (max_overlapped, _max_overlapped):
continue continue
max_overlaped_i = i max_overlapped_i = i
max_overlaped = ov max_overlapped = ov
_max_overlaped = _ov _max_overlapped = _ov
return max_overlaped_i return max_overlapped_i
def preprocess(self, image_list): def preprocess(self, image_list):
inputs = [] inputs = []

View File

@ -101,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
d = copy.deepcopy(doc) d = copy.deepcopy(doc)
if pdf_parser: if pdf_parser:
d["image"], poss = pdf_parser.crop(ck, need_position=True) d["image"], poss = pdf_parser.crop(ck, need_position=True)
add_positions(d, poss) add_positions(d, poss, from_page)
ck = pdf_parser.remove_tag(ck) ck = pdf_parser.remove_tag(ck)
tokenize(d, ck, eng) tokenize(d, ck, eng)
res.append(d) res.append(d)
@ -112,7 +112,7 @@ if __name__ == "__main__":
import sys import sys
def dummy(a, b): def dummy(prog=None, msg=""):
pass pass

View File

@ -82,8 +82,8 @@ class Dealer:
) )
else: else:
s = s.sort( s = s.sort(
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}}, {"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}}, {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
{"create_time": {"order": "desc", "unmapped_type": "date"}}, {"create_time": {"order": "desc", "unmapped_type": "date"}},
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}} {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
) )