refine pdf parser, add time zone to userinfo (#112)

This commit is contained in:
KevinHuSh
2024-03-08 11:24:24 +08:00
committed by GitHub
parent 63e498ac79
commit 8f86ab9f7f
7 changed files with 28 additions and 23 deletions

View File

@@ -96,7 +96,7 @@ class LayoutRecognizer(Recognizer):
continue
bxs[i]["layoutno"] = f"{ty}-{ii}"
bxs[i]["layout_type"] = lts_[ii]["type"]
bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"]!="equation" else "figure"
i += 1
for lt in ["footer", "header", "reference", "figure caption",
@@ -105,7 +105,7 @@ class LayoutRecognizer(Recognizer):
# add box to figure layouts which has not text box
for i, lt in enumerate(
[lt for lt in lts if lt["type"] == "figure"]):
[lt for lt in lts if lt["type"] in ["figure","equation"]]):
if lt.get("visited"):
continue
lt = deepcopy(lt)

View File

@@ -21,7 +21,6 @@ from .operators import *
import numpy as np
import onnxruntime as ort
from api.utils.file_utils import get_project_base_directory
from .postprocess import build_post_process
from rag.settings import cron_logger

View File

@@ -276,18 +276,18 @@ class Recognizer(object):
def find_overlapped_with_threashold(box, boxes, thr=0.3):
if not boxes:
return
max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0
max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0
s, e = 0, len(boxes)
for i in range(s, e):
ov = Recognizer.overlapped_area(box, boxes[i])
_ov = Recognizer.overlapped_area(boxes[i], box)
if (ov, _ov) < (max_overlaped, _max_overlaped):
if (ov, _ov) < (max_overlapped, _max_overlapped):
continue
max_overlaped_i = i
max_overlaped = ov
_max_overlaped = _ov
max_overlapped_i = i
max_overlapped = ov
_max_overlapped = _ov
return max_overlaped_i
return max_overlapped_i
def preprocess(self, image_list):
inputs = []