mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
refine pdf parser, add time zone to userinfo (#112)
This commit is contained in:
@ -354,6 +354,7 @@ class User(DataBaseModel, UserMixin):
|
|||||||
avatar = TextField(null=True, help_text="avatar base64 string")
|
avatar = TextField(null=True, help_text="avatar base64 string")
|
||||||
language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese")
|
language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese")
|
||||||
color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark")
|
color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark")
|
||||||
|
timezone = CharField(max_length=64, null=True, help_text="Timezone", default="UTC+8\tAsia/Shanghai")
|
||||||
last_login_time = DateTimeField(null=True)
|
last_login_time = DateTimeField(null=True)
|
||||||
is_authenticated = CharField(max_length=1, null=False, default="1")
|
is_authenticated = CharField(max_length=1, null=False, default="1")
|
||||||
is_active = CharField(max_length=1, null=False, default="1")
|
is_active = CharField(max_length=1, null=False, default="1")
|
||||||
|
|||||||
@ -313,9 +313,19 @@ class HuParser:
|
|||||||
while i < len(bxs) - 1:
|
while i < len(bxs) - 1:
|
||||||
b = bxs[i]
|
b = bxs[i]
|
||||||
b_ = bxs[i + 1]
|
b_ = bxs[i + 1]
|
||||||
if b.get("layoutno", "0") != b_.get("layoutno", "1"):
|
if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]:
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
|
if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
|
||||||
|
# merge
|
||||||
|
bxs[i]["x1"] = b_["x1"]
|
||||||
|
bxs[i]["top"] = (b["top"] + b_["top"]) / 2
|
||||||
|
bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
|
||||||
|
bxs[i]["text"] += b_["text"]
|
||||||
|
bxs.pop(i + 1)
|
||||||
|
continue
|
||||||
|
i += 1
|
||||||
|
continue
|
||||||
|
|
||||||
dis_thr = 1
|
dis_thr = 1
|
||||||
dis = b["x1"] - b_["x0"]
|
dis = b["x1"] - b_["x0"]
|
||||||
@ -642,9 +652,9 @@ class HuParser:
|
|||||||
|
|
||||||
tk, tv = nearest(tables)
|
tk, tv = nearest(tables)
|
||||||
fk, fv = nearest(figures)
|
fk, fv = nearest(figures)
|
||||||
if min(tv, fv) > 2000:
|
#if min(tv, fv) > 2000:
|
||||||
i += 1
|
# i += 1
|
||||||
continue
|
# continue
|
||||||
if tv < fv:
|
if tv < fv:
|
||||||
tables[tk].insert(0, c)
|
tables[tk].insert(0, c)
|
||||||
logging.debug(
|
logging.debug(
|
||||||
@ -711,12 +721,7 @@ class HuParser:
|
|||||||
|
|
||||||
# crop figure out and add caption
|
# crop figure out and add caption
|
||||||
for k, bxs in figures.items():
|
for k, bxs in figures.items():
|
||||||
txt = "\n".join(
|
txt = "\n".join([b["text"] for b in bxs])
|
||||||
[b["text"] for b in bxs
|
|
||||||
if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
|
|
||||||
and len(b["text"].strip()) >= 4
|
|
||||||
]
|
|
||||||
)
|
|
||||||
if not txt:
|
if not txt:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|||||||
@ -96,7 +96,7 @@ class LayoutRecognizer(Recognizer):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
bxs[i]["layoutno"] = f"{ty}-{ii}"
|
bxs[i]["layoutno"] = f"{ty}-{ii}"
|
||||||
bxs[i]["layout_type"] = lts_[ii]["type"]
|
bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"]!="equation" else "figure"
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
for lt in ["footer", "header", "reference", "figure caption",
|
for lt in ["footer", "header", "reference", "figure caption",
|
||||||
@ -105,7 +105,7 @@ class LayoutRecognizer(Recognizer):
|
|||||||
|
|
||||||
# add a box for figure layouts that have no text box
|
# add a box for figure layouts that have no text box
|
||||||
for i, lt in enumerate(
|
for i, lt in enumerate(
|
||||||
[lt for lt in lts if lt["type"] == "figure"]):
|
[lt for lt in lts if lt["type"] in ["figure","equation"]]):
|
||||||
if lt.get("visited"):
|
if lt.get("visited"):
|
||||||
continue
|
continue
|
||||||
lt = deepcopy(lt)
|
lt = deepcopy(lt)
|
||||||
|
|||||||
@ -21,7 +21,6 @@ from .operators import *
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import onnxruntime as ort
|
import onnxruntime as ort
|
||||||
|
|
||||||
from api.utils.file_utils import get_project_base_directory
|
|
||||||
from .postprocess import build_post_process
|
from .postprocess import build_post_process
|
||||||
from rag.settings import cron_logger
|
from rag.settings import cron_logger
|
||||||
|
|
||||||
|
|||||||
@ -276,18 +276,18 @@ class Recognizer(object):
|
|||||||
def find_overlapped_with_threashold(box, boxes, thr=0.3):
|
def find_overlapped_with_threashold(box, boxes, thr=0.3):
|
||||||
if not boxes:
|
if not boxes:
|
||||||
return
|
return
|
||||||
max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0
|
max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0
|
||||||
s, e = 0, len(boxes)
|
s, e = 0, len(boxes)
|
||||||
for i in range(s, e):
|
for i in range(s, e):
|
||||||
ov = Recognizer.overlapped_area(box, boxes[i])
|
ov = Recognizer.overlapped_area(box, boxes[i])
|
||||||
_ov = Recognizer.overlapped_area(boxes[i], box)
|
_ov = Recognizer.overlapped_area(boxes[i], box)
|
||||||
if (ov, _ov) < (max_overlaped, _max_overlaped):
|
if (ov, _ov) < (max_overlapped, _max_overlapped):
|
||||||
continue
|
continue
|
||||||
max_overlaped_i = i
|
max_overlapped_i = i
|
||||||
max_overlaped = ov
|
max_overlapped = ov
|
||||||
_max_overlaped = _ov
|
_max_overlapped = _ov
|
||||||
|
|
||||||
return max_overlaped_i
|
return max_overlapped_i
|
||||||
|
|
||||||
def preprocess(self, image_list):
|
def preprocess(self, image_list):
|
||||||
inputs = []
|
inputs = []
|
||||||
|
|||||||
@ -101,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|||||||
d = copy.deepcopy(doc)
|
d = copy.deepcopy(doc)
|
||||||
if pdf_parser:
|
if pdf_parser:
|
||||||
d["image"], poss = pdf_parser.crop(ck, need_position=True)
|
d["image"], poss = pdf_parser.crop(ck, need_position=True)
|
||||||
add_positions(d, poss)
|
add_positions(d, poss, from_page)
|
||||||
ck = pdf_parser.remove_tag(ck)
|
ck = pdf_parser.remove_tag(ck)
|
||||||
tokenize(d, ck, eng)
|
tokenize(d, ck, eng)
|
||||||
res.append(d)
|
res.append(d)
|
||||||
@ -112,7 +112,7 @@ if __name__ == "__main__":
|
|||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
def dummy(a, b):
|
def dummy(prog=None, msg=""):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -82,8 +82,8 @@ class Dealer:
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
s = s.sort(
|
s = s.sort(
|
||||||
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}},
|
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
|
||||||
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
|
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
|
||||||
{"create_time": {"order": "desc", "unmapped_type": "date"}},
|
{"create_time": {"order": "desc", "unmapped_type": "date"}},
|
||||||
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
|
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user