refine pdf parser, add time zone to userinfo (#112)

This commit is contained in:
KevinHuSh
2024-03-08 11:24:24 +08:00
committed by GitHub
parent 63e498ac79
commit 8f86ab9f7f
7 changed files with 28 additions and 23 deletions

View File

@@ -101,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
d = copy.deepcopy(doc)
if pdf_parser:
d["image"], poss = pdf_parser.crop(ck, need_position=True)
add_positions(d, poss)
add_positions(d, poss, from_page)
ck = pdf_parser.remove_tag(ck)
tokenize(d, ck, eng)
res.append(d)
@@ -112,7 +112,7 @@ if __name__ == "__main__":
import sys
def dummy(a, b):
def dummy(prog=None, msg=""):
pass

View File

@@ -82,8 +82,8 @@ class Dealer:
)
else:
s = s.sort(
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}},
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
{"create_time": {"order": "desc", "unmapped_type": "date"}},
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
)