mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
refine pdf parser, add time zone to userinfo (#112)
This commit is contained in:
@ -101,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
d = copy.deepcopy(doc)
|
||||
if pdf_parser:
|
||||
d["image"], poss = pdf_parser.crop(ck, need_position=True)
|
||||
add_positions(d, poss)
|
||||
add_positions(d, poss, from_page)
|
||||
ck = pdf_parser.remove_tag(ck)
|
||||
tokenize(d, ck, eng)
|
||||
res.append(d)
|
||||
@ -112,7 +112,7 @@ if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
|
||||
def dummy(a, b):
|
||||
def dummy(prog=None, msg=""):
|
||||
pass
|
||||
|
||||
|
||||
|
||||
@ -82,8 +82,8 @@ class Dealer:
|
||||
)
|
||||
else:
|
||||
s = s.sort(
|
||||
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}},
|
||||
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
|
||||
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
|
||||
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
|
||||
{"create_time": {"order": "desc", "unmapped_type": "date"}},
|
||||
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user