add use layout or not option (#145)

* add use layout or not option

* trival
This commit is contained in:
KevinHuSh
2024-03-22 19:21:09 +08:00
committed by GitHub
parent 2f4c71b4b4
commit f6aee7f230
18 changed files with 238 additions and 140 deletions

View File

@ -76,6 +76,25 @@ def tokenize(d, t, eng):
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
def tokenize_chunks(chunks, doc, eng, pdf_parser):
res = []
# wrap up as es documents
for ck in chunks:
if len(ck.strip()) == 0:continue
print("--", ck)
d = copy.deepcopy(doc)
if pdf_parser:
try:
d["image"], poss = pdf_parser.crop(ck, need_position=True)
add_positions(d, poss)
ck = pdf_parser.remove_tag(ck)
except NotImplementedError as e:
pass
tokenize(d, ck, eng)
res.append(d)
return res
def tokenize_table(tbls, doc, eng, batch_size=10):
res = []
# add tables