fix position extraction bug (#93)

* fix position extraction bug

* remove delimiter for naive parser
This commit is contained in:
KevinHuSh
2024-03-04 17:08:35 +08:00
committed by GitHub
parent fae00827e6
commit 7bfaf0df29
11 changed files with 34 additions and 22 deletions

View File

@ -41,7 +41,7 @@ class Pdf(PdfParser):
self._filter_forpages()
self._merge_with_same_bullet()
callback(0.75, "Text merging finished.")
tbls = self._extract_table_figure(True, zoomin, False, True)
tbls = self._extract_table_figure(True, zoomin, True, True)
callback(0.8, "Text extraction finished")

View File

@ -33,7 +33,7 @@ class Pdf(PdfParser):
self._concat_downward(concat_between_pages=False)
self._filter_forpages()
callback(0.77, "Text merging finished")
tbls = self._extract_table_figure(True, zoomin, False, True)
tbls = self._extract_table_figure(True, zoomin, True, True)
# clean mess
for b in self.boxes:

View File

@ -40,7 +40,7 @@ class Pdf(PdfParser):
self._concat_downward(concat_between_pages=False)
self._filter_forpages()
callback(0.77, "Text merging finished")
tbls = self._extract_table_figure(True, zoomin, False, True)
tbls = self._extract_table_figure(True, zoomin, True, True)
cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
#self._naive_vertical_merge()

View File

@ -48,7 +48,7 @@ class Pdf(PdfParser):
self._concat_downward(concat_between_pages=False)
self._filter_forpages()
callback(0.75, "Text merging finished.")
tbls = self._extract_table_figure(True, zoomin, False, True)
tbls = self._extract_table_figure(True, zoomin, True, True)
# clean mess
if column_width < self.page_images[0].size[0] / zoomin / 2:

View File

@ -246,6 +246,8 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。"):
tk_nums[-1] += tnum
for sec, pos in sections:
add_chunk(sec, pos)
continue
s, e = 0, 1
while e < len(sec):
if sec[e] in delimiter:

View File

@ -83,7 +83,7 @@ class Dealer:
else:
s = s.sort(
{"page_num_int": {"order": "asc", "unmapped_type": "float"}},
{"top_int": {"order": "asc", "unmapped_type": "float"}},
{"top_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}},
{"create_time": {"order": "desc", "unmapped_type": "date"}},
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
)

View File

@ -83,10 +83,10 @@ def dispatch():
pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
for s,e in r["parser_config"].get("pages", [(0,100000)]):
e = min(e, pages)
for p in range(s, e, 10):
for p in range(s, e, 5):
task = new_task()
task["from_page"] = p
task["to_page"] = min(p + 10, e)
task["to_page"] = min(p + 5, e)
tsks.append(task)
else:
tsks.append(new_task())