mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
fix position extraction bug (#93)
* fix position extraction bug
* remove delimiter for naive parser
This commit is contained in:
@ -41,7 +41,7 @@ class Pdf(PdfParser):
|
||||
self._filter_forpages()
|
||||
self._merge_with_same_bullet()
|
||||
callback(0.75, "Text merging finished.")
|
||||
tbls = self._extract_table_figure(True, zoomin, False, True)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
callback(0.8, "Text extraction finished")
|
||||
|
||||
|
||||
@ -33,7 +33,7 @@ class Pdf(PdfParser):
|
||||
self._concat_downward(concat_between_pages=False)
|
||||
self._filter_forpages()
|
||||
callback(0.77, "Text merging finished")
|
||||
tbls = self._extract_table_figure(True, zoomin, False, True)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
# clean mess
|
||||
for b in self.boxes:
|
||||
|
||||
@ -40,7 +40,7 @@ class Pdf(PdfParser):
|
||||
self._concat_downward(concat_between_pages=False)
|
||||
self._filter_forpages()
|
||||
callback(0.77, "Text merging finished")
|
||||
tbls = self._extract_table_figure(True, zoomin, False, True)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
|
||||
#self._naive_vertical_merge()
|
||||
|
||||
@ -48,7 +48,7 @@ class Pdf(PdfParser):
|
||||
self._concat_downward(concat_between_pages=False)
|
||||
self._filter_forpages()
|
||||
callback(0.75, "Text merging finished.")
|
||||
tbls = self._extract_table_figure(True, zoomin, False, True)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
# clean mess
|
||||
if column_width < self.page_images[0].size[0] / zoomin / 2:
|
||||
|
||||
@ -246,6 +246,8 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
||||
tk_nums[-1] += tnum
|
||||
|
||||
for sec, pos in sections:
|
||||
add_chunk(sec, pos)
|
||||
continue
|
||||
s, e = 0, 1
|
||||
while e < len(sec):
|
||||
if sec[e] in delimiter:
|
||||
|
||||
@ -83,7 +83,7 @@ class Dealer:
|
||||
else:
|
||||
s = s.sort(
|
||||
{"page_num_int": {"order": "asc", "unmapped_type": "float"}},
|
||||
{"top_int": {"order": "asc", "unmapped_type": "float"}},
|
||||
{"top_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}},
|
||||
{"create_time": {"order": "desc", "unmapped_type": "date"}},
|
||||
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
|
||||
)
|
||||
|
||||
@ -83,10 +83,10 @@ def dispatch():
|
||||
pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
|
||||
for s,e in r["parser_config"].get("pages", [(0,100000)]):
|
||||
e = min(e, pages)
|
||||
for p in range(s, e, 10):
|
||||
for p in range(s, e, 5):
|
||||
task = new_task()
|
||||
task["from_page"] = p
|
||||
task["to_page"] = min(p + 10, e)
|
||||
task["to_page"] = min(p + 5, e)
|
||||
tsks.append(task)
|
||||
else:
|
||||
tsks.append(new_task())
|
||||
|
||||
Reference in New Issue
Block a user