mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
fix position extraction bug (#93)
* fix position extraction bug * remove delimiter for naive parser
This commit is contained in:
@ -41,7 +41,7 @@ class Pdf(PdfParser):
|
||||
self._filter_forpages()
|
||||
self._merge_with_same_bullet()
|
||||
callback(0.75, "Text merging finished.")
|
||||
tbls = self._extract_table_figure(True, zoomin, False, True)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
callback(0.8, "Text extraction finished")
|
||||
|
||||
|
||||
@ -33,7 +33,7 @@ class Pdf(PdfParser):
|
||||
self._concat_downward(concat_between_pages=False)
|
||||
self._filter_forpages()
|
||||
callback(0.77, "Text merging finished")
|
||||
tbls = self._extract_table_figure(True, zoomin, False, True)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
# clean mess
|
||||
for b in self.boxes:
|
||||
|
||||
@ -40,7 +40,7 @@ class Pdf(PdfParser):
|
||||
self._concat_downward(concat_between_pages=False)
|
||||
self._filter_forpages()
|
||||
callback(0.77, "Text merging finished")
|
||||
tbls = self._extract_table_figure(True, zoomin, False, True)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
|
||||
#self._naive_vertical_merge()
|
||||
|
||||
@ -48,7 +48,7 @@ class Pdf(PdfParser):
|
||||
self._concat_downward(concat_between_pages=False)
|
||||
self._filter_forpages()
|
||||
callback(0.75, "Text merging finished.")
|
||||
tbls = self._extract_table_figure(True, zoomin, False, True)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
# clean mess
|
||||
if column_width < self.page_images[0].size[0] / zoomin / 2:
|
||||
|
||||
Reference in New Issue
Block a user