mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: add support for multi-column PDF parsing (#10475)
### What problem does this PR solve? Add support for multi-columns PDF parsing. #9878, #9919. Two-column sample: <img width="1885" height="1020" alt="image" src="https://github.com/user-attachments/assets/0270c028-2db8-4ca6-a4b7-cd5830882d28" /> Three-column sample: <img width="1881" height="992" alt="image" src="https://github.com/user-attachments/assets/9ee88844-d5b1-4927-9e4e-3bd810d6e03a" /> Single-column sample: <img width="1883" height="1042" alt="image" src="https://github.com/user-attachments/assets/e93d3d18-43c3-4067-b5fa-e454ed0ab093" /> ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -328,7 +328,7 @@ class Pdf(PdfParser):
|
||||
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
|
||||
|
||||
start = timer()
|
||||
self._text_merge()
|
||||
self._text_merge(zoomin=zoomin)
|
||||
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
|
||||
|
||||
if separate_tables_figures:
|
||||
@ -340,6 +340,7 @@ class Pdf(PdfParser):
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
self._naive_vertical_merge()
|
||||
self._concat_downward()
|
||||
self._final_reading_order_merge()
|
||||
# self._filter_forpages()
|
||||
logging.info("layouts cost: {}s".format(timer() - first_start))
|
||||
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
|
||||
|
||||
Reference in New Issue
Block a user