refine page ranges (#147)

This commit is contained in:
KevinHuSh
2024-03-25 13:11:57 +08:00
committed by GitHub
parent 1d9a50b090
commit 71fe314955
13 changed files with 169 additions and 72 deletions

View File

@@ -1074,15 +1074,15 @@ class HuParser:
class PlainParser(object):
def __call__(self, filename, **kwargs):
def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
self.outlines = []
lines = []
try:
self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
outlines = self.pdf.outline
for page in self.pdf.pages:
for page in self.pdf.pages[from_page:to_page]:
lines.extend([t for t in page.extract_text().split("\n")])
outlines = self.pdf.outline
def dfs(arr, depth):
for a in arr:
if isinstance(a, dict):