Fix: relative page_number in boxes (#11712)

The `page_number` stored in each box is a relative page number, so `from_page` must be added to it to get the global page number.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
rommy2017
2025-12-04 11:23:34 +08:00
committed by GitHub
parent cbdacf21f6
commit 257af75ece

View File

@@ -86,9 +86,11 @@ class Pdf(PdfParser):
# (A) Add text
for b in self.boxes:
if not (from_page < b["page_number"] <= to_page + from_page):
# b["page_number"] is a relative page number, must + from_page
global_page_num = b["page_number"] + from_page
if not (from_page < global_page_num <= to_page + from_page):
continue
page_items[b["page_number"]].append({
page_items[global_page_num].append({
"top": b["top"],
"x0": b["x0"],
"text": b["text"],
@@ -100,7 +102,6 @@ class Pdf(PdfParser):
if not positions:
continue
# Handle content type (list vs str)
if isinstance(content, list):
final_text = "\n".join(content)
elif isinstance(content, str):
@@ -109,10 +110,11 @@ class Pdf(PdfParser):
final_text = str(content)
try:
# Parse positions
pn_index = positions[0][0]
if isinstance(pn_index, list):
pn_index = pn_index[0]
# pn_index in tbls is absolute page number
current_page_num = int(pn_index) + 1
except Exception as e:
print(f"Error parsing position: {e}")