mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Fix: relative page_number in boxes (#11712)
The page_number stored in each box is a relative page number, so from_page must be added to it to obtain the absolute page number. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -86,9 +86,11 @@ class Pdf(PdfParser):
|
|||||||
|
|
||||||
# (A) Add text
|
# (A) Add text
|
||||||
for b in self.boxes:
|
for b in self.boxes:
|
||||||
if not (from_page < b["page_number"] <= to_page + from_page):
|
# b["page_number"] is a relative page number; add from_page to get the absolute page number
|
||||||
|
global_page_num = b["page_number"] + from_page
|
||||||
|
if not (from_page < global_page_num <= to_page + from_page):
|
||||||
continue
|
continue
|
||||||
page_items[b["page_number"]].append({
|
page_items[global_page_num].append({
|
||||||
"top": b["top"],
|
"top": b["top"],
|
||||||
"x0": b["x0"],
|
"x0": b["x0"],
|
||||||
"text": b["text"],
|
"text": b["text"],
|
||||||
@ -100,7 +102,6 @@ class Pdf(PdfParser):
|
|||||||
if not positions:
|
if not positions:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Handle content type (list vs str)
|
|
||||||
if isinstance(content, list):
|
if isinstance(content, list):
|
||||||
final_text = "\n".join(content)
|
final_text = "\n".join(content)
|
||||||
elif isinstance(content, str):
|
elif isinstance(content, str):
|
||||||
@ -109,10 +110,11 @@ class Pdf(PdfParser):
|
|||||||
final_text = str(content)
|
final_text = str(content)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Parse positions
|
|
||||||
pn_index = positions[0][0]
|
pn_index = positions[0][0]
|
||||||
if isinstance(pn_index, list):
|
if isinstance(pn_index, list):
|
||||||
pn_index = pn_index[0]
|
pn_index = pn_index[0]
|
||||||
|
|
||||||
|
# pn_index in tbls is already an absolute page number
|
||||||
current_page_num = int(pn_index) + 1
|
current_page_num = int(pn_index) + 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error parsing position: {e}")
|
print(f"Error parsing position: {e}")
|
||||||
|
|||||||
Reference in New Issue
Block a user