From 257af75ece7d472adf57e9d1ac103c0c83fe645f Mon Sep 17 00:00:00 2001 From: rommy2017 Date: Thu, 4 Dec 2025 11:23:34 +0800 Subject: [PATCH] Fix: relative page_number in boxes (#11712) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit page_number in boxes is a relative page number, so from_page must be added to it to get the global page number. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/app/presentation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 1404ac19b..7ce801749 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -86,9 +86,11 @@ class Pdf(PdfParser): # (A) Add text for b in self.boxes: - if not (from_page < b["page_number"] <= to_page + from_page): + # b["page_number"] is relative page number,must + from_page + global_page_num = b["page_number"] + from_page + if not (from_page < global_page_num <= to_page + from_page): continue - page_items[b["page_number"]].append({ + page_items[global_page_num].append({ "top": b["top"], "x0": b["x0"], "text": b["text"], @@ -100,7 +102,6 @@ class Pdf(PdfParser): if not positions: continue - # Handle content type (list vs str) if isinstance(content, list): final_text = "\n".join(content) elif isinstance(content, str): @@ -109,10 +110,11 @@ class Pdf(PdfParser): final_text = str(content) try: - # Parse positions pn_index = positions[0][0] if isinstance(pn_index, list): pn_index = pn_index[0] + + # pn_index in tbls is absolute page number current_page_num = int(pn_index) + 1 except Exception as e: print(f"Error parsing position: {e}")