diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 1404ac19b..7ce801749 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -86,9 +86,11 @@ class Pdf(PdfParser): # (A) Add text for b in self.boxes: - if not (from_page < b["page_number"] <= to_page + from_page): + # b["page_number"] is a relative page number; add from_page to get the global page number + global_page_num = b["page_number"] + from_page + if not (from_page < global_page_num <= to_page + from_page): continue - page_items[b["page_number"]].append({ + page_items[global_page_num].append({ "top": b["top"], "x0": b["x0"], "text": b["text"], @@ -100,7 +102,6 @@ class Pdf(PdfParser): if not positions: continue - # Handle content type (list vs str) if isinstance(content, list): final_text = "\n".join(content) elif isinstance(content, str): @@ -109,10 +110,11 @@ class Pdf(PdfParser): final_text = str(content) try: - # Parse positions pn_index = positions[0][0] if isinstance(pn_index, list): pn_index = pn_index[0] + + # pn_index in tbls is an absolute page number current_page_num = int(pn_index) + 1 except Exception as e: print(f"Error parsing position: {e}")