From e59458c36bd8b1492ced5d1509b4843cb92e15dd Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Tue, 28 Oct 2025 09:40:37 +0800 Subject: [PATCH] Fix: parsing excel with chartsheet & Clamp begin to a minimum of 0 to prevent negative indexing (#10819) ### What problem does this PR solve? Fix: parsing excel with chartsheet #10815 Fix: Clamp begin to a minimum of 0 to prevent negative indexing #10804 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/excel_parser.py | 22 ++++++++++++++++++---- rag/app/table.py | 7 ++++++- rag/nlp/search.py | 4 +++- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index 4d0496a33..868fc5f41 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -123,7 +123,12 @@ class RAGFlowExcelParser: for sheetname in wb.sheetnames: ws = wb[sheetname] - rows = list(ws.rows) + try: + rows = list(ws.rows) + except Exception as e: + logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}") + continue + if not rows: continue @@ -170,7 +175,11 @@ class RAGFlowExcelParser: res = [] for sheetname in wb.sheetnames: ws = wb[sheetname] - rows = list(ws.rows) + try: + rows = list(ws.rows) + except Exception as e: + logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}") + continue if not rows: continue ti = list(rows[0]) @@ -193,9 +202,14 @@ class RAGFlowExcelParser: if fnm.split(".")[-1].lower().find("xls") >= 0: wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary)) total = 0 + for sheetname in wb.sheetnames: - ws = wb[sheetname] - total += len(list(ws.rows)) + try: + ws = wb[sheetname] + total += len(list(ws.rows)) + except Exception as e: + logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}") + continue return total if fnm.split(".")[-1].lower() in ["csv", "txt"]: diff --git a/rag/app/table.py b/rag/app/table.py index b0c3e5bc2..7a21a738a 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -15,6 +15,7 @@ # import copy +import logging import re from io import BytesIO from xpinyin import Pinyin @@ -44,7 +45,11 @@ class Excel(ExcelParser): rn = 0 for sheetname in wb.sheetnames: ws = wb[sheetname] - rows = list(ws.rows) + try: + rows = list(ws.rows) + except Exception as e: + logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}") + continue if not rows: continue headers, header_rows = self._parse_headers(ws, rows) diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 4256a638d..ecb22522f 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -395,7 +395,9 @@ class Dealer: tsim = sim vsim = sim # Already paginated in search function - begin = ((page % (RERANK_LIMIT//page_size)) - 1) * page_size + max_pages = RERANK_LIMIT // page_size + page_index = (page % max_pages) - 1 + begin = max(page_index * page_size, 0) sim = sim[begin : begin + page_size] sim_np = np.array(sim) idx = np.argsort(sim_np * -1)