diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py
index 4d0496a33..868fc5f41 100644
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@@ -123,7 +123,12 @@ class RAGFlowExcelParser:
 
         for sheetname in wb.sheetnames:
             ws = wb[sheetname]
-            rows = list(ws.rows)
+            try:
+                rows = list(ws.rows)
+            except Exception as e:
+                logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
+                continue
+
             if not rows:
                 continue
 
@@ -170,7 +175,11 @@ class RAGFlowExcelParser:
         res = []
         for sheetname in wb.sheetnames:
             ws = wb[sheetname]
-            rows = list(ws.rows)
+            try:
+                rows = list(ws.rows)
+            except Exception as e:
+                logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
+                continue
             if not rows:
                 continue
             ti = list(rows[0])
@@ -193,9 +202,14 @@ class RAGFlowExcelParser:
         if fnm.split(".")[-1].lower().find("xls") >= 0:
             wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
             total = 0
+
             for sheetname in wb.sheetnames:
-                ws = wb[sheetname]
-                total += len(list(ws.rows))
+                try:
+                    ws = wb[sheetname]
+                    total += len(list(ws.rows))
+                except Exception as e:
+                    logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
+                    continue
             return total
 
         if fnm.split(".")[-1].lower() in ["csv", "txt"]:
diff --git a/rag/app/table.py b/rag/app/table.py
index b0c3e5bc2..7a21a738a 100644
--- a/rag/app/table.py
+++ b/rag/app/table.py
@@ -15,6 +15,7 @@
 #
 import copy
+import logging
 import re
 from io import BytesIO
 
 from xpinyin import Pinyin
@@ -44,7 +45,11 @@ class Excel(ExcelParser):
         rn = 0
         for sheetname in wb.sheetnames:
             ws = wb[sheetname]
-            rows = list(ws.rows)
+            try:
+                rows = list(ws.rows)
+            except Exception as e:
+                logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
+                continue
             if not rows:
                 continue
             headers, header_rows = self._parse_headers(ws, rows)
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index 4256a638d..ecb22522f 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -395,7 +395,9 @@ class Dealer:
                 tsim = sim
                 vsim = sim
             # Already paginated in search function
-            begin = ((page % (RERANK_LIMIT//page_size)) - 1) * page_size
+            max_pages = RERANK_LIMIT // page_size
+            page_index = (page % max_pages) - 1
+            begin = max(page_index * page_size, 0)
             sim = sim[begin : begin + page_size]
             sim_np = np.array(sim)
             idx = np.argsort(sim_np * -1)
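
Note on the rag/nlp/search.py hunk: the old formula can produce a negative
`begin` whenever `page` is a multiple of `RERANK_LIMIT // page_size`, which
turns the subsequent slice into an empty or wrong window. A minimal sketch of
the before/after behavior; the value RERANK_LIMIT = 64 and page_size = 10 are
illustrative assumptions, not taken from this diff:

    RERANK_LIMIT = 64   # assumed value, for illustration only
    page_size = 10      # assumed value, for illustration only
    max_pages = RERANK_LIMIT // page_size  # 6 rerank-able pages

    for page in range(1, 8):
        old_begin = ((page % max_pages) - 1) * page_size          # pre-patch formula
        new_begin = max(((page % max_pages) - 1) * page_size, 0)  # patched formula
        print(page, old_begin, new_begin)
        # page 6: old_begin == -10, so sim[-10:0] is empty; new_begin == 0

The patch clamps the start index at 0 instead of letting the modulo wrap
produce a negative offset.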