diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index a8087ef81..2fe342019 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -156,6 +156,55 @@ class RAGFlowExcelParser: continue return raw_items + @staticmethod + def _get_actual_row_count(ws): + max_row = ws.max_row + if not max_row: + return 0 + if max_row <= 10000: + return max_row + + max_col = min(ws.max_column or 1, 50) + + def row_has_data(row_idx): + for col_idx in range(1, max_col + 1): + cell = ws.cell(row=row_idx, column=col_idx) + if cell.value is not None and str(cell.value).strip(): + return True + return False + + if not any(row_has_data(i) for i in range(1, min(101, max_row + 1))): + return 0 + + left, right = 1, max_row + last_data_row = 1 + + while left <= right: + mid = (left + right) // 2 + found = False + for r in range(mid, min(mid + 10, max_row + 1)): + if row_has_data(r): + found = True + last_data_row = max(last_data_row, r) + break + if found: + left = mid + 1 + else: + right = mid - 1 + + for r in range(last_data_row, min(last_data_row + 500, max_row + 1)): + if row_has_data(r): + last_data_row = r + + return last_data_row + + @staticmethod + def _get_rows_limited(ws): + actual_rows = RAGFlowExcelParser._get_actual_row_count(ws) + if actual_rows == 0: + return [] + return list(ws.iter_rows(min_row=1, max_row=actual_rows)) + def html(self, fnm, chunk_rows=256): from html import escape @@ -171,7 +220,7 @@ class RAGFlowExcelParser: for sheetname in wb.sheetnames: ws = wb[sheetname] try: - rows = list(ws.rows) + rows = RAGFlowExcelParser._get_rows_limited(ws) except Exception as e: logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}") continue @@ -223,7 +272,7 @@ class RAGFlowExcelParser: for sheetname in wb.sheetnames: ws = wb[sheetname] try: - rows = list(ws.rows) + rows = RAGFlowExcelParser._get_rows_limited(ws) except Exception as e: logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}") continue @@ -238,6 +287,8 @@ class RAGFlowExcelParser: t = str(ti[i].value) if i < len(ti) else "" t += (":" if t else "") + str(c.value) fields.append(t) + if not fields: + continue line = "; ".join(fields) if sheetname.lower().find("sheet") < 0: line += " ——" + sheetname @@ -249,14 +300,14 @@ class RAGFlowExcelParser: if fnm.split(".")[-1].lower().find("xls") >= 0: wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary)) total = 0 - + for sheetname in wb.sheetnames: - try: - ws = wb[sheetname] - total += len(list(ws.rows)) - except Exception as e: - logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}") - continue + try: + ws = wb[sheetname] + total += RAGFlowExcelParser._get_actual_row_count(ws) + except Exception as e: + logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}") + continue return total if fnm.split(".")[-1].lower() in ["csv", "txt"]: diff --git a/rag/app/table.py b/rag/app/table.py index fad6b7038..106f06a4c 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -44,7 +44,7 @@ class Excel(ExcelParser): wb = Excel._load_excel_to_workbook(BytesIO(binary)) total = 0 for sheet_name in wb.sheetnames: - total += len(list(wb[sheet_name].rows)) + total += Excel._get_actual_row_count(wb[sheet_name]) res, fails, done = [], [], 0 rn = 0 flow_images = [] @@ -66,7 +66,7 @@ class Excel(ExcelParser): flow_images.append(img) try: - rows = list(ws.rows) + rows = Excel._get_rows_limited(ws) except Exception as e: logging.warning(f"Skip sheet '{sheet_name}' due to rows access error: {e}") continue