fix: optimize Excel row counting for files with abnormal max_row (#13018)

### What problem does this PR solve?

Some Excel files have abnormal `max_row` metadata (e.g., `max_row=1,048,534` with only 300 actual data rows). This causes:

- `row_number()` returns an incorrect count, creating 350+ tasks instead of 1
- `list(ws.rows)` iterates through millions of empty rows, causing the system to hang

This PR uses binary search to find the actual last row with data.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Performance Improvement

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-06 18:45:08 +08:00 · 2026-02-06 14:43:52 +08:00
parent 00c392e633
commit 5333e764fc
2 changed files with 62 additions and 11 deletions
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@ -156,6 +156,55 @@ class RAGFlowExcelParser:
                continue
        return raw_items

+    @staticmethod
+    def _get_actual_row_count(ws):
+        """Estimate the index of the last row of ``ws`` that holds data.
+
+        ``ws.max_row`` is returned unchanged when it is at most 10000. For
+        larger values the reported count is not trusted: the last non-empty
+        row is located with a windowed binary search followed by a short
+        linear scan.
+
+        Args:
+            ws: Worksheet-like object exposing ``max_row``, ``max_column`` and
+                ``cell(row=..., column=...)``.
+
+        Returns:
+            0 when ``max_row`` is falsy, or when ``max_row`` exceeds 10000 and
+            none of the first 100 rows has data; otherwise a 1-based row index.
+
+        NOTE(review): the search treats "this window has data" as if it were
+        monotonic in the row index. Data that sits beyond an empty gap wider
+        than the probe windows may be missed -- confirm this is acceptable
+        for the sheets callers pass in.
+        """
+        max_row = ws.max_row
+        # None or 0: there is nothing to count.
+        if not max_row:
+            return 0
+        # Small enough that the reported value is used without probing cells.
+        if max_row <= 10000:
+            return max_row
+
+        # Inspect at most the first 50 columns of each probed row.
+        max_col = min(ws.max_column or 1, 50)
+
+        def row_has_data(row_idx):
+            """Return True if any inspected cell of ``row_idx`` is non-blank."""
+            for col_idx in range(1, max_col + 1):
+                cell = ws.cell(row=row_idx, column=col_idx)
+                # A value counts only if its string form is not whitespace-only.
+                if cell.value is not None and str(cell.value).strip():
+                    return True
+            return False
+
+        # Treat the sheet as empty when none of the first 100 rows has data.
+        if not any(row_has_data(i) for i in range(1, min(101, max_row + 1))):
+            return 0
+
+        left, right = 1, max_row
+        last_data_row = 1
+
+        while left <= right:
+            mid = (left + right) // 2
+            found = False
+            # Probe up to 10 consecutive rows starting at mid, so that a few
+            # blank rows at the probe point do not read as "no data here".
+            for r in range(mid, min(mid + 10, max_row + 1)):
+                if row_has_data(r):
+                    found = True
+                    last_data_row = max(last_data_row, r)
+                    break
+            # Data in the window: look further down; otherwise look further up.
+            if found:
+                left = mid + 1
+            else:
+                right = mid - 1
+
+        # Linear pass over up to 500 rows from the candidate, keeping the last
+        # row that has data (the probe above stops at the first hit per window).
+        for r in range(last_data_row, min(last_data_row + 500, max_row + 1)):
+            if row_has_data(r):
+                last_data_row = r
+
+        return last_data_row
+
+    @staticmethod
+    def _get_rows_limited(ws):
+        """Return the rows of ``ws`` as a list, bounded by the estimated row count.
+
+        The upper bound comes from ``RAGFlowExcelParser._get_actual_row_count``
+        rather than from iterating every row the worksheet reports.
+
+        Args:
+            ws: Worksheet-like object that also exposes
+                ``iter_rows(min_row=..., max_row=...)``.
+
+        Returns:
+            An empty list when the estimated row count is 0; otherwise the
+            rows from row 1 through the estimated last data row.
+        """
+        actual_rows = RAGFlowExcelParser._get_actual_row_count(ws)
+        # A count of 0 means no data was found, so no rows are iterated.
+        if actual_rows == 0:
+            return []
+        return list(ws.iter_rows(min_row=1, max_row=actual_rows))
+
    def html(self, fnm, chunk_rows=256):
        from html import escape

@ -171,7 +220,7 @@ class RAGFlowExcelParser:
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            try:
-                rows = list(ws.rows)
+                rows = RAGFlowExcelParser._get_rows_limited(ws)
            except Exception as e:
                logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
                continue
@ -223,7 +272,7 @@ class RAGFlowExcelParser:
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            try:
-                rows = list(ws.rows)
+                rows = RAGFlowExcelParser._get_rows_limited(ws)
            except Exception as e:
                logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
                continue
@ -238,6 +287,8 @@ class RAGFlowExcelParser:
                    t = str(ti[i].value) if i < len(ti) else ""
                    t += ("：" if t else "") + str(c.value)
                    fields.append(t)
+                if not fields:
+                    continue
                line = "; ".join(fields)
                if sheetname.lower().find("sheet") < 0:
                    line += " ——" + sheetname
@ -251,12 +302,12 @@ class RAGFlowExcelParser:
            total = 0

            for sheetname in wb.sheetnames:
-               try:
-                   ws = wb[sheetname]
-                   total += len(list(ws.rows))
-               except Exception as e:
-                   logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
-                   continue
+                try:
+                    ws = wb[sheetname]
+                    total += RAGFlowExcelParser._get_actual_row_count(ws)
+                except Exception as e:
+                    logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
+                    continue
            return total

        if fnm.split(".")[-1].lower() in ["csv", "txt"]:
--- a/rag/app/table.py
+++ b/rag/app/table.py
@ -44,7 +44,7 @@ class Excel(ExcelParser):
            wb = Excel._load_excel_to_workbook(BytesIO(binary))
        total = 0
        for sheet_name in wb.sheetnames:
-            total += len(list(wb[sheet_name].rows))
+            total += Excel._get_actual_row_count(wb[sheet_name])
        res, fails, done = [], [], 0
        rn = 0
        flow_images = []
@ -66,7 +66,7 @@ class Excel(ExcelParser):
                            flow_images.append(img)

            try:
-                rows = list(ws.rows)
+                rows = Excel._get_rows_limited(ws)
            except Exception as e:
                logging.warning(f"Skip sheet '{sheet_name}' due to rows access error: {e}")
                continue