fix: optimize Excel row counting for files with abnormal max_row (#13018)

### What problem does this PR solve?

Some Excel files have abnormal `max_row` metadata (e.g.,
`max_row=1,048,534` with only 300 actual data rows). This causes:
- `row_number()` returns an incorrect count, creating 350+ tasks instead
of 1
- `list(ws.rows)` iterates through millions of empty rows, causing the
system to hang

This PR uses a binary search to find the actual last row with data when a sheet reports more than 10,000 rows; smaller sheets keep using `max_row` as-is.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Performance Improvement

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
yH
2026-02-06 14:43:52 +08:00
committed by GitHub
parent 00c392e633
commit 5333e764fc
2 changed files with 62 additions and 11 deletions

View File

@ -156,6 +156,55 @@ class RAGFlowExcelParser:
continue
return raw_items
@staticmethod
def _get_actual_row_count(
    ws,
    *,
    small_sheet_rows=10000,
    max_probe_cols=50,
    head_rows=100,
    window=10,
    tail_scan=500,
):
    """Estimate the index of the last row of ``ws`` that holds data.

    Some workbooks report a ``max_row`` far larger than the number of
    rows that actually contain values. For sheets whose reported
    ``max_row`` exceeds ``small_sheet_rows`` this probes cells with a
    binary search instead of trusting ``max_row``.

    Args:
        ws: Worksheet-like object exposing ``max_row``, ``max_column``
            and ``cell(row=..., column=...)`` whose result has ``.value``.
        small_sheet_rows: Sheets reporting at most this many rows are
            trusted and ``max_row`` is returned unchanged.
        max_probe_cols: Upper bound on the number of leading columns
            inspected per row (values below 1 are treated as 1).
        head_rows: Number of leading rows checked for any data before
            searching (values below 1 are treated as 1).
        window: Number of consecutive rows inspected at each binary
            search probe (values below 1 are treated as 1).
        tail_scan: Number of rows, starting at the binary search result,
            scanned linearly to extend the result.

    Returns:
        ``0`` when ``max_row`` is falsy or none of the first ``head_rows``
        rows holds data; ``max_row`` for small sheets; otherwise the
        estimated index of the last row with data.

    Note:
        This is a heuristic. A cell counts as data only when its value is
        not ``None`` and is non-blank after ``str(...).strip()``. Data
        that starts after the first ``head_rows`` rows yields ``0``, and
        a run of empty rows longer than ``window`` in the middle of the
        data may cause rows after that gap to be missed.
    """
    max_row = ws.max_row
    if not max_row:
        return 0
    if max_row <= small_sheet_rows:
        return max_row

    window = max(1, window)
    head_rows = max(1, head_rows)
    max_col = min(ws.max_column or 1, max(1, max_probe_cols))

    def row_has_data(row_idx):
        # A row has data when any probed cell is non-None and non-blank.
        for col_idx in range(1, max_col + 1):
            value = ws.cell(row=row_idx, column=col_idx).value
            if value is not None and str(value).strip():
                return True
        return False

    # Cheap guard: without data near the top, treat the sheet as empty.
    head_end = min(head_rows, max_row)
    if not any(row_has_data(i) for i in range(1, head_end + 1)):
        return 0

    # Binary search for the highest probe position whose window of rows
    # still contains data.
    left, right = 1, max_row
    last_data_row = 1
    while left <= right:
        mid = (left + right) // 2
        window_end = min(mid + window, max_row + 1)
        hit = next((r for r in range(mid, window_end) if row_has_data(r)), None)
        if hit is None:
            right = mid - 1
        else:
            last_data_row = max(last_data_row, hit)
            left = mid + 1

    # The search records the first non-empty row of the final window, so
    # walk forward a bounded distance to pick up rows just past it.
    scan_end = min(last_data_row + tail_scan, max_row + 1)
    for r in range(last_data_row, scan_end):
        if row_has_data(r):
            last_data_row = r
    return last_data_row
@staticmethod
def _get_rows_limited(ws):
    """Return the rows of ``ws`` up to its estimated last data row.

    The row limit comes from
    ``RAGFlowExcelParser._get_actual_row_count``; when that reports ``0``
    an empty list is returned, otherwise the rows yielded by
    ``ws.iter_rows(min_row=1, max_row=<limit>)`` are collected in a list.
    """
    last_row = RAGFlowExcelParser._get_actual_row_count(ws)
    collected = []
    if last_row != 0:
        collected.extend(ws.iter_rows(min_row=1, max_row=last_row))
    return collected
def html(self, fnm, chunk_rows=256):
from html import escape
@ -171,7 +220,7 @@ class RAGFlowExcelParser:
for sheetname in wb.sheetnames:
ws = wb[sheetname]
try:
rows = list(ws.rows)
rows = RAGFlowExcelParser._get_rows_limited(ws)
except Exception as e:
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
continue
@ -223,7 +272,7 @@ class RAGFlowExcelParser:
for sheetname in wb.sheetnames:
ws = wb[sheetname]
try:
rows = list(ws.rows)
rows = RAGFlowExcelParser._get_rows_limited(ws)
except Exception as e:
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
continue
@ -238,6 +287,8 @@ class RAGFlowExcelParser:
t = str(ti[i].value) if i < len(ti) else ""
t += ("" if t else "") + str(c.value)
fields.append(t)
if not fields:
continue
line = "; ".join(fields)
if sheetname.lower().find("sheet") < 0:
line += " ——" + sheetname
@ -251,12 +302,12 @@ class RAGFlowExcelParser:
total = 0
for sheetname in wb.sheetnames:
try:
ws = wb[sheetname]
total += len(list(ws.rows))
except Exception as e:
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
continue
try:
ws = wb[sheetname]
total += RAGFlowExcelParser._get_actual_row_count(ws)
except Exception as e:
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
continue
return total
if fnm.split(".")[-1].lower() in ["csv", "txt"]:

View File

@ -44,7 +44,7 @@ class Excel(ExcelParser):
wb = Excel._load_excel_to_workbook(BytesIO(binary))
total = 0
for sheet_name in wb.sheetnames:
total += len(list(wb[sheet_name].rows))
total += Excel._get_actual_row_count(wb[sheet_name])
res, fails, done = [], [], 0
rn = 0
flow_images = []
@ -66,7 +66,7 @@ class Excel(ExcelParser):
flow_images.append(img)
try:
rows = list(ws.rows)
rows = Excel._get_rows_limited(ws)
except Exception as e:
logging.warning(f"Skip sheet '{sheet_name}' due to rows access error: {e}")
continue