mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-06 18:45:08 +08:00
fix: optimize Excel row counting for files with abnormal max_row (#13018)
### What problem does this PR solve? Some Excel files have abnormal `max_row` metadata (e.g., `max_row=1,048,534` with only 300 actual data rows). This causes: - `row_number()` returns incorrect count, creating 350+ tasks instead of 1 - `list(ws.rows)` iterates through millions of empty rows, causing system hang This PR uses binary search to find the actual last row with data. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Performance Improvement Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@ -156,6 +156,55 @@ class RAGFlowExcelParser:
|
||||
continue
|
||||
return raw_items
|
||||
|
||||
@staticmethod
|
||||
def _get_actual_row_count(ws):
|
||||
max_row = ws.max_row
|
||||
if not max_row:
|
||||
return 0
|
||||
if max_row <= 10000:
|
||||
return max_row
|
||||
|
||||
max_col = min(ws.max_column or 1, 50)
|
||||
|
||||
def row_has_data(row_idx):
|
||||
for col_idx in range(1, max_col + 1):
|
||||
cell = ws.cell(row=row_idx, column=col_idx)
|
||||
if cell.value is not None and str(cell.value).strip():
|
||||
return True
|
||||
return False
|
||||
|
||||
if not any(row_has_data(i) for i in range(1, min(101, max_row + 1))):
|
||||
return 0
|
||||
|
||||
left, right = 1, max_row
|
||||
last_data_row = 1
|
||||
|
||||
while left <= right:
|
||||
mid = (left + right) // 2
|
||||
found = False
|
||||
for r in range(mid, min(mid + 10, max_row + 1)):
|
||||
if row_has_data(r):
|
||||
found = True
|
||||
last_data_row = max(last_data_row, r)
|
||||
break
|
||||
if found:
|
||||
left = mid + 1
|
||||
else:
|
||||
right = mid - 1
|
||||
|
||||
for r in range(last_data_row, min(last_data_row + 500, max_row + 1)):
|
||||
if row_has_data(r):
|
||||
last_data_row = r
|
||||
|
||||
return last_data_row
|
||||
|
||||
@staticmethod
|
||||
def _get_rows_limited(ws):
|
||||
actual_rows = RAGFlowExcelParser._get_actual_row_count(ws)
|
||||
if actual_rows == 0:
|
||||
return []
|
||||
return list(ws.iter_rows(min_row=1, max_row=actual_rows))
|
||||
|
||||
def html(self, fnm, chunk_rows=256):
|
||||
from html import escape
|
||||
|
||||
@ -171,7 +220,7 @@ class RAGFlowExcelParser:
|
||||
for sheetname in wb.sheetnames:
|
||||
ws = wb[sheetname]
|
||||
try:
|
||||
rows = list(ws.rows)
|
||||
rows = RAGFlowExcelParser._get_rows_limited(ws)
|
||||
except Exception as e:
|
||||
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
||||
continue
|
||||
@ -223,7 +272,7 @@ class RAGFlowExcelParser:
|
||||
for sheetname in wb.sheetnames:
|
||||
ws = wb[sheetname]
|
||||
try:
|
||||
rows = list(ws.rows)
|
||||
rows = RAGFlowExcelParser._get_rows_limited(ws)
|
||||
except Exception as e:
|
||||
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
||||
continue
|
||||
@ -238,6 +287,8 @@ class RAGFlowExcelParser:
|
||||
t = str(ti[i].value) if i < len(ti) else ""
|
||||
t += (":" if t else "") + str(c.value)
|
||||
fields.append(t)
|
||||
if not fields:
|
||||
continue
|
||||
line = "; ".join(fields)
|
||||
if sheetname.lower().find("sheet") < 0:
|
||||
line += " ——" + sheetname
|
||||
@ -251,12 +302,12 @@ class RAGFlowExcelParser:
|
||||
total = 0
|
||||
|
||||
for sheetname in wb.sheetnames:
|
||||
try:
|
||||
ws = wb[sheetname]
|
||||
total += len(list(ws.rows))
|
||||
except Exception as e:
|
||||
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
||||
continue
|
||||
try:
|
||||
ws = wb[sheetname]
|
||||
total += RAGFlowExcelParser._get_actual_row_count(ws)
|
||||
except Exception as e:
|
||||
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
||||
continue
|
||||
return total
|
||||
|
||||
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
|
||||
|
||||
@ -44,7 +44,7 @@ class Excel(ExcelParser):
|
||||
wb = Excel._load_excel_to_workbook(BytesIO(binary))
|
||||
total = 0
|
||||
for sheet_name in wb.sheetnames:
|
||||
total += len(list(wb[sheet_name].rows))
|
||||
total += Excel._get_actual_row_count(wb[sheet_name])
|
||||
res, fails, done = [], [], 0
|
||||
rn = 0
|
||||
flow_images = []
|
||||
@ -66,7 +66,7 @@ class Excel(ExcelParser):
|
||||
flow_images.append(img)
|
||||
|
||||
try:
|
||||
rows = list(ws.rows)
|
||||
rows = Excel._get_rows_limited(ws)
|
||||
except Exception as e:
|
||||
logging.warning(f"Skip sheet '{sheet_name}' due to rows access error: {e}")
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user