mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-06 18:45:08 +08:00
fix: optimize Excel row counting for files with abnormal max_row (#13018)
### What problem does this PR solve? Some Excel files have abnormal `max_row` metadata (e.g., `max_row=1,048,534` with only 300 actual data rows). This causes: - `row_number()` returns incorrect count, creating 350+ tasks instead of 1 - `list(ws.rows)` iterates through millions of empty rows, causing system hang This PR uses binary search to find the actual last row with data. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Performance Improvement Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@ -156,6 +156,55 @@ class RAGFlowExcelParser:
|
|||||||
continue
|
continue
|
||||||
return raw_items
|
return raw_items
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _get_actual_row_count(ws):
|
||||||
|
max_row = ws.max_row
|
||||||
|
if not max_row:
|
||||||
|
return 0
|
||||||
|
if max_row <= 10000:
|
||||||
|
return max_row
|
||||||
|
|
||||||
|
max_col = min(ws.max_column or 1, 50)
|
||||||
|
|
||||||
|
def row_has_data(row_idx):
|
||||||
|
for col_idx in range(1, max_col + 1):
|
||||||
|
cell = ws.cell(row=row_idx, column=col_idx)
|
||||||
|
if cell.value is not None and str(cell.value).strip():
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
if not any(row_has_data(i) for i in range(1, min(101, max_row + 1))):
|
||||||
|
return 0
|
||||||
|
|
||||||
|
left, right = 1, max_row
|
||||||
|
last_data_row = 1
|
||||||
|
|
||||||
|
while left <= right:
|
||||||
|
mid = (left + right) // 2
|
||||||
|
found = False
|
||||||
|
for r in range(mid, min(mid + 10, max_row + 1)):
|
||||||
|
if row_has_data(r):
|
||||||
|
found = True
|
||||||
|
last_data_row = max(last_data_row, r)
|
||||||
|
break
|
||||||
|
if found:
|
||||||
|
left = mid + 1
|
||||||
|
else:
|
||||||
|
right = mid - 1
|
||||||
|
|
||||||
|
for r in range(last_data_row, min(last_data_row + 500, max_row + 1)):
|
||||||
|
if row_has_data(r):
|
||||||
|
last_data_row = r
|
||||||
|
|
||||||
|
return last_data_row
|
||||||
|
|
||||||
|
@staticmethod
def _get_rows_limited(ws):
    """Materialize only the rows of *ws* that hold real data.

    Avoids ``list(ws.rows)`` on sheets with corrupt ``max_row``
    metadata by first asking :meth:`_get_actual_row_count` for the
    true last data row and iterating only up to it.

    :param ws: an openpyxl worksheet.
    :return: list of row tuples, empty if the sheet has no data.
    """
    row_count = RAGFlowExcelParser._get_actual_row_count(ws)
    return [] if row_count == 0 else list(ws.iter_rows(min_row=1, max_row=row_count))
|
||||||
|
|
||||||
def html(self, fnm, chunk_rows=256):
|
def html(self, fnm, chunk_rows=256):
|
||||||
from html import escape
|
from html import escape
|
||||||
|
|
||||||
@ -171,7 +220,7 @@ class RAGFlowExcelParser:
|
|||||||
for sheetname in wb.sheetnames:
|
for sheetname in wb.sheetnames:
|
||||||
ws = wb[sheetname]
|
ws = wb[sheetname]
|
||||||
try:
|
try:
|
||||||
rows = list(ws.rows)
|
rows = RAGFlowExcelParser._get_rows_limited(ws)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
||||||
continue
|
continue
|
||||||
@ -223,7 +272,7 @@ class RAGFlowExcelParser:
|
|||||||
for sheetname in wb.sheetnames:
|
for sheetname in wb.sheetnames:
|
||||||
ws = wb[sheetname]
|
ws = wb[sheetname]
|
||||||
try:
|
try:
|
||||||
rows = list(ws.rows)
|
rows = RAGFlowExcelParser._get_rows_limited(ws)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
||||||
continue
|
continue
|
||||||
@ -238,6 +287,8 @@ class RAGFlowExcelParser:
|
|||||||
t = str(ti[i].value) if i < len(ti) else ""
|
t = str(ti[i].value) if i < len(ti) else ""
|
||||||
t += (":" if t else "") + str(c.value)
|
t += (":" if t else "") + str(c.value)
|
||||||
fields.append(t)
|
fields.append(t)
|
||||||
|
if not fields:
|
||||||
|
continue
|
||||||
line = "; ".join(fields)
|
line = "; ".join(fields)
|
||||||
if sheetname.lower().find("sheet") < 0:
|
if sheetname.lower().find("sheet") < 0:
|
||||||
line += " ——" + sheetname
|
line += " ——" + sheetname
|
||||||
@ -249,14 +300,14 @@ class RAGFlowExcelParser:
|
|||||||
if fnm.split(".")[-1].lower().find("xls") >= 0:
|
if fnm.split(".")[-1].lower().find("xls") >= 0:
|
||||||
wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
|
wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
|
||||||
total = 0
|
total = 0
|
||||||
|
|
||||||
for sheetname in wb.sheetnames:
|
for sheetname in wb.sheetnames:
|
||||||
try:
|
try:
|
||||||
ws = wb[sheetname]
|
ws = wb[sheetname]
|
||||||
total += len(list(ws.rows))
|
total += RAGFlowExcelParser._get_actual_row_count(ws)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
||||||
continue
|
continue
|
||||||
return total
|
return total
|
||||||
|
|
||||||
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
|
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
|
||||||
|
|||||||
@ -44,7 +44,7 @@ class Excel(ExcelParser):
|
|||||||
wb = Excel._load_excel_to_workbook(BytesIO(binary))
|
wb = Excel._load_excel_to_workbook(BytesIO(binary))
|
||||||
total = 0
|
total = 0
|
||||||
for sheet_name in wb.sheetnames:
|
for sheet_name in wb.sheetnames:
|
||||||
total += len(list(wb[sheet_name].rows))
|
total += Excel._get_actual_row_count(wb[sheet_name])
|
||||||
res, fails, done = [], [], 0
|
res, fails, done = [], [], 0
|
||||||
rn = 0
|
rn = 0
|
||||||
flow_images = []
|
flow_images = []
|
||||||
@ -66,7 +66,7 @@ class Excel(ExcelParser):
|
|||||||
flow_images.append(img)
|
flow_images.append(img)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
rows = list(ws.rows)
|
rows = Excel._get_rows_limited(ws)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Skip sheet '{sheet_name}' due to rows access error: {e}")
|
logging.warning(f"Skip sheet '{sheet_name}' due to rows access error: {e}")
|
||||||
continue
|
continue
|
||||||
|
|||||||
Reference in New Issue
Block a user