Add fallback to use 'calamine' parse engine in excel_parser.py (#9374)

### What problem does this PR solve?

add fallback to `calamine` engine when parse error raised using the
default `openpyxl` / `xlrd` engine.
e.g. the following error can be fixed:
```
Traceback (most recent call last):
  File "/ragflow/deepdoc/parser/excel_parser.py", line 53, in _load_excel_to_workbook
    df = pd.read_excel(file_like_object)
  File "/ragflow/.venv/lib/python3.10/site-packages/pandas/io/excel/_base.py", line 495, in read_excel
    io = ExcelFile(
  File "/ragflow/.venv/lib/python3.10/site-packages/pandas/io/excel/_base.py", line 1567, in __init__
    self._reader = self._engines[engine](
  File "/ragflow/.venv/lib/python3.10/site-packages/pandas/io/excel/_xlrd.py", line 46, in __init__
    super().__init__(
  File "/ragflow/.venv/lib/python3.10/site-packages/pandas/io/excel/_base.py", line 573, in __init__
    self.book = self.load_workbook(self.handles.handle, engine_kwargs)
  File "/ragflow/.venv/lib/python3.10/site-packages/pandas/io/excel/_xlrd.py", line 63, in load_workbook
    return open_workbook(file_contents=data, **engine_kwargs)
  File "/ragflow/.venv/lib/python3.10/site-packages/xlrd/__init__.py", line 172, in open_workbook
    bk = open_workbook_xls(
  File "/ragflow/.venv/lib/python3.10/site-packages/xlrd/book.py", line 68, in open_workbook_xls
    bk.biff2_8_load(
  File "/ragflow/.venv/lib/python3.10/site-packages/xlrd/book.py", line 641, in biff2_8_load
    cd.locate_named_stream(UNICODE_LITERAL(qname))
  File "/ragflow/.venv/lib/python3.10/site-packages/xlrd/compdoc.py", line 398, in locate_named_stream
    result = self._locate_stream(
  File "/ragflow/.venv/lib/python3.10/site-packages/xlrd/compdoc.py", line 429, in _locate_stream
    raise CompDocError("%s corruption: seen[%d] == %d" % (qname, s, self.seen[s]))
xlrd.compdoc.CompDocError: Workbook corruption: seen[2] == 4
```

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Jay Xu
2025-08-12 12:41:33 +08:00
committed by GitHub
parent 96b1538b3e
commit 569ab011c4
3 changed files with 68 additions and 2 deletions

View File

@ -50,8 +50,14 @@ class RAGFlowExcelParser:
logging.info(f"openpyxl load error: {e}, try pandas instead")
try:
file_like_object.seek(0)
df = pd.read_excel(file_like_object)
return RAGFlowExcelParser._dataframe_to_workbook(df)
try:
df = pd.read_excel(file_like_object)
return RAGFlowExcelParser._dataframe_to_workbook(df)
except Exception as ex:
logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
file_like_object.seek(0)
df = pd.read_excel(file_like_object, engine='calamine')
return RAGFlowExcelParser._dataframe_to_workbook(df)
except Exception as e_pandas:
raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")