From 79e2edc8356c23e97886058d04e4433eb1a0d4be Mon Sep 17 00:00:00 2001 From: Jay Xu Date: Tue, 12 Aug 2025 14:58:36 +0800 Subject: [PATCH] Fix "File contains no valid workbook part" (#9360) ### What problem does this PR solve? Fix the "File contains no valid workbook part" failure. When openpyxl cannot load the file, the parser falls back to pandas and rebuilds a workbook from the resulting DataFrame; cell values containing control characters that openpyxl rejects then raise IllegalCharacterError. This change replaces those characters with a space before the workbook is built. Stack trace: ``` Traceback (most recent call last): File "/ragflow/deepdoc/parser/excel_parser.py", line 54, in _load_excel_to_workbook return RAGFlowExcelParser._dataframe_to_workbook(df) File "/ragflow/deepdoc/parser/excel_parser.py", line 69, in _dataframe_to_workbook ws.cell(row=row_num, column=col_num, value=value) File "/ragflow/.venv/lib/python3.10/site-packages/openpyxl/worksheet/worksheet.py", line 246, in cell cell.value = value File "/ragflow/.venv/lib/python3.10/site-packages/openpyxl/cell/cell.py", line 218, in value self._bind_value(value) File "/ragflow/.venv/lib/python3.10/site-packages/openpyxl/cell/cell.py", line 197, in _bind_value value = self.check_string(value) File "/ragflow/.venv/lib/python3.10/site-packages/openpyxl/cell/cell.py", line 165, in check_string raise IllegalCharacterError(f"{value} cannot be used in worksheets.") ``` ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [ ] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): --- deepdoc/parser/excel_parser.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index 5c1e20219..e0d642775 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -12,6 +12,7 @@ # import logging +import re import sys from io import BytesIO @@ -20,6 +21,8 @@ from openpyxl import Workbook, load_workbook from rag.nlp import find_codec +# copied from `/openpyxl/cell/cell.py` +ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]') class RAGFlowExcelParser: @@ -61,8 +64,18 @@ class 
RAGFlowExcelParser: except Exception as e_pandas: raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}") + @staticmethod + def _clean_dataframe(df: pd.DataFrame): + def clean_string(s): + if isinstance(s, str): + return ILLEGAL_CHARACTERS_RE.sub(" ", s) + return s + + return df.apply(lambda col: col.map(clean_string)) + @staticmethod def _dataframe_to_workbook(df): + df = RAGFlowExcelParser._clean_dataframe(df) wb = Workbook() ws = wb.active ws.title = "Data"