From 9d123808060be379222e33402bbb21e2884ba43c Mon Sep 17 00:00:00 2001 From: Stephen Hu <812791840@qq.com> Date: Tue, 21 Oct 2025 09:52:59 +0800 Subject: [PATCH] =?UTF-8?q?Fix:=20Excel2HTML=20can't=20support=20XLS?= =?UTF-8?q?=EF=BC=88Excel=2097-2003=EF=BC=89=20(#10660)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? https://github.com/infiniflow/ragflow/issues/10602 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/excel_parser.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index 315df7df3..4d0496a33 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -54,8 +54,8 @@ class RAGFlowExcelParser: try: file_like_object.seek(0) try: - df = pd.read_excel(file_like_object) - return RAGFlowExcelParser._dataframe_to_workbook(df) + dfs = pd.read_excel(file_like_object, sheet_name=None) + return RAGFlowExcelParser._dataframe_to_workbook(dfs) except Exception as ex: logging.info(f"pandas with default engine load error: {ex}, try calamine instead") file_like_object.seek(0) @@ -75,6 +75,10 @@ class RAGFlowExcelParser: @staticmethod def _dataframe_to_workbook(df): + # if contains multiple sheets use _dataframes_to_workbook + if isinstance(df, dict) and len(df) > 1: + return RAGFlowExcelParser._dataframes_to_workbook(df) + df = RAGFlowExcelParser._clean_dataframe(df) wb = Workbook() ws = wb.active @@ -88,6 +92,22 @@ class RAGFlowExcelParser: ws.cell(row=row_num, column=col_num, value=value) return wb + + @staticmethod + def _dataframes_to_workbook(dfs: dict): + wb = Workbook() + default_sheet = wb.active + wb.remove(default_sheet) + + for sheet_name, df in dfs.items(): + df = RAGFlowExcelParser._clean_dataframe(df) + ws = wb.create_sheet(title=sheet_name) + for col_num, column_name in enumerate(df.columns, 1): + ws.cell(row=1, column=col_num, value=column_name) + for row_num, row in enumerate(df.values, 2): + for col_num, value in enumerate(row, 1): + ws.cell(row=row_num, column=col_num, value=value) + return wb def html(self, fnm, chunk_rows=256): from html import escape