Files
ragflow/deepdoc/parser/excel_parser.py
Jay Xu 79e2edc835 Fix "File contains no valid workbook part" (#9360)
### What problem does this PR solve?

fix "File contains no valid workbook part"

stacktrace:
```
Traceback (most recent call last):
  File "/ragflow/deepdoc/parser/excel_parser.py", line 54, in _load_excel_to_workbook
    return RAGFlowExcelParser._dataframe_to_workbook(df)
  File "/ragflow/deepdoc/parser/excel_parser.py", line 69, in _dataframe_to_workbook
    ws.cell(row=row_num, column=col_num, value=value)
  File "/ragflow/.venv/lib/python3.10/site-packages/openpyxl/worksheet/worksheet.py", line 246, in cell
    cell.value = value
  File "/ragflow/.venv/lib/python3.10/site-packages/openpyxl/cell/cell.py", line 218, in value
    self._bind_value(value)
  File "/ragflow/.venv/lib/python3.10/site-packages/openpyxl/cell/cell.py", line 197, in _bind_value
    value = self.check_string(value)
  File "/ragflow/.venv/lib/python3.10/site-packages/openpyxl/cell/cell.py", line 165, in check_string
    raise IllegalCharacterError(f"{value} cannot be used in worksheets.")
```

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
2025-08-12 14:58:36 +08:00

170 lines
6.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
import sys
from io import BytesIO
import pandas as pd
from openpyxl import Workbook, load_workbook
from rag.nlp import find_codec
# copied from `/openpyxl/cell/cell.py`
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
class RAGFlowExcelParser:
@staticmethod
def _load_excel_to_workbook(file_like_object):
if isinstance(file_like_object, bytes):
file_like_object = BytesIO(file_like_object)
# Read first 4 bytes to determine file type
file_like_object.seek(0)
file_head = file_like_object.read(4)
file_like_object.seek(0)
if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')):
logging.info("Not an Excel file, converting CSV to Excel Workbook")
try:
file_like_object.seek(0)
df = pd.read_csv(file_like_object)
return RAGFlowExcelParser._dataframe_to_workbook(df)
except Exception as e_csv:
raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}")
try:
return load_workbook(file_like_object,data_only= True)
except Exception as e:
logging.info(f"openpyxl load error: {e}, try pandas instead")
try:
file_like_object.seek(0)
try:
df = pd.read_excel(file_like_object)
return RAGFlowExcelParser._dataframe_to_workbook(df)
except Exception as ex:
logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
file_like_object.seek(0)
df = pd.read_excel(file_like_object, engine='calamine')
return RAGFlowExcelParser._dataframe_to_workbook(df)
except Exception as e_pandas:
raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
@staticmethod
def _clean_dataframe(df: pd.DataFrame):
def clean_string(s):
if isinstance(s, str):
return ILLEGAL_CHARACTERS_RE.sub(" ", s)
return s
return df.apply(lambda col: col.map(clean_string))
@staticmethod
def _dataframe_to_workbook(df):
df = RAGFlowExcelParser._clean_dataframe(df)
wb = Workbook()
ws = wb.active
ws.title = "Data"
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
return wb
def html(self, fnm, chunk_rows=256):
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
tb_chunks = []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:
continue
tb_rows_0 = "<tr>"
for t in list(rows[0]):
tb_rows_0 += f"<th>{t.value}</th>"
tb_rows_0 += "</tr>"
for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
tb = ""
tb += f"<table><caption>{sheetname}</caption>"
tb += tb_rows_0
for r in list(
rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows]
):
tb += "<tr>"
for i, c in enumerate(r):
if c.value is None:
tb += "<td></td>"
else:
tb += f"<td>{c.value}</td>"
tb += "</tr>"
tb += "</table>\n"
tb_chunks.append(tb)
return tb_chunks
def __call__(self, fnm):
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
res = []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:
continue
ti = list(rows[0])
for r in list(rows[1:]):
fields = []
for i, c in enumerate(r):
if not c.value:
continue
t = str(ti[i].value) if i < len(ti) else ""
t += ("" if t else "") + str(c.value)
fields.append(t)
line = "; ".join(fields)
if sheetname.lower().find("sheet") < 0:
line += " ——" + sheetname
res.append(line)
return res
@staticmethod
def row_number(fnm, binary):
if fnm.split(".")[-1].lower().find("xls") >= 0:
wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
total = 0
for sheetname in wb.sheetnames:
ws = wb[sheetname]
total += len(list(ws.rows))
return total
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
return len(txt.split("\n"))
if __name__ == "__main__":
psr = RAGFlowExcelParser()
psr(sys.argv[1])