Feat: dataflow supports Spreadsheet and Word processor document (#9996)

### What problem does this PR solve?

Dataflow supports Spreadsheet and Word processor documents.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-02-01 08:05:07 +08:00 · 2025-09-10 13:02:53 +08:00
parent e650f0d368
commit 0d9c1f1c3c
9 changed files with 126 additions and 43 deletions
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@@ -22,10 +22,10 @@ from openpyxl import Workbook, load_workbook
 from rag.nlp import find_codec

 # copied from `/openpyxl/cell/cell.py`
-ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
+ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")
+

 class RAGFlowExcelParser:
-
    @staticmethod
    def _load_excel_to_workbook(file_like_object):
        if isinstance(file_like_object, bytes):
@@ -36,7 +36,7 @@ class RAGFlowExcelParser:
        file_head = file_like_object.read(4)
        file_like_object.seek(0)

-        if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')):
+        if not (file_head.startswith(b"PK\x03\x04") or file_head.startswith(b"\xd0\xcf\x11\xe0")):
            logging.info("Not an Excel file, converting CSV to Excel Workbook")

            try:
@@ -48,7 +48,7 @@ class RAGFlowExcelParser:
                raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}")

        try:
-            return load_workbook(file_like_object,data_only= True)
+            return load_workbook(file_like_object, data_only=True)
        except Exception as e:
            logging.info(f"openpyxl load error: {e}, try pandas instead")
            try:
@@ -59,7 +59,7 @@ class RAGFlowExcelParser:
                except Exception as ex:
                    logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
                    file_like_object.seek(0)
-                    df = pd.read_excel(file_like_object, engine='calamine')
+                    df = pd.read_excel(file_like_object, engine="calamine")
                    return RAGFlowExcelParser._dataframe_to_workbook(df)
            except Exception as e_pandas:
                raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
@@ -116,9 +116,7 @@ class RAGFlowExcelParser:
                tb = ""
                tb += f"<table><caption>{sheetname}</caption>"
                tb += tb_rows_0
-                for r in list(
-                    rows[1 + chunk_i * chunk_rows: min(1 + (chunk_i + 1) * chunk_rows, len(rows))]
-                ):
+                for r in list(rows[1 + chunk_i * chunk_rows : min(1 + (chunk_i + 1) * chunk_rows, len(rows))]):
                    tb += "<tr>"
                    for i, c in enumerate(r):
                        if c.value is None:
@@ -133,8 +131,16 @@ class RAGFlowExcelParser:

    def markdown(self, fnm):
        import pandas as pd
+
        file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
-        df = pd.read_excel(file_like_object)
+        try:
+            file_like_object.seek(0)
+            df = pd.read_excel(file_like_object)
+        except Exception as e:
+            logging.warning(f"Parse spreadsheet error: {e}, trying to interpret as CSV file")
+            file_like_object.seek(0)
+            df = pd.read_csv(file_like_object)
+        df = df.replace(r"^\s*$", "", regex=True)
        return df.to_markdown(index=False)

    def __call__(self, fnm):