Fix: handle unexpectedly truncated Excel files (#9500)

### What problem does this PR solve?

Handle unexpectedly truncated Excel files.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-02-02 00:25:06 +08:00 · 2025-08-15 17:00:34 +08:00
parent 5a4dfecfbe
commit eef43fa25c
2 changed files with 11 additions and 2 deletions
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@@ -90,9 +90,17 @@ class RAGFlowExcelParser:
        return wb
    def html(self, fnm, chunk_rows=256):
        from html import escape
        file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
        wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
        tb_chunks = []
        def _fmt(v):
            if v is None:
                return ""
            return str(v).strip()
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
@@ -101,7 +109,7 @@ class RAGFlowExcelParser:
            tb_rows_0 = "<tr>"
            for t in list(rows[0]):
-                tb_rows_0 += f"<th>{t.value}</th>"
+                tb_rows_0 += f"<th>{escape(_fmt(t.value))}</th>"
            tb_rows_0 += "</tr>"
            for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
@@ -109,7 +117,7 @@ class RAGFlowExcelParser:
                tb += f"<table><caption>{sheetname}</caption>"
                tb += tb_rows_0
                for r in list(
-                    rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows]
+                    rows[1 + chunk_i * chunk_rows: min(1 + (chunk_i + 1) * chunk_rows, len(rows))]
                ):
                    tb += "<tr>"
                    for i, c in enumerate(r):
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -490,6 +490,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
        else:
            sections = [(_, "") for _ in excel_parser(binary) if _]
        parser_config["chunk_token_num"] = 12800
    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")