From eef43fa25cbe380983129d169d9fa27cb9410a13 Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Fri, 15 Aug 2025 17:00:34 +0800 Subject: [PATCH] Fix: unexpected truncated Excel files (#9500) ### What problem does this PR solve? Handle unexpected truncated Excel files. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/excel_parser.py | 12 ++++++++++-- rag/app/naive.py | 1 + 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index e0d642775..d90224f59 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -90,9 +90,17 @@ class RAGFlowExcelParser: return wb def html(self, fnm, chunk_rows=256): + from html import escape + file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object) tb_chunks = [] + + def _fmt(v): + if v is None: + return "" + return str(v).strip() + for sheetname in wb.sheetnames: ws = wb[sheetname] rows = list(ws.rows) @@ -101,7 +109,7 @@ class RAGFlowExcelParser: tb_rows_0 = "" for t in list(rows[0]): - tb_rows_0 += f"{t.value}" + tb_rows_0 += f"{escape(_fmt(t.value))}" tb_rows_0 += "" for chunk_i in range((len(rows) - 1) // chunk_rows + 1): @@ -109,7 +117,7 @@ class RAGFlowExcelParser: tb += f"" tb += tb_rows_0 for r in list( - rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows] + rows[1 + chunk_i * chunk_rows: min(1 + (chunk_i + 1) * chunk_rows, len(rows))] ): tb += "" for i, c in enumerate(r): diff --git a/rag/app/naive.py b/rag/app/naive.py index 5e946d1c8..173ef0aef 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -490,6 +490,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections = [(_, "") for _ in excel_parser.html(binary, 12) if _] else: sections = [(_, "") for _ in excel_parser(binary) if _] + parser_config["chunk_token_num"] = 12800 elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.")
{sheetname}