diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py
index e0d642775..d90224f59 100644
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@@ -90,9 +90,17 @@ class RAGFlowExcelParser:
return wb
def html(self, fnm, chunk_rows=256):
+ from html import escape
+
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
tb_chunks = []
+
+ def _fmt(v):
+ if v is None:
+ return ""
+ return str(v).strip()
+
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
@@ -101,7 +109,7 @@ class RAGFlowExcelParser:
tb_rows_0 = "
"
for t in list(rows[0]):
- tb_rows_0 += f"| {t.value} | "
+ tb_rows_0 += f"{escape(_fmt(t.value))} | "
tb_rows_0 += "
"
for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
@@ -109,7 +117,7 @@ class RAGFlowExcelParser:
tb += f"{sheetname}"
tb += tb_rows_0
for r in list(
- rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows]
+ rows[1 + chunk_i * chunk_rows: min(1 + (chunk_i + 1) * chunk_rows, len(rows))]
):
tb += ""
for i, c in enumerate(r):
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 5e946d1c8..173ef0aef 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -490,6 +490,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
else:
sections = [(_, "") for _ in excel_parser(binary) if _]
+ parser_config["chunk_token_num"] = 12800
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")