mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
Fix: unexpected truncated Excel files (#9500)
### What problem does this PR solve?

Handle unexpectedly truncated Excel files.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -90,9 +90,17 @@ class RAGFlowExcelParser:
|
|||||||
return wb
|
return wb
|
||||||
|
|
||||||
def html(self, fnm, chunk_rows=256):
|
def html(self, fnm, chunk_rows=256):
|
||||||
|
from html import escape
|
||||||
|
|
||||||
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
|
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
|
||||||
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
|
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
|
||||||
tb_chunks = []
|
tb_chunks = []
|
||||||
|
|
||||||
|
def _fmt(v):
|
||||||
|
if v is None:
|
||||||
|
return ""
|
||||||
|
return str(v).strip()
|
||||||
|
|
||||||
for sheetname in wb.sheetnames:
|
for sheetname in wb.sheetnames:
|
||||||
ws = wb[sheetname]
|
ws = wb[sheetname]
|
||||||
rows = list(ws.rows)
|
rows = list(ws.rows)
|
||||||
@ -101,7 +109,7 @@ class RAGFlowExcelParser:
|
|||||||
|
|
||||||
tb_rows_0 = "<tr>"
|
tb_rows_0 = "<tr>"
|
||||||
for t in list(rows[0]):
|
for t in list(rows[0]):
|
||||||
tb_rows_0 += f"<th>{t.value}</th>"
|
tb_rows_0 += f"<th>{escape(_fmt(t.value))}</th>"
|
||||||
tb_rows_0 += "</tr>"
|
tb_rows_0 += "</tr>"
|
||||||
|
|
||||||
for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
|
for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
|
||||||
@ -109,7 +117,7 @@ class RAGFlowExcelParser:
|
|||||||
tb += f"<table><caption>{sheetname}</caption>"
|
tb += f"<table><caption>{sheetname}</caption>"
|
||||||
tb += tb_rows_0
|
tb += tb_rows_0
|
||||||
for r in list(
|
for r in list(
|
||||||
rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows]
|
rows[1 + chunk_i * chunk_rows: min(1 + (chunk_i + 1) * chunk_rows, len(rows))]
|
||||||
):
|
):
|
||||||
tb += "<tr>"
|
tb += "<tr>"
|
||||||
for i, c in enumerate(r):
|
for i, c in enumerate(r):
|
||||||
|
|||||||
@ -490,6 +490,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
|
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
|
||||||
else:
|
else:
|
||||||
sections = [(_, "") for _ in excel_parser(binary) if _]
|
sections = [(_, "") for _ in excel_parser(binary) if _]
|
||||||
|
parser_config["chunk_token_num"] = 12800
|
||||||
|
|
||||||
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
|
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
|
|||||||
Reference in New Issue
Block a user