mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Split Excel file into different chunks (#847)
### What problem does this PR solve?

Split an Excel file into multiple chunks.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -7,30 +7,39 @@ from rag.nlp import find_codec
|
||||
|
||||
|
||||
class RAGFlowExcelParser:
|
||||
def html(self, fnm, chunk_rows=256):
    """Render every worksheet of a workbook as chunked HTML tables.

    Each non-empty sheet is split into runs of at most ``chunk_rows`` data
    rows. Every run becomes its own ``<table>`` whose caption is the sheet
    name and whose first row repeats the sheet's header (the sheet's first
    row), so each chunk can be read on its own.

    Args:
        fnm: A path string, passed to ``load_workbook`` unchanged, or the
            raw file content, which is wrapped in ``BytesIO`` first.
        chunk_rows: Maximum number of data rows (header excluded) placed
            in one table. Must be at least 1.

    Returns:
        A list of HTML strings, one per chunk, in sheet order. A sheet
        that holds only a header row still yields one header-only table;
        a sheet without any rows yields nothing.

    Raises:
        ValueError: If ``chunk_rows`` is smaller than 1.
    """
    # Imported at function scope so the block carries its own dependency.
    from html import escape

    if chunk_rows < 1:
        raise ValueError(f"chunk_rows must be >= 1, got {chunk_rows!r}")

    def cell_text(value):
        # Empty cells become empty tags instead of the literal text "None";
        # everything else is escaped so "<", ">" and "&" inside a cell
        # cannot break the generated markup.
        return "" if value is None else escape(str(value), quote=False)

    if isinstance(fnm, str):
        wb = load_workbook(fnm)
    else:
        wb = load_workbook(BytesIO(fnm))

    tb_chunks = []
    for sheetname in wb.sheetnames:
        rows = list(wb[sheetname].rows)
        if not rows:
            continue

        caption = f"<caption>{escape(str(sheetname), quote=False)}</caption>"
        header = "<tr>" + "".join(
            f"<th>{cell_text(t.value)}</th>" for t in rows[0]
        ) + "</tr>"

        # Step through the data rows (index 1 onwards) in strides of
        # chunk_rows. Using the row count as the stop value avoids a
        # trailing header-only table when the data rows divide evenly;
        # max(..., 2) keeps exactly one table for a header-only sheet.
        for start in range(1, max(len(rows), 2), chunk_rows):
            parts = ["<table>", caption, header]
            for r in rows[start:start + chunk_rows]:
                parts.append("<tr>")
                parts.extend(f"<td>{cell_text(c.value)}</td>" for c in r)
                parts.append("</tr>")
            parts.append("</table>\n")
            tb_chunks.append("".join(parts))

    return tb_chunks
|
||||
|
||||
def __call__(self, fnm):
|
||||
if isinstance(fnm, str):
|
||||
|
||||
Reference in New Issue
Block a user