mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-18 19:46:44 +08:00
Fix: CSV parsing in Table (#11870)
### What problem does this PR solve?

Change: CSV parsing in Table

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -15,6 +15,8 @@
|
||||
#
|
||||
|
||||
import copy
|
||||
import csv
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
from io import BytesIO
|
||||
@ -323,7 +325,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
|
||||
callback(0.1, "Start to parse.")
|
||||
excel_parser = Excel()
|
||||
dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
|
||||
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
|
||||
elif re.search(r"\.txt$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
txt = get_text(filename, binary)
|
||||
lines = txt.split("\n")
|
||||
@ -344,7 +346,33 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
|
||||
callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
||||
|
||||
dfs = [pd.DataFrame(np.array(rows), columns=headers)]
|
||||
elif re.search(r"\.csv$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
txt = get_text(filename, binary)
|
||||
delimiter = kwargs.get("delimiter", ",")
|
||||
|
||||
reader = csv.reader(io.StringIO(txt), delimiter=delimiter)
|
||||
all_rows = list(reader)
|
||||
if not all_rows:
|
||||
raise ValueError("Empty CSV file")
|
||||
|
||||
headers = all_rows[0]
|
||||
fails = []
|
||||
rows = []
|
||||
|
||||
for i, row in enumerate(all_rows[1 + from_page : 1 + to_page]):
|
||||
if len(row) != len(headers):
|
||||
fails.append(str(i + from_page))
|
||||
continue
|
||||
rows.append(row)
|
||||
|
||||
callback(
|
||||
0.3,
|
||||
(f"Extract records: {from_page}~{from_page + len(rows)}" +
|
||||
(f"{len(fails)} failure, line: {','.join(fails[:3])}..." if fails else ""))
|
||||
)
|
||||
|
||||
dfs = [pd.DataFrame(rows, columns=headers)]
|
||||
else:
|
||||
raise NotImplementedError("file type not supported yet(excel, text, csv supported)")
|
||||
|
||||
|
||||
17
web/package-lock.json
generated
17
web/package-lock.json
generated
@ -45,6 +45,7 @@
|
||||
"@tanstack/react-query": "^5.40.0",
|
||||
"@tanstack/react-query-devtools": "^5.51.5",
|
||||
"@tanstack/react-table": "^8.20.5",
|
||||
"@types/papaparse": "^5.5.1",
|
||||
"@uiw/react-markdown-preview": "^5.1.3",
|
||||
"@xyflow/react": "^12.3.6",
|
||||
"ahooks": "^3.7.10",
|
||||
@ -73,6 +74,7 @@
|
||||
"mammoth": "^1.7.2",
|
||||
"next-themes": "^0.4.6",
|
||||
"openai-speech-stream-player": "^1.0.8",
|
||||
"papaparse": "^5.5.3",
|
||||
"pptx-preview": "^1.0.5",
|
||||
"rc-tween-one": "^3.0.6",
|
||||
"react": "^18.2.0",
|
||||
@ -10632,6 +10634,15 @@
|
||||
"integrity": "sha512-37i+OaWTh9qeK4LSHPsyRC7NahnGotNuZvjLSgcPzblpHB3rrCJxAOgI5gCdKm7coonsaX1Of0ILiTcnZjbfxA==",
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/@types/papaparse": {
|
||||
"version": "5.5.1",
|
||||
"resolved": "https://registry.npmmirror.com/@types/papaparse/-/papaparse-5.5.1.tgz",
|
||||
"integrity": "sha512-esEO+VISsLIyE+JZBmb89NzsYYbpwV8lmv2rPo6oX5y9KhBaIP7hhHgjuTut54qjdKVMufTEcrh5fUl9+58huw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/parse-json": {
|
||||
"version": "4.0.2",
|
||||
"resolved": "https://registry.npmmirror.com/@types/parse-json/-/parse-json-4.0.2.tgz",
|
||||
@ -27413,6 +27424,12 @@
|
||||
"resolved": "https://registry.npmmirror.com/pako/-/pako-1.0.11.tgz",
|
||||
"integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw=="
|
||||
},
|
||||
"node_modules/papaparse": {
|
||||
"version": "5.5.3",
|
||||
"resolved": "https://registry.npmmirror.com/papaparse/-/papaparse-5.5.3.tgz",
|
||||
"integrity": "sha512-5QvjGxYVjxO59MGU2lHVYpRWBBtKHnlIAcSe1uNFCkkptUh63NFRj0FJQm7nR67puEruUci/ZkjmEFrjCAyP4A==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/param-case": {
|
||||
"version": "3.0.4",
|
||||
"resolved": "https://registry.npmmirror.com/param-case/-/param-case-3.0.4.tgz",
|
||||
|
||||
@ -58,6 +58,7 @@
|
||||
"@tanstack/react-query": "^5.40.0",
|
||||
"@tanstack/react-query-devtools": "^5.51.5",
|
||||
"@tanstack/react-table": "^8.20.5",
|
||||
"@types/papaparse": "^5.5.1",
|
||||
"@uiw/react-markdown-preview": "^5.1.3",
|
||||
"@xyflow/react": "^12.3.6",
|
||||
"ahooks": "^3.7.10",
|
||||
@ -86,6 +87,7 @@
|
||||
"mammoth": "^1.7.2",
|
||||
"next-themes": "^0.4.6",
|
||||
"openai-speech-stream-player": "^1.0.8",
|
||||
"papaparse": "^5.5.3",
|
||||
"pptx-preview": "^1.0.5",
|
||||
"rc-tween-one": "^3.0.6",
|
||||
"react": "^18.2.0",
|
||||
|
||||
@ -2,6 +2,7 @@ import message from '@/components/ui/message';
|
||||
import { Spin } from '@/components/ui/spin';
|
||||
import request from '@/utils/request';
|
||||
import classNames from 'classnames';
|
||||
import Papa from 'papaparse';
|
||||
import React, { useEffect, useRef, useState } from 'react';
|
||||
|
||||
interface CSVData {
|
||||
@ -20,14 +21,17 @@ const CSVFileViewer: React.FC<FileViewerProps> = ({ url }) => {
|
||||
const containerRef = useRef<HTMLDivElement>(null);
|
||||
// const url = useGetDocumentUrl();
|
||||
const parseCSV = (csvText: string): CSVData => {
|
||||
console.log('Parsing CSV data:', csvText);
|
||||
const lines = csvText.split('\n');
|
||||
const headers = lines[0].split(',').map((header) => header.trim());
|
||||
const rows = lines
|
||||
.slice(1)
|
||||
.map((line) => line.split(',').map((cell) => cell.trim()));
|
||||
const result = Papa.parse<string[]>(csvText, {
|
||||
header: false,
|
||||
skipEmptyLines: false,
|
||||
});
|
||||
|
||||
return { headers, rows };
|
||||
const rows = result.data as string[][];
|
||||
|
||||
const headers = rows[0];
|
||||
const dataRows = rows.slice(1);
|
||||
|
||||
return { headers, rows: dataRows };
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
|
||||
Reference in New Issue
Block a user