From ab4b62031f4e7b5261e7dd4d41af863ea5e46fd9 Mon Sep 17 00:00:00 2001 From: buua436 Date: Wed, 10 Dec 2025 16:44:06 +0800 Subject: [PATCH] Fix:csv parse in Table (#11870) ### What problem does this PR solve? change: csv parse in Table ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/app/table.py | 30 ++++++++++++++++++- web/package-lock.json | 17 +++++++++++ web/package.json | 2 ++ .../document-preview/csv-preview.tsx | 18 ++++++----- 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/rag/app/table.py b/rag/app/table.py index 7a21a738a..a87a858bf 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -15,6 +15,8 @@ # import copy +import csv +import io import logging import re from io import BytesIO @@ -323,7 +325,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese callback(0.1, "Start to parse.") excel_parser = Excel() dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback) - elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): + elif re.search(r"\.txt$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = get_text(filename, binary) lines = txt.split("\n") @@ -344,7 +346,33 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) dfs = [pd.DataFrame(np.array(rows), columns=headers)] + elif re.search(r"\.csv$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + txt = get_text(filename, binary) + delimiter = kwargs.get("delimiter", ",") + reader = csv.reader(io.StringIO(txt), delimiter=delimiter) + all_rows = list(reader) + if not all_rows: + raise ValueError("Empty CSV file") + + headers = all_rows[0] + fails = [] + rows = [] + + for i, row in enumerate(all_rows[1 + from_page : 1 + to_page]): + if len(row) != len(headers): + fails.append(str(i + from_page)) + continue + rows.append(row) + + callback( + 0.3, + (f"Extract records: {from_page}~{from_page + len(rows)}" + + (f"{len(fails)} failure, line: {','.join(fails[:3])}..." if fails else "")) + ) + + dfs = [pd.DataFrame(rows, columns=headers)] else: raise NotImplementedError("file type not supported yet(excel, text, csv supported)") diff --git a/web/package-lock.json b/web/package-lock.json index 880a1c9b4..ed94049b2 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -45,6 +45,7 @@ "@tanstack/react-query": "^5.40.0", "@tanstack/react-query-devtools": "^5.51.5", "@tanstack/react-table": "^8.20.5", + "@types/papaparse": "^5.5.1", "@uiw/react-markdown-preview": "^5.1.3", "@xyflow/react": "^12.3.6", "ahooks": "^3.7.10", @@ -73,6 +74,7 @@ "mammoth": "^1.7.2", "next-themes": "^0.4.6", "openai-speech-stream-player": "^1.0.8", + "papaparse": "^5.5.3", "pptx-preview": "^1.0.5", "rc-tween-one": "^3.0.6", "react": "^18.2.0", @@ -10632,6 +10634,15 @@ "integrity": "sha512-37i+OaWTh9qeK4LSHPsyRC7NahnGotNuZvjLSgcPzblpHB3rrCJxAOgI5gCdKm7coonsaX1Of0ILiTcnZjbfxA==", "peer": true }, + "node_modules/@types/papaparse": { + "version": "5.5.1", + "resolved": "https://registry.npmmirror.com/@types/papaparse/-/papaparse-5.5.1.tgz", + "integrity": "sha512-esEO+VISsLIyE+JZBmb89NzsYYbpwV8lmv2rPo6oX5y9KhBaIP7hhHgjuTut54qjdKVMufTEcrh5fUl9+58huw==", + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/parse-json": { "version": "4.0.2", "resolved": "https://registry.npmmirror.com/@types/parse-json/-/parse-json-4.0.2.tgz", @@ -27413,6 +27424,12 @@ "resolved": "https://registry.npmmirror.com/pako/-/pako-1.0.11.tgz", "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==" }, + "node_modules/papaparse": { + "version": "5.5.3", + "resolved": "https://registry.npmmirror.com/papaparse/-/papaparse-5.5.3.tgz", + "integrity": "sha512-5QvjGxYVjxO59MGU2lHVYpRWBBtKHnlIAcSe1uNFCkkptUh63NFRj0FJQm7nR67puEruUci/ZkjmEFrjCAyP4A==", + "license": "MIT" + }, "node_modules/param-case": { "version": "3.0.4", "resolved": "https://registry.npmmirror.com/param-case/-/param-case-3.0.4.tgz", diff --git a/web/package.json b/web/package.json index f183c8008..051c4b9d7 100644 --- a/web/package.json +++ b/web/package.json @@ -58,6 +58,7 @@ "@tanstack/react-query": "^5.40.0", "@tanstack/react-query-devtools": "^5.51.5", "@tanstack/react-table": "^8.20.5", + "@types/papaparse": "^5.5.1", "@uiw/react-markdown-preview": "^5.1.3", "@xyflow/react": "^12.3.6", "ahooks": "^3.7.10", @@ -86,6 +87,7 @@ "mammoth": "^1.7.2", "next-themes": "^0.4.6", "openai-speech-stream-player": "^1.0.8", + "papaparse": "^5.5.3", "pptx-preview": "^1.0.5", "rc-tween-one": "^3.0.6", "react": "^18.2.0", diff --git a/web/src/components/document-preview/csv-preview.tsx b/web/src/components/document-preview/csv-preview.tsx index 45b05454e..fa1cf1ed8 100644 --- a/web/src/components/document-preview/csv-preview.tsx +++ b/web/src/components/document-preview/csv-preview.tsx @@ -2,6 +2,7 @@ import message from '@/components/ui/message'; import { Spin } from '@/components/ui/spin'; import request from '@/utils/request'; import classNames from 'classnames'; +import Papa from 'papaparse'; import React, { useEffect, useRef, useState } from 'react'; interface CSVData { @@ -20,14 +21,17 @@ const CSVFileViewer: React.FC = ({ url }) => { const containerRef = useRef(null); // const url = useGetDocumentUrl(); const parseCSV = (csvText: string): CSVData => { - console.log('Parsing CSV data:', csvText); - const lines = csvText.split('\n'); - const headers = lines[0].split(',').map((header) => header.trim()); - const rows = lines - .slice(1) - .map((line) => line.split(',').map((cell) => cell.trim())); + const result = Papa.parse(csvText, { + header: false, + skipEmptyLines: false, + }); - return { headers, rows }; + const rows = result.data as string[][]; + + const headers = rows[0]; + const dataRows = rows.slice(1); + + return { headers, rows: dataRows }; }; useEffect(() => {