Fix some bugs in text2sql.(#4279)(#4281) (#4280)

Fix some bugs in text2sql.(#4279)(#4281)

### What problem does this PR solve?
- The incorrect results in parsing CSV files of the QA knowledge base in
the text2sql scenario. Process CSV files using the csv library. Decouple
CSV parsing from TXT parsing
- Most llm return results in markdown format ```sql query ```, Fix
execution error caused by LLM output SQLmarkdown format.### Type of
change
- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
TeslaZY
2024-12-30 10:32:19 +08:00
committed by GitHub
parent 8cdf10148d
commit dd13a5d05c
2 changed files with 42 additions and 14 deletions

View File

@ -12,6 +12,7 @@
#
import logging
import re
import csv
from copy import deepcopy
from io import BytesIO
from timeit import default_timer as timer
@ -25,7 +26,6 @@ from docx import Document
from PIL import Image
from markdown import markdown
class Excel(ExcelParser):
def __call__(self, fnm, binary=None, callback=None):
if not binary:
@ -320,7 +320,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
res.append(beAdoc(deepcopy(doc), q, a, eng))
return res
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
lines = txt.split("\n")
@ -359,6 +359,38 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
return res
elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
lines = txt.split("\n")
delimiter = "\t" if any("\t" in line for line in lines) else ","
fails = []
question, answer = "", ""
res = []
reader = csv.reader(lines, delimiter=delimiter)
for i, row in enumerate(reader):
if len(row) != 2:
if question:
answer += "\n" + lines[i]
else:
fails.append(str(i + 1))
elif len(row) == 2:
if question and answer:
res.append(beAdoc(deepcopy(doc), question, answer, eng))
question, answer = row
if len(res) % 999 == 0:
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
if question:
res.append(beAdoc(deepcopy(doc), question, answer, eng))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
pdf_parser = Pdf()