Display only the duplicate column names and corresponding original source. (#8138)

### What problem does this PR solve?
This PR aims to solve #8120, which requests a better error display for
duplicate column names.
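
In short, instead of reporting only the set of duplicated names, the parser now reports each duplicated name together with the full header row it came from. A minimal sketch of the new check, using a made-up header row (not data from this PR):

```python
from collections import Counter

# Hypothetical header row with repeated column names, for illustration only.
clmns = ["name", "age", "name", "city", "age"]

if len(clmns) != len(set(clmns)):
    col_counts = Counter(clmns)
    duplicates = [col for col, count in col_counts.items() if count > 1]
    if duplicates:
        # Old behavior raised e.g.:
        #   ValueError: Duplicate column names detected: {'name', 'age'}
        # New behavior also echoes the original header row:
        #   ValueError: Duplicate column names detected: ['name', 'age']
        #   From: ['name', 'age', 'name', 'city', 'age']
        raise ValueError(f"Duplicate column names detected: {duplicates}\nFrom: {clmns}")
```

As a side effect, `Counter` keeps the check linear in the number of columns, whereas the replaced `list(clmns).count(col)` scan was quadratic.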

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Author: HaiyangP
Date: 2025-06-10 10:16:38 +08:00 (committed by GitHub)
Parent: 8fb6b5d945
Commit: baf32ee461


```diff
@@ -20,6 +20,8 @@ from io import BytesIO
 from xpinyin import Pinyin
 import numpy as np
 import pandas as pd
+from collections import Counter
 # from openpyxl import load_workbook, Workbook
 from dateutil.parser import parse as datetime_parse
@@ -30,8 +32,7 @@ from deepdoc.parser import ExcelParser
 class Excel(ExcelParser):
-    def __call__(self, fnm, binary=None, from_page=0,
-                 to_page=10000000000, callback=None):
+    def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None):
         if not binary:
             wb = Excel._load_excel_to_workbook(fnm)
         else:
@@ -49,10 +50,7 @@ class Excel(ExcelParser):
                 continue
             headers = [cell.value for cell in rows[0]]
             missed = set([i for i, h in enumerate(headers) if h is None])
-            headers = [
-                cell.value for i,
-                cell in enumerate(
-                    rows[0]) if i not in missed]
+            headers = [cell.value for i, cell in enumerate(rows[0]) if i not in missed]
             if not headers:
                 continue
             data = []
@@ -62,9 +60,7 @@ class Excel(ExcelParser):
                     continue
                 if rn - 1 >= to_page:
                     break
-                row = [
-                    cell.value for ii,
-                    cell in enumerate(r) if ii not in missed]
+                row = [cell.value for ii, cell in enumerate(r) if ii not in missed]
                 if len(row) != len(headers):
                     fails.append(str(i))
                     continue
@@ -74,8 +70,7 @@ class Excel(ExcelParser):
                 continue
             res.append(pd.DataFrame(np.array(data), columns=headers))
-        callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
-            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
         return res
@@ -87,8 +82,7 @@ def trans_datatime(s):
 def trans_bool(s):
-    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$",
-                str(s).strip(), flags=re.IGNORECASE):
+    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
         return "yes"
     if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
         return "no"
@@ -97,8 +91,7 @@ def trans_bool(s):
 def column_data_type(arr):
     arr = list(arr)
     counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
-    trans = {t: f for f, t in
-             [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
+    trans = {t: f for f, t in [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
     for a in arr:
         if a is None:
             continue
@@ -127,31 +120,25 @@ def column_data_type(arr):
     return arr, ty
-def chunk(filename, binary=None, from_page=0, to_page=10000000000,
-          lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese", callback=None, **kwargs):
     """
         Excel and csv(txt) format files are supported.
         For csv or txt file, the delimiter between columns is TAB.
         The first line must be column headers.
         Column headers must be meaningful terms inorder to make our NLP model understanding.
         It's good to enumerate some synonyms using slash '/' to separate, and even better to
         enumerate values using brackets like 'gender/sex(male, female)'.
         Here are some examples for headers:
             1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
             2. 姓名/名字\t电话/手机/微信\t最高学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）
         Every row in table will be treated as a chunk.
     """
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = Excel()
-        dfs = excel_parser(
-            filename,
-            binary,
-            from_page=from_page,
-            to_page=to_page,
-            callback=callback)
+        dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = get_text(filename, binary)
@@ -170,40 +157,29 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                 continue
             rows.append(row)
-        callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (
-            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
         dfs = [pd.DataFrame(np.array(rows), columns=headers)]
     else:
-        raise NotImplementedError(
-            "file type not supported yet(excel, text, csv supported)")
+        raise NotImplementedError("file type not supported yet(excel, text, csv supported)")
     res = []
     PY = Pinyin()
-    fieds_map = {
-        "text": "_tks",
-        "int": "_long",
-        "keyword": "_kwd",
-        "float": "_flt",
-        "datetime": "_dt",
-        "bool": "_kwd"}
+    fieds_map = {"text": "_tks", "int": "_long", "keyword": "_kwd", "float": "_flt", "datetime": "_dt", "bool": "_kwd"}
     for df in dfs:
         for n in ["id", "_id", "index", "idx"]:
             if n in df.columns:
                 del df[n]
         clmns = df.columns.values
         if len(clmns) != len(set(clmns)):
-            duplicates = [col for col in clmns if list(clmns).count(col) > 1]
-            raise ValueError(f"Duplicate column names detected: {set(duplicates)}")
+            col_counts = Counter(clmns)
+            duplicates = [col for col, count in col_counts.items() if count > 1]
+            if duplicates:
+                raise ValueError(f"Duplicate column names detected: {duplicates}\nFrom: {clmns}")
         txts = list(copy.deepcopy(clmns))
-        py_clmns = [
-            PY.get_pinyins(
-                re.sub(
-                    r"(/.*|（[^（）]+?）|\([^()]+?\))",
-                    "",
-                    str(n)),
-                '_')[0] for n in clmns]
+        py_clmns = [PY.get_pinyins(re.sub(r"(/.*|（[^（）]+?）|\([^()]+?\))", "", str(n)), "_")[0] for n in clmns]
         clmn_tys = []
         for j in range(len(clmns)):
             cln, ty = column_data_type(df[clmns[j]])
@@ -211,15 +187,11 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
             df[clmns[j]] = cln
             if ty == "text":
                 txts.extend([str(c) for c in cln if c])
-        clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " "))
-                     for i in range(len(clmns))]
+        clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) for i in range(len(clmns))]
         eng = lang.lower() == "english"  # is_english(txts)
         for ii, row in df.iterrows():
-            d = {
-                "docnm_kwd": filename,
-                "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
-            }
+            d = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
             row_txt = []
             for j in range(len(clmns)):
                 if row[clmns[j]] is None:
@@ -229,16 +201,14 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                 if not isinstance(row[clmns[j]], pd.Series) and pd.isna(row[clmns[j]]):
                     continue
                 fld = clmns_map[j][0]
-                d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
-                    row[clmns[j]])
+                d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(row[clmns[j]])
                 row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
             if not row_txt:
                 continue
             tokenize(d, "; ".join(row_txt), eng)
             res.append(d)
-    KnowledgebaseService.update_parser_config(
-        kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
+    KnowledgebaseService.update_parser_config(kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
     callback(0.35, "")
     return res
```
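
For reference, the `chunk()` docstring above expects a TAB-delimited header line for txt/csv input. A small, invented example of content this parser would accept, and of a header that would now trip the clearer duplicate-column error (values are made up; they only show the expected shape of the input):

```python
# Illustrative TAB-delimited content for the txt/csv branch of chunk().
good_txt = (
    "supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)\n"
    "ACME\tred\tfemale\tXL\n"
    "Globex\tyellow\tmale\tM\n"
)

# A header row that repeats a column name, e.g.
#   "supplier/vendor\tcolor(yellow, red, brown)\tcolor(yellow, red, brown)",
# would now fail fast with a ValueError naming the duplicated column
# and echoing the full header row it came from.
```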