diff --git a/rag/app/table.py b/rag/app/table.py
index 14facd169..450cd6280 100644
--- a/rag/app/table.py
+++ b/rag/app/table.py
@@ -20,6 +20,8 @@ from io import BytesIO
 
 from xpinyin import Pinyin
 import numpy as np
 import pandas as pd
+from collections import Counter
+
 # from openpyxl import load_workbook, Workbook
 from dateutil.parser import parse as datetime_parse
@@ -30,8 +32,7 @@ from deepdoc.parser import ExcelParser
 
 
 class Excel(ExcelParser):
-    def __call__(self, fnm, binary=None, from_page=0,
-                 to_page=10000000000, callback=None):
+    def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None):
         if not binary:
             wb = Excel._load_excel_to_workbook(fnm)
         else:
@@ -49,10 +50,7 @@ class Excel(ExcelParser):
                 continue
             headers = [cell.value for cell in rows[0]]
             missed = set([i for i, h in enumerate(headers) if h is None])
-            headers = [
-                cell.value for i,
-                cell in enumerate(
-                    rows[0]) if i not in missed]
+            headers = [cell.value for i, cell in enumerate(rows[0]) if i not in missed]
             if not headers:
                 continue
             data = []
@@ -62,9 +60,7 @@ class Excel(ExcelParser):
                     continue
                 if rn - 1 >= to_page:
                     break
-                row = [
-                    cell.value for ii,
-                    cell in enumerate(r) if ii not in missed]
+                row = [cell.value for ii, cell in enumerate(r) if ii not in missed]
                 if len(row) != len(headers):
                     fails.append(str(i))
                     continue
@@ -74,8 +70,7 @@ class Excel(ExcelParser):
                 continue
             res.append(pd.DataFrame(np.array(data), columns=headers))
 
-        callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
-            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
         return res
 
 
@@ -87,8 +82,7 @@ def trans_datatime(s):
 
 
 def trans_bool(s):
-    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$",
-                str(s).strip(), flags=re.IGNORECASE):
+    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
         return "yes"
     if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
         return "no"
@@ -97,8 +91,7 @@ def trans_bool(s):
 def column_data_type(arr):
     arr = list(arr)
     counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
-    trans = {t: f for f, t in
-             [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
+    trans = {t: f for f, t in [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
     for a in arr:
         if a is None:
             continue
@@ -127,31 +120,25 @@ def column_data_type(arr):
     return arr, ty
 
 
-def chunk(filename, binary=None, from_page=0, to_page=10000000000,
-          lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese", callback=None, **kwargs):
     """
-    Excel and csv(txt) format files are supported.
-    For csv or txt file, the delimiter between columns is TAB.
-    The first line must be column headers.
-    Column headers must be meaningful terms inorder to make our NLP model understanding.
-    It's good to enumerate some synonyms using slash '/' to separate, and even better to
-    enumerate values using brackets like 'gender/sex(male, female)'.
-    Here are some examples for headers:
+    Excel and csv (txt) format files are supported.
+    For csv or txt files, the delimiter between columns is TAB.
+    The first line must contain the column headers.
+    Column headers must be meaningful terms in order to help our NLP model understand them.
+    It's good to enumerate synonyms separated by slash '/', and even better to
+    enumerate values in brackets like 'gender/sex(male, female)'.
+    Here are some example headers:
     1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
     2. 姓名/名字\t电话/手机/微信\t最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)
 
-    Every row in table will be treated as a chunk.
+    Every row in the table will be treated as a chunk.
     """
 
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = Excel()
-        dfs = excel_parser(
-            filename,
-            binary,
-            from_page=from_page,
-            to_page=to_page,
-            callback=callback)
+        dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = get_text(filename, binary)
@@ -170,40 +157,29 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                 continue
             rows.append(row)
 
-        callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (
-            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
 
         dfs = [pd.DataFrame(np.array(rows), columns=headers)]
     else:
-        raise NotImplementedError(
-            "file type not supported yet(excel, text, csv supported)")
+        raise NotImplementedError("file type not supported yet(excel, text, csv supported)")
 
 
     res = []
     PY = Pinyin()
-    fieds_map = {
-        "text": "_tks",
-        "int": "_long",
-        "keyword": "_kwd",
-        "float": "_flt",
-        "datetime": "_dt",
-        "bool": "_kwd"}
+    fieds_map = {"text": "_tks", "int": "_long", "keyword": "_kwd", "float": "_flt", "datetime": "_dt", "bool": "_kwd"}
     for df in dfs:
        for n in ["id", "_id", "index", "idx"]:
            if n in df.columns:
                del df[n]
        clmns = df.columns.values
        if len(clmns) != len(set(clmns)):
-            duplicates = [col for col in clmns if list(clmns).count(col) > 1]
-            raise ValueError(f"Duplicate column names detected: {set(duplicates)}")
+            col_counts = Counter(clmns)
+            duplicates = [col for col, count in col_counts.items() if count > 1]
+            if duplicates:
+                raise ValueError(f"Duplicate column names detected: {duplicates}\nFrom: {clmns}")
+
        txts = list(copy.deepcopy(clmns))
-        py_clmns = [
-            PY.get_pinyins(
-                re.sub(
-                    r"(/.*|（[^（）]+?）|\([^()]+?\))",
-                    "",
-                    str(n)),
-                '_')[0] for n in clmns]
+        py_clmns = [PY.get_pinyins(re.sub(r"(/.*|（[^（）]+?）|\([^()]+?\))", "", str(n)), "_")[0] for n in clmns]
        clmn_tys = []
        for j in range(len(clmns)):
            cln, ty = column_data_type(df[clmns[j]])
@@ -211,15 +187,11 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
             df[clmns[j]] = cln
             if ty == "text":
                 txts.extend([str(c) for c in cln if c])
-        clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " "))
-                     for i in range(len(clmns))]
+        clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) for i in range(len(clmns))]
 
         eng = lang.lower() == "english"  # is_english(txts)
         for ii, row in df.iterrows():
-            d = {
-                "docnm_kwd": filename,
-                "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
-            }
+            d = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
             row_txt = []
             for j in range(len(clmns)):
                 if row[clmns[j]] is None:
@@ -229,16 +201,14 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                     continue
                 if not isinstance(row[clmns[j]], pd.Series) and pd.isna(row[clmns[j]]):
                     continue
                 fld = clmns_map[j][0]
-                d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
-                    row[clmns[j]])
+                d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(row[clmns[j]])
                 row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
             if not row_txt:
                 continue
             tokenize(d, "; ".join(row_txt), eng)
             res.append(d)
-    KnowledgebaseService.update_parser_config(
-        kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
+    KnowledgebaseService.update_parser_config(kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
 
     callback(0.35, "")
     return res
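
Reviewer note: the docstring this patch rewraps defines the input contract for the txt/csv path. For reference while reviewing, here is a minimal input that satisfies it (the file name and values are made up for illustration):

    # Headers first, TAB-delimited, one row per future chunk;
    # "sample.csv" is a hypothetical name.
    rows = [
        "supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)",
        "ACME\tred\tfemale",
    ]
    with open("sample.csv", "w", encoding="utf-8") as f:
        f.write("\n".join(rows) + "\n")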
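
Apart from the line joining, the only behavioral change is the duplicate-column check: the old list.count() scan was quadratic and reported a set, while the new Counter pass is linear and reports every duplicate together with the full column list. A standalone sketch of the new logic (the helper name is ours, not the patch's):

    from collections import Counter

    def find_duplicate_columns(clmns):
        # One pass over the header names; the replaced list.count()
        # version rescanned the whole list once per column (O(n^2)).
        col_counts = Counter(clmns)
        return [col for col, count in col_counts.items() if count > 1]

    assert find_duplicate_columns(["name", "phone", "name"]) == ["name"]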
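
The joined py_clmns line is easier to check against an example: each header is stripped of '/' synonyms and bracketed enumerations, romanized with xpinyin, lower-cased, and suffixed with the column-type suffix from fieds_map. A minimal sketch assuming a text column (so the suffix is "_tks"):

    import re
    from xpinyin import Pinyin

    PY = Pinyin()
    header = "电话/手机/微信"  # "phone/mobile/WeChat"
    # Strip synonyms and bracketed value lists, as the patch's regex does.
    name = re.sub(r"(/.*|（[^（）]+?）|\([^()]+?\))", "", header)
    print(PY.get_pinyins(name, "_")[0].lower() + "_tks")  # -> dian_hua_tks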