Mirror of https://github.com/infiniflow/ragflow.git
Synced 2025-12-08 20:42:30 +08:00
Fix errors detected by Ruff (#3918)
### What problem does this PR solve?

Fix errors detected by Ruff.

### Type of change

- [x] Refactoring
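Most of the changes below are mechanical rewrites of three recurring Ruff findings: E701 (multiple statements on one line after a colon), E721 (comparing types with `==`/`!=` instead of `isinstance`), and E741 (the ambiguous variable name `l`). A minimal before/after sketch of the pattern, with hypothetical function names that are not taken from the ragflow codebase:

```python
# Before: the style Ruff flags.
def clean_before(dg):
    if not dg: return []                 # E701: statement on the same line as the "if"
    if type(dg) == type(""): dg = [dg]   # E721: use isinstance for type checks
    l = [d.strip() for d in dg]          # E741: "l" is easily confused with "1" or "I"
    return l


# After: the shape this commit rewrites everything into.
def clean_after(dg):
    if not dg:
        return []
    if isinstance(dg, str):
        dg = [dg]
    lines = [d.strip() for d in dg]
    return lines
```

Ruff reports all three with `ruff check`; splitting the one-liners can also fall out of a formatter pass, while renames like `l` to `line` have to be applied by hand, since Ruff has no auto-fix for E741.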
@@ -18,4 +18,16 @@ from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
 from .json_parser import RAGFlowJsonParser as JsonParser
 from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
 from .txt_parser import RAGFlowTxtParser as TxtParser
+
+__all__ = [
+    "PdfParser",
+    "PlainParser",
+    "DocxParser",
+    "ExcelParser",
+    "PptParser",
+    "HtmlParser",
+    "JsonParser",
+    "MarkdownParser",
+    "TxtParser",
+]
@@ -29,7 +29,8 @@ class RAGFlowExcelParser:
         for sheetname in wb.sheetnames:
             ws = wb[sheetname]
             rows = list(ws.rows)
-            if not rows: continue
+            if not rows:
+                continue
 
             tb_rows_0 = "<tr>"
             for t in list(rows[0]):
@@ -40,7 +41,9 @@ class RAGFlowExcelParser:
             tb = ""
             tb += f"<table><caption>{sheetname}</caption>"
             tb += tb_rows_0
-            for r in list(rows[1 + chunk_i * chunk_rows:1 + (chunk_i + 1) * chunk_rows]):
+            for r in list(
+                rows[1 + chunk_i * chunk_rows : 1 + (chunk_i + 1) * chunk_rows]
+            ):
                 tb += "<tr>"
                 for i, c in enumerate(r):
                     if c.value is None:
@@ -62,20 +65,21 @@ class RAGFlowExcelParser:
         for sheetname in wb.sheetnames:
             ws = wb[sheetname]
             rows = list(ws.rows)
-            if not rows:continue
+            if not rows:
+                continue
             ti = list(rows[0])
             for r in list(rows[1:]):
-                l = []
+                fields = []
                 for i, c in enumerate(r):
                     if not c.value:
                         continue
                     t = str(ti[i].value) if i < len(ti) else ""
                     t += (":" if t else "") + str(c.value)
-                    l.append(t)
-                l = "; ".join(l)
+                    fields.append(t)
+                line = "; ".join(fields)
                 if sheetname.lower().find("sheet") < 0:
-                    l += " ——" + sheetname
-                res.append(l)
+                    line += " ——" + sheetname
+                res.append(line)
         return res
 
     @staticmethod
@@ -36,7 +36,7 @@ class RAGFlowHtmlParser:
 
     @classmethod
     def parser_txt(cls, txt):
-        if type(txt) != str:
+        if not isinstance(txt, str):
             raise TypeError("txt type should be str!")
         html_doc = readability.Document(txt)
         title = html_doc.title()
@@ -22,7 +22,7 @@ class RAGFlowJsonParser:
         txt = binary.decode(encoding, errors="ignore")
         json_data = json.loads(txt)
         chunks = self.split_json(json_data, True)
-        sections = [json.dumps(l, ensure_ascii=False) for l in chunks if l]
+        sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
         return sections
 
     @staticmethod
@@ -752,7 +752,7 @@ class RAGFlowPdfParser:
             "x1": np.max([b["x1"] for b in bxs]),
             "bottom": np.max([b["bottom"] for b in bxs]) - ht
         }
-        louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
+        louts = [layout for layout in self.page_layout[pn] if layout["type"] == ltype]
         ii = Recognizer.find_overlapped(b, louts, naive=True)
         if ii is not None:
             b = louts[ii]
@@ -763,7 +763,8 @@ class RAGFlowPdfParser:
                     "layoutno", "")))
 
         left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
-        if right < left: right = left + 1
+        if right < left:
+            right = left + 1
         poss.append((pn + self.page_from, left, right, top, bott))
         return self.page_images[pn] \
             .crop((left * ZM, top * ZM,
@@ -845,7 +846,8 @@ class RAGFlowPdfParser:
         top = bx["top"] - self.page_cum_height[pn[0] - 1]
         bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
         page_images_cnt = len(self.page_images)
-        if pn[-1] - 1 >= page_images_cnt: return ""
+        if pn[-1] - 1 >= page_images_cnt:
+            return ""
         while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
             bott -= self.page_images[pn[-1] - 1].size[1] / ZM
             pn.append(pn[-1] + 1)
@@ -889,7 +891,6 @@ class RAGFlowPdfParser:
             nonlocal mh, pw, lines, widths
             lines.append(line)
             widths.append(width(line))
-            width_mean = np.mean(widths)
             mmj = self.proj_match(
                 line["text"]) or line.get(
                 "layout_type",
@@ -994,7 +995,7 @@ class RAGFlowPdfParser:
         else:
             self.is_english = False
 
-        st = timer()
+        # st = timer()
         for i, img in enumerate(self.page_images_x2):
             chars = self.page_chars[i] if not self.is_english else []
             self.mean_height.append(
@@ -1028,8 +1029,8 @@ class RAGFlowPdfParser:
 
         self.page_cum_height = np.cumsum(self.page_cum_height)
         assert len(self.page_cum_height) == len(self.page_images) + 1
-        if len(self.boxes) == 0 and zoomin < 9: self.__images__(fnm, zoomin * 3, page_from,
-                                                                page_to, callback)
+        if len(self.boxes) == 0 and zoomin < 9:
+            self.__images__(fnm, zoomin * 3, page_from, page_to, callback)
 
     def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
         self.__images__(fnm, zoomin)
@@ -1168,7 +1169,7 @@ class PlainParser(object):
         if not self.outlines:
             logging.warning("Miss outlines")
 
-        return [(l, "") for l in lines], []
+        return [(line, "") for line in lines], []
 
     def crop(self, ck, need_position):
         raise NotImplementedError
@@ -15,21 +15,42 @@ import datetime
 
 
 def refactor(cv):
-    for n in ["raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"]:
-        if n in cv and cv[n] is not None: del cv[n]
+    for n in [
+        "raw_txt",
+        "parser_name",
+        "inference",
+        "ori_text",
+        "use_time",
+        "time_stat",
+    ]:
+        if n in cv and cv[n] is not None:
+            del cv[n]
     cv["is_deleted"] = 0
-    if "basic" not in cv: cv["basic"] = {}
-    if cv["basic"].get("photo2"): del cv["basic"]["photo2"]
+    if "basic" not in cv:
+        cv["basic"] = {}
+    if cv["basic"].get("photo2"):
+        del cv["basic"]["photo2"]
 
-    for n in ["education", "work", "certificate", "project", "language", "skill", "training"]:
-        if n not in cv or cv[n] is None: continue
-        if type(cv[n]) == type({}): cv[n] = [v for _, v in cv[n].items()]
-        if type(cv[n]) != type([]):
+    for n in [
+        "education",
+        "work",
+        "certificate",
+        "project",
+        "language",
+        "skill",
+        "training",
+    ]:
+        if n not in cv or cv[n] is None:
+            continue
+        if isinstance(cv[n], dict):
+            cv[n] = [v for _, v in cv[n].items()]
+        if not isinstance(cv[n], list):
             del cv[n]
             continue
         vv = []
         for v in cv[n]:
-            if "external" in v and v["external"] is not None: del v["external"]
+            if "external" in v and v["external"] is not None:
+                del v["external"]
             vv.append(v)
         cv[n] = {str(i): vv[i] for i in range(len(vv))}
 
@@ -42,24 +63,44 @@ def refactor(cv):
             cv["basic"][t] = cv["basic"][n]
             del cv["basic"][n]
 
-    work = sorted([v for _, v in cv.get("work", {}).items()], key=lambda x: x.get("start_time", ""))
-    edu = sorted([v for _, v in cv.get("education", {}).items()], key=lambda x: x.get("start_time", ""))
+    work = sorted(
+        [v for _, v in cv.get("work", {}).items()],
+        key=lambda x: x.get("start_time", ""),
+    )
+    edu = sorted(
+        [v for _, v in cv.get("education", {}).items()],
+        key=lambda x: x.get("start_time", ""),
+    )
 
     if work:
         cv["basic"]["work_start_time"] = work[0].get("start_time", "")
-        cv["basic"]["management_experience"] = 'Y' if any(
-            [w.get("management_experience", '') == 'Y' for w in work]) else 'N'
+        cv["basic"]["management_experience"] = (
+            "Y"
+            if any([w.get("management_experience", "") == "Y" for w in work])
+            else "N"
+        )
         cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0")
 
-        for n in ["annual_salary_from", "annual_salary_to", "industry_name", "position_name", "responsibilities",
-                  "corporation_type", "scale", "corporation_name"]:
+        for n in [
+            "annual_salary_from",
+            "annual_salary_to",
+            "industry_name",
+            "position_name",
+            "responsibilities",
+            "corporation_type",
+            "scale",
+            "corporation_name",
+        ]:
             cv["basic"][n] = work[-1].get(n, "")
 
     if edu:
         for n in ["school_name", "discipline_name"]:
-            if n in edu[-1]: cv["basic"][n] = edu[-1][n]
+            if n in edu[-1]:
+                cv["basic"][n] = edu[-1][n]
 
     cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    if "contact" not in cv: cv["contact"] = {}
-    if not cv["contact"].get("name"): cv["contact"]["name"] = cv["basic"].get("name", "")
+    if "contact" not in cv:
+        cv["contact"] = {}
+    if not cv["contact"].get("name"):
+        cv["contact"]["name"] = cv["basic"].get("name", "")
     return cv
@@ -21,13 +21,18 @@ from . import regions
 
 
 current_file_path = os.path.dirname(os.path.abspath(__file__))
-GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
+GOODS = pd.read_csv(
+    os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0
+).fillna(0)
 GOODS["cid"] = GOODS["cid"].astype(str)
 GOODS = GOODS.set_index(["cid"])
-CORP_TKS = json.load(open(os.path.join(current_file_path, "res/corp.tks.freq.json"), "r"))
+CORP_TKS = json.load(
+    open(os.path.join(current_file_path, "res/corp.tks.freq.json"), "r")
+)
 GOOD_CORP = json.load(open(os.path.join(current_file_path, "res/good_corp.json"), "r"))
 CORP_TAG = json.load(open(os.path.join(current_file_path, "res/corp_tag.json"), "r"))
 
 
 def baike(cid, default_v=0):
     global GOODS
     try:
@@ -39,27 +44,41 @@ def baike(cid, default_v=0):
 
 def corpNorm(nm, add_region=True):
     global CORP_TKS
-    if not nm or type(nm)!=type(""):return ""
+    if not nm or not isinstance(nm, str):
+        return ""
     nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
     nm = re.sub(r"&amp;", "&", nm)
     nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
-    nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
-    nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
-    if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
+    nm = re.sub(
+        r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE
+    )
+    nm = re.sub(
+        r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
+        "",
+        nm,
+        10000,
+        re.IGNORECASE,
+    )
+    if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
+        return nm
 
     tks = rag_tokenizer.tokenize(nm).split()
-    reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
+    reg = [t for i, t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
     nm = ""
     for t in tks:
-        if regions.isName(t) or t in CORP_TKS:continue
-        if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):nm += " "
+        if regions.isName(t) or t in CORP_TKS:
+            continue
+        if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):
+            nm += " "
         nm += t
 
     r = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", nm.strip())
-    if r:nm = r.group(1)
+    if r:
+        nm = r.group(1)
     r = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", nm.strip())
-    if r:nm = r.group(1)
-    return nm.strip() + (("" if not reg else "(%s)"%reg[0]) if add_region else "")
+    if r:
+        nm = r.group(1)
+    return nm.strip() + (("" if not reg else "(%s)" % reg[0]) if add_region else "")
 
 
 def rmNoise(n):
@@ -67,33 +86,40 @@ def rmNoise(n):
     n = re.sub(r"[,. &()()]+", "", n)
     return n
 
 
 GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
-for c,v in CORP_TAG.items():
+for c, v in CORP_TAG.items():
     cc = corpNorm(rmNoise(c), False)
     if not cc:
         logging.debug(c)
-CORP_TAG = {corpNorm(rmNoise(c), False):v for c,v in CORP_TAG.items()}
+CORP_TAG = {corpNorm(rmNoise(c), False): v for c, v in CORP_TAG.items()}
 
 
 def is_good(nm):
     global GOOD_CORP
-    if nm.find("外派")>=0:return False
+    if nm.find("外派") >= 0:
+        return False
     nm = rmNoise(nm)
     nm = corpNorm(nm, False)
     for n in GOOD_CORP:
         if re.match(r"[0-9a-zA-Z]+$", n):
-            if n == nm: return True
-        elif nm.find(n)>=0:return True
+            if n == nm:
+                return True
+        elif nm.find(n) >= 0:
+            return True
     return False
 
 
 def corp_tag(nm):
     global CORP_TAG
     nm = rmNoise(nm)
     nm = corpNorm(nm, False)
     for n in CORP_TAG.keys():
         if re.match(r"[0-9a-zA-Z., ]+$", n):
-            if n == nm: return CORP_TAG[n]
-        elif nm.find(n)>=0:
-            if len(n)<3 and len(nm)/len(n)>=2:continue
+            if n == nm:
+                return CORP_TAG[n]
+        elif nm.find(n) >= 0:
+            if len(n) < 3 and len(nm) / len(n) >= 2:
+                continue
             return CORP_TAG[n]
     return []
 
@@ -11,27 +11,31 @@
 # limitations under the License.
 #
 
-TBL = {"94":"EMBA",
-       "6":"MBA",
-       "95":"MPA",
-       "92":"专升本",
-       "4":"专科",
-       "90":"中专",
-       "91":"中技",
-       "86":"初中",
-       "3":"博士",
-       "10":"博士后",
-       "1":"本科",
-       "2":"硕士",
-       "87":"职高",
-       "89":"高中"
+TBL = {
+    "94": "EMBA",
+    "6": "MBA",
+    "95": "MPA",
+    "92": "专升本",
+    "4": "专科",
+    "90": "中专",
+    "91": "中技",
+    "86": "初中",
+    "3": "博士",
+    "10": "博士后",
+    "1": "本科",
+    "2": "硕士",
+    "87": "职高",
+    "89": "高中",
 }
 
-TBL_ = {v:k for k,v in TBL.items()}
+TBL_ = {v: k for k, v in TBL.items()}
 
 
 def get_name(id):
     return TBL.get(str(id), "")
 
 
 def get_id(nm):
-    if not nm:return ""
+    if not nm:
+        return ""
     return TBL_.get(nm.upper().strip(), "")
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -16,8 +16,11 @@ import json
 import re
 import copy
 import pandas as pd
 
 current_file_path = os.path.dirname(os.path.abspath(__file__))
-TBL = pd.read_csv(os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0).fillna("")
+TBL = pd.read_csv(
+    os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0
+).fillna("")
 TBL["name_en"] = TBL["name_en"].map(lambda x: x.lower().strip())
 GOOD_SCH = json.load(open(os.path.join(current_file_path, "res/good_sch.json"), "r"))
 GOOD_SCH = set([re.sub(r"[,. &()()]+", "", c) for c in GOOD_SCH])
@@ -26,14 +29,15 @@ GOOD_SCH = set([re.sub(r"[,. &()()]+", "", c) for c in GOOD_SCH])
 def loadRank(fnm):
     global TBL
     TBL["rank"] = 1000000
-    with open(fnm, "r", encoding='utf-8') as f:
+    with open(fnm, "r", encoding="utf-8") as f:
         while True:
-            l = f.readline()
-            if not l:break
-            l = l.strip("\n").split(",")
+            line = f.readline()
+            if not line:
+                break
+            line = line.strip("\n").split(",")
             try:
-                nm,rk = l[0].strip(),int(l[1])
-                #assert len(TBL[((TBL.name_cn == nm) | (TBL.name_en == nm))]),f"<{nm}>"
+                nm, rk = line[0].strip(), int(line[1])
+                # assert len(TBL[((TBL.name_cn == nm) | (TBL.name_en == nm))]),f"<{nm}>"
                 TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk
             except Exception:
                 pass
@@ -44,27 +48,35 @@ loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
 
 def split(txt):
     tks = []
-    for t in re.sub(r"[ \t]+", " ",txt).split():
-        if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
-                re.match(r"[a-zA-Z]", t) and tks:
+    for t in re.sub(r"[ \t]+", " ", txt).split():
+        if (
+            tks
+            and re.match(r".*[a-zA-Z]$", tks[-1])
+            and re.match(r"[a-zA-Z]", t)
+            and tks
+        ):
             tks[-1] = tks[-1] + " " + t
-        else:tks.append(t)
+        else:
+            tks.append(t)
     return tks
 
 
 def select(nm):
     global TBL
-    if not nm:return
-    if isinstance(nm, list):nm = str(nm[0])
+    if not nm:
+        return
+    if isinstance(nm, list):
+        nm = str(nm[0])
     nm = split(nm)[0]
     nm = str(nm).lower().strip()
     nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
     nm = re.sub(r"(^the |[,.&()();;·]+|^(英国|美国|瑞士))", "", nm)
     nm = re.sub(r"大学.*学院", "大学", nm)
     tbl = copy.deepcopy(TBL)
-    tbl["hit_alias"] = tbl["alias"].map(lambda x:nm in set(x.split("+")))
-    res = tbl[((tbl.name_cn == nm) | (tbl.name_en == nm) | (tbl.hit_alias == True))]
-    if res.empty:return
+    tbl["hit_alias"] = tbl["alias"].map(lambda x: nm in set(x.split("+")))
+    res = tbl[((tbl.name_cn == nm) | (tbl.name_en == nm) | tbl.hit_alias)]
+    if res.empty:
+        return
 
     return json.loads(res.to_json(orient="records"))[0]
@@ -74,4 +86,3 @@ def is_good(nm):
     nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
     nm = re.sub(r"[''`‘’“”,. &()();;]+", "", nm)
     return nm in GOOD_SCH
-
@@ -25,7 +25,8 @@ from xpinyin import Pinyin
 from contextlib import contextmanager
 
 
-class TimeoutException(Exception): pass
+class TimeoutException(Exception):
+    pass
 
 
 @contextmanager
@@ -50,8 +51,10 @@ def rmHtmlTag(line):
 
 
 def highest_degree(dg):
-    if not dg: return ""
-    if type(dg) == type(""): dg = [dg]
+    if not dg:
+        return ""
+    if isinstance(dg, str):
+        dg = [dg]
     m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
     return sorted([(d, m.get(d, -1)) for d in dg], key=lambda x: x[1] * -1)[0][0]
 
@@ -68,10 +71,12 @@ def forEdu(cv):
     for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
         e = {}
         if n.get("end_time"):
-            if n["end_time"] > edu_end_dt: edu_end_dt = n["end_time"]
+            if n["end_time"] > edu_end_dt:
+                edu_end_dt = n["end_time"]
             try:
                 dt = n["end_time"]
-                if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
+                if re.match(r"[0-9]{9,}", dt):
+                    dt = turnTm2Dt(dt)
                 y, m, d = getYMD(dt)
                 ed_dt.append(str(y))
                 e["end_dt_kwd"] = str(y)
@@ -80,7 +85,8 @@ def forEdu(cv):
         if n.get("start_time"):
             try:
                 dt = n["start_time"]
-                if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
+                if re.match(r"[0-9]{9,}", dt):
+                    dt = turnTm2Dt(dt)
                 y, m, d = getYMD(dt)
                 st_dt.append(str(y))
                 e["start_dt_kwd"] = str(y)
@@ -89,13 +95,20 @@ def forEdu(cv):
 
         r = schools.select(n.get("school_name", ""))
         if r:
-            if str(r.get("type", "")) == "1": fea.append("211")
-            if str(r.get("type", "")) == "2": fea.append("211")
-            if str(r.get("is_abroad", "")) == "1": fea.append("留学")
-            if str(r.get("is_double_first", "")) == "1": fea.append("双一流")
-            if str(r.get("is_985", "")) == "1": fea.append("985")
-            if str(r.get("is_world_known", "")) == "1": fea.append("海外知名")
-            if r.get("rank") and cv["school_rank_int"] > r["rank"]: cv["school_rank_int"] = r["rank"]
+            if str(r.get("type", "")) == "1":
+                fea.append("211")
+            if str(r.get("type", "")) == "2":
+                fea.append("211")
+            if str(r.get("is_abroad", "")) == "1":
+                fea.append("留学")
+            if str(r.get("is_double_first", "")) == "1":
+                fea.append("双一流")
+            if str(r.get("is_985", "")) == "1":
+                fea.append("985")
+            if str(r.get("is_world_known", "")) == "1":
+                fea.append("海外知名")
+            if r.get("rank") and cv["school_rank_int"] > r["rank"]:
+                cv["school_rank_int"] = r["rank"]
 
         if n.get("school_name") and isinstance(n["school_name"], str):
             sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
@@ -106,22 +119,25 @@ def forEdu(cv):
             maj.append(n["discipline_name"])
             e["major_kwd"] = n["discipline_name"]
 
-        if not n.get("degree") and "985" in fea and not first_fea: n["degree"] = "1"
+        if not n.get("degree") and "985" in fea and not first_fea:
+            n["degree"] = "1"
 
         if n.get("degree"):
             d = degrees.get_name(n["degree"])
-            if d: e["degree_kwd"] = d
-            if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)",
-                                                                                                  n.get(
-                                                                                                      "school_name",
-                                                                                                      ""))): d = "专升本"
-            if d: deg.append(d)
+            if d:
+                e["degree_kwd"] = d
+            if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)", n.get("school_name", ""))):
+                d = "专升本"
+            if d:
+                deg.append(d)
 
             # for first degree
             if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
                 fdeg = [d]
-                if n.get("school_name"): fsch = [n["school_name"]]
-                if n.get("discipline_name"): fmaj = [n["discipline_name"]]
+                if n.get("school_name"):
+                    fsch = [n["school_name"]]
+                if n.get("discipline_name"):
+                    fmaj = [n["discipline_name"]]
                 first_fea = copy.deepcopy(fea)
 
         edu_nst.append(e)
@@ -140,16 +156,26 @@ def forEdu(cv):
         else:
             cv["sch_rank_kwd"].append("一般学校")
 
-    if edu_nst: cv["edu_nst"] = edu_nst
-    if fea: cv["edu_fea_kwd"] = list(set(fea))
-    if first_fea: cv["edu_first_fea_kwd"] = list(set(first_fea))
-    if maj: cv["major_kwd"] = maj
-    if fsch: cv["first_school_name_kwd"] = fsch
-    if fdeg: cv["first_degree_kwd"] = fdeg
-    if fmaj: cv["first_major_kwd"] = fmaj
-    if st_dt: cv["edu_start_kwd"] = st_dt
-    if ed_dt: cv["edu_end_kwd"] = ed_dt
-    if ed_dt: cv["edu_end_int"] = max([int(t) for t in ed_dt])
+    if edu_nst:
+        cv["edu_nst"] = edu_nst
+    if fea:
+        cv["edu_fea_kwd"] = list(set(fea))
+    if first_fea:
+        cv["edu_first_fea_kwd"] = list(set(first_fea))
+    if maj:
+        cv["major_kwd"] = maj
+    if fsch:
+        cv["first_school_name_kwd"] = fsch
+    if fdeg:
+        cv["first_degree_kwd"] = fdeg
+    if fmaj:
+        cv["first_major_kwd"] = fmaj
+    if st_dt:
+        cv["edu_start_kwd"] = st_dt
+    if ed_dt:
+        cv["edu_end_kwd"] = ed_dt
+    if ed_dt:
+        cv["edu_end_int"] = max([int(t) for t in ed_dt])
     if deg:
         if "本科" in deg and "专科" in deg:
             deg.append("专升本")
@@ -158,8 +184,10 @@ def forEdu(cv):
         cv["highest_degree_kwd"] = highest_degree(deg)
     if edu_end_dt:
         try:
-            if re.match(r"[0-9]{9,}", edu_end_dt): edu_end_dt = turnTm2Dt(edu_end_dt)
-            if edu_end_dt.strip("\n") == "至今": edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
+            if re.match(r"[0-9]{9,}", edu_end_dt):
+                edu_end_dt = turnTm2Dt(edu_end_dt)
+            if edu_end_dt.strip("\n") == "至今":
+                edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
             y, m, d = getYMD(edu_end_dt)
             cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
         except Exception as e:
@@ -171,7 +199,8 @@ def forEdu(cv):
             or not cv.get("degree_kwd"):
         for c in sch:
             if schools.is_good(c):
-                if "tag_kwd" not in cv: cv["tag_kwd"] = []
+                if "tag_kwd" not in cv:
+                    cv["tag_kwd"] = []
                 cv["tag_kwd"].append("好学校")
                 cv["tag_kwd"].append("好学历")
                 break
@@ -180,28 +209,39 @@ def forEdu(cv):
             any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
             or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
             or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
-        if "tag_kwd" not in cv: cv["tag_kwd"] = []
-        if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
+        if "tag_kwd" not in cv:
+            cv["tag_kwd"] = []
+        if "好学历" not in cv["tag_kwd"]:
+            cv["tag_kwd"].append("好学历")
 
-    if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
-    if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
-    if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
-    if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
+    if cv.get("major_kwd"):
+        cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
+    if cv.get("school_name_kwd"):
+        cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
+    if cv.get("first_school_name_kwd"):
+        cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
+    if cv.get("first_major_kwd"):
+        cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
 
     return cv
 
 
 def forProj(cv):
-    if not cv.get("project_obj"): return cv
+    if not cv.get("project_obj"):
+        return cv
 
     pro_nms, desc = [], []
     for i, n in enumerate(
-            sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if type(x) == type({}) else "",
+            sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if isinstance(x, dict) else "",
                    reverse=True)):
-        if n.get("name"): pro_nms.append(n["name"])
-        if n.get("describe"): desc.append(str(n["describe"]))
-        if n.get("responsibilities"): desc.append(str(n["responsibilities"]))
-        if n.get("achivement"): desc.append(str(n["achivement"]))
+        if n.get("name"):
+            pro_nms.append(n["name"])
+        if n.get("describe"):
+            desc.append(str(n["describe"]))
+        if n.get("responsibilities"):
+            desc.append(str(n["responsibilities"]))
+        if n.get("achivement"):
+            desc.append(str(n["achivement"]))
 
     if pro_nms:
         # cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
@@ -233,15 +273,16 @@ def forWork(cv):
     work_st_tm = ""
     corp_tags = []
    for i, n in enumerate(
-            sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if type(x) == type({}) else "",
+            sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if isinstance(x, dict) else "",
                    reverse=True)):
-        if type(n) == type(""):
+        if isinstance(n, str):
             try:
                 n = json_loads(n)
             except Exception:
                 continue
 
-        if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm): work_st_tm = n["start_time"]
+        if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm):
+            work_st_tm = n["start_time"]
         for c in flds:
             if not n.get(c) or str(n[c]) == '0':
                 fea[c].append("")
@@ -262,14 +303,18 @@ def forWork(cv):
                 fea[c].append(rmHtmlTag(str(n[c]).lower()))
 
         y, m, d = getYMD(n.get("start_time"))
-        if not y or not m: continue
+        if not y or not m:
+            continue
         st = "%s-%02d-%02d" % (y, int(m), int(d))
         latest_job_tm = st
 
         y, m, d = getYMD(n.get("end_time"))
-        if (not y or not m) and i > 0: continue
-        if not y or not m or int(y) > 2022: y, m, d = getYMD(str(n.get("updated_at", "")))
-        if not y or not m: continue
+        if (not y or not m) and i > 0:
+            continue
+        if not y or not m or int(y) > 2022:
+            y, m, d = getYMD(str(n.get("updated_at", "")))
+        if not y or not m:
+            continue
         ed = "%s-%02d-%02d" % (y, int(m), int(d))
 
         try:
@@ -279,22 +324,28 @@ def forWork(cv):
 
         if n.get("scale"):
             r = re.search(r"^([0-9]+)", str(n["scale"]))
-            if r: scales.append(int(r.group(1)))
+            if r:
+                scales.append(int(r.group(1)))
 
     if goodcorp:
-        if "tag_kwd" not in cv: cv["tag_kwd"] = []
+        if "tag_kwd" not in cv:
+            cv["tag_kwd"] = []
         cv["tag_kwd"].append("好公司")
     if goodcorp_:
-        if "tag_kwd" not in cv: cv["tag_kwd"] = []
+        if "tag_kwd" not in cv:
+            cv["tag_kwd"] = []
         cv["tag_kwd"].append("好公司(曾)")
 
     if corp_tags:
-        if "tag_kwd" not in cv: cv["tag_kwd"] = []
+        if "tag_kwd" not in cv:
+            cv["tag_kwd"] = []
         cv["tag_kwd"].extend(corp_tags)
         cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]
 
-    if latest_job_tm: cv["latest_job_dt"] = latest_job_tm
-    if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
+    if latest_job_tm:
+        cv["latest_job_dt"] = latest_job_tm
+    if fea["corporation_id"]:
+        cv["corporation_id"] = fea["corporation_id"]
 
     if fea["position_name"]:
         cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
@@ -317,18 +368,23 @@ def forWork(cv):
         cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
         cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
 
-    if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
-                                                               re.match(r"[^0-9]+$", str(i))]
-    if fea["subordinates_count"]: cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])
+    if fea["subordinates_count"]:
+        fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
+                                     re.match(r"[^0-9]+$", str(i))]
+    if fea["subordinates_count"]:
+        cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])
 
-    if type(cv.get("corporation_id")) == type(1): cv["corporation_id"] = [str(cv["corporation_id"])]
-    if not cv.get("corporation_id"): cv["corporation_id"] = []
+    if isinstance(cv.get("corporation_id"), int):
+        cv["corporation_id"] = [str(cv["corporation_id"])]
+    if not cv.get("corporation_id"):
+        cv["corporation_id"] = []
     for i in cv.get("corporation_id", []):
         cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)
 
     if work_st_tm:
         try:
-            if re.match(r"[0-9]{9,}", work_st_tm): work_st_tm = turnTm2Dt(work_st_tm)
+            if re.match(r"[0-9]{9,}", work_st_tm):
+                work_st_tm = turnTm2Dt(work_st_tm)
             y, m, d = getYMD(work_st_tm)
             cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
         except Exception as e:
@@ -339,28 +395,37 @@ def forWork(cv):
         cv["dua_flt"] = np.mean(duas)
         cv["cur_dua_int"] = duas[0]
         cv["job_num_int"] = len(duas)
-    if scales: cv["scale_flt"] = np.max(scales)
+    if scales:
+        cv["scale_flt"] = np.max(scales)
     return cv
 
 
 def turnTm2Dt(b):
-    if not b: return
+    if not b:
+        return
     b = str(b).strip()
-    if re.match(r"[0-9]{10,}", b): b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
+    if re.match(r"[0-9]{10,}", b):
+        b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
     return b
 
 
 def getYMD(b):
     y, m, d = "", "", "01"
-    if not b: return (y, m, d)
+    if not b:
+        return (y, m, d)
     b = turnTm2Dt(b)
-    if re.match(r"[0-9]{4}", b): y = int(b[:4])
+    if re.match(r"[0-9]{4}", b):
+        y = int(b[:4])
     r = re.search(r"[0-9]{4}.?([0-9]{1,2})", b)
-    if r: m = r.group(1)
+    if r:
+        m = r.group(1)
     r = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b)
-    if r: d = r.group(1)
-    if not d or int(d) == 0 or int(d) > 31: d = "1"
-    if not m or int(m) > 12 or int(m) < 1: m = "1"
+    if r:
+        d = r.group(1)
+    if not d or int(d) == 0 or int(d) > 31:
+        d = "1"
+    if not m or int(m) > 12 or int(m) < 1:
+        m = "1"
     return (y, m, d)
 
@@ -369,7 +434,8 @@ def birth(cv):
         cv["integerity_flt"] *= 0.9
         return cv
     y, m, d = getYMD(cv["birth"])
-    if not m or not y: return cv
+    if not m or not y:
+        return cv
     b = "%s-%02d-%02d" % (y, int(m), int(d))
     cv["birth_dt"] = b
     cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d))
@@ -380,7 +446,8 @@ def birth(cv):
 
 def parse(cv):
     for k in cv.keys():
-        if cv[k] == '\\N': cv[k] = ''
+        if cv[k] == '\\N':
+            cv[k] = ''
     # cv = cv.asDict()
     tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
                "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
@@ -402,9 +469,12 @@ def parse(cv):
 
     rmkeys = []
     for k in cv.keys():
-        if cv[k] is None: rmkeys.append(k)
-        if (type(cv[k]) == type([]) or type(cv[k]) == type("")) and len(cv[k]) == 0: rmkeys.append(k)
-    for k in rmkeys: del cv[k]
+        if cv[k] is None:
+            rmkeys.append(k)
+        if (isinstance(cv[k], list) or isinstance(cv[k], str)) and len(cv[k]) == 0:
+            rmkeys.append(k)
+    for k in rmkeys:
+        del cv[k]
 
     integerity = 0.
     flds_num = 0.
@@ -414,7 +484,8 @@ def parse(cv):
         flds_num += len(flds)
         for f in flds:
             v = str(cv.get(f, ""))
-            if len(v) > 0 and v != '0' and v != '[]': integerity += 1
+            if len(v) > 0 and v != '0' and v != '[]':
+                integerity += 1
 
     hasValues(tks_fld)
     hasValues(small_tks_fld)
@@ -433,7 +504,8 @@ def parse(cv):
             (r"[ ()\(\)人/·0-9-]+", ""),
             (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
             cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
-        if len(cv["corporation_type"]) < 2: del cv["corporation_type"]
+        if len(cv["corporation_type"]) < 2:
+            del cv["corporation_type"]
 
     if cv.get("political_status"):
         for p, r in [
@@ -441,9 +513,11 @@ def parse(cv):
             (r".*(无党派|公民).*", "群众"),
             (r".*团员.*", "团员")]:
             cv["political_status"] = re.sub(p, r, cv["political_status"])
-        if not re.search(r"[党团群]", cv["political_status"]): del cv["political_status"]
+        if not re.search(r"[党团群]", cv["political_status"]):
+            del cv["political_status"]
 
-    if cv.get("phone"): cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))
+    if cv.get("phone"):
+        cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))
 
     keys = list(cv.keys())
     for k in keys:
@@ -454,9 +528,11 @@ def parse(cv):
             cv[k] = [a for _, a in cv[k].items()]
             nms = []
             for n in cv[k]:
-                if type(n) != type({}) or "name" not in n or not n.get("name"): continue
+                if not isinstance(n, dict) or "name" not in n or not n.get("name"):
+                    continue
                 n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
-                if not n["name"]: continue
+                if not n["name"]:
+                    continue
                 nms.append(n["name"])
             if nms:
                 t = k[:-4]
@@ -469,15 +545,18 @@ def parse(cv):
         # tokenize fields
         if k in tks_fld:
             cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
-            if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
+            if k in small_tks_fld:
+                cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
 
         # keyword fields
-        if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
+        if k in kwd_fld:
+            cv[f"{k}_kwd"] = [n.lower()
                                            for n in re.split(r"[\t,,;;. ]",
                                                              re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
                                                              ) if n]
 
-        if k in num_fld and cv.get(k): cv[f"{k}_int"] = cv[k]
+        if k in num_fld and cv.get(k):
+            cv[f"{k}_int"] = cv[k]
 
     cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
     # for name field
@@ -501,10 +580,12 @@ def parse(cv):
     cv["name_py_pref0_tks"] = ""
     cv["name_py_pref_tks"] = ""
     for py in PY.get_pinyins(nm[:20], ''):
-        for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
+        for i in range(2, len(py) + 1):
+            cv["name_py_pref_tks"] += " " + py[:i]
     for py in PY.get_pinyins(nm[:20], ' '):
         py = py.split()
-        for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])
+        for i in range(1, len(py) + 1):
+            cv["name_py_pref0_tks"] += " " + "".join(py[:i])
 
     cv["name_kwd"] = name
     cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
@@ -526,22 +607,30 @@ def parse(cv):
         cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
     else:
         y, m, d = getYMD(str(cv.get("updated_at", "")))
-        if not y: y = "2012"
-        if not m: m = "01"
-        if not d: d = "01"
+        if not y:
+            y = "2012"
+        if not m:
+            m = "01"
+        if not d:
+            d = "01"
         cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
     # long text tokenize
 
-    if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
+    if cv.get("responsibilities"):
+        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
 
     # for yes or no field
     fea = []
     for f, y, n in is_fld:
-        if f not in cv: continue
-        if cv[f] == '是': fea.append(y)
-        if cv[f] == '否': fea.append(n)
+        if f not in cv:
+            continue
+        if cv[f] == '是':
+            fea.append(y)
+        if cv[f] == '否':
+            fea.append(n)
 
-    if fea: cv["tag_kwd"] = fea
+    if fea:
+        cv["tag_kwd"] = fea
 
     cv = forEdu(cv)
     cv = forProj(cv)
@@ -550,9 +639,11 @@ def parse(cv):
 
     cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
     for i in range(len(cv["corp_proj_sch_deg_kwd"])):
-        for j in cv.get("sch_rank_kwd", []): cv["corp_proj_sch_deg_kwd"][i] += "+" + j
+        for j in cv.get("sch_rank_kwd", []):
+            cv["corp_proj_sch_deg_kwd"][i] += "+" + j
     for i in range(len(cv["corp_proj_sch_deg_kwd"])):
-        if cv.get("highest_degree_kwd"): cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]
+        if cv.get("highest_degree_kwd"):
+            cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]
 
     try:
         if not cv.get("work_exp_flt") and cv.get("work_start_time"):
@@ -565,17 +656,21 @@ def parse(cv):
             cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
     except Exception as e:
         logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
-    if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
+    if "work_exp_flt" not in cv and cv.get("work_experience", 0):
+        cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
 
     keys = list(cv.keys())
     for k in keys:
-        if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k): del cv[k]
+        if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k):
+            del cv[k]
     for k in cv.keys():
-        if not re.search("_(kwd|id)$", k) or type(cv[k]) != type([]): continue
+        if not re.search("_(kwd|id)$", k) or not isinstance(cv[k], list):
+            continue
         cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
     keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
     for k in keys:
-        if cv[k] <= 0: del cv[k]
+        if cv[k] <= 0:
+            del cv[k]
 
     cv["tob_resume_id"] = str(cv["tob_resume_id"])
     cv["id"] = cv["tob_resume_id"]
@@ -592,5 +687,6 @@ def dealWithInt64(d):
     if isinstance(d, list):
         d = [dealWithInt64(t) for t in d]
 
-    if isinstance(d, np.integer): d = int(d)
+    if isinstance(d, np.integer):
+        d = int(d)
     return d
@@ -51,6 +51,7 @@ class RAGFlowTxtParser:
         dels = [d for d in dels if d]
         dels = "|".join(dels)
         secs = re.split(r"(%s)" % dels, txt)
-        for sec in secs: add_chunk(sec)
+        for sec in secs:
+            add_chunk(sec)
 
         return [[c, ""] for c in cks]