mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Format file format from Windows/dos to Unix (#1949)
### What problem does this PR solve? The related source files were in Windows/DOS (CRLF) line-ending format; they are converted to Unix (LF) format. ### Type of change - [x] Refactoring Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
@ -1,61 +1,61 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from io import BytesIO
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
class RAGFlowPptParser(object):
    """Extract plain text from .pptx slides (text frames, tables, grouped shapes)."""

    def __init__(self):
        super().__init__()

    def __extract(self, shape):
        """Return the text carried by a single shape, or None when it has none.

        Handles three shape kinds:
          * 19 (table): each data row rendered as "header: cell; ..." pairs,
            using row 0 as the header row.
          * any shape with a text frame: the raw frame text.
          * 6 (group): recurse into children in top-to-bottom, left-to-right order.
        """
        if shape.shape_type == 19:
            tb = shape.table
            rows = []
            # Row 0 is treated as the header; pair each cell with its column header.
            for i in range(1, len(tb.rows)):
                rows.append("; ".join([tb.cell(
                    0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
            return "\n".join(rows)

        if shape.has_text_frame:
            return shape.text_frame.text

        if shape.shape_type == 6:
            texts = []
            # Fix: a child shape's `top` can be None (same guard already used in
            # __call__); sorting on `None // 10` would raise TypeError.
            for p in sorted(shape.shapes,
                            key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
                t = self.__extract(p)
                if t:
                    texts.append(t)
            return "\n".join(texts)

    def __call__(self, fnm, from_page, to_page, callback=None):
        """Return a list of per-slide text blobs for slides in [from_page, to_page).

        `fnm` may be a file path (str) or the raw bytes of a .pptx file.
        Also records the deck size in `self.total_page` as a side effect.
        """
        ppt = Presentation(fnm) if isinstance(fnm, str) else Presentation(BytesIO(fnm))
        txts = []
        self.total_page = len(ppt.slides)
        for i, slide in enumerate(ppt.slides):
            if i < from_page:
                continue
            if i >= to_page:
                break
            texts = []
            # Visual reading order: bucket by vertical position (bands of 10 EMU),
            # then left-to-right within a band.
            for shape in sorted(
                    slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
                txt = self.__extract(shape)
                if txt:
                    texts.append(txt)
            txts.append("\n".join(texts))

        return txts
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from io import BytesIO
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
class RAGFlowPptParser(object):
    """Pulls the textual content out of PowerPoint (.pptx) slide decks."""

    def __init__(self):
        super().__init__()

    def __extract(self, shape):
        """Return the text of one shape, or None when the shape kind is unknown."""
        # Table shape: render every data row as "header: value" pairs,
        # taking row 0 as the header row.
        if shape.shape_type == 19:
            tb = shape.table
            n_cols = len(tb.columns)
            lines = []
            for r in range(1, len(tb.rows)):
                cells = [tb.cell(0, c).text + ": " + tb.cell(r, c).text
                         for c in range(n_cols) if tb.cell(r, c)]
                lines.append("; ".join(cells))
            return "\n".join(lines)

        if shape.has_text_frame:
            return shape.text_frame.text

        # Group shape: visit children top-to-bottom, then left-to-right.
        if shape.shape_type == 6:
            ordered = sorted(shape.shapes, key=lambda s: (s.top // 10, s.left))
            collected = [t for t in (self.__extract(s) for s in ordered) if t]
            return "\n".join(collected)

    def __call__(self, fnm, from_page, to_page, callback=None):
        """Return one text blob per slide for slides in [from_page, to_page).

        `fnm` is either a file path or raw .pptx bytes; `self.total_page`
        is set to the number of slides in the deck.
        """
        if isinstance(fnm, str):
            ppt = Presentation(fnm)
        else:
            ppt = Presentation(BytesIO(fnm))
        self.total_page = len(ppt.slides)
        txts = []
        for idx, slide in enumerate(ppt.slides):
            if idx < from_page:
                continue
            if idx >= to_page:
                break
            # Reading order: vertical bands of 10 EMU, then left-to-right.
            order_key = lambda s: ((s.top if s.top is not None else 0) // 10, s.left)
            pieces = []
            for shp in sorted(slide.shapes, key=order_key):
                content = self.__extract(shp)
                if content:
                    pieces.append(content)
            txts.append("\n".join(pieces))
        return txts
|
||||
|
||||
@ -1,65 +1,65 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import datetime
|
||||
|
||||
|
||||
def refactor(cv):
    """Normalize a parsed resume dict in place and return it.

    Drops bookkeeping fields, canonicalizes every section container to an
    index-keyed dict ({"0": item, "1": item, ...}), and lifts summary
    attributes of the earliest/latest work and education entries into
    cv["basic"].
    """
    # Drop raw/bookkeeping payloads that should not be persisted.
    for n in ["raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"]:
        if n in cv and cv[n] is not None:
            del cv[n]
    cv["is_deleted"] = 0
    if "basic" not in cv:
        cv["basic"] = {}
    if cv["basic"].get("photo2"):
        del cv["basic"]["photo2"]

    # Canonicalize each section to an index-keyed dict, stripping "external" blobs.
    for n in ["education", "work", "certificate", "project", "language", "skill", "training"]:
        if n not in cv or cv[n] is None:
            continue
        if type(cv[n]) == type({}):
            cv[n] = [v for _, v in cv[n].items()]
        if type(cv[n]) != type([]):
            del cv[n]
            continue
        vv = []
        for v in cv[n]:
            if "external" in v and v["external"] is not None:
                del v["external"]
            vv.append(v)
        cv[n] = {str(i): vv[i] for i in range(len(vv))}

    # Rename a couple of basic fields to their canonical names.
    basics = [
        ("basic_salary_month", "salary_month"),
        ("expect_annual_salary_from", "expect_annual_salary"),
    ]
    for n, t in basics:
        if cv["basic"].get(n):
            cv["basic"][t] = cv["basic"][n]
            del cv["basic"][n]

    work = sorted([v for _, v in cv.get("work", {}).items()], key=lambda x: x.get("start_time", ""))
    edu = sorted([v for _, v in cv.get("education", {}).items()], key=lambda x: x.get("start_time", ""))

    if work:
        cv["basic"]["work_start_time"] = work[0].get("start_time", "")
        cv["basic"]["management_experience"] = 'Y' if any(
            [w.get("management_experience", '') == 'Y' for w in work]) else 'N'
        cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0")

        # Copy the most recent job's attributes up into basic.
        for n in ["annual_salary_from", "annual_salary_to", "industry_name", "position_name",
                  "responsibilities", "corporation_type", "scale", "corporation_name"]:
            cv["basic"][n] = work[-1].get(n, "")

    if edu:
        for n in ["school_name", "discipline_name"]:
            if n in edu[-1]:
                cv["basic"][n] = edu[-1][n]

    cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    if "contact" not in cv:
        cv["contact"] = {}
    if not cv["contact"].get("name"):
        cv["contact"]["name"] = cv["basic"].get("name", "")
    # Fix: hand the normalized dict back to the caller — this copy was missing
    # the `return cv` present in the sibling version of this function.
    return cv
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import datetime
|
||||
|
||||
|
||||
def refactor(cv):
    """Clean up a parsed resume dict in place; returns the same dict.

    Removes transient fields, reshapes every section into an index-keyed
    dict, and promotes headline attributes of the newest work/education
    records into cv["basic"].
    """
    # Transient parsing artifacts are not persisted.
    for key in ["raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"]:
        if cv.get(key) is not None:
            del cv[key]
    cv["is_deleted"] = 0
    cv.setdefault("basic", {})
    if cv["basic"].get("photo2"):
        del cv["basic"]["photo2"]

    # Normalize every list-like section to {"0": item, "1": item, ...}.
    for sec in ["education", "work", "certificate", "project", "language", "skill", "training"]:
        if cv.get(sec) is None:
            continue
        if type(cv[sec]) is dict:
            cv[sec] = list(cv[sec].values())
        if type(cv[sec]) is not list:
            del cv[sec]
            continue
        records = []
        for rec in cv[sec]:
            if "external" in rec and rec["external"] is not None:
                del rec["external"]
            records.append(rec)
        cv[sec] = {str(idx): item for idx, item in enumerate(records)}

    # Canonical field renames inside basic.
    for old, new in (("basic_salary_month", "salary_month"),
                     ("expect_annual_salary_from", "expect_annual_salary")):
        if cv["basic"].get(old):
            cv["basic"][new] = cv["basic"].pop(old)

    jobs = sorted(cv.get("work", {}).values(), key=lambda r: r.get("start_time", ""))
    schools = sorted(cv.get("education", {}).values(), key=lambda r: r.get("start_time", ""))

    if jobs:
        latest = jobs[-1]
        cv["basic"]["work_start_time"] = jobs[0].get("start_time", "")
        managed = any(j.get("management_experience", '') == 'Y' for j in jobs)
        cv["basic"]["management_experience"] = 'Y' if managed else 'N'
        cv["basic"]["annual_salary"] = latest.get("annual_salary_from", "0")

        # The most recent job supplies the headline employment attributes.
        for fld in ["annual_salary_from", "annual_salary_to", "industry_name",
                    "position_name", "responsibilities", "corporation_type",
                    "scale", "corporation_name"]:
            cv["basic"][fld] = latest.get(fld, "")

    if schools:
        newest = schools[-1]
        for fld in ["school_name", "discipline_name"]:
            if fld in newest:
                cv["basic"][fld] = newest[fld]

    cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    cv.setdefault("contact", {})
    if not cv["contact"].get("name"):
        cv["contact"]["name"] = cv["basic"].get("name", "")
    return cv
|
||||
@ -1,4 +1,4 @@
|
||||
清华大学,2,985,清华
|
||||
清华大学,2,985,清华
|
||||
清华大学,2,985,Tsinghua University
|
||||
清华大学,2,985,THU
|
||||
北京大学,1,985,北大
|
||||
|
||||
|
@ -1,186 +1,186 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import json
|
||||
from deepdoc.parser.resume.entities import degrees, regions, industries
|
||||
|
||||
# Output schema for the flattened resume: "<column_name> <type>" pairs.
# NOTE(review): refactor() zips these names against alphabetically sorted
# dataframe columns, so the order here appears intentional — do not reorder.
FIELDS = [
    # identity / contact
    "address STRING",
    "annual_salary int",
    "annual_salary_from int",
    "annual_salary_to int",
    "birth STRING",
    "card STRING",
    "certificate_obj string",
    "city STRING",
    "corporation_id int",
    "corporation_name STRING",
    "corporation_type STRING",
    "degree STRING",
    "discipline_name STRING",
    "education_obj string",
    "email STRING",
    # expectations
    "expect_annual_salary int",
    "expect_city_names string",
    "expect_industry_name STRING",
    "expect_position_name STRING",
    "expect_salary_from int",
    "expect_salary_to int",
    "expect_type STRING",
    "gender STRING",
    "industry_name STRING",
    "industry_names STRING",
    # boolean-ish flags
    "is_deleted STRING",
    "is_fertility STRING",
    "is_house STRING",
    "is_management_experience STRING",
    "is_marital STRING",
    "is_oversea STRING",
    "language_obj string",
    "name STRING",
    "nation STRING",
    "phone STRING",
    "political_status STRING",
    "position_name STRING",
    "project_obj string",
    "responsibilities string",
    "salary_month int",
    "scale STRING",
    "school_name STRING",
    "self_remark string",
    "skill_obj string",
    "title_name STRING",
    "tob_resume_id STRING",
    "updated_at Timestamp",
    "wechat STRING",
    # work history
    "work_obj string",
    "work_experience int",
    "work_start_time BIGINT"
]
|
||||
|
||||
def refactor(df):
    """Flatten a one-row resume dataframe into the FIELDS schema.

    Parses the JSON in df["resume_content"], explodes contact/basic
    sub-objects into flat columns, localizes a few coded values, scrubs
    whitespace, and returns a {field_name: value} dict built from the FIRST
    row only. NOTE(review): `df.values.tolist()[0]` assumes df always holds
    a single resume — confirm with callers.
    """
    def deal_obj(obj, k, kk):
        # Safe two-level lookup obj[k][kk]; returns "" when either level
        # is missing or not a dict.
        if not isinstance(obj, type({})):
            return ""
        obj = obj.get(k, {})
        if not isinstance(obj, type({})):
            return ""
        return obj.get(kk, "")

    def loadjson(line):
        # Best-effort JSON parse: malformed content degrades to {} instead
        # of failing the whole row.
        try:
            return json.loads(line)
        except Exception as e:
            pass
        return {}

    df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
    df.fillna("", inplace=True)

    # Column names accumulated for the final output.
    clms = ["tob_resume_id", "updated_at"]

    def extract(nms, cc=None):
        # Promote the fields in `nms` out of the parsed object into flat
        # dataframe columns; `cc` selects a sub-object ("contact"/"basic").
        nonlocal clms
        clms.extend(nms)
        for c in nms:
            if cc:
                df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
            else:
                # Whole-section fields: serialized back to JSON when the value
                # is a dict (or empty); otherwise stringified with the literal
                # "None" scrubbed out.
                df[c] = df["obj"].map(
                    lambda x: json.dumps(
                        x.get(
                            c,
                            {}),
                        ensure_ascii=False) if isinstance(
                        x,
                        type(
                            {})) and (
                        isinstance(
                            x.get(c),
                            type(
                                {})) or not x.get(c)) else str(x).replace(
                        "None",
                        ""))

    extract(["education", "work", "certificate", "project", "language",
             "skill"])
    extract(["wechat", "phone", "is_deleted",
             "name", "tel", "email"], "contact")
    extract(["nation", "expect_industry_name", "salary_month",
             "industry_ids", "is_house", "birth", "annual_salary_from",
             "annual_salary_to", "card",
             "expect_salary_to", "expect_salary_from",
             "expect_position_name", "gender", "city",
             "is_fertility", "expect_city_names",
             "political_status", "title_name", "expect_annual_salary",
             "industry_name", "address", "position_name", "school_name",
             "corporation_id",
             "is_oversea", "responsibilities",
             "work_start_time", "degree", "management_experience",
             "expect_type", "corporation_type", "scale", "corporation_name",
             "self_remark", "annual_salary", "work_experience",
             "discipline_name", "marital", "updated_at"], "basic")

    # Decode coded ids to display names via the entity lookup tables.
    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
    df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
    df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in
                                                                      str(x).split(",")]))
    clms.append("industry_names")

    def arr2str(a):
        # Normalize a possibly-list value into a comma-free display string.
        if not a:
            return ""
        if isinstance(a, list):
            a = " ".join([str(i) for i in a])
        return str(a).replace(",", " ")

    df["expect_industry_name"] = df["expect_industry_name"].map(
        lambda x: arr2str(x))
    # Localize coded flags to Chinese display values (M/F -> 男/女, Y/N -> 是/否).
    df["gender"] = df["gender"].map(
        lambda x: "男" if x == 'M' else (
            "女" if x == 'F' else ""))
    for c in ["is_fertility", "is_oversea", "is_house",
              "management_experience", "marital"]:
        df[c] = df[c].map(
            lambda x: '是' if x == 'Y' else (
                '否' if x == 'N' else ""))
    df["is_management_experience"] = df["management_experience"]
    df["is_marital"] = df["marital"]
    clms.extend(["is_management_experience", "is_marital"])

    df.fillna("", inplace=True)
    # Fall back to the landline ("tel") when no mobile phone was found.
    for i in range(len(df)):
        if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
            df.loc[i, "phone"] = df.loc[i, "tel"].strip()

    # Drop intermediate columns that were renamed or merged above.
    for n in ["industry_ids", "management_experience", "marital", "tel"]:
        for i in range(len(clms)):
            if clms[i] == n:
                del clms[i]
                break

    clms = list(set(clms))

    # Reorder columns alphabetically before zipping with FIELDS below.
    # NOTE(review): this relies on sorted(clms) matching the FIELDS name
    # order positionally — verify, since FIELDS is not strictly sorted
    # (e.g. "work_obj" precedes "work_experience").
    df = df.reindex(sorted(clms), axis=1)
    #print(json.dumps(list(df.columns.values)), "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL")
    for c in clms:
        # Scrub separators so the values survive TSV-style storage.
        df[c] = df[c].map(
            lambda s: str(s).replace(
                "\t",
                " ").replace(
                "\n",
                "\\n").replace(
                "\r",
                "\\n"))
    # print(df.values.tolist())
    return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import json
|
||||
from deepdoc.parser.resume.entities import degrees, regions, industries
|
||||
|
||||
# Flattened-resume output schema: each entry is "<column_name> <type>".
# NOTE(review): refactor() zips these names against alphabetically sorted
# dataframe columns, so the order here appears intentional — do not reorder.
FIELDS = [
    "address STRING",
    "annual_salary int",
    "annual_salary_from int",
    "annual_salary_to int",
    "birth STRING",
    "card STRING",
    "certificate_obj string",
    "city STRING",
    "corporation_id int",
    "corporation_name STRING",
    "corporation_type STRING",
    "degree STRING",
    "discipline_name STRING",
    "education_obj string",
    "email STRING",
    "expect_annual_salary int",
    "expect_city_names string",
    "expect_industry_name STRING",
    "expect_position_name STRING",
    "expect_salary_from int",
    "expect_salary_to int",
    "expect_type STRING",
    "gender STRING",
    "industry_name STRING",
    "industry_names STRING",
    "is_deleted STRING",
    "is_fertility STRING",
    "is_house STRING",
    "is_management_experience STRING",
    "is_marital STRING",
    "is_oversea STRING",
    "language_obj string",
    "name STRING",
    "nation STRING",
    "phone STRING",
    "political_status STRING",
    "position_name STRING",
    "project_obj string",
    "responsibilities string",
    "salary_month int",
    "scale STRING",
    "school_name STRING",
    "self_remark string",
    "skill_obj string",
    "title_name STRING",
    "tob_resume_id STRING",
    "updated_at Timestamp",
    "wechat STRING",
    "work_obj string",
    "work_experience int",
    "work_start_time BIGINT",
]
|
||||
|
||||
def refactor(df):
    """Flatten a one-row resume dataframe into the FIELDS schema.

    Parses the JSON in df["resume_content"], explodes contact/basic
    sub-objects into flat columns, localizes a few coded values, scrubs
    whitespace, and returns a {field_name: value} dict built from the FIRST
    row only. NOTE(review): `df.values.tolist()[0]` assumes df always holds
    a single resume — confirm with callers.
    """
    def deal_obj(obj, k, kk):
        # Safe two-level lookup obj[k][kk]; returns "" when either level
        # is missing or not a dict.
        if not isinstance(obj, type({})):
            return ""
        obj = obj.get(k, {})
        if not isinstance(obj, type({})):
            return ""
        return obj.get(kk, "")

    def loadjson(line):
        # Best-effort JSON parse: malformed content degrades to {} instead
        # of failing the whole row.
        try:
            return json.loads(line)
        except Exception as e:
            pass
        return {}

    df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
    df.fillna("", inplace=True)

    # Column names accumulated for the final output.
    clms = ["tob_resume_id", "updated_at"]

    def extract(nms, cc=None):
        # Promote the fields in `nms` out of the parsed object into flat
        # dataframe columns; `cc` selects a sub-object ("contact"/"basic").
        nonlocal clms
        clms.extend(nms)
        for c in nms:
            if cc:
                df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
            else:
                # Whole-section fields: serialized back to JSON when the value
                # is a dict (or empty); otherwise stringified with the literal
                # "None" scrubbed out.
                df[c] = df["obj"].map(
                    lambda x: json.dumps(
                        x.get(
                            c,
                            {}),
                        ensure_ascii=False) if isinstance(
                        x,
                        type(
                            {})) and (
                        isinstance(
                            x.get(c),
                            type(
                                {})) or not x.get(c)) else str(x).replace(
                        "None",
                        ""))

    extract(["education", "work", "certificate", "project", "language",
             "skill"])
    extract(["wechat", "phone", "is_deleted",
             "name", "tel", "email"], "contact")
    extract(["nation", "expect_industry_name", "salary_month",
             "industry_ids", "is_house", "birth", "annual_salary_from",
             "annual_salary_to", "card",
             "expect_salary_to", "expect_salary_from",
             "expect_position_name", "gender", "city",
             "is_fertility", "expect_city_names",
             "political_status", "title_name", "expect_annual_salary",
             "industry_name", "address", "position_name", "school_name",
             "corporation_id",
             "is_oversea", "responsibilities",
             "work_start_time", "degree", "management_experience",
             "expect_type", "corporation_type", "scale", "corporation_name",
             "self_remark", "annual_salary", "work_experience",
             "discipline_name", "marital", "updated_at"], "basic")

    # Decode coded ids to display names via the entity lookup tables.
    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
    df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
    df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in
                                                                      str(x).split(",")]))
    clms.append("industry_names")

    def arr2str(a):
        # Normalize a possibly-list value into a comma-free display string.
        if not a:
            return ""
        if isinstance(a, list):
            a = " ".join([str(i) for i in a])
        return str(a).replace(",", " ")

    df["expect_industry_name"] = df["expect_industry_name"].map(
        lambda x: arr2str(x))
    # Localize coded flags to Chinese display values (M/F -> 男/女, Y/N -> 是/否).
    df["gender"] = df["gender"].map(
        lambda x: "男" if x == 'M' else (
            "女" if x == 'F' else ""))
    for c in ["is_fertility", "is_oversea", "is_house",
              "management_experience", "marital"]:
        df[c] = df[c].map(
            lambda x: '是' if x == 'Y' else (
                '否' if x == 'N' else ""))
    df["is_management_experience"] = df["management_experience"]
    df["is_marital"] = df["marital"]
    clms.extend(["is_management_experience", "is_marital"])

    df.fillna("", inplace=True)
    # Fall back to the landline ("tel") when no mobile phone was found.
    for i in range(len(df)):
        if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
            df.loc[i, "phone"] = df.loc[i, "tel"].strip()

    # Drop intermediate columns that were renamed or merged above.
    for n in ["industry_ids", "management_experience", "marital", "tel"]:
        for i in range(len(clms)):
            if clms[i] == n:
                del clms[i]
                break

    clms = list(set(clms))

    # Reorder columns alphabetically before zipping with FIELDS below.
    # NOTE(review): this relies on sorted(clms) matching the FIELDS name
    # order positionally — verify, since FIELDS is not strictly sorted
    # (e.g. "work_obj" precedes "work_experience").
    df = df.reindex(sorted(clms), axis=1)
    #print(json.dumps(list(df.columns.values)), "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL")
    for c in clms:
        # Scrub separators so the values survive TSV-style storage.
        df[c] = df[c].map(
            lambda s: str(s).replace(
                "\t",
                " ").replace(
                "\n",
                "\\n").replace(
                "\r",
                "\\n"))
    # print(df.values.tolist())
    return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user