Convert file format from Windows/DOS to Unix (#1949)

### What problem does this PR solve?

The affected source files were in Windows/DOS (CRLF) format; they have been converted to Unix (LF) format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
2026-01-30 23:26:36 +08:00 · 2024-08-15 09:17:36 +08:00
parent 1328d715db
commit 6b3a40be5c
108 changed files with 36399 additions and 36399 deletions
--- a/deepdoc/parser/ppt_parser.py
+++ b/deepdoc/parser/ppt_parser.py
@@ -1,61 +1,61 @@
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-from io import BytesIO
-from pptx import Presentation
-
-
-class RAGFlowPptParser(object):
-    def __init__(self):
-        super().__init__()
-
-    def __extract(self, shape):
-        if shape.shape_type == 19:
-            tb = shape.table
-            rows = []
-            for i in range(1, len(tb.rows)):
-                rows.append("; ".join([tb.cell(
-                    0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
-            return "\n".join(rows)
-
-        if shape.has_text_frame:
-            return shape.text_frame.text
-
-        if shape.shape_type == 6:
-            texts = []
-            for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
-                t = self.__extract(p)
-                if t:
-                    texts.append(t)
-            return "\n".join(texts)
-
-    def __call__(self, fnm, from_page, to_page, callback=None):
-        ppt = Presentation(fnm) if isinstance(
-            fnm, str) else Presentation(
-            BytesIO(fnm))
-        txts = []
-        self.total_page = len(ppt.slides)
-        for i, slide in enumerate(ppt.slides):
-            if i < from_page:
-                continue
-            if i >= to_page:
-                break
-            texts = []
-            for shape in sorted(
-                    slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
-                txt = self.__extract(shape)
-                if txt:
-                    texts.append(txt)
-            txts.append("\n".join(texts))
-
-        return txts
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from io import BytesIO
+from pptx import Presentation
+
+
+class RAGFlowPptParser(object):
+    def __init__(self):
+        super().__init__()
+
+    def __extract(self, shape):
+        if shape.shape_type == 19:
+            tb = shape.table
+            rows = []
+            for i in range(1, len(tb.rows)):
+                rows.append("; ".join([tb.cell(
+                    0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
+            return "\n".join(rows)
+
+        if shape.has_text_frame:
+            return shape.text_frame.text
+
+        if shape.shape_type == 6:
+            texts = []
+            for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
+                t = self.__extract(p)
+                if t:
+                    texts.append(t)
+            return "\n".join(texts)
+
+    def __call__(self, fnm, from_page, to_page, callback=None):
+        ppt = Presentation(fnm) if isinstance(
+            fnm, str) else Presentation(
+            BytesIO(fnm))
+        txts = []
+        self.total_page = len(ppt.slides)
+        for i, slide in enumerate(ppt.slides):
+            if i < from_page:
+                continue
+            if i >= to_page:
+                break
+            texts = []
+            for shape in sorted(
+                    slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
+                txt = self.__extract(shape)
+                if txt:
+                    texts.append(txt)
+            txts.append("\n".join(texts))
+
+        return txts
--- a/deepdoc/parser/resume/__init__.py
+++ b/deepdoc/parser/resume/__init__.py
@@ -1,65 +1,65 @@
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import datetime
-
-
-def refactor(cv):
-    for n in ["raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"]:
-        if n in cv and cv[n] is not None: del cv[n]
-    cv["is_deleted"] = 0
-    if "basic" not in cv: cv["basic"] = {}
-    if cv["basic"].get("photo2"): del cv["basic"]["photo2"]
-
-    for n in ["education", "work", "certificate", "project", "language", "skill", "training"]:
-        if n not in cv or cv[n] is None: continue
-        if type(cv[n]) == type({}): cv[n] = [v for _, v in cv[n].items()]
-        if type(cv[n]) != type([]):
-            del cv[n]
-            continue
-        vv = []
-        for v in cv[n]:
-            if "external" in v and v["external"] is not None: del v["external"]
-            vv.append(v)
-        cv[n] = {str(i): vv[i] for i in range(len(vv))}
-
-    basics = [
-        ("basic_salary_month", "salary_month"),
-        ("expect_annual_salary_from", "expect_annual_salary"),
-    ]
-    for n, t in basics:
-        if cv["basic"].get(n):
-            cv["basic"][t] = cv["basic"][n]
-            del cv["basic"][n]
-
-    work = sorted([v for _, v in cv.get("work", {}).items()], key=lambda x: x.get("start_time", ""))
-    edu = sorted([v for _, v in cv.get("education", {}).items()], key=lambda x: x.get("start_time", ""))
-
-    if work:
-        cv["basic"]["work_start_time"] = work[0].get("start_time", "")
-        cv["basic"]["management_experience"] = 'Y' if any(
-            [w.get("management_experience", '') == 'Y' for w in work]) else 'N'
-        cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0")
-
-        for n in ["annual_salary_from", "annual_salary_to", "industry_name", "position_name", "responsibilities",
-                  "corporation_type", "scale", "corporation_name"]:
-            cv["basic"][n] = work[-1].get(n, "")
-
-    if edu:
-        for n in ["school_name", "discipline_name"]:
-            if n in edu[-1]: cv["basic"][n] = edu[-1][n]
-
-    cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    if "contact" not in cv: cv["contact"] = {}
-    if not cv["contact"].get("name"): cv["contact"]["name"] = cv["basic"].get("name", "")
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import datetime
+
+
+def refactor(cv):
+    for n in ["raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"]:
+        if n in cv and cv[n] is not None: del cv[n]
+    cv["is_deleted"] = 0
+    if "basic" not in cv: cv["basic"] = {}
+    if cv["basic"].get("photo2"): del cv["basic"]["photo2"]
+
+    for n in ["education", "work", "certificate", "project", "language", "skill", "training"]:
+        if n not in cv or cv[n] is None: continue
+        if type(cv[n]) == type({}): cv[n] = [v for _, v in cv[n].items()]
+        if type(cv[n]) != type([]):
+            del cv[n]
+            continue
+        vv = []
+        for v in cv[n]:
+            if "external" in v and v["external"] is not None: del v["external"]
+            vv.append(v)
+        cv[n] = {str(i): vv[i] for i in range(len(vv))}
+
+    basics = [
+        ("basic_salary_month", "salary_month"),
+        ("expect_annual_salary_from", "expect_annual_salary"),
+    ]
+    for n, t in basics:
+        if cv["basic"].get(n):
+            cv["basic"][t] = cv["basic"][n]
+            del cv["basic"][n]
+
+    work = sorted([v for _, v in cv.get("work", {}).items()], key=lambda x: x.get("start_time", ""))
+    edu = sorted([v for _, v in cv.get("education", {}).items()], key=lambda x: x.get("start_time", ""))
+
+    if work:
+        cv["basic"]["work_start_time"] = work[0].get("start_time", "")
+        cv["basic"]["management_experience"] = 'Y' if any(
+            [w.get("management_experience", '') == 'Y' for w in work]) else 'N'
+        cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0")
+
+        for n in ["annual_salary_from", "annual_salary_to", "industry_name", "position_name", "responsibilities",
+                  "corporation_type", "scale", "corporation_name"]:
+            cv["basic"][n] = work[-1].get(n, "")
+
+    if edu:
+        for n in ["school_name", "discipline_name"]:
+            if n in edu[-1]: cv["basic"][n] = edu[-1][n]
+
+    cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    if "contact" not in cv: cv["contact"] = {}
+    if not cv["contact"].get("name"): cv["contact"]["name"] = cv["basic"].get("name", "")
    return cv
--- a/deepdoc/parser/resume/entities/res/school.rank.csv
+++ b/deepdoc/parser/resume/entities/res/school.rank.csv
@@ -1,4 +1,4 @@
-清华大学,2,985,清华
+清华大学,2,985,清华
 清华大学,2,985,Tsinghua University
 清华大学,2,985,THU
 北京大学,1,985,北大
--- a/deepdoc/parser/resume/step_one.py
+++ b/deepdoc/parser/resume/step_one.py
@@ -1,186 +1,186 @@
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import json
-from deepdoc.parser.resume.entities import degrees, regions, industries
-
-FIELDS = [
-"address STRING",
-"annual_salary int",
-"annual_salary_from int",
-"annual_salary_to int",
-"birth STRING",
-"card STRING",
-"certificate_obj string",
-"city STRING",
-"corporation_id int",
-"corporation_name STRING",
-"corporation_type STRING",
-"degree STRING",
-"discipline_name STRING",
-"education_obj string",
-"email STRING",
-"expect_annual_salary int",
-"expect_city_names string",
-"expect_industry_name STRING",
-"expect_position_name STRING",
-"expect_salary_from int",
-"expect_salary_to int",
-"expect_type STRING",
-"gender STRING",
-"industry_name STRING",
-"industry_names STRING",
-"is_deleted STRING",
-"is_fertility STRING",
-"is_house STRING",
-"is_management_experience STRING",
-"is_marital STRING",
-"is_oversea STRING",
-"language_obj string",
-"name STRING",
-"nation STRING",
-"phone STRING",
-"political_status STRING",
-"position_name STRING",
-"project_obj string",
-"responsibilities string",
-"salary_month int",
-"scale STRING",
-"school_name STRING",
-"self_remark string",
-"skill_obj string",
-"title_name STRING",
-"tob_resume_id STRING",
-"updated_at Timestamp",
-"wechat STRING",
-"work_obj string",
-"work_experience int",
-"work_start_time BIGINT"
-]
-
-def refactor(df):
-    def deal_obj(obj, k, kk):
-        if not isinstance(obj, type({})):
-            return ""
-        obj = obj.get(k, {})
-        if not isinstance(obj, type({})):
-            return ""
-        return obj.get(kk, "")
-
-    def loadjson(line):
-        try:
-            return json.loads(line)
-        except Exception as e:
-            pass
-        return {}
-
-    df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
-    df.fillna("", inplace=True)
-
-    clms = ["tob_resume_id", "updated_at"]
-
-    def extract(nms, cc=None):
-        nonlocal clms
-        clms.extend(nms)
-        for c in nms:
-            if cc:
-                df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
-            else:
-                df[c] = df["obj"].map(
-                    lambda x: json.dumps(
-                        x.get(
-                            c,
-                            {}),
-                        ensure_ascii=False) if isinstance(
-                        x,
-                        type(
-                            {})) and (
-                        isinstance(
-                            x.get(c),
-                            type(
-                                {})) or not x.get(c)) else str(x).replace(
-                                    "None",
-                        ""))
-
-    extract(["education", "work", "certificate", "project", "language",
-             "skill"])
-    extract(["wechat", "phone", "is_deleted",
-            "name", "tel", "email"], "contact")
-    extract(["nation", "expect_industry_name", "salary_month",
-             "industry_ids", "is_house", "birth", "annual_salary_from",
-             "annual_salary_to", "card",
-             "expect_salary_to", "expect_salary_from",
-             "expect_position_name", "gender", "city",
-             "is_fertility", "expect_city_names",
-             "political_status", "title_name", "expect_annual_salary",
-             "industry_name", "address", "position_name", "school_name",
-             "corporation_id",
-             "is_oversea", "responsibilities",
-             "work_start_time", "degree", "management_experience",
-             "expect_type", "corporation_type", "scale", "corporation_name",
-             "self_remark", "annual_salary", "work_experience",
-             "discipline_name", "marital", "updated_at"], "basic")
-
-    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
-    df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
-    df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in
-                                                                      str(x).split(",")]))
-    clms.append("industry_names")
-
-    def arr2str(a):
-        if not a:
-            return ""
-        if isinstance(a, list):
-            a = " ".join([str(i) for i in a])
-        return str(a).replace(",", " ")
-
-    df["expect_industry_name"] = df["expect_industry_name"].map(
-        lambda x: arr2str(x))
-    df["gender"] = df["gender"].map(
-        lambda x: "男" if x == 'M' else (
-            "女" if x == 'F' else ""))
-    for c in ["is_fertility", "is_oversea", "is_house",
-              "management_experience", "marital"]:
-        df[c] = df[c].map(
-            lambda x: '是' if x == 'Y' else (
-                '否' if x == 'N' else ""))
-    df["is_management_experience"] = df["management_experience"]
-    df["is_marital"] = df["marital"]
-    clms.extend(["is_management_experience", "is_marital"])
-
-    df.fillna("", inplace=True)
-    for i in range(len(df)):
-        if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
-            df.loc[i, "phone"] = df.loc[i, "tel"].strip()
-
-    for n in ["industry_ids", "management_experience", "marital", "tel"]:
-        for i in range(len(clms)):
-            if clms[i] == n:
-                del clms[i]
-                break
-
-    clms = list(set(clms))
-
-    df = df.reindex(sorted(clms), axis=1)
-    #print(json.dumps(list(df.columns.values)), "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL")
-    for c in clms:
-        df[c] = df[c].map(
-            lambda s: str(s).replace(
-                "\t",
-                " ").replace(
-                "\n",
-                "\\n").replace(
-                "\r",
-                "\\n"))
-    # print(df.values.tolist())
-    return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import json
+from deepdoc.parser.resume.entities import degrees, regions, industries
+
+FIELDS = [
+"address STRING",
+"annual_salary int",
+"annual_salary_from int",
+"annual_salary_to int",
+"birth STRING",
+"card STRING",
+"certificate_obj string",
+"city STRING",
+"corporation_id int",
+"corporation_name STRING",
+"corporation_type STRING",
+"degree STRING",
+"discipline_name STRING",
+"education_obj string",
+"email STRING",
+"expect_annual_salary int",
+"expect_city_names string",
+"expect_industry_name STRING",
+"expect_position_name STRING",
+"expect_salary_from int",
+"expect_salary_to int",
+"expect_type STRING",
+"gender STRING",
+"industry_name STRING",
+"industry_names STRING",
+"is_deleted STRING",
+"is_fertility STRING",
+"is_house STRING",
+"is_management_experience STRING",
+"is_marital STRING",
+"is_oversea STRING",
+"language_obj string",
+"name STRING",
+"nation STRING",
+"phone STRING",
+"political_status STRING",
+"position_name STRING",
+"project_obj string",
+"responsibilities string",
+"salary_month int",
+"scale STRING",
+"school_name STRING",
+"self_remark string",
+"skill_obj string",
+"title_name STRING",
+"tob_resume_id STRING",
+"updated_at Timestamp",
+"wechat STRING",
+"work_obj string",
+"work_experience int",
+"work_start_time BIGINT"
+]
+
+def refactor(df):
+    def deal_obj(obj, k, kk):
+        if not isinstance(obj, type({})):
+            return ""
+        obj = obj.get(k, {})
+        if not isinstance(obj, type({})):
+            return ""
+        return obj.get(kk, "")
+
+    def loadjson(line):
+        try:
+            return json.loads(line)
+        except Exception as e:
+            pass
+        return {}
+
+    df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
+    df.fillna("", inplace=True)
+
+    clms = ["tob_resume_id", "updated_at"]
+
+    def extract(nms, cc=None):
+        nonlocal clms
+        clms.extend(nms)
+        for c in nms:
+            if cc:
+                df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
+            else:
+                df[c] = df["obj"].map(
+                    lambda x: json.dumps(
+                        x.get(
+                            c,
+                            {}),
+                        ensure_ascii=False) if isinstance(
+                        x,
+                        type(
+                            {})) and (
+                        isinstance(
+                            x.get(c),
+                            type(
+                                {})) or not x.get(c)) else str(x).replace(
+                                    "None",
+                        ""))
+
+    extract(["education", "work", "certificate", "project", "language",
+             "skill"])
+    extract(["wechat", "phone", "is_deleted",
+            "name", "tel", "email"], "contact")
+    extract(["nation", "expect_industry_name", "salary_month",
+             "industry_ids", "is_house", "birth", "annual_salary_from",
+             "annual_salary_to", "card",
+             "expect_salary_to", "expect_salary_from",
+             "expect_position_name", "gender", "city",
+             "is_fertility", "expect_city_names",
+             "political_status", "title_name", "expect_annual_salary",
+             "industry_name", "address", "position_name", "school_name",
+             "corporation_id",
+             "is_oversea", "responsibilities",
+             "work_start_time", "degree", "management_experience",
+             "expect_type", "corporation_type", "scale", "corporation_name",
+             "self_remark", "annual_salary", "work_experience",
+             "discipline_name", "marital", "updated_at"], "basic")
+
+    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
+    df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
+    df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in
+                                                                      str(x).split(",")]))
+    clms.append("industry_names")
+
+    def arr2str(a):
+        if not a:
+            return ""
+        if isinstance(a, list):
+            a = " ".join([str(i) for i in a])
+        return str(a).replace(",", " ")
+
+    df["expect_industry_name"] = df["expect_industry_name"].map(
+        lambda x: arr2str(x))
+    df["gender"] = df["gender"].map(
+        lambda x: "男" if x == 'M' else (
+            "女" if x == 'F' else ""))
+    for c in ["is_fertility", "is_oversea", "is_house",
+              "management_experience", "marital"]:
+        df[c] = df[c].map(
+            lambda x: '是' if x == 'Y' else (
+                '否' if x == 'N' else ""))
+    df["is_management_experience"] = df["management_experience"]
+    df["is_marital"] = df["marital"]
+    clms.extend(["is_management_experience", "is_marital"])
+
+    df.fillna("", inplace=True)
+    for i in range(len(df)):
+        if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
+            df.loc[i, "phone"] = df.loc[i, "tel"].strip()
+
+    for n in ["industry_ids", "management_experience", "marital", "tel"]:
+        for i in range(len(clms)):
+            if clms[i] == n:
+                del clms[i]
+                break
+
+    clms = list(set(clms))
+
+    df = df.reindex(sorted(clms), axis=1)
+    #print(json.dumps(list(df.columns.values)), "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL")
+    for c in clms:
+        df[c] = df[c].map(
+            lambda s: str(s).replace(
+                "\t",
+                " ").replace(
+                "\n",
+                "\\n").replace(
+                "\r",
+                "\\n"))
+    # print(df.values.tolist())
+    return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
--- a/deepdoc/parser/resume/step_two.py
+++ b/deepdoc/parser/resume/step_two.py