Editing a chunk shall update it instead of inserting a new one (#3709)

### What problem does this PR solve?

Editing a chunk should update the existing chunk instead of inserting a new one. Closes #3679.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-02-02 08:35:08 +08:00 · 2024-11-28 13:00:38 +08:00
parent 9f57534843
commit bc701d7b4c
19 changed files with 51 additions and 46 deletions
--- a/deepdoc/parser/resume/entities/corporations.py
+++ b/deepdoc/parser/resume/entities/corporations.py
@@ -47,7 +47,7 @@ def corpNorm(nm, add_region=True):
    nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
    if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm

-    tks = rag_tokenizer.tokenize(nm).split(" ")
+    tks = rag_tokenizer.tokenize(nm).split()
    reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
    nm = ""
    for t in tks:
--- a/deepdoc/parser/resume/entities/schools.py
+++ b/deepdoc/parser/resume/entities/schools.py
@@ -44,7 +44,7 @@ loadRank(os.path.join(current_file_path, "res/school.rank.csv"))

 def split(txt):
    tks = []
-    for t in re.sub(r"[ \t]+", " ",txt).split(" "):
+    for t in re.sub(r"[ \t]+", " ",txt).split():
        if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
           re.match(r"[a-zA-Z]", t) and tks:
            tks[-1] = tks[-1] + " " + t
--- a/deepdoc/parser/resume/step_one.py
+++ b/deepdoc/parser/resume/step_one.py
@@ -80,7 +80,7 @@ def refactor(df):
    def loadjson(line):
        try:
            return json.loads(line)
-        except Exception as e:
+        except Exception:
            pass
        return {}

@@ -183,4 +183,4 @@ def refactor(df):
                "\r",
                "\\n"))
    # print(df.values.tolist())
-    return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
+    return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0]))
--- a/deepdoc/parser/resume/step_two.py
+++ b/deepdoc/parser/resume/step_two.py
@@ -100,7 +100,7 @@ def forEdu(cv):
        if n.get("school_name") and isinstance(n["school_name"], str):
            sch.append(re.sub(r"(211|985|重点大学|[,&;；-])", "", n["school_name"]))
            e["sch_nm_kwd"] = sch[-1]
-        fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
+        fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()[-1])

        if n.get("discipline_name") and isinstance(n["discipline_name"], str):
            maj.append(n["discipline_name"])
@@ -485,7 +485,7 @@ def parse(cv):
        nm = re.sub(r"[\n——\-\(（\+].*", "", cv["name"].strip())
        nm = re.sub(r"[ \t　]+", " ", nm)
        if re.match(r"[a-zA-Z ]+$", nm):
-            if len(nm.split(" ")) > 1:
+            if len(nm.split()) > 1:
                cv["name"] = nm
            else:
                nm = ""
@@ -503,7 +503,7 @@ def parse(cv):
        for py in PY.get_pinyins(nm[:20], ''):
            for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
        for py in PY.get_pinyins(nm[:20], ' '):
-            py = py.split(" ")
+            py = py.split()
            for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])

        cv["name_kwd"] = name