mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Editing a chunk shall update it instead of inserting a new one (#3709)
### What problem does this PR solve?

Editing a chunk shall update the existing chunk instead of inserting a new one. Close #3679

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -47,7 +47,7 @@ def corpNorm(nm, add_region=True):
|
||||
nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
|
||||
if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
|
||||
|
||||
tks = rag_tokenizer.tokenize(nm).split(" ")
|
||||
tks = rag_tokenizer.tokenize(nm).split()
|
||||
reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
|
||||
nm = ""
|
||||
for t in tks:
|
||||
|
||||
@ -44,7 +44,7 @@ loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
|
||||
|
||||
def split(txt):
|
||||
tks = []
|
||||
for t in re.sub(r"[ \t]+", " ",txt).split(" "):
|
||||
for t in re.sub(r"[ \t]+", " ",txt).split():
|
||||
if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
|
||||
re.match(r"[a-zA-Z]", t) and tks:
|
||||
tks[-1] = tks[-1] + " " + t
|
||||
|
||||
@ -80,7 +80,7 @@ def refactor(df):
|
||||
def loadjson(line):
|
||||
try:
|
||||
return json.loads(line)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
return {}
|
||||
|
||||
@ -183,4 +183,4 @@ def refactor(df):
|
||||
"\r",
|
||||
"\\n"))
|
||||
# print(df.values.tolist())
|
||||
return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
|
||||
return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0]))
|
||||
|
||||
@ -100,7 +100,7 @@ def forEdu(cv):
|
||||
if n.get("school_name") and isinstance(n["school_name"], str):
|
||||
sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
|
||||
e["sch_nm_kwd"] = sch[-1]
|
||||
fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
|
||||
fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()[-1])
|
||||
|
||||
if n.get("discipline_name") and isinstance(n["discipline_name"], str):
|
||||
maj.append(n["discipline_name"])
|
||||
@ -485,7 +485,7 @@ def parse(cv):
|
||||
nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
|
||||
nm = re.sub(r"[ \t ]+", " ", nm)
|
||||
if re.match(r"[a-zA-Z ]+$", nm):
|
||||
if len(nm.split(" ")) > 1:
|
||||
if len(nm.split()) > 1:
|
||||
cv["name"] = nm
|
||||
else:
|
||||
nm = ""
|
||||
@ -503,7 +503,7 @@ def parse(cv):
|
||||
for py in PY.get_pinyins(nm[:20], ''):
|
||||
for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
|
||||
for py in PY.get_pinyins(nm[:20], ' '):
|
||||
py = py.split(" ")
|
||||
py = py.split()
|
||||
for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])
|
||||
|
||||
cv["name_kwd"] = name
|
||||
|
||||
Reference in New Issue
Block a user