Edit chunk should update instead of inserting it (#3709)

### What problem does this PR solve?

Editing a chunk should update the existing chunk instead of inserting a new one. Closes #3679

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Zhichang Yu
2024-11-28 13:00:38 +08:00
committed by GitHub
parent 9f57534843
commit bc701d7b4c
19 changed files with 51 additions and 46 deletions

View File

@@ -47,7 +47,7 @@ class RAGFlowDocxParser:
for p, n in patt:
if re.search(p, b):
return n
tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1]
tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
if len(tks) > 3:
if len(tks) < 12:
return "Tx"

View File

@@ -108,13 +108,13 @@ class RAGFlowPdfParser:
h = max(self.__height(up), self.__height(down))
y_dis = self._y_dis(up, down)
LEN = 6
tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split()
tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split()
tks_all = up["text"][-LEN:].strip() \
+ (" " if re.match(r"[a-zA-Z0-9]+",
up["text"][-1] + down["text"][0]) else "") \
+ down["text"][:LEN].strip()
tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
tks_all = rag_tokenizer.tokenize(tks_all).split()
fea = [
up.get("R", -1) == down.get("R", -1),
y_dis / h,
@@ -565,13 +565,13 @@ class RAGFlowPdfParser:
if i >= len(self.boxes):
break
prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
self.boxes[i]["text"].strip().split(" ")[:2])
self.boxes[i]["text"].strip().split()[:2])
while not prefix:
self.boxes.pop(i)
if i >= len(self.boxes):
break
prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
self.boxes[i]["text"].strip().split(" ")[:2])
self.boxes[i]["text"].strip().split()[:2])
self.boxes.pop(i)
if i >= len(self.boxes) or not prefix:
break

View File

@@ -47,7 +47,7 @@ def corpNorm(nm, add_region=True):
nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
tks = rag_tokenizer.tokenize(nm).split(" ")
tks = rag_tokenizer.tokenize(nm).split()
reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
nm = ""
for t in tks:

View File

@@ -44,7 +44,7 @@ loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
def split(txt):
tks = []
for t in re.sub(r"[ \t]+", " ",txt).split(" "):
for t in re.sub(r"[ \t]+", " ",txt).split():
if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
re.match(r"[a-zA-Z]", t) and tks:
tks[-1] = tks[-1] + " " + t

View File

@@ -80,7 +80,7 @@ def refactor(df):
def loadjson(line):
try:
return json.loads(line)
except Exception as e:
except Exception:
pass
return {}
@@ -183,4 +183,4 @@ def refactor(df):
"\r",
"\\n"))
# print(df.values.tolist())
return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0]))

View File

@@ -100,7 +100,7 @@ def forEdu(cv):
if n.get("school_name") and isinstance(n["school_name"], str):
sch.append(re.sub(r"(211|985|重点大学|[,&;-])", "", n["school_name"]))
e["sch_nm_kwd"] = sch[-1]
fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()[-1])
if n.get("discipline_name") and isinstance(n["discipline_name"], str):
maj.append(n["discipline_name"])
@@ -485,7 +485,7 @@ def parse(cv):
nm = re.sub(r"[\n——\-\(\+].*", "", cv["name"].strip())
nm = re.sub(r"[ \t ]+", " ", nm)
if re.match(r"[a-zA-Z ]+$", nm):
if len(nm.split(" ")) > 1:
if len(nm.split()) > 1:
cv["name"] = nm
else:
nm = ""
@@ -503,7 +503,7 @@ def parse(cv):
for py in PY.get_pinyins(nm[:20], ''):
for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
for py in PY.get_pinyins(nm[:20], ' '):
py = py.split(" ")
py = py.split()
for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])
cv["name_kwd"] = name