fix plainPdf bugs (#152)

This commit is contained in:
KevinHuSh
2024-03-26 15:11:07 +08:00
committed by GitHub
parent 75f7c6da2f
commit da21320b88
13 changed files with 36 additions and 33 deletions

View File

@@ -53,7 +53,7 @@ class EsQueryer:
if not self.isChinese(txt):
tks = huqie.qie(txt).split(" ")
q = tks
q = copy.deepcopy(tks)
for i in range(1, len(tks)):
q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
if not q:
@@ -138,7 +138,7 @@ class EsQueryer:
def toDict(tks):
d = {}
if isinstance(tks, type("")):
if isinstance(tks, str):
tks = tks.split(" ")
for t, c in self.tw.weights(tks):
if t not in d:

View File

@@ -234,13 +234,13 @@ class Dealer:
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
len(ans_v[0]), len(chunk_v[0]))
chunks_tks = [huqie.qie(ck).split(" ") for ck in chunks]
chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") for ck in chunks]
cites = {}
for i, a in enumerate(pieces_):
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
chunk_v,
huqie.qie(
pieces_[i]).split(" "),
self.qryr.rmWWW(pieces_[i])).split(" "),
chunks_tks,
tkweight, vtweight)
mx = np.max(sim) * 0.99

View File

@@ -150,9 +150,10 @@ class Dealer:
return 6
def ner(t):
if re.match(r"[0-9,.]{2,}$", t): return 2
if re.match(r"[a-z]{1,2}$", t): return 0.01
if not self.ne or t not in self.ne:
return 1
if re.match(r"[0-9,.]+$", t): return 2
m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,
"firstnm": 1}
return m[self.ne[t]]
@@ -170,11 +171,11 @@ class Dealer:
return 1
def freq(t):
if re.match(r"[0-9\. -]+$", t):
return 10000
if re.match(r"[0-9. -]{2,}$", t):
return 3
s = huqie.freq(t)
if not s and re.match(r"[a-z\. -]+$", t):
return 10
if not s and re.match(r"[a-z. -]+$", t):
return 300
if not s:
s = 0
@@ -188,12 +189,12 @@ class Dealer:
return max(s, 10)
def df(t):
if re.match(r"[0-9\. -]+$", t):
return 100000
if re.match(r"[0-9. -]{2,}$", t):
return 5
if t in self.df:
return self.df[t] + 3
elif re.match(r"[a-z\. -]+$", t):
return 3
elif re.match(r"[a-z. -]+$", t):
return 300
elif len(t) >= 4:
s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
if len(s) > 1: