add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
KevinHuSh
2024-05-29 16:50:02 +08:00
committed by GitHub
parent e1f0644deb
commit 614defec21
17 changed files with 437 additions and 64 deletions

View File

@ -54,7 +54,8 @@ class EsQueryer:
if not self.isChinese(txt):
tks = rag_tokenizer.tokenize(txt).split(" ")
tks_w = self.tw.weights(tks)
q = [re.sub(r"[ \\\"']+", "", tk)+"^{:.4f}".format(w) for tk, w in tks_w]
tks_w = [(re.sub(r"[ \\\"']+", "", tk), w) for tk, w in tks_w]
q = ["{}^{:.4f}".format(tk, w) for tk, w in tks_w if tk]
for i in range(1, len(tks_w)):
q.append("\"%s %s\"^%.4f" % (tks_w[i - 1][0], tks_w[i][0], max(tks_w[i - 1][1], tks_w[i][1])*2))
if not q:
@ -136,7 +137,11 @@ class EsQueryer:
from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
import numpy as np
sims = CosineSimilarity([avec], bvecs)
tksim = self.token_similarity(atks, btkss)
return np.array(sims[0]) * vtweight + \
np.array(tksim) * tkweight, tksim, sims[0]
def token_similarity(self, atks, btkss):
def toDict(tks):
d = {}
if isinstance(tks, str):
@ -149,9 +154,7 @@ class EsQueryer:
atks = toDict(atks)
btkss = [toDict(tks) for tks in btkss]
tksim = [self.similarity(atks, btks) for btks in btkss]
return np.array(sims[0]) * vtweight + \
np.array(tksim) * tkweight, tksim, sims[0]
return [self.similarity(atks, btks) for btks in btkss]
def similarity(self, qtwt, dtwt):
if isinstance(dtwt, type("")):