add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-02-02 00:25:06 +08:00 · 2024-05-29 16:50:02 +08:00
parent e1f0644deb
commit 614defec21
17 changed files with 437 additions and 64 deletions
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -54,7 +54,8 @@ class EsQueryer:
        if not self.isChinese(txt):
            tks = rag_tokenizer.tokenize(txt).split(" ")
            tks_w = self.tw.weights(tks)
-            q = [re.sub(r"[ \\\"']+", "", tk)+"^{:.4f}".format(w) for tk, w in tks_w]
+            tks_w = [(re.sub(r"[ \\\"']+", "", tk), w) for tk, w in tks_w]
+            q = ["{}^{:.4f}".format(tk, w) for tk, w in tks_w if tk]
            for i in range(1, len(tks_w)):
                q.append("\"%s %s\"^%.4f" % (tks_w[i - 1][0], tks_w[i][0], max(tks_w[i - 1][1], tks_w[i][1])*2))
            if not q:
@@ -136,7 +137,11 @@ class EsQueryer:
        from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
        import numpy as np
        sims = CosineSimilarity([avec], bvecs)
+        tksim = self.token_similarity(atks, btkss)
+        return np.array(sims[0]) * vtweight + \
+            np.array(tksim) * tkweight, tksim, sims[0]

+    def token_similarity(self, atks, btkss):
        def toDict(tks):
            d = {}
            if isinstance(tks, str):
@@ -149,9 +154,7 @@ class EsQueryer:

        atks = toDict(atks)
        btkss = [toDict(tks) for tks in btkss]
-        tksim = [self.similarity(atks, btks) for btks in btkss]
-        return np.array(sims[0]) * vtweight + \
-            np.array(tksim) * tkweight, tksim, sims[0]
+        return [self.similarity(atks, btks) for btks in btkss]

    def similarity(self, qtwt, dtwt):
        if isinstance(dtwt, type("")):