diff --git a/rag/nlp/query.py b/rag/nlp/query.py
index b58efb859..72784beda 100644
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -16,9 +16,11 @@
 import logging
 import json
+import math
 import re
 
-from rag.utils.doc_store_conn import MatchTextExpr
+from collections import defaultdict
+from rag.utils.doc_store_conn import MatchTextExpr
 
 from rag.nlp import rag_tokenizer, term_weight, synonym
 
 
@@ -212,12 +214,11 @@ class FulltextQueryer:
 
     def token_similarity(self, atks, btkss):
         def toDict(tks):
-            d = {}
             if isinstance(tks, str):
                 tks = tks.split()
-            for t, c in self.tw.weights(tks, preprocess=False):
-                if t not in d:
-                    d[t] = 0
+            d = defaultdict(int)
+            wts = self.tw.weights(tks, preprocess=False)
+            for i, (t, c) in enumerate(wts):
                 d[t] += c
             return d
 
@@ -233,11 +234,11 @@ class FulltextQueryer:
         s = 1e-9
         for k, v in qtwt.items():
             if k in dtwt:
-                s += v  # * dtwt[k]
+                s += v * dtwt[k]
         q = 1e-9
         for k, v in qtwt.items():
-            q += v
-        return s / q
+            q += v * v
+        return math.sqrt(3. * (s / q / math.log10( len(dtwt.keys()) + 512 )))
 
     def paragraph(self, content_tks: str, keywords: list = [], keywords_topn=30):
         if isinstance(content_tks, str):
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index b18bfd52a..e2128a268 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -15,6 +15,7 @@
 #
 import logging
 import re
+from collections import OrderedDict
 from dataclasses import dataclass
 
 from rag.settings import TAG_FLD, PAGERANK_FLD
@@ -297,7 +298,7 @@ class Dealer:
                     sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
             ins_tw = []
             for i in sres.ids:
-                content_ltks = sres.field[i][cfield].split()
+                content_ltks = list(OrderedDict.fromkeys(sres.field[i][cfield].split()))
                 title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
                 question_tks = [t for t in sres.field[i].get("question_tks", "").split() if t]
                 important_kwd = sres.field[i].get("important_kwd", [])
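
Reviewer note on the query.py hunks: the score changes from a plain sum of matched query weights over the query's total weight to a weighted dot product, normalised by the query's squared weights and damped by the document's distinct-token count through log10(len(dtwt) + 512). A minimal standalone sketch of that math follows; the toy term-weight dicts merely stand in for the output of self.tw.weights() and are not RAGFlow's actual tokenizer output.

import math
from collections import defaultdict


def to_dict(weighted_tokens):
    # Accumulate weights for repeated tokens, mirroring the new toDict().
    d = defaultdict(float)
    for token, weight in weighted_tokens:
        d[token] += weight
    return d


def similarity(qtwt, dtwt):
    # Dot product over terms shared by query and document.
    s = 1e-9
    for k, v in qtwt.items():
        if k in dtwt:
            s += v * dtwt[k]
    # Squared query norm.
    q = 1e-9
    for k, v in qtwt.items():
        q += v * v
    # Damp longer documents via log10 of their distinct-token count.
    return math.sqrt(3. * (s / q / math.log10(len(dtwt) + 512)))


qtwt = to_dict([("retrieval", 0.6), ("augmented", 0.4)])
dtwt = to_dict([("retrieval", 0.5), ("augmented", 0.3), ("generation", 0.2)])
print(round(similarity(qtwt, dtwt), 4))  # ≈ 0.9453 with these toy weights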
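Reviewer note on the search.py hunk: OrderedDict.fromkeys() de-duplicates the content tokens while keeping first-occurrence order, so each distinct term is counted once when the term-weight dict is built during reranking. A quick illustration with a made-up token string (the real input comes from sres.field[i][cfield]):

from collections import OrderedDict

tokens = "deep learning deep network learning".split()
print(list(OrderedDict.fromkeys(tokens)))  # ['deep', 'learning', 'network']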