Refa: token similarity calculations. (#6614)

### What problem does this PR solve?

#6507

### Type of change

- [x] Performance Improvement
Author: Kevin Hu
Date: 2025-03-28 09:33:08 +08:00 (committed by GitHub)
Commit: 0758c04941 (parent fe0396bbb9)
2 changed files with 11 additions and 9 deletions


@@ -16,9 +16,11 @@
 import logging
 import json
+import math
 import re
-from rag.utils.doc_store_conn import MatchTextExpr
+from collections import defaultdict
+from rag.utils.doc_store_conn import MatchTextExpr
 from rag.nlp import rag_tokenizer, term_weight, synonym
@@ -212,12 +214,11 @@ class FulltextQueryer:
     def token_similarity(self, atks, btkss):
         def toDict(tks):
-            d = {}
             if isinstance(tks, str):
                 tks = tks.split()
-            for t, c in self.tw.weights(tks, preprocess=False):
-                if t not in d:
-                    d[t] = 0
+            d = defaultdict(int)
+            wts = self.tw.weights(tks, preprocess=False)
+            for i, (t, c) in enumerate(wts):
                 d[t] += c
             return d
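
For context on the `toDict` refactor above: `collections.defaultdict(int)` starts every missing key at 0, so the per-token weights can be accumulated in one pass without the explicit membership check. A minimal standalone sketch of the same pattern, where `weights()` is a hypothetical stand-in for `self.tw.weights(tks, preprocess=False)`:

```python
from collections import defaultdict

def weights(tokens):
    # Hypothetical stand-in for the term-weight lookup; every token gets weight 1.0.
    return [(t, 1.0) for t in tokens]

def to_dict(tks):
    # Accept either a pre-tokenized list or a whitespace-separated string.
    if isinstance(tks, str):
        tks = tks.split()
    d = defaultdict(int)  # missing keys default to 0, so no "if t not in d" check
    for t, c in weights(tks):
        d[t] += c  # accumulate the weight of repeated tokens
    return dict(d)

print(to_dict("rag token rag"))  # {'rag': 2.0, 'token': 1.0}
```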
@@ -233,11 +234,11 @@ class FulltextQueryer:
         s = 1e-9
         for k, v in qtwt.items():
             if k in dtwt:
-                s += v  # * dtwt[k]
+                s += v * dtwt[k]
         q = 1e-9
         for k, v in qtwt.items():
-            q += v
-        return s / q
+            q += v * v
+        return math.sqrt(3. * (s / q / math.log10( len(dtwt.keys()) + 512 )))

     def paragraph(self, content_tks: str, keywords: list = [], keywords_topn=30):
         if isinstance(content_tks, str):
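
To make the scoring change above concrete: the old version summed raw query weights for the overlapping terms and divided by the total query weight, while the new version weights each overlap by the document-side weight, normalizes by the sum of squared query weights, damps by `log10` of the document's vocabulary size, and compresses the result with a square root. A standalone sketch on toy weight dictionaries (the real code feeds in `term_weight` outputs, not hand-written dicts):

```python
import math

def similarity(qtwt: dict, dtwt: dict) -> float:
    # Mirrors the updated formula from the diff, on plain {token: weight} dicts.
    s = 1e-9
    for k, v in qtwt.items():
        if k in dtwt:
            s += v * dtwt[k]  # overlap now weighted by the document-side weight
    q = 1e-9
    for k, v in qtwt.items():
        q += v * v  # normalize by the squared query weights
    # Damp by the document's vocabulary size, then compress with a square root.
    return math.sqrt(3. * (s / q / math.log10(len(dtwt.keys()) + 512)))

query_wts = {"token": 0.6, "similarity": 0.4}
doc_wts = {"token": 0.5, "similarity": 0.3, "search": 0.2}
print(round(similarity(query_wts, doc_wts), 2))  # ≈ 0.95
```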