mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Refa: token similarity calculations. (#6614)
### What problem does this PR solve?

#6507

### Type of change

- [x] Performance Improvement
This commit is contained in:
@ -16,9 +16,11 @@
|
||||
|
||||
import logging
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
from rag.utils.doc_store_conn import MatchTextExpr
|
||||
from collections import defaultdict
|
||||
|
||||
from rag.utils.doc_store_conn import MatchTextExpr
|
||||
from rag.nlp import rag_tokenizer, term_weight, synonym
|
||||
|
||||
|
||||
@ -212,12 +214,11 @@ class FulltextQueryer:
|
||||
|
||||
def token_similarity(self, atks, btkss):
|
||||
def toDict(tks):
|
||||
d = {}
|
||||
if isinstance(tks, str):
|
||||
tks = tks.split()
|
||||
for t, c in self.tw.weights(tks, preprocess=False):
|
||||
if t not in d:
|
||||
d[t] = 0
|
||||
d = defaultdict(int)
|
||||
wts = self.tw.weights(tks, preprocess=False)
|
||||
for i, (t, c) in enumerate(wts):
|
||||
d[t] += c
|
||||
return d
|
||||
|
||||
@ -233,11 +234,11 @@ class FulltextQueryer:
|
||||
s = 1e-9
|
||||
for k, v in qtwt.items():
|
||||
if k in dtwt:
|
||||
s += v # * dtwt[k]
|
||||
s += v * dtwt[k]
|
||||
q = 1e-9
|
||||
for k, v in qtwt.items():
|
||||
q += v
|
||||
return s / q
|
||||
q += v * v
|
||||
return math.sqrt(3. * (s / q / math.log10( len(dtwt.keys()) + 512 )))
|
||||
|
||||
def paragraph(self, content_tks: str, keywords: list = [], keywords_topn=30):
|
||||
if isinstance(content_tks, str):
|
||||
|
||||
Reference in New Issue
Block a user