mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Refine english synonym (#3371)
### What problem does this PR solve? #3361 ### Type of change - [x] Performance Improvement
This commit is contained in:
@ -75,11 +75,20 @@ class FulltextQueryer:
|
||||
if not self.isChinese(txt):
|
||||
txt = FulltextQueryer.rmWWW(txt)
|
||||
tks = rag_tokenizer.tokenize(txt).split(" ")
|
||||
tks_w = self.tw.weights(tks)
|
||||
keywords = [t for t in tks if t]
|
||||
tks_w = self.tw.weights(tks, preprocess=False)
|
||||
tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
|
||||
tks_w = [(re.sub(r"^[a-z0-9]$", "", tk), w) for tk, w in tks_w if tk]
|
||||
tks_w = [(re.sub(r"^[\+-]", "", tk), w) for tk, w in tks_w if tk]
|
||||
q = ["{}^{:.4f}".format(tk, w) for tk, w in tks_w if tk]
|
||||
syns = []
|
||||
for tk, w in tks_w:
|
||||
syn = self.syn.lookup(tk)
|
||||
syn = rag_tokenizer.tokenize(" ".join(syn)).split(" ")
|
||||
keywords.extend(syn)
|
||||
syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
|
||||
syns.append(" ".join(syn))
|
||||
|
||||
q = ["({}^{:.4f}".format(tk, w) + " %s)".format(syn) for (tk, w), syn in zip(tks_w, syns)]
|
||||
for i in range(1, len(tks_w)):
|
||||
q.append(
|
||||
'"%s %s"^%.4f'
|
||||
@ -94,7 +103,7 @@ class FulltextQueryer:
|
||||
query = " ".join(q)
|
||||
return MatchTextExpr(
|
||||
self.query_fields, query, 100
|
||||
), tks
|
||||
), keywords
|
||||
|
||||
def need_fine_grained_tokenize(tk):
|
||||
if len(tk) < 3:
|
||||
|
||||
@ -18,7 +18,7 @@ import json
|
||||
import os
|
||||
import time
|
||||
import re
|
||||
|
||||
from nltk.corpus import wordnet
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
@ -67,6 +67,10 @@ class Dealer:
|
||||
logger.error("Fail to load synonym!" + str(e))
|
||||
|
||||
def lookup(self, tk):
|
||||
if re.match(r"[a-z]+$", tk):
|
||||
res = list(set([re.sub("_", " ", syn.name().split(".")[0]) for syn in wordnet.synsets("love")]) - set([tk]))
|
||||
return [t for t in res if t]
|
||||
|
||||
self.lookup_num += 1
|
||||
self.load()
|
||||
res = self.dictionary.get(re.sub(r"[ \t]+", " ", tk.lower()), [])
|
||||
|
||||
Reference in New Issue
Block a user