From 501b7d4d0168e1df0176d2ac112986175dcd0a4d Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Mon, 27 Oct 2025 09:32:55 +0800 Subject: [PATCH] Fix: prio synonym match than wordnet for english (#10762) ### What problem does this PR solve? Fix: for English tokens, prioritize matches from the custom synonym dictionary over WordNet; WordNet is now consulted only when the dictionary has no entry. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/nlp/synonym.py | 29 +++++++++++++++++++++++------ rag/res/synonym.json | 3 ++- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/rag/nlp/synonym.py b/rag/nlp/synonym.py index a826a3266..b28560ce1 100644 --- a/rag/nlp/synonym.py +++ b/rag/nlp/synonym.py @@ -32,6 +32,7 @@ class Dealer: path = os.path.join(get_project_base_directory(), "rag/res", "synonym.json") try: self.dictionary = json.load(open(path, 'r')) + self.dictionary = { (k.lower() if isinstance(k, str) else k): v for k, v in self.dictionary.items() } except Exception: logging.warning("Missing synonym.json") self.dictionary = {} @@ -66,18 +67,34 @@ class Dealer: except Exception as e: logging.error("Fail to load synonym!" 
+ str(e)) - def lookup(self, tk, topn=8): - if re.match(r"[a-z]+$", tk): - res = list(set([re.sub("_", " ", syn.name().split(".")[0]) for syn in wordnet.synsets(tk)]) - set([tk])) - return [t for t in res if t] + def lookup(self, tk, topn=8): + if not tk or not isinstance(tk, str): + return [] + + # 1) Check the custom dictionary first (synonym.json keys are lowercased at load time; tk is only stripped here, NOT lowercased, so callers must pass a lowercase token) self.lookup_num += 1 self.load() - res = self.dictionary.get(re.sub(r"[ \t]+", " ", tk.lower()), []) + key = re.sub(r"[ \t]+", " ", tk.strip()) + res = self.dictionary.get(key, []) if isinstance(res, str): res = [res] - return res[:topn] + if res: # Found in dictionary → return directly + return res[:topn] + # 2) If not found and tk consists only of lowercase ASCII letters → fall back to WordNet + if re.fullmatch(r"[a-z]+", tk): + wn_set = { + re.sub("_", " ", syn.name().split(".")[0]) + for syn in wordnet.synsets(tk) + } + wn_set.discard(tk) # Remove the original token itself + wn_res = [t for t in wn_set if t] + return wn_res[:topn] + + # 3) Nothing found in either source + return [] + if __name__ == '__main__': dl = Dealer() diff --git a/rag/res/synonym.json b/rag/res/synonym.json index ea61b9e1c..047303155 100644 --- a/rag/res/synonym.json +++ b/rag/res/synonym.json @@ -10542,5 +10542,6 @@ "周五": ["礼拜五", "星期五"], "周六": ["礼拜六", "星期六"], "周日": ["礼拜日", "星期日", "星期天", "礼拜天"], -"上班": "办公" +"上班": "办公", +"HELO":"agn" }