mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Refa: improve GraphRAG similarity sensitivity to numeric differences (#8479)
### What problem does this PR solve?

Improve GraphRAG similarity sensitivity to numeric differences. See #8444.

### Type of change

- [x] Refactoring
This commit is contained in:
@ -218,7 +218,20 @@ class EntityResolution(Extractor):
|
|||||||
|
|
||||||
return ans_list
|
return ans_list
|
||||||
|
|
||||||
|
def _has_digit_in_2gram_diff(self, a, b):
    """Report whether two strings differ in a character 2-gram containing a digit.

    Each string is reduced to the set of its overlapping two-character
    substrings. The symmetric difference of the two sets holds the 2-grams
    that occur in only one of the strings.

    Args:
        a: First string to compare.
        b: Second string to compare.

    Returns:
        True if any 2-gram unique to one of the strings contains a digit,
        otherwise False.
    """

    def to_2gram_set(s):
        # A string shorter than two characters has no 2-grams. Fall back to
        # the string itself so that e.g. "1" vs "2" is still detected as a
        # numeric difference instead of comparing two empty sets.
        if len(s) < 2:
            return {s}
        return {s[i:i + 2] for i in range(len(s) - 1)}

    diff = to_2gram_set(a) ^ to_2gram_set(b)
    return any(ch.isdigit() for gram in diff for ch in gram)
|
||||||
|
|
||||||
def is_similarity(self, a, b):
|
def is_similarity(self, a, b):
|
||||||
|
if self._has_digit_in_2gram_diff(a, b):
|
||||||
|
return False
|
||||||
|
|
||||||
if is_english(a) and is_english(b):
|
if is_english(a) and is_english(b):
|
||||||
if editdistance.eval(a, b) <= min(len(a), len(b)) // 2:
|
if editdistance.eval(a, b) <= min(len(a), len(b)) // 2:
|
||||||
return True
|
return True
|
||||||
|
|||||||
@ -225,15 +225,23 @@ def bullets_category(sections):
|
|||||||
|
|
||||||
|
|
||||||
def is_english(texts):
    """Heuristically decide whether the given text is predominantly English.

    An item counts as English when, after stripping surrounding whitespace,
    it consists solely of ASCII letters, digits, whitespace and the
    punctuation listed in the pattern below.

    Args:
        texts: Either a single string, which is judged character by
            character with whitespace characters ignored, or a list of
            strings, which is judged item by item with non-string and
            blank items ignored.

    Returns:
        True if more than 80% of the considered items are English.
        False otherwise, including for empty input and for any other
        input type.
    """
    if not texts:
        return False

    # The "+" quantifier lets fullmatch() accept multi-character list items;
    # a bare character class would only ever match one-character strings.
    pattern = re.compile(r"[`a-zA-Z0-9\s.,':;/\"?<>!\(\)\-]+")

    if isinstance(texts, str):
        # Skip whitespace characters: they strip to "" and would otherwise
        # be counted as non-English, penalising ordinary spaced text.
        texts = [ch for ch in texts if not ch.isspace()]
    elif isinstance(texts, list):
        texts = [t for t in texts if isinstance(t, str) and t.strip()]
    else:
        return False

    if not texts:
        return False

    eng = sum(1 for t in texts if pattern.fullmatch(t.strip()))
    return (eng / len(texts)) > 0.8
|
||||||
|
|
||||||
|
|
||||||
def is_chinese(text):
|
def is_chinese(text):
|
||||||
|
|||||||
Reference in New Issue
Block a user