Refa: improve GraphRAG similarity sensitivity to numeric differences (#8479)

### What problem does this PR solve?

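Make GraphRAG's entity similarity check more sensitive to numeric differences: two names are no longer treated as similar when the character 2-grams that differ between them contain a digit. Addresses #8444.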

### Type of change

- [x] Refactoring
This commit is contained in:
Yongteng Lei
2025-06-25 16:20:59 +08:00
committed by GitHub
parent d632046032
commit b705ff08fe
2 changed files with 28 additions and 7 deletions

View File

@ -218,7 +218,20 @@ class EntityResolution(Extractor):
return ans_list
def _has_digit_in_2gram_diff(self, a, b):
    """Return True if the 2-grams unique to either string contain a digit.

    Each string is split into its set of overlapping two-character
    substrings. The symmetric difference of the two sets holds the grams
    that appear in exactly one of the strings; if any character in any of
    those grams is a digit, the strings are considered to differ
    numerically.

    Args:
        a: First string to compare.
        b: Second string to compare.

    Returns:
        True when at least one gram present in only one of the strings
        contains a digit character, otherwise False.
    """
    def to_2gram_set(s):
        # A string shorter than two characters has no 2-grams. Use the
        # string itself as its only gram so that pairs such as "1" vs "2"
        # are still compared instead of both collapsing to an empty set.
        if len(s) < 2:
            return {s} if s else set()
        return {s[i:i + 2] for i in range(len(s) - 1)}

    # Grams found in exactly one of the two strings.
    diff = to_2gram_set(a) ^ to_2gram_set(b)
    return any(c.isdigit() for gram in diff for c in gram)
def is_similarity(self, a, b):
if self._has_digit_in_2gram_diff(a, b):
return False
if is_english(a) and is_english(b):
if editdistance.eval(a, b) <= min(len(a), len(b)) // 2:
return True