# Post-patch definitions recovered from a whitespace-collapsed git diff.
#
# * `_has_digit_in_2gram_diff` is added as a method of
#   `class EntityResolution(Extractor)` in graphrag/entity_resolution.py.
#   The same patch makes `EntityResolution.is_similarity(a, b)` start with
#       if self._has_digit_in_2gram_diff(a, b):
#           return False
#   before its existing `is_english(a) and is_english(b)` branch; the rest of
#   `is_similarity` lies outside the visible hunk and is not reproduced here.
# * `is_english` is the rewritten module-level helper in rag/nlp/__init__.py.
import re

# Characters treated as "English": ASCII letters and digits, whitespace and
# common punctuation.  The `+` lets a whole multi-character string match.
_ENGLISH_TEXT_RE = re.compile(r"[`a-zA-Z0-9\s.,':;/\"?<>!\(\)\-]+")


def _has_digit_in_2gram_diff(self, a, b):
    """Return True if a 2-gram present in only one of `a`/`b` has a digit.

    Both strings are broken into their sets of adjacent character pairs and
    the symmetric difference of those sets is inspected.  Strings shorter
    than two characters yield an empty set, so they contribute no pairs.

    `self` is unused; the parameter is kept because the patch defines this
    as an instance method.
    """
    def to_2gram_set(s):
        return {s[i:i + 2] for i in range(len(s) - 1)}

    diff = to_2gram_set(a) ^ to_2gram_set(b)
    return any(ch.isdigit() for gram in diff for ch in gram)


def is_english(texts):
    """Return True when more than 80% of `texts` looks like English.

    Args:
        texts: Either a string, judged character by character, or a
            list/tuple of strings, judged entry by entry.  Anything else
            yields False.

    Returns:
        bool: True if the share of units made up solely of ASCII letters,
        digits, whitespace and common punctuation is strictly above 0.8.
        False for empty input or input with no non-blank units.
    """
    if not texts:
        return False

    if isinstance(texts, str):
        # Whitespace carries no language signal: leave it out of the count
        # instead of stripping it to "" and scoring it as non-English.
        units = [ch for ch in texts if not ch.isspace()]
    elif isinstance(texts, (list, tuple)):
        # Drop non-string and blank entries, then judge each entry whole.
        units = [t.strip() for t in texts if isinstance(t, str) and t.strip()]
    else:
        return False

    if not units:
        return False

    eng = sum(1 for unit in units if _ENGLISH_TEXT_RE.fullmatch(unit))
    return eng / len(units) > 0.8