From 7e063283ba0bbf8c727639809b0bd33463309802 Mon Sep 17 00:00:00 2001
From: Kevin Hu
Date: Thu, 26 Dec 2024 11:48:16 +0800
Subject: [PATCH] Removing invisible chars before tokenization. (#4233)

### What problem does this PR solve?

#4223

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 rag/nlp/rag_tokenizer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rag/nlp/rag_tokenizer.py b/rag/nlp/rag_tokenizer.py
index 4effd026f..0b78c4e09 100644
--- a/rag/nlp/rag_tokenizer.py
+++ b/rag/nlp/rag_tokenizer.py
@@ -264,6 +264,7 @@ class RagTokenizer:
         return [self.stemmer.stem(self.lemmatizer.lemmatize(t)) if re.match(r"[a-zA-Z_-]+$", t) else t for t in tks]
 
     def tokenize(self, line):
+        line = re.sub(r"\W+", " ", line)
         line = self._strQ2B(line).lower()
         line = self._tradi2simp(line)
         zh_num = len([1 for c in line if is_chinese(c)])
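
For context, a minimal sketch of what the newly added `line = re.sub(r"\W+", " ", line)` step does before tokenization: runs of non-word characters, including invisible ones such as zero-width spaces and a BOM, are collapsed into a single ASCII space, while Latin and Chinese word characters pass through unchanged. The `strip_invisible` helper name and the sample input below are hypothetical, used only for illustration and not taken from the PR or issue #4223.

```python
import re

def strip_invisible(line: str) -> str:
    # Same pre-processing as the line added to RagTokenizer.tokenize:
    # collapse every run of non-word characters (punctuation, zero-width
    # spaces, BOM, etc.) into a single ASCII space.
    return re.sub(r"\W+", " ", line)

# Hypothetical sample input: mixes a BOM (U+FEFF), zero-width spaces
# (U+200B), punctuation, English, and Chinese.
raw = "\ufeffHello\u200bworld, 这是\u200b一个测试"
print(repr(strip_invisible(raw)))
# Expected output: ' Hello world 这是 一个测试'
```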