Removing invisible chars before tokenization. (#4233)

### What problem does this PR solve?

#4223

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-02-02 16:45:08 +08:00 · 2024-12-26 11:48:16 +08:00
parent 28eeb29b88
commit 7e063283ba
1 changed file with 1 addition and 0 deletions
--- a/rag/nlp/rag_tokenizer.py
+++ b/rag/nlp/rag_tokenizer.py
@@ -264,6 +264,7 @@ class RagTokenizer:
        return [self.stemmer.stem(self.lemmatizer.lemmatize(t)) if re.match(r"[a-zA-Z_-]+$", t) else t for t in tks]

    def tokenize(self, line):
+        line = re.sub(r"\W+", " ", line)
        line = self._strQ2B(line).lower()
        line = self._tradi2simp(line)
        zh_num = len([1 for c in line if is_chinese(c)])