mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-21 21:36:42 +08:00
Removing invisible chars before tokenization. (#4233)
### What problem does this PR solve?

#4223

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -264,6 +264,7 @@ class RagTokenizer:
|
|||||||
return [self.stemmer.stem(self.lemmatizer.lemmatize(t)) if re.match(r"[a-zA-Z_-]+$", t) else t for t in tks]
|
return [self.stemmer.stem(self.lemmatizer.lemmatize(t)) if re.match(r"[a-zA-Z_-]+$", t) else t for t in tks]
|
||||||
|
|
||||||
def tokenize(self, line):
|
def tokenize(self, line):
|
||||||
|
line = re.sub(r"\W+", " ", line)
|
||||||
line = self._strQ2B(line).lower()
|
line = self._strQ2B(line).lower()
|
||||||
line = self._tradi2simp(line)
|
line = self._tradi2simp(line)
|
||||||
zh_num = len([1 for c in line if is_chinese(c)])
|
zh_num = len([1 for c in line if is_chinese(c)])
|
||||||
|
|||||||
Reference in New Issue
Block a user