Support a wide range of encodings for text files. (#458)

### What problem does this PR solve?

#384

### Type of change

- [x] Performance Improvement
This commit is contained in:
KevinHuSh
2024-04-19 18:02:53 +08:00
committed by GitHub
parent cda7b607cb
commit ed6081845a
19 changed files with 118 additions and 55 deletions

View File

@@ -3,6 +3,8 @@ from openpyxl import load_workbook
import sys
from io import BytesIO
from rag.nlp import find_codec
class HuExcelParser:
def html(self, fnm):
@@ -66,7 +68,8 @@ class HuExcelParser:
return total
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
- txt = binary.decode("utf-8")
+ encoding = find_codec(binary)
+ txt = binary.decode(encoding)
return len(txt.split("\n"))