Support a wide range of encodings for text files. (#458)

### What problem does this PR solve?

#384

### Type of change

- [x] Performance Improvement
2026-01-30 23:26:36 +08:00 · 2024-04-19 18:02:53 +08:00
parent cda7b607cb
commit ed6081845a
19 changed files with 118 additions and 55 deletions
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@@ -3,6 +3,8 @@ from openpyxl import load_workbook
 import sys
 from io import BytesIO

+from rag.nlp import find_codec
+

 class HuExcelParser:
    def html(self, fnm):
@@ -66,7 +68,8 @@ class HuExcelParser:
                return total

        if fnm.split(".")[-1].lower() in ["csv", "txt"]:
-            txt = binary.decode("utf-8")
+            encoding = find_codec(binary)
+            txt = binary.decode(encoding)
            return len(txt.split("\n"))