Fix: decode error in email parser app (#10920)

### What problem does this PR solve?

Fix: `UnicodeDecodeError: 'gb2312' codec can't decode byte 0xab in position 560: illegal multibyte sequence`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-02-05 01:55:05 +08:00 · 2025-11-03 12:31:06 +08:00
parent a52bdf0b7e
commit 4117f41758
1 changed file with 19 additions and 6 deletions
--- a/rag/app/email.py
+++ b/rag/app/email.py
@@ -62,14 +62,27 @@ def chunk(

    #  get the email main info
    def _add_content(msg, content_type):
+        def _decode_payload(payload, charset, target_list):
+            try:
+                target_list.append(payload.decode(charset))
+            except (UnicodeDecodeError, LookupError):
+                for enc in ["utf-8", "gb2312", "gbk", "gb18030", "latin1"]:
+                    try:
+                        target_list.append(payload.decode(enc))
+                        break
+                    except UnicodeDecodeError:
+                        continue
+                else:
+                    target_list.append(payload.decode("utf-8", errors="ignore"))
+
        if content_type == "text/plain":
-            text_txt.append(
-                msg.get_payload(decode=True).decode(msg.get_content_charset())
-            )
+            payload = msg.get_payload(decode=True)
+            charset = msg.get_content_charset() or "utf-8"
+            _decode_payload(payload, charset, text_txt)
        elif content_type == "text/html":
-            html_txt.append(
-                msg.get_payload(decode=True).decode(msg.get_content_charset())
-            )
+            payload = msg.get_payload(decode=True)
+            charset = msg.get_content_charset() or "utf-8"
+            _decode_payload(payload, charset, html_txt)
        elif "multipart" in content_type:
            if msg.is_multipart():
                for part in msg.iter_parts():