Fix: decode error in email parser app (#10920)

### What problem does this PR solve?

Fix: `UnicodeDecodeError: 'gb2312' codec can't decode byte 0xab in position 560: illegal multibyte sequence`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-02-03 17:15:08 +08:00 · 2025-11-03 12:31:06 +08:00
parent a52bdf0b7e
commit 4117f41758
1 changed files with 19 additions and 6 deletions
--- a/rag/app/email.py
+++ b/rag/app/email.py
@ -62,14 +62,27 @@ def chunk(
    #  get the email main info
    def _add_content(msg, content_type):
        def _decode_payload(payload, charset, target_list):
            try:
                target_list.append(payload.decode(charset))
            except (UnicodeDecodeError, LookupError):
                for enc in ["utf-8", "gb2312", "gbk", "gb18030", "latin1"]:
                    try:
                        target_list.append(payload.decode(enc))
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    target_list.append(payload.decode("utf-8", errors="ignore"))
        if content_type == "text/plain":
-            text_txt.append(
+            payload = msg.get_payload(decode=True)
-                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            charset = msg.get_content_charset() or "utf-8"
-            )
+            _decode_payload(payload, charset, text_txt)
        elif content_type == "text/html":
-            html_txt.append(
+            payload = msg.get_payload(decode=True)
-                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            charset = msg.get_content_charset() or "utf-8"
-            )
+            _decode_payload(payload, charset, html_txt)
        elif "multipart" in content_type:
            if msg.is_multipart():
                for part in msg.iter_parts():