From 4117f4175898e683156f70c59bb802b66569bdd3 Mon Sep 17 00:00:00 2001
From: Yongteng Lei
Date: Mon, 3 Nov 2025 12:31:06 +0800
Subject: [PATCH] Fix: decode error in email parser app (#10920)

### What problem does this PR solve?

Fix: UnicodeDecodeError: 'gb2312' codec can't decode byte 0xab in position 560: illegal multibyte sequence.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 rag/app/email.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/rag/app/email.py b/rag/app/email.py
index 1affe4f25..2e95eceec 100644
--- a/rag/app/email.py
+++ b/rag/app/email.py
@@ -62,14 +62,27 @@ def chunk(
 
     # get the email main info
     def _add_content(msg, content_type):
+        def _decode_payload(payload, charset, target_list):
+            try:
+                target_list.append(payload.decode(charset))
+            except (UnicodeDecodeError, LookupError):
+                for enc in ["utf-8", "gb2312", "gbk", "gb18030", "latin1"]:
+                    try:
+                        target_list.append(payload.decode(enc))
+                        break
+                    except UnicodeDecodeError:
+                        continue
+                else:
+                    target_list.append(payload.decode("utf-8", errors="ignore"))
+
         if content_type == "text/plain":
-            text_txt.append(
-                msg.get_payload(decode=True).decode(msg.get_content_charset())
-            )
+            payload = msg.get_payload(decode=True)
+            charset = msg.get_content_charset() or "utf-8"
+            _decode_payload(payload, charset, text_txt)
         elif content_type == "text/html":
-            html_txt.append(
-                msg.get_payload(decode=True).decode(msg.get_content_charset())
-            )
+            payload = msg.get_payload(decode=True)
+            charset = msg.get_content_charset() or "utf-8"
+            _decode_payload(payload, charset, html_txt)
         elif "multipart" in content_type:
             if msg.is_multipart():
                 for part in msg.iter_parts():