Fix: decode error in email parser app (#10920)

### What problem does this PR solve?

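Fix a crash in the email parser when a message part's bytes do not match
its declared charset, e.g. `UnicodeDecodeError: 'gb2312' codec can't decode
byte 0xab in position 560: illegal multibyte sequence`. The payload is now
decoded with the declared charset first and then with a list of fallback
encodings.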

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Yongteng Lei
2025-11-03 12:31:06 +08:00
committed by GitHub
parent a52bdf0b7e
commit 4117f41758

View File

@ -62,14 +62,27 @@ def chunk(
# get the email main info # get the email main info
def _add_content(msg, content_type): def _add_content(msg, content_type):
def _decode_payload(payload, charset, target_list):
try:
target_list.append(payload.decode(charset))
except (UnicodeDecodeError, LookupError):
for enc in ["utf-8", "gb2312", "gbk", "gb18030", "latin1"]:
try:
target_list.append(payload.decode(enc))
break
except UnicodeDecodeError:
continue
else:
target_list.append(payload.decode("utf-8", errors="ignore"))
if content_type == "text/plain": if content_type == "text/plain":
text_txt.append( payload = msg.get_payload(decode=True)
msg.get_payload(decode=True).decode(msg.get_content_charset()) charset = msg.get_content_charset() or "utf-8"
) _decode_payload(payload, charset, text_txt)
elif content_type == "text/html": elif content_type == "text/html":
html_txt.append( payload = msg.get_payload(decode=True)
msg.get_payload(decode=True).decode(msg.get_content_charset()) charset = msg.get_content_charset() or "utf-8"
) _decode_payload(payload, charset, html_txt)
elif "multipart" in content_type: elif "multipart" in content_type:
if msg.is_multipart(): if msg.is_multipart():
for part in msg.iter_parts(): for part in msg.iter_parts():