mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
Fix: decode error in email parser app (#10920)
### What problem does this PR solve?

Fix: `UnicodeDecodeError: 'gb2312' codec can't decode byte 0xab in position 560: illegal multibyte sequence`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -62,14 +62,27 @@ def chunk(
|
||||
|
||||
# get the email main info
|
||||
def _add_content(msg, content_type):
|
||||
def _decode_payload(payload, charset, target_list):
|
||||
try:
|
||||
target_list.append(payload.decode(charset))
|
||||
except (UnicodeDecodeError, LookupError):
|
||||
for enc in ["utf-8", "gb2312", "gbk", "gb18030", "latin1"]:
|
||||
try:
|
||||
target_list.append(payload.decode(enc))
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
else:
|
||||
target_list.append(payload.decode("utf-8", errors="ignore"))
|
||||
|
||||
if content_type == "text/plain":
|
||||
text_txt.append(
|
||||
msg.get_payload(decode=True).decode(msg.get_content_charset())
|
||||
)
|
||||
payload = msg.get_payload(decode=True)
|
||||
charset = msg.get_content_charset() or "utf-8"
|
||||
_decode_payload(payload, charset, text_txt)
|
||||
elif content_type == "text/html":
|
||||
html_txt.append(
|
||||
msg.get_payload(decode=True).decode(msg.get_content_charset())
|
||||
)
|
||||
payload = msg.get_payload(decode=True)
|
||||
charset = msg.get_content_charset() or "utf-8"
|
||||
_decode_payload(payload, charset, html_txt)
|
||||
elif "multipart" in content_type:
|
||||
if msg.is_multipart():
|
||||
for part in msg.iter_parts():
|
||||
|
||||
Reference in New Issue
Block a user