mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Fix: decode error in email parser app (#10920)
### What problem does this PR solve?

Fix: `UnicodeDecodeError: 'gb2312' codec can't decode byte 0xab in position 560: illegal multibyte sequence`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -62,14 +62,27 @@ def chunk(
|
|||||||
|
|
||||||
# get the email main info
|
# get the email main info
|
||||||
def _add_content(msg, content_type):
|
def _add_content(msg, content_type):
|
||||||
|
def _decode_payload(payload, charset, target_list):
|
||||||
|
try:
|
||||||
|
target_list.append(payload.decode(charset))
|
||||||
|
except (UnicodeDecodeError, LookupError):
|
||||||
|
for enc in ["utf-8", "gb2312", "gbk", "gb18030", "latin1"]:
|
||||||
|
try:
|
||||||
|
target_list.append(payload.decode(enc))
|
||||||
|
break
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
target_list.append(payload.decode("utf-8", errors="ignore"))
|
||||||
|
|
||||||
if content_type == "text/plain":
|
if content_type == "text/plain":
|
||||||
text_txt.append(
|
payload = msg.get_payload(decode=True)
|
||||||
msg.get_payload(decode=True).decode(msg.get_content_charset())
|
charset = msg.get_content_charset() or "utf-8"
|
||||||
)
|
_decode_payload(payload, charset, text_txt)
|
||||||
elif content_type == "text/html":
|
elif content_type == "text/html":
|
||||||
html_txt.append(
|
payload = msg.get_payload(decode=True)
|
||||||
msg.get_payload(decode=True).decode(msg.get_content_charset())
|
charset = msg.get_content_charset() or "utf-8"
|
||||||
)
|
_decode_payload(payload, charset, html_txt)
|
||||||
elif "multipart" in content_type:
|
elif "multipart" in content_type:
|
||||||
if msg.is_multipart():
|
if msg.is_multipart():
|
||||||
for part in msg.iter_parts():
|
for part in msg.iter_parts():
|
||||||
|
|||||||
Reference in New Issue
Block a user