mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
Feat: location rule for http (#10901)
### What problem does this PR solve?

Location rule for http.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -464,14 +464,27 @@ class Parser(ProcessBase):
|
||||
if "body" in target_fields:
|
||||
body_text, body_html = [], []
|
||||
def _add_content(m, content_type):
|
||||
def _decode_payload(payload, charset, target_list):
|
||||
try:
|
||||
target_list.append(payload.decode(charset))
|
||||
except (UnicodeDecodeError, LookupError):
|
||||
for enc in ["utf-8", "gb2312", "gbk", "gb18030", "latin1"]:
|
||||
try:
|
||||
target_list.append(payload.decode(enc))
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
else:
|
||||
target_list.append(payload.decode("utf-8", errors="ignore"))
|
||||
|
||||
if content_type == "text/plain":
|
||||
body_text.append(
|
||||
m.get_payload(decode=True).decode(m.get_content_charset())
|
||||
)
|
||||
payload = msg.get_payload(decode=True)
|
||||
charset = msg.get_content_charset() or "utf-8"
|
||||
_decode_payload(payload, charset, body_text)
|
||||
elif content_type == "text/html":
|
||||
body_html.append(
|
||||
m.get_payload(decode=True).decode(m.get_content_charset())
|
||||
)
|
||||
payload = msg.get_payload(decode=True)
|
||||
charset = msg.get_content_charset() or "utf-8"
|
||||
_decode_payload(payload, charset, body_html)
|
||||
elif "multipart" in content_type:
|
||||
if m.is_multipart():
|
||||
for part in m.iter_parts():
|
||||
|
||||
Reference in New Issue
Block a user