mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen 
Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
118 lines
3.7 KiB
Python
118 lines
3.7 KiB
Python
#
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import logging
|
|
from email import policy
|
|
from email.parser import BytesParser
|
|
from rag.app.naive import chunk as naive_chunk
|
|
import re
|
|
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
|
|
from deepdoc.parser import HtmlParser, TxtParser
|
|
from timeit import default_timer as timer
|
|
import io
|
|
|
|
|
|
def chunk(
    filename,
    binary=None,
    from_page=0,
    to_page=100000,
    lang="Chinese",
    callback=None,
    **kwargs,
):
    """
    Only eml is supported.

    Parse an e-mail message, chunk its headers and text/HTML bodies, and then
    chunk every attachment with the naive chunker.

    Args:
        filename: Name of the e-mail file. It is recorded as ``docnm_kwd``,
            tokenized (without its extension) for the title fields, and used
            as the path to read when ``binary`` is empty.
        binary: Raw bytes of the message. When falsy, the message is read
            from ``filename`` instead.
        from_page: Unused by this function.
        to_page: Unused by this function.
        lang: ``"english"`` (case-insensitive) switches on the English flag
            handed to ``tokenize_chunks``.
        callback: Progress callback forwarded to the naive chunker for
            attachments.
        **kwargs: ``parser_config`` is read here (``chunk_token_num`` and
            ``delimiter``); all keyword arguments are forwarded to the naive
            chunker for attachments.

    Returns:
        list: The chunks built from the message itself followed by the chunks
        built from its attachments.
    """
    eng = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config",
        {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"},
    )
    # Resolve the token budget once so the HTML parser and the merger agree,
    # and so a custom config without the key does not raise KeyError.
    chunk_token_num = int(parser_config.get("chunk_token_num", 128))
    delimiter = parser_config.get("delimiter", "\n!?。;!?")

    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    main_res = []
    attachment_res = []

    eml_parser = BytesParser(policy=policy.default)
    if binary:
        msg = eml_parser.parse(io.BytesIO(binary))
    else:
        # Close the file handle as soon as the message has been parsed.
        with open(filename, "rb") as eml_file:
            msg = eml_parser.parse(eml_file)

    text_txt, html_txt = [], []
    # get the email header info
    for header, value in msg.items():
        text_txt.append(f"{header}: {value}")

    def _decode_text(part):
        """Decode a text part, tolerating a missing or unknown charset."""
        payload = part.get_payload(decode=True)
        if payload is None:
            return ""
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="replace")
        except LookupError:
            # The declared charset is not a codec Python knows about.
            return payload.decode("utf-8", errors="replace")

    # get the email main info
    def _add_content(part, content_type):
        """Collect text/plain and text/html bodies, descending into multiparts."""
        if content_type == "text/plain":
            text_txt.append(_decode_text(part))
        elif content_type == "text/html":
            html_txt.append(_decode_text(part))
        elif "multipart" in content_type and part.is_multipart():
            for sub_part in part.iter_parts():
                _add_content(sub_part, sub_part.get_content_type())

    _add_content(msg, msg.get_content_type())

    html_lines = HtmlParser.parser_txt("\n".join(html_txt), chunk_token_num=chunk_token_num)
    sections = TxtParser.parser_txt("\n".join(text_txt)) + [(line, "") for line in html_lines if line]

    st = timer()
    chunks = naive_merge(sections, chunk_token_num, delimiter)

    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
    logging.debug("naive_merge(%s): %s", filename, timer() - st)

    # get the attachment info
    for part in msg.iter_attachments():
        # get_content_disposition() returns the lower-cased disposition
        # without parameters, or None when the header is absent.
        if part.get_content_disposition() != "attachment":
            continue
        # Use a separate name so the ``filename`` argument is not overwritten.
        attachment_name = part.get_filename()
        payload = part.get_payload(decode=True)
        try:
            attachment_res.extend(
                naive_chunk(attachment_name, payload, callback=callback, **kwargs)
            )
        except Exception:
            # Attachments are best-effort: record the failure and keep going
            # so one bad attachment does not discard the whole message.
            logging.exception("Failed to chunk attachment %r of %s", attachment_name, filename)

    return main_res + attachment_res
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        """Progress callback that deliberately ignores every update."""
        return None

    # The first command-line argument is the e-mail file to chunk.
    eml_path = sys.argv[1]
    chunk(eml_path, callback=dummy)
|