add support for eml file parser (#1768)

### What problem does this PR solve?

Add support for eml file parser #1363

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2026-01-23 03:26:53 +08:00 · 2024-08-06 16:42:14 +08:00
parent b67484e77d
commit ede733e130
12 changed files with 178 additions and 28 deletions
--- a/deepdoc/parser/__init__.py
+++ b/deepdoc/parser/__init__.py
@@ -17,4 +17,5 @@ from .excel_parser import RAGFlowExcelParser as ExcelParser
 from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
 from .json_parser import RAGFlowJsonParser as JsonParser
-from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .txt_parser import RAGFlowTxtParser as TxtParser
--- a/deepdoc/parser/html_parser.py
+++ b/deepdoc/parser/html_parser.py
@@ -30,10 +30,15 @@ class RAGFlowHtmlParser:
        else:
            with open(fnm, "r",encoding=get_encoding(fnm)) as f:
                txt = f.read()
+        return self.parser_txt(txt)

+    @classmethod
+    def parser_txt(cls, txt):
+        if type(txt) != str:
+            raise TypeError("txt type should be str!")
        html_doc = readability.Document(txt)
        title = html_doc.title()
        content = html_text.extract_text(html_doc.summary(html_partial=True))
-        txt = f'{title}\n{content}'
+        txt = f"{title}\n{content}"
        sections = txt.split("\n")
        return sections
--- a/deepdoc/parser/txt_parser.py
+++ b/deepdoc/parser/txt_parser.py
@@ -0,0 +1,42 @@
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from rag.nlp import find_codec,num_tokens_from_string
+
+class RAGFlowTxtParser:
+    """Turn plain text into ``(line, "")`` section tuples, one per line."""
+
+    def __call__(self, fnm, binary=None, chunk_token_num=128):
+        """Parse ``binary`` if it is truthy, else the text file at ``fnm``."""
+        if binary:
+            txt = binary.decode(find_codec(binary), errors="ignore")
+        else:
+            with open(fnm, "r") as f:
+                txt = f.read()
+        return self.parser_txt(txt, chunk_token_num)
+
+    @classmethod
+    def parser_txt(cls, txt, chunk_token_num=128):
+        """Return a ``(text, "")`` tuple per line; raise TypeError for non-str.
+
+        A line above ``10 * chunk_token_num`` tokens is cut once at its midpoint.
+        """
+        if not isinstance(txt, str):
+            raise TypeError("txt type should be str!")
+        limit = 10 * int(chunk_token_num)
+        sections = []
+        for sec in txt.split("\n"):
+            mid = len(sec) // 2
+            parts = (sec[:mid], sec[mid:]) if num_tokens_from_string(sec) > limit else (sec,)
+            sections.extend((part, "") for part in parts)
+        return sections