Add support for .eml file parser (#1768)

### What problem does this PR solve?

Add support for an .eml (email) file parser. #1363

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2026-01-29 22:56:36 +08:00 · 2024-08-06 16:42:14 +08:00
parent b67484e77d
commit ede733e130
12 changed files with 178 additions and 28 deletions
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@@ -652,6 +652,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
            table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
        case "audio":
            audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "email":
+            email.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
        case _:
            return False

--- a/api/db/init.py
+++ b/api/db/init.py
@@ -85,6 +85,7 @@ class ParserType(StrEnum):
    PICTURE = "picture"
    ONE = "one"
    AUDIO = "audio"
+    EMAIL = "email"
    KG = "knowledge_graph"


--- a/api/db/init_data.py
+++ b/api/db/init_data.py
@@ -122,7 +122,7 @@ def init_llm_factory():
    LLMService.filter_delete([LLMService.model.fid == "QAnything"])
    TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
    TenantService.filter_update([1 == 1], {
-        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph"})
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email"})
    ## insert openai two embedding models to the current openai user.
    print("Start to insert 2 OpenAI embedding models...")
    tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
--- a/api/settings.py
+++ b/api/settings.py
@@ -132,7 +132,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 API_KEY = LLM.get("api_key", "")
 PARSERS = LLM.get(
    "parsers",
-    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph")
+    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email")

 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
--- a/api/utils/file_utils.py
+++ b/api/utils/file_utils.py
@@ -156,7 +156,7 @@ def filename_type(filename):
        return FileType.PDF.value

    if re.match(
-             r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
+             r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
        return FileType.DOC.value

    if re.match(