From 72c19b44c3fd773935356ec8fcd5668d755bdf52 Mon Sep 17 00:00:00 2001
From: Yongteng Lei <yongtengrey@outlook.com>
Date: Fri, 11 Jul 2025 18:47:19 +0800
Subject: [PATCH] Refa: better MIME content type (#8801)

### What problem does this PR solve?

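Unify how the `Content-Type` response header is chosen when serving stored
documents and files. Both `get` endpoints now lowercase the file extension and
look it up in a shared `CONTENT_TYPE_MAP` (added to `api/utils/web_utils.py`),
falling back to `image/<ext>` for visual files or `application/<ext>` otherwise
when the extension is not in the map.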

### Type of change

- [x] Refactoring
---
 api/apps/document_app.py | 10 +++++----
 api/apps/file_app.py     | 14 ++++++-------
 api/utils/web_utils.py   | 45 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 11 deletions(-)

diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index bc1e5851b..c0d1a6be2 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -42,7 +42,7 @@ from api.utils.api_utils import (
     validate_request,
 )
 from api.utils.file_utils import filename_type, get_project_base_directory, thumbnail
-from api.utils.web_utils import html2pdf, is_valid_url
+from api.utils.web_utils import CONTENT_TYPE_MAP, html2pdf, is_valid_url
 from deepdoc.parser.html_parser import RAGFlowHtmlParser
 from rag.nlp import search
 from rag.utils.storage_factory import STORAGE_IMPL
@@ -505,12 +505,14 @@ def get(doc_id):
         b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
         response = flask.make_response(STORAGE_IMPL.get(b, n))
 
-        ext = re.search(r"\.([^.]+)$", doc.name)
+        ext = re.search(r"\.([^.]+)$", doc.name.lower())
+        ext = ext.group(1) if ext else None
         if ext:
             if doc.type == FileType.VISUAL.value:
-                response.headers.set("Content-Type", "image/%s" % ext.group(1))
+                content_type = CONTENT_TYPE_MAP.get(ext, f"image/{ext}")
             else:
-                response.headers.set("Content-Type", "application/%s" % ext.group(1))
+                content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
+            response.headers.set("Content-Type", content_type)
         return response
     except Exception as e:
         return server_error_response(e)
diff --git a/api/apps/file_app.py b/api/apps/file_app.py
index eabe7bd9b..eeb66f6ec 100644
--- a/api/apps/file_app.py
+++ b/api/apps/file_app.py
@@ -31,6 +31,7 @@ from api.db.services.file_service import FileService
 from api import settings
 from api.utils.api_utils import get_json_result
 from api.utils.file_utils import filename_type
+from api.utils.web_utils import CONTENT_TYPE_MAP
 from rag.utils.storage_factory import STORAGE_IMPL
 
 
@@ -334,15 +335,14 @@ def get(file_id):
             blob = STORAGE_IMPL.get(b, n)
 
         response = flask.make_response(blob)
-        ext = re.search(r"\.([^.]+)$", file.name)
+        ext = re.search(r"\.([^.]+)$", file.name.lower())
+        ext = ext.group(1) if ext else None
         if ext:
             if file.type == FileType.VISUAL.value:
-                response.headers.set('Content-Type', 'image/%s' % ext.group(1))
+                content_type = CONTENT_TYPE_MAP.get(ext, f"image/{ext}")
             else:
-                response.headers.set(
-                    'Content-Type',
-                    'application/%s' %
-                    ext.group(1))
+                content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
+            response.headers.set("Content-Type", content_type)
         return response
     except Exception as e:
         return server_error_response(e)
@@ -373,4 +373,4 @@ def move():
         FileService.move_file(file_ids, parent_id)
         return get_json_result(data=True)
     except Exception as e:
-        return server_error_response(e)
\ No newline at end of file
+        return server_error_response(e)
diff --git a/api/utils/web_utils.py b/api/utils/web_utils.py
index 5b89248d7..7bb25728e 100644
--- a/api/utils/web_utils.py
+++ b/api/utils/web_utils.py
@@ -31,6 +31,51 @@ from selenium.webdriver.support.ui import WebDriverWait
 from webdriver_manager.chrome import ChromeDriverManager
 
 
+CONTENT_TYPE_MAP = {
+    # Office and document formats
+    "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    "doc": "application/msword",
+    "pdf": "application/pdf",
+    "csv": "text/csv",
+    "xls": "application/vnd.ms-excel",
+    "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    # Text/code
+    "txt": "text/plain",
+    "py": "text/plain",
+    "js": "text/plain",
+    "java": "text/plain",
+    "c": "text/plain",
+    "cpp": "text/plain",
+    "h": "text/plain",
+    "php": "text/plain",
+    "go": "text/plain",
+    "ts": "text/plain",
+    "sh": "text/plain",
+    "cs": "text/plain",
+    "kt": "text/plain",
+    "sql": "text/plain",
+    # Web
+    "md": "text/markdown",
+    "markdown": "text/markdown",
+    "htm": "text/html",
+    "html": "text/html",
+    "json": "application/json",
+    # Image formats
+    "png": "image/png",
+    "jpg": "image/jpeg",
+    "jpeg": "image/jpeg",
+    "gif": "image/gif",
+    "bmp": "image/bmp",
+    "tiff": "image/tiff",
+    "tif": "image/tiff",
+    "webp": "image/webp",
+    "svg": "image/svg+xml",
+    "ico": "image/x-icon",
+    "avif": "image/avif",
+    "heic": "image/heic",
+}
+
+
 def html2pdf(
     source: str,
     timeout: int = 2,