From 72c19b44c3fd773935356ec8fcd5668d755bdf52 Mon Sep 17 00:00:00 2001
From: Yongteng Lei
Date: Fri, 11 Jul 2025 18:47:19 +0800
Subject: [PATCH] Refa: better MIME content type (#8801)

### What problem does this PR solve?

Better uniform MIME content type.

### Type of change

- [x] Refactoring
---
 api/apps/document_app.py | 10 +++++----
 api/apps/file_app.py     | 14 ++++++-------
 api/utils/web_utils.py   | 45 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 11 deletions(-)

diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index bc1e5851b..c0d1a6be2 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -42,7 +42,7 @@ from api.utils.api_utils import (
     validate_request,
 )
 from api.utils.file_utils import filename_type, get_project_base_directory, thumbnail
-from api.utils.web_utils import html2pdf, is_valid_url
+from api.utils.web_utils import CONTENT_TYPE_MAP, html2pdf, is_valid_url
 from deepdoc.parser.html_parser import RAGFlowHtmlParser
 from rag.nlp import search
 from rag.utils.storage_factory import STORAGE_IMPL
@@ -505,12 +505,14 @@ def get(doc_id):
         b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
         response = flask.make_response(STORAGE_IMPL.get(b, n))
 
-        ext = re.search(r"\.([^.]+)$", doc.name)
+        ext = re.search(r"\.([^.]+)$", doc.name.lower())
+        ext = ext.group(1) if ext else None
         if ext:
             if doc.type == FileType.VISUAL.value:
-                response.headers.set("Content-Type", "image/%s" % ext.group(1))
+                content_type = CONTENT_TYPE_MAP.get(ext, f"image/{ext}")
             else:
-                response.headers.set("Content-Type", "application/%s" % ext.group(1))
+                content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
+            response.headers.set("Content-Type", content_type)
         return response
     except Exception as e:
         return server_error_response(e)
diff --git a/api/apps/file_app.py b/api/apps/file_app.py
index eabe7bd9b..eeb66f6ec 100644
--- a/api/apps/file_app.py
+++ b/api/apps/file_app.py
@@ -31,6 +31,7 @@ from api.db.services.file_service import FileService
 from api import settings
 from api.utils.api_utils import get_json_result
 from api.utils.file_utils import filename_type
+from api.utils.web_utils import CONTENT_TYPE_MAP
 from rag.utils.storage_factory import STORAGE_IMPL
 
 
@@ -334,15 +335,14 @@ def get(file_id):
         blob = STORAGE_IMPL.get(b, n)
         response = flask.make_response(blob)
 
-        ext = re.search(r"\.([^.]+)$", file.name)
+        ext = re.search(r"\.([^.]+)$", file.name.lower())
+        ext = ext.group(1) if ext else None
         if ext:
             if file.type == FileType.VISUAL.value:
-                response.headers.set('Content-Type', 'image/%s' % ext.group(1))
+                content_type = CONTENT_TYPE_MAP.get(ext, f"image/{ext}")
             else:
-                response.headers.set(
-                    'Content-Type',
-                    'application/%s' %
-                    ext.group(1))
+                content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
+            response.headers.set("Content-Type", content_type)
         return response
     except Exception as e:
         return server_error_response(e)
@@ -373,4 +373,4 @@ def move():
         FileService.move_file(file_ids, parent_id)
         return get_json_result(data=True)
     except Exception as e:
-        return server_error_response(e)
\ No newline at end of file
+        return server_error_response(e)
diff --git a/api/utils/web_utils.py b/api/utils/web_utils.py
index 5b89248d7..7bb25728e 100644
--- a/api/utils/web_utils.py
+++ b/api/utils/web_utils.py
@@ -31,6 +31,51 @@ from selenium.webdriver.support.ui import WebDriverWait
 from webdriver_manager.chrome import ChromeDriverManager
 
 
+CONTENT_TYPE_MAP = {
+    # Office
+    "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    "doc": "application/msword",
+    "pdf": "application/pdf",
+    "csv": "text/csv",
+    "xls": "application/vnd.ms-excel",
+    "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    # Text/code
+    "txt": "text/plain",
+    "py": "text/plain",
+    "js": "text/plain",
+    "java": "text/plain",
+    "c": "text/plain",
+    "cpp": "text/plain",
+    "h": "text/plain",
+    "php": "text/plain",
+    "go": "text/plain",
+    "ts": "text/plain",
+    "sh": "text/plain",
+    "cs": "text/plain",
+    "kt": "text/plain",
+    "sql": "text/plain",
+    # Web
+    "md": "text/markdown",
+    "markdown": "text/markdown",
+    "htm": "text/html",
+    "html": "text/html",
+    "json": "application/json",
+    # Image formats
+    "png": "image/png",
+    "jpg": "image/jpeg",
+    "jpeg": "image/jpeg",
+    "gif": "image/gif",
+    "bmp": "image/bmp",
+    "tiff": "image/tiff",
+    "tif": "image/tiff",
+    "webp": "image/webp",
+    "svg": "image/svg+xml",
+    "ico": "image/x-icon",
+    "avif": "image/avif",
+    "heic": "image/heic",
+}
+
+
 def html2pdf(
     source: str,
     timeout: int = 2,