diff --git a/Dockerfile b/Dockerfile index b16a0d7d5..239330183 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,7 +51,9 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \ apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \ apt install -y libjemalloc-dev && \ apt install -y python3-pip pipx nginx unzip curl wget git vim less && \ - apt install -y ghostscript + apt install -y ghostscript && \ + apt install -y pandoc && \ + apt install -y texlive RUN if [ "$NEED_MIRROR" == "1" ]; then \ pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ diff --git a/agent/canvas.py b/agent/canvas.py index bc7a45e3e..f262cd597 100644 --- a/agent/canvas.py +++ b/agent/canvas.py @@ -408,6 +408,10 @@ class Canvas(Graph): else: yield decorate("message", {"content": cpn_obj.output("content")}) cite = re.search(r"\[ID:[ 0-9]+\]", cpn_obj.output("content")) + + if isinstance(cpn_obj.output("attachment"), tuple): + yield decorate("message", {"attachment": cpn_obj.output("attachment")}) + yield decorate("message_end", {"reference": self.get_reference() if cite else None}) while partials: diff --git a/agent/component/message.py b/agent/component/message.py index 641198083..555534610 100644 --- a/agent/component/message.py +++ b/agent/component/message.py @@ -17,6 +17,9 @@ import json import os import random import re +import pypandoc +import logging +import tempfile from functools import partial from typing import Any @@ -24,7 +27,8 @@ from agent.component.base import ComponentBase, ComponentParamBase from jinja2 import Template as Jinja2Template from common.connection_utils import timeout - +from common.misc_utils import get_uuid +from common import settings class MessageParam(ComponentParamBase): """ @@ -34,6 +38,7 @@ class MessageParam(ComponentParamBase): super().__init__() self.content = [] self.stream = True + self.output_format = None # default output format self.outputs = { "content": { "type": "str" @@ -133,6 +138,7 @@ class Message(ComponentBase): yield rand_cnt[s: ] self.set_output("content", all_content) + self._convert_content(all_content) def _is_jinjia2(self, content:str) -> bool: patt = [ @@ -164,6 +170,68 @@ class Message(ComponentBase): content = re.sub(n, v, content) self.set_output("content", content) + self._convert_content(content) def thoughts(self) -> str: return "" + + def _convert_content(self, content): + doc_id = get_uuid() + + if self._param.output_format.lower() not in {"markdown", "html", "pdf", "docx"}: + self._param.output_format = "markdown" + + try: + if self._param.output_format in {"markdown", "html"}: + if isinstance(content, str): + converted = pypandoc.convert_text( + content, + to=self._param.output_format, + format="markdown", + ) + else: + converted = pypandoc.convert_file( + content, + to=self._param.output_format, + format="markdown", + ) + + binary_content = converted.encode("utf-8") + + else: # pdf, docx + with tempfile.NamedTemporaryFile(suffix=f".{self._param.output_format}", delete=False) as tmp: + tmp_name = tmp.name + + try: + if isinstance(content, str): + pypandoc.convert_text( + content, + to=self._param.output_format, + format="markdown", + outputfile=tmp_name, + ) + else: + pypandoc.convert_file( + content, + to=self._param.output_format, + format="markdown", + outputfile=tmp_name, + ) + + with open(tmp_name, "rb") as f: + binary_content = f.read() + + finally: + if os.path.exists(tmp_name): + os.remove(tmp_name) + + settings.STORAGE_IMPL.put(self._canvas._tenant_id, doc_id, binary_content) + self.set_output("attachment", { + "doc_id":doc_id, + "format":self._param.output_format, + "file_name":f"{doc_id[:8]}.{self._param.output_format}"}) + + logging.info(f"Converted content uploaded as {doc_id} (format={self._param.output_format})") + + except Exception as e: + logging.error(f"Error converting content to {self._param.output_format}: {e}") \ No newline at end of file diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 12c19f978..8cea336de 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -508,6 +508,7 @@ def get(doc_id): ext = ext.group(1) if ext else None if ext: if doc.type == FileType.VISUAL.value: + content_type = CONTENT_TYPE_MAP.get(ext, f"image/{ext}") else: content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}") @@ -517,6 +518,22 @@ def get(doc_id): return server_error_response(e) +@manager.route("/download/", methods=["GET"]) # noqa: F821 +@login_required +def download_attachment(attachment_id): + try: + ext = request.args.get("ext", "markdown") + data = settings.STORAGE_IMPL.get(current_user.id, attachment_id) + # data = settings.STORAGE_IMPL.get("eb500d50bb0411f0907561d2782adda5", attachment_id) + response = flask.make_response(data) + response.headers.set("Content-Type", CONTENT_TYPE_MAP.get(ext, f"application/{ext}")) + + return response + + except Exception as e: + return server_error_response(e) + + @manager.route("/change_parser", methods=["POST"]) # noqa: F821 @login_required @validate_request("doc_id") diff --git a/pyproject.toml b/pyproject.toml index 2ec792b90..c1210dfb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -145,6 +145,7 @@ dependencies = [ "markdownify>=1.2.0", "captcha>=0.7.1", "pip>=25.2", + "pypandoc>=1.16", ] [dependency-groups] diff --git a/uv.lock b/uv.lock index 166b34ce4..474ca510b 100644 --- a/uv.lock +++ b/uv.lock @@ -4892,6 +4892,14 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771, upload-time = "2025-05-17T16:28:29.197Z" }, ] +[[package]] +name = "pypandoc" +version = "1.16" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/24/77/af1fc54740a0712988f9518e629d38edc7b8ffccd7549203f19c3d8a2db6/pypandoc-1.16-py3-none-any.whl", hash = "sha256:868f390d48388743e7a5885915cbbaa005dea36a825ecdfd571f8c523416c822", size = 19425, upload-time = "2025-11-08T15:44:38.429Z" }, +] + [[package]] name = "pyparsing" version = "3.2.3" @@ -5292,6 +5300,7 @@ dependencies = [ { name = "pyicu" }, { name = "pymysql" }, { name = "pyodbc" }, + { name = "pypandoc" }, { name = "pypdf" }, { name = "pypdf2" }, { name = "python-calamine" }, @@ -5447,6 +5456,7 @@ requires-dist = [ { name = "pyicu", specifier = ">=2.15.3,<3.0.0" }, { name = "pymysql", specifier = ">=1.1.1,<2.0.0" }, { name = "pyodbc", specifier = ">=5.2.0,<6.0.0" }, + { name = "pypandoc", specifier = ">=1.16" }, { name = "pypdf", specifier = "==6.0.0" }, { name = "pypdf2", specifier = ">=3.0.1,<4.0.0" }, { name = "python-calamine", specifier = ">=0.4.0" },