Feat: extract message output to file (#11251)

### What problem does this PR solve?

Feat: extract message output to file

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-02-04 17:45:07 +08:00 · 2025-11-14 19:52:11 +08:00
parent cd55f6c1b8
commit 68e3b33ae4
6 changed files with 104 additions and 2 deletions
--- a/4
+++ b/4
@ -51,7 +51,9 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
    apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \
    apt install -y libjemalloc-dev && \
    apt install -y python3-pip pipx nginx unzip curl wget git vim less && \
-    apt install -y ghostscript
+    apt install -y ghostscript && \
    apt install -y pandoc && \
    apt install -y texlive
 RUN if [ "$NEED_MIRROR" == "1" ]; then \
        pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
--- a/agent/canvas.py
+++ b/agent/canvas.py
@ -408,6 +408,10 @@ class Canvas(Graph):
                    else:
                        yield decorate("message", {"content": cpn_obj.output("content")})
                        cite = re.search(r"\[ID:[ 0-9]+\]",  cpn_obj.output("content"))
                    if isinstance(cpn_obj.output("attachment"), tuple):
                        yield decorate("message", {"attachment": cpn_obj.output("attachment")})
                    yield decorate("message_end", {"reference": self.get_reference() if cite else None})
                    while partials:
--- a/agent/component/message.py
+++ b/agent/component/message.py
@ -17,6 +17,9 @@ import json
 import os
 import random
 import re
 import pypandoc
 import logging
 import tempfile
 from functools import partial
 from typing import Any
@ -24,7 +27,8 @@ from agent.component.base import ComponentBase, ComponentParamBase
 from jinja2 import Template as Jinja2Template
 from common.connection_utils import timeout
-
+from common.misc_utils import get_uuid
 from common import settings
 class MessageParam(ComponentParamBase):
    """
@ -34,6 +38,7 @@ class MessageParam(ComponentParamBase):
        super().__init__()
        self.content = []
        self.stream = True
        self.output_format = None  # default output format
        self.outputs = {
            "content": {
                "type": "str"
@ -133,6 +138,7 @@ class Message(ComponentBase):
            yield rand_cnt[s: ]
        self.set_output("content", all_content)
        self._convert_content(all_content)
    def _is_jinjia2(self, content:str) -> bool:
        patt = [
@ -164,6 +170,68 @@ class Message(ComponentBase):
            content = re.sub(n, v, content)
        self.set_output("content", content)
        self._convert_content(content)
    def thoughts(self) -> str:
        """Return the thought text for this component, which is always empty."""
        no_thoughts = ""
        return no_thoughts
    def _convert_content(self, content):
        doc_id = get_uuid()
        if self._param.output_format.lower() not in {"markdown", "html", "pdf", "docx"}:
            self._param.output_format = "markdown"
        try:
            if self._param.output_format in {"markdown", "html"}:
                if isinstance(content, str):
                    converted = pypandoc.convert_text(
                        content,
                        to=self._param.output_format,
                        format="markdown",
                    )
                else:
                    converted = pypandoc.convert_file(
                        content,
                        to=self._param.output_format,
                        format="markdown",
                    )
                binary_content = converted.encode("utf-8")
            else:  # pdf, docx
                with tempfile.NamedTemporaryFile(suffix=f".{self._param.output_format}", delete=False) as tmp:
                    tmp_name = tmp.name
                try:
                    if isinstance(content, str):
                        pypandoc.convert_text(
                            content,
                            to=self._param.output_format,
                            format="markdown",
                            outputfile=tmp_name,
                        )
                    else:
                        pypandoc.convert_file(
                            content,
                            to=self._param.output_format,
                            format="markdown",
                            outputfile=tmp_name,
                        )
                    with open(tmp_name, "rb") as f:
                        binary_content = f.read()
                finally:
                    if os.path.exists(tmp_name):
                        os.remove(tmp_name)
            settings.STORAGE_IMPL.put(self._canvas._tenant_id, doc_id, binary_content)
            self.set_output("attachment", {
                "doc_id":doc_id, 
                "format":self._param.output_format, 
                "file_name":f"{doc_id[:8]}.{self._param.output_format}"})
            logging.info(f"Converted content uploaded as {doc_id} (format={self._param.output_format})")
        except Exception as e:
            logging.error(f"Error converting content to {self._param.output_format}: {e}")
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@ -508,6 +508,7 @@ def get(doc_id):
        ext = ext.group(1) if ext else None
        if ext:
            if doc.type == FileType.VISUAL.value:
                content_type = CONTENT_TYPE_MAP.get(ext, f"image/{ext}")
            else:
                content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
@ -517,6 +518,22 @@ def get(doc_id):
        return server_error_response(e)
@manager.route("/download/<attachment_id>", methods=["GET"])  # noqa: F821
@login_required
def download_attachment(attachment_id):
    """Serve a stored attachment to the logged-in user.

    The object is fetched from ``settings.STORAGE_IMPL`` using the current
    user's id and ``attachment_id``. The ``ext`` query parameter (default
    ``"markdown"``) selects the ``Content-Type``: the value from
    ``CONTENT_TYPE_MAP`` when present, otherwise ``application/<ext>``.

    Any exception is turned into the result of ``server_error_response``.
    """
    try:
        extension = request.args.get("ext", "markdown")
        payload = settings.STORAGE_IMPL.get(current_user.id, attachment_id)
        resp = flask.make_response(payload)
        mime_type = CONTENT_TYPE_MAP.get(extension, f"application/{extension}")
        resp.headers.set("Content-Type", mime_type)
        return resp
    except Exception as err:
        return server_error_response(err)
@manager.route("/change_parser", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_id")
--- a/pyproject.toml
+++ b/pyproject.toml
@ -145,6 +145,7 @@ dependencies = [
    "markdownify>=1.2.0",
    "captcha>=0.7.1",
    "pip>=25.2",
    "pypandoc>=1.16",
 ]
 [dependency-groups]
--- a/uv.lock
+++ b/uv.lock
@ -4892,6 +4892,14 @@ wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771, upload-time = "2025-05-17T16:28:29.197Z" },
 ]
 [[package]]
 name = "pypandoc"
 version = "1.16"
 source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
 wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/24/77/af1fc54740a0712988f9518e629d38edc7b8ffccd7549203f19c3d8a2db6/pypandoc-1.16-py3-none-any.whl", hash = "sha256:868f390d48388743e7a5885915cbbaa005dea36a825ecdfd571f8c523416c822", size = 19425, upload-time = "2025-11-08T15:44:38.429Z" },
 ]
 [[package]]
 name = "pyparsing"
 version = "3.2.3"
@ -5292,6 +5300,7 @@ dependencies = [
    { name = "pyicu" },
    { name = "pymysql" },
    { name = "pyodbc" },
    { name = "pypandoc" },
    { name = "pypdf" },
    { name = "pypdf2" },
    { name = "python-calamine" },
@ -5447,6 +5456,7 @@ requires-dist = [
    { name = "pyicu", specifier = ">=2.15.3,<3.0.0" },
    { name = "pymysql", specifier = ">=1.1.1,<2.0.0" },
    { name = "pyodbc", specifier = ">=5.2.0,<6.0.0" },
    { name = "pypandoc", specifier = ">=1.16" },
    { name = "pypdf", specifier = "==6.0.0" },
    { name = "pypdf2", specifier = ">=3.0.1,<4.0.0" },
    { name = "python-calamine", specifier = ">=0.4.0" },