Feat: extract message output to file (#11251)

### What problem does this PR solve?

Feat: extract message output to file

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Billy Bao
2025-11-14 19:52:11 +08:00
committed by GitHub
parent cd55f6c1b8
commit 68e3b33ae4
6 changed files with 104 additions and 2 deletions

View File

@ -51,7 +51,9 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \
apt install -y libjemalloc-dev && \
apt install -y python3-pip pipx nginx unzip curl wget git vim less && \
apt install -y ghostscript
apt install -y ghostscript && \
apt install -y pandoc && \
apt install -y texlive
RUN if [ "$NEED_MIRROR" == "1" ]; then \
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \

View File

@ -408,6 +408,10 @@ class Canvas(Graph):
else:
yield decorate("message", {"content": cpn_obj.output("content")})
cite = re.search(r"\[ID:[ 0-9]+\]", cpn_obj.output("content"))
if isinstance(cpn_obj.output("attachment"), tuple):
yield decorate("message", {"attachment": cpn_obj.output("attachment")})
yield decorate("message_end", {"reference": self.get_reference() if cite else None})
while partials:

View File

@ -17,6 +17,9 @@ import json
import os
import random
import re
import pypandoc
import logging
import tempfile
from functools import partial
from typing import Any
@ -24,7 +27,8 @@ from agent.component.base import ComponentBase, ComponentParamBase
from jinja2 import Template as Jinja2Template
from common.connection_utils import timeout
from common.misc_utils import get_uuid
from common import settings
class MessageParam(ComponentParamBase):
"""
@ -34,6 +38,7 @@ class MessageParam(ComponentParamBase):
super().__init__()
self.content = []
self.stream = True
self.output_format = None # default output format
self.outputs = {
"content": {
"type": "str"
@ -133,6 +138,7 @@ class Message(ComponentBase):
yield rand_cnt[s: ]
self.set_output("content", all_content)
self._convert_content(all_content)
def _is_jinjia2(self, content:str) -> bool:
patt = [
@ -164,6 +170,68 @@ class Message(ComponentBase):
content = re.sub(n, v, content)
self.set_output("content", content)
self._convert_content(content)
def thoughts(self) -> str:
return ""
def _convert_content(self, content):
    """Render the message content with pandoc and store it as an attachment.

    The content is treated as Markdown and converted to the format named by
    ``self._param.output_format`` ("markdown", "html", "pdf" or "docx",
    compared case-insensitively). The resulting bytes are written to
    ``settings.STORAGE_IMPL`` under the canvas tenant id, and the
    "attachment" output is set to a dict with ``doc_id``, ``format`` and
    ``file_name``.

    Args:
        content: Markdown text (``str``). Any other value is handed to
            ``pypandoc.convert_file`` as a source path.

    Returns:
        None. Conversion/storage failures are logged, not raised, so a
        failed export never breaks delivery of the message itself.
    """
    # ``output_format`` defaults to None; calling ``.lower()`` on it used to
    # raise AttributeError before the try block was even entered. No format
    # configured means no file export was requested.
    if not self._param.output_format:
        return

    output_format = str(self._param.output_format).lower()
    if output_format not in {"markdown", "html", "pdf", "docx"}:
        # Unknown format: fall back to markdown (and persist the fallback on
        # the param, as the original implementation did).
        output_format = "markdown"
        self._param.output_format = output_format

    doc_id = get_uuid()
    # Strings are converted in-memory; anything else is treated as a path.
    convert = pypandoc.convert_text if isinstance(content, str) else pypandoc.convert_file

    try:
        if output_format in {"markdown", "html"}:
            # Text targets: pandoc returns the converted document as a str.
            converted = convert(content, to=output_format, format="markdown")
            binary_content = converted.encode("utf-8")
        else:  # pdf, docx
            # Binary targets must be written by pandoc to a real file.
            # delete=False + closing the handle first lets pandoc reopen the
            # path on every platform; we remove the file ourselves below.
            with tempfile.NamedTemporaryFile(suffix=f".{output_format}", delete=False) as tmp:
                tmp_name = tmp.name
            try:
                convert(content, to=output_format, format="markdown", outputfile=tmp_name)
                with open(tmp_name, "rb") as f:
                    binary_content = f.read()
            finally:
                if os.path.exists(tmp_name):
                    os.remove(tmp_name)

        settings.STORAGE_IMPL.put(self._canvas._tenant_id, doc_id, binary_content)
        self.set_output("attachment", {
            "doc_id": doc_id,
            "format": output_format,
            "file_name": f"{doc_id[:8]}.{output_format}",
        })
        logging.info(f"Converted content uploaded as {doc_id} (format={output_format})")
    except Exception as e:
        # Best-effort export: keep the original message, but log with the
        # traceback so pandoc / storage failures are diagnosable.
        logging.exception(f"Error converting content to {output_format}: {e}")

View File

@ -508,6 +508,7 @@ def get(doc_id):
ext = ext.group(1) if ext else None
if ext:
if doc.type == FileType.VISUAL.value:
content_type = CONTENT_TYPE_MAP.get(ext, f"image/{ext}")
else:
content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
@ -517,6 +518,22 @@ def get(doc_id):
return server_error_response(e)
@manager.route("/download/<attachment_id>", methods=["GET"]) # noqa: F821
@login_required
def download_attachment(attachment_id):
    """Return a stored message attachment as the HTTP response body.

    Args:
        attachment_id: Storage key of the attachment, taken from the URL.

    Query parameters:
        ext: Format/extension used to pick the Content-Type; defaults to
            "markdown". Values that are not plain ASCII alphanumerics are
            replaced by the default.

    Returns:
        A Flask response carrying the stored bytes, or the result of
        ``server_error_response`` if anything raises.
    """
    try:
        ext = request.args.get("ext", "markdown")
        # ``ext`` is caller-controlled and ends up inside a response header
        # (f"application/{ext}"), so restrict it to a simple token.
        if not (ext.isascii() and ext.isalnum()):
            ext = "markdown"
        # NOTE(review): the bucket here is current_user.id, while the writer
        # stores under the canvas tenant id — presumably the same value;
        # confirm for team/shared canvases.
        data = settings.STORAGE_IMPL.get(current_user.id, attachment_id)
        response = flask.make_response(data)
        response.headers.set("Content-Type", CONTENT_TYPE_MAP.get(ext, f"application/{ext}"))
        return response
    except Exception as e:
        return server_error_response(e)
@manager.route("/change_parser", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id")

View File

@ -145,6 +145,7 @@ dependencies = [
"markdownify>=1.2.0",
"captcha>=0.7.1",
"pip>=25.2",
"pypandoc>=1.16",
]
[dependency-groups]

10
uv.lock generated
View File

@ -4892,6 +4892,14 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771, upload-time = "2025-05-17T16:28:29.197Z" },
]
[[package]]
name = "pypandoc"
version = "1.16"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/24/77/af1fc54740a0712988f9518e629d38edc7b8ffccd7549203f19c3d8a2db6/pypandoc-1.16-py3-none-any.whl", hash = "sha256:868f390d48388743e7a5885915cbbaa005dea36a825ecdfd571f8c523416c822", size = 19425, upload-time = "2025-11-08T15:44:38.429Z" },
]
[[package]]
name = "pyparsing"
version = "3.2.3"
@ -5292,6 +5300,7 @@ dependencies = [
{ name = "pyicu" },
{ name = "pymysql" },
{ name = "pyodbc" },
{ name = "pypandoc" },
{ name = "pypdf" },
{ name = "pypdf2" },
{ name = "python-calamine" },
@ -5447,6 +5456,7 @@ requires-dist = [
{ name = "pyicu", specifier = ">=2.15.3,<3.0.0" },
{ name = "pymysql", specifier = ">=1.1.1,<2.0.0" },
{ name = "pyodbc", specifier = ">=5.2.0,<6.0.0" },
{ name = "pypandoc", specifier = ">=1.16" },
{ name = "pypdf", specifier = "==6.0.0" },
{ name = "pypdf2", specifier = ">=3.0.1,<4.0.0" },
{ name = "python-calamine", specifier = ">=0.4.0" },