Feat: extract message output to file (#11251)

### What problem does this PR solve?

Feat: extract message output to file

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Billy Bao
2025-11-14 19:52:11 +08:00
committed by GitHub
parent cd55f6c1b8
commit 68e3b33ae4
6 changed files with 104 additions and 2 deletions

View File

@ -51,7 +51,9 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \ apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \
apt install -y libjemalloc-dev && \ apt install -y libjemalloc-dev && \
apt install -y python3-pip pipx nginx unzip curl wget git vim less && \ apt install -y python3-pip pipx nginx unzip curl wget git vim less && \
apt install -y ghostscript apt install -y ghostscript && \
apt install -y pandoc && \
apt install -y texlive
RUN if [ "$NEED_MIRROR" == "1" ]; then \ RUN if [ "$NEED_MIRROR" == "1" ]; then \
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \

View File

@ -408,6 +408,10 @@ class Canvas(Graph):
else: else:
yield decorate("message", {"content": cpn_obj.output("content")}) yield decorate("message", {"content": cpn_obj.output("content")})
cite = re.search(r"\[ID:[ 0-9]+\]", cpn_obj.output("content")) cite = re.search(r"\[ID:[ 0-9]+\]", cpn_obj.output("content"))
if isinstance(cpn_obj.output("attachment"), tuple):
yield decorate("message", {"attachment": cpn_obj.output("attachment")})
yield decorate("message_end", {"reference": self.get_reference() if cite else None}) yield decorate("message_end", {"reference": self.get_reference() if cite else None})
while partials: while partials:

View File

@ -17,6 +17,9 @@ import json
import os import os
import random import random
import re import re
import pypandoc
import logging
import tempfile
from functools import partial from functools import partial
from typing import Any from typing import Any
@ -24,7 +27,8 @@ from agent.component.base import ComponentBase, ComponentParamBase
from jinja2 import Template as Jinja2Template from jinja2 import Template as Jinja2Template
from common.connection_utils import timeout from common.connection_utils import timeout
from common.misc_utils import get_uuid
from common import settings
class MessageParam(ComponentParamBase): class MessageParam(ComponentParamBase):
""" """
@ -34,6 +38,7 @@ class MessageParam(ComponentParamBase):
super().__init__() super().__init__()
self.content = [] self.content = []
self.stream = True self.stream = True
self.output_format = None # default output format
self.outputs = { self.outputs = {
"content": { "content": {
"type": "str" "type": "str"
@ -133,6 +138,7 @@ class Message(ComponentBase):
yield rand_cnt[s: ] yield rand_cnt[s: ]
self.set_output("content", all_content) self.set_output("content", all_content)
self._convert_content(all_content)
def _is_jinjia2(self, content:str) -> bool: def _is_jinjia2(self, content:str) -> bool:
patt = [ patt = [
@ -164,6 +170,68 @@ class Message(ComponentBase):
content = re.sub(n, v, content) content = re.sub(n, v, content)
self.set_output("content", content) self.set_output("content", content)
self._convert_content(content)
def thoughts(self) -> str: def thoughts(self) -> str:
return "" return ""
def _convert_content(self, content):
doc_id = get_uuid()
if self._param.output_format.lower() not in {"markdown", "html", "pdf", "docx"}:
self._param.output_format = "markdown"
try:
if self._param.output_format in {"markdown", "html"}:
if isinstance(content, str):
converted = pypandoc.convert_text(
content,
to=self._param.output_format,
format="markdown",
)
else:
converted = pypandoc.convert_file(
content,
to=self._param.output_format,
format="markdown",
)
binary_content = converted.encode("utf-8")
else: # pdf, docx
with tempfile.NamedTemporaryFile(suffix=f".{self._param.output_format}", delete=False) as tmp:
tmp_name = tmp.name
try:
if isinstance(content, str):
pypandoc.convert_text(
content,
to=self._param.output_format,
format="markdown",
outputfile=tmp_name,
)
else:
pypandoc.convert_file(
content,
to=self._param.output_format,
format="markdown",
outputfile=tmp_name,
)
with open(tmp_name, "rb") as f:
binary_content = f.read()
finally:
if os.path.exists(tmp_name):
os.remove(tmp_name)
settings.STORAGE_IMPL.put(self._canvas._tenant_id, doc_id, binary_content)
self.set_output("attachment", {
"doc_id":doc_id,
"format":self._param.output_format,
"file_name":f"{doc_id[:8]}.{self._param.output_format}"})
logging.info(f"Converted content uploaded as {doc_id} (format={self._param.output_format})")
except Exception as e:
logging.error(f"Error converting content to {self._param.output_format}: {e}")

View File

@ -508,6 +508,7 @@ def get(doc_id):
ext = ext.group(1) if ext else None ext = ext.group(1) if ext else None
if ext: if ext:
if doc.type == FileType.VISUAL.value: if doc.type == FileType.VISUAL.value:
content_type = CONTENT_TYPE_MAP.get(ext, f"image/{ext}") content_type = CONTENT_TYPE_MAP.get(ext, f"image/{ext}")
else: else:
content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}") content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
@ -517,6 +518,22 @@ def get(doc_id):
return server_error_response(e) return server_error_response(e)
@manager.route("/download/<attachment_id>", methods=["GET"]) # noqa: F821
@login_required
def download_attachment(attachment_id):
try:
ext = request.args.get("ext", "markdown")
data = settings.STORAGE_IMPL.get(current_user.id, attachment_id)
# data = settings.STORAGE_IMPL.get("eb500d50bb0411f0907561d2782adda5", attachment_id)
response = flask.make_response(data)
response.headers.set("Content-Type", CONTENT_TYPE_MAP.get(ext, f"application/{ext}"))
return response
except Exception as e:
return server_error_response(e)
@manager.route("/change_parser", methods=["POST"]) # noqa: F821 @manager.route("/change_parser", methods=["POST"]) # noqa: F821
@login_required @login_required
@validate_request("doc_id") @validate_request("doc_id")

View File

@ -145,6 +145,7 @@ dependencies = [
"markdownify>=1.2.0", "markdownify>=1.2.0",
"captcha>=0.7.1", "captcha>=0.7.1",
"pip>=25.2", "pip>=25.2",
"pypandoc>=1.16",
] ]
[dependency-groups] [dependency-groups]

10
uv.lock generated
View File

@ -4892,6 +4892,14 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771, upload-time = "2025-05-17T16:28:29.197Z" }, { url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771, upload-time = "2025-05-17T16:28:29.197Z" },
] ]
[[package]]
name = "pypandoc"
version = "1.16"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/24/77/af1fc54740a0712988f9518e629d38edc7b8ffccd7549203f19c3d8a2db6/pypandoc-1.16-py3-none-any.whl", hash = "sha256:868f390d48388743e7a5885915cbbaa005dea36a825ecdfd571f8c523416c822", size = 19425, upload-time = "2025-11-08T15:44:38.429Z" },
]
[[package]] [[package]]
name = "pyparsing" name = "pyparsing"
version = "3.2.3" version = "3.2.3"
@ -5292,6 +5300,7 @@ dependencies = [
{ name = "pyicu" }, { name = "pyicu" },
{ name = "pymysql" }, { name = "pymysql" },
{ name = "pyodbc" }, { name = "pyodbc" },
{ name = "pypandoc" },
{ name = "pypdf" }, { name = "pypdf" },
{ name = "pypdf2" }, { name = "pypdf2" },
{ name = "python-calamine" }, { name = "python-calamine" },
@ -5447,6 +5456,7 @@ requires-dist = [
{ name = "pyicu", specifier = ">=2.15.3,<3.0.0" }, { name = "pyicu", specifier = ">=2.15.3,<3.0.0" },
{ name = "pymysql", specifier = ">=1.1.1,<2.0.0" }, { name = "pymysql", specifier = ">=1.1.1,<2.0.0" },
{ name = "pyodbc", specifier = ">=5.2.0,<6.0.0" }, { name = "pyodbc", specifier = ">=5.2.0,<6.0.0" },
{ name = "pypandoc", specifier = ">=1.16" },
{ name = "pypdf", specifier = "==6.0.0" }, { name = "pypdf", specifier = "==6.0.0" },
{ name = "pypdf2", specifier = ">=3.0.1,<4.0.0" }, { name = "pypdf2", specifier = ">=3.0.1,<4.0.0" },
{ name = "python-calamine", specifier = ">=0.4.0" }, { name = "python-calamine", specifier = ">=0.4.0" },