Compare commits

...

4 Commits

8de6b97806 Feature (canvas): Add Api for download "message" component output's file (#11772)
### What problem does this PR solve?

- Add an API for downloading the "message" component output's file.
- Change the attachment output type check from tuple to dictionary, because 'attachment' is not an instance of tuple.
- Update the message type to `message_end` to avoid the problem where the content does not carry an error message when the message type is `ans["data"]["content"]`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
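
A minimal sketch of calling the new download endpoint from a client (the route and the `ext` query parameter come from the diff below; the base URL, API-version prefix, and auth header are assumptions):

```python
import requests

BASE_URL = "http://localhost:9380/api/v1"  # assumption: default RAGFlow server and prefix
API_KEY = "ragflow-..."                    # assumption: a valid API token
ATTACHMENT_ID = "<attachment id from a message_end event>"

resp = requests.get(
    f"{BASE_URL}/file/download/{ATTACHMENT_ID}",
    params={"ext": "markdown"},  # selects the Content-Type via CONTENT_TYPE_MAP
    headers={"Authorization": f"Bearer {API_KEY}"},
    timeout=30,
)
resp.raise_for_status()
with open("attachment.md", "wb") as f:
    f.write(resp.content)
```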
2025-12-05 19:42:35 +08:00
e4e0a88053 Feat: Fillup component return value not object (#11780)
### What problem does this PR solve?

Make the Fillup component output the plain value rather than the wrapping object.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
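
To illustrate the change, a small sketch of the new unwrapping logic (mirroring the diff below; the input shape is inferred, and the `FileService` lookup is replaced by a stand-in):

```python
def unwrap_input(v):
    # File-typed inputs resolve to file records (stand-in below); everything
    # else now yields the raw value instead of the wrapping object.
    if isinstance(v, dict) and v.get("type", "").lower().find("file") >= 0:
        if v.get("optional") and v.get("value") is None:
            return None
        return [f"<file record for {v['value']}>"]  # stand-in for FileService.get_files
    return v.get("value")

# Before this PR the output was the whole object {"type": ..., "value": 42}; now it is 42.
assert unwrap_input({"type": "integer", "value": 42}) == 42
```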
2025-12-05 19:27:36 +08:00
7719fd6350 Fix MinerU API sanitized-output lookup and manual chunk tuple handling (#11702)
### What problem does this PR solve?

This PR addresses **two independent issues** encountered when using the
MinerU engine in Ragflow:

1. **MinerU API output path mismatch for non-ASCII filenames**
MinerU sanitizes the root directory name inside the returned ZIP when
the original filename contains non-ASCII characters (e.g., Chinese).
Ragflow's client-side unzip logic assumed the original filename stem and
therefore failed to locate `_content_list.json`.
   This PR adds:

   * root-directory detection
   * fallback lookup using sanitized names
   * a broadened `_read_output` search with a glob fallback

   ensuring output files are consistently located regardless of filename encoding (a minimal sketch of this lookup strategy follows the list).

2. **Chunker crash due to tuple-structure mismatch in manual mode**
Some parsers (e.g., MinerU / Docling) return **2-tuple sections**, but
Ragflow’s chunker expects **3-tuple sections**, leading to:
   `ValueError: not enough values to unpack (expected 3, got 2)`

   This PR normalizes all sections to a uniform structure `(text, layout, positions)`:

   * parse position tags when present
   * default to empty positions when missing

   preserving backward compatibility and preventing crashes.
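
For the first fix, a minimal sketch of the sanitized-output lookup described in item 1 (illustrative only: the directory layout and the helper name `find_content_list` are assumptions, not Ragflow's exact code):

```python
from pathlib import Path

def find_content_list(extract_dir: str, original_stem: str) -> Path | None:
    """Locate MinerU's <stem>_content_list.json inside the unzipped output,
    tolerating a sanitized root directory for non-ASCII filenames."""
    root = Path(extract_dir)
    # Fast path: the original filename stem survived sanitization.
    exact = root / original_stem / f"{original_stem}_content_list.json"
    if exact.exists():
        return exact
    # Root-directory detection: MinerU produced some (possibly renamed) root dir.
    subdirs = [p for p in root.iterdir() if p.is_dir()]
    if len(subdirs) == 1:
        candidates = sorted(subdirs[0].glob("*_content_list.json"))
        if candidates:
            return candidates[0]
    # Broadened search: glob anywhere under the extraction directory.
    candidates = sorted(root.glob("**/*_content_list.json"))
    return candidates[0] if candidates else None
```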

### Type of change

* [x] Bug Fix (non-breaking change which fixes an issue)


[#11136](https://github.com/infiniflow/ragflow/issues/11136)
[#11700](https://github.com/infiniflow/ragflow/issues/11700)
[#11620](https://github.com/infiniflow/ragflow/issues/11620)
[#11701](https://github.com/infiniflow/ragflow/pull/11701)

We need your help, [yongtenglei](https://github.com/yongtenglei).

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2025-12-05 19:25:45 +08:00
15ef6dd72f fix(mcp-server): Ensure all document meta-data is cached (#11767)
### What problem does this PR solve?

The document metadata cache is built using the list-documents endpoint
with default pagination parameters of page=1, page_size=30. This means
that when using the MCP server to search a dataset, only chunks which
come from the first 30 documents in the dataset will have metadata
returned.

Issue described in more detail here:
https://github.com/infiniflow/ragflow/issues/11533
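
The fix drains every page instead of relying on the default page size; a compact sketch of the pattern (with a hypothetical `get_page` helper returning `(docs, total)` for a 1-based page number):

```python
def fetch_all_documents(get_page, page_size=30):
    # Keep requesting pages until page * page_size covers the reported total.
    docs, page = [], 1
    while True:
        batch, total = get_page(page, page_size)
        docs.extend(batch)
        if page * page_size >= total:
            return docs
        page += 1
```

With total=100 and page_size=30 this fetches pages 1 through 4 rather than stopping after the first 30 documents.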

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: Giles Lloyd <giles.af.lloyd@gmail.com>
2025-12-05 19:13:17 +08:00
5 changed files with 77 additions and 52 deletions

View File

@@ -534,10 +534,12 @@ class Canvas(Graph):
                 yield decorate("message", {"content": cpn_obj.output("content")})
                 cite = re.search(r"\[ID:[ 0-9]+\]", cpn_obj.output("content"))
-            if isinstance(cpn_obj.output("attachment"), tuple):
-                yield decorate("message", {"attachment": cpn_obj.output("attachment")})
-            yield decorate("message_end", {"reference": self.get_reference() if cite else None})
+            message_end = {}
+            if isinstance(cpn_obj.output("attachment"), dict):
+                message_end["attachment"] = cpn_obj.output("attachment")
+            if cite:
+                message_end["reference"] = self.get_reference()
+            yield decorate("message_end", message_end)
             while partials:
                 _cpn_obj = self.get_component_obj(partials[0])

View File

@@ -18,6 +18,7 @@ import re
 from functools import partial

 from agent.component.base import ComponentParamBase, ComponentBase
+from api.db.services.file_service import FileService


 class UserFillUpParam(ComponentParamBase):
@@ -63,6 +64,13 @@ class UserFillUp(ComponentBase):
         for k, v in kwargs.get("inputs", {}).items():
             if self.check_if_canceled("UserFillUp processing"):
                 return
+            if isinstance(v, dict) and v.get("type", "").lower().find("file") >=0:
+                if v.get("optional") and v.get("value", None) is None:
+                    v = None
+                else:
+                    v = FileService.get_files([v["value"]])
+            else:
+                v = v.get("value")
             self.set_output(k, v)

     def thoughts(self) -> str:

View File

@@ -14,7 +14,7 @@
 # limitations under the License.
 #
+import asyncio
-import pathlib
 import re

 from quart import request, make_response
@@ -29,6 +29,7 @@ from api.db import FileType
 from api.db.services import duplicate_name
 from api.db.services.file_service import FileService
 from api.utils.file_utils import filename_type
+from api.utils.web_utils import CONTENT_TYPE_MAP
 from common import settings
 from common.constants import RetCode
@@ -629,6 +630,19 @@ async def get(tenant_id, file_id):
     except Exception as e:
         return server_error_response(e)


+@manager.route("/file/download/<attachment_id>", methods=["GET"])  # noqa: F821
+@token_required
+async def download_attachment(tenant_id,attachment_id):
+    try:
+        ext = request.args.get("ext", "markdown")
+        data = await asyncio.to_thread(settings.STORAGE_IMPL.get, tenant_id, attachment_id)
+        response = await make_response(data)
+        response.headers.set("Content-Type", CONTENT_TYPE_MAP.get(ext, f"application/{ext}"))
+        return response
+    except Exception as e:
+        return server_error_response(e)
+
+
 @manager.route('/file/mv', methods=['POST'])  # noqa: F821
 @token_required

View File

@@ -57,7 +57,6 @@ JSON_RESPONSE = True
 class RAGFlowConnector:
     _MAX_DATASET_CACHE = 32
-    _MAX_DOCUMENT_CACHE = 128
     _CACHE_TTL = 300

     _dataset_metadata_cache: OrderedDict[str, tuple[dict, float | int]] = OrderedDict()  # "dataset_id" -> (metadata, expiry_ts)
@@ -116,8 +115,6 @@ class RAGFlowConnector:
     def _set_cached_document_metadata_by_dataset(self, dataset_id, doc_id_meta_list):
         self._document_metadata_cache[dataset_id] = (doc_id_meta_list, self._get_expiry_timestamp())
         self._document_metadata_cache.move_to_end(dataset_id)
-        if len(self._document_metadata_cache) > self._MAX_DOCUMENT_CACHE:
-            self._document_metadata_cache.popitem(last=False)

     def list_datasets(self, page: int = 1, page_size: int = 1000, orderby: str = "create_time", desc: bool = True, id: str | None = None, name: str | None = None):
         res = self._get("/datasets", {"page": page, "page_size": page_size, "orderby": orderby, "desc": desc, "id": id, "name": name})
@@ -240,46 +237,46 @@ class RAGFlowConnector:
             docs = None if force_refresh else self._get_cached_document_metadata_by_dataset(dataset_id)
             if docs is None:
-                docs_res = self._get(f"/datasets/{dataset_id}/documents")
-                docs_data = docs_res.json()
-                if docs_data.get("code") == 0 and docs_data.get("data", {}).get("docs"):
-                    doc_id_meta_list = []
-                    docs = {}
-                    for doc in docs_data["data"]["docs"]:
-                        doc_id = doc.get("id")
-                        if not doc_id:
-                            continue
-                        doc_meta = {
-                            "document_id": doc_id,
-                            "name": doc.get("name", ""),
-                            "location": doc.get("location", ""),
-                            "type": doc.get("type", ""),
-                            "size": doc.get("size"),
-                            "chunk_count": doc.get("chunk_count"),
-                            # "chunk_method": doc.get("chunk_method", ""),
-                            "create_date": doc.get("create_date", ""),
-                            "update_date": doc.get("update_date", ""),
-                            # "process_begin_at": doc.get("process_begin_at", ""),
-                            # "process_duration": doc.get("process_duration"),
-                            # "progress": doc.get("progress"),
-                            # "progress_msg": doc.get("progress_msg", ""),
-                            # "status": doc.get("status", ""),
-                            # "run": doc.get("run", ""),
-                            "token_count": doc.get("token_count"),
-                            # "source_type": doc.get("source_type", ""),
-                            "thumbnail": doc.get("thumbnail", ""),
-                            "dataset_id": doc.get("dataset_id", dataset_id),
-                            "meta_fields": doc.get("meta_fields", {}),
-                            # "parser_config": doc.get("parser_config", {})
-                        }
-                        doc_id_meta_list.append((doc_id, doc_meta))
-                        docs[doc_id] = doc_meta
+                page = 1
+                page_size = 30
+                doc_id_meta_list = []
+                docs = {}
+                while page:
+                    docs_res = self._get(f"/datasets/{dataset_id}/documents?page={page}")
+                    docs_data = docs_res.json()
+                    if docs_data.get("code") == 0 and docs_data.get("data", {}).get("docs"):
+                        for doc in docs_data["data"]["docs"]:
+                            doc_id = doc.get("id")
+                            if not doc_id:
+                                continue
+                            doc_meta = {
+                                "document_id": doc_id,
+                                "name": doc.get("name", ""),
+                                "location": doc.get("location", ""),
+                                "type": doc.get("type", ""),
+                                "size": doc.get("size"),
+                                "chunk_count": doc.get("chunk_count"),
+                                "create_date": doc.get("create_date", ""),
+                                "update_date": doc.get("update_date", ""),
+                                "token_count": doc.get("token_count"),
+                                "thumbnail": doc.get("thumbnail", ""),
+                                "dataset_id": doc.get("dataset_id", dataset_id),
+                                "meta_fields": doc.get("meta_fields", {}),
+                            }
+                            doc_id_meta_list.append((doc_id, doc_meta))
+                            docs[doc_id] = doc_meta
+                    page += 1
+                    if docs_data.get("data", {}).get("total", 0) - page * page_size <= 0:
+                        page = None
                 self._set_cached_document_metadata_by_dataset(dataset_id, doc_id_meta_list)

                 if docs:
                     document_cache.update(docs)
-        except Exception:
+        except Exception as e:
             # Gracefully handle metadata cache failures
+            logging.error(f"Problem building the document metadata cache: {str(e)}")
             pass
         return document_cache, dataset_cache

View File

@@ -219,23 +219,27 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     )

     def _normalize_section(section):
-        # pad section to length 3: (txt, sec_id, poss)
-        if len(section) == 1:
+        # Pad/normalize to (txt, layout, positions)
+        if not isinstance(section, (list, tuple)):
+            section = (section, "", [])
+        elif len(section) == 1:
             section = (section[0], "", [])
+        elif len(section) == 2:
+            section = (section[0], "", section[1])
+        elif len(section) != 3:
+            raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
+        else:
+            section = (section[0], section[1], section[2])
         txt, layoutno, poss = section
         if isinstance(poss, str):
             poss = pdf_parser.extract_positions(poss)
-        first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
-        pn = first[0]
-        if isinstance(pn, list):
-            pn = pn[0]  # [pn] -> pn
-        poss[0] = (pn, *first[1:])
+        if poss:
+            first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
+            pn = first[0]
+            if isinstance(pn, list) and pn:
+                pn = pn[0]  # [pn] -> pn
+            poss[0] = (pn, *first[1:])
         if not poss:
             poss = []
         return (txt, layoutno, poss)
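
For illustration, the expected behavior of the normalization on a few section shapes (assuming `_normalize_section` behaves as in the diff above; since it is nested inside `chunk`, these asserts are illustrative rather than directly importable):

```python
# Already-normalized 3-tuples pass through unchanged.
assert _normalize_section(("text", "layout-0", [])) == ("text", "layout-0", [])
# 2-tuples from MinerU/Docling gain an empty layout slot.
assert _normalize_section(("text", [(1, 10, 20, 30, 40)])) == ("text", "", [(1, 10, 20, 30, 40)])
# 1-tuples and bare strings default to empty layout and positions.
assert _normalize_section(("text",)) == ("text", "", [])
assert _normalize_section("text") == ("text", "", [])
```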