Compare commits

...

4 Commits

8de6b97806 Feature (canvas): Add Api for download "message" component output's file (#11772)
### What problem does this PR solve?

- Add an API for downloading the "message" component output's file.
- Change the attachment output type check from tuple to dictionary, because 'attachment' is not an instance of tuple.
- Update the message type to `message_end` to avoid the problem where the content does not carry an error message when the message type is `ans["data"]["content"]`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
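
A minimal sketch of calling the new download endpoint from a client (the route and the `ext` query parameter come from the diff below; the base URL, API-version prefix, and auth header are assumptions):

```python
import requests

BASE_URL = "http://localhost:9380/api/v1"  # assumption: default RAGFlow server and prefix
API_KEY = "ragflow-..."                    # assumption: a valid API token
ATTACHMENT_ID = "<attachment id from a message_end event>"

resp = requests.get(
    f"{BASE_URL}/file/download/{ATTACHMENT_ID}",
    params={"ext": "markdown"},  # selects the Content-Type via CONTENT_TYPE_MAP
    headers={"Authorization": f"Bearer {API_KEY}"},
    timeout=30,
)
resp.raise_for_status()
with open("attachment.md", "wb") as f:
    f.write(resp.content)
```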
2025-12-05 19:42:35 +08:00
e4e0a88053 Feat: Fillup component return value not object (#11780)
### What problem does this PR solve?

Make the Fillup component output the plain value rather than the wrapping object.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
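
To illustrate the change, a small sketch of the new unwrapping logic (mirroring the diff below; the input shape is inferred, and the `FileService` lookup is replaced by a stand-in):

```python
def unwrap_input(v):
    # File-typed inputs resolve to file records (stand-in below); everything
    # else now yields the raw value instead of the wrapping object.
    if isinstance(v, dict) and v.get("type", "").lower().find("file") >= 0:
        if v.get("optional") and v.get("value") is None:
            return None
        return [f"<file record for {v['value']}>"]  # stand-in for FileService.get_files
    return v.get("value")

# Before this PR the output was the whole object {"type": ..., "value": 42}; now it is 42.
assert unwrap_input({"type": "integer", "value": 42}) == 42
```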
2025-12-05 19:27:36 +08:00
7719fd6350 Fix MinerU API sanitized-output lookup and manual chunk tuple handling (#11702)
### What problem does this PR solve?

This PR addresses **two independent issues** encountered when using the
MinerU engine in Ragflow:

1. **MinerU API output path mismatch for non-ASCII filenames**
MinerU sanitizes the root directory name inside the returned ZIP when
the original filename contains non-ASCII characters (e.g., Chinese).
Ragflow's client-side unzip logic assumed the original filename stem and
therefore failed to locate `_content_list.json`.
   This PR adds:

   * root-directory detection
   * fallback lookup using sanitized names
   * a broadened `_read_output` search with a glob fallback

   ensuring output files are consistently located regardless of filename encoding (a minimal sketch of this lookup strategy follows the list).

2. **Chunker crash due to tuple-structure mismatch in manual mode**
Some parsers (e.g., MinerU / Docling) return **2-tuple sections**, but
Ragflow’s chunker expects **3-tuple sections**, leading to:
   `ValueError: not enough values to unpack (expected 3, got 2)`

   This PR normalizes all sections to a uniform structure `(text, layout, positions)`:

   * parse position tags when present
   * default to empty positions when missing

   preserving backward compatibility and preventing crashes.
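
For the first fix, a minimal sketch of the sanitized-output lookup described in item 1 (illustrative only: the directory layout and the helper name `find_content_list` are assumptions, not Ragflow's exact code):

```python
from pathlib import Path

def find_content_list(extract_dir: str, original_stem: str) -> Path | None:
    """Locate MinerU's <stem>_content_list.json inside the unzipped output,
    tolerating a sanitized root directory for non-ASCII filenames."""
    root = Path(extract_dir)
    # Fast path: the original filename stem survived sanitization.
    exact = root / original_stem / f"{original_stem}_content_list.json"
    if exact.exists():
        return exact
    # Root-directory detection: MinerU produced some (possibly renamed) root dir.
    subdirs = [p for p in root.iterdir() if p.is_dir()]
    if len(subdirs) == 1:
        candidates = sorted(subdirs[0].glob("*_content_list.json"))
        if candidates:
            return candidates[0]
    # Broadened search: glob anywhere under the extraction directory.
    candidates = sorted(root.glob("**/*_content_list.json"))
    return candidates[0] if candidates else None
```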

### Type of change

* [x] Bug Fix (non-breaking change which fixes an issue)


[#11136](https://github.com/infiniflow/ragflow/issues/11136)
[#11700](https://github.com/infiniflow/ragflow/issues/11700)
[#11620](https://github.com/infiniflow/ragflow/issues/11620)
[#11701](https://github.com/infiniflow/ragflow/pull/11701)

We need your help, [yongtenglei](https://github.com/yongtenglei).

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2025-12-05 19:25:45 +08:00
15ef6dd72f fix(mcp-server): Ensure all document meta-data is cached (#11767)
### What problem does this PR solve?

The document metadata cache is built using the list-documents endpoint
with default pagination parameters of page=1, page_size=30. This means
that when using the MCP server to search a dataset, only chunks which
come from the first 30 documents in the dataset will have metadata
returned.

Issue described in more detail here:
https://github.com/infiniflow/ragflow/issues/11533
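
The fix drains every page instead of relying on the default page size; a compact sketch of the pattern (with a hypothetical `get_page` helper returning `(docs, total)` for a 1-based page number):

```python
def fetch_all_documents(get_page, page_size=30):
    # Keep requesting pages until page * page_size covers the reported total.
    docs, page = [], 1
    while True:
        batch, total = get_page(page, page_size)
        docs.extend(batch)
        if page * page_size >= total:
            return docs
        page += 1
```

With total=100 and page_size=30 this fetches pages 1 through 4 rather than stopping after the first 30 documents.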

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: Giles Lloyd <giles.af.lloyd@gmail.com>
2025-12-05 19:13:17 +08:00
5 changed files with 77 additions and 52 deletions

View File

@@ -534,10 +534,12 @@ class Canvas(Graph):
                 yield decorate("message", {"content": cpn_obj.output("content")})
                 cite = re.search(r"\[ID:[ 0-9]+\]", cpn_obj.output("content"))
-            if isinstance(cpn_obj.output("attachment"), tuple):
-                yield decorate("message", {"attachment": cpn_obj.output("attachment")})
-            yield decorate("message_end", {"reference": self.get_reference() if cite else None})
+            message_end = {}
+            if isinstance(cpn_obj.output("attachment"), dict):
+                message_end["attachment"] = cpn_obj.output("attachment")
+            if cite:
+                message_end["reference"] = self.get_reference()
+            yield decorate("message_end", message_end)
             while partials:
                 _cpn_obj = self.get_component_obj(partials[0])

View File

@@ -18,6 +18,7 @@ import re
 from functools import partial

 from agent.component.base import ComponentParamBase, ComponentBase
+from api.db.services.file_service import FileService


 class UserFillUpParam(ComponentParamBase):
@@ -63,6 +64,13 @@ class UserFillUp(ComponentBase):
         for k, v in kwargs.get("inputs", {}).items():
             if self.check_if_canceled("UserFillUp processing"):
                 return
+            if isinstance(v, dict) and v.get("type", "").lower().find("file") >=0:
+                if v.get("optional") and v.get("value", None) is None:
+                    v = None
+                else:
+                    v = FileService.get_files([v["value"]])
+            else:
+                v = v.get("value")
             self.set_output(k, v)

     def thoughts(self) -> str:

View File

@@ -14,7 +14,7 @@
 # limitations under the License.
 #
+import asyncio
-import pathlib
 import re

 from quart import request, make_response
@@ -29,6 +29,7 @@ from api.db import FileType
 from api.db.services import duplicate_name
 from api.db.services.file_service import FileService
 from api.utils.file_utils import filename_type
+from api.utils.web_utils import CONTENT_TYPE_MAP
 from common import settings
 from common.constants import RetCode
@@ -629,6 +630,19 @@ async def get(tenant_id, file_id):
     except Exception as e:
         return server_error_response(e)


+@manager.route("/file/download/<attachment_id>", methods=["GET"])  # noqa: F821
+@token_required
+async def download_attachment(tenant_id,attachment_id):
+    try:
+        ext = request.args.get("ext", "markdown")
+        data = await asyncio.to_thread(settings.STORAGE_IMPL.get, tenant_id, attachment_id)
+        response = await make_response(data)
+        response.headers.set("Content-Type", CONTENT_TYPE_MAP.get(ext, f"application/{ext}"))
+        return response
+    except Exception as e:
+        return server_error_response(e)
+
+
 @manager.route('/file/mv', methods=['POST'])  # noqa: F821
 @token_required

View File

@@ -57,7 +57,6 @@ JSON_RESPONSE = True
 class RAGFlowConnector:
     _MAX_DATASET_CACHE = 32
-    _MAX_DOCUMENT_CACHE = 128
     _CACHE_TTL = 300

     _dataset_metadata_cache: OrderedDict[str, tuple[dict, float | int]] = OrderedDict()  # "dataset_id" -> (metadata, expiry_ts)
@@ -116,8 +115,6 @@ class RAGFlowConnector:
     def _set_cached_document_metadata_by_dataset(self, dataset_id, doc_id_meta_list):
         self._document_metadata_cache[dataset_id] = (doc_id_meta_list, self._get_expiry_timestamp())
         self._document_metadata_cache.move_to_end(dataset_id)
-        if len(self._document_metadata_cache) > self._MAX_DOCUMENT_CACHE:
-            self._document_metadata_cache.popitem(last=False)

     def list_datasets(self, page: int = 1, page_size: int = 1000, orderby: str = "create_time", desc: bool = True, id: str | None = None, name: str | None = None):
         res = self._get("/datasets", {"page": page, "page_size": page_size, "orderby": orderby, "desc": desc, "id": id, "name": name})
@@ -240,46 +237,46 @@ class RAGFlowConnector:
             docs = None if force_refresh else self._get_cached_document_metadata_by_dataset(dataset_id)
             if docs is None:
-                docs_res = self._get(f"/datasets/{dataset_id}/documents")
-                docs_data = docs_res.json()
-                if docs_data.get("code") == 0 and docs_data.get("data", {}).get("docs"):
-                    doc_id_meta_list = []
-                    docs = {}
-                    for doc in docs_data["data"]["docs"]:
-                        doc_id = doc.get("id")
-                        if not doc_id:
-                            continue
-                        doc_meta = {
-                            "document_id": doc_id,
-                            "name": doc.get("name", ""),
-                            "location": doc.get("location", ""),
-                            "type": doc.get("type", ""),
-                            "size": doc.get("size"),
-                            "chunk_count": doc.get("chunk_count"),
-                            # "chunk_method": doc.get("chunk_method", ""),
-                            "create_date": doc.get("create_date", ""),
-                            "update_date": doc.get("update_date", ""),
-                            # "process_begin_at": doc.get("process_begin_at", ""),
-                            # "process_duration": doc.get("process_duration"),
-                            # "progress": doc.get("progress"),
-                            # "progress_msg": doc.get("progress_msg", ""),
-                            # "status": doc.get("status", ""),
-                            # "run": doc.get("run", ""),
-                            "token_count": doc.get("token_count"),
-                            # "source_type": doc.get("source_type", ""),
-                            "thumbnail": doc.get("thumbnail", ""),
-                            "dataset_id": doc.get("dataset_id", dataset_id),
-                            "meta_fields": doc.get("meta_fields", {}),
-                            # "parser_config": doc.get("parser_config", {})
-                        }
-                        doc_id_meta_list.append((doc_id, doc_meta))
-                        docs[doc_id] = doc_meta
+                page = 1
+                page_size = 30
+                doc_id_meta_list = []
+                docs = {}
+                while page:
+                    docs_res = self._get(f"/datasets/{dataset_id}/documents?page={page}")
+                    docs_data = docs_res.json()
+                    if docs_data.get("code") == 0 and docs_data.get("data", {}).get("docs"):
+                        for doc in docs_data["data"]["docs"]:
+                            doc_id = doc.get("id")
+                            if not doc_id:
+                                continue
+                            doc_meta = {
+                                "document_id": doc_id,
+                                "name": doc.get("name", ""),
+                                "location": doc.get("location", ""),
+                                "type": doc.get("type", ""),
+                                "size": doc.get("size"),
+                                "chunk_count": doc.get("chunk_count"),
+                                "create_date": doc.get("create_date", ""),
+                                "update_date": doc.get("update_date", ""),
+                                "token_count": doc.get("token_count"),
+                                "thumbnail": doc.get("thumbnail", ""),
+                                "dataset_id": doc.get("dataset_id", dataset_id),
+                                "meta_fields": doc.get("meta_fields", {}),
+                            }
+                            doc_id_meta_list.append((doc_id, doc_meta))
+                            docs[doc_id] = doc_meta
+                    page += 1
+                    if docs_data.get("data", {}).get("total", 0) - page * page_size <= 0:
+                        page = None
                 self._set_cached_document_metadata_by_dataset(dataset_id, doc_id_meta_list)

                 if docs:
                     document_cache.update(docs)
-        except Exception:
+        except Exception as e:
             # Gracefully handle metadata cache failures
+            logging.error(f"Problem building the document metadata cache: {str(e)}")
             pass
         return document_cache, dataset_cache

View File

@@ -219,23 +219,27 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     )

     def _normalize_section(section):
-        # pad section to length 3: (txt, sec_id, poss)
-        if len(section) == 1:
+        # Pad/normalize to (txt, layout, positions)
+        if not isinstance(section, (list, tuple)):
+            section = (section, "", [])
+        elif len(section) == 1:
             section = (section[0], "", [])
+        elif len(section) == 2:
+            section = (section[0], "", section[1])
+        elif len(section) != 3:
+            raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
+        else:
+            section = (section[0], section[1], section[2])
         txt, layoutno, poss = section
         if isinstance(poss, str):
             poss = pdf_parser.extract_positions(poss)
-        first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
-        pn = first[0]
-        if isinstance(pn, list):
-            pn = pn[0]  # [pn] -> pn
-        poss[0] = (pn, *first[1:])
+        if poss:
+            first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
+            pn = first[0]
+            if isinstance(pn, list) and pn:
+                pn = pn[0]  # [pn] -> pn
+            poss[0] = (pn, *first[1:])
         if not poss:
             poss = []
         return (txt, layoutno, poss)
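
For illustration, the expected behavior of the normalization on a few section shapes (assuming `_normalize_section` behaves as in the diff above; since it is nested inside `chunk`, these asserts are illustrative rather than directly importable):

```python
# Already-normalized 3-tuples pass through unchanged.
assert _normalize_section(("text", "layout-0", [])) == ("text", "layout-0", [])
# 2-tuples from MinerU/Docling gain an empty layout slot.
assert _normalize_section(("text", [(1, 10, 20, 30, 40)])) == ("text", "", [(1, 10, 20, 30, 40)])
# 1-tuples and bare strings default to empty layout and positions.
assert _normalize_section(("text",)) == ("text", "", [])
assert _normalize_section("text") == ("text", "", [])
```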