From bd5dda6b10750aac284d257802d43c586a281c91 Mon Sep 17 00:00:00 2001 From: YngvarHuang <625452882@qq.com> Date: Thu, 13 Nov 2025 09:59:39 +0800 Subject: [PATCH] Feature/doc upload api add parent path 20251112 (#11231) ### What problem does this PR solve? Add the specified parent_path to the document upload api interface (#11230) ### Type of change - [x] New Feature (non-breaking change which adds functionality) Co-authored-by: virgilwong --- api/apps/sdk/doc.py | 6 +++++- api/db/services/file_service.py | 10 ++++++---- api/utils/file_utils.py | 20 ++++++++++++++++++++ 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 4caf2cc8d..b54597f89 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -93,6 +93,10 @@ def upload(dataset_id, tenant_id): type: file required: true description: Document files to upload. + - in: formData + name: parent_path + type: string + description: Optional nested path under the parent folder. Uses '/' separators. responses: 200: description: Successfully uploaded documents. @@ -151,7 +155,7 @@ def upload(dataset_id, tenant_id): e, kb = KnowledgebaseService.get_by_id(dataset_id) if not e: raise LookupError(f"Can't find the dataset with ID {dataset_id}!") - err, files = FileService.upload_document(kb, file_objs, tenant_id) + err, files = FileService.upload_document(kb, file_objs, tenant_id, parent_path=request.form.get("parent_path")) if err: return get_result(message="\n".join(err), code=RetCode.SERVER_ERROR) # rename key's name diff --git a/api/db/services/file_service.py b/api/db/services/file_service.py index 5a3632e97..2cf4931d0 100644 --- a/api/db/services/file_service.py +++ b/api/db/services/file_service.py @@ -31,7 +31,7 @@ from common.misc_utils import get_uuid from common.constants import TaskStatus, FileSource, ParserType from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.task_service import TaskService -from api.utils.file_utils import filename_type, read_potential_broken_pdf, thumbnail_img +from api.utils.file_utils import filename_type, read_potential_broken_pdf, thumbnail_img, sanitize_path from rag.llm.cv_model import GptV4 from common import settings @@ -329,7 +329,7 @@ class FileService(CommonService): current_id = start_id while current_id: e, file = cls.get_by_id(current_id) - if file.parent_id != file.id and e: + if e and file.parent_id != file.id: parent_folders.append(file) current_id = file.parent_id else: @@ -423,13 +423,15 @@ class FileService(CommonService): @classmethod @DB.connection_context() - def upload_document(self, kb, file_objs, user_id, src="local"): + def upload_document(self, kb, file_objs, user_id, src="local", parent_path: str | None = None): root_folder = self.get_root_folder(user_id) pf_id = root_folder["id"] self.init_knowledgebase_docs(pf_id, user_id) kb_root_folder = self.get_kb_folder(user_id) kb_folder = self.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"]) + safe_parent_path = sanitize_path(parent_path) + err, files = [], [] for file in file_objs: try: @@ -439,7 +441,7 @@ class FileService(CommonService): if filetype == FileType.OTHER.value: raise RuntimeError("This type of file has not been supported yet!") - location = filename + location = filename if not safe_parent_path else f"{safe_parent_path}/{filename}" while settings.STORAGE_IMPL.obj_exist(kb.id, location): location += "_" diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index 5f0fa70f4..e67ddd82d 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -164,3 +164,23 @@ def read_potential_broken_pdf(blob): return repaired return blob + + +def sanitize_path(raw_path: str | None) -> str: + """Normalize and sanitize a user-provided path segment. + + - Converts backslashes to forward slashes + - Strips leading/trailing slashes + - Removes '.' and '..' segments + - Restricts characters to A-Za-z0-9, underscore, dash, and '/' + """ + if not raw_path: + return "" + backslash_re = re.compile(r"[\\]+") + unsafe_re = re.compile(r"[^A-Za-z0-9_\-/]") + normalized = backslash_re.sub("/", raw_path) + normalized = normalized.strip("/") + parts = [seg for seg in normalized.split("/") if seg and seg not in (".", "..")] + sanitized = "/".join(parts) + sanitized = unsafe_re.sub("", sanitized) + return sanitized