From bd5dda6b10750aac284d257802d43c586a281c91 Mon Sep 17 00:00:00 2001
From: YngvarHuang <625452882@qq.com>
Date: Thu, 13 Nov 2025 09:59:39 +0800
Subject: [PATCH] Feature/doc upload api add parent path 20251112 (#11231)

### What problem does this PR solve?

Add an optional `parent_path` form field to the document upload API so
that uploaded files can be stored under a nested path (#11230).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Co-authored-by: virgilwong <hyhvirgil@gmail.com>
---
 api/apps/sdk/doc.py             |  6 +++++-
 api/db/services/file_service.py | 10 ++++++----
 api/utils/file_utils.py         | 20 ++++++++++++++++++++
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py
index 4caf2cc8d..b54597f89 100644
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -93,6 +93,10 @@ def upload(dataset_id, tenant_id):
         type: file
         required: true
         description: Document files to upload.
+      - in: formData
+        name: parent_path
+        type: string
+        description: Optional nested path under the parent folder. Uses '/' separators.
     responses:
       200:
         description: Successfully uploaded documents.
@@ -151,7 +155,7 @@ def upload(dataset_id, tenant_id):
     e, kb = KnowledgebaseService.get_by_id(dataset_id)
     if not e:
         raise LookupError(f"Can't find the dataset with ID {dataset_id}!")
-    err, files = FileService.upload_document(kb, file_objs, tenant_id)
+    err, files = FileService.upload_document(kb, file_objs, tenant_id, parent_path=request.form.get("parent_path"))
     if err:
         return get_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
     # rename key's name
diff --git a/api/db/services/file_service.py b/api/db/services/file_service.py
index 5a3632e97..2cf4931d0 100644
--- a/api/db/services/file_service.py
+++ b/api/db/services/file_service.py
@@ -31,7 +31,7 @@ from common.misc_utils import get_uuid
 from common.constants import TaskStatus, FileSource, ParserType
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.task_service import TaskService
-from api.utils.file_utils import filename_type, read_potential_broken_pdf, thumbnail_img
+from api.utils.file_utils import filename_type, read_potential_broken_pdf, thumbnail_img, sanitize_path
 from rag.llm.cv_model import GptV4
 from common import settings
 
@@ -329,7 +329,7 @@ class FileService(CommonService):
         current_id = start_id
         while current_id:
             e, file = cls.get_by_id(current_id)
-            if file.parent_id != file.id and e:
+            if e and file.parent_id != file.id:
                 parent_folders.append(file)
                 current_id = file.parent_id
             else:
@@ -423,13 +423,15 @@ class FileService(CommonService):
 
     @classmethod
     @DB.connection_context()
-    def upload_document(self, kb, file_objs, user_id, src="local"):
+    def upload_document(self, kb, file_objs, user_id, src="local", parent_path: str | None = None):
         root_folder = self.get_root_folder(user_id)
         pf_id = root_folder["id"]
         self.init_knowledgebase_docs(pf_id, user_id)
         kb_root_folder = self.get_kb_folder(user_id)
         kb_folder = self.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
 
+        safe_parent_path = sanitize_path(parent_path)
+
         err, files = [], []
         for file in file_objs:
             try:
@@ -439,7 +441,7 @@ class FileService(CommonService):
                 if filetype == FileType.OTHER.value:
                     raise RuntimeError("This type of file has not been supported yet!")
 
-                location = filename
+                location = filename if not safe_parent_path else f"{safe_parent_path}/{filename}"
                 while settings.STORAGE_IMPL.obj_exist(kb.id, location):
                     location += "_"
 
diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py
index 5f0fa70f4..e67ddd82d 100644
--- a/api/utils/file_utils.py
+++ b/api/utils/file_utils.py
@@ -164,3 +164,23 @@ def read_potential_broken_pdf(blob):
         return repaired
 
     return blob
+
+
+def sanitize_path(raw_path: str | None) -> str:
+    """Return a safe relative path built from user input ('' if none).
+
+    Backslashes become '/'; every character outside A-Za-z0-9, '_', '-'
+    and '/' is dropped. Segments are cleaned before empty ones are
+    discarded, so stripping never leaves '//', a leading '/' or a
+    trailing '/' behind (e.g. 'a/$/b' -> 'a/b', '../x' -> 'x').
+    """
+    if not raw_path:
+        return ""
+    # '/' is the separator: excluded from the pattern, re-added by the join.
+    unsafe_re = re.compile(r"[^A-Za-z0-9_\-]")
+    segments = raw_path.replace("\\", "/").split("/")
+
+    # Clean each segment first: '.'/'..' and symbol-only segments collapse
+    # to '' and are filtered out, instead of surviving as empty components.
+    cleaned = (unsafe_re.sub("", seg) for seg in segments)
+    return "/".join(seg for seg in cleaned if seg)