From fa9b7b259c46c01bc22fea7a302bc41d14a9b13a Mon Sep 17 00:00:00 2001
From: Billy Bao <newyorkupperbay@gmail.com>
Date: Fri, 28 Nov 2025 19:55:24 +0800
Subject: [PATCH] Feat: create datasets from http api supports ingestion
 pipeline (#11597)

### What problem does this PR solve?

Feat: creating datasets through the HTTP API now supports ingestion pipelines.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 api/utils/validation_utils.py         | 96 +++++++++++++++++++++++++--
 docs/references/http_api_reference.md | 40 ++++++++++-
 2 files changed, 131 insertions(+), 5 deletions(-)

diff --git a/api/utils/validation_utils.py b/api/utils/validation_utils.py
index 630b64feb..6c426f6f8 100644
--- a/api/utils/validation_utils.py
+++ b/api/utils/validation_utils.py
@@ -14,6 +14,7 @@
 #  limitations under the License.
 #
 from collections import Counter
+import string
 from typing import Annotated, Any, Literal
 from uuid import UUID
 
@@ -25,6 +26,7 @@ from pydantic import (
     StringConstraints,
     ValidationError,
     field_validator,
+    model_validator,
 )
 from pydantic_core import PydanticCustomError
 from werkzeug.exceptions import BadRequest, UnsupportedMediaType
@@ -361,10 +363,9 @@ class CreateDatasetReq(Base):
     description: Annotated[str | None, Field(default=None, max_length=65535)]
     embedding_model: Annotated[str | None, Field(default=None, max_length=255, serialization_alias="embd_id")]
     permission: Annotated[Literal["me", "team"], Field(default="me", min_length=1, max_length=16)]
-    chunk_method: Annotated[
-        Literal["naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag"],
-        Field(default="naive", min_length=1, max_length=32, serialization_alias="parser_id"),
-    ]
+    chunk_method: Annotated[str | None, Field(default=None, serialization_alias="parser_id")]
+    parse_type: Annotated[int | None, Field(default=None, ge=0, le=64)]
+    pipeline_id: Annotated[str | None, Field(default=None, min_length=32, max_length=32, serialization_alias="pipeline_id")]
     parser_config: Annotated[ParserConfig | None, Field(default=None)]
 
     @field_validator("avatar", mode="after")
@@ -525,6 +526,93 @@ class CreateDatasetReq(Base):
             raise PydanticCustomError("string_too_long", "Parser config exceeds size limit (max 65,535 characters). Current size: {actual}", {"actual": len(json_str)})
         return v
 
+    @field_validator("pipeline_id", mode="after")
+    @classmethod
+    def validate_pipeline_id(cls, v: str | None) -> str | None:
+        """Validate pipeline_id as a 32-char hex string, if provided, and return it lowercased.
+
+        Rules:
+        - None or empty string: returned as None (not set)
+        - Must be exactly length 32, otherwise PydanticCustomError("format_invalid") is raised
+        - Must contain only hex digits (0-9a-fA-F); normalized to lowercase
+        """
+        if v is None:
+            return None
+        if v == "":  # NOTE(review): Field(min_length=32) presumably rejects "" before this after-validator runs - confirm
+            return None
+        if len(v) != 32:  # overlaps the field's min_length/max_length=32 constraint
+            raise PydanticCustomError("format_invalid", "pipeline_id must be 32 hex characters")
+        if any(ch not in string.hexdigits for ch in v):  # string.hexdigits holds both upper- and lowercase a-f
+            raise PydanticCustomError("format_invalid", "pipeline_id must be hexadecimal")
+        return v.lower()
+
+    @model_validator(mode="after")
+    def validate_parser_dependency(self) -> "CreateDatasetReq":
+        """
+        Enforce that chunk_method (serialized as parser_id) and the parse_type/pipeline_id pair are not mixed.
+        - If chunk_method is omitted (value is None and the field is not in model_fields_set):
+            * If both parse_type and pipeline_id are omitted → default chunk_method = "naive"
+            * If both parse_type and pipeline_id are provided → allow ingestion pipeline mode (only one → error)
+        - If chunk_method is provided (a string) → parse_type and pipeline_id must be None (disallow mixed usage)
+
+        Raises:
+            PydanticCustomError with code 'dependency_error' on violation.
+        """
+        # chunk_method omitted: it still holds its None default and was never set by the caller
+        if self.chunk_method is None and "chunk_method" not in self.model_fields_set:
+            # All three absent → default naive
+            if self.parse_type is None and self.pipeline_id is None:
+                object.__setattr__(self, "chunk_method", "naive")  # calls object's setter directly, skipping the model's own __setattr__
+                return self
+            # chunk_method omitted: require BOTH parse_type & pipeline_id present (no partial allowed)
+            if self.parse_type is None or self.pipeline_id is None:
+                missing = []
+                if self.parse_type is None:
+                    missing.append("parse_type")
+                if self.pipeline_id is None:
+                    missing.append("pipeline_id")
+                raise PydanticCustomError(
+                    "dependency_error",
+                    "parser_id omitted → required fields missing: {fields}",
+                    {"fields": ", ".join(missing)},
+                )
+            # Both provided → ingestion pipeline mode; chunk_method is left as None
+            return self
+
+        # chunk_method provided as a string: parse_type and pipeline_id MUST both be absent
+        if isinstance(self.chunk_method, str):
+            if self.parse_type is not None or self.pipeline_id is not None:
+                invalid = []
+                if self.parse_type is not None:
+                    invalid.append("parse_type")
+                if self.pipeline_id is not None:
+                    invalid.append("pipeline_id")
+                raise PydanticCustomError(
+                    "dependency_error",
+                    "parser_id provided → disallowed fields present: {fields}",
+                    {"fields": ", ".join(invalid)},
+                )
+        return self
+
+    @field_validator("chunk_method", mode="wrap")
+    @classmethod
+    def validate_chunk_method(cls, v: Any, handler) -> Any:
+        """Wrap validation so every failure (None, wrong type such as list, unknown value) raises the same 'literal_error' message."""
+        allowed = {"naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag"}
+        error_msg = "Input should be 'naive', 'book', 'email', 'laws', 'manual', 'one', 'paper', 'picture', 'presentation', 'qa', 'table' or 'tag'"
+        # A None seen here is rejected as an explicit value; presumably an omitted field never reaches this validator (defaults not validated) - TODO confirm
+        if v is None:
+            raise PydanticCustomError("literal_error", error_msg)
+        try:
+            # Run the inner (declared-type) validation on the raw value
+            result = handler(v)
+        except Exception:  # any inner failure is re-reported with the unified message; the original error is not chained
+            raise PydanticCustomError("literal_error", error_msg)
+        # After handler, enforce membership in the allowed set (the empty string is rejected as well)
+        if not isinstance(result, str) or result == "" or result not in allowed:
+            raise PydanticCustomError("literal_error", error_msg)
+        return result
+
 
 class UpdateDatasetReq(CreateDatasetReq):
     dataset_id: Annotated[str, Field(...)]
diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md
index 3c73cf58c..7f006ec3d 100644
--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@@ -419,7 +419,15 @@ Creates a dataset.
   - `"embedding_model"`: `string`
   - `"permission"`: `string`
   - `"chunk_method"`: `string`
-  - `"parser_config"`: `object`
+  - `"parser_config"`: `object`
+  - `"parse_type"`: `int`
+  - `"pipeline_id"`: `string`
+
+Note: Choose at most one ingestion mode when creating a dataset.
+- Chunking method: provide `"chunk_method"` (optionally with `"parser_config"`).
+- Ingestion pipeline: provide both `"parse_type"` and `"pipeline_id"` and do not provide `"chunk_method"`.
+
+These options are mutually exclusive. If all three of `chunk_method`, `parse_type`, and `pipeline_id` are omitted, the system defaults to `chunk_method = "naive"`.
 
 ##### Request example
 
@@ -433,6 +441,26 @@ curl --request POST \
       }'
 ```
 
+##### Request example (ingestion pipeline)
+
+Use this form when specifying an ingestion pipeline (do not include `chunk_method`).
+
+```bash
+curl --request POST \
+  --url http://{address}/api/v1/datasets \
+  --header 'Content-Type: application/json' \
+  --header 'Authorization: Bearer <YOUR_API_KEY>' \
+  --data '{
+   "name": "test-sdk",
+   "parse_type": <NUMBER_OF_FORMATS_IN_PARSE>,
+   "pipeline_id": "<PIPELINE_ID_32_HEX>"
+  }'
+```
+
+Notes:
+- `parse_type` is an integer. Replace `<NUMBER_OF_FORMATS_IN_PARSE>` with your pipeline's parse-type value.
+- `pipeline_id` must be a 32-character hexadecimal string; uppercase letters are accepted and normalized to lowercase.
+
 ##### Request parameters
 
 - `"name"`: (*Body parameter*), `string`, *Required*  
@@ -473,6 +501,7 @@ curl --request POST \
   - `"qa"`: Q&A
   - `"table"`: Table
   - `"tag"`: Tag
+  - Mutually exclusive with `parse_type` and `pipeline_id`. If you set `chunk_method`, do not include `parse_type` or `pipeline_id`.
 
 - `"parser_config"`: (*Body parameter*), `object`  
   The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`:  
@@ -509,6 +538,15 @@ curl --request POST \
       - Defaults to: `{"use_raptor": false}`.
   - If `"chunk_method"` is `"table"`, `"picture"`, `"one"`, or `"email"`, `"parser_config"` is an empty JSON object.
 
+- `"parse_type"`: (*Body parameter*), `int`  
+  The ingestion pipeline parse type identifier, an integer from 0 to 64. Required if and only if you are using an ingestion pipeline (together with `"pipeline_id"`). Must not be provided when `"chunk_method"` is set.
+
+- `"pipeline_id"`: (*Body parameter*), `string`  
+  The ingestion pipeline ID. Required if and only if you are using an ingestion pipeline (together with `"parse_type"`).  
+  - Must not be provided when `"chunk_method"` is set.
+
+Note: If none of `chunk_method`, `parse_type`, and `pipeline_id` are provided, the system will default to `chunk_method = "naive"`.
+
 #### Response
 
 Success: