From fa9b7b259c46c01bc22fea7a302bc41d14a9b13a Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Fri, 28 Nov 2025 19:55:24 +0800 Subject: [PATCH] Feat: create datasets from http api supports ingestion pipeline (#11597) ### What problem does this PR solve? Feat: create datasets from http api supports ingestion pipeline ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- api/utils/validation_utils.py | 96 +++++++++++++++++++++++++-- docs/references/http_api_reference.md | 40 ++++++++++- 2 files changed, 131 insertions(+), 5 deletions(-) diff --git a/api/utils/validation_utils.py b/api/utils/validation_utils.py index 630b64feb..6c426f6f8 100644 --- a/api/utils/validation_utils.py +++ b/api/utils/validation_utils.py @@ -14,6 +14,7 @@ # limitations under the License. # from collections import Counter +import string from typing import Annotated, Any, Literal from uuid import UUID @@ -25,6 +26,7 @@ from pydantic import ( StringConstraints, ValidationError, field_validator, + model_validator, ) from pydantic_core import PydanticCustomError from werkzeug.exceptions import BadRequest, UnsupportedMediaType @@ -361,10 +363,9 @@ class CreateDatasetReq(Base): description: Annotated[str | None, Field(default=None, max_length=65535)] embedding_model: Annotated[str | None, Field(default=None, max_length=255, serialization_alias="embd_id")] permission: Annotated[Literal["me", "team"], Field(default="me", min_length=1, max_length=16)] - chunk_method: Annotated[ - Literal["naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag"], - Field(default="naive", min_length=1, max_length=32, serialization_alias="parser_id"), - ] + chunk_method: Annotated[str | None, Field(default=None, serialization_alias="parser_id")] + parse_type: Annotated[int | None, Field(default=None, ge=0, le=64)] + pipeline_id: Annotated[str | None, Field(default=None, min_length=32, max_length=32, serialization_alias="pipeline_id")] 
parser_config: Annotated[ParserConfig | None, Field(default=None)] @field_validator("avatar", mode="after") @@ -525,6 +526,93 @@ class CreateDatasetReq(Base): raise PydanticCustomError("string_too_long", "Parser config exceeds size limit (max 65,535 characters). Current size: {actual}", {"actual": len(json_str)}) return v + @field_validator("pipeline_id", mode="after") + @classmethod + def validate_pipeline_id(cls, v: str | None) -> str | None: + """Validate pipeline_id as 32-char lowercase hex string if provided. + + Rules: + - None or empty string: treat as None (not set) + - Must be exactly length 32 + - Must contain only hex digits (0-9a-fA-F); normalized to lowercase + """ + if v is None: + return None + if v == "": + return None + if len(v) != 32: + raise PydanticCustomError("format_invalid", "pipeline_id must be 32 hex characters") + if any(ch not in string.hexdigits for ch in v): + raise PydanticCustomError("format_invalid", "pipeline_id must be hexadecimal") + return v.lower() + + @model_validator(mode="after") + def validate_parser_dependency(self) -> "CreateDatasetReq": + """ + Mixed conditional validation: + - If parser_id is omitted (field not set): + * If both parse_type and pipeline_id are omitted → default chunk_method = "naive" + * If both parse_type and pipeline_id are provided → allow ingestion pipeline mode + - If parser_id is provided (valid enum) → parse_type and pipeline_id must be None (disallow mixed usage) + + Raises: + PydanticCustomError with code 'dependency_error' on violation. 
+ """ + # Omitted chunk_method (not in fields) logic + if self.chunk_method is None and "chunk_method" not in self.model_fields_set: + # All three absent → default naive + if self.parse_type is None and self.pipeline_id is None: + object.__setattr__(self, "chunk_method", "naive") + return self + # parser_id omitted: require BOTH parse_type & pipeline_id present (no partial allowed) + if self.parse_type is None or self.pipeline_id is None: + missing = [] + if self.parse_type is None: + missing.append("parse_type") + if self.pipeline_id is None: + missing.append("pipeline_id") + raise PydanticCustomError( + "dependency_error", + "parser_id omitted → required fields missing: {fields}", + {"fields": ", ".join(missing)}, + ) + # Both provided → allow pipeline mode + return self + + # parser_id provided (valid): MUST NOT have parse_type or pipeline_id + if isinstance(self.chunk_method, str): + if self.parse_type is not None or self.pipeline_id is not None: + invalid = [] + if self.parse_type is not None: + invalid.append("parse_type") + if self.pipeline_id is not None: + invalid.append("pipeline_id") + raise PydanticCustomError( + "dependency_error", + "parser_id provided → disallowed fields present: {fields}", + {"fields": ", ".join(invalid)}, + ) + return self + + @field_validator("chunk_method", mode="wrap") + @classmethod + def validate_chunk_method(cls, v: Any, handler) -> Any: + """Wrap validation to unify error messages, including type errors (e.g. 
list).""" + allowed = {"naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag"} + error_msg = "Input should be 'naive', 'book', 'email', 'laws', 'manual', 'one', 'paper', 'picture', 'presentation', 'qa', 'table' or 'tag'" + # Omitted field: handler won't be invoked (wrap still gets value); None treated as explicit invalid + if v is None: + raise PydanticCustomError("literal_error", error_msg) + try: + # Run inner validation (type checking) + result = handler(v) + except Exception: + raise PydanticCustomError("literal_error", error_msg) + # After handler, enforce enumeration + if not isinstance(result, str) or result == "" or result not in allowed: + raise PydanticCustomError("literal_error", error_msg) + return result + class UpdateDatasetReq(CreateDatasetReq): dataset_id: Annotated[str, Field(...)] diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index 3c73cf58c..7f006ec3d 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -419,7 +419,15 @@ Creates a dataset. - `"embedding_model"`: `string` - `"permission"`: `string` - `"chunk_method"`: `string` - - `"parser_config"`: `object` + - "parser_config": `object` + - "parse_type": `int` + - "pipeline_id": `string` + +Note: Choose exactly one ingestion mode when creating a dataset. +- Chunking method: provide `"chunk_method"` (optionally with `"parser_config"`). +- Ingestion pipeline: provide both `"parse_type"` and `"pipeline_id"` and do not provide `"chunk_method"`. + +These options are mutually exclusive. If all three of `chunk_method`, `parse_type`, and `pipeline_id` are omitted, the system defaults to `chunk_method = "naive"`. ##### Request example @@ -433,6 +441,26 @@ curl --request POST \ }' ``` +##### Request example (ingestion pipeline) + +Use this form when specifying an ingestion pipeline (do not include `chunk_method`). 
+ +```bash +curl --request POST \ + --url http://{address}/api/v1/datasets \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer <YOUR_API_KEY>' \ + --data '{ + "name": "test-sdk", + "parse_type": <NUMBER_OF_FORMATS_IN_PARSE>, + "pipeline_id": "<PIPELINE_ID_32_HEX>" + }' +``` + +Notes: +- `parse_type` is an integer. Replace `<NUMBER_OF_FORMATS_IN_PARSE>` with your pipeline's parse-type value. +- `pipeline_id` must be a 32-character hexadecimal string; uppercase digits are accepted and normalized to lowercase. + ##### Request parameters - `"name"`: (*Body parameter*), `string`, *Required* @@ -473,6 +501,7 @@ curl --request POST \ - `"qa"`: Q&A - `"table"`: Table - `"tag"`: Tag + - Mutually exclusive with `parse_type` and `pipeline_id`. If you set `chunk_method`, do not include `parse_type` or `pipeline_id`. - `"parser_config"`: (*Body parameter*), `object` The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`: @@ -509,6 +538,15 @@ curl --request POST \ - Defaults to: `{"use_raptor": false}`. - If `"chunk_method"` is `"table"`, `"picture"`, `"one"`, or `"email"`, `"parser_config"` is an empty JSON object. +- "parse_type": (*Body parameter*), `int` + The ingestion pipeline parse type identifier. Required if and only if you are using an ingestion pipeline (together with `"pipeline_id"`). Must not be provided when `"chunk_method"` is set. + +- "pipeline_id": (*Body parameter*), `string` + The ingestion pipeline ID. Required if and only if you are using an ingestion pipeline (together with `"parse_type"`). + - Must not be provided when `"chunk_method"` is set. + +Note: If none of `chunk_method`, `parse_type`, and `pipeline_id` are provided, the system will default to `chunk_method = "naive"`. + #### Response Success: