Refa: HTTP API delete dataset / test cases / docs (#7657)

### What problem does this PR solve?

This PR introduces Pydantic-based validation for the delete dataset HTTP
API, improving code clarity and robustness. Key changes include:

1. Pydantic Validation
2. Error Handling
3. Test Updates
4. Documentation Updates

### Type of change

- [x] Documentation Update
- [x] Refactoring
This commit is contained in:
liu an
2025-05-16 10:16:43 +08:00
committed by GitHub
parent 0e9ff8c1f7
commit ae8b628f0a
8 changed files with 341 additions and 173 deletions

View File

@ -14,11 +14,13 @@
# limitations under the License.
#
import uuid
from collections import Counter
from enum import auto
from typing import Annotated, Any
from flask import Request
from pydantic import UUID1, BaseModel, Field, StringConstraints, ValidationError, field_serializer, field_validator
from pydantic_core import PydanticCustomError
from strenum import StrEnum
from werkzeug.exceptions import BadRequest, UnsupportedMediaType
@ -238,7 +240,7 @@ class CreateDatasetReq(Base):
str: Validated Base64 string
Raises:
ValueError: For structural errors in these cases:
PydanticCustomError: For structural errors in these cases:
- Missing MIME prefix header
- Invalid MIME prefix format
- Unsupported image MIME type
@ -259,16 +261,16 @@ class CreateDatasetReq(Base):
if "," in v:
prefix, _ = v.split(",", 1)
if not prefix.startswith("data:"):
raise ValueError("Invalid MIME prefix format. Must start with 'data:'")
raise PydanticCustomError("format_invalid", "Invalid MIME prefix format. Must start with 'data:'")
mime_type = prefix[5:].split(";")[0]
supported_mime_types = ["image/jpeg", "image/png"]
if mime_type not in supported_mime_types:
raise ValueError(f"Unsupported MIME type. Allowed: {supported_mime_types}")
raise PydanticCustomError("format_invalid", "Unsupported MIME type. Allowed: {supported_mime_types}", {"supported_mime_types": supported_mime_types})
return v
else:
raise ValueError("Missing MIME prefix. Expected format: data:<mime>;base64,<data>")
raise PydanticCustomError("format_invalid", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>")
@field_validator("embedding_model", mode="after")
@classmethod
@ -288,7 +290,7 @@ class CreateDatasetReq(Base):
str: Validated <model_name>@<provider> format
Raises:
ValueError: For these violations:
PydanticCustomError: For these violations:
- Missing @ separator
- Empty model_name/provider
- Invalid component structure
@ -300,15 +302,15 @@ class CreateDatasetReq(Base):
Invalid: "text-embedding-3-large@" (empty provider)
"""
if "@" not in v:
raise ValueError("Embedding model identifier must follow <model_name>@<provider> format")
raise PydanticCustomError("format_invalid", "Embedding model identifier must follow <model_name>@<provider> format")
components = v.split("@", 1)
if len(components) != 2 or not all(components):
raise ValueError("Both model_name and provider must be non-empty strings")
raise PydanticCustomError("format_invalid", "Both model_name and provider must be non-empty strings")
model_name, provider = components
if not model_name.strip() or not provider.strip():
raise ValueError("Model name and provider cannot be whitespace-only strings")
raise PydanticCustomError("format_invalid", "Model name and provider cannot be whitespace-only strings")
return v
@field_validator("permission", mode="before")
@ -374,13 +376,13 @@ class CreateDatasetReq(Base):
ParserConfig | None: Validated configuration object
Raises:
ValueError: When serialized JSON exceeds 65,535 characters
PydanticCustomError: When serialized JSON exceeds 65,535 characters
"""
if v is None:
return None
if (json_str := v.model_dump_json()) and len(json_str) > 65535:
raise ValueError(f"Parser config exceeds size limit (max 65,535 characters). Current size: {len(json_str):,}")
raise PydanticCustomError("string_too_long", "Parser config exceeds size limit (max 65,535 characters). Current size: {actual}", {"actual": len(json_str)})
return v
@ -390,4 +392,88 @@ class UpdateDatasetReq(CreateDatasetReq):
@field_serializer("dataset_id")
def serialize_uuid_to_hex(self, v: uuid.UUID) -> str:
"""
Serializes a UUID version 1 object to its hexadecimal string representation.
This field serializer specifically handles UUID version 1 objects, converting them
to their canonical 32-character hexadecimal format without hyphens. The conversion
is designed for consistent serialization in API responses and database storage.
Args:
v (uuid.UUID1): The UUID version 1 object to serialize. Must be a valid
UUID1 instance generated by Python's uuid module.
Returns:
str: 32-character lowercase hexadecimal string representation
Example: "550e8400e29b41d4a716446655440000"
Raises:
AttributeError: If input is not a proper UUID object (missing hex attribute)
TypeError: If input is not a UUID1 instance (when type checking is enabled)
Notes:
- Version 1 UUIDs contain timestamp and MAC address information
- The .hex property automatically converts to lowercase hexadecimal
- For cross-version compatibility, consider typing as uuid.UUID instead
"""
return v.hex
class DeleteReq(Base):
ids: list[UUID1] | None = Field(...)
@field_validator("ids", mode="after")
def check_duplicate_ids(cls, v: list[UUID1] | None) -> list[str] | None:
"""
Validates and converts a list of UUID1 objects to hexadecimal strings while checking for duplicates.
This validator implements a three-stage processing pipeline:
1. Null Handling - returns None for empty/null input
2. UUID Conversion - transforms UUID objects to hex strings
3. Duplicate Validation - ensures all IDs are unique
Behavior Specifications:
- Input: None → Returns None (indicates no operation)
- Input: [] → Returns [] (empty list for explicit no-op)
- Input: [UUID1,...] → Returns validated hex strings
- Duplicates: Raises formatted PydanticCustomError
Args:
v (list[UUID1] | None):
- None: Indicates no datasets should be processed
- Empty list: Explicit empty operation
- Populated list: Dataset UUIDs to validate/convert
Returns:
list[str] | None:
- None when input is None
- List of 32-character hex strings (lowercase, no hyphens)
Example: ["550e8400e29b41d4a716446655440000"]
Raises:
PydanticCustomError: When duplicates detected, containing:
- Error type: "duplicate_uuids"
- Template message: "Duplicate ids: '{duplicate_ids}'"
- Context: {"duplicate_ids": "id1, id2, ..."}
Example:
>>> validate([UUID("..."), UUID("...")])
["2cdf0456e9a711ee8000000000000000", ...]
>>> validate([UUID("..."), UUID("...")]) # Duplicates
PydanticCustomError: Duplicate ids: '2cdf0456e9a711ee8000000000000000'
"""
if not v:
return v
uuid_hex_list = [ids.hex for ids in v]
duplicates = [item for item, count in Counter(uuid_hex_list).items() if count > 1]
if duplicates:
duplicates_str = ", ".join(duplicates)
raise PydanticCustomError("duplicate_uuids", "Duplicate ids: '{duplicate_ids}'", {"duplicate_ids": duplicates_str})
return uuid_hex_list
class DeleteDatasetReq(DeleteReq): ...