Refa: HTTP API delete dataset / test cases / docs (#7657)

### What problem does this PR solve?

This PR introduces Pydantic-based validation for the delete dataset HTTP API, improving code clarity and robustness. Key changes include:

1. Pydantic validation
2. Error handling
3. Test updates
4. Documentation updates

### Type of change

- [x] Documentation Update
- [x] Refactoring
2025-12-08 20:42:30 +08:00 · 2025-05-16 10:16:43 +08:00
parent 0e9ff8c1f7
commit ae8b628f0a
8 changed files with 341 additions and 173 deletions
--- a/api/utils/validation_utils.py
+++ b/api/utils/validation_utils.py
@ -14,11 +14,13 @@
 #  limitations under the License.
 #
 import uuid
+from collections import Counter
 from enum import auto
 from typing import Annotated, Any

 from flask import Request
 from pydantic import UUID1, BaseModel, Field, StringConstraints, ValidationError, field_serializer, field_validator
+from pydantic_core import PydanticCustomError
 from strenum import StrEnum
 from werkzeug.exceptions import BadRequest, UnsupportedMediaType

@ -238,7 +240,7 @@ class CreateDatasetReq(Base):
            str: Validated Base64 string

        Raises:
-            ValueError: For structural errors in these cases:
+            PydanticCustomError: For structural errors in these cases:
                - Missing MIME prefix header
                - Invalid MIME prefix format
                - Unsupported image MIME type
@ -259,16 +261,16 @@ class CreateDatasetReq(Base):
        if "," in v:
            prefix, _ = v.split(",", 1)
            if not prefix.startswith("data:"):
-                raise ValueError("Invalid MIME prefix format. Must start with 'data:'")
+                raise PydanticCustomError("format_invalid", "Invalid MIME prefix format. Must start with 'data:'")

            mime_type = prefix[5:].split(";")[0]
            supported_mime_types = ["image/jpeg", "image/png"]
            if mime_type not in supported_mime_types:
-                raise ValueError(f"Unsupported MIME type. Allowed: {supported_mime_types}")
+                raise PydanticCustomError("format_invalid", "Unsupported MIME type. Allowed: {supported_mime_types}", {"supported_mime_types": supported_mime_types})

            return v
        else:
-            raise ValueError("Missing MIME prefix. Expected format: data:<mime>;base64,<data>")
+            raise PydanticCustomError("format_invalid", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>")

    @field_validator("embedding_model", mode="after")
    @classmethod
@ -288,7 +290,7 @@ class CreateDatasetReq(Base):
            str: Validated <model_name>@<provider> format

        Raises:
-            ValueError: For these violations:
+            PydanticCustomError: For these violations:
                - Missing @ separator
                - Empty model_name/provider
                - Invalid component structure
@ -300,15 +302,15 @@ class CreateDatasetReq(Base):
            Invalid: "text-embedding-3-large@" (empty provider)
        """
        if "@" not in v:
-            raise ValueError("Embedding model identifier must follow <model_name>@<provider> format")
+            raise PydanticCustomError("format_invalid", "Embedding model identifier must follow <model_name>@<provider> format")

        components = v.split("@", 1)
        if len(components) != 2 or not all(components):
-            raise ValueError("Both model_name and provider must be non-empty strings")
+            raise PydanticCustomError("format_invalid", "Both model_name and provider must be non-empty strings")

        model_name, provider = components
        if not model_name.strip() or not provider.strip():
-            raise ValueError("Model name and provider cannot be whitespace-only strings")
+            raise PydanticCustomError("format_invalid", "Model name and provider cannot be whitespace-only strings")
        return v

    @field_validator("permission", mode="before")
@ -374,13 +376,13 @@ class CreateDatasetReq(Base):
            ParserConfig | None: Validated configuration object

        Raises:
-            ValueError: When serialized JSON exceeds 65,535 characters
+            PydanticCustomError: When serialized JSON exceeds 65,535 characters
        """
        if v is None:
            return None

        if (json_str := v.model_dump_json()) and len(json_str) > 65535:
-            raise ValueError(f"Parser config exceeds size limit (max 65,535 characters). Current size: {len(json_str):,}")
+            raise PydanticCustomError("string_too_long", "Parser config exceeds size limit (max 65,535 characters). Current size: {actual}", {"actual": len(json_str)})
        return v


@ -390,4 +392,88 @@ class UpdateDatasetReq(CreateDatasetReq):

    @field_serializer("dataset_id")
    def serialize_uuid_to_hex(self, v: uuid.UUID) -> str:
+        """
+        Serialize the ``dataset_id`` UUID to its hexadecimal string form.
+
+        Registered via ``@field_serializer("dataset_id")``; the returned value is
+        simply ``v.hex``, i.e. the UUID written without hyphens.
+
+        Args:
+            v (uuid.UUID): The UUID object stored in ``dataset_id``.
+                NOTE(review): the field is presumably declared as pydantic ``UUID1``
+                (its declaration is outside this hunk) - confirm.
+
+        Returns:
+            str: 32-character lowercase hexadecimal string.
+                Example: "2cdf0456e9a711ee8000000000000000"
+
+        Raises:
+            AttributeError: If ``v`` has no ``hex`` attribute (not a UUID object).
+
+        Notes:
+            - The parameter is annotated as ``uuid.UUID``, so nothing in this method
+              is specific to version 1; any UUID version serializes the same way.
+        """
        return v.hex
+
+
+class DeleteReq(Base):
+    ids: list[UUID1] | None = Field(...)
+
+    @field_validator("ids", mode="after")
+    def check_duplicate_ids(cls, v: list[UUID1] | None) -> list[str] | None:
+        """
+        Validates and converts a list of UUID1 objects to hexadecimal strings while checking for duplicates.
+
+        This validator implements a three-stage processing pipeline:
+        1. Null Handling - returns None for empty/null input
+        2. UUID Conversion - transforms UUID objects to hex strings
+        3. Duplicate Validation - ensures all IDs are unique
+
+        Behavior Specifications:
+        - Input: None → Returns None (indicates no operation)
+        - Input: [] → Returns [] (empty list for explicit no-op)
+        - Input: [UUID1,...] → Returns validated hex strings
+        - Duplicates: Raises formatted PydanticCustomError
+
+        Args:
+            v (list[UUID1] | None):
+                - None: Indicates no datasets should be processed
+                - Empty list: Explicit empty operation
+                - Populated list: Dataset UUIDs to validate/convert
+
+        Returns:
+            list[str] | None:
+                - None when input is None
+                - List of 32-character hex strings (lowercase, no hyphens)
+                Example: ["550e8400e29b41d4a716446655440000"]
+
+        Raises:
+            PydanticCustomError: When duplicates detected, containing:
+                - Error type: "duplicate_uuids"
+                - Template message: "Duplicate ids: '{duplicate_ids}'"
+                - Context: {"duplicate_ids": "id1, id2, ..."}
+
+        Example:
+            >>> validate([UUID("..."), UUID("...")])
+            ["2cdf0456e9a711ee8000000000000000", ...]
+
+            >>> validate([UUID("..."), UUID("...")])  # Duplicates
+            PydanticCustomError: Duplicate ids: '2cdf0456e9a711ee8000000000000000'
+        """
+        if not v:
+            return v
+
+        uuid_hex_list = [ids.hex for ids in v]
+        duplicates = [item for item, count in Counter(uuid_hex_list).items() if count > 1]
+
+        if duplicates:
+            duplicates_str = ", ".join(duplicates)
+            raise PydanticCustomError("duplicate_uuids", "Duplicate ids: '{duplicate_ids}'", {"duplicate_ids": duplicates_str})
+
+        return uuid_hex_list
+
+
+# Delete request specialised by name for datasets; it adds nothing to DeleteReq,
+# so the `ids` field and its duplicate check are inherited unchanged.
+class DeleteDatasetReq(DeleteReq): ...