Refa: HTTP API delete dataset / test cases / docs (#7657)
### What problem does this PR solve?

This PR introduces Pydantic-based validation for the delete dataset HTTP API, improving code clarity and robustness. Key changes include:

1. Pydantic Validation: new `DeleteReq` / `DeleteDatasetReq` request models validate `ids` as UUIDs and reject duplicates.
2. Error Handling: validators raise `PydanticCustomError` with structured error types instead of bare `ValueError`.
3. Test Updates
4. Documentation Updates

### Type of change

- [x] Documentation Update
- [x] Refactoring
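To make the error-handling change concrete, here is a minimal, self-contained sketch of the pattern the diff below adopts: a `field_validator` that raises `PydanticCustomError` (a machine-readable error type plus a message template rendered from a context dict) instead of a bare `ValueError`. The class name `DeleteReqSketch` is illustrative only; the real model is `DeleteReq` in the diff.

```python
from collections import Counter

from pydantic import UUID1, BaseModel, Field, ValidationError, field_validator
from pydantic_core import PydanticCustomError


class DeleteReqSketch(BaseModel):
    # Field(...) marks `ids` as required; clients must send it, even if null.
    ids: list[UUID1] | None = Field(...)

    @field_validator("ids", mode="after")
    @classmethod
    def check_duplicate_ids(cls, v: list[UUID1] | None) -> list[str] | None:
        if not v:
            return v  # None or [] pass through unchanged
        hex_ids = [u.hex for u in v]
        duplicates = [h for h, n in Counter(hex_ids).items() if n > 1]
        if duplicates:
            # (error_type, message_template, context): the template is rendered
            # from the context, and the type survives in ValidationError.errors().
            raise PydanticCustomError("duplicate_uuids", "Duplicate ids: '{duplicate_ids}'", {"duplicate_ids": ", ".join(duplicates)})
        return hex_ids


try:
    DeleteReqSketch(ids=["2cdf0456-e9a7-11ee-8000-000000000000", "2cdf0456-e9a7-11ee-8000-000000000000"])
except ValidationError as e:
    print(e.errors()[0]["type"])  # duplicate_uuids
    print(e.errors()[0]["msg"])   # Duplicate ids: '2cdf0456e9a711ee8000000000000000'
```

The structured error type is what lets the API layer map validation failures to stable error responses instead of string-matching exception text.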
```diff
@@ -14,11 +14,13 @@
 # limitations under the License.
 #
 import uuid
+from collections import Counter
 from enum import auto
 from typing import Annotated, Any
 
 from flask import Request
 from pydantic import UUID1, BaseModel, Field, StringConstraints, ValidationError, field_serializer, field_validator
+from pydantic_core import PydanticCustomError
 from strenum import StrEnum
 from werkzeug.exceptions import BadRequest, UnsupportedMediaType
 
@@ -238,7 +240,7 @@ class CreateDatasetReq(Base):
             str: Validated Base64 string
 
         Raises:
-            ValueError: For structural errors in these cases:
+            PydanticCustomError: For structural errors in these cases:
                 - Missing MIME prefix header
                 - Invalid MIME prefix format
                 - Unsupported image MIME type
@@ -259,16 +261,16 @@ class CreateDatasetReq(Base):
         if "," in v:
             prefix, _ = v.split(",", 1)
             if not prefix.startswith("data:"):
-                raise ValueError("Invalid MIME prefix format. Must start with 'data:'")
+                raise PydanticCustomError("format_invalid", "Invalid MIME prefix format. Must start with 'data:'")
 
             mime_type = prefix[5:].split(";")[0]
             supported_mime_types = ["image/jpeg", "image/png"]
             if mime_type not in supported_mime_types:
-                raise ValueError(f"Unsupported MIME type. Allowed: {supported_mime_types}")
+                raise PydanticCustomError("format_invalid", "Unsupported MIME type. Allowed: {supported_mime_types}", {"supported_mime_types": supported_mime_types})
 
             return v
         else:
-            raise ValueError("Missing MIME prefix. Expected format: data:<mime>;base64,<data>")
+            raise PydanticCustomError("format_invalid", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>")
 
     @field_validator("embedding_model", mode="after")
     @classmethod
@@ -288,7 +290,7 @@ class CreateDatasetReq(Base):
             str: Validated <model_name>@<provider> format
 
         Raises:
-            ValueError: For these violations:
+            PydanticCustomError: For these violations:
                 - Missing @ separator
                 - Empty model_name/provider
                 - Invalid component structure
@@ -300,15 +302,15 @@ class CreateDatasetReq(Base):
             Invalid: "text-embedding-3-large@" (empty provider)
         """
         if "@" not in v:
-            raise ValueError("Embedding model identifier must follow <model_name>@<provider> format")
+            raise PydanticCustomError("format_invalid", "Embedding model identifier must follow <model_name>@<provider> format")
 
         components = v.split("@", 1)
         if len(components) != 2 or not all(components):
-            raise ValueError("Both model_name and provider must be non-empty strings")
+            raise PydanticCustomError("format_invalid", "Both model_name and provider must be non-empty strings")
 
         model_name, provider = components
         if not model_name.strip() or not provider.strip():
-            raise ValueError("Model name and provider cannot be whitespace-only strings")
+            raise PydanticCustomError("format_invalid", "Model name and provider cannot be whitespace-only strings")
         return v
 
     @field_validator("permission", mode="before")
@@ -374,13 +376,13 @@ class CreateDatasetReq(Base):
             ParserConfig | None: Validated configuration object
 
         Raises:
-            ValueError: When serialized JSON exceeds 65,535 characters
+            PydanticCustomError: When serialized JSON exceeds 65,535 characters
         """
         if v is None:
             return None
 
         if (json_str := v.model_dump_json()) and len(json_str) > 65535:
-            raise ValueError(f"Parser config exceeds size limit (max 65,535 characters). Current size: {len(json_str):,}")
+            raise PydanticCustomError("string_too_long", "Parser config exceeds size limit (max 65,535 characters). Current size: {actual}", {"actual": len(json_str)})
         return v
 
 
@@ -390,4 +392,88 @@ class UpdateDatasetReq(CreateDatasetReq):
 
     @field_serializer("dataset_id")
     def serialize_uuid_to_hex(self, v: uuid.UUID) -> str:
+        """
+        Serializes a UUID version 1 object to its hexadecimal string representation.
+
+        This field serializer specifically handles UUID version 1 objects, converting them
+        to their canonical 32-character hexadecimal format without hyphens. The conversion
+        is designed for consistent serialization in API responses and database storage.
+
+        Args:
+            v (uuid.UUID1): The UUID version 1 object to serialize. Must be a valid
+                UUID1 instance generated by Python's uuid module.
+
+        Returns:
+            str: 32-character lowercase hexadecimal string representation
+                Example: "550e8400e29b41d4a716446655440000"
+
+        Raises:
+            AttributeError: If input is not a proper UUID object (missing hex attribute)
+            TypeError: If input is not a UUID1 instance (when type checking is enabled)
+
+        Notes:
+            - Version 1 UUIDs contain timestamp and MAC address information
+            - The .hex property automatically converts to lowercase hexadecimal
+            - For cross-version compatibility, consider typing as uuid.UUID instead
+        """
         return v.hex
+
+
+class DeleteReq(Base):
+    ids: list[UUID1] | None = Field(...)
+
+    @field_validator("ids", mode="after")
+    def check_duplicate_ids(cls, v: list[UUID1] | None) -> list[str] | None:
+        """
+        Validates and converts a list of UUID1 objects to hexadecimal strings while checking for duplicates.
+
+        This validator implements a three-stage processing pipeline:
+            1. Null Handling - returns None for empty/null input
+            2. UUID Conversion - transforms UUID objects to hex strings
+            3. Duplicate Validation - ensures all IDs are unique
+
+        Behavior Specifications:
+            - Input: None → Returns None (indicates no operation)
+            - Input: [] → Returns [] (empty list for explicit no-op)
+            - Input: [UUID1,...] → Returns validated hex strings
+            - Duplicates: Raises formatted PydanticCustomError
+
+        Args:
+            v (list[UUID1] | None):
+                - None: Indicates no datasets should be processed
+                - Empty list: Explicit empty operation
+                - Populated list: Dataset UUIDs to validate/convert
+
+        Returns:
+            list[str] | None:
+                - None when input is None
+                - List of 32-character hex strings (lowercase, no hyphens)
+                  Example: ["550e8400e29b41d4a716446655440000"]
+
+        Raises:
+            PydanticCustomError: When duplicates detected, containing:
+                - Error type: "duplicate_uuids"
+                - Template message: "Duplicate ids: '{duplicate_ids}'"
+                - Context: {"duplicate_ids": "id1, id2, ..."}
+
+        Example:
+            >>> validate([UUID("..."), UUID("...")])
+            ["2cdf0456e9a711ee8000000000000000", ...]
+
+            >>> validate([UUID("..."), UUID("...")])  # Duplicates
+            PydanticCustomError: Duplicate ids: '2cdf0456e9a711ee8000000000000000'
+        """
+        if not v:
+            return v
+
+        uuid_hex_list = [ids.hex for ids in v]
+        duplicates = [item for item, count in Counter(uuid_hex_list).items() if count > 1]
+
+        if duplicates:
+            duplicates_str = ", ".join(duplicates)
+            raise PydanticCustomError("duplicate_uuids", "Duplicate ids: '{duplicate_ids}'", {"duplicate_ids": duplicates_str})
+
+        return uuid_hex_list
+
+
+class DeleteDatasetReq(DeleteReq): ...
```
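On the serializer side, here is a hedged sketch of how `field_serializer` produces the hyphen-less hex form on dump. The model name `UpdateDatasetReqSketch` is illustrative; the real class is `UpdateDatasetReq` in the diff above.

```python
import uuid

from pydantic import BaseModel, field_serializer


class UpdateDatasetReqSketch(BaseModel):
    dataset_id: uuid.UUID

    @field_serializer("dataset_id")
    def serialize_uuid_to_hex(self, v: uuid.UUID) -> str:
        # .hex is the 32-character lowercase form without hyphens.
        return v.hex


req = UpdateDatasetReqSketch(dataset_id=uuid.uuid1())
print(req.model_dump()["dataset_id"])  # e.g. 2cdf0456e9a711ee8000000000000000
print(req.model_dump_json())           # {"dataset_id":"2cdf0456e9a711ee8000000000000000"}
```

The serializer applies to both `model_dump()` and `model_dump_json()`, so IDs leave the model in the same hyphen-less form the docstring above names for API responses and database storage.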