Refa: HTTP API delete dataset / test cases / docs (#7657)

### What problem does this PR solve?

This PR introduces Pydantic-based validation for the delete dataset HTTP
API, improving code clarity and robustness. Key changes include:

1. Pydantic Validation
2. Error Handling
3. Test Updates
4. Documentation Updates

### Type of change

- [x] Documentation Update
- [x] Refactoring
This commit is contained in:
liu an
2025-05-16 10:16:43 +08:00
committed by GitHub
parent 0e9ff8c1f7
commit ae8b628f0a
8 changed files with 341 additions and 173 deletions

View File

@ -20,7 +20,6 @@ import logging
from flask import request
from peewee import OperationalError
from api import settings
from api.db import FileSource, StatusEnum
from api.db.db_models import File
from api.db.services.document_service import DocumentService
@ -30,7 +29,6 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.user_service import TenantService
from api.utils import get_uuid
from api.utils.api_utils import (
check_duplicate_ids,
deep_merge,
get_error_argument_result,
get_error_data_result,
@ -39,7 +37,7 @@ from api.utils.api_utils import (
token_required,
verify_embedding_availability,
)
from api.utils.validation_utils import CreateDatasetReq, UpdateDatasetReq, validate_and_parse_json_request
from api.utils.validation_utils import CreateDatasetReq, DeleteDatasetReq, UpdateDatasetReq, validate_and_parse_json_request
@manager.route("/datasets", methods=["POST"]) # noqa: F821
@ -190,72 +188,85 @@ def delete(tenant_id):
required: true
schema:
type: object
required:
- ids
properties:
ids:
type: array
type: array or null
items:
type: string
description: List of dataset IDs to delete.
description: |
Specifies the datasets to delete:
- If `null`, all datasets will be deleted.
- If an array of IDs, only the specified datasets will be deleted.
- If an empty array, no datasets will be deleted.
responses:
200:
description: Successful operation.
schema:
type: object
"""
req, err = validate_and_parse_json_request(request, DeleteDatasetReq)
if err is not None:
return get_error_argument_result(err)
kb_id_instance_pairs = []
if req["ids"] is None:
try:
kbs = KnowledgebaseService.query(tenant_id=tenant_id)
for kb in kbs:
kb_id_instance_pairs.append((kb.id, kb))
except OperationalError as e:
logging.exception(e)
return get_error_data_result(message="Database operation failed")
else:
error_kb_ids = []
for kb_id in req["ids"]:
try:
kb = KnowledgebaseService.get_or_none(id=kb_id, tenant_id=tenant_id)
if kb is None:
error_kb_ids.append(kb_id)
continue
kb_id_instance_pairs.append((kb_id, kb))
except OperationalError as e:
logging.exception(e)
return get_error_data_result(message="Database operation failed")
if len(error_kb_ids) > 0:
return get_error_data_result(message=f"""User '{tenant_id}' lacks permission for datasets: '{", ".join(error_kb_ids)}'""")
errors = []
success_count = 0
req = request.json
if not req:
ids = None
else:
ids = req.get("ids")
if not ids:
id_list = []
kbs = KnowledgebaseService.query(tenant_id=tenant_id)
for kb in kbs:
id_list.append(kb.id)
else:
id_list = ids
unique_id_list, duplicate_messages = check_duplicate_ids(id_list, "dataset")
id_list = unique_id_list
for id in id_list:
kbs = KnowledgebaseService.query(id=id, tenant_id=tenant_id)
if not kbs:
errors.append(f"You don't own the dataset {id}")
continue
for doc in DocumentService.query(kb_id=id):
if not DocumentService.remove_document(doc, tenant_id):
errors.append(f"Remove document error for dataset {id}")
for kb_id, kb in kb_id_instance_pairs:
try:
for doc in DocumentService.query(kb_id=kb_id):
if not DocumentService.remove_document(doc, tenant_id):
errors.append(f"Remove document '{doc.id}' error for dataset '{kb_id}'")
continue
f2d = File2DocumentService.get_by_document_id(doc.id)
FileService.filter_delete(
[
File.source_type == FileSource.KNOWLEDGEBASE,
File.id == f2d[0].file_id,
]
)
File2DocumentService.delete_by_document_id(doc.id)
FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kb.name])
if not KnowledgebaseService.delete_by_id(kb_id):
errors.append(f"Delete dataset error for {kb_id}")
continue
f2d = File2DocumentService.get_by_document_id(doc.id)
FileService.filter_delete(
[
File.source_type == FileSource.KNOWLEDGEBASE,
File.id == f2d[0].file_id,
]
)
File2DocumentService.delete_by_document_id(doc.id)
FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
if not KnowledgebaseService.delete_by_id(id):
errors.append(f"Delete dataset error for {id}")
continue
success_count += 1
if errors:
if success_count > 0:
return get_result(data={"success_count": success_count, "errors": errors}, message=f"Partially deleted {success_count} datasets with {len(errors)} errors")
else:
return get_error_data_result(message="; ".join(errors))
if duplicate_messages:
if success_count > 0:
return get_result(
message=f"Partially deleted {success_count} datasets with {len(duplicate_messages)} errors",
data={"success_count": success_count, "errors": duplicate_messages},
)
else:
return get_error_data_result(message=";".join(duplicate_messages))
return get_result(code=settings.RetCode.SUCCESS)
success_count += 1
except OperationalError as e:
logging.exception(e)
return get_error_data_result(message="Database operation failed")
if not errors:
return get_result()
error_message = f"Successfully deleted {success_count} datasets, {len(errors)} failed. Details: {'; '.join(errors)[:128]}..."
if success_count == 0:
return get_error_data_result(message=error_message)
return get_result(data={"success_count": success_count, "errors": errors[:5]}, message=error_message)
@manager.route("/datasets/<dataset_id>", methods=["PUT"]) # noqa: F821
@ -373,7 +384,7 @@ def update(tenant_id, dataset_id):
logging.exception(e)
return get_error_data_result(message="Database operation failed")
return get_result(code=settings.RetCode.SUCCESS)
return get_result()
@manager.route("/datasets", methods=["GET"]) # noqa: F821

View File

@ -14,11 +14,13 @@
# limitations under the License.
#
import uuid
from collections import Counter
from enum import auto
from typing import Annotated, Any
from flask import Request
from pydantic import UUID1, BaseModel, Field, StringConstraints, ValidationError, field_serializer, field_validator
from pydantic_core import PydanticCustomError
from strenum import StrEnum
from werkzeug.exceptions import BadRequest, UnsupportedMediaType
@ -238,7 +240,7 @@ class CreateDatasetReq(Base):
str: Validated Base64 string
Raises:
ValueError: For structural errors in these cases:
PydanticCustomError: For structural errors in these cases:
- Missing MIME prefix header
- Invalid MIME prefix format
- Unsupported image MIME type
@ -259,16 +261,16 @@ class CreateDatasetReq(Base):
if "," in v:
prefix, _ = v.split(",", 1)
if not prefix.startswith("data:"):
raise ValueError("Invalid MIME prefix format. Must start with 'data:'")
raise PydanticCustomError("format_invalid", "Invalid MIME prefix format. Must start with 'data:'")
mime_type = prefix[5:].split(";")[0]
supported_mime_types = ["image/jpeg", "image/png"]
if mime_type not in supported_mime_types:
raise ValueError(f"Unsupported MIME type. Allowed: {supported_mime_types}")
raise PydanticCustomError("format_invalid", "Unsupported MIME type. Allowed: {supported_mime_types}", {"supported_mime_types": supported_mime_types})
return v
else:
raise ValueError("Missing MIME prefix. Expected format: data:<mime>;base64,<data>")
raise PydanticCustomError("format_invalid", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>")
@field_validator("embedding_model", mode="after")
@classmethod
@ -288,7 +290,7 @@ class CreateDatasetReq(Base):
str: Validated <model_name>@<provider> format
Raises:
ValueError: For these violations:
PydanticCustomError: For these violations:
- Missing @ separator
- Empty model_name/provider
- Invalid component structure
@ -300,15 +302,15 @@ class CreateDatasetReq(Base):
Invalid: "text-embedding-3-large@" (empty provider)
"""
if "@" not in v:
raise ValueError("Embedding model identifier must follow <model_name>@<provider> format")
raise PydanticCustomError("format_invalid", "Embedding model identifier must follow <model_name>@<provider> format")
components = v.split("@", 1)
if len(components) != 2 or not all(components):
raise ValueError("Both model_name and provider must be non-empty strings")
raise PydanticCustomError("format_invalid", "Both model_name and provider must be non-empty strings")
model_name, provider = components
if not model_name.strip() or not provider.strip():
raise ValueError("Model name and provider cannot be whitespace-only strings")
raise PydanticCustomError("format_invalid", "Model name and provider cannot be whitespace-only strings")
return v
@field_validator("permission", mode="before")
@ -374,13 +376,13 @@ class CreateDatasetReq(Base):
ParserConfig | None: Validated configuration object
Raises:
ValueError: When serialized JSON exceeds 65,535 characters
PydanticCustomError: When serialized JSON exceeds 65,535 characters
"""
if v is None:
return None
if (json_str := v.model_dump_json()) and len(json_str) > 65535:
raise ValueError(f"Parser config exceeds size limit (max 65,535 characters). Current size: {len(json_str):,}")
raise PydanticCustomError("string_too_long", "Parser config exceeds size limit (max 65,535 characters). Current size: {actual}", {"actual": len(json_str)})
return v
@ -390,4 +392,88 @@ class UpdateDatasetReq(CreateDatasetReq):
@field_serializer("dataset_id")
def serialize_uuid_to_hex(self, v: uuid.UUID) -> str:
    """
    Serialize the ``dataset_id`` field as a hyphen-free hexadecimal string.

    The value is produced by reading the UUID object's ``hex`` attribute, so
    the output is the 32-character hexadecimal form of the UUID.

    Args:
        v (uuid.UUID): The UUID object held by the ``dataset_id`` field.

    Returns:
        str: Hexadecimal representation without hyphens.
            Example: "550e8400e29b41d4a716446655440000"

    Raises:
        AttributeError: If ``v`` does not expose a ``hex`` attribute.
    """
    hex_digits = v.hex
    return hex_digits
class DeleteReq(Base):
    # Request body shared by bulk-delete endpoints.
    # `ids` must be present in the payload (`Field(...)` declares no default),
    # but its value may be null.
    ids: list[UUID1] | None = Field(...)

    @field_validator("ids", mode="after")
    @classmethod
    def check_duplicate_ids(cls, v: list[UUID1] | None) -> list[str] | None:
        """
        Convert validated UUID1 ids to hex strings and reject duplicate ids.

        Processing steps:
        1. Pass-through - ``None`` and ``[]`` are returned unchanged
        2. UUID conversion - each UUID object is replaced by its ``.hex`` string
        3. Duplicate validation - any id occurring more than once is rejected

        Behavior Specifications:
        - Input: None → Returns None
        - Input: [] → Returns []
        - Input: [UUID1,...] → Returns the ids as hex strings, in input order
        - Duplicates: Raises PydanticCustomError

        This validator attaches no meaning to ``None`` versus ``[]``; that is
        decided by the caller. The dataset delete handler introduced with this
        model treats ``None`` as "every dataset of the tenant" and an empty
        list as "nothing to delete".

        Args:
            v (list[UUID1] | None): The already type-validated ``ids`` value.

        Returns:
            list[str] | None:
            - None when input is None
            - [] when input is an empty list
            - Otherwise a list of 32-character hex strings (no hyphens)
              Example: ["550e8400e29b41d4a716446655440000"]

        Raises:
            PydanticCustomError: When duplicates are detected, containing:
            - Error type: "duplicate_uuids"
            - Template message: "Duplicate ids: '{duplicate_ids}'"
            - Context: {"duplicate_ids": "id1, id2, ..."}
        """
        # Covers both None and the empty list; neither can contain duplicates.
        if not v:
            return v

        uuid_hex_list = [item.hex for item in v]
        # Counter keeps first-seen order, so duplicates are reported in the
        # order they first appear in the request.
        duplicates = [hex_id for hex_id, count in Counter(uuid_hex_list).items() if count > 1]
        if duplicates:
            duplicates_str = ", ".join(duplicates)
            raise PydanticCustomError("duplicate_uuids", "Duplicate ids: '{duplicate_ids}'", {"duplicate_ids": duplicates_str})

        return uuid_hex_list
# Request model for the dataset delete endpoint; it adds nothing to DeleteReq.
class DeleteDatasetReq(DeleteReq): ...