feat(cell validation): add comprehensive data validation capabilities (#37)

- Add cell_validation.py module for Excel data validation metadata extraction
- Implement get_data_validation_for_cell() and get_all_validation_ranges()
- Include validation metadata in read_data_from_excel responses automatically
- Add get_data_validation_info MCP tool for validation rule summaries
- Resolve range references in list validations to actual cell values
- Support all validation types: list, whole, decimal, date, time, textLength
- Include operators (between, notBetween, equal, greaterThan, etc.) in metadata

This allows LLMs to understand Excel validation constraints including
dropdown options, numeric ranges, date constraints, and text length limits.
This commit is contained in:
Nate
2025-06-06 21:42:00 -07:00
committed by GitHub
parent b2c9ce8e6a
commit bb537b35be
5 changed files with 393 additions and 19 deletions

View File

@ -337,3 +337,23 @@ validate_excel_range(
- `start_cell`: Starting cell of range
- `end_cell`: Optional ending cell of range
- Returns: Validation result message
### get_data_validation_info
Get data validation rules and metadata for a worksheet.
```python
get_data_validation_info(filepath: str, sheet_name: str) -> str
```
- `filepath`: Path to Excel file
- `sheet_name`: Target worksheet name
- Returns: JSON string containing all data validation rules with metadata including:
- Validation type (list, whole, decimal, date, time, textLength)
- Operator (between, notBetween, equal, greaterThan, lessThan, etc.)
- Allowed values for list validations (resolved from ranges)
- Formula constraints for numeric/date validations
- Cell ranges where validation applies
- Prompt and error messages
**Note**: The `read_data_from_excel` tool automatically includes validation metadata for individual cells when available.

View File

@ -0,0 +1,179 @@
import logging
from typing import Any, Dict, List, Optional
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.utils.cell import coordinate_from_string, column_index_from_string
logger = logging.getLogger(__name__)
def get_data_validation_for_cell(worksheet: Worksheet, cell_address: str) -> Optional[Dict[str, Any]]:
    """Look up the data validation rule that applies to a single cell.

    Args:
        worksheet: The openpyxl worksheet to inspect.
        cell_address: Address of the cell, e.g. 'A1' or 'B2'.

    Returns:
        Metadata dictionary for the first validation rule whose range covers
        the cell, or None when no rule covers it or the lookup fails.
    """
    try:
        # Split the address into its column letters and row number, then turn
        # the letters into a numeric column index.
        column_letters, row_number = coordinate_from_string(cell_address)
        column_number = column_index_from_string(column_letters)

        # The first rule covering the cell wins; later rules are not consulted.
        matching_rule = next(
            (
                rule
                for rule in worksheet.data_validations.dataValidation
                if _cell_in_validation_range(row_number, column_number, rule)
            ),
            None,
        )
        if matching_rule is None:
            return None
        return _extract_validation_metadata(matching_rule, cell_address, worksheet)
    except Exception as e:
        # Best effort: a lookup failure is reported as "no validation".
        logger.warning(f"Failed to get validation for cell {cell_address}: {e}")
        return None
def _cell_in_validation_range(row: int, col: int, data_validation) -> bool:
    """Report whether the cell at (row, col) lies inside any range of a rule.

    Any error raised while inspecting the rule's ranges is logged and
    treated as "not covered".
    """
    try:
        # sqref holds every cell range the validation rule is attached to.
        return any(
            area.min_row <= row <= area.max_row
            and area.min_col <= col <= area.max_col
            for area in data_validation.sqref.ranges
        )
    except Exception as e:
        logger.warning(f"Error checking if cell ({row}, {col}) is in validation range for DV sqref '{getattr(data_validation, 'sqref', 'N/A')}': {e}")
        return False
def _extract_validation_metadata(data_validation, cell_address: str, worksheet: Optional[Worksheet] = None) -> Dict[str, Any]:
    """Build a metadata dictionary describing one DataValidation rule.

    Args:
        data_validation: The validation rule object to describe.
        cell_address: Address of the cell the metadata is reported for.
        worksheet: Optional worksheet, passed on to the list-value extraction
            so range references can be resolved.

    Returns:
        Dictionary with the rule's type and allow-blank flag plus whichever
        optional fields are set. If reading the rule fails, a fallback
        dictionary with validation_type "unknown" and an "error" text.
    """
    try:
        validation_info = {
            "cell": cell_address,
            "has_validation": True,
            "validation_type": data_validation.type,
            "allow_blank": data_validation.allowBlank,
        }

        # Optional attributes are copied only when they hold a truthy value.
        optional_fields = (
            ("operator", data_validation.operator),
            ("prompt", data_validation.prompt),
            ("prompt_title", data_validation.promptTitle),
            ("error_message", data_validation.error),
            ("error_title", data_validation.errorTitle),
        )
        for key, value in optional_fields:
            if value:
                validation_info[key] = value

        first_formula = data_validation.formula1
        if data_validation.type == "list" and first_formula:
            # Dropdown lists are reported as their allowed values.
            validation_info["allowed_values"] = _extract_list_values(first_formula, worksheet)
        elif first_formula:
            # Every other validation type is reported through its raw formulas.
            validation_info["formula1"] = first_formula
            if data_validation.formula2:
                validation_info["formula2"] = data_validation.formula2
        return validation_info
    except Exception as e:
        logger.warning(f"Failed to extract validation metadata: {e}")
        return {
            "cell": cell_address,
            "has_validation": True,
            "validation_type": "unknown",
            "error": f"Failed to parse validation: {e}"
        }
def _extract_list_values(formula: str, worksheet: Optional["Worksheet"] = None) -> List[str]:
    """Extract the allowed values from a list validation formula.

    Three formula shapes are recognised:

    * a comma-separated inline list such as '"Yes,No,Maybe"';
    * a cell or range reference such as '$A$1:$A$5', '=$A$1:$A$5',
      'Sheet1!$A$1:$A$5' or "'My Sheet'!$A$1:$A$5";
    * a single literal value.

    Args:
        formula: The validation's formula1 text.
        worksheet: Worksheet the validation belongs to. When given, range
            references are resolved to the referenced cells' values; a
            reference qualified with another sheet's name is looked up
            through ``worksheet.parent``.

    Returns:
        The allowed values as strings. A range reference that cannot be
        turned into values is reported as a single descriptive
        "Range: ..." entry instead. If parsing fails entirely, the formula
        itself is returned as the only entry.
    """
    try:
        # Remove quotes if present
        formula = formula.strip('"')

        # Comma-separated inline list: split, clean up, drop empty entries.
        if ',' in formula:
            values = [val.strip().strip('"') for val in formula.split(',')]
            return [val for val in values if val]

        # Anything without a range marker is a single literal value.
        if not (':' in formula or formula.startswith('$')):
            return [formula.strip('"')]

        # Range reference, but no worksheet to resolve it against.
        if worksheet is None:
            return [f"Range: {formula}"]

        try:
            actual_values = _resolve_list_range(formula, worksheet)
        except Exception as e:
            logger.warning(f"Could not resolve range '{formula}' for list validation: {e}")
            return [f"Range: {formula} (resolution error)"]
        if actual_values:
            return actual_values
        return [f"Range: {formula} (empty or unresolvable)"]
    except Exception as e:
        logger.warning(f"Failed to parse list formula '{formula}': {e}")
        return [formula]  # Return original formula if parsing fails


def _resolve_list_range(range_ref: str, worksheet: Any) -> List[str]:
    """Return the non-empty values of the cells a range reference points to."""
    # Tolerate a leading '=' as in '=Sheet1!$A$1:$A$5'.
    if range_ref.startswith('='):
        range_ref = range_ref[1:]

    # A worksheet lookup accepts only a bare coordinate, so the optional
    # "SheetName!" qualifier is split off and resolved via the workbook.
    sheet_part, separator, cell_part = range_ref.rpartition('!')
    target = worksheet
    if separator:
        title = sheet_part
        if len(title) >= 2 and title[0] == "'" and title[-1] == "'":
            # Quoted sheet name: drop the quotes and undo the '' escaping.
            title = title[1:-1].replace("''", "'")
        if title != getattr(worksheet, "title", None):
            target = worksheet.parent[title]

    # Absolute markers carry no meaning for a plain lookup.
    return _flatten_cell_values(target[cell_part.replace('$', '')])


def _flatten_cell_values(cells: Any) -> List[str]:
    """Collect str() of every non-None cell value from a cell or nested tuples of cells."""
    if hasattr(cells, 'value'):  # Single cell
        return [] if cells.value is None else [str(cells.value)]
    collected: List[str] = []
    for item in cells:
        collected.extend(_flatten_cell_values(item))
    return collected
def get_all_validation_ranges(worksheet: "Worksheet") -> List[Dict[str, Any]]:
    """Get all data validation ranges in a worksheet.

    Args:
        worksheet: The worksheet whose validation rules are listed.

    Returns:
        List of dictionaries, one per validation rule. Each always contains
        "ranges", "validation_type" and "allow_blank". When set on the rule
        it also contains "operator", "prompt", "prompt_title",
        "error_message" and "error_title". List validations add
        "allowed_values"; other types add "formula1" and, when present,
        "formula2". If reading the rules fails, the failure is logged and the
        rules collected so far are returned.
    """
    validations: List[Dict[str, Any]] = []
    try:
        for dv in worksheet.data_validations.dataValidation:
            validation_info: Dict[str, Any] = {
                "ranges": str(dv.sqref),
                "validation_type": dv.type,
                "allow_blank": dv.allowBlank,
            }

            # Optional attributes use the same key names as the per-cell
            # metadata and are included only when they hold a value.
            optional_fields = (
                ("operator", getattr(dv, "operator", None)),
                ("prompt", getattr(dv, "prompt", None)),
                ("prompt_title", getattr(dv, "promptTitle", None)),
                ("error_message", getattr(dv, "error", None)),
                ("error_title", getattr(dv, "errorTitle", None)),
            )
            for key, value in optional_fields:
                if value:
                    validation_info[key] = value

            if dv.type == "list" and dv.formula1:
                validation_info["allowed_values"] = _extract_list_values(dv.formula1, worksheet)
            elif dv.formula1:
                # Numeric, date, time, text-length and custom rules are
                # described by their raw formula constraints.
                validation_info["formula1"] = dv.formula1
                second_formula = getattr(dv, "formula2", None)
                if second_formula:
                    validation_info["formula2"] = second_formula

            validations.append(validation_info)
    except Exception as e:
        logger.warning(f"Failed to get validation ranges: {e}")
    return validations

View File

@ -1,14 +1,14 @@
from pathlib import Path
from typing import Any
from typing import Any, Dict
import logging
from openpyxl import load_workbook
from openpyxl.styles import Font
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.utils import get_column_letter
from .exceptions import DataError
from .cell_utils import parse_cell_range
from .cell_validation import get_data_validation_for_cell
logger = logging.getLogger(__name__)
@ -244,3 +244,108 @@ def _write_data_to_worksheet(
except Exception as e:
logger.error(f"Failed to write worksheet data: {e}")
raise DataError(str(e))
def read_excel_range_with_metadata(
    filepath: Path | str,
    sheet_name: str,
    start_cell: str = "A1",
    end_cell: str | None = None,
    include_validation: bool = True
) -> Dict[str, Any]:
    """Read data from Excel range with cell metadata including validation rules.

    Args:
        filepath: Path to Excel file
        sheet_name: Name of worksheet
        start_cell: Starting cell address; may also be a full "A1:C3" range,
            in which case its second half replaces end_cell
        end_cell: Ending cell address (optional). When omitted, the range is
            expanded from start_cell until an entirely empty row and an
            entirely empty column are reached
        include_validation: Whether to include validation metadata

    Returns:
        Dictionary with "range" (the address that was read), "sheet_name"
        and "cells", a list with one entry per cell holding its address,
        value, row, column and, if requested, validation info

    Raises:
        DataError: If the sheet does not exist, a cell reference is invalid,
            the start cell lies outside the sheet, or any other failure
            occurs while reading.
    """
    wb = None
    try:
        wb = load_workbook(filepath, read_only=False)
        if sheet_name not in wb.sheetnames:
            raise DataError(f"Sheet '{sheet_name}' not found")
        ws = wb[sheet_name]

        # Parse start cell
        if ':' in start_cell:
            start_cell, end_cell = start_cell.split(':')

        # Get start coordinates
        try:
            start_coords = parse_cell_range(f"{start_cell}:{start_cell}")
            if not start_coords or not all(coord is not None for coord in start_coords[:2]):
                raise DataError(f"Invalid start cell reference: {start_cell}")
            start_row, start_col = start_coords[0], start_coords[1]
        except ValueError as e:
            raise DataError(f"Invalid start cell format: {str(e)}") from e

        # Read the sheet dimensions once instead of on every loop iteration.
        max_row, max_col = ws.max_row, ws.max_column

        # Determine end coordinates
        if end_cell:
            try:
                end_coords = parse_cell_range(f"{end_cell}:{end_cell}")
                if not end_coords or not all(coord is not None for coord in end_coords[:2]):
                    raise DataError(f"Invalid end cell reference: {end_cell}")
                end_row, end_col = end_coords[0], end_coords[1]
            except ValueError as e:
                raise DataError(f"Invalid end cell format: {str(e)}") from e
        else:
            # Dynamically expand range until all values are empty
            end_row, end_col = start_row, start_col
            while end_row <= max_row and any(
                ws.cell(row=end_row, column=c).value is not None
                for c in range(start_col, max_col + 1)
            ):
                end_row += 1
            while end_col <= max_col and any(
                ws.cell(row=r, column=end_col).value is not None
                for r in range(start_row, max_row + 1)
            ):
                end_col += 1
            end_row -= 1  # Adjust back to last non-empty row
            end_col -= 1  # Adjust back to last non-empty column

        # Validate range bounds
        if start_row > max_row or start_col > max_col:
            raise DataError(
                f"Start cell out of bounds. Sheet dimensions are "
                f"A1:{get_column_letter(max_col)}{max_row}"
            )

        # Report the range that is actually read, including an auto-expanded
        # one. Only an expansion that found no data falls back to the bare
        # start cell (its end coordinates lie before the start).
        if end_cell or (end_row >= start_row and end_col >= start_col):
            range_label = f"{start_cell}:{get_column_letter(end_col)}{end_row}"
        else:
            range_label = start_cell

        # Build structured cell data
        range_data = {
            "range": range_label,
            "sheet_name": sheet_name,
            "cells": []
        }
        for row in range(start_row, end_row + 1):
            for col in range(start_col, end_col + 1):
                cell = ws.cell(row=row, column=col)
                cell_address = f"{get_column_letter(col)}{row}"
                cell_data = {
                    "address": cell_address,
                    "value": cell.value,
                    "row": row,
                    "column": col
                }
                # Add validation metadata if requested
                if include_validation:
                    validation_info = get_data_validation_for_cell(ws, cell_address)
                    if validation_info:
                        cell_data["validation"] = validation_info
                    else:
                        cell_data["validation"] = {"has_validation": False}
                range_data["cells"].append(cell_data)
        return range_data
    except DataError as e:
        logger.error(str(e))
        raise
    except Exception as e:
        logger.error(f"Failed to read Excel range with metadata: {e}")
        raise DataError(str(e)) from e
    finally:
        # Close the workbook on every path, not only after a successful read.
        if wb is not None:
            wb.close()

View File

@ -165,7 +165,7 @@ def format_range(
full_path = get_excel_path(filepath)
from excel_mcp.formatting import format_range as format_range_func
result = format_range_func(
format_range_func(
filepath=full_path,
sheet_name=sheet_name,
start_cell=start_cell,
@ -201,20 +201,35 @@ def read_data_from_excel(
preview_only: bool = False
) -> str:
"""
Read data from Excel worksheet.
Read data from Excel worksheet with cell metadata including validation rules.
Args:
filepath: Path to Excel file
sheet_name: Name of worksheet
start_cell: Starting cell (default A1)
end_cell: Ending cell (optional, auto-expands if not provided)
preview_only: Whether to return preview only
Returns:
Data from Excel worksheet as json string. list of lists or empty list if no data found. sublists are assumed to be rows.
JSON string containing structured cell data with validation metadata.
Each cell includes: address, value, row, column, and validation info (if any).
"""
try:
full_path = get_excel_path(filepath)
from excel_mcp.data import read_excel_range
result = read_excel_range(full_path, sheet_name, start_cell, end_cell, preview_only)
if not result:
from excel_mcp.data import read_excel_range_with_metadata
result = read_excel_range_with_metadata(
full_path,
sheet_name,
start_cell,
end_cell
)
if not result or not result.get("cells"):
return "No data found in specified range"
# Convert the list of dicts to a formatted string
data_str = "\n".join([str(row) for row in result])
return data_str
# Return as formatted JSON string
import json
return json.dumps(result, indent=2, default=str)
except Exception as e:
logger.error(f"Error reading data: {e}")
raise
@ -253,7 +268,7 @@ def create_workbook(filepath: str) -> str:
try:
full_path = get_excel_path(filepath)
from excel_mcp.workbook import create_workbook as create_workbook_impl
result = create_workbook_impl(full_path)
create_workbook_impl(full_path)
return f"Created workbook at {full_path}"
except WorkbookError as e:
return f"Error: {str(e)}"
@ -500,6 +515,50 @@ def validate_excel_range(
logger.error(f"Error validating range: {e}")
raise
@mcp.tool()
def get_data_validation_info(
    filepath: str,
    sheet_name: str
) -> str:
    """
    Get all data validation rules in a worksheet.

    This tool helps identify which cell ranges have validation rules
    and what types of validation are applied.

    Args:
        filepath: Path to Excel file
        sheet_name: Name of worksheet

    Returns:
        JSON string containing all validation rules in the worksheet,
        or a plain message when the sheet does not exist or has no rules
    """
    try:
        full_path = get_excel_path(filepath)
        from openpyxl import load_workbook
        from excel_mcp.cell_validation import get_all_validation_ranges

        wb = load_workbook(full_path, read_only=False)
        try:
            if sheet_name not in wb.sheetnames:
                return f"Error: Sheet '{sheet_name}' not found"
            validations = get_all_validation_ranges(wb[sheet_name])
        finally:
            # Close the workbook on the early return and on errors as well.
            wb.close()

        if not validations:
            return "No data validation rules found in this worksheet"

        import json
        return json.dumps({
            "sheet_name": sheet_name,
            "validation_rules": validations
        }, indent=2, default=str)
    except Exception as e:
        logger.error(f"Error getting validation info: {e}")
        raise
async def run_sse():
"""Run Excel MCP server in SSE mode."""
# Assign value to EXCEL_FILES_PATH in SSE mode

25
uv.lock generated
View File

@ -1,5 +1,4 @@
version = 1
revision = 1
requires-python = ">=3.10"
[[package]]
@ -67,17 +66,19 @@ wheels = [
[[package]]
name = "excel-mcp-server"
version = "0.1.1"
version = "0.1.3"
source = { editable = "." }
dependencies = [
{ name = "mcp", extra = ["cli"] },
{ name = "openpyxl" },
{ name = "typer" },
]
[package.metadata]
requires-dist = [
{ name = "mcp", extras = ["cli"], specifier = ">=1.2.0" },
{ name = "mcp", extras = ["cli"], specifier = ">=1.6.0" },
{ name = "openpyxl", specifier = ">=3.1.2" },
{ name = "typer", specifier = ">=0.15.1" },
]
[[package]]
@ -158,7 +159,7 @@ wheels = [
[[package]]
name = "mcp"
version = "1.2.1"
version = "1.9.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
@ -166,13 +167,14 @@ dependencies = [
{ name = "httpx-sse" },
{ name = "pydantic" },
{ name = "pydantic-settings" },
{ name = "python-multipart" },
{ name = "sse-starlette" },
{ name = "starlette" },
{ name = "uvicorn" },
{ name = "uvicorn", marker = "sys_platform != 'emscripten'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/fc/30/51e4555826126e3954fa2ab1e934bf74163c5fe05e98f38ca4d0f8abbf63/mcp-1.2.1.tar.gz", hash = "sha256:c9d43dbfe943aa1530e2be8f54b73af3ebfb071243827b4483d421684806cb45", size = 103968 }
sdist = { url = "https://files.pythonhosted.org/packages/e7/bc/54aec2c334698cc575ca3b3481eed627125fb66544152fa1af927b1a495c/mcp-1.9.1.tar.gz", hash = "sha256:19879cd6dde3d763297617242888c2f695a95dfa854386a6a68676a646ce75e4", size = 316247 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4c/0d/6770742a84c8aa1d36c0d628896a380584c5759612e66af7446af07d8775/mcp-1.2.1-py3-none-any.whl", hash = "sha256:579bf9c9157850ebb1344f3ca6f7a3021b0123c44c9f089ef577a7062522f0fd", size = 66453 },
{ url = "https://files.pythonhosted.org/packages/a6/c0/4ac795585a22a0a2d09cd2b1187b0252d2afcdebd01e10a68bbac4d34890/mcp-1.9.1-py3-none-any.whl", hash = "sha256:2900ded8ffafc3c8a7bfcfe8bc5204037e988e753ec398f371663e6a06ecd9a9", size = 130261 },
]
[package.optional-dependencies]
@ -322,6 +324,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863 },
]
[[package]]
name = "python-multipart"
version = "0.0.20"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f3/87/f44d7c9f274c7ee665a29b885ec97089ec5dc034c7f3fafa03da9e39a09e/python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13", size = 37158 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/45/58/38b5afbc1a800eeea951b9285d3912613f2603bdf897a4ab0f4bd7f405fc/python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104", size = 24546 },
]
[[package]]
name = "rich"
version = "13.9.4"