feat(cell validation): add comprehensive data validation capabilities (#37)

- Add cell_validation.py module for Excel data validation metadata extraction
- Implement get_data_validation_for_cell() and get_all_validation_ranges()
- Include validation metadata in read_data_from_excel responses automatically
- Add get_data_validation_info MCP tool for validation rule summaries
- Resolve range references in list validations to actual cell values
- Support all validation types: list, whole, decimal, date, time, textLength
- Include operators (between, notBetween, equal, greaterThan, etc.) in metadata

This allows LLMs to understand Excel validation constraints including dropdown options, numeric ranges, date constraints, and text length limits.
2026-01-27 05:36:50 +08:00 · 2025-06-06 21:42:00 -07:00
parent b2c9ce8e6a
commit bb537b35be
5 changed files with 393 additions and 19 deletions
--- a/TOOLS.md
+++ b/TOOLS.md
@ -337,3 +337,23 @@ validate_excel_range(
 - `start_cell`: Starting cell of range
 - `end_cell`: Optional ending cell of range
 - Returns: Validation result message
+
+### get_data_validation_info
+
+Get data validation rules and metadata for a worksheet.
+
+```python
+get_data_validation_info(filepath: str, sheet_name: str) -> str
+```
+
+- `filepath`: Path to Excel file
+- `sheet_name`: Target worksheet name
+- Returns: JSON string containing all data validation rules with metadata including:
+  - Validation type (list, whole, decimal, date, time, textLength)
+  - Operator (between, notBetween, equal, greaterThan, lessThan, etc.)
+  - Allowed values for list validations (resolved from ranges)
+  - Formula constraints for numeric/date validations
+  - Cell ranges where validation applies
+  - Prompt and error messages
+
+**Note**: The `read_data_from_excel` tool automatically includes validation metadata for individual cells when available.
--- a/src/excel_mcp/cell_validation.py
+++ b/src/excel_mcp/cell_validation.py
@ -0,0 +1,179 @@
+import logging
+from typing import Any, Dict, List, Optional
+
+from openpyxl.worksheet.worksheet import Worksheet
+from openpyxl.utils.cell import coordinate_from_string, column_index_from_string
+
+logger = logging.getLogger(__name__)
+
+def get_data_validation_for_cell(worksheet: Worksheet, cell_address: str) -> Optional[Dict[str, Any]]:
+    """Return validation metadata for the first rule covering ``cell_address``.
+
+    Args:
+        worksheet: The openpyxl worksheet whose validation rules are searched
+        cell_address: Cell address like 'A1', 'B2', etc.
+
+    Returns:
+        The dictionary built by ``_extract_validation_metadata`` for the first
+        matching rule, or None when no rule covers the cell or the lookup fails
+        (failures are logged as warnings and never raised)
+    """
+    try:
+        column_letters, row_number = coordinate_from_string(cell_address)
+        column_number = column_index_from_string(column_letters)
+
+        # Rules are scanned in worksheet order; the first one whose sqref
+        # contains the cell wins.
+        matching_rule = next(
+            (
+                rule
+                for rule in worksheet.data_validations.dataValidation
+                if _cell_in_validation_range(row_number, column_number, rule)
+            ),
+            None,
+        )
+        if matching_rule is None:
+            return None
+        return _extract_validation_metadata(matching_rule, cell_address, worksheet)
+
+    except Exception as e:
+        logger.warning(f"Failed to get validation for cell {cell_address}: {e}")
+        return None
+
+def _cell_in_validation_range(row: int, col: int, data_validation) -> bool:
+    """Report whether (row, col) falls inside any range listed in the rule's sqref.
+
+    Any error while inspecting the rule is logged as a warning and treated as
+    "not covered" (False).
+    """
+    try:
+        # sqref.ranges holds every rectangular area the rule applies to; the
+        # cell matches as soon as one of them contains both coordinates.
+        return any(
+            area.min_row <= row <= area.max_row and area.min_col <= col <= area.max_col
+            for area in data_validation.sqref.ranges
+        )
+    except Exception as e:
+        logger.warning(f"Error checking if cell ({row}, {col}) is in validation range for DV sqref '{getattr(data_validation, 'sqref', 'N/A')}': {e}")
+        return False
+
+def _extract_validation_metadata(data_validation, cell_address: str, worksheet: Optional[Worksheet] = None) -> Dict[str, Any]:
+    """Build the metadata dictionary describing one DataValidation rule.
+
+    Args:
+        data_validation: The validation rule object to describe
+        cell_address: Address reported under the "cell" key
+        worksheet: Optional worksheet forwarded to ``_extract_list_values`` so
+            list rules can be resolved to their values
+
+    Returns:
+        Dictionary that always carries "cell", "has_validation",
+        "validation_type" and "allow_blank"; operator, prompt/error texts and
+        either "allowed_values" (list rules) or "formula1"/"formula2" are added
+        only when set. If extraction fails, a fallback dictionary with
+        validation_type "unknown" and an "error" description is returned.
+    """
+    # (attribute on the rule, key in the result) -- copied only when truthy,
+    # in this order.
+    optional_fields = (
+        ("operator", "operator"),
+        ("prompt", "prompt"),
+        ("promptTitle", "prompt_title"),
+        ("error", "error_message"),
+        ("errorTitle", "error_title"),
+    )
+
+    try:
+        metadata = {
+            "cell": cell_address,
+            "has_validation": True,
+            "validation_type": data_validation.type,
+            "allow_blank": data_validation.allowBlank,
+        }
+
+        for attribute, key in optional_fields:
+            attribute_value = getattr(data_validation, attribute)
+            if attribute_value:
+                metadata[key] = attribute_value
+
+        primary_formula = data_validation.formula1
+        if primary_formula:
+            if data_validation.type == "list":
+                # Dropdown lists are reported as their allowed values
+                metadata["allowed_values"] = _extract_list_values(primary_formula, worksheet)
+            else:
+                # Every other rule type exposes its raw formula constraints
+                metadata["formula1"] = primary_formula
+                if data_validation.formula2:
+                    metadata["formula2"] = data_validation.formula2
+
+        return metadata
+
+    except Exception as e:
+        logger.warning(f"Failed to extract validation metadata: {e}")
+        return {
+            "cell": cell_address,
+            "has_validation": True,
+            "validation_type": "unknown",
+            "error": f"Failed to parse validation: {e}"
+        }
+
+def _extract_list_values(formula: str, worksheet: Optional[Worksheet] = None) -> List[str]:
+    """Extract allowed values from a list validation formula.
+
+    Args:
+        formula: The rule's formula1 text: a literal list ('"Yes,No"'), a
+            range reference ('$A$1:$A$5', '=$A$1:$A$5') or a sheet-qualified
+            reference ("Lists!$A$1:$A$5", "'My Lists'!$A$1:$A$5")
+        worksheet: Worksheet used to resolve references; references to another
+            sheet are looked up through ``worksheet.parent``
+
+    Returns:
+        The allowed values as strings. References that cannot be resolved are
+        reported as a single descriptive "Range: ..." entry; if parsing fails
+        altogether the original formula is returned as the only entry.
+    """
+    try:
+        # A literal list is wrapped in double quotes. Remember that before the
+        # quotes are stripped so quoted text containing ':' (e.g. "10:30") is
+        # never mistaken for a cell range.
+        is_quoted_literal = len(formula) >= 2 and formula.startswith('"') and formula.endswith('"')
+        formula = formula.strip('"')
+
+        # Comma-separated list: split and drop empty entries
+        if ',' in formula:
+            values = [val.strip().strip('"') for val in formula.split(',')]
+            return [val for val in values if val]
+
+        if is_quoted_literal:
+            return [formula]
+
+        # Drop a leading '=' before deciding whether this is a reference, so
+        # '=$A$1' is recognised just like '$A$1'.
+        range_ref = formula[1:] if formula.startswith('=') else formula
+        if not (':' in range_ref or range_ref.startswith('$')):
+            return [formula]  # Single value (or a name this function does not resolve)
+
+        if worksheet is None:
+            return [f"Range: {formula}"]
+
+        try:
+            # Indexing a worksheet only accepts a bare coordinate or range, so
+            # a "Sheet!" prefix has to be split off and looked up in the workbook.
+            sheet_part, separator, cells_part = range_ref.rpartition('!')
+            source_sheet = worksheet
+            if separator:
+                # Quoted titles ('My Lists') escape an apostrophe by doubling it
+                sheet_title = sheet_part.strip("'").replace("''", "'")
+                source_sheet = worksheet.parent[sheet_title]
+
+            resolved = source_sheet[cells_part]
+
+            # Normalise the three possible shapes (single cell, tuple of cells,
+            # tuple of rows) into an iterable of rows.
+            if hasattr(resolved, 'value'):
+                resolved = ((resolved,),)
+
+            actual_values = []
+            for row_of_cells in resolved:
+                cells = (row_of_cells,) if hasattr(row_of_cells, 'value') else row_of_cells
+                actual_values.extend(str(cell.value) for cell in cells if cell.value is not None)
+
+            if actual_values:
+                return actual_values
+            return [f"Range: {formula} (empty or unresolvable)"]
+
+        except Exception as e:
+            logger.warning(f"Could not resolve range '{formula}' for list validation: {e}")
+            return [f"Range: {formula} (resolution error)"]
+
+    except Exception as e:
+        logger.warning(f"Failed to parse list formula '{formula}': {e}")
+        return [formula]  # Return original formula if parsing fails
+
+def get_all_validation_ranges(worksheet: Worksheet) -> List[Dict[str, Any]]:
+    """Get all data validation ranges in a worksheet.
+
+    Args:
+        worksheet: The openpyxl worksheet whose validation rules are listed
+
+    Returns:
+        One dictionary per rule. Each always carries "ranges",
+        "validation_type" and "allow_blank"; "operator", "prompt",
+        "prompt_title", "error_message" and "error_title" are added when set.
+        List rules add "allowed_values"; other rules add "formula1" and, when
+        present, "formula2". Rules that cannot be read are logged and skipped.
+    """
+    validations = []
+
+    try:
+        rules = list(worksheet.data_validations.dataValidation)
+    except Exception as e:
+        logger.warning(f"Failed to get validation ranges: {e}")
+        return validations
+
+    # (attribute on the rule, key in the result) -- copied only when truthy
+    optional_fields = (
+        ("operator", "operator"),
+        ("prompt", "prompt"),
+        ("promptTitle", "prompt_title"),
+        ("error", "error_message"),
+        ("errorTitle", "error_title"),
+    )
+
+    for dv in rules:
+        # Handle each rule on its own so one malformed rule does not hide the rest
+        try:
+            validation_info = {
+                "ranges": str(dv.sqref),
+                "validation_type": dv.type,
+                "allow_blank": dv.allowBlank,
+            }
+
+            for attribute, key in optional_fields:
+                value = getattr(dv, attribute, None)
+                if value:
+                    validation_info[key] = value
+
+            if dv.formula1:
+                if dv.type == "list":
+                    validation_info["allowed_values"] = _extract_list_values(dv.formula1, worksheet)
+                else:
+                    # Numeric/date/text-length rules expose their formula constraints
+                    validation_info["formula1"] = dv.formula1
+                    if dv.formula2:
+                        validation_info["formula2"] = dv.formula2
+
+            validations.append(validation_info)
+
+        except Exception as e:
+            logger.warning(f"Failed to get validation ranges: {e}")
+
+    return validations
--- a/src/excel_mcp/data.py
+++ b/src/excel_mcp/data.py
@ -1,14 +1,14 @@
 from pathlib import Path
-from typing import Any
+from typing import Any, Dict
 import logging

 from openpyxl import load_workbook
-from openpyxl.styles import Font
 from openpyxl.worksheet.worksheet import Worksheet
 from openpyxl.utils import get_column_letter

 from .exceptions import DataError
 from .cell_utils import parse_cell_range
+from .cell_validation import get_data_validation_for_cell

 logger = logging.getLogger(__name__)

@ -244,3 +244,108 @@ def _write_data_to_worksheet(
    except Exception as e:
        logger.error(f"Failed to write worksheet data: {e}")
        raise DataError(str(e))
+
+def read_excel_range_with_metadata(
+    filepath: Path | str,
+    sheet_name: str,
+    start_cell: str = "A1",
+    end_cell: str | None = None,
+    include_validation: bool = True
+) -> Dict[str, Any]:
+    """Read data from Excel range with cell metadata including validation rules.
+
+    Args:
+        filepath: Path to Excel file
+        sheet_name: Name of worksheet
+        start_cell: Starting cell address; a full range such as "A1:C5" is also
+            accepted and overrides ``end_cell``
+        end_cell: Ending cell address (optional). When omitted, the range is
+            expanded from ``start_cell`` until an empty row/column is reached
+        include_validation: Whether to include validation metadata
+
+    Returns:
+        Dictionary with "range", "sheet_name" and "cells"; each cell entry has
+        "address", "value", "row", "column" and, if requested, "validation"
+
+    Raises:
+        DataError: If the sheet is missing, a cell reference is invalid, the
+            start cell is out of bounds, or any other error occurs while reading
+    """
+    wb = None
+    try:
+        wb = load_workbook(filepath, read_only=False)
+
+        if sheet_name not in wb.sheetnames:
+            raise DataError(f"Sheet '{sheet_name}' not found")
+
+        ws = wb[sheet_name]
+
+        # Parse start cell
+        if ':' in start_cell:
+            start_cell, end_cell = start_cell.split(':')
+
+        # Get start coordinates
+        try:
+            start_coords = parse_cell_range(f"{start_cell}:{start_cell}")
+            if not start_coords or not all(coord is not None for coord in start_coords[:2]):
+                raise DataError(f"Invalid start cell reference: {start_cell}")
+            start_row, start_col = start_coords[0], start_coords[1]
+        except ValueError as e:
+            raise DataError(f"Invalid start cell format: {str(e)}")
+
+        # Determine end coordinates
+        if end_cell:
+            try:
+                end_coords = parse_cell_range(f"{end_cell}:{end_cell}")
+                if not end_coords or not all(coord is not None for coord in end_coords[:2]):
+                    raise DataError(f"Invalid end cell reference: {end_cell}")
+                end_row, end_col = end_coords[0], end_coords[1]
+            except ValueError as e:
+                raise DataError(f"Invalid end cell format: {str(e)}")
+        else:
+            # Dynamically expand range until all values are empty
+            end_row, end_col = start_row, start_col
+            while end_row <= ws.max_row and any(ws.cell(row=end_row, column=c).value is not None for c in range(start_col, ws.max_column + 1)):
+                end_row += 1
+            while end_col <= ws.max_column and any(ws.cell(row=r, column=end_col).value is not None for r in range(start_row, ws.max_row + 1)):
+                end_col += 1
+            end_row -= 1  # Adjust back to last non-empty row
+            end_col -= 1  # Adjust back to last non-empty column
+
+        # Validate range bounds
+        if start_row > ws.max_row or start_col > ws.max_column:
+            raise DataError(
+                f"Start cell out of bounds. Sheet dimensions are "
+                f"A1:{get_column_letter(ws.max_column)}{ws.max_row}"
+            )
+
+        # Report the range that is actually returned. An auto-detected end is
+        # included too, unless the expansion found nothing (end before start),
+        # in which case only the start cell is reported.
+        if end_cell or (end_row >= start_row and end_col >= start_col):
+            range_label = f"{start_cell}:{get_column_letter(end_col)}{end_row}"
+        else:
+            range_label = start_cell
+
+        # Build structured cell data
+        range_data = {
+            "range": range_label,
+            "sheet_name": sheet_name,
+            "cells": []
+        }
+
+        for row in range(start_row, end_row + 1):
+            for col in range(start_col, end_col + 1):
+                cell = ws.cell(row=row, column=col)
+                cell_address = f"{get_column_letter(col)}{row}"
+
+                cell_data = {
+                    "address": cell_address,
+                    "value": cell.value,
+                    "row": row,
+                    "column": col
+                }
+
+                # Add validation metadata if requested
+                if include_validation:
+                    validation_info = get_data_validation_for_cell(ws, cell_address)
+                    if validation_info:
+                        cell_data["validation"] = validation_info
+                    else:
+                        cell_data["validation"] = {"has_validation": False}
+
+                range_data["cells"].append(cell_data)
+
+        return range_data
+
+    except DataError as e:
+        logger.error(str(e))
+        raise
+    except Exception as e:
+        logger.error(f"Failed to read Excel range with metadata: {e}")
+        raise DataError(str(e)) from e
+    finally:
+        # Runs on success and on every error path so the workbook is always closed
+        if wb is not None:
+            wb.close()
--- a/src/excel_mcp/server.py
+++ b/src/excel_mcp/server.py
@ -165,7 +165,7 @@ def format_range(
        full_path = get_excel_path(filepath)
        from excel_mcp.formatting import format_range as format_range_func
        
-        result = format_range_func(
+        format_range_func(
            filepath=full_path,
            sheet_name=sheet_name,
            start_cell=start_cell,
@ -201,20 +201,35 @@ def read_data_from_excel(
    preview_only: bool = False
 ) -> str:
    """
-    Read data from Excel worksheet.
+    Read data from Excel worksheet with cell metadata including validation rules.
+    
+    Args:
+        filepath: Path to Excel file
+        sheet_name: Name of worksheet
+        start_cell: Starting cell (default A1)
+        end_cell: Ending cell (optional, auto-expands if not provided)
+        preview_only: Whether to return preview only
    
    Returns:  
-    Data from Excel worksheet as json string. list of lists or empty list if no data found. sublists are assumed to be rows.
+    JSON string containing structured cell data with validation metadata.
+    Each cell includes: address, value, row, column, and validation info (if any).
    """
    try:
        full_path = get_excel_path(filepath)
-        from excel_mcp.data import read_excel_range
-        result = read_excel_range(full_path, sheet_name, start_cell, end_cell, preview_only)
-        if not result:
+        from excel_mcp.data import read_excel_range_with_metadata
+        result = read_excel_range_with_metadata(
+            full_path, 
+            sheet_name, 
+            start_cell, 
+            end_cell
+        )
+        if not result or not result.get("cells"):
            return "No data found in specified range"
-        # Convert the list of dicts to a formatted string
-        data_str = "\n".join([str(row) for row in result])
-        return data_str
+            
+        # Return as formatted JSON string
+        import json
+        return json.dumps(result, indent=2, default=str)
+        
    except Exception as e:
        logger.error(f"Error reading data: {e}")
        raise
@ -253,7 +268,7 @@ def create_workbook(filepath: str) -> str:
    try:
        full_path = get_excel_path(filepath)
        from excel_mcp.workbook import create_workbook as create_workbook_impl
-        result = create_workbook_impl(full_path)
+        create_workbook_impl(full_path)
        return f"Created workbook at {full_path}"
    except WorkbookError as e:
        return f"Error: {str(e)}"
@ -500,6 +515,50 @@ def validate_excel_range(
        logger.error(f"Error validating range: {e}")
        raise

+@mcp.tool()
+def get_data_validation_info(
+    filepath: str,
+    sheet_name: str
+) -> str:
+    """
+    Get all data validation rules in a worksheet.
+
+    This tool helps identify which cell ranges have validation rules
+    and what types of validation are applied.
+
+    Args:
+        filepath: Path to Excel file
+        sheet_name: Name of worksheet
+
+    Returns:
+        JSON string containing all validation rules in the worksheet, or a
+        plain message when the sheet does not exist or has no validation rules
+    """
+    try:
+        full_path = get_excel_path(filepath)
+        import json
+        from openpyxl import load_workbook
+        from excel_mcp.cell_validation import get_all_validation_ranges
+
+        wb = load_workbook(full_path, read_only=False)
+        try:
+            if sheet_name not in wb.sheetnames:
+                return f"Error: Sheet '{sheet_name}' not found"
+
+            validations = get_all_validation_ranges(wb[sheet_name])
+        finally:
+            # Close on every path, including the early "sheet not found" return
+            wb.close()
+
+        if not validations:
+            return "No data validation rules found in this worksheet"
+
+        return json.dumps({
+            "sheet_name": sheet_name,
+            "validation_rules": validations
+        }, indent=2, default=str)
+
+    except Exception as e:
+        logger.error(f"Error getting validation info: {e}")
+        raise
+
+
 async def run_sse():
    """Run Excel MCP server in SSE mode."""
    # Assign value to EXCEL_FILES_PATH in SSE mode
--- a/uv.lock
+++ b/uv.lock
@ -1,5 +1,4 @@
 version = 1
-revision = 1
 requires-python = ">=3.10"

 [[package]]
@ -67,17 +66,19 @@ wheels = [

 [[package]]
 name = "excel-mcp-server"
-version = "0.1.1"
+version = "0.1.3"
 source = { editable = "." }
 dependencies = [
    { name = "mcp", extra = ["cli"] },
    { name = "openpyxl" },
+    { name = "typer" },
 ]

 [package.metadata]
 requires-dist = [
-    { name = "mcp", extras = ["cli"], specifier = ">=1.2.0" },
+    { name = "mcp", extras = ["cli"], specifier = ">=1.6.0" },
    { name = "openpyxl", specifier = ">=3.1.2" },
+    { name = "typer", specifier = ">=0.15.1" },
 ]

 [[package]]
@ -158,7 +159,7 @@ wheels = [

 [[package]]
 name = "mcp"
-version = "1.2.1"
+version = "1.9.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "anyio" },
@ -166,13 +167,14 @@ dependencies = [
    { name = "httpx-sse" },
    { name = "pydantic" },
    { name = "pydantic-settings" },
+    { name = "python-multipart" },
    { name = "sse-starlette" },
    { name = "starlette" },
-    { name = "uvicorn" },
+    { name = "uvicorn", marker = "sys_platform != 'emscripten'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/fc/30/51e4555826126e3954fa2ab1e934bf74163c5fe05e98f38ca4d0f8abbf63/mcp-1.2.1.tar.gz", hash = "sha256:c9d43dbfe943aa1530e2be8f54b73af3ebfb071243827b4483d421684806cb45", size = 103968 }
+sdist = { url = "https://files.pythonhosted.org/packages/e7/bc/54aec2c334698cc575ca3b3481eed627125fb66544152fa1af927b1a495c/mcp-1.9.1.tar.gz", hash = "sha256:19879cd6dde3d763297617242888c2f695a95dfa854386a6a68676a646ce75e4", size = 316247 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/4c/0d/6770742a84c8aa1d36c0d628896a380584c5759612e66af7446af07d8775/mcp-1.2.1-py3-none-any.whl", hash = "sha256:579bf9c9157850ebb1344f3ca6f7a3021b0123c44c9f089ef577a7062522f0fd", size = 66453 },
+    { url = "https://files.pythonhosted.org/packages/a6/c0/4ac795585a22a0a2d09cd2b1187b0252d2afcdebd01e10a68bbac4d34890/mcp-1.9.1-py3-none-any.whl", hash = "sha256:2900ded8ffafc3c8a7bfcfe8bc5204037e988e753ec398f371663e6a06ecd9a9", size = 130261 },
 ]

 [package.optional-dependencies]
@ -322,6 +324,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863 },
 ]

+[[package]]
+name = "python-multipart"
+version = "0.0.20"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f3/87/f44d7c9f274c7ee665a29b885ec97089ec5dc034c7f3fafa03da9e39a09e/python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13", size = 37158 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/45/58/38b5afbc1a800eeea951b9285d3912613f2603bdf897a4ab0f4bd7f405fc/python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104", size = 24546 },
+]
+
 [[package]]
 name = "rich"
 version = "13.9.4"