APM

>Agent Skill

@datadrivenconstruction/schema-validation

skilldevelopment

JSON/data schema validation for construction data exchange: API payloads, file imports, BIM exports. Ensure data structure compliance before processing.

apm::install
$apm install @datadrivenconstruction/schema-validation
apm::skill.md
---
name: "schema-validation"
description: "JSON/data schema validation for construction data exchange: API payloads, file imports, BIM exports. Ensure data structure compliance before processing."
homepage: "https://datadrivenconstruction.io"
metadata: {"openclaw": {"emoji": "✔️", "os": ["darwin", "linux", "win32"], "homepage": "https://datadrivenconstruction.io", "requires": {"bins": ["python3"]}}}
---
# Schema Validation for Construction Data

## Overview

Validate data structures against defined schemas for construction data exchange. Ensure API payloads, file imports, and BIM exports conform to expected formats before processing.

## Schema Validation Framework

### Core Schema Validator

```python
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from enum import Enum
import json
import re
from datetime import datetime

class SchemaType(Enum):
    """Types a SchemaField can declare; each member's value is its lowercase label."""

    # Structural types: SchemaValidator checks these with isinstance only
    STRING = "string"
    NUMBER = "number"
    INTEGER = "integer"
    BOOLEAN = "boolean"
    ARRAY = "array"
    OBJECT = "object"
    # Formatted types: additionally matched against SchemaValidator.PATTERNS
    DATE = "date"
    DATETIME = "datetime"
    CSI_CODE = "csi_code"
    CURRENCY = "currency"
    GUID = "guid"

@dataclass
class SchemaField:
    """Declaration of one field: its type plus optional constraints.

    Constraints that do not apply to the declared type are ignored by
    SchemaValidator (for example min_length on a NUMBER field).
    """

    # Field label; SchemaValidator looks fields up by their dict key, not by this attribute
    name: str
    type: SchemaType
    # When True, a missing key is reported as an error
    required: bool = True
    # When True, an explicit None is accepted and all further checks are skipped
    nullable: bool = False
    # Inclusive bounds, applied to NUMBER and INTEGER fields
    min_value: Optional[float] = None
    max_value: Optional[float] = None
    # Length bounds and regex, applied to STRING fields (regex is matched from the start of the value)
    min_length: Optional[int] = None
    max_length: Optional[int] = None
    pattern: Optional[str] = None
    # Allowed values; checked for every type once the type check has passed
    enum_values: Optional[List[Any]] = None
    items_schema: Optional['Schema'] = None  # For arrays
    properties: Optional[Dict[str, 'SchemaField']] = None  # For objects
    description: str = ""

@dataclass
class Schema:
    """A named, versioned set of field declarations describing one object shape."""

    name: str
    version: str
    # Maps each expected key of the data object to its declaration
    fields: Dict[str, SchemaField]
    description: str = ""

@dataclass
class SchemaValidationError:
    """One validation problem found in the data."""

    # Dotted/indexed location of the offending value, e.g. "line_items[0].unit"
    path: str
    # Short human-readable description of the problem
    message: str
    # What the schema required (a type label, bound, pattern, ...)
    expected: str
    # What was found instead (a type name, the value itself, a length, ...)
    actual: Any

@dataclass
class SchemaValidationResult:
    """Outcome of validating one data object against one schema.

    Starts in whatever validity state the caller passes and becomes invalid
    as soon as the first error is recorded.
    """

    is_valid: bool
    errors: List[SchemaValidationError] = field(default_factory=list)
    schema_name: str = ""
    schema_version: str = ""

    def add_error(self, path: str, message: str, expected: str, actual: Any):
        """Record one problem and mark the whole result as invalid."""
        self.is_valid = False
        problem = SchemaValidationError(path, message, expected, actual)
        self.errors.append(problem)

    def to_report(self) -> str:
        """Render a plain-text report: a header followed by one stanza per error."""
        status = '✓ VALID' if self.is_valid else '✗ INVALID'
        report = [
            f"Schema Validation: {self.schema_name} v{self.schema_version}",
            "=" * 50,
            f"Status: {status}",
            f"Errors: {len(self.errors)}",
            "",
        ]

        # Each error contributes four detail lines plus a blank separator
        for entry in self.errors:
            report.extend((
                f"❌ {entry.path}",
                f"   {entry.message}",
                f"   Expected: {entry.expected}",
                f"   Actual: {entry.actual}",
                "",
            ))

        return "\n".join(report)


class SchemaValidator:
    """Validate parsed data (dicts, lists, scalars) against a Schema.

    Bad data never raises: every problem is recorded on the returned
    SchemaValidationResult and checking continues with the next field, so a
    single run reports as many problems as possible.
    """

    # Custom type patterns
    PATTERNS = {
        SchemaType.CSI_CODE: r'^\d{2}\s?\d{2}\s?\d{2}$',
        SchemaType.GUID: r'^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$',
        SchemaType.CURRENCY: r'^-?\d+(\.\d{2})?$',
        SchemaType.DATE: r'^\d{4}-\d{2}-\d{2}$',
        # Deliberately not anchored at the end: fractions/offsets may follow the seconds
        SchemaType.DATETIME: r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}',
    }

    def validate(self, data: Any, schema: Schema) -> SchemaValidationResult:
        """Validate ``data`` against ``schema``.

        Args:
            data: The value to check; anything other than a dict is reported
                as an error at path "root".
            schema: The schema whose fields describe the expected object.

        Returns:
            A result carrying the schema name/version and all errors found.
        """
        result = SchemaValidationResult(
            is_valid=True,
            schema_name=schema.name,
            schema_version=schema.version
        )

        self._validate_object(data, schema.fields, "", result)
        return result

    def _validate_object(self, data: Any, fields: Dict[str, SchemaField], path: str, result: SchemaValidationResult):
        """Check one object against its field declarations, recording errors on ``result``.

        Keys present in ``data`` but absent from ``fields`` are ignored.
        """
        if not isinstance(data, dict):
            result.add_error(path or "root", "Expected object", "object", type(data).__name__)
            return

        for field_name, field_schema in fields.items():
            field_path = f"{path}.{field_name}" if path else field_name

            # Missing key: only an error when the field is required
            if field_name not in data:
                if field_schema.required:
                    result.add_error(field_path, "Required field missing", "present", "missing")
                continue

            value = data[field_name]

            # Explicit null: allowed only for nullable fields, and never checked further
            if value is None:
                if not field_schema.nullable:
                    result.add_error(field_path, "Field cannot be null", "non-null", "null")
                continue

            self._validate_field(value, field_schema, field_path, result)

    def _validate_field(self, value: Any, schema: SchemaField, path: str, result: SchemaValidationResult):
        """Check one non-null value: type first, then the constraints for that type."""
        # A wrong type makes every later check meaningless, so stop here
        if not self._check_type(value, schema.type):
            result.add_error(path, "Invalid type", schema.type.value, type(value).__name__)
            return

        # String validations. Bounds are compared with "is not None" so that
        # an explicit bound of 0 (e.g. max_length=0) is honoured.
        if schema.type == SchemaType.STRING:
            if schema.min_length is not None and len(value) < schema.min_length:
                result.add_error(path, "String too short", f"min {schema.min_length}", len(value))
            if schema.max_length is not None and len(value) > schema.max_length:
                result.add_error(path, "String too long", f"max {schema.max_length}", len(value))
            if schema.pattern and not re.match(schema.pattern, value):
                result.add_error(path, "Pattern mismatch", schema.pattern, value)

        # Numeric validations (bounds are inclusive)
        if schema.type in (SchemaType.NUMBER, SchemaType.INTEGER):
            if schema.min_value is not None and value < schema.min_value:
                result.add_error(path, "Value below minimum", f">= {schema.min_value}", value)
            if schema.max_value is not None and value > schema.max_value:
                result.add_error(path, "Value above maximum", f"<= {schema.max_value}", value)

        # Enum validation
        if schema.enum_values and value not in schema.enum_values:
            result.add_error(path, "Invalid enum value", str(schema.enum_values), value)

        # Array validation: every item is checked as an object against items_schema
        if schema.type == SchemaType.ARRAY and schema.items_schema and schema.items_schema.fields:
            item_fields = schema.items_schema.fields
            for i, item in enumerate(value):
                self._validate_object(item, item_fields, f"{path}[{i}]", result)

        # Nested object validation
        if schema.type == SchemaType.OBJECT and schema.properties:
            self._validate_object(value, schema.properties, path, result)

        # Custom type validation
        if schema.type in self.PATTERNS and not self._matches_format(value, schema.type):
            result.add_error(path, f"Invalid {schema.type.value} format", self.PATTERNS[schema.type], value)

    def _matches_format(self, value: Any, schema_type: SchemaType) -> bool:
        """Return True when ``value`` satisfies the custom format of ``schema_type``."""
        pattern = self.PATTERNS[schema_type]

        if schema_type == SchemaType.CURRENCY and not isinstance(value, str):
            # Integers are always whole amounts.
            if isinstance(value, int):
                return True
            # str(150.0) is "150.0", which the two-decimal pattern rejects, so
            # floats are rendered with two decimals instead and must survive
            # the round trip (150.5 passes, 150.123 does not; nan/inf render
            # as non-numeric text and fail the pattern).
            rendered = f"{value:.2f}"
            return re.match(pattern, rendered) is not None and float(rendered) == value

        if not re.match(pattern, str(value)):
            return False

        # The regexes only check the digit layout; strptime rejects
        # impossible values such as month 13 or 30 February.
        if schema_type == SchemaType.DATE:
            return self._is_real_datetime(value, "%Y-%m-%d")
        if schema_type == SchemaType.DATETIME:
            # Only the first 19 characters are the timestamp proper
            return self._is_real_datetime(value[:19], "%Y-%m-%dT%H:%M:%S")
        return True

    @staticmethod
    def _is_real_datetime(text: str, layout: str) -> bool:
        """Return True when ``text`` parses with ``layout`` as an actual calendar value."""
        try:
            datetime.strptime(text, layout)
        except ValueError:
            return False
        return True

    def _check_type(self, value: Any, expected: SchemaType) -> bool:
        """Return True when ``value`` has a Python type acceptable for ``expected``.

        bool is a subclass of int, so it is excluded explicitly wherever a
        number is expected; otherwise True/False would pass as 1/0.
        Unknown schema types are accepted.
        """
        type_checks = {
            SchemaType.STRING: lambda v: isinstance(v, str),
            SchemaType.NUMBER: lambda v: isinstance(v, (int, float)) and not isinstance(v, bool),
            SchemaType.INTEGER: lambda v: isinstance(v, int) and not isinstance(v, bool),
            SchemaType.BOOLEAN: lambda v: isinstance(v, bool),
            SchemaType.ARRAY: lambda v: isinstance(v, list),
            SchemaType.OBJECT: lambda v: isinstance(v, dict),
            SchemaType.DATE: lambda v: isinstance(v, str),
            SchemaType.DATETIME: lambda v: isinstance(v, str),
            SchemaType.CSI_CODE: lambda v: isinstance(v, str),
            SchemaType.CURRENCY: lambda v: isinstance(v, (int, float, str)) and not isinstance(v, bool),
            SchemaType.GUID: lambda v: isinstance(v, str),
        }
        return type_checks.get(expected, lambda v: True)(value)
```

## Construction Data Schemas

### Cost Estimate Schema

```python
# Define schema for cost estimate data.
# Describes the estimate header, its line items and the totals; keys not
# declared here are ignored by SchemaValidator.
COST_ESTIMATE_SCHEMA = Schema(
    name="CostEstimate",
    version="1.0",
    description="Schema for construction cost estimates",
    fields={
        "project_id": SchemaField(
            name="project_id",
            type=SchemaType.STRING,
            required=True,
            description="Unique project identifier"
        ),
        "project_name": SchemaField(
            name="project_name",
            type=SchemaType.STRING,
            required=True,
            max_length=200
        ),
        "estimate_type": SchemaField(
            name="estimate_type",
            type=SchemaType.STRING,
            required=True,
            enum_values=["conceptual", "schematic", "design_development", "construction_documents", "bid"]
        ),
        "estimate_date": SchemaField(
            name="estimate_date",
            type=SchemaType.DATE,
            required=True
        ),
        # Optional, and may also be sent as an explicit null
        "currency": SchemaField(
            name="currency",
            type=SchemaType.STRING,
            required=False,
            enum_values=["USD", "EUR", "GBP", "CAD"],
            nullable=True
        ),
        "gross_area": SchemaField(
            name="gross_area",
            type=SchemaType.NUMBER,
            required=True,
            min_value=0,
            description="Gross floor area in SF or SM"
        ),
        # Each array element is validated as an object against the LineItem fields
        "line_items": SchemaField(
            name="line_items",
            type=SchemaType.ARRAY,
            required=True,
            items_schema=Schema(
                name="LineItem",
                version="1.0",
                fields={
                    "id": SchemaField(name="id", type=SchemaType.STRING, required=True),
                    "csi_code": SchemaField(name="csi_code", type=SchemaType.CSI_CODE, required=False, nullable=True),
                    "description": SchemaField(name="description", type=SchemaType.STRING, required=True, max_length=500),
                    "quantity": SchemaField(name="quantity", type=SchemaType.NUMBER, required=True, min_value=0),
                    "unit": SchemaField(name="unit", type=SchemaType.STRING, required=True),
                    "unit_cost": SchemaField(name="unit_cost", type=SchemaType.NUMBER, required=True, min_value=0),
                    "amount": SchemaField(name="amount", type=SchemaType.NUMBER, required=True, min_value=0),
                }
            )
        ),
        "subtotal": SchemaField(
            name="subtotal",
            type=SchemaType.NUMBER,
            required=True,
            min_value=0
        ),
        # Optional but not nullable: the key may be omitted, yet an explicit null is rejected
        "contingency_percent": SchemaField(
            name="contingency_percent",
            type=SchemaType.NUMBER,
            required=False,
            min_value=0,
            max_value=50
        ),
        "total": SchemaField(
            name="total",
            type=SchemaType.NUMBER,
            required=True,
            min_value=0
        )
    }
)
```

### Schedule Data Schema

```python
# Schema for a project schedule: header dates plus a list of tasks.
SCHEDULE_SCHEMA = Schema(
    name="ProjectSchedule",
    version="1.0",
    description="Schema for project schedule data",
    fields={
        "project_id": SchemaField(name="project_id", type=SchemaType.STRING, required=True),
        "schedule_name": SchemaField(name="schedule_name", type=SchemaType.STRING, required=True),
        # All dates use the DATE type, i.e. YYYY-MM-DD strings
        "data_date": SchemaField(name="data_date", type=SchemaType.DATE, required=True),
        "start_date": SchemaField(name="start_date", type=SchemaType.DATE, required=True),
        "finish_date": SchemaField(name="finish_date", type=SchemaType.DATE, required=True),
        "calendar": SchemaField(
            name="calendar",
            type=SchemaType.STRING,
            required=False,
            enum_values=["5-day", "6-day", "7-day"],
            nullable=True
        ),
        # Each array element is validated as an object against the Task fields
        "tasks": SchemaField(
            name="tasks",
            type=SchemaType.ARRAY,
            required=True,
            items_schema=Schema(
                name="Task",
                version="1.0",
                fields={
                    "id": SchemaField(name="id", type=SchemaType.STRING, required=True),
                    "wbs": SchemaField(name="wbs", type=SchemaType.STRING, required=False, nullable=True),
                    "name": SchemaField(name="name", type=SchemaType.STRING, required=True, max_length=300),
                    "start_date": SchemaField(name="start_date", type=SchemaType.DATE, required=True),
                    "finish_date": SchemaField(name="finish_date", type=SchemaType.DATE, required=True),
                    "duration": SchemaField(name="duration", type=SchemaType.INTEGER, required=True, min_value=0),
                    "percent_complete": SchemaField(name="percent_complete", type=SchemaType.NUMBER, required=False, min_value=0, max_value=100),
                    # No items_schema: only checked to be lists, their elements are not validated
                    "predecessors": SchemaField(name="predecessors", type=SchemaType.ARRAY, required=False, nullable=True),
                    "resources": SchemaField(name="resources", type=SchemaType.ARRAY, required=False, nullable=True),
                }
            )
        )
    }
)
```

### BIM Element Schema

```python
# Schema for a single BIM element record.
BIM_ELEMENT_SCHEMA = Schema(
    name="BIMElement",
    version="1.0",
    description="Schema for BIM element data",
    fields={
        # GUID type = hyphenated 8-4-4-4-12 hexadecimal form (see SchemaValidator.PATTERNS).
        # NOTE(review): IFC files often carry 22-character compressed GlobalIds, which
        # would not match - confirm that exporters emit the expanded form.
        "guid": SchemaField(name="guid", type=SchemaType.GUID, required=True),
        # Must start with "Ifc" followed by an uppercase letter, then letters only
        "ifc_class": SchemaField(
            name="ifc_class",
            type=SchemaType.STRING,
            required=True,
            pattern=r'^Ifc[A-Z][a-zA-Z]+$'
        ),
        "name": SchemaField(name="name", type=SchemaType.STRING, required=False, nullable=True),
        "description": SchemaField(name="description", type=SchemaType.STRING, required=False, nullable=True),
        "level": SchemaField(name="level", type=SchemaType.STRING, required=False, nullable=True),
        # Optional nested object; when present, "system" and "code" are mandatory
        "classification": SchemaField(
            name="classification",
            type=SchemaType.OBJECT,
            required=False,
            nullable=True,
            properties={
                "system": SchemaField(name="system", type=SchemaType.STRING, required=True),
                "code": SchemaField(name="code", type=SchemaType.STRING, required=True),
                "name": SchemaField(name="name", type=SchemaType.STRING, required=False, nullable=True),
            }
        ),
        # Optional nested object; every quantity is optional, nullable and non-negative
        "quantities": SchemaField(
            name="quantities",
            type=SchemaType.OBJECT,
            required=False,
            nullable=True,
            properties={
                "area": SchemaField(name="area", type=SchemaType.NUMBER, required=False, min_value=0, nullable=True),
                "volume": SchemaField(name="volume", type=SchemaType.NUMBER, required=False, min_value=0, nullable=True),
                "length": SchemaField(name="length", type=SchemaType.NUMBER, required=False, min_value=0, nullable=True),
                "count": SchemaField(name="count", type=SchemaType.INTEGER, required=False, min_value=0, nullable=True),
            }
        ),
        # Free-form object: no nested declarations, so only checked to be a dict
        "properties": SchemaField(name="properties", type=SchemaType.OBJECT, required=False, nullable=True)
    }
)
```

### RFI Schema

```python
# Schema for a Request for Information (RFI) record.
RFI_SCHEMA = Schema(
    name="RFI",
    version="1.0",
    description="Schema for Request for Information",
    fields={
        # "RFI-" followed by one or more digits, e.g. "RFI-042"
        "rfi_number": SchemaField(name="rfi_number", type=SchemaType.STRING, required=True, pattern=r'^RFI-\d+$'),
        "project_id": SchemaField(name="project_id", type=SchemaType.STRING, required=True),
        "subject": SchemaField(name="subject", type=SchemaType.STRING, required=True, max_length=500),
        "status": SchemaField(
            name="status",
            type=SchemaType.STRING,
            required=True,
            enum_values=["draft", "submitted", "in_review", "answered", "closed"]
        ),
        "priority": SchemaField(
            name="priority",
            type=SchemaType.STRING,
            required=False,
            enum_values=["low", "medium", "high", "critical"],
            nullable=True
        ),
        "date_submitted": SchemaField(name="date_submitted", type=SchemaType.DATE, required=True),
        "date_required": SchemaField(name="date_required", type=SchemaType.DATE, required=True),
        "from_company": SchemaField(name="from_company", type=SchemaType.STRING, required=True),
        "to_company": SchemaField(name="to_company", type=SchemaType.STRING, required=True),
        "spec_section": SchemaField(name="spec_section", type=SchemaType.CSI_CODE, required=False, nullable=True),
        "drawing_reference": SchemaField(name="drawing_reference", type=SchemaType.STRING, required=False, nullable=True),
        "question": SchemaField(name="question", type=SchemaType.STRING, required=True),
        # Response fields are optional and nullable; the schema does not tie them to "status"
        "response": SchemaField(name="response", type=SchemaType.STRING, required=False, nullable=True),
        "date_responded": SchemaField(name="date_responded", type=SchemaType.DATE, required=False, nullable=True),
        # No items_schema: only checked to be a list, its elements are not validated
        "attachments": SchemaField(name="attachments", type=SchemaType.ARRAY, required=False, nullable=True)
    }
)
```

## Schema Registry

```python
class ConstructionSchemaRegistry:
    """Registry of construction data schemas.

    Every schema is stored under two keys: "<name>:<version>" and the bare
    "<name>". The bare key always points at the schema registered most
    recently under that name, which is not necessarily the highest version.
    """

    def __init__(self):
        """Create a registry preloaded with the four built-in schemas."""
        self.schemas: Dict[str, Schema] = {}
        self._register_defaults()

    def _register_defaults(self):
        """Register the cost estimate, schedule, BIM element and RFI schemas."""
        for schema in (COST_ESTIMATE_SCHEMA, SCHEDULE_SCHEMA, BIM_ELEMENT_SCHEMA, RFI_SCHEMA):
            self.register(schema)

    def register(self, schema: Schema):
        """Add ``schema``, replacing any entry with the same name and version."""
        self.schemas[f"{schema.name}:{schema.version}"] = schema
        # Also register without version for latest
        self.schemas[schema.name] = schema

    def get(self, name: str, version: Optional[str] = None) -> Optional[Schema]:
        """Return the schema for ``name`` (and ``version`` if given), or None if unknown."""
        key = f"{name}:{version}" if version else name
        return self.schemas.get(key)

    def validate(self, data: Any, schema_name: str, version: Optional[str] = None) -> SchemaValidationResult:
        """Validate ``data`` against a registered schema.

        Args:
            data: The value to validate.
            schema_name: Name the schema was registered under.
            version: Specific version to use; the most recently registered
                schema of that name is used when omitted.

        Returns:
            The validation result. An unknown schema yields an invalid result
            with a single error at path "schema" instead of raising.
        """
        schema = self.get(schema_name, version)
        if schema is None:
            # Carry the requested name/version so the report header is not
            # blank, and name the exact key that was looked up so a missing
            # version is not mistaken for a missing schema.
            requested = f"{schema_name}:{version}" if version else schema_name
            result = SchemaValidationResult(
                is_valid=False,
                schema_name=schema_name,
                schema_version=version or "",
            )
            result.add_error("schema", f"Schema '{requested}' not found", "valid schema", "not found")
            return result

        return SchemaValidator().validate(data, schema)

    def list_schemas(self) -> List[str]:
        """Return the versioned keys ("<name>:<version>"), skipping the bare-name aliases.

        Relies on schema names not containing ':' themselves.
        """
        return [key for key in self.schemas if ':' in key]
```

## Usage Examples

```python
# Build a registry; the built-in schemas are registered on construction
registry = ConstructionSchemaRegistry()

# --- Cost estimate: a single concrete line item plus the totals ---
estimate_data = {
    "project_id": "PROJ-001",
    "project_name": "Downtown Office Tower",
    "estimate_type": "schematic",
    "estimate_date": "2026-01-15",
    "gross_area": 50000,
    "line_items": [{
        "id": "1",
        "csi_code": "03 30 00",
        "description": "Cast-in-place concrete",
        "quantity": 5000,
        "unit": "CY",
        "unit_cost": 150.00,
        "amount": 750000.00,
    }],
    "subtotal": 750000.00,
    "contingency_percent": 10,
    "total": 825000.00,
}

# Always print the full report for the estimate
result = registry.validate(estimate_data, "CostEstimate")
print(result.to_report())

# --- RFI: optional response fields are simply left out ---
rfi_data = {
    "rfi_number": "RFI-042",
    "project_id": "PROJ-001",
    "subject": "Concrete mix design clarification",
    "status": "submitted",
    "priority": "high",
    "date_submitted": "2026-01-20",
    "date_required": "2026-01-27",
    "from_company": "ABC Concrete",
    "to_company": "XYZ Architects",
    "spec_section": "03 30 00",
    "question": "Please clarify the required PSI for the foundation pour.",
}

# Print a one-line confirmation on success, the detailed report otherwise
result = registry.validate(rfi_data, "RFI")
print("RFI data is valid" if result.is_valid else result.to_report())
```

## JSON Schema Export

```python
def export_to_json_schema(schema: Schema) -> dict:
    """Export DDC schema to JSON Schema format.

    Args:
        schema: The schema to convert.

    Returns:
        A draft-07 JSON Schema document (as a dict) describing the same
        object, including nested array item schemas and nested object
        properties. Nullable fields are exported with "null" added to their
        type (and to their enum, if any) so the exported schema accepts the
        same nulls the validator does.
    """
    type_map = {
        SchemaType.STRING: "string",
        SchemaType.NUMBER: "number",
        SchemaType.INTEGER: "integer",
        SchemaType.BOOLEAN: "boolean",
        SchemaType.ARRAY: "array",
        SchemaType.OBJECT: "object",
        SchemaType.DATE: "string",
        SchemaType.DATETIME: "string",
        SchemaType.CSI_CODE: "string",
        SchemaType.CURRENCY: "number",
        SchemaType.GUID: "string",
    }
    format_map = {
        SchemaType.DATE: "date",
        SchemaType.DATETIME: "date-time",
        SchemaType.GUID: "uuid",
    }

    def fields_to_json_schema(fields) -> dict:
        # Shared by the top level, array items and nested objects
        return {
            "properties": {name: field_to_json_schema(spec) for name, spec in fields.items()},
            "required": [name for name, spec in fields.items() if spec.required],
        }

    def field_to_json_schema(spec: SchemaField) -> dict:
        json_type = type_map.get(spec.type, "string")
        js = {"type": [json_type, "null"] if spec.nullable else json_type}

        if spec.description:
            js["description"] = spec.description
        if spec.min_value is not None:
            js["minimum"] = spec.min_value
        if spec.max_value is not None:
            js["maximum"] = spec.max_value
        if spec.min_length is not None:
            js["minLength"] = spec.min_length
        if spec.max_length is not None:
            js["maxLength"] = spec.max_length
        if spec.pattern:
            js["pattern"] = spec.pattern
        if spec.enum_values:
            allowed = list(spec.enum_values)
            # "enum" applies regardless of type, so null must be listed too
            if spec.nullable and None not in allowed:
                allowed.append(None)
            js["enum"] = allowed

        if spec.type in format_map:
            js["format"] = format_map[spec.type]

        # Array items are objects described by the nested schema's fields
        if spec.type == SchemaType.ARRAY and spec.items_schema and spec.items_schema.fields:
            js["items"] = {"type": "object", **fields_to_json_schema(spec.items_schema.fields)}

        # Nested object: describe its own properties and required keys
        if spec.type == SchemaType.OBJECT and spec.properties:
            js.update(fields_to_json_schema(spec.properties))

        return js

    top_level = fields_to_json_schema(schema.fields)

    return {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "title": schema.name,
        "description": schema.description,
        "type": "object",
        "properties": top_level["properties"],
        "required": top_level["required"]
    }

# Export to JSON Schema: convert the cost estimate schema and pretty-print the result
json_schema = export_to_json_schema(COST_ESTIMATE_SCHEMA)
print(json.dumps(json_schema, indent=2))
```

## Integration with DDC Pipeline

```python
# Validate API request before processing
def validate_api_request(endpoint: str, payload: dict) -> SchemaValidationResult:
    """Validate ``payload`` against the schema associated with ``endpoint``.

    Endpoints without an associated schema are not checked at all: they get
    an empty, valid result. A fresh registry is built on every call.
    """
    endpoint_schemas = {
        '/api/estimates': 'CostEstimate',
        '/api/schedules': 'ProjectSchedule',
        '/api/rfis': 'RFI',
        '/api/bim/elements': 'BIMElement'
    }

    # Unknown endpoint: nothing to validate against, so let it through
    if endpoint not in endpoint_schemas:
        return SchemaValidationResult(is_valid=True)

    return ConstructionSchemaRegistry().validate(payload, endpoint_schemas[endpoint])

# Use in API handler
# NOTE(review): `app` and `process_estimate` are not defined in this snippet -
# presumably supplied by the host web application; confirm before reuse.
@app.post('/api/estimates')
def create_estimate(payload: dict):
    """Validate an incoming estimate, then hand it on for processing.

    Returns whatever process_estimate returns for valid payloads; otherwise
    a (body, 400) pair whose body lists every validation error as a dict.
    """
    outcome = validate_api_request('/api/estimates', payload)
    if outcome.is_valid:
        # Process valid data
        return process_estimate(payload)

    failure = {'error': 'Validation failed', 'details': [vars(err) for err in outcome.errors]}
    return failure, 400
```

## Resources

- **JSON Schema**: https://json-schema.org/
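- **CSI MasterFormat**: Standard six-digit classification codes for construction specification sections (e.g. `03 30 00`), validated here as the `csi_code` type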
- **IFC Schema**: https://standards.buildingsmart.org/IFC/