---
name: test-fixture-generator
description: Generate synthetic test data with edge cases for ETL pipeline testing.
---

# Test Fixture Generator

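Generate test fixtures that match a schema specification, with optional automatic edge-case injection. The snippets below assume `import pandas as pd`, plus the standard-library imports shown under Type Generators.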

## Core Generator

```python
def generate_fixtures(
    schema: dict,
    count: int = 100,
    edge_cases: bool = True
) -> pd.DataFrame:
    """Generate test data matching schema.

    Args:
        schema: Mapping of column name to a spec dict. Each spec must have
            a ``type`` key set to one of ``integer``, ``string``, ``date``,
            ``float``, ``boolean`` or ``enum``; ``enum`` specs must also
            provide ``values``.
        count: Number of random values generated for every column.
        edge_cases: When true, the generated frame is passed through
            ``add_edge_cases`` before being returned.

    Returns:
        A DataFrame with one column per schema entry.

    Raises:
        ValueError: If a column declares a type that is not supported.
            Skipping it silently would yield a fixture that does not
            match the schema.
    """
    data = {}

    for col, spec in schema.items():
        col_type = spec['type']
        if col_type == 'integer':
            data[col] = generate_integers(count, spec)
        elif col_type == 'string':
            data[col] = generate_strings(count, spec)
        elif col_type == 'date':
            data[col] = generate_dates(count, spec)
        elif col_type == 'float':
            data[col] = generate_floats(count, spec)
        elif col_type == 'boolean':
            data[col] = generate_booleans(count)
        elif col_type == 'enum':
            data[col] = generate_enums(count, spec['values'])
        else:
            raise ValueError(
                f"Unsupported type {col_type!r} for column {col!r}"
            )

    df = pd.DataFrame(data)

    if edge_cases:
        df = add_edge_cases(df, schema)

    return df
```

## Edge Case Injection

```python
def add_edge_cases(df: pd.DataFrame, schema: dict) -> pd.DataFrame:
    """Add rows with boundary and edge case values.

    A "null row" is appended first: every column whose spec has a truthy
    ``nullable`` (the default when the key is absent) is set to ``None``,
    and every other column reuses the value from the first row of ``df``.
    Then, for each integer, string, float and date column, rows are
    appended that copy the first row of ``df`` and override only that
    column with a boundary value. Boolean and enum columns get no
    boundary rows.

    Args:
        df: Generated fixture rows; the first row is used as the template
            for every boundary row.
        schema: Mapping of column name to spec dict (``type`` is required;
            ``min``, ``max``, ``max_length`` and ``nullable`` are optional).

    Returns:
        A new DataFrame containing the rows of ``df`` followed by the edge
        rows, with a fresh integer index. If ``df`` has no rows there is
        no template row to copy, so an unchanged copy is returned.
    """
    if df.empty:
        # No first row to use as a template; avoid IndexError on iloc[0].
        return df.copy()

    # The template row is the same for every column, so build it once.
    template = df.iloc[0].to_dict()
    edge_rows = []

    # Null row (where nullable)
    edge_rows.append({
        col: None if spec.get('nullable', True) else df[col].iloc[0]
        for col, spec in schema.items()
    })

    # Boundary values per column
    for col, spec in schema.items():
        col_type = spec['type']

        if col_type == 'integer':
            boundaries = [spec.get('min', 0), spec.get('max', 2147483647)]
        elif col_type == 'string':
            # Empty string, then a string of exactly the maximum length.
            boundaries = ['', 'a' * spec.get('max_length', 255)]
        elif col_type == 'float':
            boundaries = [0.0, spec.get('min', -1e9), spec.get('max', 1e9)]
        elif col_type == 'date':
            boundaries = [datetime(1970, 1, 1), datetime.now()]
        else:
            boundaries = []

        edge_rows.extend({**template, col: value} for value in boundaries)

    return pd.concat([df, pd.DataFrame(edge_rows)], ignore_index=True)
```

## Type Generators

```python
import random
import string
from datetime import datetime, timedelta

def generate_integers(count: int, spec: dict) -> list:
    """Return ``count`` random integers within the spec's inclusive bounds.

    The bounds come from ``spec['min']`` (default 0) and ``spec['max']``
    (default 1000000).
    """
    lower = spec.get('min', 0)
    upper = spec.get('max', 1000000)
    values = []
    for _ in range(count):
        values.append(random.randint(lower, upper))
    return values

def generate_floats(count: int, spec: dict) -> list:
    """Return ``count`` random floats rounded to the spec's precision.

    Values are drawn uniformly between ``spec['min']`` (default 0.0) and
    ``spec['max']`` (default 1000000.0), then rounded to
    ``spec['precision']`` decimal places (default 2).
    """
    lower = spec.get('min', 0.0)
    upper = spec.get('max', 1000000.0)
    digits = spec.get('precision', 2)
    values = []
    for _ in range(count):
        raw = random.uniform(lower, upper)
        values.append(round(raw, digits))
    return values

def generate_strings(count: int, spec: dict) -> list:
    """Return ``count`` strings shaped by ``spec['pattern']``.

    * ``'email'`` yields sequential ``user<i>@example.com`` addresses.
    * ``'phone'`` yields ``+1`` followed by a random ten-digit number.
    * Anything else yields random ASCII letters whose length is drawn
      between ``spec['min_length']`` (default 1) and
      ``spec['max_length']`` (default 50), inclusive.
    """
    kind = spec.get('pattern', None)

    if kind == 'email':
        addresses = []
        for index in range(count):
            addresses.append(f"user{index}@example.com")
        return addresses

    if kind == 'phone':
        numbers = []
        for _ in range(count):
            numbers.append(f"+1{random.randint(1000000000, 9999999999)}")
        return numbers

    shortest = spec.get('min_length', 1)
    longest = spec.get('max_length', 50)
    words = []
    for _ in range(count):
        # Draw the length first, then the letters, one word at a time.
        length = random.randint(shortest, longest)
        words.append(''.join(random.choices(string.ascii_letters, k=length)))
    return words

def _to_datetime(value):
    """Coerce a ``datetime``, ``date`` or ISO-8601 string to ``datetime``."""
    if isinstance(value, datetime):
        return value
    if isinstance(value, str):
        return datetime.fromisoformat(value)
    # Plain dates (what YAML yields for ``2023-01-01``) become midnight.
    return datetime.combine(value, datetime.min.time())

def generate_dates(count: int, spec: dict) -> list:
    """Return ``count`` random datetimes within the spec's bounds.

    Args:
        count: Number of values to generate.
        spec: Column spec. ``min`` defaults to 2020-01-01 and ``max`` to
            the current time. Each bound may be a ``datetime``, a
            ``date`` or an ISO-8601 string; dates are treated as midnight.

    Returns:
        A list of ``datetime`` objects, each equal to the lower bound plus
        a whole number of days that does not exceed the span between the
        bounds.
    """
    # Normalise first: subtracting a date from a datetime raises TypeError.
    start = _to_datetime(spec.get('min', datetime(2020, 1, 1)))
    end = _to_datetime(spec.get('max', datetime.now()))
    span_days = (end - start).days
    return [
        start + timedelta(days=random.randint(0, span_days))
        for _ in range(count)
    ]

def generate_booleans(count: int) -> list:
    """Return ``count`` values, each picked at random from True and False."""
    flags = []
    for _ in range(count):
        flags.append(random.choice([True, False]))
    return flags

def generate_enums(count: int, values: list) -> list:
    """Return ``count`` items, each picked at random from ``values``."""
    picks = []
    for _ in range(count):
        picks.append(random.choice(values))
    return picks
```

## Schema Definition Format

```yaml
# fixtures/orders_schema.yml
columns:
  order_id:
    type: integer
    min: 1
    nullable: false

  customer_email:
    type: string
    pattern: email
    nullable: false

  total_amount:
    type: float
    min: 0.01
    max: 100000.00
    precision: 2

  status:
    type: enum
    values: [pending, confirmed, shipped, delivered, cancelled]

  created_at:
    type: date
    min: 2023-01-01
    nullable: false
```

## Usage

```python
import yaml

# Read the column specifications from the schema file
with open('fixtures/orders_schema.yml') as schema_file:
    schema_doc = yaml.safe_load(schema_file)
schema = schema_doc['columns']

# Build 100 random rows, then append the edge-case rows
df = generate_fixtures(schema, count=100, edge_cases=True)

# Write the fixture to disk so tests can load it
df.to_csv('tests/fixtures/orders_fixture.csv', index=False)
```