"""
census_api.py
=============
Helpers for pulling American Community Survey (ACS) data via the Census Bureau
Data API (https://api.census.gov).

Functions
---------
fetch_acs_batch(year, variables, state_fips, geography, api_key)
    Fetch a single batch of ACS variables for a given geography and return a
    raw DataFrame with Census API column headers.

fetch_acs_tracts(year, variable_batches, state_fips, api_key, variable_labels)
    Fetch one or more batches, merge on geo columns, build a standard GEOID,
    rename variables, and mask Census sentinel values (-666666666).

build_geoid(df, state_col="state", county_col="county", tract_col="tract")
    Construct an 11-digit Census tract GEOID string from component columns.

mask_sentinel(df, columns, sentinel=-666666666)
    Replace Census sentinel values with pd.NA and coerce columns to numeric.

Dependencies
------------
    requests  (pip install requests)
    pandas    (pip install pandas)

Usage example
-------------
    from census_api import fetch_acs_tracts

    BATCHES = [
        ["B25038_001E",   # total owner-occupied units
         "B25038_002E",   # owner occ: moved in 2015 or later
         "B25038_003E",   # owner occ: moved in 2010–2014
         "B25038_006E",   # owner occ: moved in 2000–2009
         "B25038_010E"],  # owner occ: moved in 1999 or earlier
    ]
    LABELS = {
        "B25038_001E": "owner_occ_total",
        "B25038_002E": "moved_in_2015plus",
        "B25038_003E": "moved_in_2010_2014",
        "B25038_006E": "moved_in_2000_2009",
        "B25038_010E": "moved_in_pre2000",
    }

    df_2020 = fetch_acs_tracts(
        year=2020,
        variable_batches=BATCHES,
        state_fips="06",
        api_key="YOUR_KEY",          # or set CENSUS_API_KEY env var
        variable_labels=LABELS,
    )

Census API notes
----------------
- Free API key signup: https://api.census.gov/data/key_signup.html
- Without a key, unauthenticated requests are rate-limited to ~500/day.
- Set the key as an environment variable: export CENSUS_API_KEY=your_key
- The API returns sentinel value -666666666 for missing/suppressed cells.
- The API limits each request to 50 variables; use variable_batches to split
  large variable lists across multiple requests.
- ACS 5-year vintages: year refers to the end year (e.g., 2020 = 2016–2020).
"""

import os
import time
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd

try:
    import requests as _requests
    _HAS_REQUESTS = True
except ImportError:
    _HAS_REQUESTS = False

CENSUS_BASE = "https://api.census.gov/data/{year}/acs/acs5"
SENTINEL = -666666666


def _get_api_key(api_key: Optional[str]) -> str:
    """Return api_key if provided, else check CENSUS_API_KEY env var."""
    if api_key:
        return api_key
    env_key = os.environ.get("CENSUS_API_KEY", "")
    if not env_key:
        print(
            "WARNING: CENSUS_API_KEY not set. Unauthenticated requests are rate-limited.\n"
            "Get a free key at https://api.census.gov/data/key_signup.html\n"
            "Then: export CENSUS_API_KEY=your_key_here"
        )
    return env_key


def fetch_acs_batch(
    year: int,
    variables: List[str],
    state_fips: str = "06",
    geography: str = "tract",
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Fetch one batch of ACS 5-year variables for all geographies of type
    *geography* within *state_fips*.

    Parameters
    ----------
    year : int
        ACS 5-year end-year (e.g., 2020 pulls the 2016–2020 estimates).
    variables : list of str
        ACS variable codes (e.g., ["B25038_001E", "B19013_001E"]).
        Maximum 50 per request (Census API limit).
    state_fips : str
        Two-digit state FIPS code. Default "06" (California).
    geography : str
        Census geography type. Default "tract".
        Other options: "county", "block group", "zip code tabulation area".
    api_key : str, optional
        Census API key. Falls back to CENSUS_API_KEY environment variable.

    Returns
    -------
    pd.DataFrame
        Raw DataFrame with Census API column headers (variable codes + geo cols).

    Raises
    ------
    requests.HTTPError
        If the API returns a non-2xx status.
    """
    if not _HAS_REQUESTS:
        raise ImportError("requests is required: pip install requests")
    import requests

    key = _get_api_key(api_key)
    url = CENSUS_BASE.format(year=year)
    params: dict = {
        "get": "NAME," + ",".join(variables),
        "for": f"{geography}:*",
        "in": f"state:{state_fips}",
    }
    if key:
        params["key"] = key

    print(f"    Fetching {len(variables)} variables for {geography} level ({year} ACS 5-yr)...")
    resp = requests.get(url, params=params, timeout=60)
    if resp.status_code == 400 and "key" in resp.text.lower():
        print(
            "    NOTE: Census API rate limit hit without key.\n"
            "    Get a free key at https://api.census.gov/data/key_signup.html"
        )
    resp.raise_for_status()

    data = resp.json()
    return pd.DataFrame(data[1:], columns=data[0])


def build_geoid(
    df: pd.DataFrame,
    state_col: str = "state",
    county_col: str = "county",
    tract_col: str = "tract",
) -> pd.Series:
    """
    Build a standard 11-digit Census tract GEOID from component columns.

    Concatenates state (2 digits) + county (3 digits) + tract (6 digits).

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the geo component columns as strings.
    state_col, county_col, tract_col : str
        Column names for state, county, and tract FIPS codes.

    Returns
    -------
    pd.Series
        Series of 11-character GEOID strings.
    """
    return df[state_col].astype(str) + df[county_col].astype(str) + df[tract_col].astype(str)


def mask_sentinel(
    df: pd.DataFrame,
    columns: List[str],
    sentinel: int = SENTINEL,
) -> pd.DataFrame:
    """
    Replace Census sentinel values with pd.NA and coerce columns to numeric.

    The Census API uses -666666666 to indicate suppressed or unavailable cells.
    This function coerces columns to numeric and replaces the sentinel with pd.NA.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame (modified in-place).
    columns : list of str
        Column names to process. Missing columns are silently skipped.
    sentinel : int
        Sentinel value to replace. Default -666666666.

    Returns
    -------
    pd.DataFrame
        The modified DataFrame (same object, returned for chaining).
    """
    for col in columns:
        if col not in df.columns:
            continue
        df[col] = pd.to_numeric(df[col], errors="coerce")
        df[col] = df[col].where(df[col] != sentinel, other=pd.NA)
    return df


def fetch_acs_tracts(
    year: int,
    variable_batches: List[List[str]],
    state_fips: str = "06",
    api_key: Optional[str] = None,
    variable_labels: Optional[Dict[str, str]] = None,
    sleep_between_batches: float = 1.0,
) -> pd.DataFrame:
    """
    Fetch multiple batches of ACS variables for all Census tracts in a state,
    merge them into a single DataFrame, build GEOIDs, rename variables, and
    mask sentinel values.

    Parameters
    ----------
    year : int
        ACS 5-year end-year (e.g., 2020 for 2016–2020 estimates).
    variable_batches : list of list of str
        Variable codes split into batches of ≤50 (Census API limit).
        Example: [["B25038_001E", "B25038_002E"], ["B19013_001E"]]
    state_fips : str
        Two-digit state FIPS code. Default "06" (California).
    api_key : str, optional
        Census API key. Falls back to CENSUS_API_KEY env var.
    variable_labels : dict, optional
        Mapping from ACS variable code to human-readable column name.
        Codes not in the dict are left as-is.
    sleep_between_batches : float
        Seconds to sleep between API requests to avoid rate limiting.
        Default 1.0.

    Returns
    -------
    pd.DataFrame
        Columns: geoid (11-digit str), NAME, acs_year,
                 + all requested variables (renamed if variable_labels provided).
    """
    print(f"  Fetching ACS {year} 5-year estimates — {sum(len(b) for b in variable_batches)} variables...")

    geo_cols = ["state", "county", "tract"]
    all_batches = []

    for i, batch in enumerate(variable_batches):
        df_batch = fetch_acs_batch(year, batch, state_fips=state_fips, api_key=api_key)
        all_batches.append(df_batch)
        if i < len(variable_batches) - 1:
            time.sleep(sleep_between_batches)

    # Merge batches on geo columns
    df = all_batches[0]
    for extra in all_batches[1:]:
        drop_cols = [c for c in extra.columns if c in ("NAME",)]
        df = df.merge(extra.drop(columns=drop_cols, errors="ignore"), on=geo_cols)

    # Build GEOID
    df["geoid"] = build_geoid(df)
    df["acs_year"] = year

    # Rename variables
    if variable_labels:
        df = df.rename(columns=variable_labels)

    # Mask sentinel values on all variable columns
    all_var_codes = [v for batch in variable_batches for v in batch]
    renamed = [variable_labels.get(v, v) for v in all_var_codes] if variable_labels else all_var_codes
    mask_sentinel(df, renamed)

    keep = ["geoid", "NAME", "acs_year"] + [c for c in renamed if c in df.columns]
    return df[keep].reset_index(drop=True)