""" geo_crosswalk.py ================ Build area-proportional geographic crosswalk tables for California research projects that work at Census tract level. All crosswalks follow the same convention: - Source unit column (e.g., zcta5, county_fips, pctkey) - Target unit column (tract_geoid_20 — 2020 Census tract GEOID) - weight column (fraction of source unit area that falls in each target tract) - Weights per source unit sum to 1.0 (re-normalised after area computation) Functions --------- check_weights(df, unit_col, weight_col, label) Print weight-sum diagnostics. Useful to verify crosswalk integrity. download_tiger_tracts(dest_dir, state_fips="06") Download 2020 TIGER/Line Census tract shapefile for a state. Returns path to the extracted directory. build_zip_tract(output_path, shapes_dir) ZCTA → 2020 Census tract crosswalk using Census relationship file (area-proportional weights, re-normalised per ZCTA). build_county_tract(acs_tracts_csv, output_path) County FIPS → 2020 Census tract lookup derived from an ACS tracts CSV. (No spatial join needed — county FIPS is the first 5 digits of the tract GEOID.) build_prec_tract(vintage, prec_url, tiger_dir, output_path) Precinct → 2020 Census tract crosswalk via GeoPandas spatial overlay. Both layers projected to EPSG:3310 (CA Albers) before overlay. Dependencies ------------ geopandas (pip install geopandas) pandas (pip install pandas) requests (pip install requests) download_utils (from this utilities package) Census data sources ------------------- - ZCTA-to-tract relationship file (2020): https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/tab20_zcta520_tract20_natl.txt - TIGER 2020 tract shapefiles: https://www2.census.gov/geo/tiger/TIGER2020/TRACT/tl_2020_{state_fips}_tract.zip Usage example ------------- from pathlib import Path from geo_crosswalk import build_zip_tract, build_county_tract SHAPES = Path("data/raw/shapefiles") PROCESSED = Path("data/processed") build_zip_tract(output_path=PROCESSED / "crosswalk_zip_tract.csv", shapes_dir=SHAPES) build_county_tract( acs_tracts_csv=Path("data/raw/acs/acs_tracts_ca_2020.csv"), output_path=PROCESSED / "crosswalk_county_tract.csv", ) """ from pathlib import Path from typing import Optional import pandas as pd # CA Albers Equal Area — accurate area computation for California CRS_CA = "EPSG:3310" ZCTA_REL_URL = ( "https://www2.census.gov/geo/docs/maps-data/data/rel2020/" "zcta520/tab20_zcta520_tract20_natl.txt" ) TIGER_TRACT_URL_TEMPLATE = ( "https://www2.census.gov/geo/tiger/TIGER2020/TRACT/tl_2020_{state_fips}_tract.zip" ) # --------------------------------------------------------------------------- # Diagnostics # --------------------------------------------------------------------------- def check_weights( df: pd.DataFrame, unit_col: str, weight_col: str, label: str = "", ) -> None: """ Print weight-sum diagnostics per source unit. Weights should sum to approximately 1.0 for each source unit after re-normalisation. Use this to verify crosswalk integrity. Parameters ---------- df : pd.DataFrame Crosswalk DataFrame. unit_col : str Column identifying the source geographic unit (e.g., "zcta5"). weight_col : str Column containing area-proportional weights. label : str Optional label printed in the diagnostic output. """ sums = df.groupby(unit_col)[weight_col].sum() tag = f" [{label}]" if label else "" print(f" Weight-sum diagnostics{tag} — should be ≈1.0 per {unit_col}:") print( f" n_units={len(sums):,} " f"mean={sums.mean():.4f} " f"min={sums.min():.4f} " f"max={sums.max():.4f} " f"pct_within_1pct={(abs(sums - 1.0) < 0.01).mean() * 100:.1f}%" ) # --------------------------------------------------------------------------- # TIGER tract download # --------------------------------------------------------------------------- def download_tiger_tracts(dest_dir: Path, state_fips: str = "06") -> Path: """ Download the 2020 TIGER/Line Census tract shapefile for *state_fips*. Parameters ---------- dest_dir : Path Parent directory for shapefile extraction. state_fips : str Two-digit state FIPS code. Default "06" (California). Returns ------- Path Path to the extraction directory containing the .shp file. """ from download_utils import download_zip url = TIGER_TRACT_URL_TEMPLATE.format(state_fips=state_fips) name = f"tl_2020_{state_fips}_tract" return download_zip(url=url, dest_dir=dest_dir, name=name) # --------------------------------------------------------------------------- # Crosswalk 1: ZCTA → Census tract # --------------------------------------------------------------------------- def build_zip_tract( output_path: Path, shapes_dir: Path, state_fips: str = "06", ) -> pd.DataFrame: """ Build a ZCTA (ZIP Code Tabulation Area) → 2020 Census tract crosswalk using the Census Bureau's official relationship file. Weights are area-proportional (AREALAND_PART / AREALAND_ZCTA), then re-normalised per ZCTA to sum exactly to 1.0. Parameters ---------- output_path : Path Where to save the output CSV. shapes_dir : Path Directory to cache the downloaded relationship file. state_fips : str Two-digit state FIPS — used to filter the national relationship file. Returns ------- pd.DataFrame Columns: zcta5, tract_geoid_20, weight. Output CSV columns ------------------ zcta5 ZIP Code Tabulation Area (5-digit string) tract_geoid_20 11-digit Census tract GEOID (2020 definitions) weight Fraction of ZCTA land area that falls in this tract """ from download_utils import download_file output_path = Path(output_path) shapes_dir = Path(shapes_dir) print("\n=== Crosswalk: ZCTA → Census tract ===") # Download relationship file txt_file = shapes_dir / "tab20_zcta520_tract20_natl.txt" download_file(url=ZCTA_REL_URL, dest_path=txt_file) df = pd.read_csv(txt_file, sep="|", dtype=str, low_memory=False) df.columns = [c.strip().upper() for c in df.columns] print(f" Raw rows: {len(df):,} columns: {list(df.columns)}") # Map columns (handle minor naming variants across Census vintages) col_map: dict = {} for col in df.columns: if "GEOID" in col and "ZCTA5" in col and "20" in col: col_map.setdefault("zcta", col) if "TRACT" in col and "20" in col and "GEOID" in col: col_map.setdefault("tract", col) if col == "AREALAND_PART": col_map["area_part"] = col if "AREALAND" in col and "ZCTA" in col and "PART" not in col: col_map.setdefault("area_zcta", col) required = ["zcta", "tract", "area_part", "area_zcta"] missing = [k for k in required if k not in col_map] if missing: raise KeyError(f"Could not identify columns for: {missing}. Available: {list(df.columns)}") df = df.rename(columns={ col_map["zcta"]: "zcta5", col_map["tract"]: "tract_geoid_20", col_map["area_part"]: "area_part", col_map["area_zcta"]: "area_zcta", }) # Filter to target state df = df[df["tract_geoid_20"].str.startswith(state_fips)].copy() print(f" Rows after state filter ({state_fips}): {len(df):,}") # Compute weights df["area_part"] = pd.to_numeric(df["area_part"], errors="coerce").fillna(0) df["area_zcta"] = pd.to_numeric(df["area_zcta"], errors="coerce") df["weight"] = df["area_part"] / df["area_zcta"] # Re-normalise per ZCTA weight_sum = df.groupby("zcta5")["weight"].transform("sum") df.loc[weight_sum > 0, "weight"] = df.loc[weight_sum > 0, "weight"] / weight_sum[weight_sum > 0] df = df.drop(columns=["area_part", "area_zcta"]) df = df[df["weight"] > 0].reset_index(drop=True) out = df[["zcta5", "tract_geoid_20", "weight"]].copy() check_weights(out, "zcta5", "weight", "ZIP→tract") print(f" Output: {len(out):,} rows, {out['zcta5'].nunique():,} unique ZCTAs") output_path.parent.mkdir(parents=True, exist_ok=True) out.to_csv(output_path, index=False) print(f" Saved → {output_path}") return out # --------------------------------------------------------------------------- # Crosswalk 2: County FIPS → Census tract # --------------------------------------------------------------------------- def build_county_tract( acs_tracts_csv: Path, output_path: Path, state_fips: str = "06", ) -> pd.DataFrame: """ Build a county FIPS → 2020 Census tract lookup from an ACS tracts CSV. No spatial join required: county FIPS is simply the first 5 digits of each tract's 11-digit GEOID. Use this crosswalk to broadcast county-level data (e.g., insurance panels) to all tracts within each county. Parameters ---------- acs_tracts_csv : Path Path to a CSV that contains one row per Census tract with a GEOID column (11-digit string, starting with the state FIPS code). Typically produced by census_api.fetch_acs_tracts(). output_path : Path Where to save the output CSV. state_fips : str Two-digit state FIPS to filter the tract list. Default "06" (CA). Returns ------- pd.DataFrame Columns: county_fips (5-digit str), tract_geoid_20 (11-digit str). Output CSV columns ------------------ county_fips 5-digit county FIPS (state 2 + county 3) tract_geoid_20 11-digit Census tract GEOID """ print("\n=== Crosswalk: County FIPS → Census tract ===") df = pd.read_csv(acs_tracts_csv, dtype=str, low_memory=False) print(f" ACS file rows: {len(df):,}") # Find GEOID column geoid_col = None for c in df.columns: if c.upper() in ("GEOID", "GEO_ID", "TRACT_GEOID", "FIPS"): geoid_col = c break if geoid_col is None: for c in df.columns: sample = df[c].dropna().iloc[0] if len(df) > 0 else "" if str(sample).startswith(state_fips) and len(str(sample)) >= 11: geoid_col = c print(f" Inferred GEOID column: '{c}' sample={sample!r}") break if geoid_col is None: raise KeyError(f"Could not identify GEOID column. Columns: {list(df.columns)}") geoids = df[geoid_col].dropna().astype(str).str.strip() # Strip "1400000US" prefix if present (Census API GeoID format) geoids = geoids.str.replace(r"^1400000US", "", regex=True) ca_geoids = geoids[geoids.str.startswith(state_fips)] out = pd.DataFrame({ "county_fips": ca_geoids.str[:5].values, "tract_geoid_20": ca_geoids.values, }).drop_duplicates().reset_index(drop=True) print( f" Unique county FIPS: {out['county_fips'].nunique()} " f"(expect 58 for CA) | tracts: {len(out):,}" ) output_path.parent.mkdir(parents=True, exist_ok=True) out.to_csv(output_path, index=False) print(f" Saved → {output_path}") return out # --------------------------------------------------------------------------- # Crosswalk 3: Precinct → Census tract (spatial overlay) # --------------------------------------------------------------------------- def build_prec_tract( vintage: str, prec_url: str, tiger_dir: Path, output_path: Path, state_fips: str = "06", ) -> pd.DataFrame: """ Build a precinct → 2020 Census tract crosswalk via GeoPandas spatial overlay. Both shapefiles are projected to EPSG:3310 (CA Albers Equal Area) before overlay for accurate area computation. Weight = intersection area / precinct total area, re-normalised per precinct. Parameters ---------- vintage : str Short label for the precinct vintage (e.g., "g22", "p18"). Used in output filenames and log messages. prec_url : str URL to download the precinct shapefile ZIP. Example (UC Berkeley Statewide Database): g22: https://statewidedatabase.org/pub/data/G22/state/mprec_state_g22_v01_shp.zip p18: https://statewidedatabase.org/pub/data/P18/state/mprec_p18_v01_shp.zip tiger_dir : Path Directory containing (or where to download) the TIGER tract shapefile. The function calls download_tiger_tracts() automatically if the shapefile is not already present. output_path : Path Where to save the output CSV. state_fips : str Two-digit state FIPS code. Default "06" (California). Returns ------- pd.DataFrame Columns: pctkey, tract_geoid_20, weight. Output CSV columns ------------------ pctkey Precinct identifier (string, from shapefile key column) tract_geoid_20 11-digit Census tract GEOID (2020 definitions) weight Fraction of precinct area that falls in this tract """ import geopandas as gpd from download_utils import download_zip print(f"\n=== Crosswalk: Precinct → Census tract ({vintage}) ===") prec_dir = download_zip(url=prec_url, dest_dir=tiger_dir, name=f"prec_{vintage}") tract_dir = download_tiger_tracts(dest_dir=tiger_dir, state_fips=state_fips) # Load tracts shp_files = list(tract_dir.rglob("*.shp")) if not shp_files: raise FileNotFoundError(f"No .shp in {tract_dir}") tracts = gpd.read_file(shp_files[0]) geoid_col = next((c for c in tracts.columns if c.upper() == "GEOID"), None) if geoid_col is None: raise KeyError(f"No GEOID column in tracts shapefile. Columns: {list(tracts.columns)}") tracts = tracts[[geoid_col, "geometry"]].rename(columns={geoid_col: "tract_geoid_20"}) print(f" Tracts: {len(tracts):,} CRS: {tracts.crs}") # Load precincts prec_shp = list(prec_dir.rglob("*.shp")) if not prec_shp: raise FileNotFoundError(f"No .shp in {prec_dir}") precincts = gpd.read_file(prec_shp[0]) print(f" Precincts: {len(precincts):,} CRS: {precincts.crs}") # Identify precinct key column key_candidates = ["PCTKEY", "PCTFIPS", "PCT_KEY", "PREC_KEY", "MPREC_KEY", "PRECINCT_ID", "PRECKEY"] pct_key_col = None for c in precincts.columns: if c.upper() in [k.upper() for k in key_candidates]: pct_key_col = c break if pct_key_col is None: for c in precincts.columns: if c != "geometry" and precincts[c].dtype == object: pct_key_col = c print(f" WARNING: No standard key column found; using '{c}' as pctkey") break print(f" Precinct key column: '{pct_key_col}'") precincts = precincts[[pct_key_col, "geometry"]].rename(columns={pct_key_col: "pctkey"}) # Project to CA Albers tracts = tracts.to_crs(CRS_CA) precincts = precincts.to_crs(CRS_CA) prec_areas = ( precincts[["pctkey"]] .copy() .assign(prec_area_total=precincts.geometry.area) .drop_duplicates("pctkey") ) print(f" Running spatial overlay ({len(precincts):,} precincts × {len(tracts):,} tracts)...") overlay = gpd.overlay( precincts[["pctkey", "geometry"]], tracts, how="intersection", keep_geom_type=False, ) overlay["area_part"] = overlay.geometry.area overlay = overlay.merge(prec_areas, on="pctkey", how="left") overlay["weight"] = overlay["area_part"] / overlay["prec_area_total"] weight_sum = overlay.groupby("pctkey")["weight"].transform("sum") overlay.loc[weight_sum > 0, "weight"] = ( overlay.loc[weight_sum > 0, "weight"] / weight_sum[weight_sum > 0] ) overlay = overlay[overlay["weight"] > 1e-5].copy() out = overlay[["pctkey", "tract_geoid_20", "weight"]].reset_index(drop=True) check_weights(out, "pctkey", "weight", f"precinct({vintage})→tract") print(f" Output: {len(out):,} rows, {out['pctkey'].nunique():,} unique precincts") output_path.parent.mkdir(parents=True, exist_ok=True) out.to_csv(output_path, index=False) print(f" Saved → {output_path}") return out