{ "cells": [ { "cell_type": "markdown", "id": "0", "metadata": {}, "source": [ "# Ingest FHIBE Dataset\n", "\n", "This tutorial is still a work in progress. Check back shortly for a more complete, well documented example!" ] }, { "cell_type": "markdown", "id": "1", "metadata": {}, "source": [ "## Install dependencies" ] }, { "cell_type": "code", "execution_count": null, "id": "2", "metadata": {}, "outputs": [], "source": [ "%pip install -q 3lc" ] }, { "cell_type": "markdown", "id": "3", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": null, "id": "4", "metadata": {}, "outputs": [], "source": [ "import itertools\n", "import json\n", "from collections import defaultdict\n", "from pathlib import Path\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import tlc\n", "from tlc.core.helpers._value_map_builder import _ValueMapBuilder" ] }, { "cell_type": "markdown", "id": "5", "metadata": {}, "source": [ "## Project setup" ] }, { "cell_type": "code", "execution_count": null, "id": "6", "metadata": {}, "outputs": [], "source": [ "PROJECT_NAME = \"3LC Tutorials - FHIBE\"\n", "DATASET_NAME = \"FHIBE\"\n", "TABLE_NAME = \"full-improved\"\n", "MAX_SAMPLES = 1_000_000" ] }, { "cell_type": "code", "execution_count": null, "id": "7", "metadata": {}, "outputs": [], "source": [ "FHIBE_ROOT = Path(\n", " \"D:/Data/fhibe.20250716.u.gT5_rFTA_downsampled_public_raw_only/fhibe.20250716.u.gT5_rFTA_downsampled_public_raw_only\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "8", "metadata": {}, "outputs": [], "source": [ "DATA_ROOT = FHIBE_ROOT / \"data/raw/fhibe_downsampled\"\n", "METADATA_ROOT = FHIBE_ROOT / \"data/processed/\"\n", "CSV_FILE = METADATA_ROOT / \"fhibe_downsampled\" / \"fhibe_downsampled.csv\"" ] }, { "cell_type": "code", "execution_count": null, "id": "9", "metadata": {}, "outputs": [], "source": [ "csv_file = pd.read_csv(CSV_FILE)\n", "csv_file = csv_file.loc[:, ~csv_file.columns.str.contains(\"annotator_id\")]\n", "csv_file.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "10", "metadata": {}, "outputs": [], "source": [ "def to_natural(x):\n", " # NaN stays NaN\n", " if pd.isna(x):\n", " return np.nan\n", "\n", " # Already a python type you want to keep\n", " if isinstance(x, (list, dict, int, float, bool)):\n", " return x\n", "\n", " # Strings: maybe JSON, maybe not\n", " if isinstance(x, str):\n", " s = x.strip().replace(\"'\", '\"')\n", " try:\n", " return json.loads(s)\n", " except Exception:\n", " return s" ] }, { "cell_type": "code", "execution_count": null, "id": "11", "metadata": {}, "outputs": [], "source": [ "IGNORE_KEYS = set([\"filepath\", \"image_height\", \"image_width\", \"keypoints\", \"face_bbox\", \"person_bbox\", \"segments\"])\n", "\n", "for col_name in set(csv_file.columns) - IGNORE_KEYS:\n", " csv_file[col_name] = csv_file[col_name].apply(to_natural)" ] }, { "cell_type": "code", "execution_count": null, "id": "12", "metadata": {}, "outputs": [], "source": [ "# Handle nan-issues\n", "for col_name in set(csv_file.columns) - IGNORE_KEYS:\n", " if \"annotator_id\" in col_name:\n", " continue\n", " col = csv_file[col_name]\n", "\n", " if col.isna().sum() == 0:\n", " continue\n", "\n", " non_nan = col.dropna()\n", " types = non_nan.map(type).value_counts()\n", " dominant = types.index[0] if len(types) else None\n", " print(f\"{col_name} has {col.isna().sum()} nan-values. 
{ "cell_type": "code", "execution_count": null, "id": "11", "metadata": {}, "outputs": [], "source": [
"IGNORE_KEYS = {\"filepath\", \"image_height\", \"image_width\", \"keypoints\", \"face_bbox\", \"person_bbox\", \"segments\"}\n",
"\n",
"for col_name in set(csv_file.columns) - IGNORE_KEYS:\n",
"    csv_file[col_name] = csv_file[col_name].apply(to_natural)"
] },
{ "cell_type": "code", "execution_count": null, "id": "12", "metadata": {}, "outputs": [], "source": [
"# Handle NaN values: fill them with an empty value matching each column's dominant non-NaN type\n",
"for col_name in set(csv_file.columns) - IGNORE_KEYS:\n",
"    if \"annotator_id\" in col_name:\n",
"        continue\n",
"    col = csv_file[col_name]\n",
"\n",
"    if col.isna().sum() == 0:\n",
"        continue\n",
"\n",
"    non_nan = col.dropna()\n",
"    types = non_nan.map(type).value_counts()\n",
"    dominant = types.index[0] if len(types) else None\n",
"    print(f\"{col_name} has {col.isna().sum()} nan-values. Dominant type: {dominant}\")\n",
"    empty_for = {\n",
"        str: \"\",\n",
"        float: np.nan,\n",
"    }\n",
"    empty_value = empty_for.get(dominant)\n",
"\n",
"    # Use a default argument in the lambda to bind the current value of empty_value\n",
"    col = col.apply(lambda x, empty_value=empty_value: empty_value if pd.isna(x) else x)\n",
"    csv_file[col_name] = col"
] },
{ "cell_type": "code", "execution_count": null, "id": "13", "metadata": {}, "outputs": [], "source": [
"override_schemas = {\n",
"    \"image_id\": tlc.StringSchema(default_visible=False, writable=False),\n",
"    \"subject_id\": tlc.StringSchema(default_visible=False, writable=False),\n",
"    \"json_path\": tlc.StringSchema(default_visible=False, writable=False),\n",
"    \"user_date_captured\": tlc.StringSchema(default_visible=False, writable=False),\n",
"    \"model\": tlc.StringSchema(default_visible=False, writable=False),\n",
"    \"location_region\": tlc.StringSchema(default_visible=False, writable=False),\n",
"}\n",
"\n",
"\n",
"def normalize_string(s):\n",
"    return s.replace(\".\", \"\").replace(\":\", \";\").strip(\", \")\n",
"\n",
"\n",
"def schema_for_column(col_name, col_value):\n",
"    if col_name in override_schemas:\n",
"        return override_schemas[col_name], col_value.apply(str)\n",
"\n",
"    is_list = False\n",
"    if isinstance(col_value[0], float):\n",
"        return tlc.Float32Schema(default_visible=False), None\n",
"    elif isinstance(col_value[0], (bool, np.bool_)):\n",
"        return tlc.BoolSchema(default_visible=False), None\n",
"    elif isinstance(col_value[0], (int, np.int32, np.int64)):\n",
"        return tlc.Int32Schema(default_visible=False), None\n",
"    elif isinstance(col_value[0], (list, np.ndarray)):\n",
"        is_list = True\n",
"\n",
"    # Default behaviour for strings and lists of strings is to convert to categoricals.\n",
"    element = col_value[0] if not is_list else col_value[0][0]\n",
"    if not isinstance(element, str):\n",
"        return None, None\n",
"\n",
"    # 1. Build the vocabulary of normalized string values\n",
"    vals = col_value.apply(\n",
"        lambda x: x\n",
"        if isinstance(x, (list, tuple, np.ndarray))\n",
"        else ([] if (x is None or (isinstance(x, float) and np.isnan(x))) else [x])\n",
"    )\n",
"\n",
"    vocab = sorted({normalize_string(str(s)) for s in itertools.chain.from_iterable(vals)})\n",
"    str_to_id = {s: i for i, s in enumerate(vocab)}\n",
"    id_to_str = {i: s for s, i in str_to_id.items()}\n",
"\n",
"    # These columns embed an RGB triple in each value; parse it and attach it as the display color\n",
"    if col_name in [\"apparent_skin_color\", \"natural_skin_color\"]:\n",
"        for k, v in id_to_str.items():\n",
"            hex_color = tlc.rgb_tuple_to_hex(json.loads(v[v.index(\"[\") : v.index(\"]\") + 1]))\n",
"            id_to_str[k] = tlc.MapElement(v, display_color=hex_color)\n",
"\n",
"    # 2. Transform the column values into categorical ids\n",
"    def encode(x):\n",
"        if isinstance(x, (list, tuple, np.ndarray)):\n",
"            return [str_to_id[normalize_string(str(s))] for s in x]\n",
"        return str_to_id[normalize_string(str(x))]\n",
"\n",
"    transformed = col_value.apply(encode)\n",
"\n",
"    # 3. Build the schema\n",
"    schema_type = tlc.CategoricalLabelListSchema if is_list else tlc.CategoricalLabelSchema\n",
"    return schema_type(classes=id_to_str, default_visible=False, writable=False), transformed"
] },
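{ "cell_type": "markdown", "id": "24", "metadata": {}, "source": [ "As a quick illustration of the categorical conversion, the next cell runs `schema_for_column` on a tiny hand-made `pandas.Series` (not a real FHIBE column). It shows how values are normalized into a vocabulary and how the column is re-encoded as integer ids." ] },
{ "cell_type": "code", "execution_count": null, "id": "25", "metadata": {}, "outputs": [], "source": [
"# Illustrative only: a tiny hand-made string column, not a real FHIBE column\n",
"demo_col = pd.Series([\"glasses\", \"hat, \", \"glasses\"])\n",
"demo_schema, demo_encoded = schema_for_column(\"demo_accessories\", demo_col)\n",
"print(list(demo_encoded))  # categorical ids into the inferred value map\n",
"demo_schema"
] },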
{ "cell_type": "code", "execution_count": null, "id": "14", "metadata": {}, "outputs": [], "source": [
"def infer_schemas_and_transform_categoricals(df) -> dict[str, tlc.Schema]:\n",
"    column_schemas = {}\n",
"    for col in set(df.columns) - IGNORE_KEYS:\n",
"        schema, transformed = schema_for_column(col, df[col])\n",
"        column_schemas[col] = schema\n",
"        if transformed is not None:\n",
"            df[col] = transformed\n",
"    return column_schemas\n",
"\n",
"\n",
"column_schemas = infer_schemas_and_transform_categoricals(csv_file)"
] },
{ "cell_type": "markdown", "id": "15", "metadata": {}, "source": [ "## Define annotation transforms" ] },
{ "cell_type": "code", "execution_count": null, "id": "16", "metadata": {}, "outputs": [], "source": [
"NUM_KEYPOINTS = 33\n",
"\n",
"KPTS = [\n",
"    \"0. Nose\",\n",
"    \"1. Right eye inner\",\n",
"    \"2. Right eye\",\n",
"    \"3. Right eye outer\",\n",
"    \"4. Left eye inner\",\n",
"    \"5. Left eye\",\n",
"    \"6. Left eye outer\",\n",
"    \"7. Right ear\",\n",
"    \"8. Left ear\",\n",
"    \"9. Mouth right\",\n",
"    \"10. Mouth left\",\n",
"    \"11. Right shoulder\",\n",
"    \"12. Left shoulder\",\n",
"    \"13. Right elbow\",\n",
"    \"14. Left elbow\",\n",
"    \"15. Right wrist\",\n",
"    \"16. Left wrist\",\n",
"    \"17. Right pinky knuckle\",\n",
"    \"18. Left pinky knuckle\",\n",
"    \"19. Right index knuckle\",\n",
"    \"20. Left index knuckle\",\n",
"    \"21. Right thumb knuckle\",\n",
"    \"22. Left thumb knuckle\",\n",
"    \"23. Right hip\",\n",
"    \"24. Left hip\",\n",
"    \"25. Right knee\",\n",
"    \"26. Left knee\",\n",
"    \"27. Right ankle\",\n",
"    \"28. Left ankle\",\n",
"    \"29. Right heel\",\n",
"    \"30. Left heel\",\n",
"    \"31. Right foot index\",\n",
"    \"32. Left foot index\",\n",
"]\n",
"\n",
"# Skeleton edges as a flat list of (start, end) keypoint index pairs\n",
"SKELETON = [\n",
"    11, 12,\n",
"    11, 13,\n",
"    13, 15,\n",
"    12, 14,\n",
"    14, 16,\n",
"    12, 24,\n",
"    11, 23,\n",
"    23, 24,\n",
"    24, 26,\n",
"    26, 28,\n",
"    23, 25,\n",
"    25, 27,\n",
"    27, 29,\n",
"    29, 31,\n",
"    28, 30,\n",
"    30, 32,\n",
"]\n",
"\n",
"\n",
"def transform_keypoints(keypoints, image_width, image_height) -> tlc.Keypoints2DInstances:\n",
"    kpts = json.loads(keypoints.replace(\"'\", '\"'))\n",
"    kpts_arr = np.zeros((NUM_KEYPOINTS, 3), dtype=np.float32)\n",
"    for i, kpt_name in enumerate(KPTS):\n",
"        if kpt_name not in kpts:\n",
"            continue\n",
"        kpts_arr[i, :] = kpts[kpt_name]\n",
"        kpts_arr[i, 2] = 2  # mark annotated keypoints as visible\n",
"\n",
"    instances = tlc.Keypoints2DInstances.create_empty(\n",
"        image_width=image_width,\n",
"        image_height=image_height,\n",
"        include_keypoint_visibilities=True,\n",
"        include_instance_bbs=False,\n",
"    )\n",
"\n",
"    instances.add_instance(\n",
"        keypoints=kpts_arr,\n",
"        label=0,\n",
"    )\n",
"\n",
"    return instances\n",
"\n",
"\n",
"# Maps segmentation class names to integer labels as they are first encountered\n",
"builder = _ValueMapBuilder[str]()\n",
"\n",
"\n",
"def transform_segments(segments, image_width, image_height):\n",
"    segments = json.loads(segments.replace(\"'\", '\"'))\n",
"\n",
"    polygons = []\n",
"    labels = []\n",
"\n",
"    for segment in segments:\n",
"        class_name = segment[\"class_name\"]\n",
"        polygon = segment[\"polygon\"]\n",
"        poly_2_tuples = [[p[\"x\"], p[\"y\"]] for p in polygon]\n",
"        flattened_poly = [item for sublist in poly_2_tuples for item in sublist]\n",
"        polygons.append(flattened_poly)\n",
"        labels.append(builder(class_name))\n",
"\n",
"    segs = tlc.SegmentationPolygonsDict(\n",
"        image_width=image_width,\n",
"        image_height=image_height,\n",
"        polygons=polygons,\n",
"        instance_properties={\"label\": labels},\n",
"    )\n",
"    return segs\n",
"\n",
"\n",
"def transform_bboxes(face_bbox, person_bbox, image_width, image_height):\n",
"    face_bbox = json.loads(face_bbox)\n",
"    person_bbox = json.loads(person_bbox)\n",
"\n",
"    bboxes = {\n",
"        tlc.IMAGE_WIDTH: image_width,\n",
"        tlc.IMAGE_HEIGHT: image_height,\n",
"        tlc.BOUNDING_BOX_LIST: [\n",
"            {\n",
"                tlc.X0: face_bbox[0],\n",
"                tlc.Y0: face_bbox[1],\n",
"                tlc.X1: face_bbox[2],\n",
"                tlc.Y1: face_bbox[3],\n",
"                tlc.LABEL: 0,\n",
"            },\n",
"            {\n",
"                tlc.X0: person_bbox[0],\n",
"                tlc.Y0: person_bbox[1],\n",
"                tlc.X1: person_bbox[2],\n",
"                tlc.Y1: person_bbox[3],\n",
"                tlc.LABEL: 1,\n",
"            },\n",
"        ],\n",
"    }\n",
"\n",
"    return bboxes"
] },
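{ "cell_type": "markdown", "id": "26", "metadata": {}, "source": [ "The next cell is a small, hand-made example (not real FHIBE annotations) showing the dictionary produced by `transform_bboxes`. The four raw values of each box are stored as `X0`/`Y0`/`X1`/`Y1`; how the last two are interpreted is governed by the number roles set on the bounding-box schema passed to the `TableWriter` below." ] },
{ "cell_type": "code", "execution_count": null, "id": "27", "metadata": {}, "outputs": [], "source": [
"# Illustrative only: hand-made face and person boxes, not real FHIBE annotations\n",
"demo_bbs = transform_bboxes(\"[10, 20, 40, 60]\", \"[0, 0, 200, 400]\", image_width=200, image_height=400)\n",
"demo_bbs[tlc.BOUNDING_BOX_LIST]"
] },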
{ "cell_type": "markdown", "id": "17", "metadata": {}, "source": [ "## Load data" ] },
{ "cell_type": "code", "execution_count": null, "id": "18", "metadata": {}, "outputs": [], "source": [
"data = defaultdict(list)\n",
"\n",
"for index, row in csv_file.iterrows():\n",
"    input_row = row.to_dict()\n",
"    image_path = FHIBE_ROOT / input_row[\"filepath\"]\n",
"\n",
"    # Extract and convert annotations to 3LC format\n",
"    image_height = input_row[\"image_height\"]\n",
"    image_width = input_row[\"image_width\"]\n",
"    keypoints = transform_keypoints(input_row[\"keypoints\"], image_width, image_height)\n",
"    segments = transform_segments(input_row[\"segments\"], image_width, image_height)\n",
"    bboxes = transform_bboxes(input_row[\"face_bbox\"], input_row[\"person_bbox\"], image_width, image_height)\n",
"    data[\"image\"].append(image_path.as_posix())\n",
"    data[\"keypoints\"].append(keypoints.to_row())\n",
"    data[\"bbs\"].append(bboxes)\n",
"    data[\"segments\"].append(segments)\n",
"\n",
"    # Extract the remaining metadata columns as-is\n",
"    for key in set(input_row.keys()) - IGNORE_KEYS:\n",
"        data[key].append(input_row[key])\n",
"\n",
"    if index > MAX_SAMPLES:\n",
"        break"
] },
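{ "cell_type": "markdown", "id": "28", "metadata": {}, "source": [ "Before writing the Table, a quick look at what was collected: the number of rows, a sample of the column names, and the first converted bounding-box entry. This cell is purely for inspection and has no side effects." ] },
{ "cell_type": "code", "execution_count": null, "id": "29", "metadata": {}, "outputs": [], "source": [
"# Quick inspection of the collected rows before writing the Table\n",
"print(f\"Collected {len(data['image'])} rows\")\n",
"print(f\"Columns: {sorted(data.keys())[:8]} ...\")\n",
"data[\"bbs\"][0]"
] },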
{ "cell_type": "markdown", "id": "19", "metadata": {}, "source": [ "## Write 3LC Table" ] },
{ "cell_type": "code", "execution_count": null, "id": "20", "metadata": {}, "outputs": [], "source": [
"table_writer = tlc.TableWriter(\n",
"    table_name=TABLE_NAME,\n",
"    dataset_name=DATASET_NAME,\n",
"    project_name=PROJECT_NAME,\n",
"    column_schemas={\n",
"        \"image\": tlc.ImageUrlSchema(),\n",
"        \"keypoints\": tlc.Keypoints2DSchema(\n",
"            classes=[\"person\"],\n",
"            num_keypoints=NUM_KEYPOINTS,\n",
"            lines=SKELETON,\n",
"            point_attributes=[kpt.split(\". \")[1] for kpt in KPTS],\n",
"            include_per_point_visibility=True,\n",
"        ),\n",
"        \"bbs\": tlc.BoundingBoxListSchema(\n",
"            label_value_map={0: tlc.MapElement(\"face\"), 1: tlc.MapElement(\"person\")},\n",
"            include_segmentation=False,\n",
"            x1_number_role=tlc.NUMBER_ROLE_BB_SIZE_X,\n",
"            y1_number_role=tlc.NUMBER_ROLE_BB_SIZE_Y,\n",
"        ),\n",
"        \"segments\": tlc.SegmentationSchema(\n",
"            # Label map built from the class names collected by `builder` in transform_segments\n",
"            label_value_map={i: tlc.MapElement(v.split(\". \")[1]) for i, v in enumerate(builder._values)},\n",
"        ),\n",
"        **column_schemas,\n",
"    },\n",
")\n",
"table_writer.add_batch(data)\n",
"table = table_writer.finalize()"
] },
{ "cell_type": "code", "execution_count": null, "id": "21", "metadata": {}, "outputs": [], "source": [ "table.table_rows[0]" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 5 }