# Ingest FHIBE Dataset

This tutorial is still a work in progress. Check back shortly for a more complete, well-documented example!

## Install dependencies

In [None]:
%pip install -q 3lc

## Imports

In [None]:
import itertools
import json
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import tlc
from tlc.core.helpers._value_map_builder import _ValueMapBuilder

## Project setup

In [None]:
# Project, dataset and table names handed to the 3LC table writer.
PROJECT_NAME = "3LC Tutorials - FHIBE"
DATASET_NAME = "FHIBE"
TABLE_NAME = "full-improved"

# Cap on the number of CSV rows ingested by the loading loop.
MAX_SAMPLES = 10**6

In [None]:
# Machine-specific location of the FHIBE release; adjust to your own download.
# The release folder name appears twice because the path nests a folder inside
# a folder of the same name.
_FHIBE_RELEASE = "fhibe.20250716.u.gT5_rFTA_downsampled_public_raw_only"
FHIBE_ROOT = Path("D:/Data") / _FHIBE_RELEASE / _FHIBE_RELEASE

In [None]:
# Sub-directories of the FHIBE release and the metadata CSV read by this notebook.
DATA_ROOT = FHIBE_ROOT / "data" / "raw" / "fhibe_downsampled"
METADATA_ROOT = FHIBE_ROOT / "data" / "processed"
CSV_FILE = METADATA_ROOT.joinpath("fhibe_downsampled", "fhibe_downsampled.csv")

In [None]:
# Load the metadata CSV and drop every column whose name mentions "annotator_id".
csv_file = pd.read_csv(CSV_FILE)
is_annotator_column = csv_file.columns.str.contains("annotator_id")
csv_file = csv_file.loc[:, ~is_annotator_column]

# Last expression of the cell: shows a preview of the first rows.
csv_file.head()

In [None]:
def to_natural(x):
    """Convert a raw CSV cell into a natural Python value.

    Args:
        x: A single cell value (typically a string, number, bool or NaN).

    Returns:
        * containers (list, tuple, dict, ndarray) unchanged;
        * ``np.nan`` for any missing value recognised by ``pd.isna``;
        * for strings, the parsed JSON value when the stripped text is JSON
          (also accepting single-quoted text such as ``"['a', 'b']"``),
          otherwise the stripped original string;
        * every other value unchanged.
    """
    # Containers must be handled before ``pd.isna``: for a list, ``pd.isna``
    # answers element-wise and the resulting array cannot be used in an ``if``.
    if isinstance(x, (list, tuple, dict, np.ndarray)):
        return x

    # NaN stays NaN
    if pd.isna(x):
        return np.nan

    # Strings: maybe JSON, maybe not
    if isinstance(x, str):
        stripped = x.strip()
        # Try the text as written first, then with single quotes swapped for
        # double quotes (Python-repr style lists/dicts).
        for candidate in (stripped, stripped.replace("'", '"')):
            try:
                return json.loads(candidate)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
        # Not JSON: return the text itself, with apostrophes left intact.
        return stripped

    # Numbers, bools and anything else are kept as they are.
    return x

In [None]:
# Columns read directly by the loading loop and annotation transforms; they are
# excluded from the generic per-cell parsing and from schema inference.
IGNORE_KEYS = {
    "filepath",
    "image_height",
    "image_width",
    "keypoints",
    "face_bbox",
    "person_bbox",
    "segments",
}

# Parse every remaining column cell-by-cell into natural Python values.
for col_name in csv_file.columns:
    if col_name in IGNORE_KEYS:
        continue
    csv_file[col_name] = csv_file[col_name].apply(to_natural)

In [None]:
# Handle nan-issues
# Replacement for missing cells, keyed by the dominant Python type of the
# column's non-missing values. Types not listed here are filled with None.
empty_for = {
    str: "",
    float: np.nan,
}


def _is_missing(value):
    """Return True for a missing scalar cell; containers are never missing.

    ``pd.isna`` answers element-wise for lists/arrays, which cannot be used as
    a plain condition, so containers are ruled out first.
    """
    return not isinstance(value, (list, tuple, dict, np.ndarray)) and bool(pd.isna(value))


for col_name in csv_file.columns:
    if col_name in IGNORE_KEYS or "annotator_id" in col_name:
        continue
    col = csv_file[col_name]

    missing = col.isna()
    num_missing = missing.sum()
    if num_missing == 0:
        continue

    # The most frequent type among the non-missing cells decides the filler.
    types = col[~missing].map(type).value_counts()
    dominant = types.index[0] if len(types) else None
    print(f"{col_name} has {num_missing} nan-values. Dominant type: {dominant}")

    # Use a default argument in the lambda to bind the current value of empty_value
    empty_value = empty_for.get(dominant)
    csv_file[col_name] = col.apply(lambda x, empty_value=empty_value: empty_value if _is_missing(x) else x)

In [None]:
# Columns that get an explicit string schema (not visible by default, not
# writable) instead of an inferred one.
_STRING_COLUMNS = (
    "image_id",
    "subject_id",
    "json_path",
    "user_date_captured",
    "model",
    "location_region",
)

# One schema instance per column.
override_schemas = {name: tlc.StringSchema(default_visible=False, writable=False) for name in _STRING_COLUMNS}


def normalize_string(s):
    """Return *s* with periods removed, colons replaced by semicolons, and
    leading/trailing commas and spaces stripped."""
    # Single pass: drop "." and map ":" to ";".
    replaced = s.translate({ord("."): None, ord(":"): ";"})
    return replaced.strip(", ")


def schema_for_column(col_name, col_value):
    """Pick a 3LC schema for one metadata column and re-encode it if needed.

    Args:
        col_name: Column name; looked up in ``override_schemas`` and compared
            against the two skin-color column names.
        col_value: pandas Series holding the column's values.

    Returns:
        A ``(schema, transformed)`` tuple. ``schema`` is ``None`` when the
        first value is neither a number, a bool, a string nor a list of
        strings. ``transformed`` is the Series that should replace the column
        (stringified or integer-encoded), or ``None`` when the column is kept
        as it is.
    """
    if col_name in override_schemas:
        return override_schemas[col_name], col_value.apply(str)

    # The first row decides the schema. ``iloc`` reads it by position, so the
    # lookup does not depend on the Series having an index label ``0``.
    first = col_value.iloc[0]
    if isinstance(first, float):
        return tlc.Float32Schema(default_visible=False), None
    if isinstance(first, (bool, np.bool_)):  # must precede int: bool is an int
        return tlc.BoolSchema(default_visible=False), None
    if isinstance(first, (int, np.int32, np.int64)):
        return tlc.Int32Schema(default_visible=False), None

    # Default behaviour for strings and lists of strings is to convert to categoricals.
    is_list = isinstance(first, (list, np.ndarray))
    if is_list:
        # Probe the first non-empty list, so an empty list in the first row
        # does not raise an IndexError.
        element = next(
            (x[0] for x in col_value if isinstance(x, (list, np.ndarray)) and len(x) > 0),
            None,
        )
    else:
        element = first
    if not isinstance(element, str):
        return None, None

    def as_items(x):
        """Return the labels of one cell as a sequence (missing -> no labels)."""
        if isinstance(x, (list, tuple, np.ndarray)):
            return x
        if x is None or (isinstance(x, float) and np.isnan(x)):
            return []
        return [x]

    # 1. Build vocab
    vocab = sorted({normalize_string(str(s)) for x in col_value for s in as_items(x)})
    str_to_id = {s: i for i, s in enumerate(vocab)}

    # These columns have a hex color associated with them, add to mapping.
    # The bracketed part of each label is parsed as JSON and converted to hex.
    if col_name in ("apparent_skin_color", "natural_skin_color"):
        classes = {}
        for i, s in enumerate(vocab):
            rgb = json.loads(s[s.index("[") : s.index("]") + 1])
            classes[i] = tlc.MapElement(s, display_color=tlc.rgb_tuple_to_hex(rgb))
    else:
        classes = dict(enumerate(vocab))

    # 2. Transform column
    def encode(x):
        if isinstance(x, (list, tuple, np.ndarray)):
            return [str_to_id[normalize_string(str(s))] for s in x]
        return str_to_id[normalize_string(str(x))]

    transformed = col_value.apply(encode)

    # 3. Build schema
    schema_type = tlc.CategoricalLabelListSchema if is_list else tlc.CategoricalLabelSchema
    return schema_type(classes=classes, default_visible=False, writable=False), transformed

In [None]:
def infer_schemas_and_transform_categoricals(df) -> dict[str, tlc.Schema]:
    """Infer a schema for every non-ignored column of *df*.

    Columns that ``schema_for_column`` re-encodes are overwritten in *df* in
    place. Columns are visited in DataFrame order, so the returned dict has
    the same key order on every run (iterating a set of names does not).

    Args:
        df: DataFrame whose columns (except ``IGNORE_KEYS``) are inspected.

    Returns:
        Mapping from column name to the schema returned by
        ``schema_for_column``; the value is ``None`` for columns where no
        schema could be chosen.
    """
    column_schemas = {}
    for col in df.columns:
        if col in IGNORE_KEYS:
            continue
        schema, transformed = schema_for_column(col, df[col])
        column_schemas[col] = schema
        if transformed is not None:
            df[col] = transformed
    return column_schemas


column_schemas = infer_schemas_and_transform_categoricals(csv_file)

## Define annotation transforms

In [None]:
# Keypoint names in index order; the KPTS entries are "<index>. <name>".
_KEYPOINT_NAMES = (
    "Nose",
    "Right eye inner",
    "Right eye",
    "Right eye outer",
    "Left eye inner",
    "Left eye",
    "Left eye outer",
    "Right ear",
    "Left ear",
    "Mouth right",
    "Mouth left",
    "Right shoulder",
    "Left shoulder",
    "Right elbow",
    "Left elbow",
    "Right wrist",
    "Left wrist",
    "Right pinky knuckle",
    "Left pinky knuckle",
    "Right index knuckle",
    "Left index knuckle",
    "Right thumb knuckle",
    "Left thumb knuckle",
    "Right hip",
    "Left hip",
    "Right knee",
    "Left knee",
    "Right ankle",
    "Left ankle",
    "Right heel",
    "Left heel",
    "Right foot index",
    "Left foot index",
)

KPTS = [f"{index}. {name}" for index, name in enumerate(_KEYPOINT_NAMES)]

NUM_KEYPOINTS = len(KPTS)

# Skeleton edges as (keypoint index, keypoint index) pairs.
_SKELETON_EDGES = (
    (11, 12),
    (11, 13),
    (13, 15),
    (12, 14),
    (14, 16),
    (12, 24),
    (11, 23),
    (23, 24),
    (24, 26),
    (26, 28),
    (23, 25),
    (25, 27),
    (27, 29),
    (29, 31),
    (28, 30),
    (30, 32),
)

# Flat list of edge endpoints: [a0, b0, a1, b1, ...].
SKELETON = [endpoint for edge in _SKELETON_EDGES for endpoint in edge]


def transform_keypoints(keypoints, image_width, image_height) -> tlc.Keypoints2DInstances:
    """Build a single-instance ``tlc.Keypoints2DInstances`` from a CSV cell.

    Args:
        keypoints: Text of a dict keyed by the names in ``KPTS``; single
            quotes are swapped for double quotes before JSON parsing.
        image_width: Image width passed on to the instances container.
        image_height: Image height passed on to the instances container.

    Returns:
        Instances object holding one instance with label 0.
    """
    parsed = json.loads(keypoints.replace("'", '"'))

    # One row per keypoint; keypoints missing from the cell stay all-zero.
    points = np.zeros((NUM_KEYPOINTS, 3), dtype=np.float32)
    for row, name in enumerate(KPTS):
        if name in parsed:
            points[row, :] = parsed[name]
            # The third column is always overwritten with 2 for present keypoints.
            points[row, 2] = 2

    result = tlc.Keypoints2DInstances.create_empty(
        image_width=image_width,
        image_height=image_height,
        include_keypoint_visibilities=True,
        include_instance_bbs=False,
    )
    result.add_instance(keypoints=points, label=0)
    return result


# Shared across all rows: called with a class name to obtain the label stored
# for that segment.
builder = _ValueMapBuilder[str]()


def transform_segments(segments, image_width, image_height):
    """Convert the segments cell of one row into a ``tlc.SegmentationPolygonsDict``.

    Args:
        segments: Text of a list of segment dicts, each with a ``class_name``
            and a ``polygon`` (list of ``{"x": ..., "y": ...}`` points); single
            quotes are swapped for double quotes before JSON parsing.
        image_width: Image width stored with the polygons.
        image_height: Image height stored with the polygons.

    Returns:
        Polygons dict with one flat ``[x0, y0, x1, y1, ...]`` list and one
        label per segment.
    """
    parsed = json.loads(segments.replace("'", '"'))

    flat_polygons = []
    label_ids = []
    for entry in parsed:
        name = entry["class_name"]
        points = entry["polygon"]
        flat_polygons.append([coord for point in points for coord in (point["x"], point["y"])])
        label_ids.append(builder(name))

    return tlc.SegmentationPolygonsDict(
        image_width=image_width,
        image_height=image_height,
        polygons=flat_polygons,
        instance_properties={"label": label_ids},
    )


def transform_bboxes(face_bbox, person_bbox, image_width, image_height):
    """Combine the face and person boxes of one row into a bounding-box dict.

    Args:
        face_bbox: JSON text of a sequence with at least four numbers.
        person_bbox: JSON text of a sequence with at least four numbers.
        image_width: Image width stored under ``tlc.IMAGE_WIDTH``.
        image_height: Image height stored under ``tlc.IMAGE_HEIGHT``.

    Returns:
        Dict with the image size and a two-element box list: the face box
        (label 0) followed by the person box (label 1).
    """

    def as_box(values, label):
        # The first four numbers map, in order, to X0, Y0, X1 and Y1.
        return {
            tlc.X0: values[0],
            tlc.Y0: values[1],
            tlc.X1: values[2],
            tlc.Y1: values[3],
            tlc.LABEL: label,
        }

    face_values = json.loads(face_bbox)
    person_values = json.loads(person_bbox)

    return {
        tlc.IMAGE_WIDTH: image_width,
        tlc.IMAGE_HEIGHT: image_height,
        tlc.BOUNDING_BOX_LIST: [as_box(face_values, 0), as_box(person_values, 1)],
    }

## Load data

In [None]:
# Column-oriented batch: column name -> list with one entry per ingested row.
data = defaultdict(list)

# Metadata columns in CSV order, so the batch columns are laid out the same way
# on every run (iterating a set of names does not guarantee that).
metadata_keys = [key for key in csv_file.columns if key not in IGNORE_KEYS]

# ``head`` caps the ingest at exactly MAX_SAMPLES rows.
for _, row in csv_file.head(MAX_SAMPLES).iterrows():
    input_row = row.to_dict()
    image_path = FHIBE_ROOT / input_row["filepath"]

    ## Extract and convert annotations to 3lc format

    image_height = input_row["image_height"]
    image_width = input_row["image_width"]
    keypoints = transform_keypoints(input_row["keypoints"], image_width, image_height)
    segments = transform_segments(input_row["segments"], image_width, image_height)
    bboxes = transform_bboxes(input_row["face_bbox"], input_row["person_bbox"], image_width, image_height)
    data["image"].append(image_path.as_posix())
    data["keypoints"].append(keypoints.to_row())
    data["bbs"].append(bboxes)
    data["segments"].append(segments)

    ## Extract metadata
    for key in metadata_keys:
        data[key].append(input_row[key])

## Write 3LC Table

In [None]:
# Keypoint names without their "<index>. " prefix.
keypoint_names = [kpt.split(". ")[1] for kpt in KPTS]

# Segment label id -> class name (text after ". "), taken from the values the
# builder collected while the rows were loaded.
segment_classes = {i: tlc.MapElement(v.split(". ")[1]) for i, v in enumerate(builder._values)}

# Schemas for the image and annotation columns.
annotation_schemas = {
    "image": tlc.ImageUrlSchema(),
    "keypoints": tlc.Keypoints2DSchema(
        classes=["person"],
        num_keypoints=NUM_KEYPOINTS,
        lines=SKELETON,
        point_attributes=keypoint_names,
        include_per_point_visibility=True,
    ),
    "bbs": tlc.BoundingBoxListSchema(
        label_value_map={0: tlc.MapElement("face"), 1: tlc.MapElement("person")},
        include_segmentation=False,
        x1_number_role=tlc.NUMBER_ROLE_BB_SIZE_X,
        y1_number_role=tlc.NUMBER_ROLE_BB_SIZE_Y,
    ),
    "segments": tlc.SegmentationSchema(label_value_map=segment_classes),
}

# Annotation schemas first, followed by the inferred metadata schemas.
table_writer = tlc.TableWriter(
    table_name=TABLE_NAME,
    dataset_name=DATASET_NAME,
    project_name=PROJECT_NAME,
    column_schemas={**annotation_schemas, **column_schemas},
)
table_writer.add_batch(data)
table = table_writer.finalize()

In [None]:
# Last expression of the cell: shows the first row of the written table.
table.table_rows[0]