{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "\n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", " Try in Google Colab\n", " \n", " \n", " \n", " \n", " Share via nbviewer\n", " \n", " \n", " \n", " \n", " View on GitHub\n", " \n", " \n", " \n", " \n", " Download notebook\n", " \n", "
\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# **Video Labels in FiftyOne**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **A guided example with ASL videos**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First download and unzip the dataset.\n", "\n", "We will be using the [WLASL Dataset](https://www.kaggle.com/datasets/risangbaskoro/wlasl-processed), a dataset comprised of actors performing sign language." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# %pip (not !pip) installs into the environment of the running kernel.\n", "%pip install kaggle" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [], "source": [ "!kaggle datasets download -d risangbaskoro/wlasl-processed" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [], "source": [ "# -p keeps this cell re-runnable when the directory already exists.\n", "!mkdir -p wlasl-processed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# -q: no per-file listing; -o: overwrite without prompting so a re-run cannot stall.\n", "!unzip -q -o wlasl-processed.zip -d wlasl-processed" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "\n", "import numpy as np\n", "import pandas as pd" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Load in the labels for the dataset" ] }, { "cell_type": "code", "execution_count": 158, "metadata": {}, "outputs": [], "source": [ "# Root of the unzipped Kaggle archive; later cells build paths from it.\n", "main_path = './wlasl-processed/'\n", "\n", "# One row per gloss (word); `instances` holds the per-video annotations.\n", "wlasl_df = pd.read_json(os.path.join(main_path, 'WLASL_v0.3.json'))" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
glossinstances
0book[{'bbox': [385, 37, 885, 720], 'fps': 25, 'fra...
1drink[{'bbox': [551, 68, 1350, 1080], 'fps': 25, 'f...
2computer[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...
3before[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...
4chair[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...
\n", "
" ], "text/plain": [ " gloss instances\n", "0 book [{'bbox': [385, 37, 885, 720], 'fps': 25, 'fra...\n", "1 drink [{'bbox': [551, 68, 1350, 1080], 'fps': 25, 'f...\n", "2 computer [{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...\n", "3 before [{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...\n", "4 chair [{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_..." ] }, "execution_count": 159, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wlasl_df.head()" ] }, { "cell_type": "code", "execution_count": 160, "metadata": {}, "outputs": [], "source": [ "# Directory of .mp4 files inside the unzipped archive.\n", "mp4_dir = os.path.join(main_path, \"videos\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **Creating the Video Dataset**" ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 100% |█████████████| 11980/11980 [1.7s elapsed, 0s remaining, 7.2K samples/s] \n", "Computing metadata...\n", " 100% |█████████████| 11980/11980 [1.2m elapsed, 0s remaining, 403.2 samples/s] \n" ] } ], "source": [ "# Bind the conventional `fo` alias; every later cell refers to the package as `fo`.\n", "import fiftyone as fo\n", "\n", "DATASET_NAME = 'wlasl-dataset'\n", "\n", "# The dataset is persistent, so on a re-run it already exists in the FiftyOne\n", "# database: load it instead of failing on a duplicate name.\n", "if fo.dataset_exists(DATASET_NAME):\n", "    dataset = fo.load_dataset(DATASET_NAME)\n", "else:\n", "    dataset = fo.Dataset.from_dir(\n", "        dataset_dir=mp4_dir,\n", "        dataset_type=fo.types.VideoDirectory,\n", "        name=DATASET_NAME,\n", "    )\n", "    dataset.persistent = True\n", "\n", "# Create a frame document for every video frame (needed for the frame-level\n", "# labels below) and populate sample.metadata (frame size, duration).\n", "dataset.ensure_frames()\n", "dataset.compute_metadata()" ] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "Could not connect session, trying again in 10 seconds\r\n", "\n" ] } ], "source": [ "session = fo.launch_app(dataset)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **Sample Detections**" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [], "source": [ "def find_row_by_video_id(dataframe, video_id):\n", "    \"\"\"Find the annotation for a video in the WLASL label table.\n", "\n", "    Scans the ``instances`` list of every row and stops at the first entry\n", "    whose ``video_id`` equals the requested one.\n", "\n", "    Args:\n", "        dataframe: table whose rows carry an ``instances`` list of dicts,\n", "            each with a ``video_id`` key\n", "        video_id: the id to look up (the video file name without extension)\n", "\n", "    Returns:\n", "        a ``(row, instance)`` tuple for the first match, or ``None`` when no\n", "        instance has that id\n", "    \"\"\"\n", "    for _, row in dataframe.iterrows():\n", "        for instance in row['instances']:\n", "            if instance['video_id'] == video_id:\n", "                return row, instance\n", "    return None" ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": [], "source": [ "# take() samples randomly; a fixed seed keeps the same 100 videos on every run.\n", "view = dataset.take(100, seed=51)" ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [], "source": [ "for sample in view:\n", "    # Video files are named <video_id>.mp4\n", "    video_id = os.path.splitext(os.path.basename(sample.filepath))[0]\n", "\n", "    match = find_row_by_video_id(wlasl_df, video_id)\n", "    if match is None:\n", "        # No annotation for this file: leave the sample unlabeled\n", "        continue\n", "    row, inst = match\n", "    gloss = row[\"gloss\"]\n", "\n", "    # The annotation box is read as absolute [x1, y1, x2, y2] pixels;\n", "    # FiftyOne wants [x, y, width, height] relative to the frame size.\n", "    imw = sample.metadata.frame_width\n", "    imh = sample.metadata.frame_height\n", "    x1 = inst[\"bbox\"][0] / imw\n", "    y1 = inst[\"bbox\"][1] / imh\n", "    x2 = inst[\"bbox\"][2] / imw\n", "    y2 = inst[\"bbox\"][3] / imh\n", "    rel_bbox = [x1, y1, x2 - x1, y2 - y1]\n", "\n", "    det = fo.Detection(bounding_box=rel_bbox, label=gloss)\n", "    sample['Sample Label'] = fo.Detections(detections=[det])\n", "    sample.save()" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "session.view = view" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **Frame Level Detections**" ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [], "source": [ "def bigger_bbox(x, y, width, height, index, offset=0.001):\n", "    \"\"\"Widen a relative ``[x, y, width, height]`` box as ``index`` grows.\n", "\n", "    Each side is pushed outwards by ``index * offset`` and the result is\n", "    clipped horizontally to the ``[0, 1]`` range of the frame, so boxes on\n", "    late frames of long clips do not spill outside the image.\n", "\n", "    Args:\n", "        x: relative left edge of the box\n", "        y: relative top edge of the box (returned unchanged)\n", "        width: relative width of the box\n", "        height: relative height of the box (returned unchanged)\n", "        index: growth step, here the frame number\n", "        offset: growth per step on each side, in relative units\n", "\n", "    Returns:\n", "        the widened box as ``[x, y, width, height]``\n", "    \"\"\"\n", "    x_offset = index * offset\n", "\n", "    left = max(0.0, x - x_offset)\n", "    right = min(1.0, x + width + x_offset)\n", "\n", "    return [left, y, right - left, height]" ] }, { "cell_type": "code", "execution_count": 168, "metadata": {}, "outputs": [], "source": [ "for sample in view:\n", "    # Video files are named <video_id>.mp4\n", "    video_id = os.path.splitext(os.path.basename(sample.filepath))[0]\n", "\n", "    match = find_row_by_video_id(wlasl_df, video_id)\n", "    if match is None:\n", "        # No annotation for this file: leave its frames unlabeled\n", "        continue\n", "    row, inst = match\n", "    gloss = row[\"gloss\"]\n", "\n", "    # Absolute [x1, y1, x2, y2] pixels -> relative [x, y, width, height]\n", "    imw = sample.metadata.frame_width\n", "    imh = sample.metadata.frame_height\n", "    x1 = inst[\"bbox\"][0] / imw\n", "    y1 = inst[\"bbox\"][1] / imh\n", "    x2 = inst[\"bbox\"][2] / imw\n", "    y2 = inst[\"bbox\"][3] / imh\n", "\n", "    # Same label on every frame, with a box that widens with the frame number\n", "    for frame_no, frame in sample.frames.items():\n", "        new_bbox = bigger_bbox(x1, y1, x2 - x1, y2 - y1, frame_no)\n", "        det = fo.Detection(bounding_box=new_bbox, label=gloss)\n", "        frame['Frame Label'] = fo.Detections(detections=[det])\n", "\n", "    sample.save()" ] }, { "cell_type": "code", "execution_count": 169, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "session.view = view" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **Temporal Detections**" ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [], "source": [ "for sample in view:\n", "    # Video files are named <video_id>.mp4\n", "    video_id = os.path.splitext(os.path.basename(sample.filepath))[0]\n", "\n", "    match = find_row_by_video_id(wlasl_df, video_id)\n", "    if match is None:\n", "        # No annotation for this file: leave the sample unlabeled\n", "        continue\n", "    row, _ = match\n", "    gloss = row[\"gloss\"]\n", "\n", "    # Split the clip in two halves: the first carries the gloss, the second a fixed caption\n", "    duration = sample.metadata.duration\n", "    midpoint = duration / 2\n", "    sample[\"TD Word\"] = fo.TemporalDetection.from_timestamps(\n", "        [0, midpoint], label=gloss, sample=sample\n", "    )\n", "    sample[\"TD Word2\"] = fo.TemporalDetection.from_timestamps(\n", "        [midpoint, duration], label=\"ASL is awesome!\", sample=sample\n", "    )\n", "\n", "    sample.save()" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "
\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "session.view = view" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **Video Classification**" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [], "source": [ "for sample in view:\n", "    # Video files are named <video_id>.mp4\n", "    video_id = os.path.splitext(os.path.basename(sample.filepath))[0]\n", "\n", "    match = find_row_by_video_id(wlasl_df, video_id)\n", "    if match is None:\n", "        # No annotation for this file: leave the sample unlabeled\n", "        continue\n", "    row, _ = match\n", "\n", "    # One label for the whole video: the signed word\n", "    sample[\"class\"] = fo.Classification(label=row[\"gloss\"])\n", "    sample.save()" ] }, { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "session.view = view" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![Classification](video_label_imgs/asl_class.png)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 2 }