{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os \n", "import pandas as pd\n", "import decord\n", "from decord import VideoReader\n", "import numpy as np\n", "from PIL import Image\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def get_file_stem(path):\n", " base=os.path.basename(path)\n", " return os.path.splitext(base)[0]\n", "\n", "def read_metadata(df_path): \n", " #read df\n", " df = pd.read_csv(df_path,sep=\" \",header= None)\n", " df.columns = [\"video_path\",\"label\",\"frames\"]\n", " return df\n", "\n", "def df_to_txt(df,dir_path):\n", " df.to_csv(dir_path, header=None, index=None, sep=' ', mode='a')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def get_frames_from_split(df,split,dataset_dir,videos_dir,clips_type):\n", " \n", " clips_base_dir = os.path.join(dataset_dir,split)\n", " \n", " for i in tqdm(range(len(df.index))):\n", " \n", " video_id = get_file_stem(df.loc[i,\"video_path\"])\n", " participant = video_id.split(\"_\")[0]\n", " video_file = f\"{video_id}.mp4\"\n", " clip_path = os.path.join(videos_dir,participant,clips_type,video_file)\n", " decord_vr = VideoReader(clip_path)\n", " n_frames= len(decord_vr)\n", " frames_list = np.arange(n_frames)\n", " # # Decord\n", " frames = decord_vr.get_batch(frames_list).asnumpy()\n", " video_folder = os.path.join(clips_base_dir,video_id)\n", " \n", " if not(os.path.isdir(video_folder)):\n", " os.makedirs(video_folder)\n", "\n", " for i in range(n_frames):\n", " im = Image.fromarray(frames[i])\n", " dir_path = os.path.join(video_folder,f\"frame_{i}.jpg\")\n", " im.save(dir_path)\n", " \n", " #save df\n", " df_path = os.path.join(dataset_dir,f\"{split}.txt\")\n", " #hacer apply\n", " df[\"video_path\"] = df[\"video_path\"].apply(lambda row: get_file_stem(row))\n", " df = df[[\"video_path\",\"frames\",\"label\"]]\n", " df_to_txt(df,df_path)\n", " print(\"Finished\")\n", " \n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# dataset_dir = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\HAR_dataset_v1\"\n", "# base_dir = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\HAR_Video\\Base_Dataset\"\n", "# split = \"Val\"\n", "# splits =[\"Train\",\"Test\"]\n", "# videos_dir = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Participants\"\n", "# for split in splits:\n", "# df_path = os.path.join(dataset_dir,f\"{split}.txt\")\n", "# df = read_metadata(df_path)\n", "# get_frames_from_split(df,split,base_dir,videos_dir,\"HARClips\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "split = \"Train\"\n", "AL_folder = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\HAR_AL\"\n", "videos_dir = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Participants\"\n", "base_dir = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\HAR_Video\\HAR_AL_subsets\"\n", "for idx in range(len(os.listdir(AL_folder))):\n", " subset = f\"HAR_Dataset_AL_v{idx}\"\n", " subset_dir = os.path.join(base_dir,subset)\n", " dataset_dir = os.path.join(AL_folder,subset)\n", " df_path = os.path.join(dataset_dir,f\"{split}.txt\")\n", " df = read_metadata(df_path)\n", " get_frames_from_split(df,split,subset_dir,videos_dir,\"HARClips\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Encode and filter" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def filter_targets(df,encoder):\n", " \n", " #case where we encounter new labeled class not covered on \n", " #original dataset\n", " targets = df.label.values\n", " try:\n", " targets_enc = encoder.transform(targets)\n", " return df\n", " except:\n", " original_classes = encoder.classes_\n", " actual_classes = set(targets)\n", " unseen_targets = [label for label in actual_classes if not(label in original_classes)]\n", " invalid_ids =[]\n", " for unseen_target in unseen_targets:\n", " invalid_ids.append(np.argwhere(targets==unseen_target))\n", " invalid_ids = np.concatenate(invalid_ids)\n", " valid_ids = [id for id in range(len(targets)) if not(id in invalid_ids)]\n", " return df.loc[valid_ids]\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\jeuux\\Anaconda2\\envs\\ts_env\\lib\\site-packages\\sklearn\\base.py:318: UserWarning: Trying to unpickle estimator LabelEncoder from version 0.22.2.post1 when using version 0.22.1. This might lead to breaking code or invalid results. Use at your own risk.\n", " UserWarning)\n" ] } ], "source": [ "import joblib\n", "encoder_file = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\HAR_dataset_v1\\encoder_train.pkl\"\n", "dataset_v1 = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\HAR_Video\\Base_Dataset\"\n", "dataset_AL = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\HAR_Video\\HAR_AL_subsets\"\n", "# file_list_v1 = [os.path.join(dataset_v1,file) for file in os.listdir(dataset_v1) \n", "# if \"txt\" in file]\n", "file_list_al = [os.path.join(dataset_AL,folder,file) for folder in os.listdir(dataset_AL) \n", " for file in os.listdir(os.path.join(dataset_AL,folder))\n", " if \"txt\" in file]\n", "# file_list = file_list_v1 + file_list_al\n", "file_list = file_list_al\n", "encoder = joblib.load(encoder_file)\n", "for file in file_list:\n", " base_dir = os.path.split(file)[0]\n", " split = get_file_stem(file)\n", " dir_file = os.path.join(base_dir,f\"{split}_encodded.txt\")\n", " #read df\n", " df = read_metadata(file)\n", " # filter unseen\n", " df = filter_targets(df,encoder)\n", " #encode\n", " labels = encoder.transform(df[\"label\"])\n", " df[\"label\"] = labels\n", " df= df[[\"video_path\",\"frames\",\"label\"]]\n", " #save\n", " df_to_txt(df,dir_file)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file_list_v1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(encoder.classes_)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file_list[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def filter_targets(df,encoder):\n", " \n", " #case where we encounter new labeled class not covered on \n", " #original dataset\n", " targets = df.label.values\n", " try:\n", " targets_enc = encoder.transform(targets)\n", " return df\n", " except:\n", " original_classes = encoder.classes_\n", " actual_classes = set(targets)\n", " unseen_targets = [label for label in actual_classes if not(label in original_classes)]\n", " invalid_ids =[]\n", " for unseen_target in unseen_targets:\n", " invalid_ids.append(np.argwhere(targets==unseen_target))\n", " invalid_ids = np.concatenate(invalid_ids)\n", " valid_ids = [id for id in range(len(targets)) if not(id in invalid_ids)]\n", " return df.loc[valid_ids]\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }