{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os \n", "import numpy as np\n", "import shutil\n", "import json\n", "from tqdm import tqdm\n", "import numpy as np\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import StratifiedKFold,StratifiedShuffleSplit\n", "from decord import VideoReader" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "def get_file_stem(path):\n", " base=os.path.basename(path)\n", " return os.path.splitext(base)[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "root_path = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Participants\"\n", "participants = os.listdir(root_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "processed_participants = ['1205a',\n", " '0205b',\n", " '0605b',\n", " '2504g',\n", " '020419c',\n", " '0705b',\n", " '2504d',\n", " '020419f',\n", " '0905a',\n", " '2204c',\n", " '0205e',\n", " '010419c',\n", " '1205b',\n", " '2504e',\n", " '0404b',\n", " '1105a',\n", " '0905b',\n", " '2304b',\n", " '1105d',\n", " '2604a',\n", " '1105c',\n", " '1005c',\n", " '1005b',\n", " '1105e',\n", " '2504c',\n", " '020419e']" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "final_participants= [participant for participant in participants if not(participant in processed_participants)]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "new_root = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\Video_Dartaset_Pictures\"" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 42.16it/s]\n" ] } ], "source": [ "#get participants with videos\n", "video_folder = \"ArtworkClips\" \n", "\n", "dfs = {\"video_path\":[],\"label\":[],\"frames\":[]}\n", "for participant in tqdm(final_participants):\n", " cd = os.path.join(root_path,participant)\n", "# dest_dir = os.path.join(new_root,participant,video_folder)\n", "\n", " if(video_folder in os.listdir(cd)):\n", "# if(not(os.path.isdir(dest_dir))):\n", "# os.makedirs(dest_dir)\n", "# #copy folder to new_dataset_folder\n", "# dir_to_copy = os.path.join(cd,video_folder)\n", "# for file in os.listdir(dir_to_copy):\n", "# dest = os.path.join(dest_dir,file)\n", "# file_dir =os.path.join(dir_to_copy,file)\n", "# shutil.copyfile(file_dir,dest)\n", " try:\n", " df_path = os.path.join(cd,\"FullDataset\",f\"video_dataset_{participant}.txt\")\n", " df = json.load(open(df_path))\n", " for key in dfs.keys():\n", " dfs[key].append(df[key])\n", " except: \n", " print(f\"{participant} not processed\")\n", "for key in dfs.keys():\n", " dfs[key]= np.concatenate(dfs[key])\n", " \n", " \n" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame.from_dict(dfs)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "df[\"participant\"] = df[\"video_path\"].apply(lambda row: row.split(\"/\")[0])" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
video_pathlabelframesparticipant
0010419e/ArtworkClips/010419e_10.mp415280010419e
1010419e/ArtworkClips/010419e_11.mp44341010419e
2010419e/ArtworkClips/010419e_12.mp44347010419e
3010419e/ArtworkClips/010419e_14.mp45295010419e
4010419e/ArtworkClips/010419e_17.mp40354010419e
...............
20303004e/ArtworkClips/3004e_93.mp4162923004e
20313004e/ArtworkClips/3004e_94.mp4162783004e
20323004e/ArtworkClips/3004e_95.mp4162873004e
20333004e/ArtworkClips/3004e_96.mp402953004e
20343004e/ArtworkClips/3004e_98.mp4162893004e
\n", "

2035 rows × 4 columns

\n", "
" ], "text/plain": [ " video_path label frames participant\n", "0 010419e/ArtworkClips/010419e_10.mp4 15 280 010419e\n", "1 010419e/ArtworkClips/010419e_11.mp4 4 341 010419e\n", "2 010419e/ArtworkClips/010419e_12.mp4 4 347 010419e\n", "3 010419e/ArtworkClips/010419e_14.mp4 5 295 010419e\n", "4 010419e/ArtworkClips/010419e_17.mp4 0 354 010419e\n", "... ... ... ... ...\n", "2030 3004e/ArtworkClips/3004e_93.mp4 16 292 3004e\n", "2031 3004e/ArtworkClips/3004e_94.mp4 16 278 3004e\n", "2032 3004e/ArtworkClips/3004e_95.mp4 16 287 3004e\n", "2033 3004e/ArtworkClips/3004e_96.mp4 0 295 3004e\n", "2034 3004e/ArtworkClips/3004e_98.mp4 16 289 3004e\n", "\n", "[2035 rows x 4 columns]" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['C:\\\\Users\\\\jeuux\\\\Desktop\\\\Carrera\\\\MoAI\\\\TFM\\\\AnnotatedData\\\\FinalDatasets\\\\Datasets\\\\Video_Dataset_meta\\\\encoder.pkl']" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import joblib\n", "encoder_path = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\Video_Dataset_meta\\encoder.pkl\"\n", "labels = df[\"label\"].values\n", "encoder = LabelEncoder().fit(labels)\n", "labels_enc = encoder.transform(labels)\n", "joblib.dump(encoder,encoder_path)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "labels_enc = encoder.transform(df[\"label\"])\n", "df[\"label\"] = labels_enc" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "subsets_ids = [(id1,id1+8) for id1 in range(0,len(final_participants),8)]" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "def get_subset_df(df,participant_subset):\n", " idx_list = []\n", " for idx in df.index:\n", " if(df.loc[idx,\"participant\"] in participant_subset):\n", " idx_list.append(idx)\n", " df[\"video_path\"] = df[\"video_path\"].apply(lambda row: get_file_stem(row))\n", " return df.loc[idx_list,[\"video_path\",\"label\",\"frames\"]]\n", "def df_to_txt(df,dir_path):\n", " df.to_csv(dir_path, header=None, index=None, sep=' ', mode='a')" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "base_dir = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\Frames_Dataset_AL\"\n", "for idx, (id1,id2) in enumerate(subsets_ids):\n", " subset_path = os.path.join(base_dir,f\"Dataset_v{idx}\")\n", " if not(os.path.isdir(subset_path)):\n", " os.makedirs(subset_path)\n", " dataset_file = os.path.join(subset_path,\"Train.txt\")\n", " df_subset = get_subset_df(df,final_participants[id1:id2])\n", " df_to_txt(df_subset,dataset_file)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "splits[split]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "splits = {\"train\":train_index,\"val\":val_index,\"test\":test_index}\n", "root_path = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\Video_Dataset_meta\"\n", "for split in splits.keys():\n", " df_split = df.loc[splits[split]]\n", " dir_path = os.path.join(root_path,f\"{split}.txt\")\n", " df_to_txt(df_split,dir_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.loc[splits[\"train\"]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_video = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\Video_Dartaset_Pictures\\0205e\\ArtworkClips\\0205e_3.mp4\" " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vr = VideoReader(test_video)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n_frames = len(vr)\n", "base_test = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\Video_Dartaset_Pictures\\0205e\\test\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from PIL import Image" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#read df\n", "df_path = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\Video_Dataset_meta\\test.txt\"\n", "df = pd.read_csv(df_path,sep=\" \",header= None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.columns = [\"video_path\",\"label\",\"frames\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_file_stem(path):\n", " base=os.path.basename(path)\n", " return os.path.splitext(base)[0]\n", "root = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Participants\"" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }