{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HAR dataset preparation\n",
    "\n",
    "Builds the dataset files derived from the annotated accelerometer data:\n",
    "\n",
    "1. a random train/val/test split of the pooled `HAR_Dataset` arrays,\n",
    "2. a participant-wise Train/Val split (`HAR_Dataset_participants`) with its label encoder and scaler,\n",
    "3. merged clip-metadata files (`HAR_dataset_v1`, `HAR_AL`),\n",
    "4. a helper that concatenates the raw tables of each participant.\n",
    "\n",
    "Outputs were cleared because the code changed; re-run with **Restart Kernel and Run All**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "import joblib\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root of the annotated data. Set the HAR_DATA_ROOT environment variable to run\n",
    "# the notebook on another machine; every other path is derived from this one.\n",
    "DATA_ROOT = os.environ.get(\n",
    "    \"HAR_DATA_ROOT\",\n",
    "    r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\",\n",
    ")\n",
    "ACCEL_DIR = os.path.join(DATA_ROOT, \"Accelerometer_Data\")\n",
    "ACCEL_DATASETS_DIR = os.path.join(ACCEL_DIR, \"Datasets\")\n",
    "FINAL_DIR = os.path.join(DATA_ROOT, \"FinalDatasets\")\n",
    "FINAL_DATASETS_DIR = os.path.join(FINAL_DIR, \"Datasets\")\n",
    "\n",
    "# Number of leading participant folders used for training; the rest is validation.\n",
    "N_TRAIN_PARTICIPANTS = 25\n",
    "# Row width used to flatten the training array before fitting the scaler.\n",
    "# NOTE(review): presumably the last axis of the array holds 9 sensor channels - confirm.\n",
    "N_CHANNELS = 9"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random split of the pooled dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "targets_path = os.path.join(ACCEL_DATASETS_DIR, \"HAR_Dataset\", \"targets.npy\")\n",
    "dataset_path = os.path.join(ACCEL_DATASETS_DIR, \"HAR_Dataset\", \"data.npy\")\n",
    "\n",
    "# The split below needs the arrays themselves, not only their paths.\n",
    "data = np.load(dataset_path)\n",
    "targets = np.load(targets_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_data(data, targets, test_size, random_state=42):\n",
    "    \"\"\"Randomly split ``data`` and ``targets`` into two aligned parts.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data, targets : equally long arrays that can be indexed with a list of positions.\n",
    "    test_size : forwarded to ``train_test_split``.\n",
    "    random_state : seed of the shuffle, forwarded to ``train_test_split``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    samples_train, targets_train, samples_test, targets_test\n",
    "    \"\"\"\n",
    "    # Splitting positions instead of the arrays keeps samples and targets aligned.\n",
    "    train_idx, test_idx = train_test_split(\n",
    "        range(len(data)), test_size=test_size, random_state=random_state\n",
    "    )\n",
    "    return data[train_idx], targets[train_idx], data[test_idx], targets[test_idx]\n",
    "\n",
    "\n",
    "def save_data(folder, data, targets):\n",
    "    \"\"\"Save ``data`` and ``targets`` as ``data.npy`` / ``targets.npy`` in ``folder``.\n",
    "\n",
    "    The folder is created when it does not exist yet.\n",
    "    \"\"\"\n",
    "    os.makedirs(folder, exist_ok=True)\n",
    "    np.save(os.path.join(folder, \"data\"), data)\n",
    "    np.save(os.path.join(folder, \"targets\"), targets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A test_size of 0.2 holds out the test part; 0.1 of the remainder becomes validation.\n",
    "samples_trainval, targets_trainval, samples_test, targets_test = split_data(data, targets, 0.2)\n",
    "samples_train, targets_train, samples_val, targets_val = split_data(samples_trainval, targets_trainval, 0.1)\n",
    "\n",
    "assert len(samples_train) + len(samples_val) + len(samples_test) == len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_split_dir = os.path.join(ACCEL_DATASETS_DIR, \"HAR_Dataset_raw\")\n",
    "train_folder = os.path.join(raw_split_dir, \"Train\")\n",
    "test_folder = os.path.join(raw_split_dir, \"Test\")\n",
    "val_folder = os.path.join(raw_split_dir, \"Val\")\n",
    "\n",
    "save_data(train_folder, samples_train, targets_train)\n",
    "save_data(test_folder, samples_test, targets_test)\n",
    "save_data(val_folder, samples_val, targets_val)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Participant-wise split\n",
    "\n",
    "Whole participants are assigned to a subset, so no participant contributes to both Train and Val. The last recorded run of this notebook listed 32 participant folders."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "participants_dir = os.path.join(ACCEL_DIR, \"Participants\")\n",
    "\n",
    "# Sorted so the positional split below does not depend on the order os.listdir returns.\n",
    "participant_folders = [\n",
    "    os.path.join(participants_dir, participant)\n",
    "    for participant in sorted(os.listdir(participants_dir))\n",
    "]\n",
    "len(participant_folders)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "participants_train = participant_folders[:N_TRAIN_PARTICIPANTS]\n",
    "participants_val = participant_folders[N_TRAIN_PARTICIPANTS:]\n",
    "# NOTE(review): the original kept a three-way variant commented out\n",
    "# (val = participant_folders[25:30], test = participant_folders[-5:]);\n",
    "# `participants_test` is therefore not defined anywhere in this notebook.\n",
    "participants_val"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_file_stem(path):\n",
    "    \"\"\"Return the last component of ``path`` without its extension.\"\"\"\n",
    "    return os.path.splitext(os.path.basename(path))[0]\n",
    "\n",
    "\n",
    "def read_metadata(df_path):\n",
    "    \"\"\"Read a space-separated, header-less file into a DataFrame.\n",
    "\n",
    "    The columns are named ``video_path``, ``frames`` and ``label``, in that order.\n",
    "    \"\"\"\n",
    "    df = pd.read_csv(df_path, sep=\" \", header=None)\n",
    "    df.columns = [\"video_path\", \"frames\", \"label\"]\n",
    "    return df\n",
    "\n",
    "\n",
    "def df_to_txt(df, dir_path):\n",
    "    \"\"\"Write ``df`` to the file ``dir_path`` as space-separated text, without header or index.\n",
    "\n",
    "    The file is overwritten: appending duplicated every row each time a cell was re-run.\n",
    "    \"\"\"\n",
    "    df.to_csv(dir_path, header=False, index=False, sep=\" \", mode=\"w\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _merge_clip_metadata(data_folders, subset, dataset_dir):\n",
    "    \"\"\"Merge the JSON clip metadata of every folder into ``dataset_dir/subset.txt``.\"\"\"\n",
    "    merged = {\"video_path\": [], \"label\": [], \"frames\": []}\n",
    "    for folder in data_folders:\n",
    "        participant = get_file_stem(folder)\n",
    "        metadata_path = os.path.join(folder, \"FullDataset\", f\"HARClips_dataset_{participant}.txt\")\n",
    "        with open(metadata_path, \"rb\") as f:\n",
    "            participant_metadata = json.load(f)\n",
    "        for key, values in participant_metadata.items():\n",
    "            # A key other than the three above raises KeyError.\n",
    "            merged[key].extend(values)\n",
    "\n",
    "    os.makedirs(dataset_dir, exist_ok=True)\n",
    "    # NOTE(review): columns are written as video_path, label, frames, while\n",
    "    # read_metadata names them video_path, frames, label - confirm the order\n",
    "    # expected by whatever consumes this file.\n",
    "    df_to_txt(pd.DataFrame.from_dict(merged), os.path.join(dataset_dir, f\"{subset}.txt\"))\n",
    "    print(\"Finished\")\n",
    "    return merged\n",
    "\n",
    "\n",
    "def _merge_segment_arrays(data_folders, subset, dataset_dir, dataset_root):\n",
    "    \"\"\"Concatenate the arrays of every folder and save them under ``dataset_dir/subset``.\"\"\"\n",
    "    if not data_folders:\n",
    "        # np.concatenate would raise the same exception type with a less helpful message.\n",
    "        raise ValueError(f\"no participant folders given for subset {subset!r}\")\n",
    "    segments = np.concatenate(\n",
    "        [np.load(os.path.join(folder, dataset_root, \"segments_data.npy\")) for folder in data_folders]\n",
    "    )\n",
    "    labels = np.concatenate(\n",
    "        [np.load(os.path.join(folder, dataset_root, \"targets_data.npy\")) for folder in data_folders]\n",
    "    )\n",
    "    save_data(os.path.join(dataset_dir, subset), segments, labels)\n",
    "    return segments, labels\n",
    "\n",
    "\n",
    "def get_data(data_folders, subset, dataset_dir, dataset_root=None):\n",
    "    \"\"\"Build one subset of a dataset from a list of participant folders.\n",
    "\n",
    "    With ``dataset_root`` (4-argument form) the ``segments_data.npy`` and\n",
    "    ``targets_data.npy`` arrays stored in ``folder/dataset_root`` are\n",
    "    concatenated and saved as ``data.npy`` / ``targets.npy`` in\n",
    "    ``dataset_dir/subset``; returns ``(segments, labels)``.\n",
    "\n",
    "    Without it (3-argument form) the JSON clip metadata stored in\n",
    "    ``folder/FullDataset`` is merged and written to ``dataset_dir/subset.txt``;\n",
    "    returns the merged dict.\n",
    "\n",
    "    Both forms used to be separate functions with the same name, so the\n",
    "    second definition silently replaced the first.\n",
    "    \"\"\"\n",
    "    if dataset_root is None:\n",
    "        return _merge_clip_metadata(data_folders, subset, dataset_dir)\n",
    "    return _merge_segment_arrays(data_folders, subset, dataset_dir, dataset_root)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_root = \"CompleteData\"\n",
    "dataset_dir = os.path.join(ACCEL_DATASETS_DIR, \"HAR_Dataset_participants\")\n",
    "\n",
    "participant_subsets = {\"Train\": participants_train, \"Val\": participants_val}\n",
    "for subset, subset_folders in participant_subsets.items():\n",
    "    get_data(subset_folders, subset, dataset_dir, dataset_root)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Label encoder and scaler\n",
    "\n",
    "Both are fitted on the Train subset only and stored next to it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dir = os.path.join(dataset_dir, \"Train\")\n",
    "train_targets = np.load(os.path.join(train_dir, \"targets.npy\"))\n",
    "train_data = np.load(os.path.join(train_dir, \"data.npy\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "encoder = LabelEncoder().fit(train_targets)\n",
    "# The array is flattened to rows of N_CHANNELS values, so the scaler fits one\n",
    "# mean and one variance per column of that flattened view.\n",
    "scaler = StandardScaler().fit(train_data.reshape(-1, N_CHANNELS))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "joblib.dump(encoder, os.path.join(train_dir, \"encoder.pkl\"))\n",
    "# Trailing semicolon: do not echo the absolute path returned by joblib.dump.\n",
    "joblib.dump(scaler, os.path.join(train_dir, \"scaler.pkl\"));"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clip-metadata datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample per-participant metadata file. Despite the .txt extension it is parsed\n",
    "# as JSON, the same way get_data parses these files.\n",
    "df_path = os.path.join(FINAL_DIR, \"Participants\", \"0304d\", \"FullDataset\", \"HARClips_dataset_0304d.txt\")\n",
    "with open(df_path, \"rb\") as f:\n",
    "    clip_metadata = json.load(f)\n",
    "\n",
    "clip_metadata[\"video_path\"][:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clips_dataset_dir = os.path.join(FINAL_DATASETS_DIR, \"HAR_dataset_v1\")\n",
    "\n",
    "# NOTE(review): the original also built a \"Test\" subset from `participants_test`,\n",
    "# which is never defined; add it back here once a test split exists.\n",
    "# NOTE(review): the sample file above lives under FinalDatasets/Participants, whereas\n",
    "# these folders come from Accelerometer_Data/Participants - confirm the intended source.\n",
    "clip_subsets = {\"Train\": participants_train, \"Val\": participants_val}\n",
    "merged_metadata = {\n",
    "    subset: get_data(subset_folders, subset, clips_dataset_dir)\n",
    "    for subset, subset_folders in clip_subsets.items()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_clips = pd.DataFrame.from_dict(merged_metadata[\"Val\"])\n",
    "val_clips.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### `HAR_AL` batches\n",
    "\n",
    "One `HAR_dataset_AL_v{index}` folder per batch of participant positions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "al_base_dir = os.path.join(FINAL_DATASETS_DIR, \"HAR_AL\")\n",
    "AL_FIRST, AL_STOP, AL_BATCH_SIZE = 30, 68, 5\n",
    "\n",
    "# NOTE(review): positions 30-67 require at least 68 participant folders; the last\n",
    "# recorded run listed 32, so most slices would be empty - confirm which participants\n",
    "# directory this step expects.\n",
    "for index, start in enumerate(range(AL_FIRST, AL_STOP, AL_BATCH_SIZE)):\n",
    "    # The last batch is shorter: it covers positions 65-67 only.\n",
    "    stop = min(start + AL_BATCH_SIZE, AL_STOP)\n",
    "    al_dataset_dir = os.path.join(al_base_dir, f\"HAR_dataset_AL_v{index}\")\n",
    "    get_data(participant_folders[start:stop], \"Train\", al_dataset_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Concatenate the raw tables of each participant\n",
    "\n",
    "The original cell read the participants root and the data type from variables that are not defined anywhere in this notebook, so it is provided as a function and no call is made here. Call `concat_participant_tables(participants_root, data_type)` with the values that apply."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def concat_participant_tables(participants_root, data_type):\n",
    "    \"\"\"Concatenate every CSV sub-table of each participant into one CSV file.\n",
    "\n",
    "    For each participant folder in ``participants_root`` the CSV files in its\n",
    "    ``data_type`` sub-folder are concatenated and written to\n",
    "    ``{data_type}_raw_{participant}.csv`` in that same sub-folder.\n",
    "    \"\"\"\n",
    "    for participant in tqdm(sorted(os.listdir(participants_root))):\n",
    "        folder = os.path.join(participants_root, participant, data_type)\n",
    "        output_name = \"{0}_raw_{1}.csv\".format(data_type, participant)\n",
    "        tables = [\n",
    "            pd.read_csv(os.path.join(folder, table))\n",
    "            for table in sorted(os.listdir(folder))\n",
    "            # The output lives in the input folder; skip it so a re-run does not ingest it.\n",
    "            if table != output_name\n",
    "        ]\n",
    "        pd.concat(tables).to_csv(os.path.join(folder, output_name), index=False)\n",
    "        print(\"Save table of participant : {}\".format(participant))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}