{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook builds an organised copy of the original dataset. Images are read from `original_dataset_dir` (one sub-folder per category) and copied into `base_dir`, laid out as `<base_dir>/<train|validation>/<category>/`. For every category, `validation_size` randomly chosen images go to the validation set and all remaining images go to the training set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw data directory: one sub-folder per category, not yet split into train/validation sets\n",
    "original_dataset_dir = 'data_raw'\n",
    "\n",
    "# The directory where we will store our dataset, divided into train/validation directories, and further into category directories\n",
    "# NOTE(review): the outputs saved in this notebook show paths under 'data_ready', so they were presumably\n",
    "# produced with a different base_dir value - re-run all cells to refresh them.\n",
    "base_dir = 'data'\n",
    "\n",
    "# Number of images per category held out for validation; the rest are used for training\n",
    "validation_size = 100\n",
    "\n",
    "# Seed for the random train/validation split\n",
    "random_seed = 12"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created directory: data_ready\n",
      "Created directory: data_ready/train/alien\n",
      "Created directory: data_ready/train/predator\n",
      "Created directory: data_ready/validation/alien\n",
      "Created directory: data_ready/validation/predator\n"
     ]
    }
   ],
   "source": [
    "categories = ['alien', 'predator']\n",
    "\n",
    "# We want to keep our data organized into train and validation folders, each with separate category subfolders\n",
    "str_train_val = ['train', 'validation']\n",
    "\n",
    "if not os.path.exists(base_dir):\n",
    "    # makedirs (unlike mkdir) also works when base_dir is a nested path whose parents are missing\n",
    "    os.makedirs(base_dir, exist_ok=True)\n",
    "    print('Created directory: ', base_dir)\n",
    "\n",
    "for dir_type in str_train_val:\n",
    "    for category in categories:\n",
    "        dir_type_category = os.path.join(base_dir, dir_type, category)\n",
    "\n",
    "        if not os.path.exists(dir_type_category):\n",
    "            # Creates the intermediate <base_dir>/<dir_type> level as well\n",
    "            os.makedirs(dir_type_category, exist_ok=True)\n",
    "            print('Created directory: ', dir_type_category)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "alien: 447 files\n",
      "alien, train: 347 files copied\n",
      "alien, validation: 100 files copied\n",
      "predator: 447 files\n",
      "predator, train: 347 files copied\n",
      "predator, validation: 100 files copied\n"
     ]
    }
   ],
   "source": [
    "directories_dict = {}  # To store directory paths for data subsets.\n",
    "\n",
    "np.random.seed(random_seed)\n",
    "for cat in categories:\n",
    "    cat_dir = os.path.join(original_dataset_dir, cat)\n",
    "\n",
    "    # os.listdir returns entries in arbitrary order: sort them so that the seeded split picks the same\n",
    "    # files on every run/machine, and skip entries that are not regular files (copyfile cannot copy them).\n",
    "    list_of_images = np.array(sorted(\n",
    "        fname for fname in os.listdir(cat_dir)\n",
    "        if os.path.isfile(os.path.join(cat_dir, fname))\n",
    "    ))\n",
    "    print(\"{}: {} files\".format(cat, len(list_of_images)))\n",
    "\n",
    "    if len(list_of_images) < validation_size:\n",
    "        raise ValueError(\"{}: {} files found, at least {} are needed for the validation set\".format(\n",
    "            cat_dir, len(list_of_images), validation_size))\n",
    "\n",
    "    # Draw the validation indexes without replacement; every other index goes to the training set.\n",
    "    validation_indexes = np.sort(np.random.choice(len(list_of_images), size=validation_size, replace=False))\n",
    "    indexes = {\n",
    "        'validation': validation_indexes,\n",
    "        # setdiff1d returns the remaining indexes in sorted order\n",
    "        'train': np.setdiff1d(np.arange(len(list_of_images)), validation_indexes),\n",
    "    }\n",
    "\n",
    "    for phase in str_train_val:\n",
    "        phase_cat_dir = os.path.join(base_dir, phase, cat)\n",
    "        # Copies are renamed to consecutive numbers: 0.jpg, 1.jpg, ...\n",
    "        for i, fname in enumerate(list_of_images[indexes[phase]]):\n",
    "            source = os.path.join(cat_dir, fname)\n",
    "            destination = os.path.join(phase_cat_dir, str(i)+\".jpg\")\n",
    "            shutil.copyfile(source, destination)\n",
    "        print(\"{}, {}: {} files copied\".format(cat, phase, len(indexes[phase])))\n",
    "        directories_dict[phase + \"_\" + cat + \"_dir\"] = phase_cat_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'train_alien_dir': 'data_ready/train/alien',\n",
       " 'validation_alien_dir': 'data_ready/validation/alien',\n",
       " 'train_predator_dir': 'data_ready/train/predator',\n",
       " 'validation_predator_dir': 'data_ready/validation/predator'}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "directories_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total training alien images: 347\n",
      "Total training predator images: 347\n",
      "--------------------------------\n",
      "Total validation alien images: 100\n",
      "Total validation predator images: 100\n"
     ]
    }
   ],
   "source": [
    "# Human-readable label of each subset, used in the summary lines below\n",
    "phase_labels = {'train': 'training', 'validation': 'validation'}\n",
    "\n",
    "# Count the files actually present in every <phase>/<category> directory\n",
    "for position, phase in enumerate(str_train_val):\n",
    "    if position > 0:\n",
    "        print(\"-\"*32)\n",
    "    for cat in categories:\n",
    "        n_files = len(os.listdir(directories_dict[phase + \"_\" + cat + \"_dir\"]))\n",
    "        print('Total {} {} images:'.format(phase_labels[phase], cat), n_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}