{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Download data"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "!curl https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_5340.zip --output dataset.zip\n",
    "\n",
    "!unzip dataset.zip"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Split training/validation/testing subsets"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "!mkdir -p train/Cat\n",
    "!mkdir -p train/Dog\n",
    "!mkdir -p validation/Cat\n",
    "!mkdir -p validation/Dog\n",
    "!mkdir -p test/Cat\n",
    "!mkdir -p test/Dog"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "Then, using Keras image preprocessing library, we load the images, resize them into 200px per 200px files then save the resulting in the folders. We take a total of 1000 image per class: 600 for training, 200 for validation and 200 for testing."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# importing libraries\n",
    "from tensorflow.keras.utils import load_img, save_img\n",
    "from tensorflow.keras.preprocessing.image import ImageDataGenerator\n",
    "\n",
    "from os import listdir\n",
    "from tensorflow import keras\n",
    "\n",
    "# load dogs vs cats dataset, reshape into 200px x 200px image files\n",
    "classes = ['Cat','Dog']\n",
    "photos, labels = list(), list()\n",
    "files_per_class = 1000\n",
    "for classe in classes:\n",
    "  i = 0\n",
    "  # enumerate files in the directory\n",
    "  for file in listdir('PetImages/'+classe):\n",
    "    if file.endswith(\".jpg\"):\n",
    "      # determine class\n",
    "      output = 0.0\n",
    "      if classe == 'Dog':\n",
    "        output = 1.0\n",
    "      # load image\n",
    "      print(file)\n",
    "      photo = load_img('PetImages/'+classe +'/' + file, target_size=(200, 200))\n",
    "      if i < 600:\n",
    "        save_img('train/'+classe+'/'+file, photo)\n",
    "      elif i < 800:\n",
    "        save_img('validation/'+classe+'/'+file, photo)\n",
    "      else:\n",
    "        save_img('test/'+classe+'/'+file, photo)\n",
    "      i = i + 1\n",
    "      if i == files_per_class:\n",
    "        break"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Training & evaluating the model"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "def get_accuracy_for_batch_size(opt, batch_size):\n",
    "  print(\"Evaluating batch size \" + str(batch_size))\n",
    "  # prepare iterators\n",
    "  datagen = ImageDataGenerator(rescale=1.0/255.0)\n",
    "  train_it = datagen.flow_from_directory(directory='train/',\n",
    "    class_mode='binary', batch_size=batch_size, target_size=(200, 200))\n",
    "  val_it = datagen.flow_from_directory(directory='validation/',\n",
    "    class_mode='binary', batch_size=batch_size, target_size=(200, 200))\n",
    "  test_it = datagen.flow_from_directory('test/',\n",
    "    class_mode='binary', batch_size=batch_size, target_size=(200, 200))\n",
    "  \n",
    "  # create model\n",
    "  model = keras.Sequential()\n",
    "  model.add(keras.layers.Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same', input_shape=(200, 200, 3)))\n",
    "  model.add(keras.layers.MaxPooling2D((2, 2)))\n",
    "  model.add(keras.layers.Flatten())\n",
    "  model.add(keras.layers.Dense(128, activation='relu', kernel_initializer='he_uniform'))\n",
    "  model.add(keras.layers.Dense(1, activation='sigmoid'))\n",
    "\n",
    "  # compile model\n",
    "  model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])\n",
    "\n",
    "  # train the model\n",
    "  model.fit(train_it,\n",
    "                      validation_data = train_it,\n",
    "                      steps_per_epoch = train_it.n//train_it.batch_size,\n",
    "                      validation_steps = val_it.n//val_it.batch_size,\n",
    "                      epochs=5, verbose=0)\n",
    "  \n",
    "  # evaluate model\n",
    "  _, acc = model.evaluate(test_it, steps=len(test_it), verbose=0)\n",
    "  return acc"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Comparing optimizers: SGD vs Adam"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "batch_size_array = [16,32,64,128]\n",
    "accuracies_sgd = []\n",
    "optimizer = \"sgd\"\n",
    "for bs in batch_size_array:\n",
    "  accuracies_sgd.append(get_accuracy_for_batch_size(optimizer, bs))\n",
    "\n",
    "accuracies_adam = []\n",
    "optimizer = \"adam\"\n",
    "for bs in batch_size_array:\n",
    "  accuracies_adam.append(get_accuracy_for_batch_size(optimizer, bs))"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "import matplotlib.pyplot as plt\n",
    "plt.plot(batch_size_array, accuracies_sgd, color='green',label='SGD')\n",
    "plt.plot(batch_size_array, accuracies_adam, color='red',label='Adam')\n",
    "plt.show()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Effect of data augmentation"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "!mkdir -p train_augmented/Cat\n",
    "!mkdir -p train_augmented/Dog\n",
    "!mkdir -p validation_augmented/Cat\n",
    "!mkdir -p validation_augmented/Dog"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "Image preprocessing"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "from tensorflow import image\n",
    "# load dogs vs cats dataset, reshape into 200x200 files\n",
    "classes = ['Cat','Dog']\n",
    "photos, labels = list(), list()\n",
    "files_per_class = 1000\n",
    "for classe in classes:\n",
    "  i = 0\n",
    "  # enumerate files in the directory\n",
    "  for file in listdir('PetImages/'+classe):\n",
    "    if file.endswith(\".jpg\"):\n",
    "      # determine class\n",
    "      output = 0.0\n",
    "      if classe == 'Dog':\n",
    "        output = 1.0\n",
    "      # load image\n",
    "      photo = load_img('PetImages/'+classe +'/' + file, target_size=(200, 200))\n",
    "      photo_resized = photo.resize((250,250))\n",
    "      photo_cropped = photo_resized.crop((0,0, 200, 200))\n",
    "      if i < 600:\n",
    "        save_img('train_augmented/'+classe+'/'+file, photo)\n",
    "        save_img('train_augmented/'+classe+'/augmented_'+file, photo_cropped)\n",
    "      elif i < 800:\n",
    "        save_img('validation_augmented/'+classe+'/'+file, photo)\n",
    "        save_img('validation_augmented/'+classe+'/augmented_'+file, photo_cropped)\n",
    "      else:\n",
    "        save_img('test/'+classe+'/'+file, photo)\n",
    "      i = i + 1\n",
    "      if i == files_per_class:\n",
    "        break"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "def get_accuracy_for_batch_size_augmented_data(train_folder,validation_folder, batch_size):\n",
    "  print(\"Evaluating batch size \" + str(batch_size))\n",
    "  # prepare iterators\n",
    "  datagen = ImageDataGenerator(rescale=1.0/255.0)\n",
    "  train_it = datagen.flow_from_directory(directory=train_folder,\n",
    "    class_mode='binary', batch_size=batch_size, target_size=(200, 200))\n",
    "  val_it = datagen.flow_from_directory(directory=validation_folder,\n",
    "    class_mode='binary', batch_size=batch_size, target_size=(200, 200))\n",
    "  test_it = datagen.flow_from_directory('test/',\n",
    "    class_mode='binary', batch_size=batch_size, target_size=(200, 200))\n",
    "  \n",
    "  # create model\n",
    "  model = keras.Sequential()\n",
    "  model.add(keras.layers.Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same', input_shape=(200, 200, 3)))\n",
    "  model.add(keras.layers.MaxPooling2D((2, 2)))\n",
    "  model.add(keras.layers.Flatten())\n",
    "  model.add(keras.layers.Dense(128, activation='relu', kernel_initializer='he_uniform'))\n",
    "  model.add(keras.layers.Dense(1, activation='sigmoid'))\n",
    "\n",
    "  # compile model\n",
    "  model.compile(optimizer=\"adam\", loss='binary_crossentropy', metrics=['accuracy'])\n",
    "\n",
    "  # train the model\n",
    "  model.fit(train_it,\n",
    "                      validation_data = train_it,\n",
    "                      steps_per_epoch = train_it.n//train_it.batch_size,\n",
    "                      validation_steps = val_it.n//val_it.batch_size,\n",
    "                      epochs=5, verbose=0)\n",
    "  \n",
    "  # evaluate model\n",
    "  _, acc = model.evaluate(test_it, steps=len(test_it), verbose=0)\n",
    "  return acc"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Evaluate the accuracy for different batch sizes"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "batch_size_array = [16,32,64,128,256]\n",
    "accuracies_standard_data = []\n",
    "for bs in batch_size_array:\n",
    "  accuracies_standard_data.append(get_accuracy_for_batch_size_augmented_data(\"train/\",\"validation/\", bs))\n",
    "\n",
    "accuracies_augmented_data = []\n",
    "for bs in batch_size_array:\n",
    "  accuracies_augmented_data.append(get_accuracy_for_batch_size_augmented_data(\"train_augmented/\",\"validation_augmented/\", bs))"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "import matplotlib.pyplot as plt\n",
    "plt.plot(batch_size_array, accuracies_standard_data, color='green',label='Standard data')\n",
    "plt.plot(batch_size_array, accuracies_augmented_data, color='red',label='Augmented data')\n",
    "plt.legend(bbox_to_anchor =(1.25, 0.8))\n",
    "plt.show()"
   ],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}