{
  "metadata": {
    "kernelspec": {
      "language": "python",
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "version": "3.6.4",
      "file_extension": ".py",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "name": "python",
      "mimetype": "text/x-python"
    },
    "colab": {
      "provenance": []
    },
    "accelerator": "GPU"
  },
  "nbformat_minor": 0,
  "nbformat": 4,
  "cells": [
{
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/programminghistorian/jekyll/blob/gh-pages/assets/computer-vision-deep-learning-pt1-2/computer-vision-deep-learning-pt1-2.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [],
      "metadata": {
        "id": "Fjnm_wg5Axz4"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Authors' note: The cells below set up the data to mirror the format found on Kaggle, which is where this notebook was originally written. If you are running the notebook on your own machine or server, you will probably want a different directory structure (note that we can't provide support for this approach to running the lesson material)."
      ],
      "metadata": {
        "id": "mnPMMn3D2pGO"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "%%capture\n",
        "# Recreate the Kaggle-style directory layout used by the rest of the notebook and\n",
        "# download the two lesson datasets (newspaper ads, multi-label photos) from Zenodo.\n",
        "# - `-p` comes before the path so the command also works with non-GNU `mkdir`.\n",
        "# - URLs are quoted so the shell never treats the `?` in `?download=1` as a glob.\n",
        "# - `unzip -o` overwrites without prompting, so re-running this cell cannot hang\n",
        "#   on a hidden prompt (the output is swallowed by %%capture).\n",
        "!mkdir -p ../input/computer-vision-for-the-humanities-ph/ads_data/ads_data/\n",
        "!wget 'https://zenodo.org/record/5838410/files/ads_upsampled.csv?download=1' -O ../input/computer-vision-for-the-humanities-ph/ads_data/ads_data/ads_upsampled.csv\n",
        "!mkdir -p ../input/computer-vision-for-the-humanities-ph/ads_data/ads_data/images/\n",
        "!wget -O images.zip 'https://zenodo.org/record/5838410/files/images.zip?download=1'\n",
        "!unzip -o images.zip -d ../input/computer-vision-for-the-humanities-ph/ads_data/ads_data/images/\n",
        "!mkdir -p ../input/computer-vision-for-the-humanities-ph/photos_multi/photos_multi/\n",
        "!wget 'https://zenodo.org/record/4487141/files/multi_label.csv?download=1' -O ../input/computer-vision-for-the-humanities-ph/photos_multi/photos_multi/multi_label.csv\n",
        "!wget -O photo_images.zip 'https://zenodo.org/record/4487141/files/images.zip?download=1'\n",
        "!mkdir -p ../input/computer-vision-for-the-humanities-ph/photos_multi/photos_multi/photo_images\n",
        "!unzip -o photo_images.zip -d ../input/computer-vision-for-the-humanities-ph/photos_multi/photos_multi/photo_images"
      ],
      "metadata": {
        "id": "mZ6kD4-AtQOw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Upgrade fastai in the environment of the running kernel.\n",
        "# `%pip` targets the kernel's own Python, whereas `!pip` runs whichever pip is first on PATH.\n",
        "%pip install --upgrade fastai"
      ],
      "metadata": {
        "id": "DBpY2b6dutRK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Computer Vision for the Humanities: an Introduction to Deep Learning for Image Classification\n",
        "\n",
        "This notebook contains the code you'll need to run in both Part 1 and Part 2 of this lesson."
      ],
      "metadata": {
        "id": "L9UuXbPZtGlW"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Part 1"
      ],
      "metadata": {
        "id": "3AwPRMvttGlY"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Creating an Image Classifier in fastai"
      ],
      "metadata": {
        "id": "UbyxOsmVtGlb"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from fastai.vision.all import *"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:59:49.083953Z",
          "iopub.execute_input": "2022-04-11T11:59:49.084274Z",
          "iopub.status.idle": "2022-04-11T11:59:52.004748Z",
          "shell.execute_reply.started": "2022-04-11T11:59:49.084227Z",
          "shell.execute_reply": "2022-04-11T11:59:52.004023Z"
        },
        "trusted": true,
        "id": "JHjSMawrtGlc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%matplotlib inline\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8' and newer\n",
        "# releases no longer accept the old 'seaborn' name, so use whichever name this\n",
        "# installation provides.\n",
        "seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'\n",
        "plt.style.use(seaborn_style)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:59:52.720242Z",
          "iopub.execute_input": "2022-04-11T11:59:52.720702Z",
          "iopub.status.idle": "2022-04-11T11:59:52.729297Z",
          "shell.execute_reply.started": "2022-04-11T11:59:52.720663Z",
          "shell.execute_reply": "2022-04-11T11:59:52.727987Z"
        },
        "trusted": true,
        "id": "sDYg4CDJtGle"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Loading the Data\n"
      ],
      "metadata": {
        "id": "3JVVHm2ZtGlf"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "ad_data = ImageDataLoaders.from_csv(\n",
        "    path=\"../input/computer-vision-for-the-humanities-ph/ads_data/ads_data/\",  # root path to csv file and image directory\n",
        "    csv_fname=\"ads_upsampled.csv\",  # the name of our csv file\n",
        "    folder=\"images/\",  # the folder where our images are stored\n",
        "    fn_col=\"file\",  # the file column in our csv\n",
        "    label_col=\"label\",  # the label column in our csv\n",
        "    item_tfms=Resize(224, ResizeMethod.Squish),  # resize images by squishing so they are 224x224 pixels\n",
        "    seed=42,  # set a fixed seed to make results more reproducible\n",
        ")"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:59:55.195514Z",
          "iopub.execute_input": "2022-04-11T11:59:55.195772Z",
          "iopub.status.idle": "2022-04-11T11:59:58.285119Z",
          "shell.execute_reply.started": "2022-04-11T11:59:55.195743Z",
          "shell.execute_reply": "2022-04-11T11:59:58.284354Z"
        },
        "trusted": true,
        "id": "lZ6UzCLItGlg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "ad_data.show_batch()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-05T14:06:03.336292Z",
          "iopub.execute_input": "2022-04-05T14:06:03.336903Z",
          "iopub.status.idle": "2022-04-05T14:06:04.846184Z",
          "shell.execute_reply.started": "2022-04-05T14:06:03.33686Z",
          "shell.execute_reply": "2022-04-05T14:06:04.845404Z"
        },
        "trusted": true,
        "id": "Goj63ha-tGli"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Creating the Model\n",
        "\n"
      ],
      "metadata": {
        "id": "4ddde34WtGlj"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "learn = vision_learner(\n",
        "    ad_data,  # the data the model will be trained on\n",
        "    resnet18,  # the type of model we want to use\n",
        "    metrics=accuracy,  # the metrics to track\n",
        ")"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-05T14:06:07.790508Z",
          "iopub.execute_input": "2022-04-05T14:06:07.790766Z",
          "iopub.status.idle": "2022-04-05T14:06:08.226152Z",
          "shell.execute_reply.started": "2022-04-05T14:06:07.790738Z",
          "shell.execute_reply": "2022-04-05T14:06:08.225364Z"
        },
        "trusted": true,
        "id": "FuJpCuabtGlk"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Training the Model"
      ],
      "metadata": {
        "id": "vZXVOKlwtGll"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "learn.fine_tune(5)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-05T14:06:11.884992Z",
          "iopub.execute_input": "2022-04-05T14:06:11.885687Z",
          "iopub.status.idle": "2022-04-05T14:07:00.069511Z",
          "shell.execute_reply.started": "2022-04-05T14:06:11.885649Z",
          "shell.execute_reply": "2022-04-05T14:07:00.06871Z"
        },
        "trusted": true,
        "id": "kAQ9LeUwtGll"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Appendix: A Non-Scientific Experiment Assessing Transfer Learning"
      ],
      "metadata": {
        "id": "oz4Vf88PtGln"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "learn_random_start = vision_learner(ad_data, resnet18, metrics=accuracy, pretrained=False)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-05T14:07:00.071567Z",
          "iopub.execute_input": "2022-04-05T14:07:00.07182Z",
          "iopub.status.idle": "2022-04-05T14:07:00.280735Z",
          "shell.execute_reply.started": "2022-04-05T14:07:00.071784Z",
          "shell.execute_reply": "2022-04-05T14:07:00.280003Z"
        },
        "trusted": true,
        "id": "sB24zj-QtGln"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "learn_random_start.fine_tune(5)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-05T14:07:00.282289Z",
          "iopub.execute_input": "2022-04-05T14:07:00.282596Z",
          "iopub.status.idle": "2022-04-05T14:07:42.701202Z",
          "shell.execute_reply.started": "2022-04-05T14:07:00.28256Z",
          "shell.execute_reply": "2022-04-05T14:07:42.700238Z"
        },
        "trusted": true,
        "id": "Os18su93tGlp"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "learn.validate()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-05T14:07:42.706367Z",
          "iopub.execute_input": "2022-04-05T14:07:42.706639Z",
          "iopub.status.idle": "2022-04-05T14:07:44.422563Z",
          "shell.execute_reply.started": "2022-04-05T14:07:42.706602Z",
          "shell.execute_reply": "2022-04-05T14:07:44.421748Z"
        },
        "trusted": true,
        "id": "_kQejerNtGlp"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Part 2\n",
        "\n"
      ],
      "metadata": {
        "id": "juNRu6KHtGlp"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Looking More Closely at the Data"
      ],
      "metadata": {
        "id": "46-BOYAdtGls"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:42:02.363139Z",
          "iopub.execute_input": "2022-04-11T11:42:02.363426Z",
          "iopub.status.idle": "2022-04-11T11:42:02.367617Z",
          "shell.execute_reply.started": "2022-04-11T11:42:02.363394Z",
          "shell.execute_reply": "2022-04-11T11:42:02.366599Z"
        },
        "trusted": true,
        "id": "C_4zpaFmtGls"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%matplotlib inline\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8' and newer\n",
        "# releases no longer accept the old 'seaborn' name, so use whichever name this\n",
        "# installation provides.\n",
        "seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'\n",
        "plt.style.use(seaborn_style)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:42:05.786625Z",
          "iopub.execute_input": "2022-04-11T11:42:05.786948Z",
          "iopub.status.idle": "2022-04-11T11:42:05.798159Z",
          "shell.execute_reply.started": "2022-04-11T11:42:05.786911Z",
          "shell.execute_reply": "2022-04-11T11:42:05.797459Z"
        },
        "trusted": true,
        "id": "2mTQOjLmtGlt"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df = pd.read_csv('../input/computer-vision-for-the-humanities-ph/photos_multi/photos_multi/multi_label.csv', na_filter=False)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:42:05.932808Z",
          "iopub.execute_input": "2022-04-11T11:42:05.933052Z",
          "iopub.status.idle": "2022-04-11T11:42:05.974127Z",
          "shell.execute_reply.started": "2022-04-11T11:42:05.933024Z",
          "shell.execute_reply": "2022-04-11T11:42:05.973460Z"
        },
        "trusted": true,
        "id": "4b3H22KBtGlt"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:42:15.587703Z",
          "iopub.execute_input": "2022-04-11T11:42:15.588133Z",
          "iopub.status.idle": "2022-04-11T11:42:15.607841Z",
          "shell.execute_reply.started": "2022-04-11T11:42:15.588094Z",
          "shell.execute_reply": "2022-04-11T11:42:15.607190Z"
        },
        "trusted": true,
        "id": "H1gm4_F9tGlt"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df['label'].value_counts()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:44:00.933419Z",
          "iopub.execute_input": "2022-04-11T11:44:00.933710Z",
          "iopub.status.idle": "2022-04-11T11:44:00.945885Z",
          "shell.execute_reply.started": "2022-04-11T11:44:00.933681Z",
          "shell.execute_reply": "2022-04-11T11:44:00.945254Z"
        },
        "trusted": true,
        "id": "t1s132SNtGlv"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# create a variable labels to store the list\n",
        "labels = df['label'].to_list()\n",
        "# take a slice of this list to display\n",
        "labels[:6]"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:44:27.829466Z",
          "iopub.execute_input": "2022-04-11T11:44:27.829749Z",
          "iopub.status.idle": "2022-04-11T11:44:27.837343Z",
          "shell.execute_reply.started": "2022-04-11T11:44:27.829717Z",
          "shell.execute_reply": "2022-04-11T11:44:27.836475Z"
        },
        "trusted": true,
        "id": "lEW4ECDxtGlw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# for each label in the list split on \"|\"\n",
        "split_labels = [label.split(\"|\") for label in labels]"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:45:15.973822Z",
          "iopub.execute_input": "2022-04-11T11:45:15.974114Z",
          "iopub.status.idle": "2022-04-11T11:45:15.979020Z",
          "shell.execute_reply.started": "2022-04-11T11:45:15.974082Z",
          "shell.execute_reply": "2022-04-11T11:45:15.978258Z"
        },
        "trusted": true,
        "id": "friAb3vetGlw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "split_labels[:4]"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:45:34.901094Z",
          "iopub.execute_input": "2022-04-11T11:45:34.901684Z",
          "iopub.status.idle": "2022-04-11T11:45:34.907259Z",
          "shell.execute_reply.started": "2022-04-11T11:45:34.901641Z",
          "shell.execute_reply": "2022-04-11T11:45:34.906089Z"
        },
        "trusted": true,
        "id": "QJ5au4ZRtGlx"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "labels = [label for sublist in split_labels for label in sublist]\n",
        "labels[:4]"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:46:06.263127Z",
          "iopub.execute_input": "2022-04-11T11:46:06.263591Z",
          "iopub.status.idle": "2022-04-11T11:46:06.269926Z",
          "shell.execute_reply.started": "2022-04-11T11:46:06.263555Z",
          "shell.execute_reply": "2022-04-11T11:46:06.269212Z"
        },
        "trusted": true,
        "id": "J-OH52DPtGlx"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Counting the Labels\n",
        "\n"
      ],
      "metadata": {
        "id": "FGYCm8b0tGly"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from collections import Counter\n",
        "label_freqs = Counter(labels)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:53:48.445922Z",
          "iopub.execute_input": "2022-04-11T11:53:48.446200Z",
          "iopub.status.idle": "2022-04-11T11:53:48.450742Z",
          "shell.execute_reply.started": "2022-04-11T11:53:48.446165Z",
          "shell.execute_reply": "2022-04-11T11:53:48.449421Z"
        },
        "trusted": true,
        "id": "uojeAb0FtGly"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "label_freqs"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:54:02.776300Z",
          "iopub.execute_input": "2022-04-11T11:54:02.776756Z",
          "iopub.status.idle": "2022-04-11T11:54:02.782036Z",
          "shell.execute_reply.started": "2022-04-11T11:54:02.776716Z",
          "shell.execute_reply": "2022-04-11T11:54:02.781023Z"
        },
        "trusted": true,
        "id": "BFBngQzUtGlz"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "sum(label_freqs.values())"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:57:02.700782Z",
          "iopub.execute_input": "2022-04-11T11:57:02.701059Z",
          "iopub.status.idle": "2022-04-11T11:57:02.707094Z",
          "shell.execute_reply.started": "2022-04-11T11:57:02.701028Z",
          "shell.execute_reply": "2022-04-11T11:57:02.706336Z"
        },
        "trusted": true,
        "id": "bGrWN0JntGl1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# total number of labels across the dataset, computed once instead of once per bar\n",
        "total_labels = sum(label_freqs.values())\n",
        "\n",
        "plt.bar(\n",
        "    label_freqs.keys(),  # pass in our labels\n",
        "    [100 * count / total_labels for count in label_freqs.values()],  # each label's share, as a percentage\n",
        ")\n",
        "# add a title to the plot\n",
        "plt.title(\"Label frequencies\")\n",
        "# add a y axis label (values are scaled to 0-100 above so they really are percentages)\n",
        "plt.ylabel(\"Percentage of total labels\")\n",
        "plt.show()  # show the plot"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:57:14.992108Z",
          "iopub.execute_input": "2022-04-11T11:57:14.992807Z",
          "iopub.status.idle": "2022-04-11T11:57:15.292728Z",
          "shell.execute_reply.started": "2022-04-11T11:57:14.992770Z",
          "shell.execute_reply": "2022-04-11T11:57:15.292079Z"
        },
        "trusted": true,
        "id": "kOkXw2MktGl3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Loading Data\n"
      ],
      "metadata": {
        "id": "xLjQQL8MtGl4"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from fastai.vision.all import *"
      ],
      "metadata": {
        "id": "ZUK8pVmHAD76"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df.columns"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T11:58:47.124354Z",
          "iopub.execute_input": "2022-04-11T11:58:47.125103Z",
          "iopub.status.idle": "2022-04-11T11:58:47.130423Z",
          "shell.execute_reply.started": "2022-04-11T11:58:47.125064Z",
          "shell.execute_reply": "2022-04-11T11:58:47.129576Z"
        },
        "trusted": true,
        "id": "71GoU8z5tGl6"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "photo_data = ImageDataLoaders.from_df(\n",
        "    df,  # the dataframe where our labels and image file paths are stored\n",
        "    folder=\"../input/computer-vision-for-the-humanities-ph/photos_multi/photos_multi/photo_images\",  # the path to the directory holding the images\n",
        "    bs=32,  # the batch size (number of images + labels)\n",
        "    label_delim=\"|\",  # the deliminator between each label in our label column\n",
        "    item_tfms=Resize(224),  # resize each image to 224x224\n",
        "    valid_pct=0.3,  # use 30% of the data as validation data\n",
        "    seed=42  # set a seed to make results more reproducible\n",
        ")"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:00:00.802712Z",
          "iopub.execute_input": "2022-04-11T12:00:00.802979Z",
          "iopub.status.idle": "2022-04-11T12:00:01.004391Z",
          "shell.execute_reply.started": "2022-04-11T12:00:00.802950Z",
          "shell.execute_reply": "2022-04-11T12:00:01.003636Z"
        },
        "trusted": true,
        "id": "qYPZtgTZtGl7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### fastai DataLoaders\n"
      ],
      "metadata": {
        "id": "ruXSFtB0tGl9"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "photo_data"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:00:47.989803Z",
          "iopub.execute_input": "2022-04-11T12:00:47.990276Z",
          "iopub.status.idle": "2022-04-11T12:00:47.995285Z",
          "shell.execute_reply.started": "2022-04-11T12:00:47.990236Z",
          "shell.execute_reply": "2022-04-11T12:00:47.994547Z"
        },
        "trusted": true,
        "id": "TtvoDoO2tGl-"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Viewing our Loaded Data"
      ],
      "metadata": {
        "id": "87_lX7ILtGl-"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "photo_data.show_batch(figsize=(15,15))"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:01:00.398599Z",
          "iopub.execute_input": "2022-04-11T12:01:00.398876Z",
          "iopub.status.idle": "2022-04-11T12:01:04.071641Z",
          "shell.execute_reply.started": "2022-04-11T12:01:00.398842Z",
          "shell.execute_reply": "2022-04-11T12:01:04.070969Z"
        },
        "trusted": true,
        "id": "7jSQzG-ktGl-"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Inspecting Model Inputs"
      ],
      "metadata": {
        "id": "9tKMazuytGl_"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "photo_data.vocab"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:01:52.465592Z",
          "iopub.execute_input": "2022-04-11T12:01:52.465845Z",
          "iopub.status.idle": "2022-04-11T12:01:52.471575Z",
          "shell.execute_reply.started": "2022-04-11T12:01:52.465816Z",
          "shell.execute_reply": "2022-04-11T12:01:52.470667Z"
        },
        "trusted": true,
        "id": "J88c3UaItGl_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "x, y = photo_data.one_batch()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:01:57.590605Z",
          "iopub.execute_input": "2022-04-11T12:01:57.590860Z",
          "iopub.status.idle": "2022-04-11T12:01:59.461254Z",
          "shell.execute_reply.started": "2022-04-11T12:01:57.590831Z",
          "shell.execute_reply": "2022-04-11T12:01:59.460510Z"
        },
        "trusted": true,
        "id": "5K91-0lFtGmA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "type(x), type(y)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:06:13.348351Z",
          "iopub.execute_input": "2022-04-11T12:06:13.348826Z",
          "iopub.status.idle": "2022-04-11T12:06:13.354377Z",
          "shell.execute_reply.started": "2022-04-11T12:06:13.348790Z",
          "shell.execute_reply": "2022-04-11T12:06:13.353715Z"
        },
        "trusted": true,
        "id": "aVOxeLBitGmB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "len(x), len(y)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:06:29.571807Z",
          "iopub.execute_input": "2022-04-11T12:06:29.572186Z",
          "iopub.status.idle": "2022-04-11T12:06:29.578981Z",
          "shell.execute_reply.started": "2022-04-11T12:06:29.572132Z",
          "shell.execute_reply": "2022-04-11T12:06:29.578301Z"
        },
        "trusted": true,
        "id": "S1pCSw2rtGmB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "x[0]"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:06:52.214975Z",
          "iopub.execute_input": "2022-04-11T12:06:52.215392Z",
          "iopub.status.idle": "2022-04-11T12:06:52.267205Z",
          "shell.execute_reply.started": "2022-04-11T12:06:52.215349Z",
          "shell.execute_reply": "2022-04-11T12:06:52.266180Z"
        },
        "trusted": true,
        "id": "Cr2dbQLJtGmC"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "x[0].shape"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:07:09.075707Z",
          "iopub.execute_input": "2022-04-11T12:07:09.075970Z",
          "iopub.status.idle": "2022-04-11T12:07:09.082593Z",
          "shell.execute_reply.started": "2022-04-11T12:07:09.075941Z",
          "shell.execute_reply": "2022-04-11T12:07:09.081808Z"
        },
        "trusted": true,
        "id": "UbOR8P5wtGmD"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "y[0]"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:09:35.438368Z",
          "iopub.execute_input": "2022-04-11T12:09:35.438778Z",
          "iopub.status.idle": "2022-04-11T12:09:35.449728Z",
          "shell.execute_reply.started": "2022-04-11T12:09:35.438735Z",
          "shell.execute_reply": "2022-04-11T12:09:35.448895Z"
        },
        "trusted": true,
        "id": "CiYqAtlhtGmE"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "y[0].shape"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:09:49.520776Z",
          "iopub.execute_input": "2022-04-11T12:09:49.521028Z",
          "iopub.status.idle": "2022-04-11T12:09:49.526973Z",
          "shell.execute_reply.started": "2022-04-11T12:09:49.520999Z",
          "shell.execute_reply": "2022-04-11T12:09:49.526287Z"
        },
        "trusted": true,
        "id": "X8deHsHJtGmF"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "x.shape, y.shape"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:11:32.645653Z",
          "iopub.execute_input": "2022-04-11T12:11:32.645908Z",
          "iopub.status.idle": "2022-04-11T12:11:32.651573Z",
          "shell.execute_reply.started": "2022-04-11T12:11:32.645879Z",
          "shell.execute_reply": "2022-04-11T12:11:32.650890Z"
        },
        "trusted": true,
        "id": "o_CpW27ktGmF"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Image Augmentations"
      ],
      "metadata": {
        "id": "8QQY9ku4tGmG"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "tfms = setup_aug_tfms([Rotate(max_deg=90, p=0.75), Zoom(), Flip()])"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:12:55.253325Z",
          "iopub.execute_input": "2022-04-11T12:12:55.253941Z",
          "iopub.status.idle": "2022-04-11T12:12:55.258419Z",
          "shell.execute_reply.started": "2022-04-11T12:12:55.253888Z",
          "shell.execute_reply": "2022-04-11T12:12:55.257743Z"
        },
        "trusted": true,
        "id": "Qt_1_mKttGmG"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "photo_data = ImageDataLoaders.from_df(\n",
        "    df,  # dataframe containing paths to images and labels\n",
        "    folder=\"../input/computer-vision-for-the-humanities-ph/photos_multi/photos_multi/photo_images\",  # folder where images are stored\n",
        "    bs=32,  # batch size\n",
        "    label_delim=\"|\",  # the deliminator for multiple labels\n",
        "    item_tfms=Resize(224),  # resize images to a standard size\n",
        "    batch_tfms=tfms,  # pass in our transforms\n",
        "    valid_pct=0.3,  # 30% of data used for validation\n",
        "    seed=42,  # set a seed to make results more reproducible\n",
        ")"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:12:58.415840Z",
          "iopub.execute_input": "2022-04-11T12:12:58.416431Z",
          "iopub.status.idle": "2022-04-11T12:12:58.609823Z",
          "shell.execute_reply.started": "2022-04-11T12:12:58.416389Z",
          "shell.execute_reply": "2022-04-11T12:12:58.609052Z"
        },
        "trusted": true,
        "id": "2vdXBU-RtGmG"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "photo_data.show_batch(unique=True, figsize=(10,10))"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:13:03.481839Z",
          "iopub.execute_input": "2022-04-11T12:13:03.482407Z",
          "iopub.status.idle": "2022-04-11T12:13:05.452713Z",
          "shell.execute_reply.started": "2022-04-11T12:13:03.482364Z",
          "shell.execute_reply": "2022-04-11T12:13:05.452024Z"
        },
        "trusted": true,
        "id": "a2tLmt3BtGmJ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Creating a Model"
      ],
      "metadata": {
        "id": "CHFHESS9tGmK"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "learn = vision_learner(photo_data, densenet121, metrics=[F1ScoreMulti(), accuracy_multi])"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:20:37.356704Z",
          "iopub.execute_input": "2022-04-11T12:20:37.356984Z",
          "iopub.status.idle": "2022-04-11T12:20:37.835902Z",
          "shell.execute_reply.started": "2022-04-11T12:20:37.356953Z",
          "shell.execute_reply": "2022-04-11T12:20:37.835121Z"
        },
        "trusted": true,
        "id": "qgV9XF6utGmK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "?learn"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:20:49.845683Z",
          "iopub.execute_input": "2022-04-11T12:20:49.845940Z",
          "iopub.status.idle": "2022-04-11T12:20:49.901842Z",
          "shell.execute_reply.started": "2022-04-11T12:20:49.845912Z",
          "shell.execute_reply": "2022-04-11T12:20:49.901168Z"
        },
        "trusted": true,
        "id": "Yb_QI1y8tGmV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Training the Model"
      ],
      "metadata": {
        "id": "Fr4BmVMhtGmW"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "learn.lr_find()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:28:11.287902Z",
          "iopub.execute_input": "2022-04-11T12:28:11.288195Z",
          "iopub.status.idle": "2022-04-11T12:31:10.320737Z",
          "shell.execute_reply.started": "2022-04-11T12:28:11.288141Z",
          "shell.execute_reply": "2022-04-11T12:31:10.319989Z"
        },
        "trusted": true,
        "id": "YPbZhSGatGmW"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Fitting the Model\n"
      ],
      "metadata": {
        "id": "axk7o2xBtGma"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "learn.fit_one_cycle(5, lr_max=2e-2)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:31:10.322637Z",
          "iopub.execute_input": "2022-04-11T12:31:10.323226Z",
          "iopub.status.idle": "2022-04-11T12:40:50.774841Z",
          "shell.execute_reply.started": "2022-04-11T12:31:10.323187Z",
          "shell.execute_reply": "2022-04-11T12:40:50.774038Z"
        },
        "trusted": true,
        "id": "vUV9kIibtGmc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "learn.recorder.plot_loss()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:40:50.776600Z",
          "iopub.execute_input": "2022-04-11T12:40:50.776881Z",
          "iopub.status.idle": "2022-04-11T12:40:50.998786Z",
          "shell.execute_reply.started": "2022-04-11T12:40:50.776840Z",
          "shell.execute_reply": "2022-04-11T12:40:50.998021Z"
        },
        "trusted": true,
        "id": "YSMpWs6mtGmf"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Saving Progress\n"
      ],
      "metadata": {
        "id": "FXQKrGSytGmf"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "learn.save('stage_1')"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:40:51.000510Z",
          "iopub.execute_input": "2022-04-11T12:40:51.000919Z",
          "iopub.status.idle": "2022-04-11T12:40:51.163166Z",
          "shell.execute_reply.started": "2022-04-11T12:40:51.000880Z",
          "shell.execute_reply": "2022-04-11T12:40:51.162188Z"
        },
        "trusted": true,
        "id": "ky0T-6h5tGmg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Unfreezing the Model\n",
        "\n"
      ],
      "metadata": {
        "id": "F3dFIN3ntGmg"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "learn.unfreeze()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:40:51.164881Z",
          "iopub.execute_input": "2022-04-11T12:40:51.165190Z",
          "iopub.status.idle": "2022-04-11T12:40:51.171877Z",
          "shell.execute_reply.started": "2022-04-11T12:40:51.165136Z",
          "shell.execute_reply": "2022-04-11T12:40:51.170845Z"
        },
        "trusted": true,
        "id": "6Z1XFYKNtGmg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "learn.lr_find()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:40:51.173994Z",
          "iopub.execute_input": "2022-04-11T12:40:51.174334Z",
          "iopub.status.idle": "2022-04-11T12:43:50.743768Z",
          "shell.execute_reply.started": "2022-04-11T12:40:51.174295Z",
          "shell.execute_reply": "2022-04-11T12:43:50.742996Z"
        },
        "trusted": true,
        "id": "cLURB5wktGmg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "learn.fit_one_cycle(4, lr_max=slice(6e-6, 4e-4), cbs=[SaveModelCallback(monitor='f1_score')])"
      ],
      "metadata": {
        "id": "TK4e49Yy0Nkx"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Investigating the Results of our Model\n"
      ],
      "metadata": {
        "id": "JI-k_BwktGmh"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "y_pred, y_true = learn.get_preds()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:43:50.746098Z",
          "iopub.execute_input": "2022-04-11T12:43:50.746604Z",
          "iopub.status.idle": "2022-04-11T12:44:24.624914Z",
          "shell.execute_reply.started": "2022-04-11T12:43:50.746563Z",
          "shell.execute_reply": "2022-04-11T12:44:24.624193Z"
        },
        "trusted": true,
        "id": "srg-ueRVtGmh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "len(y_pred), len(y_true)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:45:34.705358Z",
          "iopub.execute_input": "2022-04-11T12:45:34.706013Z",
          "iopub.status.idle": "2022-04-11T12:45:34.711665Z",
          "shell.execute_reply.started": "2022-04-11T12:45:34.705970Z",
          "shell.execute_reply": "2022-04-11T12:45:34.710692Z"
        },
        "trusted": true,
        "id": "aEa2-Zl2tGmh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "y_pred[0]"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:46:02.496215Z",
          "iopub.execute_input": "2022-04-11T12:46:02.496487Z",
          "iopub.status.idle": "2022-04-11T12:46:02.509305Z",
          "shell.execute_reply.started": "2022-04-11T12:46:02.496458Z",
          "shell.execute_reply": "2022-04-11T12:46:02.508467Z"
        },
        "trusted": true,
        "id": "mbbRj3XutGmi"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Exploring our Predictions Using scikit-learn\n",
        "\n"
      ],
      "metadata": {
        "id": "XoSo8ncFtGmj"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:48:19.183271Z",
          "iopub.execute_input": "2022-04-11T12:48:19.183907Z",
          "iopub.status.idle": "2022-04-11T12:48:19.187798Z",
          "shell.execute_reply.started": "2022-04-11T12:48:19.183869Z",
          "shell.execute_reply": "2022-04-11T12:48:19.187111Z"
        },
        "trusted": true,
        "id": "BDjDkB9stGmj"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "f1_score(y_true, y_pred>0.50, average='macro')"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:48:55.264028Z",
          "iopub.execute_input": "2022-04-11T12:48:55.264690Z",
          "iopub.status.idle": "2022-04-11T12:48:55.276923Z",
          "shell.execute_reply.started": "2022-04-11T12:48:55.264646Z",
          "shell.execute_reply": "2022-04-11T12:48:55.276033Z"
        },
        "trusted": true,
        "id": "lPqnknRQtGmk"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.metrics import classification_report"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:49:16.867682Z",
          "iopub.execute_input": "2022-04-11T12:49:16.867950Z",
          "iopub.status.idle": "2022-04-11T12:49:16.872276Z",
          "shell.execute_reply.started": "2022-04-11T12:49:16.867920Z",
          "shell.execute_reply": "2022-04-11T12:49:16.871446Z"
        },
        "trusted": true,
        "id": "8Tu-kCyKtGml"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(classification_report(y_true, y_pred>0.50, target_names=photo_data.vocab, zero_division=1))"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2022-04-11T12:49:21.208954Z",
          "iopub.execute_input": "2022-04-11T12:49:21.209678Z",
          "iopub.status.idle": "2022-04-11T12:49:21.229606Z",
          "shell.execute_reply.started": "2022-04-11T12:49:21.209638Z",
          "shell.execute_reply": "2022-04-11T12:49:21.228792Z"
        },
        "trusted": true,
        "id": "-T29o-1ctGml"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}