{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PyTorch VGGish"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from local.torch_basics import *\n",
    "from local.test import *\n",
    "from local.basics import *\n",
    "from local.data.all import *\n",
    "from local.vision.core import *\n",
    "from local.notebook.showdoc import show_doc\n",
    "from local.audio.core import *\n",
    "from local.audio.augment import *\n",
    "from local.vision.learner import *\n",
    "from local.vision.models.xresnet import *\n",
    "from local.metrics import *\n",
    "from local.callback.schedule import *\n",
    "import torchaudio\n",
    "from IPython.display import Audio, display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#def arc_extract(fname, dest): Archive(fname).extractall(dest)\n",
    "URLs.ESC50 = 'https://github.com/karoldvl/ESC-50/archive/master.zip'\n",
    "pESC50 = Config()['data_path'] / 'ESC-50/ESC-50-master'\n",
    "PATH_AUDIO = pESC50/\"audio\"\n",
    "PATH_CSV  = pESC50/\"meta/esc50.csv\"\n",
    "DF = pd.read_csv(PATH_CSV)\n",
    "#untar_data(URLs.ESC50, fname=str(pESC50)+'.zip', dest=pESC50, extract_func=arc_extract)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = AudioGetter(\"\", recurse=True, folders=None)\n",
    "files_ESC50 = x(pESC50)\n",
    "#original_aud = AudioItem.create(files[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filename</th>\n",
       "      <th>fold</th>\n",
       "      <th>target</th>\n",
       "      <th>category</th>\n",
       "      <th>esc10</th>\n",
       "      <th>src_file</th>\n",
       "      <th>take</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1-100032-A-0.wav</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>dog</td>\n",
       "      <td>True</td>\n",
       "      <td>100032</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1-100038-A-14.wav</td>\n",
       "      <td>1</td>\n",
       "      <td>14</td>\n",
       "      <td>chirping_birds</td>\n",
       "      <td>False</td>\n",
       "      <td>100038</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1-100210-A-36.wav</td>\n",
       "      <td>1</td>\n",
       "      <td>36</td>\n",
       "      <td>vacuum_cleaner</td>\n",
       "      <td>False</td>\n",
       "      <td>100210</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>1-100210-B-36.wav</td>\n",
       "      <td>1</td>\n",
       "      <td>36</td>\n",
       "      <td>vacuum_cleaner</td>\n",
       "      <td>False</td>\n",
       "      <td>100210</td>\n",
       "      <td>B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>1-101296-A-19.wav</td>\n",
       "      <td>1</td>\n",
       "      <td>19</td>\n",
       "      <td>thunderstorm</td>\n",
       "      <td>False</td>\n",
       "      <td>101296</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            filename  fold  target        category  esc10  src_file take\n",
       "0   1-100032-A-0.wav     1       0             dog   True    100032    A\n",
       "1  1-100038-A-14.wav     1      14  chirping_birds  False    100038    A\n",
       "2  1-100210-A-36.wav     1      36  vacuum_cleaner  False    100210    A\n",
       "3  1-100210-B-36.wav     1      36  vacuum_cleaner  False    100210    B\n",
       "4  1-101296-A-19.wav     1      19    thunderstorm  False    101296    A"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "DF.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ESC_10 = DF[DF[\"esc10\"] == True][\"filename\"].values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PosixPath('/home/jupyter/.fastai/data/ESC-50/ESC-50-master/audio/1-60676-A-34.wav')"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files_ESC50[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "files_ESC10 = [f for f in files_ESC50 if str(f).split('/')[-1] in ESC_10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "400"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(files_ESC10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#! pip install -i https://test.pypi.org/simple/ torchvggish==0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torchvggish import vggish, vggish_input"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "VGG(\n",
       "  (features): Sequential(\n",
       "    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "    (1): ReLU(inplace=True)\n",
       "    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
       "    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "    (4): ReLU(inplace=True)\n",
       "    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
       "    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "    (7): ReLU(inplace=True)\n",
       "    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "    (9): ReLU(inplace=True)\n",
       "    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
       "    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "    (12): ReLU(inplace=True)\n",
       "    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
       "    (14): ReLU(inplace=True)\n",
       "    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
       "  )\n",
       "  (embeddings): Sequential(\n",
       "    (0): Linear(in_features=12288, out_features=4096, bias=True)\n",
       "    (1): ReLU(inplace=True)\n",
       "    (2): Linear(in_features=4096, out_features=4096, bias=True)\n",
       "    (3): ReLU(inplace=True)\n",
       "    (4): Linear(in_features=4096, out_features=128, bias=True)\n",
       "    (5): ReLU(inplace=True)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Initialise model and download weights\n",
    "embedding_model = vggish()\n",
    "embedding_model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_example = \"/home/jupyter/rob/pytorch_videos/RNN-walkthrough/audio_sample/album.wav\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "example = vggish_input.wavfile_to_examples(files_ESC10[0]).detach()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([5, 1, 96, 64])"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_embedding(p):\n",
    "    example = vggish_input.wavfile_to_examples(p)\n",
    "    embedding = embedding_model.forward(example)\n",
    "    return embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_embedding_batch(paths):\n",
    "    pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_esc_classes(files):\n",
    "    return list({get_esc_label(f) for f in files})\n",
    "\n",
    "def get_esc_label(f):\n",
    "    return str(f).split('/')[-1].split('.')[0].split('-')[-1]\n",
    "\n",
    "def get_esc_fold(f):\n",
    "    return str(f).split('/')[-1].split('-')[0]\n",
    "\n",
    "def i2o_esc(files):\n",
    "    return dict(enumerate(get_esc_classes(files)))\n",
    "\n",
    "def o2i_esc(files):\n",
    "    return {o:i for i,o in i2o_esc(files).items()}\n",
    "\n",
    "def get_esc_embedding_data(files, valid_fold):\n",
    "    i2o_dict = i2o_esc(files)\n",
    "    o2i_dict = o2i_esc(files)\n",
    "    x_train = torch.stack([get_embedding(f).detach() for f in files if get_esc_fold(f) != str(valid_fold)], dim=0)\n",
    "    x_valid = torch.stack([get_embedding(f).detach() for f in files if get_esc_fold(f) == str(valid_fold)], dim=0)\n",
    "    y_train = torch.tensor([o2i_dict[get_esc_label(f)] for f in files if get_esc_fold(f) != str(valid_fold)]) \n",
    "    y_valid = torch.tensor([o2i_dict[get_esc_label(f)] for f in files if get_esc_fold(f) == str(valid_fold)])\n",
    "    return x_train, y_train, x_valid, y_valid"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ESC-10 train and fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train, y_train, x_valid, y_valid = get_esc_embedding_data(files_ESC10, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([320, 5, 128]),\n",
       " torch.Size([80, 5, 128]),\n",
       " torch.Size([320]),\n",
       " torch.Size([80]))"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train.shape, x_valid.shape, y_train.shape, y_valid.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = x_train.reshape(x_train.shape[0], -1)\n",
    "x_valid = x_valid.reshape(x_valid.shape[0], -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([320, 640]), torch.Size([320]))"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train.shape, y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import RidgeClassifierCV\n",
    "classifier = RidgeClassifierCV(alphas=np.logspace(-8, 8, 17), normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier.fit(x_train, y_train);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8375"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classifier.score(x_valid, y_valid)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### What if, instead of flattening, we mean across channel to keep embedding context"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train, y_train, x_valid, y_valid = get_esc_embedding_data(files_ESC10, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([320, 5, 128]),\n",
       " torch.Size([80, 5, 128]),\n",
       " torch.Size([320]),\n",
       " torch.Size([80]))"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train.shape, x_valid.shape, y_train.shape, y_valid.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([320, 128])"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train.mean(dim=1).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = x_train.mean(dim=1)\n",
    "x_valid = x_valid.mean(dim=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([320, 128]),\n",
       " torch.Size([80, 128]),\n",
       " torch.Size([320]),\n",
       " torch.Size([80]))"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train.shape, x_valid.shape, y_train.shape, y_valid.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import RidgeClassifierCV\n",
    "classifier = RidgeClassifierCV(alphas=np.logspace(-8, 8, 17), normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier.fit(x_train, y_train);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.875"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classifier.score(x_valid, y_valid)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ESC50 train and fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train, y_train, x_valid, y_valid = get_esc_embedding_data(files_ESC50, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([1600, 5, 128]),\n",
       " torch.Size([400, 5, 128]),\n",
       " torch.Size([1600]),\n",
       " torch.Size([400]))"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train.shape, x_valid.shape, y_train.shape, y_valid.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = x_train.reshape(x_train.shape[0], -1)\n",
    "x_valid = x_valid.reshape(x_valid.shape[0], -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier = RidgeClassifierCV(alphas=np.logspace(-8, 8, 17), normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier.fit(x_train, y_train);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6225"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classifier.score(x_valid, y_valid)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train with channel mean instead"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train, y_train, x_valid, y_valid = get_esc_embedding_data(files_ESC50, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([1600, 5, 128]),\n",
       " torch.Size([400, 5, 128]),\n",
       " torch.Size([1600]),\n",
       " torch.Size([400]))"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train.shape, x_valid.shape, y_train.shape, y_valid.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = x_train.mean(dim=1)\n",
    "x_valid = x_valid.mean(dim=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([1600, 128]),\n",
       " torch.Size([400, 128]),\n",
       " torch.Size([1600]),\n",
       " torch.Size([400]))"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train.shape, x_valid.shape, y_train.shape, y_valid.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier = RidgeClassifierCV(alphas=np.logspace(-8, 8, 17), normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier.fit(x_train, y_train);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6025"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classifier.score(x_valid, y_valid)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Let's try data augmentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#waveform_to_examples(data, sample_rate)+"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}