{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# PyTorch VGGish" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from local.torch_basics import *\n", "from local.test import *\n", "from local.basics import *\n", "from local.data.all import *\n", "from local.vision.core import *\n", "from local.notebook.showdoc import show_doc\n", "from local.audio.core import *\n", "from local.audio.augment import *\n", "from local.vision.learner import *\n", "from local.vision.models.xresnet import *\n", "from local.metrics import *\n", "from local.callback.schedule import *\n", "import torchaudio\n", "from IPython.display import Audio, display" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#def arc_extract(fname, dest): Archive(fname).extractall(dest)\n", "URLs.ESC50 = 'https://github.com/karoldvl/ESC-50/archive/master.zip'\n", "pESC50 = Config()['data_path'] / 'ESC-50/ESC-50-master'\n", "PATH_AUDIO = pESC50/\"audio\"\n", "PATH_CSV = pESC50/\"meta/esc50.csv\"\n", "DF = pd.read_csv(PATH_CSV)\n", "#untar_data(URLs.ESC50, fname=str(pESC50)+'.zip', dest=pESC50, extract_func=arc_extract)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x = AudioGetter(\"\", recurse=True, folders=None)\n", "files_ESC50 = x(pESC50)\n", "#original_aud = AudioItem.create(files[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
filenamefoldtargetcategoryesc10src_filetake
01-100032-A-0.wav10dogTrue100032A
11-100038-A-14.wav114chirping_birdsFalse100038A
21-100210-A-36.wav136vacuum_cleanerFalse100210A
31-100210-B-36.wav136vacuum_cleanerFalse100210B
41-101296-A-19.wav119thunderstormFalse101296A
\n", "
" ], "text/plain": [ " filename fold target category esc10 src_file take\n", "0 1-100032-A-0.wav 1 0 dog True 100032 A\n", "1 1-100038-A-14.wav 1 14 chirping_birds False 100038 A\n", "2 1-100210-A-36.wav 1 36 vacuum_cleaner False 100210 A\n", "3 1-100210-B-36.wav 1 36 vacuum_cleaner False 100210 B\n", "4 1-101296-A-19.wav 1 19 thunderstorm False 101296 A" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DF.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ESC_10 = DF[DF[\"esc10\"] == True][\"filename\"].values.tolist()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "PosixPath('/home/jupyter/.fastai/data/ESC-50/ESC-50-master/audio/1-60676-A-34.wav')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "files_ESC50[0]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Keep only the files whose name is listed in the ESC-10 subset.\n",
 "# A set gives O(1) membership tests (ESC_10 is a list), and os.path.basename\n",
 "# extracts the file name without hard-coding '/' as the path separator.\n",
 "esc10_names = set(ESC_10)\n",
 "files_ESC10 = [f for f in files_ESC50 if os.path.basename(str(f)) in esc10_names]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "400" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(files_ESC10)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#! pip install -i https://test.pypi.org/simple/ torchvggish==0.1" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from torchvggish import vggish, vggish_input" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "VGG(\n", " (features): Sequential(\n", " (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (1): ReLU(inplace=True)\n", " (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", " (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (4): ReLU(inplace=True)\n", " (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", " (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (7): ReLU(inplace=True)\n", " (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (9): ReLU(inplace=True)\n", " (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", " (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (12): ReLU(inplace=True)\n", " (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (14): ReLU(inplace=True)\n", " (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", " )\n", " (embeddings): Sequential(\n", " (0): Linear(in_features=12288, out_features=4096, bias=True)\n", " (1): ReLU(inplace=True)\n", " (2): Linear(in_features=4096, out_features=4096, bias=True)\n", " (3): ReLU(inplace=True)\n", " (4): Linear(in_features=4096, out_features=128, bias=True)\n", " (5): ReLU(inplace=True)\n", " )\n", ")" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Initialise model and download weights\n", "embedding_model = vggish()\n", "embedding_model.eval()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"path_example = \"/home/jupyter/rob/pytorch_videos/RNN-walkthrough/audio_sample/album.wav\"" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "example = vggish_input.wavfile_to_examples(files_ESC10[0]).detach()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([5, 1, 96, 64])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example.shape" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "def get_embedding(p):\n",
 "    \"\"\"Return the VGGish embeddings of the wav file at `p`, one row per example.\"\"\"\n",
 "    example = vggish_input.wavfile_to_examples(p)\n",
 "    # Inference only: no_grad avoids recording an autograd graph for every file.\n",
 "    # Calling the module instead of .forward() lets nn.Module hooks run.\n",
 "    with torch.no_grad():\n",
 "        return embedding_model(example)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "def get_embedding_batch(paths):\n",
 "    \"\"\"Stack `get_embedding(p)` for every path along a new leading dimension.\n",
 "\n",
 "    Every file must yield the same number of examples, because torch.stack\n",
 "    needs equally shaped tensors; an empty `paths` raises, as torch.stack does.\n",
 "    \"\"\"\n",
 "    return torch.stack([get_embedding(p) for p in paths], dim=0)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "def get_esc_classes(files):\n",
 "    \"\"\"Return the distinct labels found in `files`, sorted as strings.\n",
 "\n",
 "    Sorting keeps the label -> index mapping identical across kernel restarts;\n",
 "    the iteration order of a bare set of strings depends on hash randomisation.\n",
 "    \"\"\"\n",
 "    return sorted({get_esc_label(f) for f in files})\n",
 "\n",
 "def get_esc_label(f):\n",
 "    \"\"\"Label of a file: last '-' separated field of its name, before the first '.'.\"\"\"\n",
 "    return os.path.basename(str(f)).split('.')[0].split('-')[-1]\n",
 "\n",
 "def get_esc_fold(f):\n",
 "    \"\"\"Fold of a file: first '-' separated field of its name, returned as a string.\"\"\"\n",
 "    return os.path.basename(str(f)).split('-')[0]\n",
 "\n",
 "def i2o_esc(files):\n",
 "    \"\"\"Return a dict mapping class index -> label for the labels in `files`.\"\"\"\n",
 "    return dict(enumerate(get_esc_classes(files)))\n",
 "\n",
 "def o2i_esc(files):\n",
 "    \"\"\"Return a dict mapping label -> class index (the inverse of `i2o_esc`).\"\"\"\n",
 "    return {o:i for i,o in i2o_esc(files).items()}\n",
 "\n",
 "def get_esc_embedding_data(files, valid_fold):\n",
 "    \"\"\"Build embeddings and integer labels for one train/validation fold split.\n",
 "\n",
 "    Files whose fold equals `valid_fold` form the validation set and all other\n",
 "    files the training set. Returns `(x_train, y_train, x_valid, y_valid)`.\n",
 "    \"\"\"\n",
 "    o2i_dict = o2i_esc(files)\n",
 "    valid_fold = str(valid_fold)  # get_esc_fold returns strings\n",
 "    # Split once so the embeddings and the labels are built from the same lists.\n",
 "    train_files = [f for f in files if get_esc_fold(f) != valid_fold]\n",
 "    valid_files = [f for f in files if get_esc_fold(f) == valid_fold]\n",
 "    x_train = get_embedding_batch(train_files)\n",
 "    x_valid = get_embedding_batch(valid_files)\n",
 "    y_train = torch.tensor([o2i_dict[get_esc_label(f)] for f in train_files])\n",
 "    y_valid = torch.tensor([o2i_dict[get_esc_label(f)] for f in valid_files])\n",
 "    return x_train, y_train, x_valid, y_valid" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## ESC-10 train and fit" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x_train, y_train, x_valid, y_valid = get_esc_embedding_data(files_ESC10, 5)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(torch.Size([320, 5, 128]),\n", " torch.Size([80, 5, 128]),\n", " torch.Size([320]),\n", " torch.Size([80]))" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train.shape, x_valid.shape, y_train.shape, y_valid.shape" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x_train = x_train.reshape(x_train.shape[0], -1)\n", "x_valid = x_valid.reshape(x_valid.shape[0], -1)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(torch.Size([320, 640]), torch.Size([320]))" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train.shape, y_train.shape" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import RidgeClassifierCV\n", "classifier = RidgeClassifierCV(alphas=np.logspace(-8, 8, 17), normalize=True)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classifier.fit(x_train, y_train);" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8375" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier.score(x_valid, y_valid)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### What if, instead of flattening, we mean across channel to keep embedding context" ] },
{ "cell_type": "code", "execution_count": null,
"metadata": {}, "outputs": [], "source": [ "x_train, y_train, x_valid, y_valid = get_esc_embedding_data(files_ESC10, 5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(torch.Size([320, 5, 128]),\n", " torch.Size([80, 5, 128]),\n", " torch.Size([320]),\n", " torch.Size([80]))" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train.shape, x_valid.shape, y_train.shape, y_valid.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([320, 128])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train.mean(dim=1).shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x_train = x_train.mean(dim=1)\n", "x_valid = x_valid.mean(dim=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(torch.Size([320, 128]),\n", " torch.Size([80, 128]),\n", " torch.Size([320]),\n", " torch.Size([80]))" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train.shape, x_valid.shape, y_train.shape, y_valid.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import RidgeClassifierCV\n", "classifier = RidgeClassifierCV(alphas=np.logspace(-8, 8, 17), normalize=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classifier.fit(x_train, y_train);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.875" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier.score(x_valid, y_valid)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ESC50 train and fit" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x_train, y_train, x_valid, y_valid = get_esc_embedding_data(files_ESC50, 5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(torch.Size([1600, 5, 128]),\n", " torch.Size([400, 5, 128]),\n", " torch.Size([1600]),\n", " torch.Size([400]))" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train.shape, x_valid.shape, y_train.shape, y_valid.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x_train = x_train.reshape(x_train.shape[0], -1)\n", "x_valid = x_valid.reshape(x_valid.shape[0], -1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classifier = RidgeClassifierCV(alphas=np.logspace(-8, 8, 17), normalize=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classifier.fit(x_train, y_train);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6225" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier.score(x_valid, y_valid)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Train with channel mean instead" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x_train, y_train, x_valid, y_valid = get_esc_embedding_data(files_ESC50, 5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(torch.Size([1600, 5, 128]),\n", " torch.Size([400, 5, 128]),\n", " torch.Size([1600]),\n", " torch.Size([400]))" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train.shape, x_valid.shape, y_train.shape, y_valid.shape" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "x_train = x_train.mean(dim=1)\n", "x_valid = x_valid.mean(dim=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(torch.Size([1600, 128]),\n", " torch.Size([400, 128]),\n", " torch.Size([1600]),\n", " torch.Size([400]))" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train.shape, x_valid.shape, y_train.shape, y_valid.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classifier = RidgeClassifierCV(alphas=np.logspace(-8, 8, 17), normalize=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classifier.fit(x_train, y_train);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6025" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier.score(x_valid, y_valid)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Let's try data augmentation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#waveform_to_examples(data, sample_rate)+" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }