{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from local.torch_basics import *\n",
    "from local.test import *\n",
    "from local.basics import *\n",
    "from local.data.all import *\n",
    "from local.vision.core import *\n",
    "from local.notebook.showdoc import show_doc\n",
    "from local.audio.core import *\n",
    "from local.audio.augment import *\n",
    "from local.vision.learner import *\n",
    "from local.vision.models.xresnet import *\n",
    "from local.metrics import *\n",
    "from local.callback.schedule import *\n",
    "import torchaudio"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tutorial: Training a Voice Recognition Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Small 10-speaker dataset (ST-AEDS free spoken-audio corpus), assumed already\n",
    "# present under the fastai data path.\n",
    "p10speakers = Config()['data_path'] / 'ST-AEDS-20180100_1-OS'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PosixPath('/home/jupyter/.fastai/data/250_speakers/250-speakers')"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Warning this dataset is ~8GB\n",
    "p250speakers = Config()['data_path'] / '250_speakers'\n",
    "untar_data(URLs.SPEAKERS250, fname=str(p250speakers)+'.tar', dest=p250speakers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recursively collect the audio files for both datasets\n",
    "x = AudioGetter(\"\", recurse=True, folders=None)\n",
    "files_10 = x(p10speakers)\n",
    "files_250 = x(p250speakers)\n",
    "#original_aud = AudioItem.create(files[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Datablock and Basic End to End Training on 10 Speakers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# An audio TransformBlock: decode files with cls.create, convert int tensors\n",
    "# to float on the batch.\n",
    "def AudioBlock(cls=AudioItem): return TransformBlock(type_tfms=cls.create, batch_tfms=IntToFloatTensor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label each clip with its speaker id: the first 5 chars of the filename.\n",
    "# Path.name is used instead of str(x).split('/') so this also works on Windows.\n",
    "auds = DataBlock(blocks=(AudioBlock, CategoryBlock), \n",
    "                 get_items=get_audio_files, \n",
    "                 splitter=RandomSplitter(),\n",
    "                 get_y=lambda x: x.name[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cats = [y for _,y in auds.datasource(p10speakers)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#verify categories are being correctly assigned\n",
    "test_eq(min(cats).item(), 0)\n",
    "test_eq(max(cats).item(), 9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#crop 2s from the signal and turn it to a MelSpectrogram with no augmentation\n",
    "cfg_voice = AudioConfig.Voice()\n",
    "a2s = AudioToSpec.from_cfg(cfg_voice)\n",
    "crop_2000ms = CropSignal(2000)\n",
    "tfms = [crop_2000ms, a2s]\n",
    "dbunch = auds.databunch(p10speakers, item_tfms=tfms, bs=64)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "