{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#default_exp audio.augment\n", "#default_cls_lvl 3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Augmentation for Audio\n", "\n", "> Transforms to apply data augmentation to AudioSpectrograms and Signals" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "from local.torch_basics import *\n", "from local.test import *\n", "from local.data.all import *\n", "from local.vision.all import *\n", "from local.notebook.showdoc import show_doc\n", "from local.audio.core import *\n", "from local.learner import *\n", "from local.vision.models.xresnet import *\n", "from local.metrics import *\n", "from local.basics import *\n", "from local.callback.all import *" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ImageDataBunch??" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# export\n", "import torch.nn\n", "from torch import stack, zeros_like as t0, ones_like as t1\n", "from torch.distributions.bernoulli import Bernoulli\n", "from librosa.effects import split\n", "from dataclasses import asdict\n", "from scipy.signal import resample_poly\n", "from scipy.ndimage.interpolation import shift\n", "import librosa\n", "import colorednoise as cn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Table of Contents\n", "1. [Preprocessing Functions](#Preprocessing-Functions) \n", " 1. [Remove Silence](#Remove-Silence)\n", " 1. [Resampling](#Resampling)\n", "1. [Signal Transforms](#Signal-Transforms)\n", " 1. [Signal Cropping/Padding](#Signal-Cropping/Padding)\n", " 1. [Signal Shifting](#Signal-Shifting)\n", " 1. [Add Noise to Signal](#Add-Noise-to-Signal)\n", " 1. [Adjust Volume](#Adjust-Volume)\n", " 1. [Signal Cutout](#Signal-Cutout)\n", " 1. [Signal Loss](#Signal-Loss)\n", " 1. [DownmixMono](#DownmixMono)\n", "1. [Spectrogram Transforms](#Spectrogram-Transforms)\n", " 1. [Time Cropping](#Time-Cropping)\n", " 1. [Time and Frequency Masking (SpecAugment)](#Time-and-Frequency-Masking-(SpecAugment))\n", " 1. [Spectrogram Rolling](#Spectrogram-Rolling)\n", " 1. [Delta/Accelerate](#Delta/Accelerate)\n", "1. [Pipelines](#Pipelines)\n", " 1. [Signal Pipelines](#Signal-Pipelines)\n", " 1. [Spectrogram Pipelines](#Spectrogram-Pipelines)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "##export\n", "#_all_ = ['AudioGetter', 'get_audio_files', 'AudioItem', 'OpenAudio', 'AudioSpectrogram', 'AudioToSpec',\n", " # 'SpectrogramConfig', 'AudioConfig', 'audio_extensions']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup Examples" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "p = Config()['data_path'] / 'ST-AEDS-20180100_1-OS'\n", "untar_data(URLs.SPEAKERS10, fname=str(p)+'.tar', dest=p)\n", "x = AudioGetter(\"\", recurse=True, folders=None)\n", "files = x(p)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#files will load differently on different machines so we specify examples by name\n", "ex_files = [p/f for f in ['m0005_us_m0005_00218.wav', \n", " 'f0003_us_f0003_00279.wav', \n", " 'f0001_us_f0001_00168.wav', \n", " 'f0005_us_f0005_00286.wav',]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "audio_orig = AudioItem.create(ex_files[0])\n", "a2s = AudioToSpec(n_fft = 1024, hop_length=256)\n", "sg_orig = a2s(audio_orig)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#get 3 equal length portions of 3 different signals so we can stack them\n", "#for a fake multichannel example\n", "ai0, ai1, ai2 = map(AudioItem.create, ex_files[1:4]);\n", "min_samples = min(ai0.nsamples, ai1.nsamples, ai2.nsamples)\n", "s0, s1, s2 = map(lambda x: x[:,:min_samples], (ai0.sig, ai1.sig, ai2.sig))\n", "fake_multichannel = AudioItem((torch.stack((s0, s1, s2), dim=1).squeeze(0), 16000, None))\n", "sg_multi = a2s(fake_multichannel)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Preprocessing Functions" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "