{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# FMA: A Dataset For Music Analysis\n", "\n", "Michaƫl Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.\n", "\n", "## Baselines\n", "\n", "* This notebook evalutates standard classifiers from scikit-learn on the provided features.\n", "* Moreover, it evaluates Deep Learning models on both audio and spectrograms." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", "import os\n", "\n", "import IPython.display as ipd\n", "from tqdm import tqdm_notebook\n", "import numpy as np\n", "import pandas as pd\n", "import keras\n", "from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape\n", "\n", "from sklearn.utils import shuffle\n", "from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.svm import SVC, LinearSVC\n", "#from sklearn.gaussian_process import GaussianProcessClassifier\n", "#from sklearn.gaussian_process.kernels import RBF\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", "from sklearn.neural_network import MLPClassifier\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\n", "from sklearn.multiclass import OneVsRestClassifier\n", "\n", "import utils" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "AUDIO_DIR = os.environ.get('AUDIO_DIR')\n", "\n", "tracks = utils.load('tracks.csv')\n", "features = utils.load('features.csv')\n", "echonest = utils.load('echonest.csv')\n", "\n", "np.testing.assert_array_equal(features.index, tracks.index)\n", "assert echonest.index.isin(tracks.index).all()\n", "\n", "tracks.shape, features.shape, echonest.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Subset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "subset = tracks.index[tracks['set', 'subset'] <= 'medium']\n", "\n", "assert subset.isin(tracks.index).all()\n", "assert subset.isin(features.index).all()\n", "\n", "features_all = features.join(echonest, how='inner').sort_index(axis=1)\n", "print('Not enough Echonest features: {}'.format(features_all.shape))\n", "\n", "tracks = tracks.loc[subset]\n", "features_all = features.loc[subset]\n", "\n", "tracks.shape, features_all.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train = tracks.index[tracks['set', 'split'] == 'training']\n", "val = tracks.index[tracks['set', 'split'] == 'validation']\n", "test = tracks.index[tracks['set', 'split'] == 'test']\n", "\n", "print('{} training examples, {} validation examples, {} testing examples'.format(*map(len, [train, val, test])))\n", "\n", "genres = list(LabelEncoder().fit(tracks['track', 'genre_top']).classes_)\n", "#genres = list(tracks['track', 'genre_top'].unique())\n", "print('Top genres ({}): {}'.format(len(genres), genres))\n", "genres = list(MultiLabelBinarizer().fit(tracks['track', 'genres_all']).classes_)\n", "print('All genres ({}): {}'.format(len(genres), genres))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1 Multiple classifiers and feature sets\n", "\n", "Todo:\n", "* Cross-validation for 
{ "cell_type": "markdown", "metadata": {}, "source": [ "### 1.1 Pre-processing" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def pre_process(tracks, features, columns, multi_label=False, verbose=False):\n", "    if not multi_label:\n", "        # Assign an integer value to each genre.\n", "        enc = LabelEncoder()\n", "        labels = tracks['track', 'genre_top']\n", "        #y = enc.fit_transform(tracks['track', 'genre_top'])\n", "    else:\n", "        # Create an indicator matrix.\n", "        enc = MultiLabelBinarizer()\n", "        labels = tracks['track', 'genres_all']\n", "        #labels = tracks['track', 'genres']\n", "\n", "    # Split in training, validation and testing sets.\n", "    y_train = enc.fit_transform(labels[train])\n", "    y_val = enc.transform(labels[val])\n", "    y_test = enc.transform(labels[test])\n", "    X_train = features.loc[train, columns].values\n", "    X_val = features.loc[val, columns].values\n", "    X_test = features.loc[test, columns].values\n", "\n", "    X_train, y_train = shuffle(X_train, y_train, random_state=42)\n", "\n", "    # Standardize features by removing the mean and scaling to unit variance.\n", "    scaler = StandardScaler(copy=False)\n", "    scaler.fit_transform(X_train)\n", "    scaler.transform(X_val)\n", "    scaler.transform(X_test)\n", "\n", "    return y_train, y_val, y_test, X_train, X_val, X_test" ] },
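{ "cell_type": "markdown", "metadata": {}, "source": [ "The other Todo item above is cross-validation of hyper-parameters. The next cell is a minimal sketch on the MFCC features with an assumed grid over the regularization strength of logistic regression; `pre_process()` already standardizes the features, so no further scaling is needed here." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import GridSearchCV\n", "\n", "y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, 'mfcc')\n", "\n", "# The grid and the number of folds are illustrative assumptions.\n", "search = GridSearchCV(LogisticRegression(), {'C': [0.1, 1, 10]}, cv=3, n_jobs=-1)\n", "search.fit(X_train, y_train)\n", "print('best parameters: {}'.format(search.best_params_))\n", "print('test accuracy: {:.2%}'.format(search.score(X_test, y_test)))" ] },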
{ "cell_type": "markdown", "metadata": {}, "source": [ "### 1.2 Single genre" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def test_classifiers_features(classifiers, feature_sets, multi_label=False):\n", "    columns = ['dim'] + list(classifiers.keys())\n", "    scores = pd.DataFrame(columns=columns, index=feature_sets.keys())\n", "    times = pd.DataFrame(columns=classifiers.keys(), index=feature_sets.keys())\n", "    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):\n", "        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)\n", "        scores.loc[fset_name, 'dim'] = X_train.shape[1]\n", "        for clf_name, clf in classifiers.items(): # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):\n", "            t = time.process_time()\n", "            clf.fit(X_train, y_train)\n", "            score = clf.score(X_test, y_test)\n", "            scores.loc[fset_name, clf_name] = score\n", "            times.loc[fset_name, clf_name] = time.process_time() - t\n", "    return scores, times\n", "\n", "def format_scores(scores):\n", "    def highlight(s):\n", "        is_max = s == max(s[1:])\n", "        return ['background-color: yellow' if v else '' for v in is_max]\n", "    scores = scores.style.apply(highlight, axis=1)\n", "    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classifiers = {\n", "    'LR': LogisticRegression(),\n", "    'kNN': KNeighborsClassifier(n_neighbors=200),\n", "    'SVCrbf': SVC(kernel='rbf'),\n", "    'SVCpoly1': SVC(kernel='poly', degree=1),\n", "    'linSVC1': SVC(kernel=\"linear\"),\n", "    'linSVC2': LinearSVC(),\n", "    #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),\n", "    'DT': DecisionTreeClassifier(max_depth=5),\n", "    'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),\n", "    'AdaBoost': AdaBoostClassifier(n_estimators=10),\n", "    'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),\n", "    'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),\n", "    'NB': GaussianNB(),\n", "    'QDA': QuadraticDiscriminantAnalysis(),\n", "}\n", "\n", "feature_sets = {\n", "#    'echonest_audio': ('echonest', 'audio_features'),\n", "#    'echonest_social': ('echonest', 'social_features'),\n", "#    'echonest_temporal': ('echonest', 'temporal_features'),\n", "#    'echonest_audio/social': ('echonest', ('audio_features', 'social_features')),\n", "#    'echonest_all': ('echonest', ('audio_features', 'social_features', 'temporal_features')),\n", "}\n", "for name in features.columns.levels[0]:\n", "    feature_sets[name] = name\n", "feature_sets.update({\n", "    'mfcc/contrast': ['mfcc', 'spectral_contrast'],\n", "    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],\n", "    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],\n", "    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],\n", "    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],\n", "    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],\n", "    'all_non-echonest': list(features.columns.levels[0])\n", "})\n", "\n", "scores, times = test_classifiers_features(classifiers, feature_sets)\n", "\n", "ipd.display(format_scores(scores))\n", "ipd.display(times.style.format('{:.4f}'))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### 1.3 Multiple genres\n", "\n", "Todo:\n", "* Ignore rare genres? Count them higher up in the genre tree? On the other hand, that only concerns a few tracks." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classifiers = {\n", "    #LogisticRegression(),\n", "    'LR': OneVsRestClassifier(LogisticRegression()),\n", "    'SVC': OneVsRestClassifier(SVC()),\n", "    'MLP': MLPClassifier(max_iter=700),\n", "}\n", "\n", "feature_sets = {\n", "#    'echonest_audio': ('echonest', 'audio_features'),\n", "#    'echonest_temporal': ('echonest', 'temporal_features'),\n", "    'mfcc': 'mfcc',\n", "    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],\n", "    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],\n", "}\n", "\n", "scores, times = test_classifiers_features(classifiers, feature_sets, multi_label=True)\n", "\n", "ipd.display(format_scores(scores))\n", "ipd.display(times.style.format('{:.4f}'))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 2 Deep learning on raw audio\n", "\n", "Other architectures:\n", "* [Learning Features of Music from Scratch (MusicNet)](https://arxiv.org/abs/1611.09827), John Thickstun, Zaid Harchaoui, Sham Kakade." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "labels_onehot = LabelBinarizer().fit_transform(tracks['track', 'genre_top'])\n", "labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Load audio samples in parallel using `multiprocessing` to maximize CPU usage when decoding MP3s and performing some optional pre-processing. There are multiple ways to load a waveform from a compressed MP3:\n", "* librosa uses audioread as a backend, which can rely on many native libraries, e.g. ffmpeg\n", "  * resampling is very slow --> use `kaiser_fast`\n", "  * does not work with multiprocessing, as needed for Keras `fit_generator()`\n", "* pydub is a high-level interface for audio modification and uses ffmpeg to load\n", "  * stores a temporary `.wav`\n", "* directly pipe the ffmpeg output (a sketch follows below)\n", "  * fastest method\n", "* [pyAV](https://github.com/mikeboers/PyAV) may be a faster alternative, as it links directly against the ffmpeg libraries" ] },
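{ "cell_type": "markdown", "metadata": {}, "source": [ "A minimal sketch of the 'directly pipe ffmpeg output' option above: decode an MP3 to raw 16-bit mono PCM through a pipe, without a temporary file. The `decode_mp3` helper, the ffmpeg flags and the 44100 Hz rate are illustrative assumptions; the loaders actually used below come from `utils`." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import subprocess as sp\n", "\n", "def decode_mp3(filepath, sampling_rate=44100):\n", "    # Ask ffmpeg to write raw signed 16-bit mono PCM to stdout.\n", "    command = ['ffmpeg', '-i', filepath, '-f', 's16le', '-acodec', 'pcm_s16le',\n", "               '-ac', '1', '-ar', str(sampling_rate), '-']\n", "    raw = sp.run(command, stdout=sp.PIPE, stderr=sp.DEVNULL, check=True).stdout\n", "    return np.frombuffer(raw, dtype=np.int16)\n", "\n", "decode_mp3(utils.get_audio_path(AUDIO_DIR, 2)).shape" ] },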
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Just be sure that everything is fine. Multiprocessing is tricky to debug.\n", "utils.FfmpegLoader().load(utils.get_audio_path(AUDIO_DIR, 2))\n", "SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, utils.FfmpegLoader())\n", "SampleLoader(train, batch_size=2).__next__()[0].shape" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Keras parameters.\n", "NB_WORKER = len(os.sched_getaffinity(0)) # number of usable CPUs\n", "params = {'pickle_safe': True, 'nb_worker': NB_WORKER, 'max_q_size': 10}" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### 2.1 Fully connected neural network\n", "\n", "* Two layers with 10 hidden units each are no better than random, ~11%.\n", "\n", "Optimize data loading to be CPU / GPU bound, not IO bound. Larger batches mean reduced training time, so increase the batch size until memory is exhausted. The number of workers and the queue size have no influence on speed." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "loader = utils.FfmpegLoader(sampling_rate=2000)\n", "SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)\n", "print('Dimensionality: {}'.format(loader.shape))\n", "\n", "keras.backend.clear_session()\n", "\n", "model = keras.models.Sequential()\n", "model.add(Dense(output_dim=1000, input_shape=loader.shape))\n", "model.add(Activation(\"relu\"))\n", "model.add(Dense(output_dim=100))\n", "model.add(Activation(\"relu\"))\n", "model.add(Dense(output_dim=labels_onehot.shape[1]))\n", "model.add(Activation(\"softmax\"))\n", "\n", "optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True)\n", "model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])\n", "\n", "model.fit_generator(SampleLoader(train, batch_size=64), train.size, nb_epoch=2, **params)\n", "loss = model.evaluate_generator(SampleLoader(val, batch_size=64), val.size, **params)\n", "loss = model.evaluate_generator(SampleLoader(test, batch_size=64), test.size, **params)\n", "#Y = model.predict_generator(SampleLoader(test, batch_size=64), test.size, **params);\n", "\n", "loss" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### 2.2 Convolutional neural network\n", "\n", "* Architecture: [End-to-end learning for music audio](http://www.mirlab.org/conference_papers/International_Conference/ICASSP%202014/papers/p7014-dieleman.pdf), Sander Dieleman, Benjamin Schrauwen.\n", "* Missing: track segmentation and class averaging (majority voting).\n", "* The paper compares a strided convolution as the first layer against log-scaled mel-spectrograms.\n", "* Larger net: http://benanne.github.io/2014/08/05/spotify-cnns.html" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "loader = utils.FfmpegLoader(sampling_rate=16000)\n", "#loader = utils.LibrosaLoader(sampling_rate=16000)\n", "SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)\n", "\n", "keras.backend.clear_session()\n",
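"\n", "# Dieleman & Schrauwen-style network on raw audio: the first strided Conv1D (128 filters of\n", "# length 512 with stride 512) acts as a learned filterbank; two conv/pool stages then summarize\n", "# over time before the dense layers and the softmax output.\n",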
"\n", "model = keras.models.Sequential()\n", "model.add(Reshape((-1, 1), input_shape=loader.shape))\n", "print(model.output_shape)\n", "\n", "model.add(Conv1D(128, 512, subsample_length=512))\n", "print(model.output_shape)\n", "model.add(Activation(\"relu\"))\n", "\n", "model.add(Conv1D(32, 8))\n", "print(model.output_shape)\n", "model.add(Activation(\"relu\"))\n", "model.add(MaxPooling1D(4))\n", "\n", "model.add(Conv1D(32, 8))\n", "print(model.output_shape)\n", "model.add(Activation(\"relu\"))\n", "model.add(MaxPooling1D(4))\n", "\n", "print(model.output_shape)\n", "#model.add(Dropout(0.25))\n", "model.add(Flatten())\n", "print(model.output_shape)\n", "model.add(Dense(100))\n", "model.add(Activation(\"relu\"))\n", "print(model.output_shape)\n", "model.add(Dense(labels_onehot.shape[1]))\n", "model.add(Activation(\"softmax\"))\n", "print(model.output_shape)\n", "\n", "optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)\n", "#optimizer = keras.optimizers.Adam()#lr=1e-5)#, momentum=0.9, nesterov=True)\n", "model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])\n", "\n", "model.fit_generator(SampleLoader(train, batch_size=10), train.size, nb_epoch=20, **params)\n", "loss = model.evaluate_generator(SampleLoader(val, batch_size=10), val.size, **params)\n", "loss = model.evaluate_generator(SampleLoader(test, batch_size=10), test.size, **params)\n", "\n", "loss" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.3 Recurrent neural network" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3 Deep learning on extracted audio features\n", "\n", "Look at:\n", "* Pre-processing in Keras: https://github.com/keunwoochoi/kapre\n", "* Convolutional Recurrent Neural Networks for Music Classification: https://github.com/keunwoochoi/icassp_2017\n", "* Music Auto-Tagger: https://github.com/keunwoochoi/music-auto_tagging-keras\n", "* Pre-processor: https://github.com/bmcfee/pumpp" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3.1 ConvNet on MFCC\n", "\n", "* Architecture: [Automatic Musical Pattern Feature Extraction Using Convolutional Neural Network](http://www.iaeng.org/publication/IMECS2010/IMECS2010_pp546-550.pdf), Tom LH. Li, Antoni B. Chan and Andy HW. 
Chun\n", "* Missing: track segmentation and majority voting.\n", "* Best seen: 17.6%" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class MfccLoader(utils.Loader):\n", "    raw_loader = utils.FfmpegLoader(sampling_rate=22050)\n", "    #shape = (13, 190) # For segmented tracks.\n", "    shape = (13, 2582)\n", "    def load(self, filename):\n", "        import librosa\n", "        x = self.raw_loader.load(filename)\n", "        # Each MFCC frame spans 23ms on the audio signal with 50% overlap with the adjacent frames.\n", "        mfcc = librosa.feature.mfcc(x, sr=22050, n_mfcc=13, n_fft=512, hop_length=256)\n", "        return mfcc\n", "\n", "loader = MfccLoader()\n", "SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)\n", "loader.load(utils.get_audio_path(AUDIO_DIR, 2))[0].shape" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "keras.backend.clear_session()\n", "\n", "model = keras.models.Sequential()\n", "model.add(Reshape((*loader.shape, 1), input_shape=loader.shape))\n", "print(model.output_shape)\n", "\n", "model.add(Conv2D(3, 13, 10, subsample=(1, 4)))\n", "model.add(Activation(\"relu\"))\n", "print(model.output_shape)\n", "\n", "model.add(Conv2D(15, 1, 10, subsample=(1, 4)))\n", "model.add(Activation(\"relu\"))\n", "print(model.output_shape)\n", "\n", "model.add(Conv2D(65, 1, 10, subsample=(1, 4)))\n", "model.add(Activation(\"relu\"))\n", "print(model.output_shape)\n", "\n", "model.add(Flatten())\n", "print(model.output_shape)\n", "model.add(Dense(labels_onehot.shape[1]))\n", "model.add(Activation(\"softmax\"))\n", "print(model.output_shape)\n", "\n", "optimizer = keras.optimizers.SGD(1e-3)#lr=0.01, momentum=0.9, nesterov=True)\n", "#optimizer = keras.optimizers.Adam()#lr=1e-5)#\n", "model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])\n", "\n", "model.fit_generator(SampleLoader(train, batch_size=16), train.size, nb_epoch=20, **params)\n", "loss = model.evaluate_generator(SampleLoader(val, batch_size=16), val.size, **params)\n", "loss = model.evaluate_generator(SampleLoader(test, batch_size=16), test.size, **params)\n", "#Y = model.predict_generator(loader, test.size, pickle_safe=True, nb_worker=NB_WORKER, max_q_size=5)\n", "\n", "loss" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 1 }