Marzi... 0 \n",
"10000 \n",
"\n",
" \n",
" \n",
" | \n",
" genre_color | \n",
" genre_handle | \n",
" genre_parent_id | \n",
" genre_title | \n",
"
\n",
" \n",
" genre_id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" #006666 | \n",
" Avant-Garde | \n",
" 38.0 | \n",
" Avant-Garde | \n",
"
\n",
" \n",
" 2 | \n",
" #CC3300 | \n",
" International | \n",
" NaN | \n",
" International | \n",
"
\n",
" \n",
" 3 | \n",
" #000099 | \n",
" Blues | \n",
" NaN | \n",
" Blues | \n",
"
\n",
" \n",
" 4 | \n",
" #990099 | \n",
" Jazz | \n",
" NaN | \n",
" Jazz | \n",
"
\n",
" \n",
" 5 | \n",
" #8A8A65 | \n",
" Classical | \n",
" NaN | \n",
" Classical | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
" genre_color genre_handle genre_parent_id genre_title\n",
"genre_id \n",
"1 #006666 Avant-Garde 38.0 Avant-Garde\n",
"2 #CC3300 International NaN International\n",
"3 #000099 Blues NaN Blues\n",
"4 #990099 Jazz NaN Jazz\n",
"5 #8A8A65 Classical NaN Classical"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"N = 5\n",
"ipd.display(tracks.head(N))\n",
"ipd.display(albums.head(N))\n",
"ipd.display(artists.head(N))\n",
"ipd.display(genres.head(N))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2 Format metadata\n",
"\n",
"Todo:\n",
"* Sanitize values, e.g. list of words for tags, valid links in `artist_wikipedia_page`, remove html markup in free-form text.\n",
" * Clean tags. E.g. some tags are just artist names.\n",
"* Fill metadata about encoding: length, number of samples, sample rate, bit rate, channels (mono/stereo), 16bits?.\n",
"* Update duration from audio\n",
" * 2624 is marked as 05:05:50 (18350s) although it is reported as 00:21:15.15 by ffmpeg.\n",
" * 112067: 3714s --> 01:59:55.06, 112808: 3718s --> 01:59:59.56\n",
" * ffmpeg: Estimating duration from bitrate, this may be inaccurate\n",
" * Solution, decode the complete mp3: `ffmpeg -i input.mp3 -f null -`"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 null, 109727 non-null\n"
]
},
{
"data": {
"text/plain": [
"[] 85881\n",
"['interiors c1964', 'existential', 'hardcore-punk', 'pop-punk', 'punk-rock', 'internet boyfriend', 'rew starr', 'public domain', 'creative commons', 'microsong challenge'] 314\n",
"['classwar karaoke'] 239\n",
"['all styles experimental'] 215\n",
"['improvisation', 'not normal music', 'all styles experimental'] 195\n",
"['era 1'] 176\n",
"['all styles experimental', 'harsh noise', 'not normal music'] 150\n",
"['music is a belief', 'chary', 'nishad', 'uju', 'ibiene', 'nazeem', 'deepu', 'maneet', 'azedine', 'mohammad'] 140\n",
"['new zealand'] 140\n",
"['improvisation', 'all styles experimental', 'not normal music'] 128\n",
"Name: tags, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df, column = tracks, 'tags'\n",
"null = sum(df[column].isnull())\n",
"print('{} null, {} non-null'.format(null, df.shape[0] - null))\n",
"df[column].value_counts().head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1 Tracks"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"drop = [\n",
" 'license_image_file', 'license_image_file_large', 'license_parent_id', 'license_url', # keep title only\n",
" 'track_file', 'track_image_file', # used to download only\n",
" 'track_url', 'album_url', 'artist_url', # only relevant on website\n",
" 'track_copyright_c', 'track_copyright_p', # present for ~1000 tracks only\n",
" # 'track_composer', 'track_lyricist', 'track_publisher', # present for ~4000, <1000 and <2000 tracks\n",
" 'track_disc_number', # different from 1 for <1000 tracks\n",
" 'track_explicit', 'track_explicit_notes', # present for <4000 tracks\n",
" 'track_instrumental' # ~6000 tracks have a 1, there is an instrumental genre\n",
"]\n",
"tracks.drop(drop, axis=1, inplace=True)\n",
"tracks.rename(columns={'license_title': 'track_license', 'tags': 'track_tags'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"tracks['track_duration'] = tracks['track_duration'].map(creation.convert_duration)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def convert_datetime(df, column, format=None):\n",
" df[column] = pd.to_datetime(df[column], infer_datetime_format=True, format=format)\n",
"convert_datetime(tracks, 'track_date_created')\n",
"convert_datetime(tracks, 'track_date_recorded')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"tracks['album_id'].fillna(-1, inplace=True)\n",
"tracks['track_bit_rate'].fillna(-1, inplace=True)\n",
"tracks = tracks.astype({'album_id': int, 'track_bit_rate': int})"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def convert_genres(genres):\n",
" genres = ast.literal_eval(genres)\n",
" return [int(genre['genre_id']) for genre in genres]\n",
"\n",
"tracks['track_genres'].fillna('[]', inplace=True)\n",
"tracks['track_genres'] = tracks['track_genres'].map(convert_genres)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['album_id', 'album_title', 'artist_id', 'artist_name', 'artist_website',\n",
" 'track_license', 'track_tags', 'track_bit_rate', 'track_comments',\n",
" 'track_composer', 'track_date_created', 'track_date_recorded',\n",
" 'track_duration', 'track_favorites', 'track_genres',\n",
" 'track_information', 'track_interest', 'track_language_code',\n",
" 'track_listens', 'track_lyricist', 'track_number', 'track_publisher',\n",
" 'track_title'],\n",
" dtype='object')"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tracks.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.2 Albums"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"drop = [\n",
" 'artist_name', 'album_url', 'artist_url', # in tracks already (though it can be different)\n",
" 'album_handle',\n",
" 'album_image_file', 'album_images', # todo: shall be downloaded\n",
" #'album_producer', 'album_engineer', # present for ~2400 albums only\n",
"]\n",
"albums.drop(drop, axis=1, inplace=True)\n",
"albums.rename(columns={'tags': 'album_tags'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"convert_datetime(albums, 'album_date_created')\n",
"convert_datetime(albums, 'album_date_released')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['album_comments', 'album_date_created', 'album_date_released',\n",
" 'album_engineer', 'album_favorites', 'album_information',\n",
" 'album_listens', 'album_producer', 'album_title', 'album_tracks',\n",
" 'album_type', 'album_tags'],\n",
" dtype='object')"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"albums.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.3 Artists"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"drop = [\n",
" 'artist_website', 'artist_url', # in tracks already (though it can be different)\n",
" 'artist_handle',\n",
" 'artist_image_file', 'artist_images', # todo: shall be downloaded\n",
" 'artist_donation_url', 'artist_paypal_name', 'artist_flattr_name', # ~1600 & ~400 & ~70, not relevant\n",
" 'artist_contact', # ~1500, not very useful data\n",
" # 'artist_active_year_begin', 'artist_active_year_end', # ~1400, ~500 only\n",
" # 'artist_associated_labels', # ~1000\n",
" # 'artist_related_projects', # only ~800, but can be combined with bio\n",
"]\n",
"artists.drop(drop, axis=1, inplace=True)\n",
"artists.rename(columns={'tags': 'artist_tags'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"convert_datetime(artists, 'artist_date_created')\n",
"for column in ['artist_active_year_begin', 'artist_active_year_end']:\n",
" artists[column].replace(0.0, np.nan, inplace=True)\n",
" convert_datetime(artists, column, format='%Y.0')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['artist_active_year_begin', 'artist_active_year_end',\n",
" 'artist_associated_labels', 'artist_bio', 'artist_comments',\n",
" 'artist_date_created', 'artist_favorites', 'artist_latitude',\n",
" 'artist_location', 'artist_longitude', 'artist_members', 'artist_name',\n",
" 'artist_related_projects', 'artist_wikipedia_page', 'artist_tags'],\n",
" dtype='object')"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"artists.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.4 Merge DataFrames"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"not_found['albums'].remove(None)\n",
"not_found['albums'].append(-1)\n",
"not_found['albums'] = [int(i) for i in not_found['albums']]\n",
"not_found['artists'] = [int(i) for i in not_found['artists']]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3674 tracks without extended album information (1041 tracks without album_id)\n"
]
}
],
"source": [
"tracks = tracks.merge(albums, left_on='album_id', right_index=True, sort=False, how='left', suffixes=('', '_dup'))\n",
"\n",
"n = sum(tracks['album_title_dup'].isnull())\n",
"print('{} tracks without extended album information ({} tracks without album_id)'.format(\n",
" n, sum(tracks['album_id'] == -1)))\n",
"assert sum(tracks['album_id'].isin(not_found['albums'])) == n\n",
"assert sum(tracks['album_title'] != tracks['album_title_dup']) == n\n",
"\n",
"tracks.drop('album_title_dup', axis=1, inplace=True)\n",
"assert not any('dup' in col for col in tracks.columns)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Album artist can be different than track artist. Keep track artist.\n",
"#tracks[tracks['artist_name'] != tracks['artist_name_dup']].select(lambda x: 'artist_name' in x, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"974 tracks without extended artist information\n"
]
}
],
"source": [
"tracks = tracks.merge(artists, left_on='artist_id', right_index=True, sort=False, how='left', suffixes=('', '_dup'))\n",
"\n",
"n = sum(tracks['artist_name_dup'].isnull())\n",
"print('{} tracks without extended artist information'.format(n))\n",
"assert sum(tracks['artist_id'].isin(not_found['artists'])) == n\n",
"assert sum(tracks['artist_name'] != tracks[('artist_name_dup')]) == n\n",
"\n",
"tracks.drop('artist_name_dup', axis=1, inplace=True)\n",
"assert not any('dup' in col for col in tracks.columns)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"columns = []\n",
"for name in tracks.columns:\n",
" names = name.split('_')\n",
" columns.append((names[0], '_'.join(names[1:])))\n",
"tracks.columns = pd.MultiIndex.from_tuples(columns)\n",
"assert all(label in ['track', 'album', 'artist'] for label in tracks.columns.get_level_values(0))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Todo: fill other columns ?\n",
"tracks['album', 'tags'].fillna('[]', inplace=True)\n",
"tracks['artist', 'tags'].fillna('[]', inplace=True)\n",
"\n",
"columns = [('album', 'favorites'), ('album', 'comments'), ('album', 'listens'), ('album', 'tracks'),\n",
" ('artist', 'favorites'), ('artist', 'comments')]\n",
"for column in columns:\n",
" tracks[column].fillna(-1, inplace=True)\n",
"columns = {column: int for column in columns}\n",
"tracks = tracks.astype(columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3 Data cleaning\n",
"\n",
"Todo: duplicates (metadata and audio)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 lost, 109727 left\n"
]
}
],
"source": [
"def keep(index, df):\n",
" old = len(df)\n",
" df = df.loc[index]\n",
" new = len(df)\n",
" print('{} lost, {} left'.format(old - new, new))\n",
" return df\n",
"\n",
"tracks = keep(tracks.index, tracks)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"180 lost, 109547 left\n",
"286 lost, 109261 left\n"
]
}
],
"source": [
"# Audio not found or could not be trimmed.\n",
"tracks = keep(tracks.index.difference(not_found['audio']), tracks)\n",
"tracks = keep(tracks.index.difference(not_found['clips']), tracks)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Errors from the `features.py` script.\n",
"* IndexError('index 0 is out of bounds for axis 0 with size 0',)\n",
" * ffmpeg: Header missing\n",
" * ffmpeg: Could not find codec parameters for stream 0 (Audio: mp3, 0 channels, s16p): unspecified frame size. Consider increasing the value for the 'analyzeduration' and 'probesize' options\n",
" * tids: 117759\n",
"* NoBackendError()\n",
" * ffmpeg: Format mp3 detected only with low score of 1, misdetection possible!\n",
" * tids: 80015, 115235\n",
"* UserWarning('Trying to estimate tuning from empty frequency set.',)\n",
" * librosa error\n",
" * tids: 1440, 26436, 38903, 57603, 62095, 62954, 62956, 62957, 62959, 62971, 86079, 96426, 104623, 106719, 109714, 114501, 114528, 118003, 118004, 127827, 130298, 130296, 131076, 135804, 154923\n",
"* ParameterError('Filter pass-band lies beyond Nyquist',)\n",
" * librosa error\n",
" * tids: 152204, 28106, 29166, 29167, 29169, 29168, 29170, 29171, 29172, 29173, 29179, 43903, 56757, 59361, 75461, 92346, 92345, 92347, 92349, 92350, 92351, 92353, 92348, 92352, 92354, 92355, 92356, 92358, 92359, 92361, 92360, 114448, 136486, 144769, 144770, 144771, 144773, 144774, 144775, 144778, 144776, 144777"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"71 lost, 109190 left\n"
]
}
],
"source": [
"# Feature extraction failed.\n",
"FAILED = [1440, 26436, 28106, 29166, 29167, 29168, 29169, 29170, 29171, 29172,\n",
" 29173, 29179, 38903, 43903, 56757, 57603, 59361, 62095, 62954, 62956,\n",
" 62957, 62959, 62971, 75461, 80015, 86079, 92345, 92346, 92347, 92348,\n",
" 92349, 92350, 92351, 92352, 92353, 92354, 92355, 92356, 92357, 92358,\n",
" 92359, 92360, 92361, 96426, 104623, 106719, 109714, 114448, 114501,114528,\n",
" 115235, 117759, 118003, 118004, 127827, 130296, 130298, 131076, 135804, 136486,\n",
" 144769, 144770, 144771, 144773, 144774, 144775, 144776, 144777, 144778, 152204,\n",
" 154923]\n",
"tracks = keep(tracks.index.difference(FAILED), tracks)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2616 lost, 106574 left\n",
"114 licenses\n"
]
}
],
"source": [
"# License forbids redistribution.\n",
"tracks = keep(tracks['track', 'license'] != 'FMA-Limited: Download Only', tracks)\n",
"print('{} licenses'.format(len(tracks[('track', 'license')].unique())))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"#sum(tracks['track', 'title'].duplicated())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4 Genres"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"genres.drop(['genre_handle', 'genre_color'], axis=1, inplace=True)\n",
"genres.rename(columns={'genre_parent_id': 'parent', 'genre_title': 'title'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"genres['parent'].fillna(0, inplace=True)\n",
"genres = genres.astype({'parent': int})"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# 13 (Easy Listening) has parent 126 which is missing\n",
"# --> a root genre on the website, although not in the genre menu\n",
"genres.at[13, 'parent'] = 0\n",
"\n",
"# 580 (Abstract Hip-Hop) has parent 1172 which is missing\n",
"# --> listed as child of Hip-Hop on the website\n",
"genres.at[580, 'parent'] = 21\n",
"\n",
"# 810 (Nu-Jazz) has parent 51 which is missing\n",
"# --> listed as child of Easy Listening on website\n",
"genres.at[810, 'parent'] = 13\n",
"\n",
"# 763 (Holiday) has parent 763 which is itself\n",
"# --> listed as child of Sound Effects on website\n",
"genres.at[763, 'parent'] = 16\n",
"\n",
"# Todo: should novelty be under Experimental? It is alone on website."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"34 tracks have genre 806\n"
]
}
],
"source": [
"# Genre 806 (hiphop) should not exist. Replace it by 21 (Hip-Hop).\n",
"print('{} tracks have genre 806'.format(\n",
" sum(tracks['track', 'genres'].map(lambda genres: 806 in genres))))\n",
"def change_genre(genres):\n",
" return [genre if genre != 806 else 21 for genre in genres]\n",
"tracks['track', 'genres'] = tracks['track', 'genres'].map(change_genre)\n",
"genres.drop(806, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"def get_parent(genre, track_all_genres=None):\n",
" parent = genres.at[genre, 'parent']\n",
" if track_all_genres is not None:\n",
" track_all_genres.append(genre)\n",
" return genre if parent == 0 else get_parent(parent, track_all_genres)\n",
"\n",
"# Get all genres, i.e. all genres encountered when walking from leafs to roots.\n",
"def get_all_genres(track_genres):\n",
" track_all_genres = list()\n",
" for genre in track_genres:\n",
" get_parent(genre, track_all_genres)\n",
" return list(set(track_all_genres))\n",
"\n",
"tracks['track', 'genres_all'] = tracks['track', 'genres'].map(get_all_genres)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" parent | \n",
" title | \n",
" #tracks | \n",
"
\n",
" \n",
" genre_id | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 175 | \n",
" 86 | \n",
" Bollywood | \n",
" 0 | \n",
"
\n",
" \n",
" 178 | \n",
" 4 | \n",
" Be-Bop | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" parent title #tracks\n",
"genre_id \n",
"175 86 Bollywood 0\n",
"178 4 Be-Bop 0"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of tracks per genre.\n",
"def count_genres(subset=tracks.index):\n",
" count = pd.Series(0, index=genres.index)\n",
" for _, track_all_genres in tracks.loc[subset, ('track', 'genres_all')].items():\n",
" for genre in track_all_genres:\n",
" count[genre] += 1\n",
" return count\n",
"\n",
"genres['#tracks'] = count_genres()\n",
"genres[genres['#tracks'] == 0]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"def get_top_genre(track_genres):\n",
" top_genres = set(genres.at[genres.at[genre, 'top_level'], 'title'] for genre in track_genres)\n",
" return top_genres.pop() if len(top_genres) == 1 else np.nan\n",
"\n",
"# Top-level genre.\n",
"genres['top_level'] = genres.index.map(get_parent)\n",
"tracks['track', 'genre_top'] = tracks['track', 'genres'].map(get_top_genre)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" parent | \n",
" title | \n",
" #tracks | \n",
" top_level | \n",
"
\n",
" \n",
" genre_id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 38 | \n",
" Avant-Garde | \n",
" 8693 | \n",
" 38 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" International | \n",
" 5271 | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" Blues | \n",
" 1752 | \n",
" 3 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" Jazz | \n",
" 4126 | \n",
" 4 | \n",
"
\n",
" \n",
" 5 | \n",
" 0 | \n",
" Classical | \n",
" 4106 | \n",
" 5 | \n",
"
\n",
" \n",
" 6 | \n",
" 38 | \n",
" Novelty | \n",
" 914 | \n",
" 38 | \n",
"
\n",
" \n",
" 7 | \n",
" 20 | \n",
" Comedy | \n",
" 217 | \n",
" 20 | \n",
"
\n",
" \n",
" 8 | \n",
" 0 | \n",
" Old-Time / Historic | \n",
" 868 | \n",
" 8 | \n",
"
\n",
" \n",
" 9 | \n",
" 0 | \n",
" Country | \n",
" 1987 | \n",
" 9 | \n",
"
\n",
" \n",
" 10 | \n",
" 0 | \n",
" Pop | \n",
" 13845 | \n",
" 10 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" parent title #tracks top_level\n",
"genre_id \n",
"1 38 Avant-Garde 8693 38\n",
"2 0 International 5271 2\n",
"3 0 Blues 1752 3\n",
"4 0 Jazz 4126 4\n",
"5 0 Classical 4106 5\n",
"6 38 Novelty 914 38\n",
"7 20 Comedy 217 20\n",
"8 0 Old-Time / Historic 868 8\n",
"9 0 Country 1987 9\n",
"10 0 Pop 13845 10"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"genres.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5 Subsets: large, medium, small"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.1 Large\n",
"\n",
"Main characteristic: the full set with clips trimmed to a manageable size."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.2 Medium\n",
"\n",
"Main characteristic: clean metadata (includes 1 top-level genre) and quality audio."
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"fma_medium = pd.DataFrame(tracks)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3529 lost, 103045 left\n",
"598 lost, 102447 left\n",
"1 lost, 102446 left\n",
"674 lost, 101772 left\n",
"65 lost, 101707 left\n"
]
}
],
"source": [
"# Missing meta-information.\n",
"\n",
"# Missing extended album and artist information.\n",
"fma_medium = keep(~fma_medium['album', 'id'].isin(not_found['albums']), fma_medium)\n",
"fma_medium = keep(~fma_medium['artist', 'id'].isin(not_found['artists']), fma_medium)\n",
"\n",
"# Untitled track or album.\n",
"fma_medium = keep(~fma_medium['track', 'title'].isnull(), fma_medium)\n",
"fma_medium = keep(fma_medium['track', 'title'].map(lambda x: 'untitled' in x.lower()) == False, fma_medium)\n",
"fma_medium = keep(fma_medium['album', 'title'].map(lambda x: 'untitled' in x.lower()) == False, fma_medium)\n",
"\n",
"# One tag is often just the artist name. Tags too scarce for tracks and albums.\n",
"#keep(fma_medium['artist', 'tags'].map(len) >= 2, fma_medium)\n",
"\n",
"# Too scarce.\n",
"#fma_medium = keep(~fma_medium['album', 'information'].isnull(), fma_medium)\n",
"#fma_medium = keep(~fma_medium['artist', 'bio'].isnull(), fma_medium)\n",
"#fma_medium = keep(~fma_medium['artist', 'website'].isnull(), fma_medium)\n",
"#fma_medium = keep(~fma_medium['artist', 'wikipedia_page'].isnull(), fma_medium)\n",
"\n",
"# Too scarce.\n",
"#fma_medium = keep(~fma_medium['artist', 'location'].isnull(), fma_medium)\n",
"#fma_medium = keep(~fma_medium['artist', 'latitude'].isnull(), fma_medium)\n",
"#fma_medium = keep(~fma_medium['artist', 'longitude'].isnull(), fma_medium)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1326 lost, 100381 left\n"
]
}
],
"source": [
"# Technical quality.\n",
"# Todo: sample rate\n",
"fma_medium = keep(fma_medium['track', 'bit_rate'] > 100000, fma_medium)\n",
"\n",
"# Choosing standard bit rates discards all VBR.\n",
"#fma_medium = keep(fma_medium['track', 'bit_rate'].isin([320000, 256000, 192000, 160000, 128000]), fma_medium)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4736 lost, 95645 left\n",
"5399 lost, 90246 left\n",
"466 lost, 89780 left\n",
"5353 lost, 84427 left\n"
]
}
],
"source": [
"fma_medium = keep(fma_medium['track', 'duration'] >= 60, fma_medium)\n",
"fma_medium = keep(fma_medium['track', 'duration'] <= 600, fma_medium)\n",
"\n",
"fma_medium = keep(fma_medium['album', 'tracks'] >= 1, fma_medium)\n",
"fma_medium = keep(fma_medium['album', 'tracks'] <= 50, fma_medium)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4941 lost, 79486 left\n",
"1064 lost, 78422 left\n",
"1769 lost, 76653 left\n"
]
}
],
"source": [
"# Lower popularity bound.\n",
"fma_medium = keep(fma_medium['track', 'listens'] >= 100, fma_medium)\n",
"fma_medium = keep(fma_medium['track', 'interest'] >= 200, fma_medium)\n",
"fma_medium = keep(fma_medium['album', 'listens'] >= 1000, fma_medium);\n",
"\n",
"# Favorites and comments are very scarce.\n",
"#fma_medium = keep(fma_medium['artist', 'favorites'] >= 1, fma_medium)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"42495 lost, 34158 left\n"
]
}
],
"source": [
"# Targeted genre classification.\n",
"fma_medium = keep(~fma_medium['track', 'genre_top'].isnull(), fma_medium);\n",
"#keep(fma_medium['track', 'genres'].map(len) == 1, fma_medium);"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9158 lost, 25000 left\n"
]
}
],
"source": [
"# Adjust size with popularity measure. Should be of better quality.\n",
"N_TRACKS = 25000\n",
"\n",
"# Observations\n",
"# * More albums killed than artists --> be sure not to kill diversity\n",
"# * Favorites and preterites genres differently --> do it per genre?\n",
"# Normalization\n",
"# * mean, median, std, max\n",
"# * tracks per album or artist\n",
"# Test\n",
"# * 4/5 of same tracks were selected with various set of measures\n",
"# * <5% diff with max and mean\n",
"\n",
"popularity_measures = [('track', 'listens'), ('track', 'interest')] # ('album', 'listens')\n",
"# ('track', 'favorites'), ('track', 'comments'),\n",
"# ('album', 'favorites'), ('album', 'comments'),\n",
"# ('artist', 'favorites'), ('artist', 'comments'),\n",
"\n",
"normalization = {measure: fma_medium[measure].max() for measure in popularity_measures}\n",
"def popularity_measure(track):\n",
" return sum(track[measure] / normalization[measure] for measure in popularity_measures)\n",
"fma_medium['popularity_measure'] = fma_medium.apply(popularity_measure, axis=1)\n",
"fma_medium = keep(fma_medium.sort_values('popularity_measure', ascending=False).index[:N_TRACKS], fma_medium)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" genre_id | \n",
" parent | \n",
" #tracks | \n",
" top_level | \n",
" #tracks_medium | \n",
"
\n",
" \n",
" title | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Rock | \n",
" 12 | \n",
" 0 | \n",
" 32923 | \n",
" 12 | \n",
" 7103 | \n",
"
\n",
" \n",
" Electronic | \n",
" 15 | \n",
" 0 | \n",
" 34413 | \n",
" 15 | \n",
" 6314 | \n",
"
\n",
" \n",
" Experimental | \n",
" 38 | \n",
" 0 | \n",
" 38154 | \n",
" 38 | \n",
" 2251 | \n",
"
\n",
" \n",
" Hip-Hop | \n",
" 21 | \n",
" 0 | \n",
" 8389 | \n",
" 21 | \n",
" 2201 | \n",
"
\n",
" \n",
" Folk | \n",
" 17 | \n",
" 0 | \n",
" 12706 | \n",
" 17 | \n",
" 1519 | \n",
"
\n",
" \n",
" Instrumental | \n",
" 1235 | \n",
" 0 | \n",
" 14938 | \n",
" 1235 | \n",
" 1350 | \n",
"
\n",
" \n",
" Pop | \n",
" 10 | \n",
" 0 | \n",
" 13845 | \n",
" 10 | \n",
" 1186 | \n",
"
\n",
" \n",
" International | \n",
" 2 | \n",
" 0 | \n",
" 5271 | \n",
" 2 | \n",
" 1018 | \n",
"
\n",
" \n",
" Classical | \n",
" 5 | \n",
" 0 | \n",
" 4106 | \n",
" 5 | \n",
" 619 | \n",
"
\n",
" \n",
" Old-Time / Historic | \n",
" 8 | \n",
" 0 | \n",
" 868 | \n",
" 8 | \n",
" 510 | \n",
"
\n",
" \n",
" Jazz | \n",
" 4 | \n",
" 0 | \n",
" 4126 | \n",
" 4 | \n",
" 384 | \n",
"
\n",
" \n",
" Country | \n",
" 9 | \n",
" 0 | \n",
" 1987 | \n",
" 9 | \n",
" 178 | \n",
"
\n",
" \n",
" Soul-RnB | \n",
" 14 | \n",
" 0 | \n",
" 1499 | \n",
" 14 | \n",
" 154 | \n",
"
\n",
" \n",
" Spoken | \n",
" 20 | \n",
" 0 | \n",
" 1876 | \n",
" 20 | \n",
" 118 | \n",
"
\n",
" \n",
" Blues | \n",
" 3 | \n",
" 0 | \n",
" 1752 | \n",
" 3 | \n",
" 74 | \n",
"
\n",
" \n",
" Easy Listening | \n",
" 13 | \n",
" 0 | \n",
" 730 | \n",
" 13 | \n",
" 21 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" genre_id parent #tracks top_level #tracks_medium\n",
"title \n",
"Rock 12 0 32923 12 7103\n",
"Electronic 15 0 34413 15 6314\n",
"Experimental 38 0 38154 38 2251\n",
"Hip-Hop 21 0 8389 21 2201\n",
"Folk 17 0 12706 17 1519\n",
"Instrumental 1235 0 14938 1235 1350\n",
"Pop 10 0 13845 10 1186\n",
"International 2 0 5271 2 1018\n",
"Classical 5 0 4106 5 619\n",
"Old-Time / Historic 8 0 868 8 510\n",
"Jazz 4 0 4126 4 384\n",
"Country 9 0 1987 9 178\n",
"Soul-RnB 14 0 1499 14 154\n",
"Spoken 20 0 1876 20 118\n",
"Blues 3 0 1752 3 74\n",
"Easy Listening 13 0 730 13 21"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tmp = genres[genres['parent'] == 0].reset_index().set_index('title')\n",
"tmp['#tracks_medium'] = fma_medium['track', 'genre_top'].value_counts()\n",
"tmp.sort_values('#tracks_medium', ascending=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.3 Small\n",
"\n",
"Main characteristic: genre balanced (and echonest features).\n",
"\n",
"Choices:\n",
"* 8 genres with 1000 tracks --> 8,000 tracks\n",
"* 10 genres with 500 tracks --> 5,000 tracks\n",
"\n",
"Todo:\n",
"* Download more echonest features so that all tracks can have them. Otherwise intersection of tracks with echonest features and one top-level genre is too small."
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2058 lost, 22942 left\n"
]
}
],
"source": [
"N_GENRES = 8\n",
"N_TRACKS = 1000\n",
"\n",
"top_genres = tmp.sort_values('#tracks_medium', ascending=False)[:N_GENRES].index\n",
"fma_small = pd.DataFrame(fma_medium)\n",
"fma_small = keep(fma_small['track', 'genre_top'].isin(top_genres), fma_small)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"to_keep = []\n",
"for genre in top_genres:\n",
" subset = fma_small[fma_small['track', 'genre_top'] == genre]\n",
" drop = subset.sort_values('popularity_measure').index[:-N_TRACKS]\n",
" fma_small.drop(drop, inplace=True)\n",
"assert len(fma_small) == N_GENRES * N_TRACKS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.4 Subset indication"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"SUBSETS = ('small', 'medium', 'large')\n",
"tracks['set', 'subset'] = pd.Series().astype('category', categories=SUBSETS, ordered=True)\n",
"tracks.loc[tracks.index, ('set', 'subset')] = 'large'\n",
"tracks.loc[fma_medium.index, ('set', 'subset')] = 'medium'\n",
"tracks.loc[fma_small.index, ('set', 'subset')] = 'small'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.5 Echonest"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 lost, 14511 left\n",
"205 lost, 14306 left\n",
"239 lost, 14067 left\n",
"938 lost, 13129 left\n",
"7848 lost, 5281 left\n",
"11835 lost, 1294 left\n"
]
}
],
"source": [
"echonest = pd.read_csv('raw_echonest.csv', index_col=0, header=[0, 1, 2])\n",
"echonest = keep(~echonest['echonest', 'temporal_features'].isnull().any(axis=1), echonest)\n",
"echonest = keep(~echonest['echonest', 'audio_features'].isnull().any(axis=1), echonest)\n",
"echonest = keep(~echonest['echonest', 'social_features'].isnull().any(axis=1), echonest)\n",
"\n",
"echonest = keep(echonest.index.isin(tracks.index), echonest);\n",
"keep(echonest.index.isin(fma_medium.index), echonest);\n",
"keep(echonest.index.isin(fma_small.index), echonest);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6 Splits: training, validation, test\n",
"\n",
"Take into account:\n",
"* Artists may only appear on one side.\n",
"* Stratification: ideally, all characteristics (#tracks per artist, duration, sampling rate, information, bio) and targets (genres, tags) should be equally distributed."
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"for genre in genres.index:\n",
" tracks['genre', genres.at[genre, 'title']] = tracks['track', 'genres_all'].map(lambda genres: genre in genres)\n",
"\n",
"SPLITS = ('training', 'test', 'validation')\n",
"PERCENTAGES = (0.8, 0.1, 0.1)\n",
"tracks['set', 'split'] = pd.Series().astype('category', categories=SPLITS)\n",
"\n",
"for subset in SUBSETS:\n",
"\n",
" tracks_subset = tracks['set', 'subset'] <= subset\n",
"\n",
" # Consider only top-level genres for small and medium.\n",
" genre_list = list(tracks.loc[tracks_subset, ('track', 'genre_top')].unique())\n",
" if subset == 'large':\n",
" genre_list = list(genres['title']) \n",
"\n",
" while True:\n",
" if len(genre_list) == 0:\n",
" break\n",
"\n",
" # Choose most constrained genre, i.e. genre with the least unassigned artists.\n",
" tracks_unsplit = tracks['set', 'split'].isnull()\n",
" count = tracks[tracks_subset & tracks_unsplit].set_index(('artist', 'id'), append=True)['genre']\n",
" count = count.groupby(level=1).sum().astype(np.bool).sum()\n",
" genre = np.argmin(count[genre_list])\n",
" genre_list.remove(genre)\n",
" \n",
" # Given genre, select artists.\n",
" tracks_genre = tracks['genre', genre] == 1\n",
" artists = tracks.loc[tracks_genre & tracks_subset & tracks_unsplit, ('artist', 'id')].value_counts()\n",
" #print('-->', genre, len(artists))\n",
"\n",
" current = {split: np.sum(tracks_genre & tracks_subset & (tracks['set', 'split'] == split)) for split in SPLITS}\n",
"\n",
" # Assign artists with most tracks first.\n",
" for artist, count in artists.items():\n",
" choice = np.argmin([current[split] / percentage for split, percentage in zip(SPLITS, PERCENTAGES)])\n",
" current[SPLITS[choice]] += count\n",
" #assert tracks.loc[tracks['artist', 'id'] == artist, ('set', 'split')].isnull().all()\n",
" tracks.loc[tracks['artist', 'id'] == artist, ('set', 'split')] = SPLITS[choice]\n",
"\n",
"# Tracks without genre can only serve as unlabeled data for training, e.g. for semi-supervised algorithms.\n",
"no_genres = tracks['track', 'genres_all'].map(lambda genres: len(genres) == 0)\n",
"no_split = tracks['set', 'split'].isnull()\n",
"assert not (no_split & ~no_genres).any()\n",
"tracks.loc[no_split, ('set', 'split')] = 'training'\n",
"\n",
"# Not needed any more.\n",
"tracks.drop('genre', axis=1, level=0, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7 Store"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"for dataset in 'tracks', 'genres', 'echonest':\n",
" eval(dataset).sort_index(axis=0, inplace=True)\n",
" eval(dataset).sort_index(axis=1, inplace=True)\n",
" params = dict(float_format='%.10f') if dataset == 'echonest' else dict()\n",
" eval(dataset).to_csv(dataset + '.csv', **params)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"# ./creation.py normalize /path/to/fma\n",
"# ./creation.py zips /path/to/fma"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 8 Description"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"album comments int64\n",
" date_created datetime64[ns]\n",
" date_released datetime64[ns]\n",
" engineer object\n",
" favorites int64\n",
" id int64\n",
" information category\n",
" listens int64\n",
" producer object\n",
" tags object\n",
" title object\n",
" tracks int64\n",
" type category\n",
"artist active_year_begin datetime64[ns]\n",
" active_year_end datetime64[ns]\n",
" associated_labels object\n",
" bio category\n",
" comments int64\n",
" date_created datetime64[ns]\n",
" favorites int64\n",
" id int64\n",
" latitude float64\n",
" location object\n",
" longitude float64\n",
" members object\n",
" name object\n",
" related_projects object\n",
" tags object\n",
" website object\n",
" wikipedia_page object\n",
"set split object\n",
" subset category\n",
"track bit_rate int64\n",
" comments int64\n",
" composer object\n",
" date_created datetime64[ns]\n",
" date_recorded datetime64[ns]\n",
" duration int64\n",
" favorites int64\n",
" genre_top category\n",
" genres object\n",
" genres_all object\n",
" information object\n",
" interest int64\n",
" language_code object\n",
" license category\n",
" listens int64\n",
" lyricist object\n",
" number int64\n",
" publisher object\n",
" tags object\n",
" title object\n",
"dtype: object"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tracks = utils.load('tracks.csv')\n",
"tracks.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" bit_rate | \n",
" comments | \n",
" composer | \n",
" date_created | \n",
" date_recorded | \n",
" duration | \n",
" favorites | \n",
" genre_top | \n",
" genres | \n",
" genres_all | \n",
" information | \n",
" interest | \n",
" language_code | \n",
" license | \n",
" listens | \n",
" lyricist | \n",
" number | \n",
" publisher | \n",
" tags | \n",
" title | \n",
"
\n",
" \n",
" track_id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 2 | \n",
" 256000 | \n",
" 0 | \n",
" NaN | \n",
" 2008-11-26 01:48:12 | \n",
" 2008-11-26 | \n",
" 168 | \n",
" 2 | \n",
" Hip-Hop | \n",
" [21] | \n",
" [21] | \n",
" NaN | \n",
" 4656 | \n",
" en | \n",
" Attribution-NonCommercial-ShareAlike 3.0 Inter... | \n",
" 1293 | \n",
" NaN | \n",
" 3 | \n",
" NaN | \n",
" [] | \n",
" Food | \n",
"
\n",
" \n",
" 3 | \n",
" 256000 | \n",
" 0 | \n",
" NaN | \n",
" 2008-11-26 01:48:14 | \n",
" 2008-11-26 | \n",
" 237 | \n",
" 1 | \n",
" Hip-Hop | \n",
" [21] | \n",
" [21] | \n",
" NaN | \n",
" 1470 | \n",
" en | \n",
" Attribution-NonCommercial-ShareAlike 3.0 Inter... | \n",
" 514 | \n",
" NaN | \n",
" 4 | \n",
" NaN | \n",
" [] | \n",
" Electric Ave | \n",
"
\n",
" \n",
" 5 | \n",
" 256000 | \n",
" 0 | \n",
" NaN | \n",
" 2008-11-26 01:48:20 | \n",
" 2008-11-26 | \n",
" 206 | \n",
" 6 | \n",
" Hip-Hop | \n",
" [21] | \n",
" [21] | \n",
" NaN | \n",
" 1933 | \n",
" en | \n",
" Attribution-NonCommercial-ShareAlike 3.0 Inter... | \n",
" 1151 | \n",
" NaN | \n",
" 6 | \n",
" NaN | \n",
" [] | \n",
" This World | \n",
"
\n",
" \n",
" 10 | \n",
" 192000 | \n",
" 0 | \n",
" Kurt Vile | \n",
" 2008-11-25 17:49:06 | \n",
" 2008-11-26 | \n",
" 161 | \n",
" 178 | \n",
" Pop | \n",
" [10] | \n",
" [10] | \n",
" NaN | \n",
" 54881 | \n",
" en | \n",
" Attribution-NonCommercial-NoDerivatives (aka M... | \n",
" 50135 | \n",
" NaN | \n",
" 1 | \n",
" NaN | \n",
" [] | \n",
" Freeway | \n",
"
\n",
" \n",
" 20 | \n",
" 256000 | \n",
" 0 | \n",
" NaN | \n",
" 2008-11-26 01:48:56 | \n",
" 2008-01-01 | \n",
" 311 | \n",
" 0 | \n",
" NaN | \n",
" [76, 103] | \n",
" [17, 10, 76, 103] | \n",
" NaN | \n",
" 978 | \n",
" en | \n",
" Attribution-NonCommercial-NoDerivatives (aka M... | \n",
" 361 | \n",
" NaN | \n",
" 3 | \n",
" NaN | \n",
" [] | \n",
" Spiritual Level | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bit_rate comments composer date_created date_recorded \\\n",
"track_id \n",
"2 256000 0 NaN 2008-11-26 01:48:12 2008-11-26 \n",
"3 256000 0 NaN 2008-11-26 01:48:14 2008-11-26 \n",
"5 256000 0 NaN 2008-11-26 01:48:20 2008-11-26 \n",
"10 192000 0 Kurt Vile 2008-11-25 17:49:06 2008-11-26 \n",
"20 256000 0 NaN 2008-11-26 01:48:56 2008-01-01 \n",
"\n",
" duration favorites genre_top genres genres_all \\\n",
"track_id \n",
"2 168 2 Hip-Hop [21] [21] \n",
"3 237 1 Hip-Hop [21] [21] \n",
"5 206 6 Hip-Hop [21] [21] \n",
"10 161 178 Pop [10] [10] \n",
"20 311 0 NaN [76, 103] [17, 10, 76, 103] \n",
"\n",
" information interest language_code \\\n",
"track_id \n",
"2 NaN 4656 en \n",
"3 NaN 1470 en \n",
"5 NaN 1933 en \n",
"10 NaN 54881 en \n",
"20 NaN 978 en \n",
"\n",
" license listens lyricist \\\n",
"track_id \n",
"2 Attribution-NonCommercial-ShareAlike 3.0 Inter... 1293 NaN \n",
"3 Attribution-NonCommercial-ShareAlike 3.0 Inter... 514 NaN \n",
"5 Attribution-NonCommercial-ShareAlike 3.0 Inter... 1151 NaN \n",
"10 Attribution-NonCommercial-NoDerivatives (aka M... 50135 NaN \n",
"20 Attribution-NonCommercial-NoDerivatives (aka M... 361 NaN \n",
"\n",
" number publisher tags title \n",
"track_id \n",
"2 3 NaN [] Food \n",
"3 4 NaN [] Electric Ave \n",
"5 6 NaN [] This World \n",
"10 1 NaN [] Freeway \n",
"20 3 NaN [] Spiritual Level "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" comments | \n",
" date_created | \n",
" date_released | \n",
" engineer | \n",
" favorites | \n",
" id | \n",
" information | \n",
" listens | \n",
" producer | \n",
" tags | \n",
" title | \n",
" tracks | \n",
" type | \n",
"
\n",
" \n",
" track_id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 2 | \n",
" 0 | \n",
" 2008-11-26 01:44:45 | \n",
" 2009-01-05 | \n",
" NaN | \n",
" 4 | \n",
" 1 | \n",
" <p></p> | \n",
" 6073 | \n",
" NaN | \n",
" [] | \n",
" AWOL - A Way Of Life | \n",
" 7 | \n",
" Album | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 2008-11-26 01:44:45 | \n",
" 2009-01-05 | \n",
" NaN | \n",
" 4 | \n",
" 1 | \n",
" <p></p> | \n",
" 6073 | \n",
" NaN | \n",
" [] | \n",
" AWOL - A Way Of Life | \n",
" 7 | \n",
" Album | \n",
"
\n",
" \n",
" 5 | \n",
" 0 | \n",
" 2008-11-26 01:44:45 | \n",
" 2009-01-05 | \n",
" NaN | \n",
" 4 | \n",
" 1 | \n",
" <p></p> | \n",
" 6073 | \n",
" NaN | \n",
" [] | \n",
" AWOL - A Way Of Life | \n",
" 7 | \n",
" Album | \n",
"
\n",
" \n",
" 10 | \n",
" 0 | \n",
" 2008-11-26 01:45:08 | \n",
" 2008-02-06 | \n",
" NaN | \n",
" 4 | \n",
" 6 | \n",
" NaN | \n",
" 47632 | \n",
" NaN | \n",
" [] | \n",
" Constant Hitmaker | \n",
" 2 | \n",
" Album | \n",
"
\n",
" \n",
" 20 | \n",
" 0 | \n",
" 2008-11-26 01:45:05 | \n",
" 2009-01-06 | \n",
" NaN | \n",
" 2 | \n",
" 4 | \n",
" <p> \"spiritual songs\" from Nicky Cook</p> | \n",
" 2710 | \n",
" NaN | \n",
" [] | \n",
" Niris | \n",
" 13 | \n",
" Album | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" comments date_created date_released engineer favorites id \\\n",
"track_id \n",
"2 0 2008-11-26 01:44:45 2009-01-05 NaN 4 1 \n",
"3 0 2008-11-26 01:44:45 2009-01-05 NaN 4 1 \n",
"5 0 2008-11-26 01:44:45 2009-01-05 NaN 4 1 \n",
"10 0 2008-11-26 01:45:08 2008-02-06 NaN 4 6 \n",
"20 0 2008-11-26 01:45:05 2009-01-06 NaN 2 4 \n",
"\n",
" information listens producer tags \\\n",
"track_id \n",
"2 6073 NaN [] \n",
"3 6073 NaN [] \n",
"5 6073 NaN [] \n",
"10 NaN 47632 NaN [] \n",
"20 \"spiritual songs\" from Nicky Cook
2710 NaN [] \n",
"\n",
" title tracks type \n",
"track_id \n",
"2 AWOL - A Way Of Life 7 Album \n",
"3 AWOL - A Way Of Life 7 Album \n",
"5 AWOL - A Way Of Life 7 Album \n",
"10 Constant Hitmaker 2 Album \n",
"20 Niris 13 Album "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" active_year_begin | \n",
" active_year_end | \n",
" associated_labels | \n",
" bio | \n",
" comments | \n",
" date_created | \n",
" favorites | \n",
" id | \n",
" latitude | \n",
" location | \n",
" longitude | \n",
" members | \n",
" name | \n",
" related_projects | \n",
" tags | \n",
" website | \n",
" wikipedia_page | \n",
"
\n",
" \n",
" track_id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 2 | \n",
" 2006-01-01 | \n",
" NaT | \n",
" NaN | \n",
" <p>A Way Of Life, A Collective of Hip-Hop from... | \n",
" 0 | \n",
" 2008-11-26 01:42:32 | \n",
" 9 | \n",
" 1 | \n",
" 40.058324 | \n",
" New Jersey | \n",
" -74.405661 | \n",
" Sajje Morocco,Brownbum,ZawidaGod,Custodian of ... | \n",
" AWOL | \n",
" The list of past projects is 2 long but every1... | \n",
" [awol] | \n",
" http://www.AzillionRecords.blogspot.com | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 2006-01-01 | \n",
" NaT | \n",
" NaN | \n",
" <p>A Way Of Life, A Collective of Hip-Hop from... | \n",
" 0 | \n",
" 2008-11-26 01:42:32 | \n",
" 9 | \n",
" 1 | \n",
" 40.058324 | \n",
" New Jersey | \n",
" -74.405661 | \n",
" Sajje Morocco,Brownbum,ZawidaGod,Custodian of ... | \n",
" AWOL | \n",
" The list of past projects is 2 long but every1... | \n",
" [awol] | \n",
" http://www.AzillionRecords.blogspot.com | \n",
" NaN | \n",
"
\n",
" \n",
" 5 | \n",
" 2006-01-01 | \n",
" NaT | \n",
" NaN | \n",
" <p>A Way Of Life, A Collective of Hip-Hop from... | \n",
" 0 | \n",
" 2008-11-26 01:42:32 | \n",
" 9 | \n",
" 1 | \n",
" 40.058324 | \n",
" New Jersey | \n",
" -74.405661 | \n",
" Sajje Morocco,Brownbum,ZawidaGod,Custodian of ... | \n",
" AWOL | \n",
" The list of past projects is 2 long but every1... | \n",
" [awol] | \n",
" http://www.AzillionRecords.blogspot.com | \n",
" NaN | \n",
"
\n",
" \n",
" 10 | \n",
" NaT | \n",
" NaT | \n",
" Mexican Summer, Richie Records, Woodsist, Skul... | \n",
" <p><span style=\"font-family:Verdana, Geneva, A... | \n",
" 3 | \n",
" 2008-11-26 01:42:55 | \n",
" 74 | \n",
" 6 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" Kurt Vile, the Violators | \n",
" Kurt Vile | \n",
" NaN | \n",
" [philly, kurt vile] | \n",
" http://kurtvile.com | \n",
" NaN | \n",
"
\n",
" \n",
" 20 | \n",
" 1990-01-01 | \n",
" 2011-01-01 | \n",
" NaN | \n",
" <p>Songs written by: Nicky Cook</p>\\n<p>VOCALS... | \n",
" 2 | \n",
" 2008-11-26 01:42:52 | \n",
" 10 | \n",
" 4 | \n",
" 51.895927 | \n",
" Colchester England | \n",
" 0.891874 | \n",
" Nicky Cook\\n | \n",
" Nicky Cook | \n",
" NaN | \n",
" [instrumentals, experimental pop, post punk, e... | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" active_year_begin active_year_end \\\n",
"track_id \n",
"2 2006-01-01 NaT \n",
"3 2006-01-01 NaT \n",
"5 2006-01-01 NaT \n",
"10 NaT NaT \n",
"20 1990-01-01 2011-01-01 \n",
"\n",
" associated_labels \\\n",
"track_id \n",
"2 NaN \n",
"3 NaN \n",
"5 NaN \n",
"10 Mexican Summer, Richie Records, Woodsist, Skul... \n",
"20 NaN \n",
"\n",
" bio comments \\\n",
"track_id \n",
"2 A Way Of Life, A Collective of Hip-Hop from... 0 \n",
"3
A Way Of Life, A Collective of Hip-Hop from... 0 \n",
"5
A Way Of Life, A Collective of Hip-Hop from... 0 \n",
"10
Songs written by: Nicky Cook
\\nVOCALS... 2 \n",
"\n",
" date_created favorites id latitude location \\\n",
"track_id \n",
"2 2008-11-26 01:42:32 9 1 40.058324 New Jersey \n",
"3 2008-11-26 01:42:32 9 1 40.058324 New Jersey \n",
"5 2008-11-26 01:42:32 9 1 40.058324 New Jersey \n",
"10 2008-11-26 01:42:55 74 6 NaN NaN \n",
"20 2008-11-26 01:42:52 10 4 51.895927 Colchester England \n",
"\n",
" longitude members \\\n",
"track_id \n",
"2 -74.405661 Sajje Morocco,Brownbum,ZawidaGod,Custodian of ... \n",
"3 -74.405661 Sajje Morocco,Brownbum,ZawidaGod,Custodian of ... \n",
"5 -74.405661 Sajje Morocco,Brownbum,ZawidaGod,Custodian of ... \n",
"10 NaN Kurt Vile, the Violators \n",
"20 0.891874 Nicky Cook\\n \n",
"\n",
" name related_projects \\\n",
"track_id \n",
"2 AWOL The list of past projects is 2 long but every1... \n",
"3 AWOL The list of past projects is 2 long but every1... \n",
"5 AWOL The list of past projects is 2 long but every1... \n",
"10 Kurt Vile NaN \n",
"20 Nicky Cook NaN \n",
"\n",
" tags \\\n",
"track_id \n",
"2 [awol] \n",
"3 [awol] \n",
"5 [awol] \n",
"10 [philly, kurt vile] \n",
"20 [instrumentals, experimental pop, post punk, e... \n",
"\n",
" website wikipedia_page \n",
"track_id \n",
"2 http://www.AzillionRecords.blogspot.com NaN \n",
"3 http://www.AzillionRecords.blogspot.com NaN \n",
"5 http://www.AzillionRecords.blogspot.com NaN \n",
"10 http://kurtvile.com NaN \n",
"20 NaN NaN "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"N = 5\n",
"ipd.display(tracks['track'].head(N))\n",
"ipd.display(tracks['album'].head(N))\n",
"ipd.display(tracks['artist'].head(N))"
]
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}