{ "metadata": { "name": "", "signature": "sha256:20b5d8c7ea091ab1110d5f5d332067d76f52456f28201d8ced2c900397a51db1" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Analysis of Billboard charts to determine most characteristic song title words per decade #\n", "\n", "By David Taylor, http://www.prooffreader.com\n", "\n", "*Note that all timings listed here are on my cheapo Windows laptop, so chances are you'll do at least as well.*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Import and preliminaries #" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%%time\n", "\n", "import pandas as pd\n", "import re\n", "import os\n", "import urllib\n", "import rarfile # not part of standard distro\n", "import glob\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from difflib import SequenceMatcher\n", "from collections import Counter\n", "%matplotlib inline" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Wall time: 19.7 s\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "%%time\n", "# download and extract charts.rar\n", "# note that this archive is periodically updated\n", "\n", "# you can skip this cell if you manually download http://bullfrogspond.com/charts.rar\n", "# and put it in the same directory as this notebook\n", "\n", "# use this command if unrar.exe is not in your PATH, changing to your path:\n", "rarfile.UNRAR_TOOL = r\"C:\\Program Files\\WinRAR\\UnRAR.exe\"\n", "\n", "urllib.urlretrieve('http://bullfrogspond.com/charts.rar', 'charts.rar')\n", "with rarfile.RarFile('charts.rar') as rf:\n", " for member in rf.infolist():\n", " rf.extract(member)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Wall time: 5.84 s\n" ] } ], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Create and process dataframe #" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%%time\n", "# create dataframe from .xls file and manipulate it\n", "\n", "# use most recent .xls file in case more than one is in directory, i.e.\n", "# you've downloaded and extracted charts.rar on different dates, after\n", "# it's been updated\n", "\n", "globlist = glob.glob('*.xls')\n", "globlist.sort()\n", "filename = globlist[-1]\n", "\n", "# read excel file into pandas dataframe. it's a huge file, but only four columns are required.\n", "df_tracks = pd.read_excel(filename, sheetname=\"\\\"Pop Annual\\\"\", parse_cols='A,B,K,Q')\n", "print \"ORIGINAL DATAFRAME:\"\n", "print df_tracks.head()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "ORIGINAL DATAFRAME:\n", " Year Yearly Rank Artist Track\n", "0 2014 162 Scotty McCreery See You Tonight\n", "1 2014 199 B.o.B HeadBand\n", "2 2014 226 OneRepublic Counting Stars\n", "3 2014 285 Passenger Let Her Go\n", "4 2014 296 Bastille Pompeii\n", "Wall time: 16.9 s\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "%%time\n", "# The Yearly Rank column has some alphabetic data, e.g. 
95a, 95b\n", "# This is sometimes multiple releases from the same artist, which we\n", "# wish to keep, and sometimes Parts 1 and 2 of the same track,\n", "# which we don't.\n", "# Some Yearly Ranks are n/a, which we will change to 999 to avoid NaNs\n", "# (No year has over 998 entries)\n", "# BTW, we use 'ranked' instead of 'rank' as column name because\n", "# the latter is in the pandas namespace\n", "\n", "# Add a column ranked as float, with 0.1 added for a, 0.2 added for b, etc.\n", "# while we're at it, change all column names to lower case with underscores\n", "\n", "df_tracks.columns = [['year', 'yearly_rank', 'artist', 'track']]\n", "df_tracks['ranked'] = 0.0\n", "def calc_rankfloat(row):\n", " rank = row.yearly_rank\n", " if type(rank) != int:\n", " try:\n", " try:\n", " suffix = re.search('([^0-9])', rank).group(1) #extract alphabetic\n", " assert len(suffix) == 1 #just in case\n", " rank = float(rank[:-1])\n", " rank += (ord(suffix) - 96) * 0.1\n", " except AttributeError:\n", " # sometimes Yearly Rank looks like an int, but doesn't pass the\n", " # type test.\n", " rank = float(rank.strip())\n", " except ValueError:\n", " rank = 999 # for n/as\n", " return float(rank)\n", "df_tracks['ranked'] = df_tracks.apply(calc_rankfloat, axis=1)\n", "\n", "# calculate difference in consecutive ranks so we can evaluate cases\n", "# where difference < 1, i.e. 82a, 82b which became 82.1, 82.2, etc.\n", "df_tracks.sort(['year', 'ranked'], ascending=True, inplace=True)\n", "df_tracks.reset_index(inplace = True, drop=True)\n", "df_tracks['diff_rank'] = 0.0\n", "for i in range(len(df_tracks)):\n", " if i == 0:\n", " df_tracks.diff_rank.iloc[i] = 1\n", " elif df_tracks.year.iloc[i] != df_tracks.year.iloc[i-1]:\n", " df_tracks.diff_rank.iloc[i] = 1\n", " else:\n", " df_tracks.diff_rank.iloc[i] = df_tracks.ranked.iloc[i] - df_tracks.ranked.iloc[i-1]\n", "\n", "# go through dataframe and find consecutive entries where the difference in rank\n", "# is less than one. 
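\n", "# For example, calc_rankfloat turns '95a' into 95.1 and '95b' into 95.2, so\n", "# consecutive releases that shared a numbered rank now differ by only 0.1, and\n", "# SequenceMatcher(None, 'Peppermint Twist - Part 1', 'Peppermint Twist - Part 2').ratio()\n", "# returns 0.96, comfortably above the 0.5 similarity threshold used below.\n", "# 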
Perform actions according to the following scenarios\n", "# 1: Artist same, track names similar tracks contain 'Part 1' and 'Part 2'\n", "# Keep first entry, without 'Part 1'\n", "# 2: Artist same, track names similar\n", "# Keep first entry\n", "# Note that 'similar' means SequenceMatcher's result is > 0.5\n", "# Note that entries are tagged for deletion by changing the year to 0.\n", "# At the end, all rows with year == 0 are deleted\n", "\n", "for i in range(len(df_tracks)):\n", " if df_tracks.diff_rank.iloc[i] < 0.5 and df_tracks.ranked.iloc[i] != 0:\n", " diff_rank = df_tracks.diff_rank.iloc[i]\n", " year = df_tracks.year.iloc[i]\n", " artist_prev = df_tracks.artist.iloc[i-1]\n", " artist = df_tracks.artist.iloc[i]\n", " ranked_prev = df_tracks.ranked.iloc[i-1]\n", " ranked = df_tracks.ranked.iloc[i]\n", " track_prev = df_tracks.track.iloc[i-1]\n", " track = df_tracks.track.iloc[i]\n", " seq_match = SequenceMatcher(None, track_prev, track).ratio()\n", " #scenario 1\n", " if (re.search('[Pp]art 1', track_prev) and\n", " re.search('[Pp]art 2', track) and\n", " seq_match > 0.5):\n", " df_tracks.track.iloc[i-1] = re.sub('[Pp]art 1', '', track_prev)\n", " df_tracks.year.iloc[i] = 0\n", " elif seq_match > 0.5:\n", " df_tracks.year.iloc[i] = 0\n", "\n", "df_tracks = df_tracks[df_tracks.year != 0] # remove those flagged for removal\n", "\n", "# remove duplicate song titles in one year -- before the 1960s, it was \n", "# very common for multiple artists to appear in the Billboard chart with\n", "# the same song at about the same time; this skews the results towards\n", "# these songs. After removal, the highest-ranking version will be kept.\n", "print \"Before duplicates removed:\"\n", "print df_tracks[(df_tracks.track == 'Mona Lisa') & (df_tracks.year == 1950)]\n", "print \"\"\n", "df_tracks.drop_duplicates(['track', 'year'], inplace=True)\n", "print \"After duplicates removed:\"\n", "print df_tracks[(df_tracks.track == 'Mona Lisa') & (df_tracks.year == 1950)]\n", "df_tracks.to_pickle('df_tracks_v1.pickle')\n", "\n", "df_tracks.to_pickle('df_tracks_v1.pickle')\n", "\n", "df_tracks.head()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Before duplicates removed:\n", " year yearly_rank artist \\\n", "10062 1950 6 Nat \"King\" Cole (Les Baxter & His Orchestra) \n", "10125 1950 69 Victor Young & His Orchestra (Vocal Don Cherry) \n", "10202 1950 146 Harry James & His Orchestra \n", "10203 1950 147 Art Lund \n", "10222 1950 166 Charlie Spivak & His Orchestra \n", "10228 1950 172 Ralph Flanagan & His Orchestra \n", "10373 1950 318 Dennis Day \n", "\n", " track ranked diff_rank \n", "10062 Mona Lisa 6 1 \n", "10125 Mona Lisa 69 1 \n", "10202 Mona Lisa 146 1 \n", "10203 Mona Lisa 147 1 \n", "10222 Mona Lisa 166 1 \n", "10228 Mona Lisa 172 1 \n", "10373 Mona Lisa 318 1 \n", "\n", "After duplicates removed:" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", " year yearly_rank artist \\\n", "10062 1950 6 Nat \"King\" Cole (Les Baxter & His Orchestra) \n", "\n", " track ranked diff_rank \n", "10062 Mona Lisa 6 1 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Wall time: 33.3 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 4 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. 
Parse words in song titles #\n", "\n", "Starting from df_tracks, we will create:\n", "\n", "* df_year, a dataframe of word frequencies for each year\n", "* df_decade, a dataframe of word frequencies for each decade\n", "* a new column, 'decade', in df_tracks, derived from 'year'\n", "* a new column, 'wordlist', in df_tracks, containing the parsed, lowercase words of each song title, without punctuation\n", "\n", "Note that the following changes to song titles are performed:\n", "\n", "* \"Part #\", where # is a digit, is removed. We removed the ones involving duplicated yearly rank in the previous cell, but there are plenty of others.\n", "* Characters other than letters, digits, spaces and apostrophes are removed\n", "* Duplicate words in the same title are reduced to one (otherwise titles like \"Tzena, Tzena, Tzena\" and \"Na na na na\" would carry undue weight)\n", "\n", "Finally, a very few records with NaN values are removed (fewer than 1 per 10,000 song titles)." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Make some lists, dicts and functions we will use\n", "\n", "# in case you start here\n", "df_tracks = pd.read_pickle('df_tracks_v1.pickle')\n", "\n", "# derive the decade from the year (e.g. 2014 -> 2010); used throughout this section\n", "df_tracks['decade'] = df_tracks.year // 10 * 10\n", "\n", "# lists of years and decades in df_tracks\n", "decades = list(df_tracks.decade.unique())\n", "decades.sort()\n", "years = list(df_tracks.year.unique())\n", "years.sort()\n", "\n", "# dict comprehensions to create dicts of\n", "# lists of words keyed by decade or by year\n", "# (the lists start out empty)\n", "decades_words = {decade: [] for decade in decades}\n", "years_words = {year: [] for year in years}\n", "\n", "# Define our log-likelihood function\n", "\n", "def loglike(n1, t1, n2, t2):\n", " \"\"\"Calculates Dunning log likelihood of an observation of \n", " frequency n1 in a corpus of size t1, compared to a frequency n2 \n", " in a corpus of size t2. 
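\n", " In this notebook, corpus 1 is a single decade and corpus 2 is all decades\n", " combined; e.g. a word counted 30 times among 1,000 decade words but only\n", " 60 times among 100,000 overall words scores roughly +164.\n", " 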
If result is positive, it is more \n", " likely to occur in corpus 1, otherwise in corpus 2.\"\"\"\n", " from numpy import log\n", " e1 = t1*1.0*(n1+n2)/(t1+t2) # expected values\n", " e2 = t2*1.0*(n1+n2)/(t1+t2)\n", " LL = 2 * ((n1 * log(n1/e1)) + n2 * (log(n2/e2)))\n", " if n2*1.0/t2 > n1*1.0/t1:\n", " LL = -LL\n", " return LL\n", "\n", "len_before = len(df_tracks)\n", "df_tracks = df_tracks.dropna()\n", "print \"{} NaN-containing tracks dropped; {} remain\".format(len_before - len(df_tracks), len(df_tracks))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "3 NaN-containing tracks dropped; 36283 remain\n" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "%%time\n", "# make lists of words per song, per year and per decade\n", "\n", "df_tracks['wordlist'] = ''\n", "for idx, row in df_tracks.iterrows():\n", " track = unicode(row.track)\n", " track = re.sub('[^A-Za-z0-9 \\']', '', track) # remove punctuation\n", " track = re.sub('[Pp]art [0-9]', '', track)\n", " track = track.lower()\n", " words = list(set(track.split())) #removes duplicates in one song title\n", " for word in words:\n", " decades_words[row.decade].append(word)\n", " years_words[row.year].append(word)\n", " df_tracks.wordlist[idx] = ' '.join(words)\n", " \n", "# create dict of total word counts per decade and per word\n", "decades_count = {decade: len(decades_words[decade]) for decade in decades}\n", "decades_count_max = max(decades_count.values())\n", "years_count = {year: len(years_words[year]) for year in years}" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Wall time: 14.3 s\n" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "%%time\n", "# create df_year and df_decade dataframes\n", "# 'counted' is raw count (called 'counted' to avoid namespace\n", "# conflict with 'count' method)\n", "\n", "dfy_words = []\n", "dfy_years = []\n", "dfy_counts = []\n", "\n", "for year in years:\n", " for word in set(years_words[year]):\n", " dfy_years.append(year)\n", " dfy_words.append(word)\n", " dfy_counts.append(years_words[year].count(word))\n", "df_year = pd.DataFrame({'word':dfy_words, 'year':dfy_years, 'counted':dfy_counts})\n", "\n", "def calc_yr_pct(row):\n", " return row.counted * 100.0 / years_count[row.year]\n", "\n", "df_year['pct'] = df_year.apply(calc_yr_pct, axis=1)\n", "\n", "dfd_words = []\n", "dfd_decades = []\n", "dfd_counts = []\n", "\n", "for decade in decades:\n", " for word in set(decades_words[decade]):\n", " dfd_decades.append(decade)\n", " dfd_words.append(word)\n", " dfd_counts.append(decades_words[decade].count(word))\n", "df_decade = pd.DataFrame({'word':dfd_words, 'decade':dfd_decades, 'counted':dfd_counts})\n", "\n", "def calc_dec_pct(row):\n", " return row.counted * 100.0 / decades_count[row.decade]\n", "\n", "df_decade['pct'] = df_decade.apply(calc_dec_pct, axis=1)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Wall time: 27.3 s\n" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "%%time\n", "# add calculated log-likelihood column\n", "\n", "decades_pct = {decade: df_decade[df_decade.decade == decade].pct.sum() for decade in decades}\n", "\n", "# create dict of total counts and total pct per word\n", "word_counts = {}\n", "for word in df_decade.word.unique():\n", " word_counts[word] = df_decade[df_decade.word == 
word].counted.sum()\n", "word_counts_total = sum(decades_count.values())\n", "assert word_counts_total == df_decade.counted.sum()\n", "\n", "word_pcts = {}\n", "for word in df_decade.word.unique():\n", " word_pcts[word] = df_decade[df_decade.word == word].pct.sum()\n", "word_pcts_total = df_decade.pct.sum()\n", "\n", "\n", "def calc_ll(row):\n", " return loglike(row.counted,\n", " decades_count[row.decade],\n", " word_counts[row.word],\n", " word_counts_total)\n", "\n", "df_decade['loglike'] = df_decade.apply(calc_ll, axis=1)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Wall time: 5min 55s\n" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "#pickle all dataframes\n", "df_tracks.to_pickle('df_tracks_v2.pickle')\n", "df_decade.to_pickle('df_decade.pickle')\n", "df_year.to_pickle('df_year.pickle')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Explore data and extract top keywords #" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# read from pickle in case you start here:\n", "df_tracks = pd.read_pickle('df_tracks_v2.pickle')\n", "df_tracks = df_tracks[['year', 'decade', 'artist', 'track', 'ranked', 'wordlist']]\n", "df_decade = pd.read_pickle('df_decade.pickle')\n", "df_year = pd.read_pickle('df_year.pickle')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "df_tracks.tail()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yeardecadeartisttrackrankedwordlist
39667 2014 2010 Frankie Ballard Helluva Life 438 life helluva
39668 2014 2010 Kelly Clarkson Underneath The Tree 439 underneath the tree
39669 2014 2010 Lupe Fiasco Old School Love 440 love school old
39670 2014 2010 Arctic Monkeys Do I Wanna Know? 442 i do wanna know
39671 2014 2010 Aloe Blacc The Man 447 the man
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 16, "text": [ " year decade artist track ranked \\\n", "39667 2014 2010 Frankie Ballard Helluva Life 438 \n", "39668 2014 2010 Kelly Clarkson Underneath The Tree 439 \n", "39669 2014 2010 Lupe Fiasco Old School Love 440 \n", "39670 2014 2010 Arctic Monkeys Do I Wanna Know? 442 \n", "39671 2014 2010 Aloe Blacc The Man 447 \n", "\n", " wordlist \n", "39667 life helluva \n", "39668 underneath the tree \n", "39669 love school old \n", "39670 i do wanna know \n", "39671 the man " ] } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "df_decade.tail()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
counteddecadewordpctloglike
27712 3 2010 friends 0.052595 0.417387
27713 1 2010 goin 0.017532 3.633835
27714 1 2010 bch 0.017532 3.633835
27715 1 2010 runaways 0.017532 3.633835
27716 4 2010 we're 0.070126 0.850789
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 15, "text": [ " counted decade word pct loglike\n", "27712 3 2010 friends 0.052595 0.417387\n", "27713 1 2010 goin 0.017532 3.633835\n", "27714 1 2010 bch 0.017532 3.633835\n", "27715 1 2010 runaways 0.017532 3.633835\n", "27716 4 2010 we're 0.070126 0.850789" ] } ], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "df_year.tail()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countedwordyearpct
59021 1 u 2014 0.114416
59022 1 time 2014 0.114416
59023 1 walker 2014 0.114416
59024 1 she's 2014 0.114416
59025 1 hello 2014 0.114416
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 14, "text": [ " counted word year pct\n", "59021 1 u 2014 0.114416\n", "59022 1 time 2014 0.114416\n", "59023 1 walker 2014 0.114416\n", "59024 1 she's 2014 0.114416\n", "59025 1 hello 2014 0.114416" ] } ], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "df_decade.sort('loglike', ascending=False, inplace=True)\n", "\n", "#determine how many rows are needed until each decade is represented\n", "#at least once\n", "from collections import Counter\n", "c = Counter()\n", "decades = list(df_decade.decade.unique())\n", "remaining_decades = list(df_decade.decade.unique())\n", "decadespop = decades\n", "num_rows = 0\n", "while len(remaining_decades) > 0:\n", " decade = df_decade.decade.iloc[num_rows]\n", " c[decade] += 1\n", " if decade in remaining_decades:\n", " remaining_decades.remove(decade)\n", " num_rows += 1\n", "print '{} rows required for each decade to be represented.'.format(num_rows)\n", "print c" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "63 rows required for each decade to be represented.\n", "Counter({1910: 15, 1900: 12, 1930: 6, 1920: 5, 1890: 5, 2000: 4, 1970: 4, 1940: 4, 1990: 2, 1960: 2, 1980: 2, 2010: 1, 1950: 1})\n" ] } ], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "# with this approach, there would be 32 of 64 before 1930.\n", "# instead, let's use the top five for each decade." ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "import csv\n", "with open('billboard_output.csv', 'wb+') as csvfile:\n", " csvwriter = csv.writer(csvfile, delimiter='\\t',\n", " quotechar='\\\"', quoting=csv.QUOTE_MINIMAL)\n", "\n", " decades = range(1890, 2020, 10)\n", "\n", " for decade in decades:\n", " dftemp = df_decade[df_decade.decade == decade].sort('loglike', ascending=False)\n", " for i in range(5):\n", " output = []\n", " word = dftemp.word.iloc[i]\n", " keyness = int(dftemp.loglike.iloc[i])\n", " regex = '(^{0} |^{0}$| {0}$| {0} )'.format(word)\n", " dftemp2 = df_tracks[(df_tracks.decade == decade) &\n", " (df_tracks.wordlist.str.contains(regex))]\n", " dftemp2.sort(['ranked', 'year'], ascending=True, inplace=True)\n", " artist = dftemp2.artist.iloc[0]\n", " track = dftemp2.track.iloc[0]\n", " year = dftemp2.year.iloc[0]\n", " print decade, word, keyness, artist, track, year\n", " output.append(decade)\n", " output.append(word)\n", " output.append(keyness)\n", " output.append(artist)\n", " output.append(track)\n", " output.append(year)\n", " for year in range(1890,2015):\n", " dftemp3 = df_year[(df_year.word == word) & (df_year.year == year)]\n", " if len(dftemp3) > 0:\n", " output.append(dftemp3.pct.iloc[0])\n", " else:\n", " output.append(0)\n", " csvwriter.writerow(output)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1890 uncle 59 Cal Stewart Uncle Josh's Arrival in New York 1898\n", "1890" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " casey 54 Russell Hunting Michael Casey Taking the Census 1892\n", "1890" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " josh 53 Cal Stewart Uncle Josh at the Opera 1898\n", "1890" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " old 26 Dan Quinn A Hot Time in the Old Town 1896\n", "1890" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " michael 24 Russell Hunting 
Michael Casey Taking the Census 1892\n", "1900" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " uncle 58 Cal Stewart Uncle Josh's Huskin' Bee Dance 1901\n", "1900" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " old 58 Haydn Quartet In the Good Old Summer Time 1903\n", "1900" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " josh 44 Cal Stewart Uncle Josh On an Automobile 1903\n", "1900" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " reuben 38 S. H. Dudley When Reuben Comes to Town 1901\n", "1900" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " when 33 George J. Gaskin When You Were Sweet Sixteen 1900\n", "1910" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " gems 70 Victor Light Opera Co. Gems from \"Naughty Marietta\" 1912\n", "1910" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " rag 52 Original Dixieland Jazz Band Tiger Rag 1918\n", "1910" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " home 43 Henry Burr When You're a Long, Long Way from Home 1914\n", "1910" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " land 41 Al Jolson Hello Central, Give Me No Man's Land 1918\n", "1910" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " old 38 Harry Macdonough Down by the Old Mill Stream 1912\n", "1920" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " blues 153 Paul Whiteman & His Orchestra Wang Wang Blues 1921\n", "1920" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " pal 42 Al Jolson Little Pal 1929\n", "1920" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " sweetheart 27 Isham Jones & His Orchestra Nobody's Sweetheart 1924\n", "1920" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " rose 25 Ted Lewis & His Band Second Hand Rose 1921\n", "1920" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " mammy 23 Paul Whiteman & His Orchestra My Mammy 1921\n", "1930" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " moon 79 Glenn Miller & His Orchestra Moon Love 1939\n", "1930" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " in 38 Ted Lewis & His Band In A Shanty In Old Shanty Town 1932\n", "1930" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " swing 34 Ray Noble & His Orchestra Let's Swing It 1935\n", "1930" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " sing 34 Benny Goodman & His Orchestra (Vocal Martha Tilton) And the Angels Sing 1939\n", "1930" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " a 30 Ted Lewis & His Band In A Shanty In Old Shanty Town 1932\n", "1940" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " polka 50 Kay Kyser & His Orchestra Strip Polka 1942\n", "1940" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " serenade 35 Andrews Sisters Ferry Boat Serenade 1940\n", "1940" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " boogie 28 Will Bradley & His Orchestra Scrub Me, Mama, With a Boogie Beat 1941\n", "1940" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " blue 26 Tommy Dorsey & His Orchestra (Vocal Frank Sinatra) In The Blue Of Evening 1943\n", "1940" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " christmas 22 Bing Crosby White Christmas 1942\n", "1950" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " christmas 31 Art Mooney & His Orchestra (I'm Getting) Nuttin' For Christmas 1955\n", "1950" ] 
}, { "output_type": "stream", "stream": "stdout", "text": [ " penny 18 Dinah Shore & Tony Martin A Penny A Kiss 1951\n", "1950" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " mambo 15 Perry Como Papa Loves Mambo 1954\n", "1950" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " rednosed 15 Gene Autry Rudolph, the Red-Nosed Reindeer 1950\n", "1950" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " three 15 Browns, The The Three Bells 1959\n", "1960" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " baby 51 Supremes, The Baby Love 1964\n", "1960" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " twist 24 Joey Dee & the Starliters Peppermint Twist - Part 1 1962\n", "1960" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " little 16 Steve Lawrence Go Away Little Girl 1963\n", "1960" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " twistin' 15 Chubby Checker Slow Twistin' 1962\n", "1960" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " lonely 14 Bobby Vinton Mr. Lonely 1964\n", "1970" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " woman 33 Guess Who, The American Woman 1970\n", "1970" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " disco 31 Johnnie Taylor Disco Lady 1976\n", "1970" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " rock 24 Elton John Crocodile Rock 1973\n", "1970" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " music 24 Wild Cherry Play That Funky Music 1976\n", "1970" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " dancin' 20 Leif Garrett I Was Made For Dancin' 1979\n", "1980" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " love 48 Joan Jett & The Blackhearts I Love Rock 'N Roll 1982\n", "1980" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " fire 24 Billy Joel We Didn't Start The Fire 1989\n", "1980" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " don't 20 Human League, The Don't You Want Me 1982\n", "1980" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " rock 14 Joan Jett & The Blackhearts I Love Rock 'N Roll 1982\n", "1980" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " on 14 Bon Jovi Livin' On A Prayer 1987\n", "1990" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " u 49 Sinead O'Connor Nothing Compares 2 U 1990\n", "1990" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " you 28 Stevie B Because I Love You (The Postman Song) 1990\n", "1990" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " up 21 Brandy Sittin' Up In My Room 1996\n", "1990" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " get 20 En Vogue My Lovin' (You're Never Gonna Get It) 1992\n", "1990" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " thang 18 Dr. Dre Nuthin' But A \"G\" Thang 1993\n", "2000" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " u 71 Usher U Got It Bad 2001\n", "2000" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " like 28 T.I. Whatever You Like 2008\n", "2000" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " breathe 25 Faith Hill Breathe 2000\n", "2000" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " it 24 Usher U Got It Bad 2001\n", "2000" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " ya 19 OutKast Hey Ya! 
2003\n", "2010" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " we 22 Rihanna We Found Love 2011\n", "2010" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " yeah 18 Austin Mahone Mmm Yeah 2014\n", "2010" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " hell 18 Avril Lavigne What The Hell 2011\n", "2010" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " fk 15 Cee Lo Green F**K You (Forget You) 2011\n", "2010" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " die 14 Ke$ha Die Young 2012\n" ] } ], "prompt_number": 19 } ], "metadata": {} } ] }