{ "cells": [ { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Data Analysis Examples" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## 1.USA.gov Data from Bitly" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from numpy.random import randn\n", "import numpy as np\n", "np.random.seed(123)\n", "import os\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "plt.rc('figure', figsize=(10, 6))\n", "np.set_printoptions(precision=4)\n", "pd.options.display.max_rows = 20" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "In [5]: path = 'datasets/bitly_usagov/example.txt'\n", "\n", "In [6]: open(path).readline()\n", "Out[6]: '{ \"a\": \"Mozilla\\\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\\\/535.11\n", "(KHTML, like Gecko) Chrome\\\\/17.0.963.78 Safari\\\\/535.11\", \"c\": \"US\", \"nk\": 1,\n", "\"tz\": \"America\\\\/New_York\", \"gr\": \"MA\", \"g\": \"A6qOVH\", \"h\": \"wfLQtf\", \"l\":\n", "\"orofrog\", \"al\": \"en-US,en;q=0.8\", \"hh\": \"1.usa.gov\", \"r\":\n", "\"http:\\\\/\\\\/www.facebook.com\\\\/l\\\\/7AQEFzjSi\\\\/1.usa.gov\\\\/wfLQtf\", \"u\":\n", "\"http:\\\\/\\\\/www.ncbi.nlm.nih.gov\\\\/pubmed\\\\/22415991\", \"t\": 1331923247, \"hc\":\n", "1331822918, \"cy\": \"Danvers\", \"ll\": [ 42.576698, -70.954903 ] }\\n'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import json\n", "path = 'datasets/bitly_usagov/example.txt'\n", "records = [json.loads(line) for line in open(path)]" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "In [18]: records[0]\n", "Out[18]:\n", "{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko)\n", "Chrome/17.0.963.78 Safari/535.11',\n", " 'al': 'en-US,en;q=0.8',\n", " 'c': 'US',\n", " 'cy': 'Danvers',\n", " 'g': 'A6qOVH',\n", " 'gr': 'MA',\n", " 'h': 'wfLQtf',\n", " 'hc': 1331822918,\n", " 'hh': '1.usa.gov',\n", " 'l': 'orofrog',\n", " 'll': [42.576698, -70.954903],\n", " 'nk': 1,\n", " 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',\n", " 't': 1331923247,\n", " 'tz': 'America/New_York',\n", " 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Counting Time Zones in Pure Python" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "time_zones = [rec['tz'] for rec in records]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "time_zones = [rec['tz'] for rec in records if 'tz' in rec]\n", "time_zones[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "def get_counts(sequence):\n", " counts = {}\n", " for x in sequence:\n", " if x in counts:\n", " counts[x] += 1\n", " else:\n", " counts[x] = 1\n", " return counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from collections import defaultdict\n", "\n", "def get_counts2(sequence):\n", " counts = defaultdict(int) # values will initialize to 0\n", " for x in sequence:\n", " counts[x] += 1\n", " return counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "counts = get_counts(time_zones)\n", "counts['America/New_York']\n", "len(time_zones)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "def top_counts(count_dict, n=10):\n", " value_key_pairs = [(count, tz) for tz, count in count_dict.items()]\n", " value_key_pairs.sort()\n", " return value_key_pairs[-n:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "top_counts(counts)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from collections import Counter\n", "counts = Counter(time_zones)\n", "counts.most_common(10)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Counting Time Zones with pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import pandas as pd\n", "frame = pd.DataFrame(records)\n", "frame.info()\n", "frame['tz'][:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "tz_counts = frame['tz'].value_counts()\n", "tz_counts[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "clean_tz = frame['tz'].fillna('Missing')\n", "clean_tz[clean_tz == ''] = 'Unknown'\n", "tz_counts = clean_tz.value_counts()\n", "tz_counts[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "plt.figure(figsize=(10, 4))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import seaborn as sns\n", "subset = tz_counts[:10]\n", "sns.barplot(y=subset.index, x=subset.values)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "frame['a'][1]\n", "frame['a'][50]\n", "frame['a'][51][:50] # long line" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "results = pd.Series([x.split()[0] for x in frame.a.dropna()])\n", "results[:5]\n", "results.value_counts()[:8]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "cframe = frame[frame.a.notnull()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "cframe = cframe.copy()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "cframe['os'] = np.where(cframe['a'].str.contains('Windows'),\n", " 'Windows', 'Not Windows')\n", "cframe['os'][:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "by_tz_os = cframe.groupby(['tz', 'os'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "agg_counts = by_tz_os.size().unstack().fillna(0)\n", "agg_counts[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "# Use to sort in ascending order\n", "indexer = agg_counts.sum(1).argsort()\n", "indexer[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "count_subset = agg_counts.take(indexer[-10:])\n", "count_subset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "agg_counts.sum(1).nlargest(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "# Rearrange the data for plotting\n", "count_subset = count_subset.stack()\n", "count_subset.name = 'total'\n", "count_subset = count_subset.reset_index()\n", "count_subset[:10]\n", "sns.barplot(x='total', y='tz', hue='os', data=count_subset)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "def norm_total(group):\n", " group['normed_total'] = group.total / group.total.sum()\n", " return group\n", "\n", "results = count_subset.groupby('tz').apply(norm_total)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "sns.barplot(x='normed_total', y='tz', hue='os', data=results)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "g = count_subset.groupby('tz')\n", "results2 = count_subset.total / g.total.transform('sum')" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## MovieLens 1M Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# Make display smaller\n", "pd.options.display.max_rows = 10\n", "\n", "unames = ['user_id', 'gender', 'age', 'occupation', 'zip']\n", "users = pd.read_table('datasets/movielens/users.dat', sep='::',\n", " header=None, names=unames)\n", "\n", "rnames = ['user_id', 'movie_id', 'rating', 'timestamp']\n", "ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',\n", " header=None, names=rnames)\n", "\n", "mnames = ['movie_id', 'title', 'genres']\n", "movies = pd.read_table('datasets/movielens/movies.dat', sep='::',\n", " header=None, names=mnames)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "users[:5]\n", "ratings[:5]\n", "movies[:5]\n", "ratings" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "data = pd.merge(pd.merge(ratings, users), movies)\n", "data\n", "data.iloc[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "mean_ratings = data.pivot_table('rating', index='title',\n", " columns='gender', aggfunc='mean')\n", "mean_ratings[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "ratings_by_title = data.groupby('title').size()\n", "ratings_by_title[:10]\n", "active_titles = ratings_by_title.index[ratings_by_title >= 250]\n", "active_titles" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "# Select rows on the index\n", "mean_ratings = mean_ratings.loc[active_titles]\n", "mean_ratings" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "mean_ratings = mean_ratings.rename(index={'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)':\n", " 'Seven Samurai (Shichinin no samurai) (1954)'})" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)\n", "top_female_ratings[:10]" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Measuring Rating Disagreement" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "sorted_by_diff = mean_ratings.sort_values(by='diff')\n", "sorted_by_diff[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "# Reverse order of rows, take first 10 rows\n", "sorted_by_diff[::-1][:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "# Standard deviation of rating grouped by title\n", "rating_std_by_title = data.groupby('title')['rating'].std()\n", "# Filter down to active_titles\n", "rating_std_by_title = rating_std_by_title.loc[active_titles]\n", "# Order Series by value in descending order\n", "rating_std_by_title.sort_values(ascending=False)[:10]" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## US Baby Names 1880–2010" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "In [4]: names.head(10)\n", "Out[4]:\n", " name sex births year\n", "0 Mary F 7065 1880\n", "1 Anna F 2604 1880\n", "2 Emma F 2003 1880\n", "3 Elizabeth F 1939 1880\n", "4 Minnie F 1746 1880\n", "5 Margaret F 1578 1880\n", "6 Ida F 1472 1880\n", "7 Alice F 1414 1880\n", "8 Bertha F 1320 1880\n", "9 Sarah F 1288 1880" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "!head -n 10 datasets/babynames/yob1880.txt" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import pandas as pd\n", "names1880 = pd.read_csv('datasets/babynames/yob1880.txt',\n", " names=['name', 'sex', 'births'])\n", "names1880" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "names1880.groupby('sex').births.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "years = range(1880, 2011)\n", "\n", "pieces = []\n", "columns = ['name', 'sex', 'births']\n", "\n", "for year in years:\n", " path = 'datasets/babynames/yob%d.txt' % year\n", " frame = pd.read_csv(path, names=columns)\n", "\n", " frame['year'] = year\n", " pieces.append(frame)\n", "\n", "# Concatenate everything into a single DataFrame\n", "names = pd.concat(pieces, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "names" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "total_births = names.pivot_table('births', index='year',\n", " columns='sex', aggfunc=sum)\n", "total_births.tail()\n", "total_births.plot(title='Total births by sex and year')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "def add_prop(group):\n", " group['prop'] = group.births / group.births.sum()\n", " return group\n", "names = names.groupby(['year', 'sex']).apply(add_prop)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "names" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "names.groupby(['year', 'sex']).prop.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "def get_top1000(group):\n", " return group.sort_values(by='births', ascending=False)[:1000]\n", "grouped = names.groupby(['year', 'sex'])\n", "top1000 = grouped.apply(get_top1000)\n", "# Drop the group index, not needed\n", "top1000.reset_index(inplace=True, drop=True)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "pieces = []\n", "for year, group in names.groupby(['year', 'sex']):\n", " pieces.append(group.sort_values(by='births', ascending=False)[:1000])\n", "top1000 = pd.concat(pieces, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "top1000" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Analyzing Naming Trends" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "boys = top1000[top1000.sex == 'M']\n", "girls = top1000[top1000.sex == 'F']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "total_births = top1000.pivot_table('births', index='year',\n", " columns='name',\n", " aggfunc=sum)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "total_births.info()\n", "subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]\n", "subset.plot(subplots=True, figsize=(12, 10), grid=False,\n", " title=\"Number of births per year\")" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "#### Measuring the increase in naming diversity" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "table = top1000.pivot_table('prop', index='year',\n", " columns='sex', aggfunc=sum)\n", "table.plot(title='Sum of table1000.prop by year and sex',\n", " yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "df = boys[boys.year == 2010]\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()\n", "prop_cumsum[:10]\n", "prop_cumsum.values.searchsorted(0.5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "df = boys[boys.year == 1900]\n", "in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()\n", "in1900.values.searchsorted(0.5) + 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "def get_quantile_count(group, q=0.5):\n", " group = group.sort_values(by='prop', ascending=False)\n", " return group.prop.cumsum().values.searchsorted(q) + 1\n", "\n", "diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)\n", "diversity = diversity.unstack('sex')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fig = plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "diversity.head()\n", "diversity.plot(title=\"Number of popular names in top 50%\")" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "#### The “last letter” revolution" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "# extract last letter from name column\n", "get_last_letter = lambda x: x[-1]\n", "last_letters = names.name.map(get_last_letter)\n", "last_letters.name = 'last_letter'\n", "\n", "table = names.pivot_table('births', index=last_letters,\n", " columns=['sex', 'year'], aggfunc=sum)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "subtable = table.reindex(columns=[1910, 1960, 2010], level='year')\n", "subtable.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "subtable.sum()\n", "letter_prop = subtable / subtable.sum()\n", "letter_prop" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "fig, axes = plt.subplots(2, 1, figsize=(10, 8))\n", "letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')\n", "letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',\n", " legend=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "plt.subplots_adjust(hspace=0.25)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "letter_prop = table / table.sum()\n", "dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T\n", "dny_ts.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "plt.close('all')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fig = plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "dny_ts.plot()" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "#### Boy names that became girl names (and vice versa)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "all_names = pd.Series(top1000.name.unique())\n", "lesley_like = all_names[all_names.str.lower().str.contains('lesl')]\n", "lesley_like" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "filtered = top1000[top1000.name.isin(lesley_like)]\n", "filtered.groupby('name').births.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "table = filtered.pivot_table('births', index='year',\n", " columns='sex', aggfunc='sum')\n", "table = table.div(table.sum(1), axis=0)\n", "table.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fig = plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "table.plot(style={'M': 'k-', 'F': 'k--'})" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## USDA Food Database" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "{\n", " \"id\": 21441,\n", " \"description\": \"KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY,\n", "Wing, meat and skin with breading\",\n", " \"tags\": [\"KFC\"],\n", " \"manufacturer\": \"Kentucky Fried Chicken\",\n", " \"group\": \"Fast Foods\",\n", " \"portions\": [\n", " {\n", " \"amount\": 1,\n", " \"unit\": \"wing, with skin\",\n", " \"grams\": 68.0\n", " },\n", "\n", " ...\n", " ],\n", " \"nutrients\": [\n", " {\n", " \"value\": 20.8,\n", " \"units\": \"g\",\n", " \"description\": \"Protein\",\n", " \"group\": \"Composition\"\n", " },\n", "\n", " ...\n", " ]\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import json\n", "db = json.load(open('datasets/usda_food/database.json'))\n", "len(db)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "db[0].keys()\n", "db[0]['nutrients'][0]\n", "nutrients = pd.DataFrame(db[0]['nutrients'])\n", "nutrients[:7]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "info_keys = ['description', 'group', 'id', 'manufacturer']\n", "info = pd.DataFrame(db, columns=info_keys)\n", "info[:5]\n", "info.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "pd.value_counts(info.group)[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "nutrients = []\n", "\n", "for rec in db:\n", " fnuts = pd.DataFrame(rec['nutrients'])\n", " fnuts['id'] = rec['id']\n", " nutrients.append(fnuts)\n", "\n", "nutrients = pd.concat(nutrients, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "nutrients" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "nutrients.duplicated().sum() # number of duplicates\n", "nutrients = nutrients.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "col_mapping = {'description' : 'food',\n", " 'group' : 'fgroup'}\n", "info = info.rename(columns=col_mapping, copy=False)\n", "info.info()\n", "col_mapping = {'description' : 'nutrient',\n", " 'group' : 'nutgroup'}\n", "nutrients = nutrients.rename(columns=col_mapping, copy=False)\n", "nutrients" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "ndata = pd.merge(nutrients, info, on='id', how='outer')\n", "ndata.info()\n", "ndata.iloc[30000]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fig = plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)\n", "result['Zinc, Zn'].sort_values().plot(kind='barh')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])\n", "\n", "get_maximum = lambda x: x.loc[x.value.idxmax()]\n", "get_minimum = lambda x: x.loc[x.value.idxmin()]\n", "\n", "max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]\n", "\n", "# make the food a little smaller\n", "max_foods.food = max_foods.food.str[:50]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "max_foods.loc['Amino Acids']['food']" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## 2012 Federal Election Commission Database" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fec = pd.read_csv('datasets/fec/P00000001-ALL.csv')\n", "fec.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fec.iloc[123456]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "unique_cands = fec.cand_nm.unique()\n", "unique_cands\n", "unique_cands[2]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "parties = {'Bachmann, Michelle': 'Republican',\n", " 'Cain, Herman': 'Republican',\n", " 'Gingrich, Newt': 'Republican',\n", " 'Huntsman, Jon': 'Republican',\n", " 'Johnson, Gary Earl': 'Republican',\n", " 'McCotter, Thaddeus G': 'Republican',\n", " 'Obama, Barack': 'Democrat',\n", " 'Paul, Ron': 'Republican',\n", " 'Pawlenty, Timothy': 'Republican',\n", " 'Perry, Rick': 'Republican',\n", " \"Roemer, Charles E. 'Buddy' III\": 'Republican',\n", " 'Romney, Mitt': 'Republican',\n", " 'Santorum, Rick': 'Republican'}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fec.cand_nm[123456:123461]\n", "fec.cand_nm[123456:123461].map(parties)\n", "# Add it as a column\n", "fec['party'] = fec.cand_nm.map(parties)\n", "fec['party'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "(fec.contb_receipt_amt > 0).value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fec = fec[fec.contb_receipt_amt > 0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Donation Statistics by Occupation and Employer" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fec.contbr_occupation.value_counts()[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "occ_mapping = {\n", " 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',\n", " 'INFORMATION REQUESTED' : 'NOT PROVIDED',\n", " 'INFORMATION REQUESTED (BEST EFFORTS)' : 'NOT PROVIDED',\n", " 'C.E.O.': 'CEO'\n", "}\n", "\n", "# If no mapping provided, return x\n", "f = lambda x: occ_mapping.get(x, x)\n", "fec.contbr_occupation = fec.contbr_occupation.map(f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "emp_mapping = {\n", " 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',\n", " 'INFORMATION REQUESTED' : 'NOT PROVIDED',\n", " 'SELF' : 'SELF-EMPLOYED',\n", " 'SELF EMPLOYED' : 'SELF-EMPLOYED',\n", "}\n", "\n", "# If no mapping provided, return x\n", "f = lambda x: emp_mapping.get(x, x)\n", "fec.contbr_employer = fec.contbr_employer.map(f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "by_occupation = fec.pivot_table('contb_receipt_amt',\n", " index='contbr_occupation',\n", " columns='party', aggfunc='sum')\n", "over_2mm = by_occupation[by_occupation.sum(1) > 2000000]\n", "over_2mm" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "over_2mm.plot(kind='barh')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "def get_top_amounts(group, key, n=5):\n", " totals = group.groupby(key)['contb_receipt_amt'].sum()\n", " return totals.nlargest(n)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "grouped = fec_mrbo.groupby('cand_nm')\n", "grouped.apply(get_top_amounts, 'contbr_occupation', n=7)\n", "grouped.apply(get_top_amounts, 'contbr_employer', n=10)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Bucketing Donation Amounts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "bins = np.array([0, 1, 10, 100, 1000, 10000,\n", " 100000, 1000000, 10000000])\n", "labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)\n", "labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "grouped = fec_mrbo.groupby(['cand_nm', labels])\n", "grouped.size().unstack(0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)\n", "normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)\n", "normed_sums\n", "normed_sums[:-2].plot(kind='barh')" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Donation Statistics by State" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])\n", "totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)\n", "totals = totals[totals.sum(1) > 100000]\n", "totals[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "percent = totals.div(totals.sum(1), axis=0)\n", "percent[:10]" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Conclusion" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 0 }