{ "metadata": { "name": "", "signature": "sha256:1a808b3f21872ae58cfa925fbcf8434b5a7ca1eb432c83df1c536c48e6f5ca36" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "We are going to play with recreating this animation: http://www.prooffreader.com/2014/04/baby-names-rise-of-n.html\n", "\n", "The first step of this is to get the data, so let's pop over to David Taylor's [notebook](get_baby_names_data.ipynb) and download the files.\n", "\n", "Then let's load those dataframes." ] }, { "cell_type": "code", "collapsed": false, "input": [ "data_path = \"baby_names\" \n", "\n", "import os\n", "import pandas as pd\n", "\n", "os.chdir(data_path)\n", "yob = pd.read_pickle('yob.pickle')\n", "names = pd.read_pickle('names.pickle')\n", "years = pd.read_pickle('years.pickle')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "print yob" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " name sex births year pct ranked firstletter\n", "0 Mary F 7065 1880 7.764334 1 M\n", "1 Anna F 2604 1880 2.861759 2 A\n", "2 Emma F 2003 1880 2.201268 3 E\n", "3 Elizabeth F 1939 1880 2.130933 4 E\n", "4 Minnie F 1746 1880 1.918829 5 M\n", "5 Margaret F 1578 1880 1.734199 6 M\n", "6 Ida F 1472 1880 1.617707 7 I\n", "7 Alice F 1414 1880 1.553966 8 A\n", "8 Bertha F 1320 1880 1.450661 9 B\n", "9 Sarah F 1288 1880 1.415493 10 S\n", "10 Annie F 1258 1880 1.382524 11 A\n", "11 Clara F 1226 1880 1.347356 12 C\n", "12 Ella F 1156 1880 1.270427 13 E\n", "13 Florence F 1063 1880 1.168222 14 F\n", "14 Cora F 1045 1880 1.148440 15 C\n", "15 Martha F 1040 1880 1.142945 16 M\n", "16 Laura F 1012 1880 1.112173 17 L\n", "17 Nellie F 995 1880 1.093491 18 N\n", "18 Grace F 982 1880 1.079204 19 G\n", "19 Carrie F 949 1880 1.042937 20 C\n", "20 Maude F 858 1880 0.942930 21 M\n", "21 Mabel F 808 1880 
0.887980 22 M\n", "22 Bessie F 796 1880 0.874793 23 B\n", "23 Jennie F 793 1880 0.871496 24 J\n", "24 Gertrude F 787 1880 0.864902 25 G\n", "25 Julia F 783 1880 0.860506 26 J\n", "26 Hattie F 769 1880 0.845120 27 H\n", "27 Edith F 768 1880 0.844021 28 E\n", "28 Mattie F 704 1880 0.773686 29 M\n", "29 Rose F 700 1880 0.769290 30 R\n", "... ... .. ... ... ... ... ...\n", "1792061 Zayceon M 5 2013 0.000267 12995 Z\n", "1792062 Zayid M 5 2013 0.000267 12995 Z\n", "1792063 Zaylynn M 5 2013 0.000267 12995 Z\n", "1792064 Zecheriah M 5 2013 0.000267 12995 Z\n", "1792065 Zedric M 5 2013 0.000267 12995 Z\n", "1792066 Zefram M 5 2013 0.000267 12995 Z\n", "1792067 Zekhi M 5 2013 0.000267 12995 Z\n", "1792068 Zenith M 5 2013 0.000267 12995 Z\n", "1792069 Zennon M 5 2013 0.000267 12995 Z\n", "1792070 Zepplin M 5 2013 0.000267 12995 Z\n", "1792071 Zevon M 5 2013 0.000267 12995 Z\n", "1792072 Zhaiden M 5 2013 0.000267 12995 Z\n", "1792073 Zhen M 5 2013 0.000267 12995 Z\n", "1792074 Zhian M 5 2013 0.000267 12995 Z\n", "1792075 Zien M 5 2013 0.000267 12995 Z\n", "1792076 Zierre M 5 2013 0.000267 12995 Z\n", "1792077 Zimri M 5 2013 0.000267 12995 Z\n", "1792078 Ziquan M 5 2013 0.000267 12995 Z\n", "1792079 Ziyaad M 5 2013 0.000267 12995 Z\n", "1792080 Ziyang M 5 2013 0.000267 12995 Z\n", "1792081 Zmari M 5 2013 0.000267 12995 Z\n", "1792082 Zolan M 5 2013 0.000267 12995 Z\n", "1792083 Zurich M 5 2013 0.000267 12995 Z\n", "1792084 Zyeer M 5 2013 0.000267 12995 Z\n", "1792085 Zyere M 5 2013 0.000267 12995 Z\n", "1792086 Zyhier M 5 2013 0.000267 12995 Z\n", "1792087 Zylar M 5 2013 0.000267 12995 Z\n", "1792088 Zymari M 5 2013 0.000267 12995 Z\n", "1792089 Zymeer M 5 2013 0.000267 12995 Z\n", "1792090 Zyree M 5 2013 0.000267 12995 Z\n", "\n", "[1792091 rows x 7 columns]\n" ] } ], "prompt_number": 56 }, { "cell_type": "code", "collapsed": false, "input": [ "print yob[yob.year == 1880]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", 
"text": [ " name sex births year pct ranked firstletter\n", "0 Mary F 7065 1880 7.764334 1.0 M\n", "1 Anna F 2604 1880 2.861759 2.0 A\n", "2 Emma F 2003 1880 2.201268 3.0 E\n", "3 Elizabeth F 1939 1880 2.130933 4.0 E\n", "4 Minnie F 1746 1880 1.918829 5.0 M\n", "5 Margaret F 1578 1880 1.734199 6.0 M\n", "6 Ida F 1472 1880 1.617707 7.0 I\n", "7 Alice F 1414 1880 1.553966 8.0 A\n", "8 Bertha F 1320 1880 1.450661 9.0 B\n", "9 Sarah F 1288 1880 1.415493 10.0 S\n", "10 Annie F 1258 1880 1.382524 11.0 A\n", "11 Clara F 1226 1880 1.347356 12.0 C\n", "12 Ella F 1156 1880 1.270427 13.0 E\n", "13 Florence F 1063 1880 1.168222 14.0 F\n", "14 Cora F 1045 1880 1.148440 15.0 C\n", "15 Martha F 1040 1880 1.142945 16.0 M\n", "16 Laura F 1012 1880 1.112173 17.0 L\n", "17 Nellie F 995 1880 1.093491 18.0 N\n", "18 Grace F 982 1880 1.079204 19.0 G\n", "19 Carrie F 949 1880 1.042937 20.0 C\n", "20 Maude F 858 1880 0.942930 21.0 M\n", "21 Mabel F 808 1880 0.887980 22.0 M\n", "22 Bessie F 796 1880 0.874793 23.0 B\n", "23 Jennie F 793 1880 0.871496 24.0 J\n", "24 Gertrude F 787 1880 0.864902 25.0 G\n", "25 Julia F 783 1880 0.860506 26.0 J\n", "26 Hattie F 769 1880 0.845120 27.0 H\n", "27 Edith F 768 1880 0.844021 28.0 E\n", "28 Mattie F 704 1880 0.773686 29.0 M\n", "29 Rose F 700 1880 0.769290 30.0 R\n", "... ... .. ... ... ... ... 
...\n", "1970 Philo M 5 1880 0.004525 983.5 P\n", "1971 Phineas M 5 1880 0.004525 983.5 P\n", "1972 Presley M 5 1880 0.004525 983.5 P\n", "1973 Ransom M 5 1880 0.004525 983.5 R\n", "1974 Reece M 5 1880 0.004525 983.5 R\n", "1975 Rene M 5 1880 0.004525 983.5 R\n", "1976 Roswell M 5 1880 0.004525 983.5 R\n", "1977 Rowland M 5 1880 0.004525 983.5 R\n", "1978 Sampson M 5 1880 0.004525 983.5 S\n", "1979 Samual M 5 1880 0.004525 983.5 S\n", "1980 Santos M 5 1880 0.004525 983.5 S\n", "1981 Schuyler M 5 1880 0.004525 983.5 S\n", "1982 Sheppard M 5 1880 0.004525 983.5 S\n", "1983 Spurgeon M 5 1880 0.004525 983.5 S\n", "1984 Starling M 5 1880 0.004525 983.5 S\n", "1985 Sylvanus M 5 1880 0.004525 983.5 S\n", "1986 Theadore M 5 1880 0.004525 983.5 T\n", "1987 Theophile M 5 1880 0.004525 983.5 T\n", "1988 Tilmon M 5 1880 0.004525 983.5 T\n", "1989 Tommy M 5 1880 0.004525 983.5 T\n", "1990 Unknown M 5 1880 0.004525 983.5 U\n", "1991 Vann M 5 1880 0.004525 983.5 V\n", "1992 Wes M 5 1880 0.004525 983.5 W\n", "1993 Winston M 5 1880 0.004525 983.5 W\n", "1994 Wood M 5 1880 0.004525 983.5 W\n", "1995 Woodie M 5 1880 0.004525 983.5 W\n", "1996 Worthy M 5 1880 0.004525 983.5 W\n", "1997 Wright M 5 1880 0.004525 983.5 W\n", "1998 York M 5 1880 0.004525 983.5 Y\n", "1999 Zachariah M 5 1880 0.004525 983.5 Z\n", "\n", "[2000 rows x 7 columns]\n" ] } ], "prompt_number": 55 }, { "cell_type": "code", "collapsed": false, "input": [ "print yob[(yob['name'].str.startswith('A')) & (yob.year == 1880)]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " name sex births year pct ranked\n", "1 Anna F 2604 1880 2.861759 2.0\n", "7 Alice F 1414 1880 1.553966 8.0\n", "10 Annie F 1258 1880 1.382524 11.0\n", "32 Ada F 652 1880 0.716539 33.0\n", "53 Agnes F 473 1880 0.519820 54.0\n", "81 Alma F 277 1880 0.304419 82.0\n", "82 Addie F 274 1880 0.301122 83.0\n", "88 Amanda F 241 1880 0.264856 89.0\n", "95 Amelia F 221 1880 0.242876 96.5\n", "107 
Amy F 167 1880 0.183531 108.0\n", "112 Augusta F 151 1880 0.165947 113.0\n", "119 Anne F 136 1880 0.149462 120.0\n", "121 Ann F 131 1880 0.143967 123.5\n", "143 Allie F 105 1880 0.115393 144.5\n", "150 Alta F 91 1880 0.100008 151.0\n", "170 Alberta F 76 1880 0.083523 171.5\n", "177 Abbie F 71 1880 0.078028 178.0\n", "185 Adelaide F 65 1880 0.071434 188.0\n", "207 Adeline F 54 1880 0.059345 209.0\n", "239 Adele F 41 1880 0.045058 243.0\n", "253 Angie F 36 1880 0.039563 257.0\n", "287 Artie F 29 1880 0.031871 290.5\n", "293 Alvina F 28 1880 0.030772 299.0\n", "294 Annette F 28 1880 0.030772 299.0\n", "308 Adella F 26 1880 0.028574 312.5\n", "309 Alpha F 26 1880 0.028574 312.5\n", "316 Angeline F 25 1880 0.027475 321.0\n", "325 Adah F 24 1880 0.026376 329.0\n", "332 Adaline F 23 1880 0.025277 337.0\n", "351 Almeda F 21 1880 0.023079 359.0\n", "... ... .. ... ... ... ...\n", "1690 Arlie M 7 1880 0.006335 776.5\n", "1750 Adolf M 6 1880 0.005430 858.5\n", "1751 Albin M 6 1880 0.005430 858.5\n", "1752 Albion M 6 1880 0.005430 858.5\n", "1753 Allison M 6 1880 0.005430 858.5\n", "1754 Alpha M 6 1880 0.005430 858.5\n", "1755 Alpheus M 6 1880 0.005430 858.5\n", "1756 Anastacio M 6 1880 0.005430 858.5\n", "1757 Andre M 6 1880 0.005430 858.5\n", "1758 Annie M 6 1880 0.005430 858.5\n", "1759 Arlington M 6 1880 0.005430 858.5\n", "1760 Armand M 6 1880 0.005430 858.5\n", "1761 Asberry M 6 1880 0.005430 858.5\n", "1762 Asbury M 6 1880 0.005430 858.5\n", "1763 Asher M 6 1880 0.005430 858.5\n", "1764 Augustin M 6 1880 0.005430 858.5\n", "1765 Auther M 6 1880 0.005430 858.5\n", "1766 Author M 6 1880 0.005430 858.5\n", "1850 Ab M 5 1880 0.004525 983.5\n", "1851 Abbott M 5 1880 0.004525 983.5\n", "1852 Agustus M 5 1880 0.004525 983.5\n", "1853 Albertus M 5 1880 0.004525 983.5\n", "1854 Almer M 5 1880 0.004525 983.5\n", "1855 Alphonso M 5 1880 0.004525 983.5\n", "1856 Alvia M 5 1880 0.004525 983.5\n", "1857 Artie M 5 1880 0.004525 983.5\n", "1858 Arvid M 5 1880 0.004525 983.5\n", "1859 
Ashby M 5 1880 0.004525 983.5\n", "1860 Augusta M 5 1880 0.004525 983.5\n", "1861 Aurthur M 5 1880 0.004525 983.5\n", "\n", "[190 rows x 6 columns]\n" ] } ], "prompt_number": 29 },
{ "cell_type": "code", "collapsed": false, "input": [ "yob['firstletter'] = yob['name'].str[0]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 30 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Add zero-valued placeholder rows: one per (year, first letter), so that every\n",
  "# letter is present for every year even when no name starts with it.\n",
  "#\n",
  "# The rows are built as records and handed to the DataFrame constructor in one go.\n",
  "# Assigning through `temp_frame.ix[ind].name = ...` never filled the 'name' column:\n",
  "# `.name` on a row is the Series' own label attribute, not the column, so the\n",
  "# column stayed NaN.\n",
  "alphabet = [chr(c) for c in range(65, 91)]  # 'A'..'Z'\n",
  "placeholder_rows = [\n",
  "    {'name': letter, 'sex': 'F', 'births': 0, 'year': y, 'pct': 0, 'ranked': 0, 'firstletter': letter}\n",
  "    for y in range(1880, 2014)\n",
  "    for letter in alphabet\n",
  "]\n",
  "# Row order matches the old layout: index = 26 * (year - 1880) + letter position.\n",
  "temp_frame = pd.DataFrame(placeholder_rows, columns=yob.columns.values.tolist())\n",
  "print temp_frame.loc[1]"
], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "name NaN\n", "sex F\n", "births 0\n", "year 1880\n", "pct 0\n", "ranked 0\n", "firstletter B\n", "Name: 1, dtype: object\n" ] } ], "prompt_number": 89 },
{ "cell_type": "code", "collapsed": false, "input": [ "yob_aggregated = yob.groupby(['year', 'firstletter', 'sex']).sum().reset_index()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 88 },
{ "cell_type": "code", "collapsed": false, "input": [ "yob_sanitized = yob_aggregated.append(temp_frame)\n", "print yob_sanitized" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " births firstletter name pct ranked sex year\n", "0 9334 A NaN 10.25793 43318 F 1880\n", "1 7406 A NaN 6.702808 54054.5 M 1880\n", "2 3876 B NaN 4.259668 12487.5 F 1880\n", "3 2115 B NaN 1.914183 37631.5 M 1880\n", "4 5868 C NaN 6.448848 36263 F 1880\n", "5 9949 C NaN 9.004353 40421.5 M 1880\n", "6 2218 D NaN 2.43755 21554.5 F 
1880\n", "7 2488 D NaN 2.251767 24340 M 1880\n", "8 11444 E NaN 12.57679 42338 F 1880\n", "9 6894 E NaN 6.239422 43971 M 1880\n", "10 2957 F NaN 3.249701 11487 F 1880\n", "11 6529 F NaN 5.909079 16127.5 M 1880\n", "12 2463 G NaN 2.706802 12885.5 F 1880\n", "13 6274 G NaN 5.678291 21138 M 1880\n", "14 2743 H NaN 3.014518 11620 F 1880\n", "15 7599 H NaN 6.877483 43150.5 M 1880\n", "16 2480 I NaN 2.725484 12726 F 1880\n", "17 947 I NaN 0.8570834 11240 M 1880\n", "18 3800 J NaN 4.176145 20455.5 F 1880\n", "19 22272 J NaN 20.1573 23451.5 M 1880\n", "20 1514 K NaN 1.663864 5389.5 F 1880\n", "21 106 K NaN 0.09593542 5792.5 M 1880\n", "22 8713 L NaN 9.575462 48888.5 F 1880\n", "23 4086 L NaN 3.698039 31886.5 M 1880\n", "24 19779 M NaN 21.73684 46765.5 F 1880\n", "25 3166 M NaN 2.865392 38498.5 M 1880\n", "26 3026 N NaN 3.325531 11614.5 F 1880\n", "27 893 N NaN 0.8082106 12978 M 1880\n", "28 968 O NaN 1.063818 14405 F 1880\n", "29 1736 O NaN 1.571169 21460 M 1880\n", "... ... ... ... ... ... .. ...\n", "3454 0 W NaN 0 0 F 2012\n", "3455 0 X NaN 0 0 F 2012\n", "3456 0 Y NaN 0 0 F 2012\n", "3457 0 Z NaN 0 0 F 2012\n", "3458 0 A NaN 0 0 F 2013\n", "3459 0 B NaN 0 0 F 2013\n", "3460 0 C NaN 0 0 F 2013\n", "3461 0 D NaN 0 0 F 2013\n", "3462 0 E NaN 0 0 F 2013\n", "3463 0 F NaN 0 0 F 2013\n", "3464 0 G NaN 0 0 F 2013\n", "3465 0 H NaN 0 0 F 2013\n", "3466 0 I NaN 0 0 F 2013\n", "3467 0 J NaN 0 0 F 2013\n", "3468 0 K NaN 0 0 F 2013\n", "3469 0 L NaN 0 0 F 2013\n", "3470 0 M NaN 0 0 F 2013\n", "3471 0 N NaN 0 0 F 2013\n", "3472 0 O NaN 0 0 F 2013\n", "3473 0 P NaN 0 0 F 2013\n", "3474 0 Q NaN 0 0 F 2013\n", "3475 0 R NaN 0 0 F 2013\n", "3476 0 S NaN 0 0 F 2013\n", "3477 0 T NaN 0 0 F 2013\n", "3478 0 U NaN 0 0 F 2013\n", "3479 0 V NaN 0 0 F 2013\n", "3480 0 W NaN 0 0 F 2013\n", "3481 0 X NaN 0 0 F 2013\n", "3482 0 Y NaN 0 0 F 2013\n", "3483 0 Z NaN 0 0 F 2013\n", "\n", "[10381 rows x 7 columns]\n" ] } ], "prompt_number": 90 }, { "cell_type": "code", "collapsed": false, "input": [ 
"# sanity check to make sure that column percent adds up to 100\n", "letters = yob_aggregated[yob_aggregated.year == 1880]['firstletter']\n", "print yob_aggregated[(yob_aggregated.year == 1884) & (yob_aggregated.sex == 'F')]['pct'].sum()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "100.0\n" ] } ], "prompt_number": 92 }, { "cell_type": "code", "collapsed": false, "input": [ "print yob_aggregated[(yob_aggregated.year == 1884) & (yob_aggregated.sex == 'F')][['firstletter', 'pct']]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " firstletter pct\n", "197 A 10.326921\n", "199 B 4.654245\n", "201 C 6.137713\n", "203 D 2.371689\n", "205 E 12.761390\n", "207 F 3.252159\n", "209 G 2.937484\n", "211 H 3.079320\n", "213 I 2.626684\n", "215 J 4.060548\n", "217 K 1.527646\n", "219 L 9.493730\n", "221 M 20.987894\n", "223 N 3.279286\n", "225 O 1.258700\n", "227 P 1.464091\n", "229 Q 0.024802\n", "231 R 3.056068\n", "233 S 3.940413\n", "235 T 0.654152\n", "237 U 0.028677\n", "239 V 1.211421\n", "241 W 0.605323\n", "243 Y 0.003875\n", "245 Z 0.255770\n" ] } ], "prompt_number": 97 }, { "cell_type": "code", "collapsed": false, "input": [ "import bokeh\n", "bokeh.load_notebook()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ " \n", " \n", " \n", "
\n", " \n", " BokehJS successfully loaded.\n", "
" ], "metadata": {}, "output_type": "display_data" } ], "prompt_number": 93 }, { "cell_type": "code", "collapsed": false, "input": [ "from collections import OrderedDict\n", "\n", "import numpy as np\n", "\n", "from bokeh.charts import Bar\n", "\n", "\n", "agg = yob_aggregated[(yob_aggregated.year == 1884) & (yob_aggregated.sex == 'F')]\n", "\n", "data = OrderedDict(percent = np.array(agg['pct'], dtype=np.float))\n", "\n", "bar = Bar(data, agg['firstletter'].tolist())\n", "bar.title(\"1884 Female Names\").notebook().stacked().xlabel(\"First Letter\").ylabel(\"Percent born with name\")\n", "bar.show()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", "
\n", "\n" ], "metadata": {}, "output_type": "display_data" } ], "prompt_number": 144 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Now let's try our hand at animation. \n", "\n", "Go to the command line and start the bokeh server by typing bokeh-server.\n", "\n", "It should be available on port 5006." ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "import time\n",
  "\n",
  "from bokeh.plotting import output_notebook\n",
  "from IPython.display import clear_output\n",
  "\n",
  "# Needs the bokeh-server from the step above to be running.\n",
  "output_notebook(url=\"default\")\n"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 155 },
{ "cell_type": "code", "collapsed": false, "input": [
  "def makeBar(year):\n",
  "    \"\"\"Build the first-letter bar chart for female names born in `year`.\n",
  "\n",
  "    Letters with no female names that year are plotted as 0, so every frame of\n",
  "    the animation has the same 26 bars in the same positions.\n",
  "    \"\"\"\n",
  "    agg = yob_aggregated[(yob_aggregated.year == year) & (yob_aggregated.sex == 'F')]\n",
  "    alphabet = [chr(c) for c in range(65, 91)]  # 'A'..'Z'\n",
  "    # Align on the full alphabet; letters absent from this year become NaN, then 0.\n",
  "    pct_by_letter = agg.set_index('firstletter')['pct'].reindex(alphabet).fillna(0)\n",
  "    # dtype=float: the np.float alias has been removed from newer NumPy releases.\n",
  "    data = OrderedDict(percent = np.array(pct_by_letter, dtype=float))\n",
  "    bar = Bar(data, alphabet)\n",
  "    bar.title(str(year) + \" Female Names\").notebook().stacked().xlabel(\"First Letter\").ylabel(\"Percent born with name\")\n",
  "    return bar\n",
  "\n",
  "# Redraw the chart once per year; clearing the previous output makes it read as an animation.\n",
  "for year in range(1881, 2014):\n",
  "    bar = makeBar(year)\n",
  "    clear_output()\n",
  "    bar.show()\n",
  "    time.sleep(.5)\n"
], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", "
\n", "\n" ], "metadata": {}, "output_type": "display_data" }, { "output_type": "stream", "stream": "stderr", "text": [ "ERROR: Internal Python error in the inspect module.\n", "Below is the traceback from this internal error.\n", "\n", "\n", "KeyboardInterrupt\n" ] } ], "prompt_number": 165 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 169 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 169 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }