{ "metadata": { "name": "", "signature": "sha256:701fe9ae90e2133189ceeada0d343d6bbf4591b0b582814e2f53a2d55e7adad8" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### U.S. Baby names iPython notebooks #\n", "\n", " * By David Taylor, [www.prooffreader.com](http://www.prooffreader.com)\n", " * using data from United States Social Security Administration" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Download data and create pandas dataframes ##\n", "\n", "* A .py version of this script is %run in other notebooks to load primary dataframes\n", "* Last cell contains descriptions of dataframe schemas\n", "* Script is smart enough not to download or perform lengthy procedures if the files already exist" ] }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Set working path and import libraries" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data_path = \"baby_names\" \n", "\n", "import os\n", "if not os.path.isdir(data_path): # creates path if it does not exist\n", " os.makedirs(data_path)\n", "\n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Download files from Social Security website unless files already exist in working_path" ] }, { "cell_type": "code", "collapsed": false, "input": [ "os.chdir(data_path)\n", "\n", "ssa_url = 'http://www.socialsecurity.gov/oact/babynames/names.zip' \n", "\n", "if not os.path.isfile(\"names.zip\"):\n", " print \"Downloading.\"\n", " import urllib\n", " urllib.urlretrieve(ssa_url, 'names.zip')\n", "else: print \"Data already downloaded.\"\n", "\n", "if not os.path.isfile(\"yob1880.txt\") or not os.path.isfile(\"yob2013.txt\"):\n", " print \"Extracting.\"\n", " import zipfile\n", " with zipfile.ZipFile('names.zip') as zf:\n", " for member in zf.infolist():\n", " zf.extract(member)\n", "else: print \"Data already extracted.\"\n", "\n", "os.chdir(\"../\")" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Downloading.\n", "Extracting." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 2 }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Create pandas dataframes from U.S. Social Security baby names database and pickle for later use in other notebooks\n", "\n", "This block takes well under a minute on my medium-quality desktop Windows PC." ] }, { "cell_type": "code", "collapsed": false, "input": [ "redo_dataframes = True\n", "os.chdir(data_path)\n", "\n", "if (redo_dataframes == True or\n", " not os.path.isfile(\"yob.pickle\") or \n", " not os.path.isfile(\"names.pickle\") or \n", " not os.path.isfile(\"years.pickle\")):\n", "\n", " print \"Processing.\"\n", " \n", " # read individual files, yob1880.txt, yob1881.txt, etc. and assemble into a dataframe\n", " years = range(1880, 2014) # stops at 2013: update this when Social Security Administration adds to data \n", " parts = []\n", " part_columns = ['name', 'sex', 'births']\n", " for year in years:\n", " path = 'yob' + str(year) + '.txt'\n", " part_df = pd.read_csv(path, names=part_columns)\n", " part_df['year'] = year\n", " parts.append(part_df)\n", " yob = pd.concat(parts, ignore_index=True)\n", " \n", " # add column 'pct': the number of births of that name and sex in that year\n", " # divided by the total number of births of that sex in that year, multiplied by\n", " # 100 to turn into a percentage and reduce leading zeroes\n", " def add_pct(group):\n", " births = group.births.astype(float)\n", " group['pct'] = (births / births.sum() * 100)\n", " return group\n", " yob = yob.groupby(['year', 'sex']).apply(add_pct)\n", " #add rank of each name each year each sex\n", " yob['ranked'] = yob.groupby(['year', 'sex'])['births'].rank(ascending=False)\n", " yob.to_pickle(\"yob.pickle\")\n", " \n", " # names dataframe: discards individual birth or pct values, and instead collects data on unique names.\n", " # There is one row per unique combination of name and sex.\n", " yobf = yob[yob.sex == 'F']\n", " yobm = yob[yob.sex == 'M']\n", " names_count = pd.DataFrame(yobf['name'].value_counts())\n", " names_count.columns= ['year_count']\n", " names_min = pd.DataFrame(yobf.groupby('name').year.min()) \n", " names_min.columns = ['year_min']\n", " names_max = pd.DataFrame(yobf.groupby('name').year.max()) \n", " names_max.columns = ['year_max']\n", " names_pctsum = pd.DataFrame(yobf.groupby('name').pct.sum()) \n", " names_pctsum.columns = ['pct_sum']\n", " names_pctmax = pd.DataFrame(yobf.groupby('name').pct.max())\n", " names_pctmax.columns = ['pct_max']\n", " names_f = names_count.join(names_min)\n", " names_f = names_f.join(names_max)\n", " names_f = names_f.join(names_pctsum)\n", " names_f = names_f.join(names_pctmax)\n", " names_f['sex'] = \"F\"\n", " names_f.reset_index(inplace=True, drop=False)\n", " names_f.columns = ['name', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max', 'sex']\n", " names_f = names_f[['name', 'sex', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max']]\n", " names_count = pd.DataFrame(yobm['name'].value_counts()) \n", " names_count.columns=['year_count']\n", " names_min = pd.DataFrame(yobm.groupby('name').year.min()) \n", " names_min.columns = ['year_min']\n", " names_max = pd.DataFrame(yobm.groupby('name').year.max()) \n", " names_max.columns = ['year_max']\n", " names_pctsum = pd.DataFrame(yobm.groupby('name').pct.sum()) \n", " names_pctsum.columns = ['pct_sum']\n", " names_pctmax = pd.DataFrame(yobm.groupby('name').pct.max()) \n", " names_pctmax.columns = ['pct_max']\n", " names_m = names_count.join(names_min)\n", " names_m = names_m.join(names_max)\n", " names_m = names_m.join(names_pctsum)\n", " names_m = names_m.join(names_pctmax)\n", " names_m['sex'] = \"M\"\n", " names_m.reset_index(inplace=True, drop=False)\n", " names_m.columns = ['name', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max', 'sex']\n", " names_m = names_m[['name', 'sex', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max']]\n", " names = pd.concat([names_f, names_m], ignore_index=True)\n", " names.to_pickle('names.pickle')\n", " \n", " # create years dataframe. Discards individual name data, aggregating by year.\n", " total = pd.DataFrame(yob.pivot_table('births', index='year', columns = 'sex', aggfunc=sum))\n", " total.reset_index(drop=False, inplace=True)\n", " total.columns = ['year', 'births_f', 'births_m']\n", " total['births_t'] = total.births_f + total.births_m\n", " newnames = pd.DataFrame(names.groupby('year_min').year_min.count())\n", " newnames.columns = ['firstyearcount']\n", " newnames.reset_index(drop=False, inplace=True)\n", " newnames.columns = ['year', 'new_names']\n", " uniquenames = pd.DataFrame()\n", " for yr in range(1880, 2013):\n", " uniquenames = uniquenames.append(pd.DataFrame([{'year':yr, 'unique_names':len(yob[yob.year == yr].name.unique())}]), ignore_index=True)\n", " years = pd.merge(left=total, right=newnames, on='year', right_index=False, left_index=False)\n", " years = pd.merge(left=years, right=uniquenames, on='year', right_index=False, left_index=False)\n", " years['sexratio'] = 100.0 * years.births_m / years.births_f\n", " years.to_pickle('years.pickle')\n", " \n", "else:\n", " \n", " print \"Reading from pickle.\"\n", " yob = pd.read_pickle('yob.pickle')\n", " names = pd.read_pickle('names.pickle')\n", " years = pd.read_pickle('years.pickle')\n", " \n", "os.chdir(\"../\")" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Processing.\n" ] } ], "prompt_number": 3 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Dataframe schemas: ####\n", "\n", "Note dataframes have only an arbitrary ordinal index. Indexes and multi-indexes are added later where needed.\n", "\n", "-----------------\n", "\n", "yob = a dataframe with each record comprising a unique name, sex and year.\n", "Length: approx. 1.76 million records; pickle = ~100 MB\n", "\n", " name String\n", " sex M or F\n", " births Number of birth with that name of that sex during that year;\n", " names with fewer than 5 births in a given year are omitted due to privacy concerns\n", " year 1880-2012\n", " pct Percentage of births of that sex during that year with that name (float)\n", " ranked Rank of number of births of that name among all births of that sex during that year\n", "\n", "-----------------\n", "\n", "names = a dataframe with each record comprising a unique name and sex, with data for individual years discarded but summary and additional data added.\n", "Length: approx. 101,000 records; pickle = ~ 7 MB\n", "\n", " name Same as in df\n", " sex Same as in df\n", " year_count Number of different years in which that name appears in dataframe, from 1 to 133.\n", " year_min First year name appears in database\n", " year_max Last year name appears in database\n", " pct_sum Sum of pct field for that name for all years. Not a statistically meaningful number\n", " (because the underlying distribution of names varies from year to year),\n", " but I have found it a useful rough metric during development\n", " pct_max Maximum value in pct field for all years, indicating the most popular that name has ever been in the database.\n", "\n", "------------------\n", "\n", "years = a dataframe with each record comprising a unique year, with individual name data discarded but summary and additional data added.\n", "Length: 133 records; pickle = ~ 8 kB\n", "\n", " year Same as in df\n", " births_f Number of female births during that year\n", " births_m Number of male births during that year\n", " births_t Total number of births during that year\n", " new_names Number of names that appear for the first time during that year\n", " unique_names Number of different names that appear during that year\n", " sexratio Number of boys born per hundred girls\n" ] }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Tails of all three dataframes:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "print \"Tail of dataframe 'yob':\"\n", "print yob.tail()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Tail of dataframe 'yob':\n", " name sex births year pct ranked\n", "1792086 Zyhier M 5 2013 0.000267 12995\n", "1792087 Zylar M 5 2013 0.000267 12995\n", "1792088 Zymari M 5 2013 0.000267 12995\n", "1792089 Zymeer M 5 2013 0.000267 12995\n", "1792090 Zyree M 5 2013 0.000267 12995" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "print \"\\nTail of dataframe 'names':\"\n", "print names.tail()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Tail of dataframe 'names':\n", " name sex year_count year_min year_max pct_sum pct_max\n", "102685 Gross M 1 1925 1925 0.000538 0.000538\n", "102686 Elik M 1 2012 2012 0.000318 0.000318\n", "102687 Patrickjoseph M 1 1998 1998 0.000262 0.000262\n", "102688 Southern M 1 1923 1923 0.000547 0.000547\n", "102689 Jeon M 1 1999 1999 0.000261 0.000261\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "print \"\\nTail of dataframe 'years':\"\n", "print years.tail()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Tail of dataframe 'years':\n", " year births_f births_m births_t new_names unique_names sexratio\n", "128 2008 1886765 2035811 3922576 2046 32483 107.899553\n", "129 2009 1832276 1978582 3810858 1789 32210 107.984932\n", "130 2010 1771846 1912915 3684761 1635 31593 107.961696\n", "131 2011 1752198 1891800 3643998 1539 31412 107.967250\n", "132 2012 1751866 1886972 3638838 1531 31212 107.712120\n" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }