{
 "metadata": {
  "name": "",
  "signature": "sha256:701fe9ae90e2133189ceeada0d343d6bbf4591b0b582814e2f53a2d55e7adad8"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### U.S. Baby names iPython notebooks #\n",
      "\n",
      "  * By David Taylor, [www.prooffreader.com](http://www.prooffreader.com)\n",
      "  * using data from United States Social Security Administration"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Download data and create pandas dataframes ##\n",
      "\n",
      "* A .py version of this script is %run in other notebooks to load primary dataframes\n",
      "* Last cell contains descriptions of dataframe schemas\n",
      "* Script is smart enough not to download or perform lengthy procedures if the files already exist"
     ]
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Set working path and import libraries"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data_path = \"baby_names\" \n",
      "\n",
      "import os\n",
      "if not os.path.isdir(data_path): # creates path if it does not exist\n",
      "    os.makedirs(data_path)\n",
      "\n",
      "import pandas as pd"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Download files from Social Security website unless files already exist in working_path"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "os.chdir(data_path)\n",
      "\n",
      "ssa_url = 'http://www.socialsecurity.gov/oact/babynames/names.zip' \n",
      "\n",
      "if not os.path.isfile(\"names.zip\"):\n",
      "    print \"Downloading.\"\n",
      "    import urllib\n",
      "    urllib.urlretrieve(ssa_url, 'names.zip')\n",
      "else: print \"Data already downloaded.\"\n",
      "\n",
      "if not os.path.isfile(\"yob1880.txt\") or not os.path.isfile(\"yob2013.txt\"):\n",
      "    print \"Extracting.\"\n",
      "    import zipfile\n",
      "    with zipfile.ZipFile('names.zip') as zf:\n",
      "        for member in zf.infolist():\n",
      "            zf.extract(member)\n",
      "else: print \"Data already extracted.\"\n",
      "\n",
      "os.chdir(\"../\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Downloading.\n",
        "Extracting."
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Create pandas dataframes from U.S. Social Security baby names database and pickle for later use in other notebooks\n",
      "\n",
      "This block takes well under a minute on my medium-quality desktop Windows PC."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "redo_dataframes = True\n",
      "os.chdir(data_path)\n",
      "\n",
      "if (redo_dataframes == True or\n",
      "    not os.path.isfile(\"yob.pickle\") or \n",
      "    not os.path.isfile(\"names.pickle\") or \n",
      "    not os.path.isfile(\"years.pickle\")):\n",
      "\n",
      "    print \"Processing.\"\n",
      "    \n",
      "    # read individual files, yob1880.txt, yob1881.txt, etc. and assemble into a dataframe\n",
      "    years = range(1880, 2014) # stops at 2013: update this when Social Security Administration adds to data \n",
      "    parts = []\n",
      "    part_columns = ['name', 'sex', 'births']\n",
      "    for year in years:\n",
      "        path = 'yob' + str(year) + '.txt'\n",
      "        part_df = pd.read_csv(path, names=part_columns)\n",
      "        part_df['year'] = year\n",
      "        parts.append(part_df)\n",
      "    yob = pd.concat(parts, ignore_index=True)\n",
      "    \n",
      "    # add column 'pct': the number of births of that name and sex in that year\n",
      "    # divided by the total number of births of that sex in that year, multiplied by\n",
      "    # 100 to turn into a percentage and reduce leading zeroes\n",
      "    def add_pct(group):\n",
      "        births = group.births.astype(float)\n",
      "        group['pct'] = (births / births.sum() * 100)\n",
      "        return group\n",
      "    yob = yob.groupby(['year', 'sex']).apply(add_pct)\n",
      "    #add rank of each name each year each sex\n",
      "    yob['ranked'] = yob.groupby(['year', 'sex'])['births'].rank(ascending=False)\n",
      "    yob.to_pickle(\"yob.pickle\")\n",
      "    \n",
      "    # names dataframe: discards individual birth or pct values, and instead collects data on unique names.\n",
      "    # There is one row per unique combination of name and sex.\n",
      "    yobf = yob[yob.sex == 'F']\n",
      "    yobm = yob[yob.sex == 'M']\n",
      "    names_count = pd.DataFrame(yobf['name'].value_counts())\n",
      "    names_count.columns= ['year_count']\n",
      "    names_min = pd.DataFrame(yobf.groupby('name').year.min()) \n",
      "    names_min.columns = ['year_min']\n",
      "    names_max = pd.DataFrame(yobf.groupby('name').year.max()) \n",
      "    names_max.columns = ['year_max']\n",
      "    names_pctsum = pd.DataFrame(yobf.groupby('name').pct.sum()) \n",
      "    names_pctsum.columns = ['pct_sum']\n",
      "    names_pctmax = pd.DataFrame(yobf.groupby('name').pct.max())\n",
      "    names_pctmax.columns = ['pct_max']\n",
      "    names_f = names_count.join(names_min)\n",
      "    names_f = names_f.join(names_max)\n",
      "    names_f = names_f.join(names_pctsum)\n",
      "    names_f = names_f.join(names_pctmax)\n",
      "    names_f['sex'] = \"F\"\n",
      "    names_f.reset_index(inplace=True, drop=False)\n",
      "    names_f.columns = ['name', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max', 'sex']\n",
      "    names_f = names_f[['name', 'sex', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max']]\n",
      "    names_count = pd.DataFrame(yobm['name'].value_counts()) \n",
      "    names_count.columns=['year_count']\n",
      "    names_min = pd.DataFrame(yobm.groupby('name').year.min()) \n",
      "    names_min.columns = ['year_min']\n",
      "    names_max = pd.DataFrame(yobm.groupby('name').year.max()) \n",
      "    names_max.columns = ['year_max']\n",
      "    names_pctsum = pd.DataFrame(yobm.groupby('name').pct.sum()) \n",
      "    names_pctsum.columns = ['pct_sum']\n",
      "    names_pctmax = pd.DataFrame(yobm.groupby('name').pct.max()) \n",
      "    names_pctmax.columns = ['pct_max']\n",
      "    names_m = names_count.join(names_min)\n",
      "    names_m = names_m.join(names_max)\n",
      "    names_m = names_m.join(names_pctsum)\n",
      "    names_m = names_m.join(names_pctmax)\n",
      "    names_m['sex'] = \"M\"\n",
      "    names_m.reset_index(inplace=True, drop=False)\n",
      "    names_m.columns = ['name', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max', 'sex']\n",
      "    names_m = names_m[['name', 'sex', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max']]\n",
      "    names = pd.concat([names_f, names_m], ignore_index=True)\n",
      "    names.to_pickle('names.pickle')\n",
      "    \n",
      "    # create years dataframe. Discards individual name data, aggregating by year.\n",
      "    total = pd.DataFrame(yob.pivot_table('births', index='year', columns = 'sex', aggfunc=sum))\n",
      "    total.reset_index(drop=False, inplace=True)\n",
      "    total.columns = ['year', 'births_f', 'births_m']\n",
      "    total['births_t'] = total.births_f + total.births_m\n",
      "    newnames = pd.DataFrame(names.groupby('year_min').year_min.count())\n",
      "    newnames.columns = ['firstyearcount']\n",
      "    newnames.reset_index(drop=False, inplace=True)\n",
      "    newnames.columns = ['year', 'new_names']\n",
      "    uniquenames = pd.DataFrame()\n",
      "    for yr in range(1880, 2013):\n",
      "        uniquenames = uniquenames.append(pd.DataFrame([{'year':yr, 'unique_names':len(yob[yob.year == yr].name.unique())}]), ignore_index=True)\n",
      "    years = pd.merge(left=total, right=newnames, on='year', right_index=False, left_index=False)\n",
      "    years = pd.merge(left=years, right=uniquenames, on='year', right_index=False, left_index=False)\n",
      "    years['sexratio'] = 100.0 * years.births_m / years.births_f\n",
      "    years.to_pickle('years.pickle')\n",
      "    \n",
      "else:\n",
      "    \n",
      "    print \"Reading from pickle.\"\n",
      "    yob = pd.read_pickle('yob.pickle')\n",
      "    names = pd.read_pickle('names.pickle')\n",
      "    years = pd.read_pickle('years.pickle')\n",
      "    \n",
      "os.chdir(\"../\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Processing.\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### Dataframe schemas: ####\n",
      "\n",
      "Note dataframes have only an arbitrary ordinal index. Indexes and multi-indexes are added later where needed.\n",
      "\n",
      "-----------------\n",
      "\n",
      "yob = a dataframe with each record comprising a unique name, sex and year.\n",
      "Length: approx. 1.76 million records; pickle = ~100 MB\n",
      "\n",
      "    name     String\n",
      "    sex      M or F\n",
      "    births   Number of birth with that name of that sex during that year;\n",
      "             names with fewer than 5 births in a given year are omitted due to privacy concerns\n",
      "    year     1880-2012\n",
      "    pct      Percentage of births of that sex during that year with that name (float)\n",
      "    ranked   Rank of number of births of that name among all births of that sex during that year\n",
      "\n",
      "-----------------\n",
      "\n",
      "names = a dataframe with each record comprising a unique name and sex, with data for individual years discarded but summary and additional data added.\n",
      "Length: approx. 101,000 records; pickle = ~ 7 MB\n",
      "\n",
      "    name              Same as in df\n",
      "    sex               Same as in df\n",
      "    year_count        Number of different years in which that name appears in dataframe, from 1 to 133.\n",
      "    year_min          First year name appears in database\n",
      "    year_max          Last year name appears in database\n",
      "    pct_sum           Sum of pct field for that name for all years. Not a statistically meaningful number\n",
      "                      (because the underlying distribution of names varies from year to year),\n",
      "                      but I have found it a useful rough metric during development\n",
      "    pct_max           Maximum value in pct field for all years, indicating the most popular that name has ever been in the database.\n",
      "\n",
      "------------------\n",
      "\n",
      "years = a dataframe with each record comprising a unique year, with individual name data discarded but summary and additional data added.\n",
      "Length: 133 records; pickle = ~ 8 kB\n",
      "\n",
      "    year              Same as in df\n",
      "    births_f          Number of female births during that year\n",
      "    births_m          Number of male births during that year\n",
      "    births_t          Total number of births during that year\n",
      "    new_names         Number of names that appear for the first time during that year\n",
      "    unique_names      Number of different names that appear during that year\n",
      "    sexratio          Number of boys born per hundred girls\n"
     ]
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Tails of all three dataframes:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print \"Tail of dataframe 'yob':\"\n",
      "print yob.tail()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Tail of dataframe 'yob':\n",
        "           name sex  births  year       pct  ranked\n",
        "1792086  Zyhier   M       5  2013  0.000267   12995\n",
        "1792087   Zylar   M       5  2013  0.000267   12995\n",
        "1792088  Zymari   M       5  2013  0.000267   12995\n",
        "1792089  Zymeer   M       5  2013  0.000267   12995\n",
        "1792090   Zyree   M       5  2013  0.000267   12995"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print \"\\nTail of dataframe 'names':\"\n",
      "print names.tail()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Tail of dataframe 'names':\n",
        "                 name sex  year_count  year_min  year_max   pct_sum   pct_max\n",
        "102685          Gross   M           1      1925      1925  0.000538  0.000538\n",
        "102686           Elik   M           1      2012      2012  0.000318  0.000318\n",
        "102687  Patrickjoseph   M           1      1998      1998  0.000262  0.000262\n",
        "102688       Southern   M           1      1923      1923  0.000547  0.000547\n",
        "102689           Jeon   M           1      1999      1999  0.000261  0.000261\n"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print \"\\nTail of dataframe 'years':\"\n",
      "print years.tail()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Tail of dataframe 'years':\n",
        "     year  births_f  births_m  births_t  new_names  unique_names    sexratio\n",
        "128  2008   1886765   2035811   3922576       2046         32483  107.899553\n",
        "129  2009   1832276   1978582   3810858       1789         32210  107.984932\n",
        "130  2010   1771846   1912915   3684761       1635         31593  107.961696\n",
        "131  2011   1752198   1891800   3643998       1539         31412  107.967250\n",
        "132  2012   1751866   1886972   3638838       1531         31212  107.712120\n"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}