{
 "metadata": {
  "name": "",
  "signature": "sha256:826110cd178bdd440951af49b533cf11d85b39637602a5ea6be6c2f7783010ba"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Creating a list of potential baby names\n",
      "\n",
      "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n",
      "- **Date:** -\n",
      "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n",
      "- **Note:**"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Prelimaries"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Import modules\n",
      "import pandas as pd\n",
      "import numpy as np\n",
      "import os\n",
      "\n",
      "# Set plots to be inline\n",
      "%matplotlib inline\n",
      "\n",
      "# Set ipython's max row display\n",
      "pd.set_option('display.max_row', 1000)\n",
      "\n",
      "# Set iPython's max column width to 50\n",
      "pd.set_option('display.max_columns', 50)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 41
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Load the data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a sequence of numbers as a list, from 1880 to 2013\n",
      "file_number = list(range(1880, 2013, 1))\n",
      "\n",
      "# Convert the list to a string\n",
      "file_number = list(map((lambda x: str(x)), file_number))\n",
      "\n",
      "# View the first five elements of the list\n",
      "file_number[0:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 42,
       "text": [
        "['1880', '1881', '1882', '1883', '1884']"
       ]
      }
     ],
     "prompt_number": 42
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a dataframe name variable\n",
      "df_name = []\n",
      "\n",
      "# Set the iteration counter\n",
      "i = 0\n",
      "\n",
      "# For each item in file_number list\n",
      "for item in file_number:\n",
      "    # Create a file name that is df_ and the file_number, then attach to df_name\n",
      "    df_name.append('df_' + str(file_number[i]))\n",
      "    # Add one to the iteration counter\n",
      "    i = i+1"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 43
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# View the top five rows of df_name\n",
      "df_name[0:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 44,
       "text": [
        "['df_1880', 'df_1881', 'df_1882', 'df_1883', 'df_1884']"
       ]
      }
     ],
     "prompt_number": 44
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a list for the file names\n",
      "file_name = []\n",
      "\n",
      "# Create the iteration counter\n",
      "i = 0\n",
      "\n",
      "# For each item in file number,\n",
      "for item in file_number:\n",
      "    # Create a filename that combines, yob the year, and .txt\n",
      "    file_name.append('yob' + str(file_number[i]) + '.txt')\n",
      "    # Add 1 to the iteration counter\n",
      "    i = i+1"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 45
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# View the top five rows of file_name\n",
      "file_name[0:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 46,
       "text": [
        "['yob1880.txt', 'yob1881.txt', 'yob1882.txt', 'yob1883.txt', 'yob1884.txt']"
       ]
      }
     ],
     "prompt_number": 46
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a file path for the directory where the files are located\n",
      "file_loc = os.path.abspath(\"data/names/\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 47
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a dataframe for the data we will creat in the next step\n",
      "df = pd.DataFrame()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 48
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create an iteration counter\n",
      "k = 0\n",
      "\n",
      "# For each item in df_name,\n",
      "for item in df_name:\n",
      "    # Run the command to read the csv using the variables we created previously\n",
      "    data = pd.read_csv(file_loc+'/'+file_name[k], header=None, names=['name', 'sex', 'count'])\n",
      "    # Create a variable with the year of the observation\n",
      "    data['year'] = file_number[k]\n",
      "    # Concat (i.e. attach) the data to the df\n",
      "    df = pd.concat([df, data])\n",
      "    # Add 1 to the iteration counter\n",
      "    k = k+1"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 49
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Check the length of the df, just to make sure everything is okay\n",
      "len(df)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 50,
       "text": [
        "1759019"
       ]
      }
     ],
     "prompt_number": 50
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Clean the data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Drop all males (I'm having a daughter!)\n",
      "df = df[df.sex != 'M'] "
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 51
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Check the length of the df, we should lose roughly half the observations\n",
      "len(df)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 52,
       "text": [
        "1043318"
       ]
      }
     ],
     "prompt_number": 52
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a boolean variable that is true when year == 2012 and False otherise\n",
      "df['2012'] = np.where(df['year'] == '2012', True, False)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 53
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a variable called df.count_2012 that is df.count when df.2012 is true\n",
      "df['count_2012'] = df['count'][df['2012']]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 54
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df.head(3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>name</th>\n",
        "      <th>sex</th>\n",
        "      <th>count</th>\n",
        "      <th>year</th>\n",
        "      <th>2012</th>\n",
        "      <th>count_2012</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> Mary</td>\n",
        "      <td> F</td>\n",
        "      <td> 7065</td>\n",
        "      <td> 1880</td>\n",
        "      <td> False</td>\n",
        "      <td> 22245</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> Anna</td>\n",
        "      <td> F</td>\n",
        "      <td> 2604</td>\n",
        "      <td> 1880</td>\n",
        "      <td> False</td>\n",
        "      <td> 20871</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> Emma</td>\n",
        "      <td> F</td>\n",
        "      <td> 2003</td>\n",
        "      <td> 1880</td>\n",
        "      <td> False</td>\n",
        "      <td> 19026</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 55,
       "text": [
        "   name sex  count  year   2012  count_2012\n",
        "0  Mary   F   7065  1880  False       22245\n",
        "1  Anna   F   2604  1880  False       20871\n",
        "2  Emma   F   2003  1880  False       19026"
       ]
      }
     ],
     "prompt_number": 55
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Reshape the data into the format we want"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a variable that is a pivot table, \n",
      "# totalling the number of times a name is registered\n",
      "names = df.pivot_table(index=['name'], aggfunc=np.sum)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 56
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Sort the names variable by their popularity in 2012\n",
      "names = names.sort(['count_2012'], ascending=[0])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 57
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Clean the dataset by dropping the boolean 2012 variable\n",
      "names = names.drop('2012', axis=1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 58
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Turn the index into its own column\n",
      "names['names'] = names.index"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 59
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# create a dataframe with all names ending in a\n",
      "a_names = names[names['names'].str.endswith('a')]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 62
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# How many names in a_names?\n",
      "len(a_names)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 75,
       "text": [
        "26687"
       ]
      }
     ],
     "prompt_number": 75
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Let's find Zaria\n",
      "a_names[a_names['names'] == 'Zaria']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>count</th>\n",
        "      <th>count_2012</th>\n",
        "      <th>names</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>name</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>Zaria</th>\n",
        "      <td> 6892</td>\n",
        "      <td> 7449</td>\n",
        "      <td> Zaria</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 76,
       "text": [
        "       count  count_2012  names\n",
        "name                           \n",
        "Zaria   6892        7449  Zaria"
       ]
      }
     ],
     "prompt_number": 76
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Export the data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Export the data to csv\n",
      "a_names.to_csv('names.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 61
    }
   ],
   "metadata": {}
  }
 ]
}