{
 "metadata": {
  "name": "",
  "signature": "sha256:826110cd178bdd440951af49b533cf11d85b39637602a5ea6be6c2f7783010ba"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Creating a list of potential baby names\n",
      "\n",
      "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n",
      "- **Date:** -\n",
      "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n",
      "- **Note:** Loads the yearly `yobYYYY.txt` files (1880 through 2012) from `data/names/`, keeps the female names, totals the registrations per name, and exports the names ending in 'a' to `names.csv`."
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Preliminaries"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Import modules\n",
      "import os\n",
      "\n",
      "import numpy as np\n",
      "import pandas as pd\n",
      "\n",
      "# Set plots to be inline\n",
      "%matplotlib inline\n",
      "\n",
      "# Show up to 1000 rows when displaying a dataframe\n",
      "pd.set_option('display.max_rows', 1000)\n",
      "\n",
      "# Show up to 50 columns when displaying a dataframe\n",
      "pd.set_option('display.max_columns', 50)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Load the data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create the list of years, as strings, from 1880 through 2012\n",
      "# (range() stops one short of its end value, so 2013 itself is excluded)\n",
      "file_number = [str(year) for year in range(1880, 2013)]\n",
      "\n",
      "# View the first five elements of the list\n",
      "file_number[0:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a dataframe name for each year: df_ followed by the year\n",
      "df_name = ['df_' + year for year in file_number]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# View the first five elements of df_name\n",
      "df_name[0:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a file name for each year: yob, the year, then .txt\n",
      "file_name = ['yob' + year + '.txt' for year in file_number]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# View the first five elements of file_name\n",
      "file_name[0:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a file path for the directory where the files are located\n",
      "file_loc = os.path.abspath(\"data/names/\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Read each year's file into its own dataframe\n",
      "frames = []\n",
      "for year, fname in zip(file_number, file_name):\n",
      "    # The files have no header row, so supply the column names here\n",
      "    data = pd.read_csv(os.path.join(file_loc, fname), header=None,\n",
      "                       names=['name', 'sex', 'count'])\n",
      "    # Create a variable with the year of the observation (a string, e.g. '1880')\n",
      "    data['year'] = year\n",
      "    frames.append(data)\n",
      "\n",
      "# Concat (i.e. attach) all of the years in a single call.\n",
      "# ignore_index=True gives every row its own label; without it the labels\n",
      "# restart at 0 for each file, and any later label-aligned assignment would\n",
      "# pair up rows that belong to different years.\n",
      "df = pd.concat(frames, ignore_index=True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Check the length of the df, just to make sure everything is okay\n",
      "# (the 1880-2012 files gave 1,759,019 rows when this notebook was last run)\n",
      "len(df)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Clean the data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Drop all males (I'm having a daughter!)\n",
      "# .copy() makes df an independent dataframe, so the columns added below\n",
      "# are not written to a slice of the full data set\n",
      "df = df[df['sex'] != 'M'].copy()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Check the length of the df, we should lose a large share of the observations\n",
      "# (1,043,318 rows remained when this notebook was last run)\n",
      "len(df)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a boolean variable that is True when year == 2012 and False otherwise\n",
      "df['2012'] = df['year'] == '2012'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a variable called count_2012 that equals count on the 2012 rows and\n",
      "# 0 on every other row, so that summing it per name gives the 2012 total.\n",
      "# Series.where works row by row, so the result does not depend on how the\n",
      "# index labels happen to line up.\n",
      "df['count_2012'] = df['count'].where(df['2012'], 0)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# View the first three rows; count_2012 is 0 for these 1880 observations\n",
      "df.head(3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Reshape the data into the format we want"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Total the number of times each name is registered, overall and in 2012.\n",
      "# Only the two numeric columns are summed; sex, year and the boolean 2012\n",
      "# helper are left out of the result.\n",
      "names = df.groupby('name')[['count', 'count_2012']].sum()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Sort the names variable by their popularity in 2012, most popular first\n",
      "names = names.sort_values('count_2012', ascending=False)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Turn the index into its own column\n",
      "names['names'] = names.index"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a dataframe with all names ending in a\n",
      "a_names = names[names['names'].str.endswith('a')]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# How many names in a_names?\n",
      "# (26,687 when this notebook was last run)\n",
      "len(a_names)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Let's find Zaria\n",
      "a_names[a_names['names'] == 'Zaria']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Export the data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Export the data to csv\n",
      "a_names.to_csv('names.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}