{
"metadata": {
"name": "",
"signature": "sha256:826110cd178bdd440951af49b533cf11d85b39637602a5ea6be6c2f7783010ba"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Creating a list of potential baby names\n",
"\n",
"- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n",
"- **Date:** -\n",
"- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n",
"- **Note:**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prelimaries"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Import modules\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"\n",
"# Set plots to be inline\n",
"%matplotlib inline\n",
"\n",
"# Set ipython's max row display\n",
"pd.set_option('display.max_row', 1000)\n",
"\n",
"# Set iPython's max column width to 50\n",
"pd.set_option('display.max_columns', 50)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 41
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load the data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create a sequence of numbers as a list, from 1880 to 2013\n",
"file_number = list(range(1880, 2013, 1))\n",
"\n",
"# Convert the list to a string\n",
"file_number = list(map((lambda x: str(x)), file_number))\n",
"\n",
"# View the first five elements of the list\n",
"file_number[0:5]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 42,
"text": [
"['1880', '1881', '1882', '1883', '1884']"
]
}
],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create a dataframe name variable\n",
"df_name = []\n",
"\n",
"# Set the iteration counter\n",
"i = 0\n",
"\n",
"# For each item in file_number list\n",
"for item in file_number:\n",
" # Create a file name that is df_ and the file_number, then attach to df_name\n",
" df_name.append('df_' + str(file_number[i]))\n",
" # Add one to the iteration counter\n",
" i = i+1"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 43
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# View the top five rows of df_name\n",
"df_name[0:5]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 44,
"text": [
"['df_1880', 'df_1881', 'df_1882', 'df_1883', 'df_1884']"
]
}
],
"prompt_number": 44
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create a list for the file names\n",
"file_name = []\n",
"\n",
"# Create the iteration counter\n",
"i = 0\n",
"\n",
"# For each item in file number,\n",
"for item in file_number:\n",
" # Create a filename that combines, yob the year, and .txt\n",
" file_name.append('yob' + str(file_number[i]) + '.txt')\n",
" # Add 1 to the iteration counter\n",
" i = i+1"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 45
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# View the top five rows of file_name\n",
"file_name[0:5]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 46,
"text": [
"['yob1880.txt', 'yob1881.txt', 'yob1882.txt', 'yob1883.txt', 'yob1884.txt']"
]
}
],
"prompt_number": 46
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create a file path for the directory where the files are located\n",
"file_loc = os.path.abspath(\"data/names/\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 47
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create a dataframe for the data we will creat in the next step\n",
"df = pd.DataFrame()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 48
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create an iteration counter\n",
"k = 0\n",
"\n",
"# For each item in df_name,\n",
"for item in df_name:\n",
" # Run the command to read the csv using the variables we created previously\n",
" data = pd.read_csv(file_loc+'/'+file_name[k], header=None, names=['name', 'sex', 'count'])\n",
" # Create a variable with the year of the observation\n",
" data['year'] = file_number[k]\n",
" # Concat (i.e. attach) the data to the df\n",
" df = pd.concat([df, data])\n",
" # Add 1 to the iteration counter\n",
" k = k+1"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 49
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Check the length of the df, just to make sure everything is okay\n",
"len(df)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 50,
"text": [
"1759019"
]
}
],
"prompt_number": 50
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clean the data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Drop all males (I'm having a daughter!)\n",
"df = df[df.sex != 'M'] "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 51
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Check the length of the df, we should lose roughly half the observations\n",
"len(df)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 52,
"text": [
"1043318"
]
}
],
"prompt_number": 52
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create a boolean variable that is true when year == 2012 and False otherise\n",
"df['2012'] = np.where(df['year'] == '2012', True, False)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 53
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create a variable called df.count_2012 that is df.count when df.2012 is true\n",
"df['count_2012'] = df['count'][df['2012']]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 54
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.head(3)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" sex | \n",
" count | \n",
" year | \n",
" 2012 | \n",
" count_2012 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Mary | \n",
" F | \n",
" 7065 | \n",
" 1880 | \n",
" False | \n",
" 22245 | \n",
"
\n",
" \n",
" 1 | \n",
" Anna | \n",
" F | \n",
" 2604 | \n",
" 1880 | \n",
" False | \n",
" 20871 | \n",
"
\n",
" \n",
" 2 | \n",
" Emma | \n",
" F | \n",
" 2003 | \n",
" 1880 | \n",
" False | \n",
" 19026 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 55,
"text": [
" name sex count year 2012 count_2012\n",
"0 Mary F 7065 1880 False 22245\n",
"1 Anna F 2604 1880 False 20871\n",
"2 Emma F 2003 1880 False 19026"
]
}
],
"prompt_number": 55
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reshape the data into the format we want"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create a variable that is a pivot table, \n",
"# totalling the number of times a name is registered\n",
"names = df.pivot_table(index=['name'], aggfunc=np.sum)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 56
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sort the names variable by their popularity in 2012\n",
"names = names.sort(['count_2012'], ascending=[0])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 57
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Clean the dataset by dropping the boolean 2012 variable\n",
"names = names.drop('2012', axis=1)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 58
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Turn the index into its own column\n",
"names['names'] = names.index"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 59
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# create a dataframe with all names ending in a\n",
"a_names = names[names['names'].str.endswith('a')]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 62
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# How many names in a_names?\n",
"len(a_names)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 75,
"text": [
"26687"
]
}
],
"prompt_number": 75
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Let's find Zaria\n",
"a_names[a_names['names'] == 'Zaria']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" count_2012 | \n",
" names | \n",
"
\n",
" \n",
" name | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Zaria | \n",
" 6892 | \n",
" 7449 | \n",
" Zaria | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 76,
"text": [
" count count_2012 names\n",
"name \n",
"Zaria 6892 7449 Zaria"
]
}
],
"prompt_number": 76
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Export the data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Export the data to csv\n",
"a_names.to_csv('names.csv')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 61
}
],
"metadata": {}
}
]
}