{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Combine data files with state populations\n",
"The first data file has 2000-2010"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:34:12.487554Z",
"start_time": "2017-10-30T16:34:10.713622Z"
}
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"from os.path import join\n",
"cwd = os.getcwd()\n",
"data_directory = join(cwd, '..', 'Data storage')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import 2000-2010 data\n",
"The sex, origin, race, and age columns are ALL when they have values of 0\n",
"\n",
"Not clear if these are beginning or end of year values.\n",
"\n",
"https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:34:16.891879Z",
"start_time": "2017-10-30T16:34:16.674736Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"path = os.path.join(data_directory, 'Population data',\n",
" 'st-est00int-alldata.csv')\n",
"pop1 = pd.read_csv(path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:34:18.341992Z",
"start_time": "2017-10-30T16:34:18.316636Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" REGION | \n",
" DIVISION | \n",
" STATE | \n",
" NAME | \n",
" SEX | \n",
" ORIGIN | \n",
" RACE | \n",
" AGEGRP | \n",
" ESTIMATESBASE2000 | \n",
" POPESTIMATE2000 | \n",
" ... | \n",
" POPESTIMATE2002 | \n",
" POPESTIMATE2003 | \n",
" POPESTIMATE2004 | \n",
" POPESTIMATE2005 | \n",
" POPESTIMATE2006 | \n",
" POPESTIMATE2007 | \n",
" POPESTIMATE2008 | \n",
" POPESTIMATE2009 | \n",
" CENSUS2010POP | \n",
" POPESTIMATE2010 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" United States | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 281424600 | \n",
" 282162411 | \n",
" ... | \n",
" 287625193 | \n",
" 290107933 | \n",
" 292805298 | \n",
" 295516599 | \n",
" 298379912 | \n",
" 301231207 | \n",
" 304093966 | \n",
" 306771529 | \n",
" 308745538 | \n",
" 309349689 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" United States | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 19176154 | \n",
" 19178293 | \n",
" ... | \n",
" 19429192 | \n",
" 19592446 | \n",
" 19785885 | \n",
" 19917400 | \n",
" 19938883 | \n",
" 20125962 | \n",
" 20271127 | \n",
" 20244518 | \n",
" 20201362 | \n",
" 20200529 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" United States | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 20549855 | \n",
" 20463852 | \n",
" ... | \n",
" 19872417 | \n",
" 19620851 | \n",
" 19454237 | \n",
" 19389067 | \n",
" 19544688 | \n",
" 19714611 | \n",
" 19929602 | \n",
" 20182499 | \n",
" 20348657 | \n",
" 20382409 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" United States | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 20528425 | \n",
" 20637696 | \n",
" ... | \n",
" 21261421 | \n",
" 21415353 | \n",
" 21411680 | \n",
" 21212579 | \n",
" 21033138 | \n",
" 20841042 | \n",
" 20706655 | \n",
" 20660564 | \n",
" 20677194 | \n",
" 20694011 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" United States | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 4 | \n",
" 20218782 | \n",
" 20294955 | \n",
" ... | \n",
" 20610370 | \n",
" 20797166 | \n",
" 21102552 | \n",
" 21486214 | \n",
" 21807709 | \n",
" 22067816 | \n",
" 22210880 | \n",
" 22192810 | \n",
" 22040343 | \n",
" 21959087 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 21 columns
\n",
"
"
],
"text/plain": [
" REGION DIVISION STATE NAME SEX ORIGIN RACE AGEGRP \\\n",
"0 0 0 0 United States 0 0 0 0 \n",
"1 0 0 0 United States 0 0 0 1 \n",
"2 0 0 0 United States 0 0 0 2 \n",
"3 0 0 0 United States 0 0 0 3 \n",
"4 0 0 0 United States 0 0 0 4 \n",
"\n",
" ESTIMATESBASE2000 POPESTIMATE2000 ... POPESTIMATE2002 \\\n",
"0 281424600 282162411 ... 287625193 \n",
"1 19176154 19178293 ... 19429192 \n",
"2 20549855 20463852 ... 19872417 \n",
"3 20528425 20637696 ... 21261421 \n",
"4 20218782 20294955 ... 20610370 \n",
"\n",
" POPESTIMATE2003 POPESTIMATE2004 POPESTIMATE2005 POPESTIMATE2006 \\\n",
"0 290107933 292805298 295516599 298379912 \n",
"1 19592446 19785885 19917400 19938883 \n",
"2 19620851 19454237 19389067 19544688 \n",
"3 21415353 21411680 21212579 21033138 \n",
"4 20797166 21102552 21486214 21807709 \n",
"\n",
" POPESTIMATE2007 POPESTIMATE2008 POPESTIMATE2009 CENSUS2010POP \\\n",
"0 301231207 304093966 306771529 308745538 \n",
"1 20125962 20271127 20244518 20201362 \n",
"2 19714611 19929602 20182499 20348657 \n",
"3 20841042 20706655 20660564 20677194 \n",
"4 22067816 22210880 22192810 22040343 \n",
"\n",
" POPESTIMATE2010 \n",
"0 309349689 \n",
"1 20200529 \n",
"2 20382409 \n",
"3 20694011 \n",
"4 21959087 \n",
"\n",
"[5 rows x 21 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop1.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:34:40.634078Z",
"start_time": "2017-10-30T16:34:40.613118Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"pop1 = pop1.loc[(pop1['SEX'] == 0) &\n",
" (pop1['ORIGIN'] == 0) &\n",
" (pop1['RACE'] == 0) &\n",
" (pop1['AGEGRP'] == 0), :]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:35:15.240643Z",
"start_time": "2017-10-30T16:35:15.236029Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"# Column names for population estimate\n",
"est_cols = ['POPESTIMATE{}'.format(x) for x in range(2000, 2011)]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:35:32.342354Z",
"start_time": "2017-10-30T16:35:32.332791Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"# Melt the wide-form data into a tidy dataframe\n",
"pop1_tidy = pd.melt(pop1, id_vars='NAME',\n",
" value_vars=est_cols, var_name='Year',\n",
" value_name='Population')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:35:35.480574Z",
"start_time": "2017-10-30T16:35:35.470689Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" NAME | \n",
" Year | \n",
" Population | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" United States | \n",
" POPESTIMATE2000 | \n",
" 282162411 | \n",
"
\n",
" \n",
" 1 | \n",
" Alabama | \n",
" POPESTIMATE2000 | \n",
" 4452173 | \n",
"
\n",
" \n",
" 2 | \n",
" Alaska | \n",
" POPESTIMATE2000 | \n",
" 627963 | \n",
"
\n",
" \n",
" 3 | \n",
" Arizona | \n",
" POPESTIMATE2000 | \n",
" 5160586 | \n",
"
\n",
" \n",
" 4 | \n",
" Arkansas | \n",
" POPESTIMATE2000 | \n",
" 2678588 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" NAME Year Population\n",
"0 United States POPESTIMATE2000 282162411\n",
"1 Alabama POPESTIMATE2000 4452173\n",
"2 Alaska POPESTIMATE2000 627963\n",
"3 Arizona POPESTIMATE2000 5160586\n",
"4 Arkansas POPESTIMATE2000 2678588"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop1_tidy.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:36:04.134970Z",
"start_time": "2017-10-30T16:36:04.126266Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"def map_year(x):\n",
" 'Return last 4 characters (the year)'\n",
" year = x[-4:]\n",
" return int(year)\n",
"\n",
"pop1_tidy['Year'] = pop1_tidy['Year'].map(map_year)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The values shown below are ever slightly different than those listed in the later dataset."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:36:10.619297Z",
"start_time": "2017-10-30T16:36:10.606318Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" NAME | \n",
" Year | \n",
" Population | \n",
"
\n",
" \n",
" \n",
" \n",
" 520 | \n",
" United States | \n",
" 2010 | \n",
" 309349689 | \n",
"
\n",
" \n",
" 521 | \n",
" Alabama | \n",
" 2010 | \n",
" 4785298 | \n",
"
\n",
" \n",
" 522 | \n",
" Alaska | \n",
" 2010 | \n",
" 713985 | \n",
"
\n",
" \n",
" 523 | \n",
" Arizona | \n",
" 2010 | \n",
" 6413737 | \n",
"
\n",
" \n",
" 524 | \n",
" Arkansas | \n",
" 2010 | \n",
" 2921606 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" NAME Year Population\n",
"520 United States 2010 309349689\n",
"521 Alabama 2010 4785298\n",
"522 Alaska 2010 713985\n",
"523 Arizona 2010 6413737\n",
"524 Arkansas 2010 2921606"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop1_tidy.loc[pop1_tidy['Year'] == 2010].head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:36:29.041098Z",
"start_time": "2017-10-30T16:36:29.031243Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" NAME | \n",
" Year | \n",
" Population | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" United States | \n",
" 2000 | \n",
" 282162411 | \n",
"
\n",
" \n",
" 1 | \n",
" Alabama | \n",
" 2000 | \n",
" 4452173 | \n",
"
\n",
" \n",
" 2 | \n",
" Alaska | \n",
" 2000 | \n",
" 627963 | \n",
"
\n",
" \n",
" 3 | \n",
" Arizona | \n",
" 2000 | \n",
" 5160586 | \n",
"
\n",
" \n",
" 4 | \n",
" Arkansas | \n",
" 2000 | \n",
" 2678588 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" NAME Year Population\n",
"0 United States 2000 282162411\n",
"1 Alabama 2000 4452173\n",
"2 Alaska 2000 627963\n",
"3 Arizona 2000 5160586\n",
"4 Arkansas 2000 2678588"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop1_tidy.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:36:29.457716Z",
"start_time": "2017-10-30T16:36:29.447824Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" NAME | \n",
" Year | \n",
" Population | \n",
"
\n",
" \n",
" \n",
" \n",
" 567 | \n",
" Virginia | \n",
" 2010 | \n",
" 8024617 | \n",
"
\n",
" \n",
" 568 | \n",
" Washington | \n",
" 2010 | \n",
" 6744496 | \n",
"
\n",
" \n",
" 569 | \n",
" West Virginia | \n",
" 2010 | \n",
" 1853973 | \n",
"
\n",
" \n",
" 570 | \n",
" Wisconsin | \n",
" 2010 | \n",
" 5691047 | \n",
"
\n",
" \n",
" 571 | \n",
" Wyoming | \n",
" 2010 | \n",
" 564460 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" NAME Year Population\n",
"567 Virginia 2010 8024617\n",
"568 Washington 2010 6744496\n",
"569 West Virginia 2010 1853973\n",
"570 Wisconsin 2010 5691047\n",
"571 Wyoming 2010 564460"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop1_tidy.tail()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:36:29.850270Z",
"start_time": "2017-10-30T16:36:29.846689Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"pop1_tidy.columns = ['State', 'Year', 'Population']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import 2010-2016 data\n",
"https://www.census.gov/data/tables/2016/demo/popest/state-total.html"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:36:43.458638Z",
"start_time": "2017-10-30T16:36:43.408276Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"path = os.path.join(data_directory, 'Population data', 'nst-est2016-01.xlsx')\n",
"pop2 = pd.read_excel(path, header=3, parse_cols='A, D:J', skip_footer=7)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:36:45.940900Z",
"start_time": "2017-10-30T16:36:45.928511Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 2010 | \n",
" 2011 | \n",
" 2012 | \n",
" 2013 | \n",
" 2014 | \n",
" 2015 | \n",
" 2016 | \n",
"
\n",
" \n",
" \n",
" \n",
" United States | \n",
" 309348193 | \n",
" 311663358 | \n",
" 313998379 | \n",
" 316204908 | \n",
" 318563456 | \n",
" 320896618 | \n",
" 323127513 | \n",
"
\n",
" \n",
" Northeast | \n",
" 55388056 | \n",
" 55632766 | \n",
" 55829059 | \n",
" 55988771 | \n",
" 56116791 | \n",
" 56184737 | \n",
" 56209510 | \n",
"
\n",
" \n",
" Midwest | \n",
" 66978602 | \n",
" 67153331 | \n",
" 67332320 | \n",
" 67543948 | \n",
" 67726368 | \n",
" 67838387 | \n",
" 67941429 | \n",
"
\n",
" \n",
" South | \n",
" 114863114 | \n",
" 116061801 | \n",
" 117299171 | \n",
" 118424320 | \n",
" 119696311 | \n",
" 121039206 | \n",
" 122319574 | \n",
"
\n",
" \n",
" West | \n",
" 72118421 | \n",
" 72815460 | \n",
" 73537829 | \n",
" 74247869 | \n",
" 75023986 | \n",
" 75834288 | \n",
" 76657000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 2010 2011 2012 2013 2014 \\\n",
"United States 309348193 311663358 313998379 316204908 318563456 \n",
"Northeast 55388056 55632766 55829059 55988771 56116791 \n",
"Midwest 66978602 67153331 67332320 67543948 67726368 \n",
"South 114863114 116061801 117299171 118424320 119696311 \n",
"West 72118421 72815460 73537829 74247869 75023986 \n",
"\n",
" 2015 2016 \n",
"United States 320896618 323127513 \n",
"Northeast 56184737 56209510 \n",
"Midwest 67838387 67941429 \n",
"South 121039206 122319574 \n",
"West 75834288 76657000 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop2.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:36:55.381981Z",
"start_time": "2017-10-30T16:36:55.369945Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 2010 | \n",
" 2011 | \n",
" 2012 | \n",
" 2013 | \n",
" 2014 | \n",
" 2015 | \n",
" 2016 | \n",
"
\n",
" \n",
" \n",
" \n",
" .Virginia | \n",
" 8025773 | \n",
" 8110035 | \n",
" 8192048 | \n",
" 8262692 | \n",
" 8317372 | \n",
" 8367587 | \n",
" 8411808 | \n",
"
\n",
" \n",
" .Washington | \n",
" 6743226 | \n",
" 6822520 | \n",
" 6895226 | \n",
" 6968006 | \n",
" 7054196 | \n",
" 7160290 | \n",
" 7288000 | \n",
"
\n",
" \n",
" .West Virginia | \n",
" 1854230 | \n",
" 1854972 | \n",
" 1856560 | \n",
" 1853231 | \n",
" 1848514 | \n",
" 1841053 | \n",
" 1831102 | \n",
"
\n",
" \n",
" .Wisconsin | \n",
" 5690263 | \n",
" 5709640 | \n",
" 5726177 | \n",
" 5742854 | \n",
" 5758377 | \n",
" 5767891 | \n",
" 5778708 | \n",
"
\n",
" \n",
" .Wyoming | \n",
" 564513 | \n",
" 567725 | \n",
" 576765 | \n",
" 582684 | \n",
" 583642 | \n",
" 586555 | \n",
" 585501 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 2010 2011 2012 2013 2014 2015 2016\n",
".Virginia 8025773 8110035 8192048 8262692 8317372 8367587 8411808\n",
".Washington 6743226 6822520 6895226 6968006 7054196 7160290 7288000\n",
".West Virginia 1854230 1854972 1856560 1853231 1848514 1841053 1831102\n",
".Wisconsin 5690263 5709640 5726177 5742854 5758377 5767891 5778708\n",
".Wyoming 564513 567725 576765 582684 583642 586555 585501"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop2.tail()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:37:04.068511Z",
"start_time": "2017-10-30T16:37:04.063956Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"drop_rows = ['Northeast', 'Midwest', 'South', 'West']\n",
"pop2.drop(drop_rows, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:37:41.861566Z",
"start_time": "2017-10-30T16:37:41.857935Z"
}
},
"outputs": [],
"source": [
"pop2.index = pop2.index.str.strip('.')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:37:42.857641Z",
"start_time": "2017-10-30T16:37:42.844396Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 2010 | \n",
" 2011 | \n",
" 2012 | \n",
" 2013 | \n",
" 2014 | \n",
" 2015 | \n",
" 2016 | \n",
"
\n",
" \n",
" \n",
" \n",
" United States | \n",
" 309348193 | \n",
" 311663358 | \n",
" 313998379 | \n",
" 316204908 | \n",
" 318563456 | \n",
" 320896618 | \n",
" 323127513 | \n",
"
\n",
" \n",
" Alabama | \n",
" 4785492 | \n",
" 4799918 | \n",
" 4815960 | \n",
" 4829479 | \n",
" 4843214 | \n",
" 4853875 | \n",
" 4863300 | \n",
"
\n",
" \n",
" Alaska | \n",
" 714031 | \n",
" 722713 | \n",
" 731089 | \n",
" 736879 | \n",
" 736705 | \n",
" 737709 | \n",
" 741894 | \n",
"
\n",
" \n",
" Arizona | \n",
" 6408312 | \n",
" 6467163 | \n",
" 6549634 | \n",
" 6624617 | \n",
" 6719993 | \n",
" 6817565 | \n",
" 6931071 | \n",
"
\n",
" \n",
" Arkansas | \n",
" 2921995 | \n",
" 2939493 | \n",
" 2950685 | \n",
" 2958663 | \n",
" 2966912 | \n",
" 2977853 | \n",
" 2988248 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 2010 2011 2012 2013 2014 \\\n",
"United States 309348193 311663358 313998379 316204908 318563456 \n",
"Alabama 4785492 4799918 4815960 4829479 4843214 \n",
"Alaska 714031 722713 731089 736879 736705 \n",
"Arizona 6408312 6467163 6549634 6624617 6719993 \n",
"Arkansas 2921995 2939493 2950685 2958663 2966912 \n",
"\n",
" 2015 2016 \n",
"United States 320896618 323127513 \n",
"Alabama 4853875 4863300 \n",
"Alaska 737709 741894 \n",
"Arizona 6817565 6931071 \n",
"Arkansas 2977853 2988248 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop2.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:37:49.158398Z",
"start_time": "2017-10-30T16:37:49.152225Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"Int64Index([2010, 2011, 2012, 2013, 2014, 2015, 2016], dtype='int64')"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop2.columns"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:37:54.388262Z",
"start_time": "2017-10-30T16:37:54.377205Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"pop2_tidy = pd.melt(pop2.reset_index(), id_vars='index',\n",
" value_vars=range(2010, 2017), value_name='Population',\n",
" var_name='Year')\n",
"pop2_tidy.columns = ['State', 'Year', 'Population']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Combine data"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:37:58.078817Z",
"start_time": "2017-10-30T16:37:58.073653Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"pop_total = pd.concat([pop1_tidy, pop2_tidy])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The overlapping 2010 values are different, but just barely. I'm going to re-combine the datasets and keep values from the second dataset."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:38:17.265430Z",
"start_time": "2017-10-30T16:38:17.239274Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" State | \n",
" Year | \n",
" Population | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" Alabama | \n",
" 2010 | \n",
" 4785492 | \n",
"
\n",
" \n",
" 521 | \n",
" Alabama | \n",
" 2010 | \n",
" 4785298 | \n",
"
\n",
" \n",
" 522 | \n",
" Alaska | \n",
" 2010 | \n",
" 713985 | \n",
"
\n",
" \n",
" 2 | \n",
" Alaska | \n",
" 2010 | \n",
" 714031 | \n",
"
\n",
" \n",
" 523 | \n",
" Arizona | \n",
" 2010 | \n",
" 6413737 | \n",
"
\n",
" \n",
" 3 | \n",
" Arizona | \n",
" 2010 | \n",
" 6408312 | \n",
"
\n",
" \n",
" 524 | \n",
" Arkansas | \n",
" 2010 | \n",
" 2921606 | \n",
"
\n",
" \n",
" 4 | \n",
" Arkansas | \n",
" 2010 | \n",
" 2921995 | \n",
"
\n",
" \n",
" 525 | \n",
" California | \n",
" 2010 | \n",
" 37349363 | \n",
"
\n",
" \n",
" 5 | \n",
" California | \n",
" 2010 | \n",
" 37332685 | \n",
"
\n",
" \n",
" 526 | \n",
" Colorado | \n",
" 2010 | \n",
" 5049071 | \n",
"
\n",
" \n",
" 6 | \n",
" Colorado | \n",
" 2010 | \n",
" 5048644 | \n",
"
\n",
" \n",
" 527 | \n",
" Connecticut | \n",
" 2010 | \n",
" 3577073 | \n",
"
\n",
" \n",
" 7 | \n",
" Connecticut | \n",
" 2010 | \n",
" 3579899 | \n",
"
\n",
" \n",
" 528 | \n",
" Delaware | \n",
" 2010 | \n",
" 899769 | \n",
"
\n",
" \n",
" 8 | \n",
" Delaware | \n",
" 2010 | \n",
" 899816 | \n",
"
\n",
" \n",
" 529 | \n",
" District of Columbia | \n",
" 2010 | \n",
" 604453 | \n",
"
\n",
" \n",
" 9 | \n",
" District of Columbia | \n",
" 2010 | \n",
" 605183 | \n",
"
\n",
" \n",
" 530 | \n",
" Florida | \n",
" 2010 | \n",
" 18843326 | \n",
"
\n",
" \n",
" 10 | \n",
" Florida | \n",
" 2010 | \n",
" 18849098 | \n",
"
\n",
" \n",
" 11 | \n",
" Georgia | \n",
" 2010 | \n",
" 9713521 | \n",
"
\n",
" \n",
" 531 | \n",
" Georgia | \n",
" 2010 | \n",
" 9712587 | \n",
"
\n",
" \n",
" 532 | \n",
" Hawaii | \n",
" 2010 | \n",
" 1363621 | \n",
"
\n",
" \n",
" 12 | \n",
" Hawaii | \n",
" 2010 | \n",
" 1363945 | \n",
"
\n",
" \n",
" 13 | \n",
" Idaho | \n",
" 2010 | \n",
" 1571010 | \n",
"
\n",
" \n",
" 533 | \n",
" Idaho | \n",
" 2010 | \n",
" 1571450 | \n",
"
\n",
" \n",
" 14 | \n",
" Illinois | \n",
" 2010 | \n",
" 12841578 | \n",
"
\n",
" \n",
" 534 | \n",
" Illinois | \n",
" 2010 | \n",
" 12843166 | \n",
"
\n",
" \n",
" 15 | \n",
" Indiana | \n",
" 2010 | \n",
" 6490528 | \n",
"
\n",
" \n",
" 535 | \n",
" Indiana | \n",
" 2010 | \n",
" 6490621 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 558 | \n",
" Oregon | \n",
" 2010 | \n",
" 3838957 | \n",
"
\n",
" \n",
" 38 | \n",
" Oregon | \n",
" 2010 | \n",
" 3838048 | \n",
"
\n",
" \n",
" 39 | \n",
" Pennsylvania | \n",
" 2010 | \n",
" 12712343 | \n",
"
\n",
" \n",
" 559 | \n",
" Pennsylvania | \n",
" 2010 | \n",
" 12709630 | \n",
"
\n",
" \n",
" 40 | \n",
" Rhode Island | \n",
" 2010 | \n",
" 1053337 | \n",
"
\n",
" \n",
" 560 | \n",
" Rhode Island | \n",
" 2010 | \n",
" 1052886 | \n",
"
\n",
" \n",
" 41 | \n",
" South Carolina | \n",
" 2010 | \n",
" 4635943 | \n",
"
\n",
" \n",
" 561 | \n",
" South Carolina | \n",
" 2010 | \n",
" 4636312 | \n",
"
\n",
" \n",
" 42 | \n",
" South Dakota | \n",
" 2010 | \n",
" 816325 | \n",
"
\n",
" \n",
" 562 | \n",
" South Dakota | \n",
" 2010 | \n",
" 816463 | \n",
"
\n",
" \n",
" 43 | \n",
" Tennessee | \n",
" 2010 | \n",
" 6356671 | \n",
"
\n",
" \n",
" 563 | \n",
" Tennessee | \n",
" 2010 | \n",
" 6356897 | \n",
"
\n",
" \n",
" 564 | \n",
" Texas | \n",
" 2010 | \n",
" 25257114 | \n",
"
\n",
" \n",
" 44 | \n",
" Texas | \n",
" 2010 | \n",
" 25244310 | \n",
"
\n",
" \n",
" 0 | \n",
" United States | \n",
" 2010 | \n",
" 309348193 | \n",
"
\n",
" \n",
" 520 | \n",
" United States | \n",
" 2010 | \n",
" 309349689 | \n",
"
\n",
" \n",
" 45 | \n",
" Utah | \n",
" 2010 | \n",
" 2775326 | \n",
"
\n",
" \n",
" 565 | \n",
" Utah | \n",
" 2010 | \n",
" 2776469 | \n",
"
\n",
" \n",
" 566 | \n",
" Vermont | \n",
" 2010 | \n",
" 625960 | \n",
"
\n",
" \n",
" 46 | \n",
" Vermont | \n",
" 2010 | \n",
" 625982 | \n",
"
\n",
" \n",
" 47 | \n",
" Virginia | \n",
" 2010 | \n",
" 8025773 | \n",
"
\n",
" \n",
" 567 | \n",
" Virginia | \n",
" 2010 | \n",
" 8024617 | \n",
"
\n",
" \n",
" 48 | \n",
" Washington | \n",
" 2010 | \n",
" 6743226 | \n",
"
\n",
" \n",
" 568 | \n",
" Washington | \n",
" 2010 | \n",
" 6744496 | \n",
"
\n",
" \n",
" 49 | \n",
" West Virginia | \n",
" 2010 | \n",
" 1854230 | \n",
"
\n",
" \n",
" 569 | \n",
" West Virginia | \n",
" 2010 | \n",
" 1853973 | \n",
"
\n",
" \n",
" 50 | \n",
" Wisconsin | \n",
" 2010 | \n",
" 5690263 | \n",
"
\n",
" \n",
" 570 | \n",
" Wisconsin | \n",
" 2010 | \n",
" 5691047 | \n",
"
\n",
" \n",
" 571 | \n",
" Wyoming | \n",
" 2010 | \n",
" 564460 | \n",
"
\n",
" \n",
" 51 | \n",
" Wyoming | \n",
" 2010 | \n",
" 564513 | \n",
"
\n",
" \n",
"
\n",
"
104 rows × 3 columns
\n",
"
"
],
"text/plain": [
" State Year Population\n",
"1 Alabama 2010 4785492\n",
"521 Alabama 2010 4785298\n",
"522 Alaska 2010 713985\n",
"2 Alaska 2010 714031\n",
"523 Arizona 2010 6413737\n",
"3 Arizona 2010 6408312\n",
"524 Arkansas 2010 2921606\n",
"4 Arkansas 2010 2921995\n",
"525 California 2010 37349363\n",
"5 California 2010 37332685\n",
"526 Colorado 2010 5049071\n",
"6 Colorado 2010 5048644\n",
"527 Connecticut 2010 3577073\n",
"7 Connecticut 2010 3579899\n",
"528 Delaware 2010 899769\n",
"8 Delaware 2010 899816\n",
"529 District of Columbia 2010 604453\n",
"9 District of Columbia 2010 605183\n",
"530 Florida 2010 18843326\n",
"10 Florida 2010 18849098\n",
"11 Georgia 2010 9713521\n",
"531 Georgia 2010 9712587\n",
"532 Hawaii 2010 1363621\n",
"12 Hawaii 2010 1363945\n",
"13 Idaho 2010 1571010\n",
"533 Idaho 2010 1571450\n",
"14 Illinois 2010 12841578\n",
"534 Illinois 2010 12843166\n",
"15 Indiana 2010 6490528\n",
"535 Indiana 2010 6490621\n",
".. ... ... ...\n",
"558 Oregon 2010 3838957\n",
"38 Oregon 2010 3838048\n",
"39 Pennsylvania 2010 12712343\n",
"559 Pennsylvania 2010 12709630\n",
"40 Rhode Island 2010 1053337\n",
"560 Rhode Island 2010 1052886\n",
"41 South Carolina 2010 4635943\n",
"561 South Carolina 2010 4636312\n",
"42 South Dakota 2010 816325\n",
"562 South Dakota 2010 816463\n",
"43 Tennessee 2010 6356671\n",
"563 Tennessee 2010 6356897\n",
"564 Texas 2010 25257114\n",
"44 Texas 2010 25244310\n",
"0 United States 2010 309348193\n",
"520 United States 2010 309349689\n",
"45 Utah 2010 2775326\n",
"565 Utah 2010 2776469\n",
"566 Vermont 2010 625960\n",
"46 Vermont 2010 625982\n",
"47 Virginia 2010 8025773\n",
"567 Virginia 2010 8024617\n",
"48 Washington 2010 6743226\n",
"568 Washington 2010 6744496\n",
"49 West Virginia 2010 1854230\n",
"569 West Virginia 2010 1853973\n",
"50 Wisconsin 2010 5690263\n",
"570 Wisconsin 2010 5691047\n",
"571 Wyoming 2010 564460\n",
"51 Wyoming 2010 564513\n",
"\n",
"[104 rows x 3 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop_total.loc[pop_total['Year']==2010].sort_values('State')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:38:23.689337Z",
"start_time": "2017-10-30T16:38:23.682891Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"pop_total = pd.concat([pop1_tidy.loc[pop1_tidy['Year'] < 2010], pop2_tidy])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:38:26.926979Z",
"start_time": "2017-10-30T16:38:26.915337Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" State | \n",
" Year | \n",
" Population | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" United States | \n",
" 2000 | \n",
" 282162411 | \n",
"
\n",
" \n",
" 1 | \n",
" Alabama | \n",
" 2000 | \n",
" 4452173 | \n",
"
\n",
" \n",
" 2 | \n",
" Alaska | \n",
" 2000 | \n",
" 627963 | \n",
"
\n",
" \n",
" 3 | \n",
" Arizona | \n",
" 2000 | \n",
" 5160586 | \n",
"
\n",
" \n",
" 4 | \n",
" Arkansas | \n",
" 2000 | \n",
" 2678588 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" State Year Population\n",
"0 United States 2000 282162411\n",
"1 Alabama 2000 4452173\n",
"2 Alaska 2000 627963\n",
"3 Arizona 2000 5160586\n",
"4 Arkansas 2000 2678588"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop_total.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"ExecuteTime": {
"end_time": "2017-10-30T16:38:27.256796Z",
"start_time": "2017-10-30T16:38:27.242016Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" State | \n",
" Year | \n",
" Population | \n",
"
\n",
" \n",
" \n",
" \n",
" 359 | \n",
" Virginia | \n",
" 2016 | \n",
" 8411808 | \n",
"
\n",
" \n",
" 360 | \n",
" Washington | \n",
" 2016 | \n",
" 7288000 | \n",
"
\n",
" \n",
" 361 | \n",
" West Virginia | \n",
" 2016 | \n",
" 1831102 | \n",
"
\n",
" \n",
" 362 | \n",
" Wisconsin | \n",
" 2016 | \n",
" 5778708 | \n",
"
\n",
" \n",
" 363 | \n",
" Wyoming | \n",
" 2016 | \n",
" 585501 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" State Year Population\n",
"359 Virginia 2016 8411808\n",
"360 Washington 2016 7288000\n",
"361 West Virginia 2016 1831102\n",
"362 Wisconsin 2016 5778708\n",
"363 Wyoming 2016 585501"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop_total.tail()"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"ExecuteTime": {
"end_time": "2017-08-03T17:56:17.320409Z",
"start_time": "2017-08-03T17:56:17.312010Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"path = os.path.join('Data storage', 'Derived data', 'State population.csv')\n",
"pop_total.to_csv(path, index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py36",
"language": "python",
"name": "py36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}