{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Compile State Usage Data\n", "Loops through a list of all states an pulls USGS water use data for each state into a single table. \n", "\n", "#### Required packages: \n", "* pandas\n", "* us \n", "---\n", "`Created September 2017`
`John.Fay@duke.edu`\n", "\n", "---" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set year and the output filename" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Set the year\n", "year = 2010" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Create the output csv file\n", "outFilename = \"../Data/AllStatesUsage{}.csv\".format(year)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The rest is automated..." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### First, we need to import required packages, installing 3rd party packages if required" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "#Import built-in modules\n", "import os, urllib" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Import pandas, install if needed\n", "try:\n", " import pandas as pd\n", "except:\n", " import pip\n", " pip.main(['install','pandas'])\n", " import pandas as pd" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Import us, install if needed\n", "try:\n", " import us\n", "except:\n", " import pip\n", " pip.main(['install','us'])\n", " import us" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Define the function that will pull the data, reshape it, and create a 'tidy' data frame" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def getData(state_abbr,year):\n", " '''Downloads USGS data and creates a tidy dataframe of all the usage data for the state'''\n", " \n", " #Set the data URL path and parameters and construct the url\n", " path = 'https://waterdata.usgs.gov/{}/nwis/water_use?'.format(state_abbr)\n", " values = {'format':'rdb',\n", " 'rdb_compression':'value',\n", " 'wu_area':'County',\n", " 'wu_year': year,\n", " 'wu_county':'ALL',\n", " 'wu_county_nms':'--ALL+Counties--',\n", " 'wu_category_nms':'--ALL+Categories--'\n", " }\n", " url = path + urllib.urlencode(values)\n", " \n", " #Pull data in using the URL and remove the 2nd row of headers\n", " dfRaw = pd.read_table(url,comment='#',header=[0,1],na_values='-')\n", " dfRaw.columns = dfRaw.columns.droplevel(level=1)\n", "\n", " #Tidy the data: transform so data in each usage column become row values with a new column listing the usage type\n", " rowHeadings = ['county_cd', 'county_nm', 'state_cd', 'state_name', 'year']\n", " dfTidy = pd.melt(dfRaw,id_vars=rowHeadings,value_name='MGal',var_name='Group')\n", "\n", " #Remove rows that don't have volume data (i.e. keep only columns with 'Mgal' in the name)\n", " dfTidy = dfTidy[dfTidy['Group'].str.contains('Mgal')].copy(deep=True)\n", "\n", " #Change the type of the MGal column to float \n", " dfTidy['MGal'] = dfTidy.MGal.astype('float')\n", " \n", " #Summarize county data for the whole state\n", " stateSummary = dfTidy.groupby(['Group'])['MGal'].sum()\n", " \n", " #Convert to a dataframe\n", " dfState = pd.DataFrame(stateSummary)\n", " \n", " #Rename the MGal column to the state abbreviation\n", " dfState.columns = [state_abbr]\n", " \n", " #Return the dataframe\n", " return dfState\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Import data for the first state into a tidy data frame" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing AL, \n" ] } ], "source": [ "#Get the data for the first state\n", "abbr = us.states.STATES_CONTINENTAL[0].abbr\n", "print \"Processing {}, \".format(abbr),\n", "dfAll = getData(abbr,year)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Loop through the remaining states and merge them to the data frame created above" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AK AZ AR CA CO CT DE DC FL GA ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY\n" ] } ], "source": [ "#Loop through the remaining states\n", "for state in us.states.STATES_CONTINENTAL[1:]:\n", " abbr = state.abbr\n", " print abbr,\n", " dfState = getData(abbr,year)\n", " dfAll = pd.merge(dfAll,dfState,how='inner',left_index='Group',right_index='Group')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Write the output to a csv file" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dfAll.to_csv(outFilename)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } }, "nbformat": 4, "nbformat_minor": 2 }