{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compile State Usage Data\n",
    "Loops through a list of all states an pulls USGS water use data for each state into a single table. \n",
    "\n",
    "#### Required packages: \n",
    "* pandas\n",
    "* us \n",
    "---\n",
    "`Created September 2017` <br> `John.Fay@duke.edu`\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set year and the output filename"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Set the year\n",
    "year = 2010"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Create the output csv file\n",
    "outFilename = \"../Data/AllStatesUsage{}.csv\".format(year)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### The rest is automated..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### First, we need to import required packages, installing 3rd party packages if required"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Import built-in modules\n",
    "import os, urllib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Import pandas, install if needed\n",
    "try:\n",
    "    import pandas as pd\n",
    "except:\n",
    "    import pip\n",
    "    pip.main(['install','pandas'])\n",
    "    import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Import us, install if needed\n",
    "try:\n",
    "    import us\n",
    "except:\n",
    "    import pip\n",
    "    pip.main(['install','us'])\n",
    "    import us"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Define the function that will pull the data, reshape it, and create a 'tidy' data frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def getData(state_abbr,year):\n",
    "    '''Downloads USGS data and creates a tidy dataframe of all the usage data for the state'''\n",
    "    \n",
    "    #Set the data URL path and parameters and construct the url\n",
    "    path = 'https://waterdata.usgs.gov/{}/nwis/water_use?'.format(state_abbr)\n",
    "    values = {'format':'rdb',\n",
    "             'rdb_compression':'value',\n",
    "             'wu_area':'County',\n",
    "             'wu_year': year,\n",
    "             'wu_county':'ALL',\n",
    "             'wu_county_nms':'--ALL+Counties--',\n",
    "             'wu_category_nms':'--ALL+Categories--'\n",
    "            }\n",
    "    url = path + urllib.urlencode(values)\n",
    "    \n",
    "    #Pull data in using the URL and remove the 2nd row of headers\n",
    "    dfRaw = pd.read_table(url,comment='#',header=[0,1],na_values='-')\n",
    "    dfRaw.columns = dfRaw.columns.droplevel(level=1)\n",
    "\n",
    "    #Tidy the data: transform so data in each usage column become row values with a new column listing the usage type\n",
    "    rowHeadings = ['county_cd', 'county_nm', 'state_cd', 'state_name', 'year']\n",
    "    dfTidy = pd.melt(dfRaw,id_vars=rowHeadings,value_name='MGal',var_name='Group')\n",
    "\n",
    "    #Remove rows that don't have volume data (i.e. keep only columns with 'Mgal' in the name)\n",
    "    dfTidy = dfTidy[dfTidy['Group'].str.contains('Mgal')].copy(deep=True)\n",
    "\n",
    "    #Change the type of the MGal column to float \n",
    "    dfTidy['MGal'] = dfTidy.MGal.astype('float')\n",
    "    \n",
    "    #Summarize county data for the whole state\n",
    "    stateSummary = dfTidy.groupby(['Group'])['MGal'].sum()\n",
    "    \n",
    "    #Convert to a dataframe\n",
    "    dfState = pd.DataFrame(stateSummary)\n",
    "    \n",
    "    #Rename the MGal column to the state abbreviation\n",
    "    dfState.columns = [state_abbr]\n",
    "    \n",
    "    #Return the dataframe\n",
    "    return dfState\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Import data for the first state into a tidy data frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing AL, \n"
     ]
    }
   ],
   "source": [
    "#Get the data for the first state\n",
    "abbr = us.states.STATES_CONTINENTAL[0].abbr\n",
    "print \"Processing {}, \".format(abbr),\n",
    "dfAll = getData(abbr,year)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Loop through the remaining states and merge them to the data frame created above"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AK AZ AR CA CO CT DE DC FL GA ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY\n"
     ]
    }
   ],
   "source": [
    "#Loop through the remaining states\n",
    "for state in us.states.STATES_CONTINENTAL[1:]:\n",
    "    abbr = state.abbr\n",
    "    print abbr,\n",
    "    dfState = getData(abbr,year)\n",
    "    dfAll = pd.merge(dfAll,dfState,how='inner',left_index='Group',right_index='Group')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Write the output to a csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dfAll.to_csv(outFilename)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}