{
 "metadata": {
  "name": "getting_data"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Getting data"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Getting and setting directory, equivalent to R getwd() and setwd() commands\n",
      "\n",
      "This is the more general Python way. In IPython we can use system commands directly prefixed by an exclamation mark (!)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import os\n",
      "\n",
      "os.getcwd()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 2,
       "text": [
        "'/Users/erriza/dataanalysis/week2'"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "os.chdir('..')\n",
      "os.getcwd()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 3,
       "text": [
        "'/Users/erriza/dataanalysis'"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "os.chdir('./week2/')\n",
      "os.getcwd()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "'/Users/erriza/dataanalysis/week2'"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Load CSV data. We'll use mostly `pandas`."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "\n",
      "fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD'\n",
      "\n",
      "# we can directly use read_csv to download the file\n",
      "# this is equivalent to R's combined download.file() and read.table() or read.csv() commands\n",
      "cameraData = pd.read_csv(fileUrl)\n",
      "\n",
      "# save data locally\n",
      "cameraData.to_csv('../data/cameras.csv', index=False)\n",
      "\n",
      "# for simplicity I'll use IPython tricks to list folder contents\n",
      "!ls ../data\n",
      "\n",
      "\n",
      "# get current date and time\n",
      "# this is equivalent to R date() command\n",
      "# note that I use IPython ! prefix to run my system's command\n",
      "dateDownloaded = !date\n",
      "print '\\nDate downloaded: ' + str(dateDownloaded)\n",
      "\n",
      "cameraData.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "camera.xls          face.rda            loansData.csv       samsungData.csv\r\n",
        "camera.xlsx         gaData.csv          movies.txt          samsungData.rda\r\n",
        "cameras.csv         gaData.rda          ravensData.csv      ss06pid.csv\r\n",
        "camerasModified.csv galton.csv          ravensData.rda      warpbreaks.csv\r\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Date downloaded: ['Mon Mar 18 21:29:11 CET 2013']\n"
       ]
      },
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>address</th>\n",
        "      <th>direction</th>\n",
        "      <th>street</th>\n",
        "      <th>crossStreet</th>\n",
        "      <th>intersection</th>\n",
        "      <th>Location 1</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>       S CATON AVE & BENSON AVE</td>\n",
        "      <td> N/B</td>\n",
        "      <td>   Caton Ave</td>\n",
        "      <td>   Benson Ave</td>\n",
        "      <td>     Caton Ave & Benson Ave</td>\n",
        "      <td> (39.2693779962, -76.6688185297)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>       S CATON AVE & BENSON AVE</td>\n",
        "      <td> S/B</td>\n",
        "      <td>   Caton Ave</td>\n",
        "      <td>   Benson Ave</td>\n",
        "      <td>     Caton Ave & Benson Ave</td>\n",
        "      <td> (39.2693157898, -76.6689698176)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> WILKENS AVE & PINE HEIGHTS AVE</td>\n",
        "      <td> E/B</td>\n",
        "      <td> Wilkens Ave</td>\n",
        "      <td> Pine Heights</td>\n",
        "      <td> Wilkens Ave & Pine Heights</td>\n",
        "      <td>  (39.2720252302, -76.676960806)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>        THE ALAMEDA & E 33RD ST</td>\n",
        "      <td> S/B</td>\n",
        "      <td> The Alameda</td>\n",
        "      <td>      33rd St</td>\n",
        "      <td>     The Alameda  & 33rd St</td>\n",
        "      <td> (39.3285013141, -76.5953545714)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>        E 33RD ST & THE ALAMEDA</td>\n",
        "      <td> E/B</td>\n",
        "      <td>      E 33rd</td>\n",
        "      <td>  The Alameda</td>\n",
        "      <td>      E 33rd  & The Alameda</td>\n",
        "      <td> (39.3283410623, -76.5953594625)</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 7,
       "text": [
        "                          address direction       street   crossStreet  \\\n",
        "0        S CATON AVE & BENSON AVE       N/B    Caton Ave    Benson Ave   \n",
        "1        S CATON AVE & BENSON AVE       S/B    Caton Ave    Benson Ave   \n",
        "2  WILKENS AVE & PINE HEIGHTS AVE       E/B  Wilkens Ave  Pine Heights   \n",
        "3         THE ALAMEDA & E 33RD ST       S/B  The Alameda       33rd St   \n",
        "4         E 33RD ST & THE ALAMEDA       E/B       E 33rd   The Alameda   \n",
        "\n",
        "                 intersection                       Location 1  \n",
        "0      Caton Ave & Benson Ave  (39.2693779962, -76.6688185297)  \n",
        "1      Caton Ave & Benson Ave  (39.2693157898, -76.6689698176)  \n",
        "2  Wilkens Ave & Pine Heights   (39.2720252302, -76.676960806)  \n",
        "3      The Alameda  & 33rd St  (39.3285013141, -76.5953545714)  \n",
        "4       E 33rd  & The Alameda  (39.3283410623, -76.5953594625)  "
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<b>Read Excel file</b>\n",
      "\n",
      "This is equivalent to R `read.xlsx()` and `read.xlsx2()` commands.\n",
      "\n",
      "We need openpyxl 1.5.8 (don't use the latest version due to a bug) and xlrd packages. Install with: \n",
      "\n",
      "<pre><code>\n",
      "sudo pip install openpyxl==1.5.8\n",
      "sudo pip install xlrd\n",
      "</code></pre>\n",
      "\n",
      "Pandas `ExcelFile()` can't download and read at once (in contrast to `read_csv()`), so we need to resort to the basic Python way.\n",
      "Also notice I'm using .xls; .xlsx doesn't work in my computer."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import urllib2\n",
      "\n",
      "# download the file as camera.xls and save it in ./data subfolder\n",
      "fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xls?accessType=DOWNLOAD'\n",
      "f = urllib2.urlopen(fileUrl)\n",
      "data = f.read()\n",
      "with open('../data/camera.xls', 'wb') as w:\n",
      "    w.write(data)\n",
      "\n",
      "# load the Excel file as a pandas DataFrame\n",
      "cameraData = pd.ExcelFile('../data/camera.xls')\n",
      "cameraData = cameraData.parse('Baltimore Fixed Speed Cameras', index_col=None, na_values=['NA'])\n",
      "cameraData.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>address</th>\n",
        "      <th>direction</th>\n",
        "      <th>street</th>\n",
        "      <th>crossStreet</th>\n",
        "      <th>intersection</th>\n",
        "      <th>Location 1</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>       S CATON AVE & BENSON AVE</td>\n",
        "      <td> N/B</td>\n",
        "      <td>   Caton Ave</td>\n",
        "      <td>   Benson Ave</td>\n",
        "      <td>     Caton Ave & Benson Ave</td>\n",
        "      <td> (39.2693779962, -76.6688185297)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>       S CATON AVE & BENSON AVE</td>\n",
        "      <td> S/B</td>\n",
        "      <td>   Caton Ave</td>\n",
        "      <td>   Benson Ave</td>\n",
        "      <td>     Caton Ave & Benson Ave</td>\n",
        "      <td> (39.2693157898, -76.6689698176)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> WILKENS AVE & PINE HEIGHTS AVE</td>\n",
        "      <td> E/B</td>\n",
        "      <td> Wilkens Ave</td>\n",
        "      <td> Pine Heights</td>\n",
        "      <td> Wilkens Ave & Pine Heights</td>\n",
        "      <td>  (39.2720252302, -76.676960806)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>        THE ALAMEDA & E 33RD ST</td>\n",
        "      <td> S/B</td>\n",
        "      <td> The Alameda</td>\n",
        "      <td>      33rd St</td>\n",
        "      <td>     The Alameda  & 33rd St</td>\n",
        "      <td> (39.3285013141, -76.5953545714)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>        E 33RD ST & THE ALAMEDA</td>\n",
        "      <td> E/B</td>\n",
        "      <td>      E 33rd</td>\n",
        "      <td>  The Alameda</td>\n",
        "      <td>      E 33rd  & The Alameda</td>\n",
        "      <td> (39.3283410623, -76.5953594625)</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "                          address direction       street   crossStreet  \\\n",
        "0        S CATON AVE & BENSON AVE       N/B    Caton Ave    Benson Ave   \n",
        "1        S CATON AVE & BENSON AVE       S/B    Caton Ave    Benson Ave   \n",
        "2  WILKENS AVE & PINE HEIGHTS AVE       E/B  Wilkens Ave  Pine Heights   \n",
        "3         THE ALAMEDA & E 33RD ST       S/B  The Alameda       33rd St   \n",
        "4         E 33RD ST & THE ALAMEDA       E/B       E 33rd   The Alameda   \n",
        "\n",
        "                 intersection                       Location 1  \n",
        "0      Caton Ave & Benson Ave  (39.2693779962, -76.6688185297)  \n",
        "1      Caton Ave & Benson Ave  (39.2693157898, -76.6689698176)  \n",
        "2  Wilkens Ave & Pine Heights   (39.2720252302, -76.676960806)  \n",
        "3      The Alameda  & 33rd St  (39.3285013141, -76.5953545714)  \n",
        "4       E 33rd  & The Alameda  (39.3283410623, -76.5953594625)  "
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The course video describes R's `readLines()` for reading a text file, which is similar to standard Python file access, so I'm not going to detail it here.\n",
      "\n",
      "Similarly, R's `readLines()` to read data from a website is similar to Python with urllib2 package as in xls example above`"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<br><b>Read JSON file</b>\n",
      "\n",
      "This is equivalent to R's `fromJSON()` command."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import json\n",
      "\n",
      "# first we get the json file from the website\n",
      "fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.json?accessType=DOWNLOAD'\n",
      "req = urllib2.Request(fileUrl)\n",
      "opener = urllib2.build_opener()\n",
      "f = opener.open(req)\n",
      "\n",
      "# then we read it into a data structure\n",
      "jsonCamera = json.loads(f.read())\n",
      "\n",
      "# json is loadad as dictionary\n",
      "print jsonCamera['meta']['view']['id']\n",
      "print jsonCamera['meta']['view']['name']\n",
      "print jsonCamera['meta']['view']['attribution']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "dz54-2aru\n",
        "Baltimore Fixed Speed Cameras\n",
        "Department of Transportation\n"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<br><b>Writing data</b>"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# first read the csv file\n",
      "cameraData = pd.read_csv('../data/cameras.csv')\n",
      "\n",
      "# take a subset of the columns\n",
      "tmpData = cameraData.ix[:,2:]\n",
      "\n",
      "# then save it to a different csv file\n",
      "# this is equivalent to R's write.table() command\n",
      "tmpData.to_csv('../data/camerasModified.csv', sep=',', index=False)\n",
      "\n",
      "cameraData2 = pd.read_csv('../data/camerasModified.csv')\n",
      "cameraData2.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>street</th>\n",
        "      <th>crossStreet</th>\n",
        "      <th>intersection</th>\n",
        "      <th>Location 1</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>   Caton Ave</td>\n",
        "      <td>   Benson Ave</td>\n",
        "      <td>     Caton Ave & Benson Ave</td>\n",
        "      <td> (39.2693779962, -76.6688185297)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>   Caton Ave</td>\n",
        "      <td>   Benson Ave</td>\n",
        "      <td>     Caton Ave & Benson Ave</td>\n",
        "      <td> (39.2693157898, -76.6689698176)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> Wilkens Ave</td>\n",
        "      <td> Pine Heights</td>\n",
        "      <td> Wilkens Ave & Pine Heights</td>\n",
        "      <td>  (39.2720252302, -76.676960806)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> The Alameda</td>\n",
        "      <td>      33rd St</td>\n",
        "      <td>     The Alameda  & 33rd St</td>\n",
        "      <td> (39.3285013141, -76.5953545714)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>      E 33rd</td>\n",
        "      <td>  The Alameda</td>\n",
        "      <td>      E 33rd  & The Alameda</td>\n",
        "      <td> (39.3283410623, -76.5953594625)</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 10,
       "text": [
        "        street   crossStreet                intersection  \\\n",
        "0    Caton Ave    Benson Ave      Caton Ave & Benson Ave   \n",
        "1    Caton Ave    Benson Ave      Caton Ave & Benson Ave   \n",
        "2  Wilkens Ave  Pine Heights  Wilkens Ave & Pine Heights   \n",
        "3  The Alameda       33rd St      The Alameda  & 33rd St   \n",
        "4       E 33rd   The Alameda       E 33rd  & The Alameda   \n",
        "\n",
        "                        Location 1  \n",
        "0  (39.2693779962, -76.6688185297)  \n",
        "1  (39.2693157898, -76.6689698176)  \n",
        "2   (39.2720252302, -76.676960806)  \n",
        "3  (39.3285013141, -76.5953545714)  \n",
        "4  (39.3283410623, -76.5953594625)  "
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The course video explains R commands to save and load the workspace. I don't think we have the equivalent for that."
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<br>The course video explains R's `paste()` and `paste0()` commands, which look like standard Python's string manipulations:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print ['../data' + str(i) + '.csv' for i in range(1, 6)]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "['../data1.csv', '../data2.csv', '../data3.csv', '../data4.csv', '../data5.csv']\n"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<br><b>Getting data off webpages</b>"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from lxml.html import parse\n",
      "\n",
      "url = 'http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en'\n",
      "\n",
      "# this is equivalent to the combined R's opening/reading/closing connection and htmlTreeParse() commands\n",
      "html3 = parse(url).getroot()\n",
      "\n",
      "# get the title text using xpath expression\n",
      "# this is equivalent to R xpathSApply() command\n",
      "title = html3.xpath('//title')\n",
      "print [x.text_content() for x in title]\n",
      "\n",
      "# get the texts of col-citedby elements using xpath expression\n",
      "citedby = html3.xpath(\"//td[@id='col-citedby']\")\n",
      "print [x.text_content() for x in citedby]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "['Jeff Leek - Google Scholar Citations']\n",
        "['Cited by', '344', '183', '147', '143', '111', '96', '87', '80', '59', '18', '11', '10', '10', '8', '8', '8', '7', '6', '5', '3']\n"
       ]
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}