{
 "metadata": {
  "name": "",
  "signature": "sha256:57f90e77d787adaeb091f0b719d74b7e1052bdc5a045b437d96b7816b72b9150"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "**Outdated** -- use fbi_crime_data instead\n",
      "\n",
      "This notebook scrapes US crime data from http://www.disastercenter.com/crime/ and merges it into a single CSV file."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import re\n",
      "import sys\n",
      "from fnmatch import fnmatch\n",
      "\n",
      "import numpy as np\n",
      "import pandas as pd\n",
      "import requests\n",
      "from bs4 import BeautifulSoup, SoupStrainer\n",
      "\n",
      "pd.set_option('display.width', 500)\n",
      "pd.set_option('display.max_columns', 50)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "columns = 'Year Population Index Violent Property Murder Rape Robbery Assault Burglary Larceny Vehicle'.split()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Gather the URLs for each state's page"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# The vacrime.htm page is used as the index: every link on it that matches\n",
      "# STATE_PAGE_PATTERN is treated as a per-state page.\n",
      "INDEX_URL = 'http://www.disastercenter.com/crime/vacrime.htm'\n",
      "STATE_PAGE_PATTERN = 'http://www.disastercenter.com/crime/*.htm'\n",
      "\n",
      "# A timeout keeps the cell from hanging forever on an unresponsive server, and\n",
      "# raise_for_status() fails here instead of silently scraping an error page.\n",
      "response = requests.get(INDEX_URL, timeout=30)\n",
      "response.raise_for_status()\n",
      "dom = BeautifulSoup(response.text, \"html5lib\")\n",
      "\n",
      "# href=True skips <a> tags that have no href attribute; a['href'] would raise\n",
      "# KeyError on those.\n",
      "state_links = [a['href'] for a in dom.find_all('a', href=True)\n",
      "               if fnmatch(a['href'], STATE_PAGE_PATTERN)]\n",
      "\n",
      "# Key each URL by the part of its file name before 'crim',\n",
      "# e.g. '.../vacrime.htm' -> 'va'.\n",
      "urls = {link.split('/')[-1].split('crim')[0]: link for link in state_links}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def number(element):\n",
      "    try:\n",
      "        node = 'font' if element.find('font') else 'small'\n",
      "        d = element.find(node).contents[0]\n",
      "        d = d.replace(',', '').replace(u'\\xa0', '').replace('*', '').replace(\"'\",'')\n",
      "        return float(d)\n",
      "    except:\n",
      "        return np.nan\n",
      "\n",
      "def crime_stats(url, timeout=30):\n",
      "    \"\"\"Scrape the crime table from one state page into a DataFrame.\n",
      "\n",
      "    Downloads ``url``, finds the first <table> after the <center> element\n",
      "    whose text matches '100,000', keeps the rows that have exactly 12 <td>\n",
      "    cells, skips the first two of those rows (presumably headers), and\n",
      "    parses every remaining cell with ``number``.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    url : str\n",
      "        Address of the page to scrape.\n",
      "    timeout : float, optional\n",
      "        Seconds to wait for the server before giving up (default 30).\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    pandas.DataFrame\n",
      "        One row per kept table row, labelled with the module-level\n",
      "        ``columns``; cells that could not be parsed are NaN.\n",
      "\n",
      "    Raises\n",
      "    ------\n",
      "    requests.HTTPError\n",
      "        If the server answers with an error status.\n",
      "    \"\"\"\n",
      "    response = requests.get(url, timeout=timeout)\n",
      "    response.raise_for_status()\n",
      "    dom = BeautifulSoup(response.text, \"html5lib\")\n",
      "    table = dom.find('center', text=re.compile('100,000')).find_next('table')\n",
      "    # Build real lists: on Python 3, filter() and map() return lazy iterators\n",
      "    # that cannot be sliced and would end up as opaque objects in the frame.\n",
      "    rows = [tr for tr in table.find_all('tr') if len(tr.find_all('td')) == 12]\n",
      "    data = [[number(td) for td in tr.find_all('td')] for tr in rows[2:]]\n",
      "    return pd.DataFrame(data, columns=columns)\n",
      "\n",
      "SKIP_STATES = ['oh']  # Ohio is a malformed table. Ignore it.\n",
      "\n",
      "# Scrape every state page into its own DataFrame, keyed like ``urls``.\n",
      "states = {}\n",
      "for state, url in urls.items():\n",
      "    # Progress indicator on a single line. sys.stdout.write runs on both\n",
      "    # Python 2 and 3, unlike the Python 2-only `print state,` statement.\n",
      "    sys.stdout.write(state + ' ')\n",
      "    if state in SKIP_STATES:\n",
      "        continue\n",
      "    states[state] = crime_stats(url)\n",
      "    "
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "va "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "co "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ca "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "al "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ar "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "vt "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "il "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ga "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "in "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ia "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "az "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "id "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ct "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "nh "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "nj "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "nm "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "tx "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "la "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "nc "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "nd "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ne "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "tn "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ny "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "pa "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ak "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "nv "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "wa "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "de "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "dc "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "wi "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "wv "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "hi "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ok "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "fl "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "wy "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "me "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "md "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ma "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "oh ut "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "mo "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "mn "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "mi "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "kn "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "us "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ri "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "mt "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ms "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "sc "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ky "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "or "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "sd\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Tag each per-state frame with its key so rows stay identifiable once merged.\n",
      "for abbrev, frame in states.items():\n",
      "    frame['state'] = abbrev\n",
      "\n",
      "# Stack all frames into one table with a fresh index and write it to disk.\n",
      "data = pd.concat(list(states.values()), ignore_index=True)\n",
      "data.to_csv('crime.csv', index=False)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    }
   ],
   "metadata": {}
  }
 ]
}