# **Outdated** -- use fbi_crime_data instead
#
# This notebook scrapes US crime data from http://www.disastercenter.com/crime/
# and merges it into a single CSV file.

# In[1]:

import re
import sys
from fnmatch import fnmatch

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, SoupStrainer

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 50)


# In[2]:

# Column labels applied to every scraped table, in the order the cells are read.
columns = 'Year Population Index Violent Property Murder Rape Robbery Assault Burglary Larceny Vehicle'.split()

# Seconds to wait on the remote server before a request is abandoned.
REQUEST_TIMEOUT = 30
# Number of leading full-width rows dropped from each table before parsing
# (presumably header rows -- confirm against the live pages).
SKIPPED_LEADING_ROWS = 2
# Ohio is a malformed table. Ignore it.
SKIPPED_STATES = ('oh',)


def _fetch_dom(url):
    """Download ``url`` and return it parsed with the html5lib parser.

    Raises ``requests.exceptions.HTTPError`` on a 4xx/5xx response and
    ``requests.exceptions.Timeout`` when the server does not answer within
    ``REQUEST_TIMEOUT`` seconds, instead of hanging or parsing an error page.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html5lib")


# In[3]:

# Gather the URLs for each state's page
dom = _fetch_dom('http://www.disastercenter.com/crime/vacrime.htm')
# href=True skips anchors that carry no href attribute; indexing those with
# a['href'] would raise KeyError.
urls = [a['href'] for a in dom.find_all('a', href=True)
        if fnmatch(a['href'], 'http://www.disastercenter.com/crime/*.htm')]
# Key each URL by the part of its file name before 'crim' ('vacrime.htm' -> 'va').
urls = {u.split('/')[-1].split('crim')[0]: u for u in urls}


# In[4]:

def number(element):
    """Parse the numeric text of one table cell into a float.

    Reads the first child of the cell's ``<font>`` tag, or of its ``<small>``
    tag when there is no ``<font>``, removes commas, non-breaking spaces,
    asterisks and apostrophes, and converts what is left with ``float``.

    Returns ``np.nan`` when the cell has neither tag, the tag is empty, or
    the remaining text is not a number.
    """
    try:
        holder = element.find('font') or element.find('small')
        raw = holder.contents[0]
        cleaned = raw.replace(',', '').replace(u'\xa0', '').replace('*', '').replace("'", '')
        return float(cleaned)
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only parsing failures map to NaN; KeyboardInterrupt and friends
        # are no longer swallowed.
        return np.nan


def crime_stats(url):
    """Scrape the crime table on one page into a DataFrame.

    The table used is the one following the first ``<center>`` element
    selected by the ``'100,000'`` text filter. Only rows with exactly one
    ``<td>`` per entry in ``columns`` are kept, and the first
    ``SKIPPED_LEADING_ROWS`` of those are dropped.

    Returns a DataFrame of floats (``np.nan`` for unparseable cells) whose
    columns are ``columns``.

    Raises ``ValueError`` when the marker element or the table is missing.
    """
    page = _fetch_dom(url)
    marker = page.find('center', text=re.compile('100,000'))
    if marker is None:
        raise ValueError("no <center> element matching '100,000' found at %s" % url)
    table = marker.find_next('table')
    if table is None:
        raise ValueError("no table follows the '100,000' marker at %s" % url)

    # List comprehensions rather than filter()/map(): on Python 3 those
    # return iterators, which can neither be sliced nor used as row data.
    full_rows = [tr for tr in table.find_all('tr')
                 if len(tr.find_all('td')) == len(columns)]
    data = [[number(td) for td in tr.find_all('td')]
            for tr in full_rows[SKIPPED_LEADING_ROWS:]]
    return pd.DataFrame(data, columns=columns)


states = {}
for state, url in urls.items():
    # Space-separated progress log; written before the skip check, so skipped
    # states still show up in the output.
    sys.stdout.write('%s ' % state)
    if state in SKIPPED_STATES:
        continue
    states[state] = crime_stats(url)


# In[5]:

# Tag every frame with its state key so rows stay identifiable once stacked.
for s, frame in states.items():
    frame['state'] = s

# Materialise the dict view as a list so concat receives a plain sequence.
data = pd.concat(list(states.values()), ignore_index=True)
data.to_csv('crime.csv', index=False)