{ "cells": [ { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# The usual preamble\n", "%matplotlib inline\n", "\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "\n", "# Make the graphs a bit prettier, and bigger\n", "pd.set_option('display.mpl_style', 'default')\n", "plt.rcParams['figure.figsize'] = (15, 5)\n", "plt.rcParams['font.family'] = 'sans-serif'\n", "\n", "# This is necessary to show lots of columns in pandas 0.12. \n", "# Not necessary in pandas 0.13.\n", "pd.set_option('display.width', 5000) \n", "pd.set_option('display.max_columns', 60)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "One of the main problems with messy data is: how do you know if it's messy or not?\n", "\n", "We're going to use the NYC 311 service request dataset again here, since it's big and a bit unwieldy." ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [], "source": [ "requests = pd.read_csv('../data/311-service-requests.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 7.1 How do we know if it's messy? " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We're going to look at a few columns here. I know already that there are some problems with the zip code, so let's look at that first.\n", " \n", "To get a sense for whether a column has problems, I usually use `.unique()` to look at all its values. If it's a numeric column, I'll instead plot a histogram to get a sense of the distribution.\n", "\n", "When we look at the unique values in \"Incident Zip\", it quickly becomes clear that this is a mess.\n", "\n", "Some of the problems:\n", "\n", "* Some have been parsed as strings, and some as floats\n", "* There are `nan`s \n", "* Some of the zip codes are `29616-0759` or `83`\n", "* There are some N/A values that pandas didn't recognize, like 'N/A' and 'NO CLUE'\n", "\n", "What we can do:\n", "\n", "* Normalize 'N/A' and 'NO CLUE' into regular nan values\n", "* Look at what's up with the 83, and decide what to do\n", "* Make everything strings" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([11432.0, 11378.0, 10032.0, 10023.0, 10027.0, 11372.0, 11419.0,\n", " 11417.0, 10011.0, 11225.0, 11218.0, 10003.0, 10029.0, 10466.0,\n", " 11219.0, 10025.0, 10310.0, 11236.0, nan, 10033.0, 11216.0, 10016.0,\n", " 10305.0, 10312.0, 10026.0, 10309.0, 10036.0, 11433.0, 11235.0,\n", " 11213.0, 11379.0, 11101.0, 10014.0, 11231.0, 11234.0, 10457.0,\n", " 10459.0, 10465.0, 11207.0, 10002.0, 10034.0, 11233.0, 10453.0,\n", " 10456.0, 10469.0, 11374.0, 11221.0, 11421.0, 11215.0, 10007.0,\n", " 10019.0, 11205.0, 11418.0, 11369.0, 11249.0, 10005.0, 10009.0,\n", " 11211.0, 11412.0, 10458.0, 11229.0, 10065.0, 10030.0, 11222.0,\n", " 10024.0, 10013.0, 11420.0, 11365.0, 10012.0, 11214.0, 11212.0,\n", " 10022.0, 11232.0, 11040.0, 11226.0, 10281.0, 11102.0, 11208.0,\n", " 10001.0, 10472.0, 11414.0, 11223.0, 10040.0, 11220.0, 11373.0,\n", " 11203.0, 11691.0, 11356.0, 10017.0, 10452.0, 10280.0, 11217.0,\n", " 10031.0, 11201.0, 11358.0, 10128.0, 11423.0, 10039.0, 10010.0,\n", " 11209.0, 10021.0, 10037.0, 11413.0, 11375.0, 11238.0, 10473.0,\n", " 11103.0, 11354.0, 11361.0, 11106.0, 11385.0, 10463.0, 10467.0,\n", " 11204.0, 11237.0, 11377.0, 11364.0, 11434.0, 11435.0, 11210.0,\n", " 11228.0, 11368.0, 11694.0, 10464.0, 11415.0, 10314.0, 10301.0,\n", " 10018.0, 10038.0, 11105.0, 11230.0, 10468.0, 11104.0, 10471.0,\n", " 11416.0, 10075.0, 11422.0, 11355.0, 10028.0, 10462.0, 10306.0,\n", " 10461.0, 11224.0, 11429.0, 10035.0, 11366.0, 11362.0, 11206.0,\n", " 10460.0, 10304.0, 11360.0, 11411.0, 10455.0, 10475.0, 10069.0,\n", " 10303.0, 10308.0, 10302.0, 11357.0, 10470.0, 11367.0, 11370.0,\n", " 10454.0, 10451.0, 11436.0, 11426.0, 10153.0, 11004.0, 11428.0,\n", " 11427.0, 11001.0, 11363.0, 10004.0, 10474.0, 11430.0, 10000.0,\n", " 10307.0, 11239.0, 10119.0, 10006.0, 10048.0, 11697.0, 11692.0,\n", " 11693.0, 10573.0, 83.0, 11559.0, 10020.0, 77056.0, 11776.0, 70711.0,\n", " 10282.0, 11109.0, 10044.0, '10452', '11233', '10468', '10310',\n", " '11105', '10462', '10029', '10301', '10457', '10467', '10469',\n", " '11225', '10035', '10031', '11226', '10454', '11221', '10025',\n", " '11229', '11235', '11422', '10472', '11208', '11102', '10032',\n", " '11216', '10473', '10463', '11213', '10040', '10302', '11231',\n", " '10470', '11204', '11104', '11212', '10466', '11416', '11214',\n", " '10009', '11692', '11385', '11423', '11201', '10024', '11435',\n", " '10312', '10030', '11106', '10033', '10303', '11215', '11222',\n", " '11354', '10016', '10034', '11420', '10304', '10019', '11237',\n", " '11249', '11230', '11372', '11207', '11378', '11419', '11361',\n", " '10011', '11357', '10012', '11358', '10003', '10002', '11374',\n", " '10007', '11234', '10065', '11369', '11434', '11205', '11206',\n", " '11415', '11236', '11218', '11413', '10458', '11101', '10306',\n", " '11355', '10023', '11368', '10314', '11421', '10010', '10018',\n", " '11223', '10455', '11377', '11433', '11375', '10037', '11209',\n", " '10459', '10128', '10014', '10282', '11373', '10451', '11238',\n", " '11211', '10038', '11694', '11203', '11691', '11232', '10305',\n", " '10021', '11228', '10036', '10001', '10017', '11217', '11219',\n", " '10308', '10465', '11379', '11414', '10460', '11417', '11220',\n", " '11366', '10027', '11370', '10309', '11412', '11356', '10456',\n", " '11432', '10022', '10013', '11367', '11040', '10026', '10475',\n", " '11210', '11364', '11426', '10471', '10119', '11224', '11418',\n", " '11429', '11365', '10461', '11239', '10039', '00083', '11411',\n", " '10075', '11004', '11360', '10453', '10028', '11430', '10307',\n", " '11103', '10004', '10069', '10005', '10474', '11428', '11436',\n", " '10020', '11001', '11362', '11693', '10464', '11427', '10044',\n", " '11363', '10006', '10000', '02061', '77092-2016', '10280', '11109',\n", " '14225', '55164-0737', '19711', '07306', '000000', 'NO CLUE',\n", " '90010', '10281', '11747', '23541', '11776', '11697', '11788',\n", " '07604', 10112.0, 11788.0, 11563.0, 11580.0, 7087.0, 11042.0,\n", " 7093.0, 11501.0, 92123.0, 0.0, 11575.0, 7109.0, 11797.0, '10803',\n", " '11716', '11722', '11549-3650', '10162', '92123', '23502', '11518',\n", " '07020', '08807', '11577', '07114', '11003', '07201', '11563',\n", " '61702', '10103', '29616-0759', '35209-3114', '11520', '11735',\n", " '10129', '11005', '41042', '11590', 6901.0, 7208.0, 11530.0,\n", " 13221.0, 10954.0, 11735.0, 10103.0, 7114.0, 11111.0, 10107.0], dtype=object)" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "requests['Incident Zip'].unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 7.3 Fixing the nan values and string/float confusion" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can pass a `na_values` option to `pd.read_csv` to clean this up a little bit. We can also specify that the type of Incident Zip is a string, not a float." ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [], "source": [ "na_values = ['NO CLUE', 'N/A', '0']\n", "requests = pd.read_csv('../data/311-service-requests.csv', na_values=na_values, dtype={'Incident Zip': str})" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',\n", " '11417', '10011', '11225', '11218', '10003', '10029', '10466',\n", " '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',\n", " '10305', '10312', '10026', '10309', '10036', '11433', '11235',\n", " '11213', '11379', '11101', '10014', '11231', '11234', '10457',\n", " '10459', '10465', '11207', '10002', '10034', '11233', '10453',\n", " '10456', '10469', '11374', '11221', '11421', '11215', '10007',\n", " '10019', '11205', '11418', '11369', '11249', '10005', '10009',\n", " '11211', '11412', '10458', '11229', '10065', '10030', '11222',\n", " '10024', '10013', '11420', '11365', '10012', '11214', '11212',\n", " '10022', '11232', '11040', '11226', '10281', '11102', '11208',\n", " '10001', '10472', '11414', '11223', '10040', '11220', '11373',\n", " '11203', '11691', '11356', '10017', '10452', '10280', '11217',\n", " '10031', '11201', '11358', '10128', '11423', '10039', '10010',\n", " '11209', '10021', '10037', '11413', '11375', '11238', '10473',\n", " '11103', '11354', '11361', '11106', '11385', '10463', '10467',\n", " '11204', '11237', '11377', '11364', '11434', '11435', '11210',\n", " '11228', '11368', '11694', '10464', '11415', '10314', '10301',\n", " '10018', '10038', '11105', '11230', '10468', '11104', '10471',\n", " '11416', '10075', '11422', '11355', '10028', '10462', '10306',\n", " '10461', '11224', '11429', '10035', '11366', '11362', '11206',\n", " '10460', '10304', '11360', '11411', '10455', '10475', '10069',\n", " '10303', '10308', '10302', '11357', '10470', '11367', '11370',\n", " '10454', '10451', '11436', '11426', '10153', '11004', '11428',\n", " '11427', '11001', '11363', '10004', '10474', '11430', '10000',\n", " '10307', '11239', '10119', '10006', '10048', '11697', '11692',\n", " '11693', '10573', '00083', '11559', '10020', '77056', '11776',\n", " '70711', '10282', '11109', '10044', '02061', '77092-2016', '14225',\n", " '55164-0737', '19711', '07306', '000000', '90010', '11747', '23541',\n", " '11788', '07604', '10112', '11563', '11580', '07087', '11042',\n", " '07093', '11501', '92123', '00000', '11575', '07109', '11797',\n", " '10803', '11716', '11722', '11549-3650', '10162', '23502', '11518',\n", " '07020', '08807', '11577', '07114', '11003', '07201', '61702',\n", " '10103', '29616-0759', '35209-3114', '11520', '11735', '10129',\n", " '11005', '41042', '11590', '06901', '07208', '11530', '13221',\n", " '10954', '11111', '10107'], dtype=object)" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "requests['Incident Zip'].unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 7.4 What's up with the dashes?" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "5" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)\n", "len(requests[rows_with_dashes])" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unique KeyCreated DateClosed DateAgencyAgency NameComplaint TypeDescriptorLocation TypeIncident ZipIncident AddressStreet NameCross Street 1Cross Street 2Intersection Street 1Intersection Street 2Address TypeCityLandmarkFacility TypeStatusDue DateResolution Action Updated DateCommunity BoardBoroughX Coordinate (State Plane)Y Coordinate (State Plane)Park Facility NamePark BoroughSchool NameSchool NumberSchool RegionSchool CodeSchool Phone NumberSchool AddressSchool CitySchool StateSchool ZipSchool Not FoundSchool or Citywide ComplaintVehicle TypeTaxi Company BoroughTaxi Pick Up LocationBridge Highway NameBridge Highway DirectionRoad RampBridge Highway SegmentGarage Lot NameFerry DirectionFerry Terminal NameLatitudeLongitudeLocation
29136 26550551 10/24/2013 06:16:34 PM NaN DCA Department of Consumer Affairs Consumer Complaint False Advertising NaN 77092-2016 2700 EAST SELTICE WAY EAST SELTICE WAY NaN NaN NaN NaN NaN HOUSTON NaN NaN Assigned 11/13/2013 11:15:20 AM 10/29/2013 11:16:16 AM 0 Unspecified UnspecifiedNaNNaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified NNaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaNNaNNaN NaN
30939 26548831 10/24/2013 09:35:10 AM NaN DCA Department of Consumer Affairs Consumer Complaint Harassment NaN 55164-0737 P.O. BOX 64437 64437 NaN NaN NaN NaN NaN ST. PAUL NaN NaN Assigned 11/13/2013 02:30:21 PM 10/29/2013 02:31:06 PM 0 Unspecified UnspecifiedNaNNaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified NNaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaNNaNNaN NaN
70539 26488417 10/15/2013 03:40:33 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint Street 11549-3650 365 HOFSTRA UNIVERSITY HOFSTRA UNIVERSITY NaN NaN NaN NaN NaN HEMSTEAD NaN NaN Assigned 11/30/2013 01:20:33 PM 10/16/2013 01:21:39 PM 0 Unspecified UnspecifiedNaNNaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified NNaN NaN NaN La Guardia Airport NaN NaN NaN NaN NaN NaN NaNNaNNaN NaN
85821 26468296 10/10/2013 12:36:43 PM 10/26/2013 01:07:07 AM DCA Department of Consumer Affairs Consumer Complaint Debt Not Owed NaN 29616-0759 PO BOX 25759 BOX 25759 NaN NaN NaN NaN NaN GREENVILLE NaN NaN Closed 10/26/2013 09:20:28 AM 10/26/2013 01:07:07 AM 0 Unspecified UnspecifiedNaNNaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified NNaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaNNaNNaN NaN
89304 26461137 10/09/2013 05:23:46 PM 10/25/2013 01:06:41 AM DCA Department of Consumer Affairs Consumer Complaint Harassment NaN 35209-3114 600 BEACON PKWY BEACON PKWY NaN NaN NaN NaN NaN BIRMINGHAM NaN NaN Closed 10/25/2013 02:43:42 PM 10/25/2013 01:06:41 AM 0 Unspecified UnspecifiedNaNNaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified NNaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaNNaNNaN NaN
\n", "
" ], "text/plain": [ " Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address Street Name Cross Street 1 Cross Street 2 Intersection Street 1 Intersection Street 2 Address Type City Landmark Facility Type Status Due Date Resolution Action Updated Date Community Board Borough X Coordinate (State Plane) Y Coordinate (State Plane) Park Facility Name Park Borough School Name School Number School Region School Code School Phone Number School Address School City School State School Zip School Not Found School or Citywide Complaint Vehicle Type Taxi Company Borough Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location\n", "29136 26550551 10/24/2013 06:16:34 PM NaN DCA Department of Consumer Affairs Consumer Complaint False Advertising NaN 77092-2016 2700 EAST SELTICE WAY EAST SELTICE WAY NaN NaN NaN NaN NaN HOUSTON NaN NaN Assigned 11/13/2013 11:15:20 AM 10/29/2013 11:16:16 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "30939 26548831 10/24/2013 09:35:10 AM NaN DCA Department of Consumer Affairs Consumer Complaint Harassment NaN 55164-0737 P.O. BOX 64437 64437 NaN NaN NaN NaN NaN ST. PAUL NaN NaN Assigned 11/13/2013 02:30:21 PM 10/29/2013 02:31:06 PM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "70539 26488417 10/15/2013 03:40:33 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint Street 11549-3650 365 HOFSTRA UNIVERSITY HOFSTRA UNIVERSITY NaN NaN NaN NaN NaN HEMSTEAD NaN NaN Assigned 11/30/2013 01:20:33 PM 10/16/2013 01:21:39 PM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN La Guardia Airport NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "85821 26468296 10/10/2013 12:36:43 PM 10/26/2013 01:07:07 AM DCA Department of Consumer Affairs Consumer Complaint Debt Not Owed NaN 29616-0759 PO BOX 25759 BOX 25759 NaN NaN NaN NaN NaN GREENVILLE NaN NaN Closed 10/26/2013 09:20:28 AM 10/26/2013 01:07:07 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "89304 26461137 10/09/2013 05:23:46 PM 10/25/2013 01:06:41 AM DCA Department of Consumer Affairs Consumer Complaint Harassment NaN 35209-3114 600 BEACON PKWY BEACON PKWY NaN NaN NaN NaN NaN BIRMINGHAM NaN NaN Closed 10/25/2013 02:43:42 PM 10/25/2013 01:06:41 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "requests[rows_with_dashes]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I thought these were missing data and originally deleted them like this:\n", "\n", "`requests['Incident Zip'][rows_with_dashes] = np.nan`\n", "\n", "But then my friend Dave pointed out that 9-digit zip codes are normal. Let's look at all the zip codes with more than 5 digits, make sure they're okay, and then truncate them." ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['77092-2016', '55164-0737', '000000', '11549-3650', '29616-0759',\n", " '35209-3114'], dtype=object)" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "long_zip_codes = requests['Incident Zip'].str.len() > 5\n", "requests['Incident Zip'][long_zip_codes].unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Those all look okay to truncate to me." ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [], "source": [ "requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Done." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Earlier I thought 00083 was a broken zip code, but turns out Central Park's zip code 00083! Shows what I know. I'm still concerned about the 00000 zip codes, though: let's look at that. " ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unique KeyCreated DateClosed DateAgencyAgency NameComplaint TypeDescriptorLocation TypeIncident ZipIncident AddressStreet NameCross Street 1Cross Street 2Intersection Street 1Intersection Street 2Address TypeCityLandmarkFacility TypeStatusDue DateResolution Action Updated DateCommunity BoardBoroughX Coordinate (State Plane)Y Coordinate (State Plane)Park Facility NamePark BoroughSchool NameSchool NumberSchool RegionSchool CodeSchool Phone NumberSchool AddressSchool CitySchool StateSchool ZipSchool Not FoundSchool or Citywide ComplaintVehicle TypeTaxi Company BoroughTaxi Pick Up LocationBridge Highway NameBridge Highway DirectionRoad RampBridge Highway SegmentGarage Lot NameFerry DirectionFerry Terminal NameLatitudeLongitudeLocation
42600 26529313 10/22/2013 02:51:06 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint NaN 00000 EWR EWR EWR NaN NaN NaN NaN NaN NEWARK NaN NaN Assigned 12/07/2013 09:53:51 AM 10/23/2013 09:54:43 AM 0 Unspecified UnspecifiedNaNNaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified NNaN NaN NaN Other NaN NaN NaN NaN NaN NaN NaNNaNNaN NaN
60843 26507389 10/17/2013 05:48:44 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint Street 00000 1 NEWARK AIRPORT NEWARK AIRPORT NaN NaN NaN NaN NaN NEWARK NaN NaN Assigned 12/02/2013 11:59:46 AM 10/18/2013 12:01:08 PM 0 Unspecified UnspecifiedNaNNaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified NNaN NaN NaN Other NaN NaN NaN NaN NaN NaN NaNNaNNaN NaN
\n", "
" ], "text/plain": [ " Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address Street Name Cross Street 1 Cross Street 2 Intersection Street 1 Intersection Street 2 Address Type City Landmark Facility Type Status Due Date Resolution Action Updated Date Community Board Borough X Coordinate (State Plane) Y Coordinate (State Plane) Park Facility Name Park Borough School Name School Number School Region School Code School Phone Number School Address School City School State School Zip School Not Found School or Citywide Complaint Vehicle Type Taxi Company Borough Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location\n", "42600 26529313 10/22/2013 02:51:06 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint NaN 00000 EWR EWR EWR NaN NaN NaN NaN NaN NEWARK NaN NaN Assigned 12/07/2013 09:53:51 AM 10/23/2013 09:54:43 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN Other NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "60843 26507389 10/17/2013 05:48:44 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint Street 00000 1 NEWARK AIRPORT NEWARK AIRPORT NaN NaN NaN NaN NaN NEWARK NaN NaN Assigned 12/02/2013 11:59:46 AM 10/18/2013 12:01:08 PM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN Other NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "requests[requests['Incident Zip'] == '00000']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This looks bad to me. Let's set these to nan." ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [], "source": [ "zero_zips = requests['Incident Zip'] == '00000'\n", "requests.loc[zero_zips, 'Incident Zip'] = np.nan" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Great. Let's see where we are now:" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([nan, '00083', '02061', '06901', '07020', '07087', '07093', '07109',\n", " '07114', '07201', '07208', '07306', '07604', '08807', '10000',\n", " '10001', '10002', '10003', '10004', '10005', '10006', '10007',\n", " '10009', '10010', '10011', '10012', '10013', '10014', '10016',\n", " '10017', '10018', '10019', '10020', '10021', '10022', '10023',\n", " '10024', '10025', '10026', '10027', '10028', '10029', '10030',\n", " '10031', '10032', '10033', '10034', '10035', '10036', '10037',\n", " '10038', '10039', '10040', '10044', '10048', '10065', '10069',\n", " '10075', '10103', '10107', '10112', '10119', '10128', '10129',\n", " '10153', '10162', '10280', '10281', '10282', '10301', '10302',\n", " '10303', '10304', '10305', '10306', '10307', '10308', '10309',\n", " '10310', '10312', '10314', '10451', '10452', '10453', '10454',\n", " '10455', '10456', '10457', '10458', '10459', '10460', '10461',\n", " '10462', '10463', '10464', '10465', '10466', '10467', '10468',\n", " '10469', '10470', '10471', '10472', '10473', '10474', '10475',\n", " '10573', '10803', '10954', '11001', '11003', '11004', '11005',\n", " '11040', '11042', '11101', '11102', '11103', '11104', '11105',\n", " '11106', '11109', '11111', '11201', '11203', '11204', '11205',\n", " '11206', '11207', '11208', '11209', '11210', '11211', '11212',\n", " '11213', '11214', '11215', '11216', '11217', '11218', '11219',\n", " '11220', '11221', '11222', '11223', '11224', '11225', '11226',\n", " '11228', '11229', '11230', '11231', '11232', '11233', '11234',\n", " '11235', '11236', '11237', '11238', '11239', '11249', '11354',\n", " '11355', '11356', '11357', '11358', '11360', '11361', '11362',\n", " '11363', '11364', '11365', '11366', '11367', '11368', '11369',\n", " '11370', '11372', '11373', '11374', '11375', '11377', '11378',\n", " '11379', '11385', '11411', '11412', '11413', '11414', '11415',\n", " '11416', '11417', '11418', '11419', '11420', '11421', '11422',\n", " '11423', '11426', '11427', '11428', '11429', '11430', '11432',\n", " '11433', '11434', '11435', '11436', '11501', '11518', '11520',\n", " '11530', '11549', '11559', '11563', '11575', '11577', '11580',\n", " '11590', '11691', '11692', '11693', '11694', '11697', '11716',\n", " '11722', '11735', '11747', '11776', '11788', '11797', '13221',\n", " '14225', '19711', '23502', '23541', '29616', '35209', '41042',\n", " '55164', '61702', '70711', '77056', '77092', '90010', '92123'], dtype=object)" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unique_zips = requests['Incident Zip'].unique()\n", "unique_zips.sort()\n", "unique_zips" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Amazing! This is much cleaner. There's something a bit weird here, though -- I looked up 77056 on Google maps, and that's in Texas.\n", "\n", "Let's take a closer look:" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [], "source": [ "zips = requests['Incident Zip']\n", "# Let's say the zips starting with '0' and '1' are okay, for now. (this isn't actually true -- 13221 is in Syracuse, and why?)\n", "is_close = zips.str.startswith('0') | zips.str.startswith('1')\n", "# There are a bunch of NaNs, but we're not interested in them right now, so we'll say they're False\n", "is_far = ~(is_close) & zips.notnull()" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "12102 77056\n", "13450 70711\n", "29136 77092\n", "30939 55164\n", "44008 90010\n", "47048 23541\n", "57636 92123\n", "71001 92123\n", "71834 23502\n", "80573 61702\n", "85821 29616\n", "89304 35209\n", "94201 41042\n", "Name: Incident Zip, dtype: object" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "zips[is_far]" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Incident ZipDescriptorCity
71834 23502 Harassment NORFOLK
47048 23541 Harassment NORFOLK
85821 29616 Debt Not Owed GREENVILLE
89304 35209 Harassment BIRMINGHAM
94201 41042 Harassment FLORENCE
30939 55164 Harassment ST. PAUL
80573 61702 Billing Dispute BLOOMIGTON
13450 70711 Contract Dispute CLIFTON
12102 77056 Debt Not Owed HOUSTON
29136 77092 False Advertising HOUSTON
44008 90010 Billing Dispute LOS ANGELES
57636 92123 Harassment SAN DIEGO
71001 92123 Billing Dispute SAN DIEGO
\n", "
" ], "text/plain": [ " Incident Zip Descriptor City\n", "71834 23502 Harassment NORFOLK\n", "47048 23541 Harassment NORFOLK\n", "85821 29616 Debt Not Owed GREENVILLE\n", "89304 35209 Harassment BIRMINGHAM\n", "94201 41042 Harassment FLORENCE\n", "30939 55164 Harassment ST. PAUL\n", "80573 61702 Billing Dispute BLOOMIGTON\n", "13450 70711 Contract Dispute CLIFTON\n", "12102 77056 Debt Not Owed HOUSTON\n", "29136 77092 False Advertising HOUSTON\n", "44008 90010 Billing Dispute LOS ANGELES\n", "57636 92123 Harassment SAN DIEGO\n", "71001 92123 Billing Dispute SAN DIEGO" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "requests[is_far][['Incident Zip', 'Descriptor', 'City']].sort('Incident Zip')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Okay, there really are requests coming from LA and Houston! Good to know. Filtering by zip code is probably a bad way to handle this -- we should really be looking at the city instead." ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "BROOKLYN 31662\n", "NEW YORK 22664\n", "BRONX 18438\n", "STATEN ISLAND 4766\n", "JAMAICA 2246\n", "FLUSHING 1803\n", "ASTORIA 1568\n", "RIDGEWOOD 1073\n", "CORONA 707\n", "OZONE PARK 693\n", "LONG ISLAND CITY 678\n", "FAR ROCKAWAY 652\n", "ELMHURST 647\n", "WOODSIDE 609\n", "EAST ELMHURST 562\n", "...\n", "MELVILLE 1\n", "PORT JEFFERSON STATION 1\n", "NORWELL 1\n", "EAST ROCKAWAY 1\n", "BIRMINGHAM 1\n", "ROSLYN 1\n", "LOS ANGELES 1\n", "MINEOLA 1\n", "JERSEY CITY 1\n", "ST. PAUL 1\n", "CLIFTON 1\n", "COL.ANVURES 1\n", "EDGEWATER 1\n", "ROSELYN 1\n", "CENTRAL ISLIP 1\n", "Length: 100, dtype: int64" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "requests['City'].str.upper().value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It looks like these are legitimate complaints, so we'll just leave them alone." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 7.5 Putting it together" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here's what we ended up doing to clean up our zip codes, all together:" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [], "source": [ "na_values = ['NO CLUE', 'N/A', '0']\n", "requests = pd.read_csv('../data/311-service-requests.csv', \n", " na_values=na_values, \n", " dtype={'Incident Zip': str})" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def fix_zip_codes(zips):\n", " # Truncate everything to length 5 \n", " zips = zips.str.slice(0, 5)\n", " \n", " # Set 00000 zip codes to nan\n", " zero_zips = zips == '00000'\n", " zips[zero_zips] = np.nan\n", " \n", " return zips" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [], "source": [ "requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',\n", " '11417', '10011', '11225', '11218', '10003', '10029', '10466',\n", " '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',\n", " '10305', '10312', '10026', '10309', '10036', '11433', '11235',\n", " '11213', '11379', '11101', '10014', '11231', '11234', '10457',\n", " '10459', '10465', '11207', '10002', '10034', '11233', '10453',\n", " '10456', '10469', '11374', '11221', '11421', '11215', '10007',\n", " '10019', '11205', '11418', '11369', '11249', '10005', '10009',\n", " '11211', '11412', '10458', '11229', '10065', '10030', '11222',\n", " '10024', '10013', '11420', '11365', '10012', '11214', '11212',\n", " '10022', '11232', '11040', '11226', '10281', '11102', '11208',\n", " '10001', '10472', '11414', '11223', '10040', '11220', '11373',\n", " '11203', '11691', '11356', '10017', '10452', '10280', '11217',\n", " '10031', '11201', '11358', '10128', '11423', '10039', '10010',\n", " '11209', '10021', '10037', '11413', '11375', '11238', '10473',\n", " '11103', '11354', '11361', '11106', '11385', '10463', '10467',\n", " '11204', '11237', '11377', '11364', '11434', '11435', '11210',\n", " '11228', '11368', '11694', '10464', '11415', '10314', '10301',\n", " '10018', '10038', '11105', '11230', '10468', '11104', '10471',\n", " '11416', '10075', '11422', '11355', '10028', '10462', '10306',\n", " '10461', '11224', '11429', '10035', '11366', '11362', '11206',\n", " '10460', '10304', '11360', '11411', '10455', '10475', '10069',\n", " '10303', '10308', '10302', '11357', '10470', '11367', '11370',\n", " '10454', '10451', '11436', '11426', '10153', '11004', '11428',\n", " '11427', '11001', '11363', '10004', '10474', '11430', '10000',\n", " '10307', '11239', '10119', '10006', '10048', '11697', '11692',\n", " '11693', '10573', '00083', '11559', '10020', '77056', '11776',\n", " '70711', '10282', '11109', '10044', '02061', '77092', '14225',\n", " '55164', '19711', '07306', '90010', '11747', '23541', '11788',\n", " '07604', '10112', '11563', '11580', '07087', '11042', '07093',\n", " '11501', '92123', '11575', '07109', '11797', '10803', '11716',\n", " '11722', '11549', '10162', '23502', '11518', '07020', '08807',\n", " '11577', '07114', '11003', '07201', '61702', '10103', '29616',\n", " '35209', '11520', '11735', '10129', '11005', '41042', '11590',\n", " '06901', '07208', '11530', '13221', '10954', '11111', '10107'], dtype=object)" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "requests['Incident Zip'].unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "