{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Chapter 7 - Cleaning up messy data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n", "line_width has been deprecated, use display.width instead (currently both are\n", "identical)\n", "\n" ] } ], "source": [ "%pylab inline\n", "import pandas as pd\n", "\n", "# Make the graphs a bit prettier, and bigger\n", "pd.set_option('display.mpl_style', 'default')\n", "figsize(15, 5)\n", "\n", "# Always display all the columns\n", "pd.set_option('display.line_width', 5000) \n", "pd.set_option('display.max_columns', 60) \n", "\n", "import matplotlib\n", "matplotlib.style.use('ggplot')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7.1 How do we know if it's messy?" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] } ], "source": [ "requests = pd.read_csv('./data/311-service-requests.csv')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([11432.0, 11378.0, 10032.0, 10023.0, 10027.0, 11372.0, 11419.0,\n", " 11417.0, 10011.0, 11225.0, 11218.0, 10003.0, 10029.0, 10466.0,\n", " 11219.0, 10025.0, 10310.0, 11236.0, nan, 10033.0, 11216.0, 10016.0,\n", " 10305.0, 10312.0, 10026.0, 10309.0, 10036.0, 11433.0, 11235.0,\n", " 11213.0, 11379.0, 11101.0, 10014.0, 11231.0, 11234.0, 10457.0,\n", " 10459.0, 10465.0, 11207.0, 10002.0, 10034.0, 11233.0, 10453.0,\n", " 10456.0, 10469.0, 11374.0, 11221.0, 11421.0, 11215.0, 10007.0,\n", " 10019.0, 11205.0, 11418.0, 11369.0, 11249.0, 10005.0, 10009.0,\n", " 11211.0, 11412.0, 10458.0, 11229.0, 10065.0, 10030.0, 11222.0,\n", " 10024.0, 10013.0, 11420.0, 11365.0, 10012.0, 11214.0, 11212.0,\n", " 10022.0, 11232.0, 11040.0, 11226.0, 10281.0, 11102.0, 11208.0,\n", " 10001.0, 10472.0, 11414.0, 11223.0, 10040.0, 11220.0, 11373.0,\n", " 11203.0, 11691.0, 11356.0, 10017.0, 10452.0, 10280.0, 11217.0,\n", " 10031.0, 11201.0, 11358.0, 10128.0, 11423.0, 10039.0, 10010.0,\n", " 11209.0, 10021.0, 10037.0, 11413.0, 11375.0, 11238.0, 10473.0,\n", " 11103.0, 11354.0, 11361.0, 11106.0, 11385.0, 10463.0, 10467.0,\n", " 11204.0, 11237.0, 11377.0, 11364.0, 11434.0, 11435.0, 11210.0,\n", " 11228.0, 11368.0, 11694.0, 10464.0, 11415.0, 10314.0, 10301.0,\n", " 10018.0, 10038.0, 11105.0, 11230.0, 10468.0, 11104.0, 10471.0,\n", " 11416.0, 10075.0, 11422.0, 11355.0, 10028.0, 10462.0, 10306.0,\n", " 10461.0, 11224.0, 11429.0, 10035.0, 11366.0, 11362.0, 11206.0,\n", " 10460.0, 10304.0, 11360.0, 11411.0, 10455.0, 10475.0, 10069.0,\n", " 10303.0, 10308.0, 10302.0, 11357.0, 10470.0, 11367.0, 11370.0,\n", " 10454.0, 10451.0, 11436.0, 11426.0, 10153.0, 11004.0, 11428.0,\n", " 11427.0, 11001.0, 11363.0, 10004.0, 10474.0, 11430.0, 10000.0,\n", " 10307.0, 11239.0, 10119.0, 10006.0, 10048.0, 11697.0, 11692.0,\n", " 11693.0, 10573.0, 83.0, 11559.0, 10020.0, 77056.0, 11776.0, 70711.0,\n", " 10282.0, 11109.0, 10044.0, '10452', '11233', '10468', '10310',\n", " '11105', '10462', '10029', '10301', '10457', '10467', '10469',\n", " '11225', '10035', '10031', '11226', '10454', '11221', '10025',\n", " '11229', '11235', '11422', '10472', '11208', '11102', '10032',\n", " '11216', '10473', '10463', '11213', '10040', '10302', '11231',\n", " '10470', '11204', '11104', '11212', '10466', '11416', '11214',\n", " '10009', '11692', '11385', '11423', '11201', '10024', '11435',\n", " '10312', '10030', '11106', '10033', '10303', '11215', '11222',\n", " '11354', '10016', '10034', '11420', '10304', '10019', '11237',\n", " '11249', '11230', '11372', '11207', '11378', '11419', '11361',\n", " '10011', '11357', '10012', '11358', '10003', '10002', '11374',\n", " '10007', '11234', '10065', '11369', '11434', '11205', '11206',\n", " '11415', '11236', '11218', '11413', '10458', '11101', '10306',\n", " '11355', '10023', '11368', '10314', '11421', '10010', '10018',\n", " '11223', '10455', '11377', '11433', '11375', '10037', '11209',\n", " '10459', '10128', '10014', '10282', '11373', '10451', '11238',\n", " '11211', '10038', '11694', '11203', '11691', '11232', '10305',\n", " '10021', '11228', '10036', '10001', '10017', '11217', '11219',\n", " '10308', '10465', '11379', '11414', '10460', '11417', '11220',\n", " '11366', '10027', '11370', '10309', '11412', '11356', '10456',\n", " '11432', '10022', '10013', '11367', '11040', '10026', '10475',\n", " '11210', '11364', '11426', '10471', '10119', '11224', '11418',\n", " '11429', '11365', '10461', '11239', '10039', '00083', '11411',\n", " '10075', '11004', '11360', '10453', '10028', '11430', '10307',\n", " '11103', '10004', '10069', '10005', '10474', '11428', '11436',\n", " '10020', '11001', '11362', '11693', '10464', '11427', '10044',\n", " '11363', '10006', '10000', '02061', '77092-2016', '10280', '11109',\n", " '14225', '55164-0737', '19711', '07306', '000000', 'NO CLUE',\n", " '90010', '10281', '11747', '23541', '11776', '11697', '11788',\n", " '07604', 10112.0, 11788.0, 11563.0, 11580.0, 7087.0, 11042.0,\n", " 7093.0, 11501.0, 92123.0, 0.0, 11575.0, 7109.0, 11797.0, '10803',\n", " '11716', '11722', '11549-3650', '10162', '92123', '23502', '11518',\n", " '07020', '08807', '11577', '07114', '11003', '07201', '11563',\n", " '61702', '10103', '29616-0759', '35209-3114', '11520', '11735',\n", " '10129', '11005', '41042', '11590', 6901.0, 7208.0, 11530.0,\n", " 13221.0, 10954.0, 11735.0, 10103.0, 7114.0, 11111.0, 10107.0], dtype=object)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "requests['Incident Zip'].unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7.2 Fixing the nan values and string/float confusion" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "na_values = ['NO CLUE', 'N/A', '0']\n", "requests = pd.read_csv('./data/311-service-requests.csv', na_values=na_values, dtype={'Incident Zip': str})" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',\n", " '11417', '10011', '11225', '11218', '10003', '10029', '10466',\n", " '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',\n", " '10305', '10312', '10026', '10309', '10036', '11433', '11235',\n", " '11213', '11379', '11101', '10014', '11231', '11234', '10457',\n", " '10459', '10465', '11207', '10002', '10034', '11233', '10453',\n", " '10456', '10469', '11374', '11221', '11421', '11215', '10007',\n", " '10019', '11205', '11418', '11369', '11249', '10005', '10009',\n", " '11211', '11412', '10458', '11229', '10065', '10030', '11222',\n", " '10024', '10013', '11420', '11365', '10012', '11214', '11212',\n", " '10022', '11232', '11040', '11226', '10281', '11102', '11208',\n", " '10001', '10472', '11414', '11223', '10040', '11220', '11373',\n", " '11203', '11691', '11356', '10017', '10452', '10280', '11217',\n", " '10031', '11201', '11358', '10128', '11423', '10039', '10010',\n", " '11209', '10021', '10037', '11413', '11375', '11238', '10473',\n", " '11103', '11354', '11361', '11106', '11385', '10463', '10467',\n", " '11204', '11237', '11377', '11364', '11434', '11435', '11210',\n", " '11228', '11368', '11694', '10464', '11415', '10314', '10301',\n", " '10018', '10038', '11105', '11230', '10468', '11104', '10471',\n", " '11416', '10075', '11422', '11355', '10028', '10462', '10306',\n", " '10461', '11224', '11429', '10035', '11366', '11362', '11206',\n", " '10460', '10304', '11360', '11411', '10455', '10475', '10069',\n", " '10303', '10308', '10302', '11357', '10470', '11367', '11370',\n", " '10454', '10451', '11436', '11426', '10153', '11004', '11428',\n", " '11427', '11001', '11363', '10004', '10474', '11430', '10000',\n", " '10307', '11239', '10119', '10006', '10048', '11697', '11692',\n", " '11693', '10573', '00083', '11559', '10020', '77056', '11776',\n", " '70711', '10282', '11109', '10044', '02061', '77092-2016', '14225',\n", " '55164-0737', '19711', '07306', '000000', '90010', '11747', '23541',\n", " '11788', '07604', '10112', '11563', '11580', '07087', '11042',\n", " '07093', '11501', '92123', '00000', '11575', '07109', '11797',\n", " '10803', '11716', '11722', '11549-3650', '10162', '23502', '11518',\n", " '07020', '08807', '11577', '07114', '11003', '07201', '61702',\n", " '10103', '29616-0759', '35209-3114', '11520', '11735', '10129',\n", " '11005', '41042', '11590', '06901', '07208', '11530', '13221',\n", " '10954', '11111', '10107'], dtype=object)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "requests['Incident Zip'].unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7.3 What's up with the dashes?" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "5" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)\n", "len(requests[rows_with_dashes])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "long_zip_codes = requests['Incident Zip'].str.len() > 5" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['77092-2016', '55164-0737', '000000', '11549-3650', '29616-0759',\n", " '35209-3114'], dtype=object)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "requests['Incident Zip'][long_zip_codes].unique()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unique KeyCreated DateClosed DateAgencyAgency NameComplaint TypeDescriptorLocation TypeIncident ZipIncident AddressStreet NameCross Street 1Cross Street 2Intersection Street 1Intersection Street 2Address TypeCityLandmarkFacility TypeStatusDue DateResolution Action Updated DateCommunity BoardBoroughX Coordinate (State Plane)Y Coordinate (State Plane)Park Facility NamePark BoroughSchool NameSchool NumberSchool RegionSchool CodeSchool Phone NumberSchool AddressSchool CitySchool StateSchool ZipSchool Not FoundSchool or Citywide ComplaintVehicle TypeTaxi Company BoroughTaxi Pick Up LocationBridge Highway NameBridge Highway DirectionRoad RampBridge Highway SegmentGarage Lot NameFerry DirectionFerry Terminal NameLatitudeLongitudeLocation
426002652931310/22/2013 02:51:06 PMNaNTLCTaxi and Limousine CommissionTaxi ComplaintDriver ComplaintNaN00000EWR EWREWRNaNNaNNaNNaNNaNNEWARKNaNNaNAssigned12/07/2013 09:53:51 AM10/23/2013 09:54:43 AM0 UnspecifiedUnspecifiedNaNNaNUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedNNaNNaNNaNOtherNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
608432650738910/17/2013 05:48:44 PMNaNTLCTaxi and Limousine CommissionTaxi ComplaintDriver ComplaintStreet000001 NEWARK AIRPORTNEWARK AIRPORTNaNNaNNaNNaNNaNNEWARKNaNNaNAssigned12/02/2013 11:59:46 AM10/18/2013 12:01:08 PM0 UnspecifiedUnspecifiedNaNNaNUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedUnspecifiedNNaNNaNNaNOtherNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address Street Name Cross Street 1 Cross Street 2 Intersection Street 1 Intersection Street 2 Address Type City Landmark Facility Type Status Due Date Resolution Action Updated Date Community Board Borough X Coordinate (State Plane) Y Coordinate (State Plane) Park Facility Name Park Borough School Name School Number School Region School Code School Phone Number School Address School City School State School Zip School Not Found School or Citywide Complaint Vehicle Type Taxi Company Borough Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location\n", "42600 26529313 10/22/2013 02:51:06 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint NaN 00000 EWR EWR EWR NaN NaN NaN NaN NaN NEWARK NaN NaN Assigned 12/07/2013 09:53:51 AM 10/23/2013 09:54:43 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN Other NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "60843 26507389 10/17/2013 05:48:44 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint Street 00000 1 NEWARK AIRPORT NEWARK AIRPORT NaN NaN NaN NaN NaN NEWARK NaN NaN Assigned 12/02/2013 11:59:46 AM 10/18/2013 12:01:08 PM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN Other NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "requests[requests['Incident Zip'] == '00000']" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/wangshuailong/Library/Python/2.7/lib/python/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " if __name__ == '__main__':\n" ] } ], "source": [ "requests['Incident Zip'][requests['Incident Zip'] == '00000'] = np.nan" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([nan, '00083', '02061', '06901', '07020', '07087', '07093', '07109',\n", " '07114', '07201', '07208', '07306', '07604', '08807', '10000',\n", " '10001', '10002', '10003', '10004', '10005', '10006', '10007',\n", " '10009', '10010', '10011', '10012', '10013', '10014', '10016',\n", " '10017', '10018', '10019', '10020', '10021', '10022', '10023',\n", " '10024', '10025', '10026', '10027', '10028', '10029', '10030',\n", " '10031', '10032', '10033', '10034', '10035', '10036', '10037',\n", " '10038', '10039', '10040', '10044', '10048', '10065', '10069',\n", " '10075', '10103', '10107', '10112', '10119', '10128', '10129',\n", " '10153', '10162', '10280', '10281', '10282', '10301', '10302',\n", " '10303', '10304', '10305', '10306', '10307', '10308', '10309',\n", " '10310', '10312', '10314', '10451', '10452', '10453', '10454',\n", " '10455', '10456', '10457', '10458', '10459', '10460', '10461',\n", " '10462', '10463', '10464', '10465', '10466', '10467', '10468',\n", " '10469', '10470', '10471', '10472', '10473', '10474', '10475',\n", " '10573', '10803', '10954', '11001', '11003', '11004', '11005',\n", " '11040', '11042', '11101', '11102', '11103', '11104', '11105',\n", " '11106', '11109', '11111', '11201', '11203', '11204', '11205',\n", " '11206', '11207', '11208', '11209', '11210', '11211', '11212',\n", " '11213', '11214', '11215', '11216', '11217', '11218', '11219',\n", " '11220', '11221', '11222', '11223', '11224', '11225', '11226',\n", " '11228', '11229', '11230', '11231', '11232', '11233', '11234',\n", " '11235', '11236', '11237', '11238', '11239', '11249', '11354',\n", " '11355', '11356', '11357', '11358', '11360', '11361', '11362',\n", " '11363', '11364', '11365', '11366', '11367', '11368', '11369',\n", " '11370', '11372', '11373', '11374', '11375', '11377', '11378',\n", " '11379', '11385', '11411', '11412', '11413', '11414', '11415',\n", " '11416', '11417', '11418', '11419', '11420', '11421', '11422',\n", " '11423', '11426', '11427', '11428', '11429', '11430', '11432',\n", " '11433', '11434', '11435', '11436', '11501', '11518', '11520',\n", " '11530', '11549', '11559', '11563', '11575', '11577', '11580',\n", " '11590', '11691', '11692', '11693', '11694', '11697', '11716',\n", " '11722', '11735', '11747', '11776', '11788', '11797', '13221',\n", " '14225', '19711', '23502', '23541', '29616', '35209', '41042',\n", " '55164', '61702', '70711', '77056', '77092', '90010', '92123'], dtype=object)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unique_code = requests['Incident Zip'].unique()\n", "unique_code.sort()\n", "unique_code" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }