{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Import the required libraries" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\16098\\anaconda3\\lib\\site-packages\\tqdm\\std.py:697: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n", " from pandas import Panel\n" ] } ], "source": [ "import os\n", "import zipfile\n", "from pathlib import Path\n", "import pandas as pd\n", "import numpy as np\n", "from geopy.geocoders import Nominatim\n", "from geopy.extra.rate_limiter import RateLimiter\n", "from geopy.point import Point\n", "from tqdm.auto import tqdm\n", "import re\n", "tqdm.pandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a variable to define where the data is located" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "data_directory = Path.cwd()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loop through the data directory to get the zip files and then extract them to the working directory" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "for file in data_directory.iterdir():\n", " if zipfile.is_zipfile(file):\n", " with zipfile.ZipFile(file) as item:\n", " item.extractall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loop through the data directory and list all files with .csv extension and 'CRASH' in their file name" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "files = [file for file in data_directory.rglob('*.csv') if file.is_file() and file.match('CRASH_PHILADELPHIA*.csv')]\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loop through the list of files, append to one another to create one large dataframe for each year of data" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\16098\\anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3146: DtypeWarning: Columns (96,97,98) have mixed types.Specify dtype option on import or set low_memory=False.\n", " has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n" ] } ], "source": [ "df_list = []\n", "\n", "for file in files:\n", " csv = pd.read_csv(file)\n", " df_list.append(csv)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create the dataframe" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "crash_df = pd.concat(df_list)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Select the columns to display in the dataframe" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "cols = ['BELTED_DEATH_COUNT', 'BICYCLE_DEATH_COUNT', 'COLLISION_TYPE', 'CRASH_MONTH',\n", " 'CRASH_YEAR', 'DAY_OF_WEEK', 'DEC_LAT', 'DEC_LONG', 'FATAL_COUNT',\n", " 'HOUR_OF_DAY', 'INJURY_COUNT', 'INTERSECT_TYPE', 'MCYCLE_DEATH_COUNT',\n", " 'PED_DEATH_COUNT']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Created a function to clean and transform the crash data" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BELTED_DEATH_COUNTBICYCLE_DEATH_COUNTCOLLISION_TYPECRASH_MONTHCRASH_YEARDAY_OF_WEEKDEC_LATDEC_LONGFATAL_COUNTHOUR_OF_DAYINJURY_COUNTINTERSECT_TYPEMCYCLE_DEATH_COUNTPED_DEATH_COUNTCRASH_REPORT_NUMBER
000412015539.9579-75.162301701001
100712015639.9553-75.147802200001
200012015540.0396-75.006701330001
300112015640.0207-75.042401810001
400712015739.9750-75.11740210001
................................................
10131004122020540.0107-75.162702011001
1013200552020440.0308-75.054601840001
1013300552020440.0277-75.061001800001
10134008112020239.9695-75.230401112001
1013500182020240.0010-75.13280810001
\n", "

67333 rows × 15 columns

\n", "
" ], "text/plain": [ " BELTED_DEATH_COUNT BICYCLE_DEATH_COUNT COLLISION_TYPE CRASH_MONTH \\\n", "0 0 0 4 1 \n", "1 0 0 7 1 \n", "2 0 0 0 1 \n", "3 0 0 1 1 \n", "4 0 0 7 1 \n", "... ... ... ... ... \n", "10131 0 0 4 12 \n", "10132 0 0 5 5 \n", "10133 0 0 5 5 \n", "10134 0 0 8 11 \n", "10135 0 0 1 8 \n", "\n", " CRASH_YEAR DAY_OF_WEEK DEC_LAT DEC_LONG FATAL_COUNT HOUR_OF_DAY \\\n", "0 2015 5 39.9579 -75.1623 0 17 \n", "1 2015 6 39.9553 -75.1478 0 22 \n", "2 2015 5 40.0396 -75.0067 0 13 \n", "3 2015 6 40.0207 -75.0424 0 18 \n", "4 2015 7 39.9750 -75.1174 0 2 \n", "... ... ... ... ... ... ... \n", "10131 2020 5 40.0107 -75.1627 0 20 \n", "10132 2020 4 40.0308 -75.0546 0 18 \n", "10133 2020 4 40.0277 -75.0610 0 18 \n", "10134 2020 2 39.9695 -75.2304 0 11 \n", "10135 2020 2 40.0010 -75.1328 0 8 \n", "\n", " INJURY_COUNT INTERSECT_TYPE MCYCLE_DEATH_COUNT PED_DEATH_COUNT \\\n", "0 0 1 0 0 \n", "1 0 0 0 0 \n", "2 3 0 0 0 \n", "3 1 0 0 0 \n", "4 1 0 0 0 \n", "... ... ... ... ... \n", "10131 1 1 0 0 \n", "10132 4 0 0 0 \n", "10133 0 0 0 0 \n", "10134 1 2 0 0 \n", "10135 1 0 0 0 \n", "\n", " CRASH_REPORT_NUMBER \n", "0 1 \n", "1 1 \n", "2 1 \n", "3 1 \n", "4 1 \n", "... ... \n", "10131 1 \n", "10132 1 \n", "10133 1 \n", "10134 1 \n", "10135 1 \n", "\n", "[67333 rows x 15 columns]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def transform_crash_data(crash_df):\n", " cols = ['BELTED_DEATH_COUNT', 'BICYCLE_DEATH_COUNT', 'COLLISION_TYPE', 'CRASH_MONTH',\n", " 'CRASH_YEAR', 'DAY_OF_WEEK', 'DEC_LAT', 'DEC_LONG', 'FATAL_COUNT',\n", " 'HOUR_OF_DAY', 'INJURY_COUNT', 'INTERSECT_TYPE', 'MCYCLE_DEATH_COUNT',\n", " 'PED_DEATH_COUNT']\n", " return (crash_df\n", " [cols]\n", " .assign(CRASH_REPORT_NUMBER=1, \n", " DEC_LAT=crash_df.DEC_LAT.fillna(method='ffill'),\n", " DEC_LONG=crash_df.DEC_LONG.fillna(method='ffill'))\n", " .astype({'BELTED_DEATH_COUNT': 'int8', 'BICYCLE_DEATH_COUNT': 'int8', 'COLLISION_TYPE': 'int8', \n", " 'CRASH_MONTH': 'int8','CRASH_YEAR': 'int16', 'DAY_OF_WEEK': 'int8', 'FATAL_COUNT': 'int8',\n", " 'HOUR_OF_DAY': 'int8', 'INJURY_COUNT': 'int8', 'INTERSECT_TYPE': 'int8', \n", " 'MCYCLE_DEATH_COUNT': 'int8','PED_DEATH_COUNT': 'int8'})\n", ")\n", "\n", "transform_crash_data(crash_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Created a geolocator object using the Openstreet Nominatim API. In addition, I created a function to extract the zip code from the latitude and longitude data" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "locator = Nominatim(user_agent=\"myGeolocator\", timeout=10)\n", "rgeocode = RateLimiter(locator.reverse, min_delay_seconds=.25)\n", "\n", "def reverse_geocoding(GEOM):\n", " try:\n", " location = locator.reverse(Point(GEOM))\n", " return location.raw['address']['postcode']\n", " except:\n", " return None" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concatenate the Latitude and Longitude fields to create a new feature" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BELTED_DEATH_COUNTBICYCLE_DEATH_COUNTCOLLISION_TYPECRASH_MONTHCRASH_YEARDAY_OF_WEEKDEC_LATDEC_LONGFATAL_COUNTHOUR_OF_DAYINJURY_COUNTINTERSECT_TYPEMCYCLE_DEATH_COUNTPED_DEATH_COUNTCRASH_REPORT_NUMBERGEOM
000412015539.9579-75.16230170100139.9579, -75.1623
100712015639.9553-75.14780220000139.9553, -75.1478
200012015540.0396-75.00670133000140.0396, -75.0067
300112015640.0207-75.04240181000140.0207, -75.0424
400712015739.9750-75.1174021000139.975, -75.1174
...................................................
10131004122020540.0107-75.16270201100140.0107, -75.1627
1013200552020440.0308-75.05460184000140.0308, -75.0546
1013300552020440.0277-75.06100180000140.0277, -75.061
10134008112020239.9695-75.23040111200139.9695, -75.2304
1013500182020240.0010-75.1328081000140.001, -75.1328
\n", "

67333 rows × 16 columns

\n", "
" ], "text/plain": [ " BELTED_DEATH_COUNT BICYCLE_DEATH_COUNT COLLISION_TYPE CRASH_MONTH \\\n", "0 0 0 4 1 \n", "1 0 0 7 1 \n", "2 0 0 0 1 \n", "3 0 0 1 1 \n", "4 0 0 7 1 \n", "... ... ... ... ... \n", "10131 0 0 4 12 \n", "10132 0 0 5 5 \n", "10133 0 0 5 5 \n", "10134 0 0 8 11 \n", "10135 0 0 1 8 \n", "\n", " CRASH_YEAR DAY_OF_WEEK DEC_LAT DEC_LONG FATAL_COUNT HOUR_OF_DAY \\\n", "0 2015 5 39.9579 -75.1623 0 17 \n", "1 2015 6 39.9553 -75.1478 0 22 \n", "2 2015 5 40.0396 -75.0067 0 13 \n", "3 2015 6 40.0207 -75.0424 0 18 \n", "4 2015 7 39.9750 -75.1174 0 2 \n", "... ... ... ... ... ... ... \n", "10131 2020 5 40.0107 -75.1627 0 20 \n", "10132 2020 4 40.0308 -75.0546 0 18 \n", "10133 2020 4 40.0277 -75.0610 0 18 \n", "10134 2020 2 39.9695 -75.2304 0 11 \n", "10135 2020 2 40.0010 -75.1328 0 8 \n", "\n", " INJURY_COUNT INTERSECT_TYPE MCYCLE_DEATH_COUNT PED_DEATH_COUNT \\\n", "0 0 1 0 0 \n", "1 0 0 0 0 \n", "2 3 0 0 0 \n", "3 1 0 0 0 \n", "4 1 0 0 0 \n", "... ... ... ... ... \n", "10131 1 1 0 0 \n", "10132 4 0 0 0 \n", "10133 0 0 0 0 \n", "10134 1 2 0 0 \n", "10135 1 0 0 0 \n", "\n", " CRASH_REPORT_NUMBER GEOM \n", "0 1 39.9579, -75.1623 \n", "1 1 39.9553, -75.1478 \n", "2 1 40.0396, -75.0067 \n", "3 1 40.0207, -75.0424 \n", "4 1 39.975, -75.1174 \n", "... ... ... \n", "10131 1 40.0107, -75.1627 \n", "10132 1 40.0308, -75.0546 \n", "10133 1 40.0277, -75.061 \n", "10134 1 39.9695, -75.2304 \n", "10135 1 40.001, -75.1328 \n", "\n", "[67333 rows x 16 columns]" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "crash_df = (transform_crash_data(crash_df)\n", ".assign(GEOM=crash_df['DEC_LAT'].map(str) + ',' + \" \" + crash_df['DEC_LONG'].map(str)\n", "))\n", "\n", "crash_df\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Called the API to obtain addresses using the latitude and longitude data" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\16098\\anaconda3\\lib\\site-packages\\tqdm\\std.py:697: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n", " from pandas import Panel\n", "100%|██████████| 67333/67333 [9:18:32<00:00, 2.01it/s] \n" ] } ], "source": [ "tqdm.pandas()\n", "crash_df['ZIP'] = crash_df['GEOM'].progress_apply(reverse_geocoding)\n", "crash_df.to_csv('crash.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Extract the first five digits of the zip code and save to a csv" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "crash_df = pd.read_csv('crash.csv')\n", "crash_df = (crash_df\n", ".assign(ZIP=crash_df.ZIP.str[0:5])\n", ")\n", "crash_df.to_csv('crash.csv', index=False)" ] } ], "metadata": { "interpreter": { "hash": "ef22e6a1e008a05d8d505539b26854d80ff9620493d32018ef7adf8bb9cfcbe2" }, "kernelspec": { "display_name": "Python 3.8.5 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }