{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Import the required libraries" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\16098\\anaconda3\\lib\\site-packages\\tqdm\\std.py:697: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n", " from pandas import Panel\n" ] } ], "source": [ "import os\n", "import zipfile\n", "from pathlib import Path\n", "import pandas as pd\n", "import numpy as np\n", "from geopy.geocoders import Nominatim\n", "from geopy.extra.rate_limiter import RateLimiter\n", "from geopy.point import Point\n", "from tqdm.auto import tqdm\n", "import re\n", "tqdm.pandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a variable to define where the data is located" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "data_directory = Path.cwd()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loop through the data directory to get the zip files and then extract them to the working directory" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "for file in data_directory.iterdir():\n", " if zipfile.is_zipfile(file):\n", " with zipfile.ZipFile(file) as item:\n", " item.extractall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loop through the data directory and list all files with .csv extension and 'CRASH' in their file name" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "files = [file for file in data_directory.rglob('*.csv') if file.is_file() and file.match('CRASH_PHILADELPHIA*.csv')]\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loop through the list of files, append to one another to create one large dataframe for each year of data" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\16098\\anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3146: DtypeWarning: Columns (96,97,98) have mixed types.Specify dtype option on import or set low_memory=False.\n", " has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n" ] } ], "source": [ "df_list = []\n", "\n", "for file in files:\n", " csv = pd.read_csv(file)\n", " df_list.append(csv)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create the dataframe" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "crash_df = pd.concat(df_list)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Select the columns to display in the dataframe" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "cols = ['BELTED_DEATH_COUNT', 'BICYCLE_DEATH_COUNT', 'COLLISION_TYPE', 'CRASH_MONTH',\n", " 'CRASH_YEAR', 'DAY_OF_WEEK', 'DEC_LAT', 'DEC_LONG', 'FATAL_COUNT',\n", " 'HOUR_OF_DAY', 'INJURY_COUNT', 'INTERSECT_TYPE', 'MCYCLE_DEATH_COUNT',\n", " 'PED_DEATH_COUNT']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Created a function to clean and transform the crash data" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | BELTED_DEATH_COUNT | \n", "BICYCLE_DEATH_COUNT | \n", "COLLISION_TYPE | \n", "CRASH_MONTH | \n", "CRASH_YEAR | \n", "DAY_OF_WEEK | \n", "DEC_LAT | \n", "DEC_LONG | \n", "FATAL_COUNT | \n", "HOUR_OF_DAY | \n", "INJURY_COUNT | \n", "INTERSECT_TYPE | \n", "MCYCLE_DEATH_COUNT | \n", "PED_DEATH_COUNT | \n", "CRASH_REPORT_NUMBER | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "0 | \n", "4 | \n", "1 | \n", "2015 | \n", "5 | \n", "39.9579 | \n", "-75.1623 | \n", "0 | \n", "17 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
1 | \n", "0 | \n", "0 | \n", "7 | \n", "1 | \n", "2015 | \n", "6 | \n", "39.9553 | \n", "-75.1478 | \n", "0 | \n", "22 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
2 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "2015 | \n", "5 | \n", "40.0396 | \n", "-75.0067 | \n", "0 | \n", "13 | \n", "3 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
3 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "2015 | \n", "6 | \n", "40.0207 | \n", "-75.0424 | \n", "0 | \n", "18 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
4 | \n", "0 | \n", "0 | \n", "7 | \n", "1 | \n", "2015 | \n", "7 | \n", "39.9750 | \n", "-75.1174 | \n", "0 | \n", "2 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
10131 | \n", "0 | \n", "0 | \n", "4 | \n", "12 | \n", "2020 | \n", "5 | \n", "40.0107 | \n", "-75.1627 | \n", "0 | \n", "20 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
10132 | \n", "0 | \n", "0 | \n", "5 | \n", "5 | \n", "2020 | \n", "4 | \n", "40.0308 | \n", "-75.0546 | \n", "0 | \n", "18 | \n", "4 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
10133 | \n", "0 | \n", "0 | \n", "5 | \n", "5 | \n", "2020 | \n", "4 | \n", "40.0277 | \n", "-75.0610 | \n", "0 | \n", "18 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
10134 | \n", "0 | \n", "0 | \n", "8 | \n", "11 | \n", "2020 | \n", "2 | \n", "39.9695 | \n", "-75.2304 | \n", "0 | \n", "11 | \n", "1 | \n", "2 | \n", "0 | \n", "0 | \n", "1 | \n", "
10135 | \n", "0 | \n", "0 | \n", "1 | \n", "8 | \n", "2020 | \n", "2 | \n", "40.0010 | \n", "-75.1328 | \n", "0 | \n", "8 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
67333 rows × 15 columns
\n", "\n", " | BELTED_DEATH_COUNT | \n", "BICYCLE_DEATH_COUNT | \n", "COLLISION_TYPE | \n", "CRASH_MONTH | \n", "CRASH_YEAR | \n", "DAY_OF_WEEK | \n", "DEC_LAT | \n", "DEC_LONG | \n", "FATAL_COUNT | \n", "HOUR_OF_DAY | \n", "INJURY_COUNT | \n", "INTERSECT_TYPE | \n", "MCYCLE_DEATH_COUNT | \n", "PED_DEATH_COUNT | \n", "CRASH_REPORT_NUMBER | \n", "GEOM | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "0 | \n", "4 | \n", "1 | \n", "2015 | \n", "5 | \n", "39.9579 | \n", "-75.1623 | \n", "0 | \n", "17 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "39.9579, -75.1623 | \n", "
1 | \n", "0 | \n", "0 | \n", "7 | \n", "1 | \n", "2015 | \n", "6 | \n", "39.9553 | \n", "-75.1478 | \n", "0 | \n", "22 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "39.9553, -75.1478 | \n", "
2 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "2015 | \n", "5 | \n", "40.0396 | \n", "-75.0067 | \n", "0 | \n", "13 | \n", "3 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "40.0396, -75.0067 | \n", "
3 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "2015 | \n", "6 | \n", "40.0207 | \n", "-75.0424 | \n", "0 | \n", "18 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "40.0207, -75.0424 | \n", "
4 | \n", "0 | \n", "0 | \n", "7 | \n", "1 | \n", "2015 | \n", "7 | \n", "39.9750 | \n", "-75.1174 | \n", "0 | \n", "2 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "39.975, -75.1174 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
10131 | \n", "0 | \n", "0 | \n", "4 | \n", "12 | \n", "2020 | \n", "5 | \n", "40.0107 | \n", "-75.1627 | \n", "0 | \n", "20 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "40.0107, -75.1627 | \n", "
10132 | \n", "0 | \n", "0 | \n", "5 | \n", "5 | \n", "2020 | \n", "4 | \n", "40.0308 | \n", "-75.0546 | \n", "0 | \n", "18 | \n", "4 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "40.0308, -75.0546 | \n", "
10133 | \n", "0 | \n", "0 | \n", "5 | \n", "5 | \n", "2020 | \n", "4 | \n", "40.0277 | \n", "-75.0610 | \n", "0 | \n", "18 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "40.0277, -75.061 | \n", "
10134 | \n", "0 | \n", "0 | \n", "8 | \n", "11 | \n", "2020 | \n", "2 | \n", "39.9695 | \n", "-75.2304 | \n", "0 | \n", "11 | \n", "1 | \n", "2 | \n", "0 | \n", "0 | \n", "1 | \n", "39.9695, -75.2304 | \n", "
10135 | \n", "0 | \n", "0 | \n", "1 | \n", "8 | \n", "2020 | \n", "2 | \n", "40.0010 | \n", "-75.1328 | \n", "0 | \n", "8 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "40.001, -75.1328 | \n", "
67333 rows × 16 columns
\n", "