{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Preparing the data for analysis\n", "> A Summary of lecture \"Analyzing Police Activity with pandas\", via datacamp\n", "\n", "- toc: true \n", "- badges: true\n", "- comments: true\n", "- author: Chanseok Kang\n", "- categories: [Python, Datacamp, Data_Science]\n", "- image: images/logo.png" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Stanford Open Policing Project dataset\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Examining the dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state stop_date stop_time county_name driver_gender driver_race \\\n", "0 RI 2005-01-04 12:55 NaN M White \n", "1 RI 2005-01-23 23:15 NaN M White \n", "2 RI 2005-02-17 04:15 NaN M White \n", "3 RI 2005-02-20 17:15 NaN M White \n", "4 RI 2005-02-24 01:20 NaN F White \n", "\n", " violation_raw violation search_conducted search_type \\\n", "0 Equipment/Inspection Violation Equipment False NaN \n", "1 Speeding Speeding False NaN \n", "2 Speeding Speeding False NaN \n", "3 Call for Service Other False NaN \n", "4 Speeding Speeding False NaN \n", "\n", " stop_outcome is_arrested stop_duration drugs_related_stop district \n", "0 Citation False 0-15 Min False Zone X4 \n", "1 Citation False 0-15 Min False Zone K3 \n", "2 Citation False 0-15 Min False Zone X4 \n", "3 Arrest Driver True 16-30 Min False Zone X1 \n", "4 Citation False 0-15 Min False Zone X3 \n", "state 0\n", "stop_date 0\n", "stop_time 0\n", "county_name 91741\n", "driver_gender 5205\n", "driver_race 5202\n", "violation_raw 5202\n", "violation 5202\n", "search_conducted 0\n", "search_type 88434\n", "stop_outcome 5202\n", "is_arrested 5202\n", "stop_duration 5202\n", "drugs_related_stop 0\n", "district 0\n", "dtype: int64\n" ] } ], "source": [ "# Import the pandas library as pd\n", "import pandas as pd\n", "\n", "# Read 'police.csv' into a DataFrame named ri\n", "ri = pd.read_csv('./dataset/police.csv')\n", "\n", "# Examine the head of the DataFrame\n", "print(ri.head(5))\n", "\n", "# Count the number of missing values in each column\n", "print(ri.isnull().sum())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dropping columns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(91741, 15)\n", "(91741, 13)\n" ] } ], "source": [ "# Examine the shape of the DataFrame\n", "print(ri.shape)\n", "\n", "# Drop the 'country_name' and 'state' columns\n", "ri.drop(['county_name', 'state'], axis='columns', inplace=True)\n", "\n", "# Examine the shape of the DataFrame (again)\n", "print(ri.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dropping rows" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "stop_date 0\n", "stop_time 0\n", "driver_gender 5205\n", "driver_race 5202\n", "violation_raw 5202\n", "violation 5202\n", "search_conducted 0\n", "search_type 88434\n", "stop_outcome 5202\n", "is_arrested 5202\n", "stop_duration 5202\n", "drugs_related_stop 0\n", "district 0\n", "dtype: int64\n", "stop_date 0\n", "stop_time 0\n", "driver_gender 0\n", "driver_race 0\n", "violation_raw 0\n", "violation 0\n", "search_conducted 0\n", "search_type 83229\n", "stop_outcome 0\n", "is_arrested 0\n", "stop_duration 0\n", "drugs_related_stop 0\n", "district 0\n", "dtype: int64\n", "(86536, 13)\n" ] } ], "source": [ "# Count the number of missing values in each column\n", "print(ri.isnull().sum())\n", "\n", "# Drop all rows that are missing 'driver_gender'\n", "ri.dropna(subset=['driver_gender'], inplace=True)\n", "\n", "# Count the number of missing values in each column (again)\n", "print(ri.isnull().sum())\n", "\n", "# Examine the shape of the DataFrame\n", "print(ri.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using proper data types\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Finding an incorrect data type\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "stop_date object\n", "stop_time object\n", "driver_gender object\n", "driver_race object\n", "violation_raw object\n", "violation object\n", "search_conducted bool\n", "search_type object\n", "stop_outcome object\n", "is_arrested object\n", "stop_duration object\n", "drugs_related_stop bool\n", "district object\n", "dtype: object" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ri.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Fixing a data type\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 False\n", "1 False\n", "2 False\n", "3 True\n", "4 False\n", "Name: is_arrested, dtype: object\n", "bool\n" ] } ], "source": [ "# Examine the head of the 'is_arrested' column\n", "print(ri.is_arrested.head())\n", "\n", "# Change the data type of 'is_arrested' to 'bool'\n", "ri['is_arrested'] = ri.is_arrested.astype('bool')\n", "\n", "# Check the data type of 'is_arrested' \n", "print(ri['is_arrested'].dtype)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creating a DatetimeIndex\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Combining object columns\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "stop_date object\n", "stop_time object\n", "driver_gender object\n", "driver_race object\n", "violation_raw object\n", "violation object\n", "search_conducted bool\n", "search_type object\n", "stop_outcome object\n", "is_arrested bool\n", "stop_duration object\n", "drugs_related_stop bool\n", "district object\n", "stop_datetime datetime64[ns]\n", "dtype: object\n" ] } ], "source": [ "# Concatenate 'stop_date' and 'stop_time' (separated by a space)\n", "combined = ri.stop_date.str.cat(ri.stop_time, sep=' ')\n", "\n", "# Convert 'combined' to datetime format\n", "ri['stop_datetime'] = pd.to_datetime(combined)\n", "\n", "# Examine the data types of the DataFrame\n", "print(ri.dtypes)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setting the index" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DatetimeIndex(['2005-01-04 12:55:00', '2005-01-23 23:15:00',\n", " '2005-02-17 04:15:00', '2005-02-20 17:15:00',\n", " '2005-02-24 01:20:00', '2005-03-14 10:00:00',\n", " '2005-03-29 21:55:00', '2005-04-04 21:25:00',\n", " '2005-07-14 11:20:00', '2005-07-14 19:55:00',\n", " ...\n", " '2015-12-31 13:23:00', '2015-12-31 18:59:00',\n", " '2015-12-31 19:13:00', '2015-12-31 20:20:00',\n", " '2015-12-31 20:50:00', '2015-12-31 21:21:00',\n", " '2015-12-31 21:59:00', '2015-12-31 22:04:00',\n", " '2015-12-31 22:09:00', '2015-12-31 22:47:00'],\n", " dtype='datetime64[ns]', name='stop_datetime', length=86536, freq=None)\n", "Index(['stop_date', 'stop_time', 'driver_gender', 'driver_race',\n", " 'violation_raw', 'violation', 'search_conducted', 'search_type',\n", " 'stop_outcome', 'is_arrested', 'stop_duration', 'drugs_related_stop',\n", " 'district'],\n", " dtype='object')\n" ] } ], "source": [ "# Set 'stop_datetime' as the index\n", "ri.set_index('stop_datetime', inplace=True)\n", "\n", "# Examine the index\n", "print(ri.index)\n", "\n", "# Examine the columns\n", "print(ri.columns)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }