{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preparing the data for analysis\n",
    "> A Summary of lecture \"Analyzing Police Activity with pandas\", via datacamp\n",
    "\n",
    "- toc: true \n",
    "- badges: true\n",
    "- comments: true\n",
    "- author: Chanseok Kang\n",
    "- categories: [Python, Datacamp, Data_Science]\n",
    "- image: images/logo.png"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stanford Open Policing Project dataset\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Examining the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  state   stop_date stop_time  county_name driver_gender driver_race  \\\n",
      "0    RI  2005-01-04     12:55          NaN             M       White   \n",
      "1    RI  2005-01-23     23:15          NaN             M       White   \n",
      "2    RI  2005-02-17     04:15          NaN             M       White   \n",
      "3    RI  2005-02-20     17:15          NaN             M       White   \n",
      "4    RI  2005-02-24     01:20          NaN             F       White   \n",
      "\n",
      "                    violation_raw  violation  search_conducted search_type  \\\n",
      "0  Equipment/Inspection Violation  Equipment             False         NaN   \n",
      "1                        Speeding   Speeding             False         NaN   \n",
      "2                        Speeding   Speeding             False         NaN   \n",
      "3                Call for Service      Other             False         NaN   \n",
      "4                        Speeding   Speeding             False         NaN   \n",
      "\n",
      "    stop_outcome is_arrested stop_duration  drugs_related_stop district  \n",
      "0       Citation       False      0-15 Min               False  Zone X4  \n",
      "1       Citation       False      0-15 Min               False  Zone K3  \n",
      "2       Citation       False      0-15 Min               False  Zone X4  \n",
      "3  Arrest Driver        True     16-30 Min               False  Zone X1  \n",
      "4       Citation       False      0-15 Min               False  Zone X3  \n",
      "state                     0\n",
      "stop_date                 0\n",
      "stop_time                 0\n",
      "county_name           91741\n",
      "driver_gender          5205\n",
      "driver_race            5202\n",
      "violation_raw          5202\n",
      "violation              5202\n",
      "search_conducted          0\n",
      "search_type           88434\n",
      "stop_outcome           5202\n",
      "is_arrested            5202\n",
      "stop_duration          5202\n",
      "drugs_related_stop        0\n",
      "district                  0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Import the pandas library as pd\n",
    "import pandas as pd\n",
    "\n",
    "# Read 'police.csv' into a DataFrame named ri\n",
    "ri = pd.read_csv('./dataset/police.csv')\n",
    "\n",
    "# Examine the head of the DataFrame\n",
    "print(ri.head(5))\n",
    "\n",
    "# Count the number of missing values in each column\n",
    "print(ri.isnull().sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dropping columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(91741, 15)\n",
      "(91741, 13)\n"
     ]
    }
   ],
   "source": [
    "# Examine the shape of the DataFrame\n",
    "print(ri.shape)\n",
    "\n",
    "# Drop the 'country_name' and 'state' columns\n",
    "ri.drop(['county_name', 'state'], axis='columns', inplace=True)\n",
    "\n",
    "# Examine the shape of the DataFrame (again)\n",
    "print(ri.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dropping rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "stop_date                 0\n",
      "stop_time                 0\n",
      "driver_gender          5205\n",
      "driver_race            5202\n",
      "violation_raw          5202\n",
      "violation              5202\n",
      "search_conducted          0\n",
      "search_type           88434\n",
      "stop_outcome           5202\n",
      "is_arrested            5202\n",
      "stop_duration          5202\n",
      "drugs_related_stop        0\n",
      "district                  0\n",
      "dtype: int64\n",
      "stop_date                 0\n",
      "stop_time                 0\n",
      "driver_gender             0\n",
      "driver_race               0\n",
      "violation_raw             0\n",
      "violation                 0\n",
      "search_conducted          0\n",
      "search_type           83229\n",
      "stop_outcome              0\n",
      "is_arrested               0\n",
      "stop_duration             0\n",
      "drugs_related_stop        0\n",
      "district                  0\n",
      "dtype: int64\n",
      "(86536, 13)\n"
     ]
    }
   ],
   "source": [
    "# Count the number of missing values in each column\n",
    "print(ri.isnull().sum())\n",
    "\n",
    "# Drop all rows that are missing 'driver_gender'\n",
    "ri.dropna(subset=['driver_gender'], inplace=True)\n",
    "\n",
    "# Count the number of missing values in each column (again)\n",
    "print(ri.isnull().sum())\n",
    "\n",
    "# Examine the shape of the DataFrame\n",
    "print(ri.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using proper data types\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Finding an incorrect data type\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "stop_date             object\n",
       "stop_time             object\n",
       "driver_gender         object\n",
       "driver_race           object\n",
       "violation_raw         object\n",
       "violation             object\n",
       "search_conducted        bool\n",
       "search_type           object\n",
       "stop_outcome          object\n",
       "is_arrested           object\n",
       "stop_duration         object\n",
       "drugs_related_stop      bool\n",
       "district              object\n",
       "dtype: object"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ri.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fixing a data type\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    False\n",
      "1    False\n",
      "2    False\n",
      "3     True\n",
      "4    False\n",
      "Name: is_arrested, dtype: object\n",
      "bool\n"
     ]
    }
   ],
   "source": [
    "# Examine the head of the 'is_arrested' column\n",
    "print(ri.is_arrested.head())\n",
    "\n",
    "# Change the data type of 'is_arrested' to 'bool'\n",
    "ri['is_arrested'] = ri.is_arrested.astype('bool')\n",
    "\n",
    "# Check the data type of 'is_arrested' \n",
    "print(ri['is_arrested'].dtype)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Creating a DatetimeIndex\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Combining object columns\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "stop_date                     object\n",
      "stop_time                     object\n",
      "driver_gender                 object\n",
      "driver_race                   object\n",
      "violation_raw                 object\n",
      "violation                     object\n",
      "search_conducted                bool\n",
      "search_type                   object\n",
      "stop_outcome                  object\n",
      "is_arrested                     bool\n",
      "stop_duration                 object\n",
      "drugs_related_stop              bool\n",
      "district                      object\n",
      "stop_datetime         datetime64[ns]\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Concatenate 'stop_date' and 'stop_time' (separated by a space)\n",
    "combined = ri.stop_date.str.cat(ri.stop_time, sep=' ')\n",
    "\n",
    "# Convert 'combined' to datetime format\n",
    "ri['stop_datetime'] = pd.to_datetime(combined)\n",
    "\n",
    "# Examine the data types of the DataFrame\n",
    "print(ri.dtypes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setting the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatetimeIndex(['2005-01-04 12:55:00', '2005-01-23 23:15:00',\n",
      "               '2005-02-17 04:15:00', '2005-02-20 17:15:00',\n",
      "               '2005-02-24 01:20:00', '2005-03-14 10:00:00',\n",
      "               '2005-03-29 21:55:00', '2005-04-04 21:25:00',\n",
      "               '2005-07-14 11:20:00', '2005-07-14 19:55:00',\n",
      "               ...\n",
      "               '2015-12-31 13:23:00', '2015-12-31 18:59:00',\n",
      "               '2015-12-31 19:13:00', '2015-12-31 20:20:00',\n",
      "               '2015-12-31 20:50:00', '2015-12-31 21:21:00',\n",
      "               '2015-12-31 21:59:00', '2015-12-31 22:04:00',\n",
      "               '2015-12-31 22:09:00', '2015-12-31 22:47:00'],\n",
      "              dtype='datetime64[ns]', name='stop_datetime', length=86536, freq=None)\n",
      "Index(['stop_date', 'stop_time', 'driver_gender', 'driver_race',\n",
      "       'violation_raw', 'violation', 'search_conducted', 'search_type',\n",
      "       'stop_outcome', 'is_arrested', 'stop_duration', 'drugs_related_stop',\n",
      "       'district'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# Set 'stop_datetime' as the index\n",
    "ri.set_index('stop_datetime', inplace=True)\n",
    "\n",
    "# Examine the index\n",
    "print(ri.index)\n",
    "\n",
    "# Examine the columns\n",
    "print(ri.columns)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}