{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Manipulation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some aspects of __data manipulation__, altering data to make it easier to read or use, include sorting and grouping attributes and encoding categorical variables."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pandas in /opt/conda/lib/python3.8/site-packages (1.2.2)\n",
      "Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.8/site-packages (from pandas) (2.8.1)\n",
      "Requirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.8/site-packages (from pandas) (2021.1)\n",
      "Requirement already satisfied: numpy>=1.16.5 in /opt/conda/lib/python3.8/site-packages (from pandas) (1.19.5)\n",
      "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.8/site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Form</th>\n",
       "      <th>State</th>\n",
       "      <th>Security_Grade</th>\n",
       "      <th>Area_Number</th>\n",
       "      <th>Terrain_Description</th>\n",
       "      <th>Favorable_Influences</th>\n",
       "      <th>Detrimental_Influences</th>\n",
       "      <th>INHABITANTS_Type</th>\n",
       "      <th>INHABITANTS_Annual_Income</th>\n",
       "      <th>...</th>\n",
       "      <th>Ten_Fifteen_Desirability</th>\n",
       "      <th>Remarks</th>\n",
       "      <th>Date</th>\n",
       "      <th>City_clean</th>\n",
       "      <th>Suburb</th>\n",
       "      <th>max_building_age</th>\n",
       "      <th>Year</th>\n",
       "      <th>Day</th>\n",
       "      <th>Month</th>\n",
       "      <th>max_annual_income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>NS FORM-8 6-1-37</td>\n",
       "      <td>Maryland</td>\n",
       "      <td>A</td>\n",
       "      <td>2</td>\n",
       "      <td>Rolling</td>\n",
       "      <td>Fairly new suburban area of homogeneous charac...</td>\n",
       "      <td>No</td>\n",
       "      <td>Substantial Middle Class</td>\n",
       "      <td>$3000 - 5,000</td>\n",
       "      <td>...</td>\n",
       "      <td>Upward</td>\n",
       "      <td>A recent development with much room for expans...</td>\n",
       "      <td>May 4,1937</td>\n",
       "      <td>Baltimore</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10.0</td>\n",
       "      <td>1937.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>May</td>\n",
       "      <td>5000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>NS FORM-8 6-1-37</td>\n",
       "      <td>Maryland</td>\n",
       "      <td>A</td>\n",
       "      <td>1</td>\n",
       "      <td>Undulating</td>\n",
       "      <td>Very nicely planned residential area of medium...</td>\n",
       "      <td>No</td>\n",
       "      <td>Executives, Professional Men</td>\n",
       "      <td>over $5000</td>\n",
       "      <td>...</td>\n",
       "      <td>Upward</td>\n",
       "      <td>Mostly fee properties. A few homes valued at $...</td>\n",
       "      <td>May 4,1937</td>\n",
       "      <td>Baltimore</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12.0</td>\n",
       "      <td>1937.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>May</td>\n",
       "      <td>5000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 33 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0              Form     State Security_Grade  Area_Number  \\\n",
       "0           0  NS FORM-8 6-1-37  Maryland              A            2   \n",
       "1           1  NS FORM-8 6-1-37  Maryland              A            1   \n",
       "\n",
       "  Terrain_Description                               Favorable_Influences  \\\n",
       "0             Rolling  Fairly new suburban area of homogeneous charac...   \n",
       "1         Undulating   Very nicely planned residential area of medium...   \n",
       "\n",
       "  Detrimental_Influences              INHABITANTS_Type  \\\n",
       "0                     No      Substantial Middle Class   \n",
       "1                     No  Executives, Professional Men   \n",
       "\n",
       "  INHABITANTS_Annual_Income  ... Ten_Fifteen_Desirability  \\\n",
       "0             $3000 - 5,000  ...                   Upward   \n",
       "1                over $5000  ...                   Upward   \n",
       "\n",
       "                                             Remarks        Date  City_clean  \\\n",
       "0  A recent development with much room for expans...  May 4,1937  Baltimore    \n",
       "1  Mostly fee properties. A few homes valued at $...  May 4,1937  Baltimore    \n",
       "\n",
       "  Suburb max_building_age    Year  Day Month max_annual_income  \n",
       "0    NaN             10.0  1937.0  4.0   May            5000.0  \n",
       "1    NaN             12.0  1937.0  4.0   May            5000.0  \n",
       "\n",
       "[2 rows x 33 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# loads the pandas library \n",
    "import pandas as pd\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)  # Ignore Pandas future warnings\n",
    "\n",
    "# creates data frame named df by reading in the Baltimore csv\n",
    "df = pd.read_csv(\"clean_baltimore_data.csv\")\n",
    "df.head(n=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. Indicators"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since many of the attributes have multiple values contained within them, we are going to encode them to indicate that a feature exist or not and use Tableau to visualize the frequency of different characteristics by grade. For example, here the `Terrain_Description` column has two categories of values : flat or rolling."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<script type='text/javascript' src='https://10ay.online.tableau.com/javascripts/api/viz_v1.js'></script><div class='tableauPlaceholder' style='width: 1440px; height: 655px;'><object class='tableauViz' width='1440' height='655' style='display:none;'><param name='host_url' value='https%3A%2F%2F10ay.online.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='&#47;t&#47;sadata' /><param name='name' value='mapping_inequality&#47;Sheet4' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='showAppBanner' value='false' /><param name='filter' value='iframeSizedToWindow=true' /></object></div>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%%HTML\n",
    "<script type='text/javascript' src='https://10ay.online.tableau.com/javascripts/api/viz_v1.js'></script><div class='tableauPlaceholder' style='width: 1440px; height: 655px;'><object class='tableauViz' width='1440' height='655' style='display:none;'><param name='host_url' value='https%3A%2F%2F10ay.online.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='&#47;t&#47;sadata' /><param name='name' value='mapping_inequality&#47;Sheet4' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='showAppBanner' value='false' /><param name='filter' value='iframeSizedToWindow=true' /></object></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here, this tells us that `rolling` is generally a characteristic overall of all the grades and it's indicated most frequently in the B-grade. The `flat` characteristic is not indicated as frequently compared to `rolling` overall, however, in the C-grade it's about the same. __Note__ that a value could be flat and rolling."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0              rolling\n",
       "1          undulating \n",
       "2              rolling\n",
       "3                level\n",
       "4          undulating \n",
       "5              rolling\n",
       "6     rolling to hilly\n",
       "7     rolling to hilly\n",
       "8           high level\n",
       "9     rolling to level\n",
       "10         undulating \n",
       "11             rolling\n",
       "12          high level\n",
       "13             rolling\n",
       "14         undulating \n",
       "Name: Terrain_Description, dtype: object"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# case normalization \n",
    "df['Terrain_Description'] = df['Terrain_Description'].str.lower()\n",
    "\n",
    "# remove extraneous punctuation \n",
    "df['Terrain_Description'] = df['Terrain_Description'].str.replace('[^\\w\\s]','')\n",
    "df[\"Terrain_Description\"][0:15]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we identified that there are three categories of all the values:\n",
    "1. rolling/hilly/undualating\n",
    "2. level/flat\n",
    "3. missing\n",
    "\n",
    "Before we encode, we need to remove punctuation and convert the strings to lowercase so that the ocurrence of rolling will be standardized."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "# inserts a 1 in terrain_rolling whenever the value is 'rolling' or 'undulating' or 'hilly'\n",
    "df[\"terrain_rolling\"] = np.where((df[\"Terrain_Description\"].str.contains(\"rolling\") | \n",
    "                                      df[\"Terrain_Description\"].str.contains(\"undulating\")|\n",
    "                                     df[\"Terrain_Description\"].str.contains(\"hilly\")) , 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>terrain_rolling</th>\n",
       "      <th>Terrain_Description</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>rolling</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>undulating</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>rolling</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>level</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>undulating</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "      <td>rolling</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1</td>\n",
       "      <td>rolling to hilly</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1</td>\n",
       "      <td>rolling to hilly</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0</td>\n",
       "      <td>high level</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1</td>\n",
       "      <td>rolling to level</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>1</td>\n",
       "      <td>undulating</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    terrain_rolling Terrain_Description\n",
       "0                 1             rolling\n",
       "1                 1         undulating \n",
       "2                 1             rolling\n",
       "3                 0               level\n",
       "4                 1         undulating \n",
       "5                 1             rolling\n",
       "6                 1    rolling to hilly\n",
       "7                 1    rolling to hilly\n",
       "8                 0          high level\n",
       "9                 1    rolling to level\n",
       "10                1         undulating "
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[0:10,['terrain_rolling','Terrain_Description']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The'INHABITANTS_Type' describes what the occupation is like in a different section. We would like to get an idea of what sections indicated certain occupations the most. Using `value_counts()`, we can see what values frequently occur in this column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "White collar class                         8\n",
       "Executives, Professional Men               5\n",
       "Mixture                                    5\n",
       "Laborers                                   3\n",
       "Business men                               2\n",
       "Business people                            2\n",
       "Clerks, mechanics                          2\n",
       "Skilled labor clerks                       2\n",
       "Middle class                               2\n",
       "Mechanics, clerks                          2\n",
       "Skilled mechanics, clerks                  2\n",
       "Industrial employees                       1\n",
       "Mixture, laboring class                    1\n",
       "Clerks, skilled mechanics                  1\n",
       "Officers of Plants (industrial)            1\n",
       "Skilled labor                              1\n",
       "Business men, clerks                       1\n",
       "Skilled mechanics and Junior Executives    1\n",
       "Substantial Middle Class                   1\n",
       " White collar class                        1\n",
       "Mixture, Jewish                            1\n",
       "Industrial workers                         1\n",
       "Mechanics                                  1\n",
       "Business Men                               1\n",
       "White collar class, clerks                 1\n",
       "Professional and Executives                1\n",
       "Name: INHABITANTS_Type, dtype: int64"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['INHABITANTS_Type'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here, are the different groups of inhabitants overall:\n",
    "\n",
    "1. white colllar class\n",
    "2. mixture/jewish\n",
    "3. professional\n",
    "4. business/executive\n",
    "5. laborers / labor\n",
    "6. clerks\n",
    "7. mechanics\n",
    "8. industrial"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once again, we need to normalize all the text values. Note that the data type of `INHABITANTS_Type` was incorrect and needs to be converted to type `object`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         substantial middle class\n",
       "1      executives professional men\n",
       "2      executives professional men\n",
       "3     professional and executives \n",
       "4      executives professional men\n",
       "5                              NaN\n",
       "6      executives professional men\n",
       "7      executives professional men\n",
       "8                     business men\n",
       "9                  business people\n",
       "10              white collar class\n",
       "11              white collar class\n",
       "12                       mechanics\n",
       "13                    middle class\n",
       "14                    middle class\n",
       "Name: INHABITANTS_Type, dtype: object"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['INHABITANTS_Type'] = df['INHABITANTS_Type'].astype('object')\n",
    "df['INHABITANTS_Type'] = df['INHABITANTS_Type'].str.lower()\n",
    "df['INHABITANTS_Type'] = df['INHABITANTS_Type'].str.replace('[^\\w\\s]','')\n",
    "df['INHABITANTS_Type'][0:15]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "\n",
    "df['white_collar'] = np.where(df['INHABITANTS_Type'].str.contains(\"white collar class\"), 1, 0)\n",
    "\n",
    "df['mixture_or_jewish'] = np.where((df['INHABITANTS_Type'].str.contains(\"mixture\") |\n",
    "                                   df['INHABITANTS_Type'].str.contains(\"jewish\")), 1, 0)\n",
    "\n",
    "df['professional'] = np.where(df['INHABITANTS_Type'].str.contains(\"professional\"), 1, 0)\n",
    "\n",
    "df['business_or_executive'] = np.where((df['INHABITANTS_Type'].str.contains(\"business\") |\n",
    "                              df['INHABITANTS_Type'].str.contains(\"executive\")), 1, 0)\n",
    "\n",
    "df['laborer'] = np.where((df['INHABITANTS_Type'].str.contains(\"laborer\") |\n",
    "                              df['INHABITANTS_Type'].str.contains(\"laborers\")), 1, 0)\n",
    "\n",
    "df['clerks'] = np.where(df['INHABITANTS_Type'].str.contains(\"clerks\"), 1, 0)\n",
    "\n",
    "df['mechanics'] = np.where(df['INHABITANTS_Type'].str.contains(\"mechanics\"), 1, 0)\n",
    "\n",
    "df['industrial'] = np.where(df['INHABITANTS_Type'].str.contains(\"industrial\"), 1, 0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<script type='text/javascript' src='https://10ay.online.tableau.com/javascripts/api/viz_v1.js'></script><div class='tableauPlaceholder' style='width: 1440px; height: 655px;'><object class='tableauViz' width='1440' height='655' style='display:none;'><param name='host_url' value='https%3A%2F%2F10ay.online.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='&#47;t&#47;sadata' /><param name='name' value='mapping_inequality&#47;Sheet5' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='showAppBanner' value='false' /><param name='filter' value='iframeSizedToWindow=true' /></object></div>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%%HTML\n",
    "<script type='text/javascript' src='https://10ay.online.tableau.com/javascripts/api/viz_v1.js'></script><div class='tableauPlaceholder' style='width: 1440px; height: 655px;'><object class='tableauViz' width='1440' height='655' style='display:none;'><param name='host_url' value='https%3A%2F%2F10ay.online.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='&#47;t&#47;sadata' /><param name='name' value='mapping_inequality&#47;Sheet5' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='showAppBanner' value='false' /><param name='filter' value='iframeSizedToWindow=true' /></object></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we see that grade C has the most variation compared to the other grades, however, within this grade Clerks and Mechanics are indicated the most. Grade D has the least variation with only Mixture/Jewish and Laborers. Most of the `INHABITANTS_Type` in grade B are indicated as Business/Executive or White collar. Grade A has more variation than grade D, however less than B and C. The frequency for Professional and Business/Executtive in grade A is about equal."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exercise 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> 1. Can you create the `terrain_flat` variable?\n",
    "2. What are some other things you notice about values in `INHABITANTS_Type` type?  What are possible explainations for these outcomes?\n",
    "3. Check out this [DataCamp](https://www.datacamp.com/community/tutorials/categorical-data) article about encoding categorical data in python. What are some other attributes that can be encoded and how would you encode them?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Sorting and Grouping "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The values of `Area_Number` are out of order and we want these values to be sorted by `Security_Grade`. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Security_Grade</th>\n",
       "      <th>Area_Number</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>A</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>B</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>B</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>B</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>B</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>B</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Security_Grade  Area_Number\n",
       "0               A            2\n",
       "1               A            1\n",
       "2               A            3\n",
       "3               A            4\n",
       "4               A            5\n",
       "5               A            6\n",
       "6               B            1\n",
       "7               B            2\n",
       "8               B            3\n",
       "9               B            4\n",
       "10              B            5"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# removes any additional spaces from Security_Grade\n",
    "df['Security_Grade'] = df['Security_Grade'].str.replace('[\\W]','')\n",
    "# converts 'Area_Number' from type object to type 'numeric'\n",
    "df['Area_Number'] = pd.to_numeric(df['Area_Number'])\n",
    "df.loc[0:10,['Security_Grade','Area_Number']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To do this, we created use the `sort_values()` function on the original data frame and reset the index. First, the data is sorted and grouped by `Security_Grade` and then `Area_Number` is sorted in increasing order."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Security_Grade</th>\n",
       "      <th>Area_Number</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>A</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>B</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>B</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>B</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>B</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>B</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Security_Grade  Area_Number\n",
       "0               A            1\n",
       "1               A            2\n",
       "2               A            3\n",
       "3               A            4\n",
       "4               A            5\n",
       "5               A            6\n",
       "6               B            1\n",
       "7               B            2\n",
       "8               B            3\n",
       "9               B            4\n",
       "10              B            5"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df.sort_values(by=['Security_Grade', 'Area_Number'])\n",
    "# resets the index starting from 0\n",
    "df = df.reset_index(drop=True)\n",
    "# in order to save the new sorted area_number into the dataframe\n",
    "df['Area_Number'] = df['Area_Number']\n",
    "df.loc[0:10,['Security_Grade','Area_Number']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(r'manipulated_baltimore_data.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Next Step\n",
    "\n",
    "Continue to the [data analysis and visualization](Data_Visualization.ipynb) portion of this module."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}