{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Notebook for Toronto City clustering project"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### We will work on extracting borough & neighbourhood information + cleaning + clustering & visualising with Folium"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 1. Establish environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import urllib.request, urllib.parse, urllib.error\n",
    "from urllib.request import urlopen\n",
    "from bs4 import BeautifulSoup\n",
    "import ssl\n",
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2. Ignore SSL errors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "cntxt = ssl.create_default_context()\n",
    "cntxt.check_hostname = False\n",
    "cntxt.verify_mode = ssl.CERT_NONE"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3. Obtain URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "Please enter the website to obtain data from:  \n"
     ]
    }
   ],
   "source": [
    "url = input('Please enter the website to obtain data from: ')\n",
    "if len(url) < 1: url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "You want data from >>>\n",
      " https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M \n",
      "<<<\n"
     ]
    }
   ],
   "source": [
    "print('You want data from >>>\\n', url, '\\n<<<')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 4. Open and parse the url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Use a file-handle like object to open the url\n",
    "html = urlopen(url, context= cntxt).read() #Read slurps everything in #Note that is additional function written at end\n",
    "\n",
    "#Use BeautifulSoup to parse\n",
    "soup = BeautifulSoup(html, 'html.parser')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 5. Explore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "bs4.BeautifulSoup"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(soup)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total tags extracted:  294 \n",
      "\n",
      "\n",
      "Total post-codes with borough info: 103\n"
     ]
    }
   ],
   "source": [
    "#Try retrieving 'tr' tags\n",
    "#print(soup)\n",
    "tags = soup('tr')\n",
    "print('Total tags extracted: ', len(tags),'\\n')\n",
    "\n",
    "#Look at the tag extracted\n",
    "count = 0 #Initialise counter to count iterations & if rqd, help break out of loop\n",
    "dict = {} #Initialise empty dictionary to store postcode as KEY, boroughs and neighbourhoods as VALUES\n",
    "list = [] #Initialise empty list to hold the borough + neighbourhood info\n",
    "\n",
    "#Loop through the tags\n",
    "for i in tags:\n",
    "    count = count + 1 #Increase counter value beginning through each iteration\n",
    "    if count == 290: break #For limiting output and stopping the loop from running + length of tags for some reason does not correspond to actual number of elements\n",
    "    \n",
    "    #Skip if Borough is Not Assigned\n",
    "    if i.contents[3].text == 'Not assigned':\n",
    "        continue\n",
    "        \n",
    "    #Skip if text is Postcode\n",
    "    if i.contents[1].text == 'Postcode':\n",
    "        continue\n",
    "    \n",
    "    #If pincode already exists, eg M5A, M6A, append new data\n",
    "    if i.contents[1].text in dict: \n",
    "        \n",
    "        # Append the new data to the existing array at this slot with the following NEW SYNTAX\n",
    "        #dict[existing_key].append(value)\n",
    "        dict[i.contents[1].text].append(i.contents[5].text.rstrip())\n",
    "        \n",
    "    #If borough exists but not the neighbourhood, run the following    \n",
    "    elif re.search('[a-z]', i.contents[3].text) and re.search('No.*', i.contents[5].text):\n",
    "        list.append(i.contents[3].text), list.append(i.contents[3].text)\n",
    "        dict[i.contents[1].text] = list\n",
    "    else:\n",
    "        list.append(i.contents[3].text)\n",
    "        list.append(i.contents[5].text.rstrip())#rstrip gets rid of newline char.\n",
    "        dict[i.contents[1].text] = list\n",
    "    \n",
    "    #Would need to reset the list to empty after each round otherwise each successive iteration will bloat up the key-value\n",
    "    list = []\n",
    "    \n",
    "\n",
    "#print(dict)\n",
    "\n",
    "print('\\nTotal post-codes with borough info:', len(dict))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 6. Convert to pandas dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "#6.1 Import the library\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "#6.2 Convert dictionary to dataframe\n",
    "data = pd.DataFrame.from_dict(dict, orient= 'index')#index when keys are row labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>M3A</th>\n",
       "      <td>North York</td>\n",
       "      <td>Parkwoods</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>M1G</th>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Woburn</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>M4N</th>\n",
       "      <td>Central Toronto</td>\n",
       "      <td>Lawrence Park</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>M1L</th>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Clairlea</td>\n",
       "      <td>Golden Mile</td>\n",
       "      <td>Oakridge</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>M5G</th>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Central Bay Street</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    0                   1            2         3     4     5  \\\n",
       "M3A        North York           Parkwoods         None      None  None  None   \n",
       "M1G       Scarborough              Woburn         None      None  None  None   \n",
       "M4N   Central Toronto       Lawrence Park         None      None  None  None   \n",
       "M1L       Scarborough            Clairlea  Golden Mile  Oakridge  None  None   \n",
       "M5G  Downtown Toronto  Central Bay Street         None      None  None  None   \n",
       "\n",
       "        6     7     8  \n",
       "M3A  None  None  None  \n",
       "M1G  None  None  None  \n",
       "M4N  None  None  None  \n",
       "M1L  None  None  None  \n",
       "M5G  None  None  None  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PostalCode</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>M3A</th>\n",
       "      <td>North York</td>\n",
       "      <td>Parkwoods</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>M1G</th>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Woburn</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>M4N</th>\n",
       "      <td>Central Toronto</td>\n",
       "      <td>Lawrence Park</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>M1L</th>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Clairlea</td>\n",
       "      <td>Golden Mile</td>\n",
       "      <td>Oakridge</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>M5G</th>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Central Bay Street</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           0                   1            2         3     4  \\\n",
       "PostalCode                                                                      \n",
       "M3A               North York           Parkwoods         None      None  None   \n",
       "M1G              Scarborough              Woburn         None      None  None   \n",
       "M4N          Central Toronto       Lawrence Park         None      None  None   \n",
       "M1L              Scarborough            Clairlea  Golden Mile  Oakridge  None   \n",
       "M5G         Downtown Toronto  Central Bay Street         None      None  None   \n",
       "\n",
       "               5     6     7     8  \n",
       "PostalCode                          \n",
       "M3A         None  None  None  None  \n",
       "M1G         None  None  None  None  \n",
       "M4N         None  None  None  None  \n",
       "M1L         None  None  None  None  \n",
       "M5G         None  None  None  None  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#6.3 Assign row name to postcodes\n",
    "data.index.name = 'PostalCode'\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M3A</td>\n",
       "      <td>North York</td>\n",
       "      <td>Parkwoods</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1G</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Woburn</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M4N</td>\n",
       "      <td>Central Toronto</td>\n",
       "      <td>Lawrence Park</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1L</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Clairlea</td>\n",
       "      <td>Golden Mile</td>\n",
       "      <td>Oakridge</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M5G</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Central Bay Street</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode                 0                   1            2         3  \\\n",
       "0        M3A        North York           Parkwoods         None      None   \n",
       "1        M1G       Scarborough              Woburn         None      None   \n",
       "2        M4N   Central Toronto       Lawrence Park         None      None   \n",
       "3        M1L       Scarborough            Clairlea  Golden Mile  Oakridge   \n",
       "4        M5G  Downtown Toronto  Central Bay Street         None      None   \n",
       "\n",
       "      4     5     6     7     8  \n",
       "0  None  None  None  None  None  \n",
       "1  None  None  None  None  None  \n",
       "2  None  None  None  None  None  \n",
       "3  None  None  None  None  None  \n",
       "4  None  None  None  None  None  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#6.4 Reset index\n",
    "data.reset_index(inplace= True)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M3A</td>\n",
       "      <td>North York</td>\n",
       "      <td>Parkwoods</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1G</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Woburn</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M4N</td>\n",
       "      <td>Central Toronto</td>\n",
       "      <td>Lawrence Park</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1L</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Clairlea</td>\n",
       "      <td>Golden Mile</td>\n",
       "      <td>Oakridge</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M5G</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Central Bay Street</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode           Borough                   1            2         3  \\\n",
       "0        M3A        North York           Parkwoods         None      None   \n",
       "1        M1G       Scarborough              Woburn         None      None   \n",
       "2        M4N   Central Toronto       Lawrence Park         None      None   \n",
       "3        M1L       Scarborough            Clairlea  Golden Mile  Oakridge   \n",
       "4        M5G  Downtown Toronto  Central Bay Street         None      None   \n",
       "\n",
       "      4     5     6     7     8  \n",
       "0  None  None  None  None  None  \n",
       "1  None  None  None  None  None  \n",
       "2  None  None  None  None  None  \n",
       "3  None  None  None  None  None  \n",
       "4  None  None  None  None  None  "
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#6.5 Set column 2 i.e. after Postcode to Borough\n",
    "data.columns.values[1] = 'Borough'#We need to drill down to the array, accessed with .values\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Borough</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>North York</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Scarborough</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Central Toronto</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Scarborough</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Downtown Toronto</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Borough\n",
       "0        North York\n",
       "1       Scarborough\n",
       "2   Central Toronto\n",
       "3       Scarborough\n",
       "4  Downtown Toronto"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#6.6 Convert columns to list form so as to remove hidden characters that interfer with column extraction by name \n",
    "data.columns = data.columns.tolist()\n",
    "data[['Borough']].head() #data[[5]] for the column numbered 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M3A</td>\n",
       "      <td>North York</td>\n",
       "      <td>Parkwoods</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1G</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Woburn</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M4N</td>\n",
       "      <td>Central Toronto</td>\n",
       "      <td>Lawrence Park</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1L</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Clairlea</td>\n",
       "      <td>Golden Mile</td>\n",
       "      <td>Oakridge</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M5G</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Central Bay Street</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode           Borough                   1            2         3 4 5  \\\n",
       "0        M3A        North York           Parkwoods                              \n",
       "1        M1G       Scarborough              Woburn                              \n",
       "2        M4N   Central Toronto       Lawrence Park                              \n",
       "3        M1L       Scarborough            Clairlea  Golden Mile  Oakridge       \n",
       "4        M5G  Downtown Toronto  Central Bay Street                              \n",
       "\n",
       "  6 7 8  \n",
       "0        \n",
       "1        \n",
       "2        \n",
       "3        \n",
       "4        "
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#6.7 Remove None values\n",
    "## It is vital to try this in the beginning when only specific cells have None\n",
    "### It becomes a bit complex if we try to remove None after merging, as each cell then has several values \n",
    "\n",
    "#data_sorted = data.apply(sorted,key=pd.isnull)\n",
    "#data_sorted.head()\n",
    "data_clean = data[~pd.isnull(data).all(1)].fillna('')\n",
    "data_clean.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>Neighbourhood</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M3A</td>\n",
       "      <td>North York</td>\n",
       "      <td>Parkwoods</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>Parkwoods,,,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1G</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Woburn</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>Woburn,,,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M4N</td>\n",
       "      <td>Central Toronto</td>\n",
       "      <td>Lawrence Park</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>Lawrence Park,,,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1L</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Clairlea</td>\n",
       "      <td>Golden Mile</td>\n",
       "      <td>Oakridge</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>Clairlea,Golden Mile,Oakridge,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M5G</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Central Bay Street</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>Central Bay Street,,,,,,</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode           Borough                   1            2         3 4 5  \\\n",
       "0        M3A        North York           Parkwoods                              \n",
       "1        M1G       Scarborough              Woburn                              \n",
       "2        M4N   Central Toronto       Lawrence Park                              \n",
       "3        M1L       Scarborough            Clairlea  Golden Mile  Oakridge       \n",
       "4        M5G  Downtown Toronto  Central Bay Street                              \n",
       "\n",
       "  6 7 8                      Neighbourhood  \n",
       "0                          Parkwoods,,,,,,  \n",
       "1                             Woburn,,,,,,  \n",
       "2                      Lawrence Park,,,,,,  \n",
       "3        Clairlea,Golden Mile,Oakridge,,,,  \n",
       "4                 Central Bay Street,,,,,,  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#6.8 Concatenate the neighbourhood columns\n",
    "## Obtain column position after which concatenation is to take place\n",
    "source_col_loc = data_clean.columns.get_loc('Borough') # column position starts from 0\n",
    "source_col_loc\n",
    "\n",
    "#Create new column that will merge all neighbourhoods\n",
    "data_clean['Neighbourhood'] = data_clean.iloc[:,source_col_loc+1:source_col_loc+8].apply(\n",
    "   lambda x: \",\".join(x.astype(str)), axis=1)\n",
    "data_clean.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "#6.9 Drop columns 1 through 8\n",
    "import numpy as np\n",
    "cols_to_remove = np.arange(1,9)\n",
    "#print(cols_to_remove, '\\n')\n",
    "\n",
    "#Drop\n",
    "data_clean.drop(cols_to_remove, axis=1, inplace = True)\n",
    "#data_clean.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>Neighbourhood</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M3A</td>\n",
       "      <td>North York</td>\n",
       "      <td>Parkwoods,,,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1G</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Woburn,,,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M4N</td>\n",
       "      <td>Central Toronto</td>\n",
       "      <td>Lawrence Park,,,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1L</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Clairlea,Golden Mile,Oakridge,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M5G</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Central Bay Street,,,,,,</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode           Borough                      Neighbourhood\n",
       "0        M3A        North York                    Parkwoods,,,,,,\n",
       "1        M1G       Scarborough                       Woburn,,,,,,\n",
       "2        M4N   Central Toronto                Lawrence Park,,,,,,\n",
       "3        M1L       Scarborough  Clairlea,Golden Mile,Oakridge,,,,\n",
       "4        M5G  Downtown Toronto           Central Bay Street,,,,,,"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_clean.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>Neighbourhood</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>M5A</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Harbourfront,Regent Park,,,,,</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PostalCode           Borough                  Neighbourhood\n",
       "34        M5A  Downtown Toronto  Harbourfront,Regent Park,,,,,"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_clean[data_clean['PostalCode'] == 'M5A']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(103, 3)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_clean.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 7. Obtain geocoordinates of the postal codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "#import geocoder #Not installed\n",
    "#From terminal, install GeoPy\n",
    "#sudo python3 -m pip install geopy\n",
    "#sudo python3 -m pip install geocoder #NOTE: THIS IS ANOTHER PACKAGE!\n",
    "\n",
    "#from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values\n",
    "import folium #Plotting library\n",
    "#import geocoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "#g = geocoder.google('Mountain View, CA')\n",
    "#print(g.latlng)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# initialize your variable to None\n",
    "#lat_lng_coords = None\n",
    "\n",
    "# loop until you get the coordinates\n",
    "#while(lat_lng_coords is None):\n",
    " #   g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))\n",
    "  #  lat_lng_coords = g.latlng\n",
    "\n",
    "#latitude = lat_lng_coords[0]\n",
    "#longitude = lat_lng_coords[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Read direct from provided CSV as the package was not giving results\n",
    "import pandas as pd\n",
    "latlang = pd.read_csv('/Users/peaceful_warrior/Downloads/Geospatial_Coordinates.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Postal Code</th>\n",
       "      <th>Latitude</th>\n",
       "      <th>Longitude</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M1B</td>\n",
       "      <td>43.806686</td>\n",
       "      <td>-79.194353</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1C</td>\n",
       "      <td>43.784535</td>\n",
       "      <td>-79.160497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M1E</td>\n",
       "      <td>43.763573</td>\n",
       "      <td>-79.188711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1G</td>\n",
       "      <td>43.770992</td>\n",
       "      <td>-79.216917</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M1H</td>\n",
       "      <td>43.773136</td>\n",
       "      <td>-79.239476</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Postal Code   Latitude  Longitude\n",
       "0         M1B  43.806686 -79.194353\n",
       "1         M1C  43.784535 -79.160497\n",
       "2         M1E  43.763573 -79.188711\n",
       "3         M1G  43.770992 -79.216917\n",
       "4         M1H  43.773136 -79.239476"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "latlang.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "latlang.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "latlang.columns.values[0] = 'PostalCode'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Latitude</th>\n",
       "      <th>Longitude</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M1B</td>\n",
       "      <td>43.806686</td>\n",
       "      <td>-79.194353</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1C</td>\n",
       "      <td>43.784535</td>\n",
       "      <td>-79.160497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M1E</td>\n",
       "      <td>43.763573</td>\n",
       "      <td>-79.188711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1G</td>\n",
       "      <td>43.770992</td>\n",
       "      <td>-79.216917</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M1H</td>\n",
       "      <td>43.773136</td>\n",
       "      <td>-79.239476</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode   Latitude  Longitude\n",
       "0        M1B  43.806686 -79.194353\n",
       "1        M1C  43.784535 -79.160497\n",
       "2        M1E  43.763573 -79.188711\n",
       "3        M1G  43.770992 -79.216917\n",
       "4        M1H  43.773136 -79.239476"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "latlang.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>Neighbourhood</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M3A</td>\n",
       "      <td>North York</td>\n",
       "      <td>Parkwoods,,,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1G</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Woburn,,,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M4N</td>\n",
       "      <td>Central Toronto</td>\n",
       "      <td>Lawrence Park,,,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1L</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Clairlea,Golden Mile,Oakridge,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M5G</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Central Bay Street,,,,,,</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode           Borough                      Neighbourhood\n",
       "0        M3A        North York                    Parkwoods,,,,,,\n",
       "1        M1G       Scarborough                       Woburn,,,,,,\n",
       "2        M4N   Central Toronto                Lawrence Park,,,,,,\n",
       "3        M1L       Scarborough  Clairlea,Golden Mile,Oakridge,,,,\n",
       "4        M5G  Downtown Toronto           Central Bay Street,,,,,,"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_clean.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>Neighbourhood</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>M1B</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Rouge,Malvern,,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86</th>\n",
       "      <td>M1C</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Highland Creek,Rouge Hill,Port Union,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>M1E</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Guildwood,Morningside,West Hill,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1G</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Woburn,,,,,,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>66</th>\n",
       "      <td>M1H</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Cedarbrae,,,,,,</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PostalCode      Borough                             Neighbourhood\n",
       "6         M1B  Scarborough                        Rouge,Malvern,,,,,\n",
       "86        M1C  Scarborough  Highland Creek,Rouge Hill,Port Union,,,,\n",
       "49        M1E  Scarborough       Guildwood,Morningside,West Hill,,,,\n",
       "1         M1G  Scarborough                              Woburn,,,,,,\n",
       "66        M1H  Scarborough                           Cedarbrae,,,,,,"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_clean_sorted = data_clean.sort_values(by = 'PostalCode', ascending= True)\n",
    "data_clean_sorted.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['PostalCode', 'Borough', 'Neighbourhood'], dtype='object')"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_clean_sorted.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>M1B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86</th>\n",
       "      <td>M1C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>M1E</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>66</th>\n",
       "      <td>M1H</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PostalCode\n",
       "6         M1B\n",
       "86        M1C\n",
       "49        M1E\n",
       "1         M1G\n",
       "66        M1H"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_clean_sorted[['PostalCode']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "\"None of [Index(['PostalCode'], dtype='object')] are in the [columns]\"",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-54-400fad942f23>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mlatlang\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PostalCode'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   2932\u001b[0m                 \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2933\u001b[0m             indexer = self.loc._convert_to_indexer(key, axis=1,\n\u001b[0;32m-> 2934\u001b[0;31m                                                    raise_missing=True)\n\u001b[0m\u001b[1;32m   2935\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2936\u001b[0m         \u001b[0;31m# take() does not accept boolean indexers\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_convert_to_indexer\u001b[0;34m(self, obj, axis, is_setter, raise_missing)\u001b[0m\n\u001b[1;32m   1352\u001b[0m                 kwargs = {'raise_missing': True if is_setter else\n\u001b[1;32m   1353\u001b[0m                           raise_missing}\n\u001b[0;32m-> 1354\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_listlike_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1355\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1356\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m   1159\u001b[0m         self._validate_read_indexer(keyarr, indexer,\n\u001b[1;32m   1160\u001b[0m                                     \u001b[0mo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_number\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1161\u001b[0;31m                                     raise_missing=raise_missing)\n\u001b[0m\u001b[1;32m   1162\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1163\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m   1244\u001b[0m                 raise KeyError(\n\u001b[1;32m   1245\u001b[0m                     u\"None of [{key}] are in the [{axis}]\".format(\n\u001b[0;32m-> 1246\u001b[0;31m                         key=key, axis=self.obj._get_axis_name(axis)))\n\u001b[0m\u001b[1;32m   1247\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1248\u001b[0m             \u001b[0;31m# We (temporarily) allow for some missing keys with .loc, except in\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: \"None of [Index(['PostalCode'], dtype='object')] are in the [columns]\""
     ]
    }
   ],
   "source": [
    "#latlang[['PostalCode']].head()#Throws error, possibly due to hidden character, so convert to list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['PostalCode', 'Latitude', 'Longitude'], dtype='object')"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "latlang.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Convert to list\n",
    "latlang.columns = latlang.columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M1B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M1E</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M1H</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode\n",
       "0        M1B\n",
       "1        M1C\n",
       "2        M1E\n",
       "3        M1G\n",
       "4        M1H"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "latlang[['PostalCode']].head()#Now it displays properly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Merge the two dataframes on PostalCode\n",
    "data_merged = pd.merge(data_clean_sorted, latlang, on='PostalCode')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>Neighbourhood</th>\n",
       "      <th>Latitude</th>\n",
       "      <th>Longitude</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M1B</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Rouge,Malvern,,,,,</td>\n",
       "      <td>43.806686</td>\n",
       "      <td>-79.194353</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1C</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Highland Creek,Rouge Hill,Port Union,,,,</td>\n",
       "      <td>43.784535</td>\n",
       "      <td>-79.160497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M1E</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Guildwood,Morningside,West Hill,,,,</td>\n",
       "      <td>43.763573</td>\n",
       "      <td>-79.188711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1G</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Woburn,,,,,,</td>\n",
       "      <td>43.770992</td>\n",
       "      <td>-79.216917</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M1H</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Cedarbrae,,,,,,</td>\n",
       "      <td>43.773136</td>\n",
       "      <td>-79.239476</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode      Borough                             Neighbourhood  \\\n",
       "0        M1B  Scarborough                        Rouge,Malvern,,,,,   \n",
       "1        M1C  Scarborough  Highland Creek,Rouge Hill,Port Union,,,,   \n",
       "2        M1E  Scarborough       Guildwood,Morningside,West Hill,,,,   \n",
       "3        M1G  Scarborough                              Woburn,,,,,,   \n",
       "4        M1H  Scarborough                           Cedarbrae,,,,,,   \n",
       "\n",
       "    Latitude  Longitude  \n",
       "0  43.806686 -79.194353  \n",
       "1  43.784535 -79.160497  \n",
       "2  43.763573 -79.188711  \n",
       "3  43.770992 -79.216917  \n",
       "4  43.773136 -79.239476  "
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_merged.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>Neighbourhood</th>\n",
       "      <th>Latitude</th>\n",
       "      <th>Longitude</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>M5A</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Harbourfront,Regent Park,,,,,</td>\n",
       "      <td>43.65426</td>\n",
       "      <td>-79.360636</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PostalCode           Borough                  Neighbourhood  Latitude  \\\n",
       "53        M5A  Downtown Toronto  Harbourfront,Regent Park,,,,,  43.65426   \n",
       "\n",
       "    Longitude  \n",
       "53 -79.360636  "
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_merged[data_merged['PostalCode'] == 'M5A']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}