{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Notebook for Toronto City clustering project"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### We will work on extracting borough & neighbourhood information + cleaning + clustering & visualising with Folium"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1. Establish environment"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import urllib.request, urllib.parse, urllib.error\n",
"from urllib.request import urlopen\n",
"from bs4 import BeautifulSoup\n",
"import ssl\n",
"import re"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2. Ignore SSL errors"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"cntxt = ssl.create_default_context()\n",
"cntxt.check_hostname = False\n",
"cntxt.verify_mode = ssl.CERT_NONE"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3. Obtain URL"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Please enter the website to obtain data from: \n"
]
}
],
"source": [
"url = input('Please enter the website to obtain data from: ')\n",
"if len(url) < 1: url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"You want data from >>>\n",
" https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M \n",
"<<<\n"
]
}
],
"source": [
"print('You want data from >>>\\n', url, '\\n<<<')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 4. Open and parse the url"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"#Use a file-handle like object to open the url\n",
"html = urlopen(url, context= cntxt).read() #Read slurps everything in #Note that is additional function written at end\n",
"\n",
"#Use BeautifulSoup to parse\n",
"soup = BeautifulSoup(html, 'html.parser')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 5. Explore"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"bs4.BeautifulSoup"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(soup)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total tags extracted: 294 \n",
"\n",
"\n",
"Total post-codes with borough info: 103\n"
]
}
],
"source": [
"#Try retrieving 'tr' tags\n",
"#print(soup)\n",
"tags = soup('tr')\n",
"print('Total tags extracted: ', len(tags),'\\n')\n",
"\n",
"#Look at the tag extracted\n",
"count = 0 #Initialise counter to count iterations & if rqd, help break out of loop\n",
"dict = {} #Initialise empty dictionary to store postcode as KEY, boroughs and neighbourhoods as VALUES\n",
"list = [] #Initialise empty list to hold the borough + neighbourhood info\n",
"\n",
"#Loop through the tags\n",
"for i in tags:\n",
" count = count + 1 #Increase counter value beginning through each iteration\n",
" if count == 290: break #For limiting output and stopping the loop from running + length of tags for some reason does not correspond to actual number of elements\n",
" \n",
" #Skip if Borough is Not Assigned\n",
" if i.contents[3].text == 'Not assigned':\n",
" continue\n",
" \n",
" #Skip if text is Postcode\n",
" if i.contents[1].text == 'Postcode':\n",
" continue\n",
" \n",
" #If pincode already exists, eg M5A, M6A, append new data\n",
" if i.contents[1].text in dict: \n",
" \n",
" # Append the new data to the existing array at this slot with the following NEW SYNTAX\n",
" #dict[existing_key].append(value)\n",
" dict[i.contents[1].text].append(i.contents[5].text.rstrip())\n",
" \n",
" #If borough exists but not the neighbourhood, run the following \n",
" elif re.search('[a-z]', i.contents[3].text) and re.search('No.*', i.contents[5].text):\n",
" list.append(i.contents[3].text), list.append(i.contents[3].text)\n",
" dict[i.contents[1].text] = list\n",
" else:\n",
" list.append(i.contents[3].text)\n",
" list.append(i.contents[5].text.rstrip())#rstrip gets rid of newline char.\n",
" dict[i.contents[1].text] = list\n",
" \n",
" #Would need to reset the list to empty after each round otherwise each successive iteration will bloat up the key-value\n",
" list = []\n",
" \n",
"\n",
"#print(dict)\n",
"\n",
"print('\\nTotal post-codes with borough info:', len(dict))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 6. Convert to pandas dataframe"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"#6.1 Import the library\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"#6.2 Convert dictionary to dataframe\n",
"data = pd.DataFrame.from_dict(dict, orient= 'index')#index when keys are row labels"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" \n",
" \n",
" M3A | \n",
" North York | \n",
" Parkwoods | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" M1G | \n",
" Scarborough | \n",
" Woburn | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" M4N | \n",
" Central Toronto | \n",
" Lawrence Park | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" M1L | \n",
" Scarborough | \n",
" Clairlea | \n",
" Golden Mile | \n",
" Oakridge | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" M5G | \n",
" Downtown Toronto | \n",
" Central Bay Street | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 \\\n",
"M3A North York Parkwoods None None None None \n",
"M1G Scarborough Woburn None None None None \n",
"M4N Central Toronto Lawrence Park None None None None \n",
"M1L Scarborough Clairlea Golden Mile Oakridge None None \n",
"M5G Downtown Toronto Central Bay Street None None None None \n",
"\n",
" 6 7 8 \n",
"M3A None None None \n",
"M1G None None None \n",
"M4N None None None \n",
"M1L None None None \n",
"M5G None None None "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" PostalCode | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" M3A | \n",
" North York | \n",
" Parkwoods | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" M1G | \n",
" Scarborough | \n",
" Woburn | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" M4N | \n",
" Central Toronto | \n",
" Lawrence Park | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" M1L | \n",
" Scarborough | \n",
" Clairlea | \n",
" Golden Mile | \n",
" Oakridge | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" M5G | \n",
" Downtown Toronto | \n",
" Central Bay Street | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 \\\n",
"PostalCode \n",
"M3A North York Parkwoods None None None \n",
"M1G Scarborough Woburn None None None \n",
"M4N Central Toronto Lawrence Park None None None \n",
"M1L Scarborough Clairlea Golden Mile Oakridge None \n",
"M5G Downtown Toronto Central Bay Street None None None \n",
"\n",
" 5 6 7 8 \n",
"PostalCode \n",
"M3A None None None None \n",
"M1G None None None None \n",
"M4N None None None None \n",
"M1L None None None None \n",
"M5G None None None None "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#6.3 Assign row name to postcodes\n",
"data.index.name = 'PostalCode'\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" M3A | \n",
" North York | \n",
" Parkwoods | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 1 | \n",
" M1G | \n",
" Scarborough | \n",
" Woburn | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 2 | \n",
" M4N | \n",
" Central Toronto | \n",
" Lawrence Park | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 3 | \n",
" M1L | \n",
" Scarborough | \n",
" Clairlea | \n",
" Golden Mile | \n",
" Oakridge | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 4 | \n",
" M5G | \n",
" Downtown Toronto | \n",
" Central Bay Street | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode 0 1 2 3 \\\n",
"0 M3A North York Parkwoods None None \n",
"1 M1G Scarborough Woburn None None \n",
"2 M4N Central Toronto Lawrence Park None None \n",
"3 M1L Scarborough Clairlea Golden Mile Oakridge \n",
"4 M5G Downtown Toronto Central Bay Street None None \n",
"\n",
" 4 5 6 7 8 \n",
"0 None None None None None \n",
"1 None None None None None \n",
"2 None None None None None \n",
"3 None None None None None \n",
"4 None None None None None "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#6.4 Reset index\n",
"data.reset_index(inplace= True)\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
" Borough | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" M3A | \n",
" North York | \n",
" Parkwoods | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 1 | \n",
" M1G | \n",
" Scarborough | \n",
" Woburn | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 2 | \n",
" M4N | \n",
" Central Toronto | \n",
" Lawrence Park | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 3 | \n",
" M1L | \n",
" Scarborough | \n",
" Clairlea | \n",
" Golden Mile | \n",
" Oakridge | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 4 | \n",
" M5G | \n",
" Downtown Toronto | \n",
" Central Bay Street | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode Borough 1 2 3 \\\n",
"0 M3A North York Parkwoods None None \n",
"1 M1G Scarborough Woburn None None \n",
"2 M4N Central Toronto Lawrence Park None None \n",
"3 M1L Scarborough Clairlea Golden Mile Oakridge \n",
"4 M5G Downtown Toronto Central Bay Street None None \n",
"\n",
" 4 5 6 7 8 \n",
"0 None None None None None \n",
"1 None None None None None \n",
"2 None None None None None \n",
"3 None None None None None \n",
"4 None None None None None "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#6.5 Set column 2 i.e. after Postcode to Borough\n",
"data.columns.values[1] = 'Borough'#We need to drill down to the array, accessed with .values\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Borough | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" North York | \n",
"
\n",
" \n",
" 1 | \n",
" Scarborough | \n",
"
\n",
" \n",
" 2 | \n",
" Central Toronto | \n",
"
\n",
" \n",
" 3 | \n",
" Scarborough | \n",
"
\n",
" \n",
" 4 | \n",
" Downtown Toronto | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Borough\n",
"0 North York\n",
"1 Scarborough\n",
"2 Central Toronto\n",
"3 Scarborough\n",
"4 Downtown Toronto"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#6.6 Convert columns to list form so as to remove hidden characters that interfer with column extraction by name \n",
"data.columns = data.columns.tolist()\n",
"data[['Borough']].head() #data[[5]] for the column numbered 5"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
" Borough | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" M3A | \n",
" North York | \n",
" Parkwoods | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" 1 | \n",
" M1G | \n",
" Scarborough | \n",
" Woburn | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" 2 | \n",
" M4N | \n",
" Central Toronto | \n",
" Lawrence Park | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" 3 | \n",
" M1L | \n",
" Scarborough | \n",
" Clairlea | \n",
" Golden Mile | \n",
" Oakridge | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" 4 | \n",
" M5G | \n",
" Downtown Toronto | \n",
" Central Bay Street | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode Borough 1 2 3 4 5 \\\n",
"0 M3A North York Parkwoods \n",
"1 M1G Scarborough Woburn \n",
"2 M4N Central Toronto Lawrence Park \n",
"3 M1L Scarborough Clairlea Golden Mile Oakridge \n",
"4 M5G Downtown Toronto Central Bay Street \n",
"\n",
" 6 7 8 \n",
"0 \n",
"1 \n",
"2 \n",
"3 \n",
"4 "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#6.7 Remove None values\n",
"## It is vital to try this in the beginning when only specific cells have None\n",
"### It becomes a bit complex if we try to remove None after merging, as each cell then has several values \n",
"\n",
"#data_sorted = data.apply(sorted,key=pd.isnull)\n",
"#data_sorted.head()\n",
"data_clean = data[~pd.isnull(data).all(1)].fillna('')\n",
"data_clean.head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
" Borough | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" Neighbourhood | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" M3A | \n",
" North York | \n",
" Parkwoods | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" Parkwoods,,,,,, | \n",
"
\n",
" \n",
" 1 | \n",
" M1G | \n",
" Scarborough | \n",
" Woburn | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" Woburn,,,,,, | \n",
"
\n",
" \n",
" 2 | \n",
" M4N | \n",
" Central Toronto | \n",
" Lawrence Park | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" Lawrence Park,,,,,, | \n",
"
\n",
" \n",
" 3 | \n",
" M1L | \n",
" Scarborough | \n",
" Clairlea | \n",
" Golden Mile | \n",
" Oakridge | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" Clairlea,Golden Mile,Oakridge,,,, | \n",
"
\n",
" \n",
" 4 | \n",
" M5G | \n",
" Downtown Toronto | \n",
" Central Bay Street | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" Central Bay Street,,,,,, | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode Borough 1 2 3 4 5 \\\n",
"0 M3A North York Parkwoods \n",
"1 M1G Scarborough Woburn \n",
"2 M4N Central Toronto Lawrence Park \n",
"3 M1L Scarborough Clairlea Golden Mile Oakridge \n",
"4 M5G Downtown Toronto Central Bay Street \n",
"\n",
" 6 7 8 Neighbourhood \n",
"0 Parkwoods,,,,,, \n",
"1 Woburn,,,,,, \n",
"2 Lawrence Park,,,,,, \n",
"3 Clairlea,Golden Mile,Oakridge,,,, \n",
"4 Central Bay Street,,,,,, "
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#6.8 Concatenate the neighbourhood columns\n",
"## Obtain column position after which concatenation is to take place\n",
"source_col_loc = data_clean.columns.get_loc('Borough') # column position starts from 0\n",
"source_col_loc\n",
"\n",
"#Create new column that will merge all neighbourhoods\n",
"data_clean['Neighbourhood'] = data_clean.iloc[:,source_col_loc+1:source_col_loc+8].apply(\n",
" lambda x: \",\".join(x.astype(str)), axis=1)\n",
"data_clean.head()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"#6.9 Drop columns 1 through 8\n",
"import numpy as np\n",
"cols_to_remove = np.arange(1,9)\n",
"#print(cols_to_remove, '\\n')\n",
"\n",
"#Drop\n",
"data_clean.drop(cols_to_remove, axis=1, inplace = True)\n",
"#data_clean.head()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
" Borough | \n",
" Neighbourhood | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" M3A | \n",
" North York | \n",
" Parkwoods,,,,,, | \n",
"
\n",
" \n",
" 1 | \n",
" M1G | \n",
" Scarborough | \n",
" Woburn,,,,,, | \n",
"
\n",
" \n",
" 2 | \n",
" M4N | \n",
" Central Toronto | \n",
" Lawrence Park,,,,,, | \n",
"
\n",
" \n",
" 3 | \n",
" M1L | \n",
" Scarborough | \n",
" Clairlea,Golden Mile,Oakridge,,,, | \n",
"
\n",
" \n",
" 4 | \n",
" M5G | \n",
" Downtown Toronto | \n",
" Central Bay Street,,,,,, | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode Borough Neighbourhood\n",
"0 M3A North York Parkwoods,,,,,,\n",
"1 M1G Scarborough Woburn,,,,,,\n",
"2 M4N Central Toronto Lawrence Park,,,,,,\n",
"3 M1L Scarborough Clairlea,Golden Mile,Oakridge,,,,\n",
"4 M5G Downtown Toronto Central Bay Street,,,,,,"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_clean.head()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
" Borough | \n",
" Neighbourhood | \n",
"
\n",
" \n",
" \n",
" \n",
" 34 | \n",
" M5A | \n",
" Downtown Toronto | \n",
" Harbourfront,Regent Park,,,,, | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode Borough Neighbourhood\n",
"34 M5A Downtown Toronto Harbourfront,Regent Park,,,,,"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_clean[data_clean['PostalCode'] == 'M5A']"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(103, 3)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_clean.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 7. Obtain geocoordinates of the postal codes"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"#import geocoder #Not installed\n",
"#From terminal, install GeoPy\n",
"#sudo python3 -m pip install geopy\n",
"#sudo python3 -m pip install geocoder #NOTE: THIS IS ANOTHER PACKAGE!\n",
"\n",
"#from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values\n",
"import folium #Plotting library\n",
"#import geocoder"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"#g = geocoder.google('Mountain View, CA')\n",
"#print(g.latlng)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# initialize your variable to None\n",
"#lat_lng_coords = None\n",
"\n",
"# loop until you get the coordinates\n",
"#while(lat_lng_coords is None):\n",
" # g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))\n",
" # lat_lng_coords = g.latlng\n",
"\n",
"#latitude = lat_lng_coords[0]\n",
"#longitude = lat_lng_coords[1]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"#Read direct from provided CSV as the package was not giving results\n",
"import pandas as pd\n",
"latlang = pd.read_csv('/Users/peaceful_warrior/Downloads/Geospatial_Coordinates.csv')"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Postal Code | \n",
" Latitude | \n",
" Longitude | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" M1B | \n",
" 43.806686 | \n",
" -79.194353 | \n",
"
\n",
" \n",
" 1 | \n",
" M1C | \n",
" 43.784535 | \n",
" -79.160497 | \n",
"
\n",
" \n",
" 2 | \n",
" M1E | \n",
" 43.763573 | \n",
" -79.188711 | \n",
"
\n",
" \n",
" 3 | \n",
" M1G | \n",
" 43.770992 | \n",
" -79.216917 | \n",
"
\n",
" \n",
" 4 | \n",
" M1H | \n",
" 43.773136 | \n",
" -79.239476 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Postal Code Latitude Longitude\n",
"0 M1B 43.806686 -79.194353\n",
"1 M1C 43.784535 -79.160497\n",
"2 M1E 43.763573 -79.188711\n",
"3 M1G 43.770992 -79.216917\n",
"4 M1H 43.773136 -79.239476"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"latlang.head()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"latlang.columns"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"latlang.columns.values[0] = 'PostalCode'"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
" Latitude | \n",
" Longitude | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" M1B | \n",
" 43.806686 | \n",
" -79.194353 | \n",
"
\n",
" \n",
" 1 | \n",
" M1C | \n",
" 43.784535 | \n",
" -79.160497 | \n",
"
\n",
" \n",
" 2 | \n",
" M1E | \n",
" 43.763573 | \n",
" -79.188711 | \n",
"
\n",
" \n",
" 3 | \n",
" M1G | \n",
" 43.770992 | \n",
" -79.216917 | \n",
"
\n",
" \n",
" 4 | \n",
" M1H | \n",
" 43.773136 | \n",
" -79.239476 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode Latitude Longitude\n",
"0 M1B 43.806686 -79.194353\n",
"1 M1C 43.784535 -79.160497\n",
"2 M1E 43.763573 -79.188711\n",
"3 M1G 43.770992 -79.216917\n",
"4 M1H 43.773136 -79.239476"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"latlang.head()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
" Borough | \n",
" Neighbourhood | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" M3A | \n",
" North York | \n",
" Parkwoods,,,,,, | \n",
"
\n",
" \n",
" 1 | \n",
" M1G | \n",
" Scarborough | \n",
" Woburn,,,,,, | \n",
"
\n",
" \n",
" 2 | \n",
" M4N | \n",
" Central Toronto | \n",
" Lawrence Park,,,,,, | \n",
"
\n",
" \n",
" 3 | \n",
" M1L | \n",
" Scarborough | \n",
" Clairlea,Golden Mile,Oakridge,,,, | \n",
"
\n",
" \n",
" 4 | \n",
" M5G | \n",
" Downtown Toronto | \n",
" Central Bay Street,,,,,, | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode Borough Neighbourhood\n",
"0 M3A North York Parkwoods,,,,,,\n",
"1 M1G Scarborough Woburn,,,,,,\n",
"2 M4N Central Toronto Lawrence Park,,,,,,\n",
"3 M1L Scarborough Clairlea,Golden Mile,Oakridge,,,,\n",
"4 M5G Downtown Toronto Central Bay Street,,,,,,"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_clean.head()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
" Borough | \n",
" Neighbourhood | \n",
"
\n",
" \n",
" \n",
" \n",
" 6 | \n",
" M1B | \n",
" Scarborough | \n",
" Rouge,Malvern,,,,, | \n",
"
\n",
" \n",
" 86 | \n",
" M1C | \n",
" Scarborough | \n",
" Highland Creek,Rouge Hill,Port Union,,,, | \n",
"
\n",
" \n",
" 49 | \n",
" M1E | \n",
" Scarborough | \n",
" Guildwood,Morningside,West Hill,,,, | \n",
"
\n",
" \n",
" 1 | \n",
" M1G | \n",
" Scarborough | \n",
" Woburn,,,,,, | \n",
"
\n",
" \n",
" 66 | \n",
" M1H | \n",
" Scarborough | \n",
" Cedarbrae,,,,,, | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode Borough Neighbourhood\n",
"6 M1B Scarborough Rouge,Malvern,,,,,\n",
"86 M1C Scarborough Highland Creek,Rouge Hill,Port Union,,,,\n",
"49 M1E Scarborough Guildwood,Morningside,West Hill,,,,\n",
"1 M1G Scarborough Woburn,,,,,,\n",
"66 M1H Scarborough Cedarbrae,,,,,,"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_clean_sorted = data_clean.sort_values(by = 'PostalCode', ascending= True)\n",
"data_clean_sorted.head()"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['PostalCode', 'Borough', 'Neighbourhood'], dtype='object')"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_clean_sorted.columns"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
"
\n",
" \n",
" \n",
" \n",
" 6 | \n",
" M1B | \n",
"
\n",
" \n",
" 86 | \n",
" M1C | \n",
"
\n",
" \n",
" 49 | \n",
" M1E | \n",
"
\n",
" \n",
" 1 | \n",
" M1G | \n",
"
\n",
" \n",
" 66 | \n",
" M1H | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode\n",
"6 M1B\n",
"86 M1C\n",
"49 M1E\n",
"1 M1G\n",
"66 M1H"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_clean_sorted[['PostalCode']].head()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "\"None of [Index(['PostalCode'], dtype='object')] are in the [columns]\"",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mlatlang\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PostalCode'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2932\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2933\u001b[0m indexer = self.loc._convert_to_indexer(key, axis=1,\n\u001b[0;32m-> 2934\u001b[0;31m raise_missing=True)\n\u001b[0m\u001b[1;32m 2935\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2936\u001b[0m \u001b[0;31m# take() does not accept boolean indexers\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_convert_to_indexer\u001b[0;34m(self, obj, axis, is_setter, raise_missing)\u001b[0m\n\u001b[1;32m 1352\u001b[0m kwargs = {'raise_missing': True if is_setter else\n\u001b[1;32m 1353\u001b[0m raise_missing}\n\u001b[0;32m-> 1354\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_listlike_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1355\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1356\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1159\u001b[0m self._validate_read_indexer(keyarr, indexer,\n\u001b[1;32m 1160\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_number\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1161\u001b[0;31m raise_missing=raise_missing)\n\u001b[0m\u001b[1;32m 1162\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1163\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1244\u001b[0m raise KeyError(\n\u001b[1;32m 1245\u001b[0m u\"None of [{key}] are in the [{axis}]\".format(\n\u001b[0;32m-> 1246\u001b[0;31m key=key, axis=self.obj._get_axis_name(axis)))\n\u001b[0m\u001b[1;32m 1247\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1248\u001b[0m \u001b[0;31m# We (temporarily) allow for some missing keys with .loc, except in\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: \"None of [Index(['PostalCode'], dtype='object')] are in the [columns]\""
]
}
],
"source": [
"#latlang[['PostalCode']].head()#Throws error, possibly due to hidden character, so convert to list"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['PostalCode', 'Latitude', 'Longitude'], dtype='object')"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"latlang.columns"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"#Convert to list\n",
"latlang.columns = latlang.columns.tolist()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" M1B | \n",
"
\n",
" \n",
" 1 | \n",
" M1C | \n",
"
\n",
" \n",
" 2 | \n",
" M1E | \n",
"
\n",
" \n",
" 3 | \n",
" M1G | \n",
"
\n",
" \n",
" 4 | \n",
" M1H | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode\n",
"0 M1B\n",
"1 M1C\n",
"2 M1E\n",
"3 M1G\n",
"4 M1H"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"latlang[['PostalCode']].head()#Now it displays properly"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"#Merge the two dataframes on PostalCode\n",
"data_merged = pd.merge(data_clean_sorted, latlang, on='PostalCode')"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
" Borough | \n",
" Neighbourhood | \n",
" Latitude | \n",
" Longitude | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" M1B | \n",
" Scarborough | \n",
" Rouge,Malvern,,,,, | \n",
" 43.806686 | \n",
" -79.194353 | \n",
"
\n",
" \n",
" 1 | \n",
" M1C | \n",
" Scarborough | \n",
" Highland Creek,Rouge Hill,Port Union,,,, | \n",
" 43.784535 | \n",
" -79.160497 | \n",
"
\n",
" \n",
" 2 | \n",
" M1E | \n",
" Scarborough | \n",
" Guildwood,Morningside,West Hill,,,, | \n",
" 43.763573 | \n",
" -79.188711 | \n",
"
\n",
" \n",
" 3 | \n",
" M1G | \n",
" Scarborough | \n",
" Woburn,,,,,, | \n",
" 43.770992 | \n",
" -79.216917 | \n",
"
\n",
" \n",
" 4 | \n",
" M1H | \n",
" Scarborough | \n",
" Cedarbrae,,,,,, | \n",
" 43.773136 | \n",
" -79.239476 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode Borough Neighbourhood \\\n",
"0 M1B Scarborough Rouge,Malvern,,,,, \n",
"1 M1C Scarborough Highland Creek,Rouge Hill,Port Union,,,, \n",
"2 M1E Scarborough Guildwood,Morningside,West Hill,,,, \n",
"3 M1G Scarborough Woburn,,,,,, \n",
"4 M1H Scarborough Cedarbrae,,,,,, \n",
"\n",
" Latitude Longitude \n",
"0 43.806686 -79.194353 \n",
"1 43.784535 -79.160497 \n",
"2 43.763573 -79.188711 \n",
"3 43.770992 -79.216917 \n",
"4 43.773136 -79.239476 "
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_merged.head()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PostalCode | \n",
" Borough | \n",
" Neighbourhood | \n",
" Latitude | \n",
" Longitude | \n",
"
\n",
" \n",
" \n",
" \n",
" 53 | \n",
" M5A | \n",
" Downtown Toronto | \n",
" Harbourfront,Regent Park,,,,, | \n",
" 43.65426 | \n",
" -79.360636 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PostalCode Borough Neighbourhood Latitude \\\n",
"53 M5A Downtown Toronto Harbourfront,Regent Park,,,,, 43.65426 \n",
"\n",
" Longitude \n",
"53 -79.360636 "
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_merged[data_merged['PostalCode'] == 'M5A']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}