# Notebook for Toronto City clustering project

##### We will work on extracting borough & neighbourhood information + cleaning + clustering & visualising with Folium

#### 1. Establish environment

In [16]:
import urllib.request, urllib.parse, urllib.error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re

#### 2. Ignore SSL errors

In [17]:
cntxt = ssl.create_default_context()
cntxt.check_hostname = False
cntxt.verify_mode = ssl.CERT_NONE

#### 3. Obtain URL

In [19]:
url = input('Please enter the website to obtain data from: ')
if len(url) < 1: url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

Please enter the website to obtain data from:  


In [20]:
print('You want data from >>>\n', url, '\n<<<')

You want data from >>>
 https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M 
<<<


#### 4. Open and parse the url

In [21]:
#Use a file-handle like object to open the url
html = urlopen(url, context= cntxt).read() #Read slurps everything in #Note that is additional function written at end

#Use BeautifulSoup to parse
soup = BeautifulSoup(html, 'html.parser')

#### 5. Explore

In [22]:
type(soup)

bs4.BeautifulSoup

In [23]:
#Try retrieving 'tr' tags
#print(soup)
tags = soup('tr')
print('Total tags extracted: ', len(tags),'\n')

#Look at the tag extracted
count = 0 #Initialise counter to count iterations & if rqd, help break out of loop
dict = {} #Initialise empty dictionary to store postcode as KEY, boroughs and neighbourhoods as VALUES
list = [] #Initialise empty list to hold the borough + neighbourhood info

#Loop through the tags
for i in tags:
    count = count + 1 #Increase counter value beginning through each iteration
    if count == 290: break #For limiting output and stopping the loop from running + length of tags for some reason does not correspond to actual number of elements
    
    #Skip if Borough is Not Assigned
    if i.contents[3].text == 'Not assigned':
        continue
        
    #Skip if text is Postcode
    if i.contents[1].text == 'Postcode':
        continue
    
    #If pincode already exists, eg M5A, M6A, append new data
    if i.contents[1].text in dict: 
        
        # Append the new data to the existing array at this slot with the following NEW SYNTAX
        #dict[existing_key].append(value)
        dict[i.contents[1].text].append(i.contents[5].text.rstrip())
        
    #If borough exists but not the neighbourhood, run the following    
    elif re.search('[a-z]', i.contents[3].text) and re.search('No.*', i.contents[5].text):
        list.append(i.contents[3].text), list.append(i.contents[3].text)
        dict[i.contents[1].text] = list
    else:
        list.append(i.contents[3].text)
        list.append(i.contents[5].text.rstrip())#rstrip gets rid of newline char.
        dict[i.contents[1].text] = list
    
    #Would need to reset the list to empty after each round otherwise each successive iteration will bloat up the key-value
    list = []
    

#print(dict)

print('\nTotal post-codes with borough info:', len(dict))

Total tags extracted:  294 


Total post-codes with borough info: 103


#### 6. Convert to pandas dataframe

In [24]:
#6.1 Import the library
import pandas as pd

In [25]:
#6.2 Convert dictionary to dataframe
data = pd.DataFrame.from_dict(dict, orient= 'index')#index when keys are row labels

In [26]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
M3A,North York,Parkwoods,,,,,,,
M1G,Scarborough,Woburn,,,,,,,
M4N,Central Toronto,Lawrence Park,,,,,,,
M1L,Scarborough,Clairlea,Golden Mile,Oakridge,,,,,
M5G,Downtown Toronto,Central Bay Street,,,,,,,


In [27]:
#6.3 Assign row name to postcodes
data.index.name = 'PostalCode'
data.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
M3A,North York,Parkwoods,,,,,,,
M1G,Scarborough,Woburn,,,,,,,
M4N,Central Toronto,Lawrence Park,,,,,,,
M1L,Scarborough,Clairlea,Golden Mile,Oakridge,,,,,
M5G,Downtown Toronto,Central Bay Street,,,,,,,


In [28]:
#6.4 Reset index
data.reset_index(inplace= True)
data.head()

Unnamed: 0,PostalCode,0,1,2,3,4,5,6,7,8
0,M3A,North York,Parkwoods,,,,,,,
1,M1G,Scarborough,Woburn,,,,,,,
2,M4N,Central Toronto,Lawrence Park,,,,,,,
3,M1L,Scarborough,Clairlea,Golden Mile,Oakridge,,,,,
4,M5G,Downtown Toronto,Central Bay Street,,,,,,,


In [29]:
#6.5 Set column 2 i.e. after Postcode to Borough
data.columns.values[1] = 'Borough'#We need to drill down to the array, accessed with .values
data.head()

Unnamed: 0,PostalCode,Borough,1,2,3,4,5,6,7,8
0,M3A,North York,Parkwoods,,,,,,,
1,M1G,Scarborough,Woburn,,,,,,,
2,M4N,Central Toronto,Lawrence Park,,,,,,,
3,M1L,Scarborough,Clairlea,Golden Mile,Oakridge,,,,,
4,M5G,Downtown Toronto,Central Bay Street,,,,,,,


In [30]:
#6.6 Convert columns to list form so as to remove hidden characters that interfer with column extraction by name 
data.columns = data.columns.tolist()
data[['Borough']].head() #data[[5]] for the column numbered 5

Unnamed: 0,Borough
0,North York
1,Scarborough
2,Central Toronto
3,Scarborough
4,Downtown Toronto


In [31]:
#6.7 Remove None values
## It is vital to try this in the beginning when only specific cells have None
### It becomes a bit complex if we try to remove None after merging, as each cell then has several values 

#data_sorted = data.apply(sorted,key=pd.isnull)
#data_sorted.head()
data_clean = data[~pd.isnull(data).all(1)].fillna('')
data_clean.head()

Unnamed: 0,PostalCode,Borough,1,2,3,4,5,6,7,8
0,M3A,North York,Parkwoods,,,,,,,
1,M1G,Scarborough,Woburn,,,,,,,
2,M4N,Central Toronto,Lawrence Park,,,,,,,
3,M1L,Scarborough,Clairlea,Golden Mile,Oakridge,,,,,
4,M5G,Downtown Toronto,Central Bay Street,,,,,,,


In [32]:
#6.8 Concatenate the neighbourhood columns
## Obtain column position after which concatenation is to take place
source_col_loc = data_clean.columns.get_loc('Borough') # column position starts from 0
source_col_loc

#Create new column that will merge all neighbourhoods
data_clean['Neighbourhood'] = data_clean.iloc[:,source_col_loc+1:source_col_loc+8].apply(
   lambda x: ",".join(x.astype(str)), axis=1)
data_clean.head()

Unnamed: 0,PostalCode,Borough,1,2,3,4,5,6,7,8,Neighbourhood
0,M3A,North York,Parkwoods,,,,,,,,"Parkwoods,,,,,,"
1,M1G,Scarborough,Woburn,,,,,,,,"Woburn,,,,,,"
2,M4N,Central Toronto,Lawrence Park,,,,,,,,"Lawrence Park,,,,,,"
3,M1L,Scarborough,Clairlea,Golden Mile,Oakridge,,,,,,"Clairlea,Golden Mile,Oakridge,,,,"
4,M5G,Downtown Toronto,Central Bay Street,,,,,,,,"Central Bay Street,,,,,,"


In [33]:
#6.9 Drop columns 1 through 8
import numpy as np
cols_to_remove = np.arange(1,9)
#print(cols_to_remove, '\n')

#Drop
data_clean.drop(cols_to_remove, axis=1, inplace = True)
#data_clean.head()

In [34]:
data_clean.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,"Parkwoods,,,,,,"
1,M1G,Scarborough,"Woburn,,,,,,"
2,M4N,Central Toronto,"Lawrence Park,,,,,,"
3,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge,,,,"
4,M5G,Downtown Toronto,"Central Bay Street,,,,,,"


In [35]:
data_clean[data_clean['PostalCode'] == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighbourhood
34,M5A,Downtown Toronto,"Harbourfront,Regent Park,,,,,"


In [36]:
data_clean.shape

(103, 3)

#### 7. Obtain geocoordinates of the postal codes

In [37]:
#import geocoder #Not installed
#From terminal, install GeoPy
#sudo python3 -m pip install geopy
#sudo python3 -m pip install geocoder #NOTE: THIS IS ANOTHER PACKAGE!

#from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
import folium #Plotting library
#import geocoder

In [39]:
#g = geocoder.google('Mountain View, CA')
#print(g.latlng)

In [None]:
# initialize your variable to None
#lat_lng_coords = None

# loop until you get the coordinates
#while(lat_lng_coords is None):
 #   g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))
  #  lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

In [40]:
#Read direct from provided CSV as the package was not giving results
import pandas as pd
latlang = pd.read_csv('/Users/peaceful_warrior/Downloads/Geospatial_Coordinates.csv')

In [41]:
latlang.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [43]:
latlang.columns

Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')

In [45]:
latlang.columns.values[0] = 'PostalCode'

In [46]:
latlang.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [42]:
data_clean.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,"Parkwoods,,,,,,"
1,M1G,Scarborough,"Woburn,,,,,,"
2,M4N,Central Toronto,"Lawrence Park,,,,,,"
3,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge,,,,"
4,M5G,Downtown Toronto,"Central Bay Street,,,,,,"


In [48]:
data_clean_sorted = data_clean.sort_values(by = 'PostalCode', ascending= True)
data_clean_sorted.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
6,M1B,Scarborough,"Rouge,Malvern,,,,,"
86,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union,,,,"
49,M1E,Scarborough,"Guildwood,Morningside,West Hill,,,,"
1,M1G,Scarborough,"Woburn,,,,,,"
66,M1H,Scarborough,"Cedarbrae,,,,,,"


In [51]:
data_clean_sorted.columns

Index(['PostalCode', 'Borough', 'Neighbourhood'], dtype='object')

In [53]:
data_clean_sorted[['PostalCode']].head()

Unnamed: 0,PostalCode
6,M1B
86,M1C
49,M1E
1,M1G
66,M1H


In [54]:
#latlang[['PostalCode']].head()#Throws error, possibly due to hidden character, so convert to list

KeyError: "None of [Index(['PostalCode'], dtype='object')] are in the [columns]"

In [55]:
latlang.columns

Index(['PostalCode', 'Latitude', 'Longitude'], dtype='object')

In [56]:
#Convert to list
latlang.columns = latlang.columns.tolist()

In [62]:
latlang[['PostalCode']].head()#Now it displays properly

Unnamed: 0,PostalCode
0,M1B
1,M1C
2,M1E
3,M1G
4,M1H


In [58]:
#Merge the two dataframes on PostalCode
data_merged = pd.merge(data_clean_sorted, latlang, on='PostalCode')

In [61]:
data_merged.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern,,,,,",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union,,,,",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill,,,,",43.763573,-79.188711
3,M1G,Scarborough,"Woburn,,,,,,",43.770992,-79.216917
4,M1H,Scarborough,"Cedarbrae,,,,,,",43.773136,-79.239476


In [60]:
data_merged[data_merged['PostalCode'] == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
53,M5A,Downtown Toronto,"Harbourfront,Regent Park,,,,,",43.65426,-79.360636
