{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Notebook for Toronto City clustering project" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### We will work on extracting borough & neighbourhood information + cleaning + clustering & visualising with Folium" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 1. Establish environment" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "import urllib.request, urllib.parse, urllib.error\n", "from urllib.request import urlopen\n", "from bs4 import BeautifulSoup\n", "import ssl\n", "import re" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 2. Ignore SSL errors" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "cntxt = ssl.create_default_context()\n", "cntxt.check_hostname = False\n", "cntxt.verify_mode = ssl.CERT_NONE" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 3. Obtain URL" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdin", "output_type": "stream", "text": [ "Please enter the website to obtain data from: \n" ] } ], "source": [ "url = input('Please enter the website to obtain data from: ')\n", "if len(url) < 1: url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "You want data from >>>\n", " https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M \n", "<<<\n" ] } ], "source": [ "print('You want data from >>>\\n', url, '\\n<<<')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 4. Open and parse the url" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "#Use a file-handle like object to open the url\n", "html = urlopen(url, context= cntxt).read() #Read slurps everything in #Note that is additional function written at end\n", "\n", "#Use BeautifulSoup to parse\n", "soup = BeautifulSoup(html, 'html.parser')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 5. Explore" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "bs4.BeautifulSoup" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(soup)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total tags extracted: 294 \n", "\n", "\n", "Total post-codes with borough info: 103\n" ] } ], "source": [ "#Try retrieving 'tr' tags\n", "#print(soup)\n", "tags = soup('tr')\n", "print('Total tags extracted: ', len(tags),'\\n')\n", "\n", "#Look at the tag extracted\n", "count = 0 #Initialise counter to count iterations & if rqd, help break out of loop\n", "dict = {} #Initialise empty dictionary to store postcode as KEY, boroughs and neighbourhoods as VALUES\n", "list = [] #Initialise empty list to hold the borough + neighbourhood info\n", "\n", "#Loop through the tags\n", "for i in tags:\n", " count = count + 1 #Increase counter value beginning through each iteration\n", " if count == 290: break #For limiting output and stopping the loop from running + length of tags for some reason does not correspond to actual number of elements\n", " \n", " #Skip if Borough is Not Assigned\n", " if i.contents[3].text == 'Not assigned':\n", " continue\n", " \n", " #Skip if text is Postcode\n", " if i.contents[1].text == 'Postcode':\n", " continue\n", " \n", " #If pincode already exists, eg M5A, M6A, append new data\n", " if i.contents[1].text in dict: \n", " \n", " # Append the new data to the existing array at this slot with the following NEW SYNTAX\n", " #dict[existing_key].append(value)\n", " dict[i.contents[1].text].append(i.contents[5].text.rstrip())\n", " \n", " #If borough exists but not the neighbourhood, run the following \n", " elif re.search('[a-z]', i.contents[3].text) and re.search('No.*', i.contents[5].text):\n", " list.append(i.contents[3].text), list.append(i.contents[3].text)\n", " dict[i.contents[1].text] = list\n", " else:\n", " list.append(i.contents[3].text)\n", " list.append(i.contents[5].text.rstrip())#rstrip gets rid of newline char.\n", " dict[i.contents[1].text] = list\n", " \n", " #Would need to reset the list to empty after each round otherwise each successive iteration will bloat up the key-value\n", " list = []\n", " \n", "\n", "#print(dict)\n", "\n", "print('\\nTotal post-codes with borough info:', len(dict))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 6. Convert to pandas dataframe" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "#6.1 Import the library\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "#6.2 Convert dictionary to dataframe\n", "data = pd.DataFrame.from_dict(dict, orient= 'index')#index when keys are row labels" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345678
M3ANorth YorkParkwoodsNoneNoneNoneNoneNoneNoneNone
M1GScarboroughWoburnNoneNoneNoneNoneNoneNoneNone
M4NCentral TorontoLawrence ParkNoneNoneNoneNoneNoneNoneNone
M1LScarboroughClairleaGolden MileOakridgeNoneNoneNoneNoneNone
M5GDowntown TorontoCentral Bay StreetNoneNoneNoneNoneNoneNoneNone
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 \\\n", "M3A North York Parkwoods None None None None \n", "M1G Scarborough Woburn None None None None \n", "M4N Central Toronto Lawrence Park None None None None \n", "M1L Scarborough Clairlea Golden Mile Oakridge None None \n", "M5G Downtown Toronto Central Bay Street None None None None \n", "\n", " 6 7 8 \n", "M3A None None None \n", "M1G None None None \n", "M4N None None None \n", "M1L None None None \n", "M5G None None None " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345678
PostalCode
M3ANorth YorkParkwoodsNoneNoneNoneNoneNoneNoneNone
M1GScarboroughWoburnNoneNoneNoneNoneNoneNoneNone
M4NCentral TorontoLawrence ParkNoneNoneNoneNoneNoneNoneNone
M1LScarboroughClairleaGolden MileOakridgeNoneNoneNoneNoneNone
M5GDowntown TorontoCentral Bay StreetNoneNoneNoneNoneNoneNoneNone
\n", "
" ], "text/plain": [ " 0 1 2 3 4 \\\n", "PostalCode \n", "M3A North York Parkwoods None None None \n", "M1G Scarborough Woburn None None None \n", "M4N Central Toronto Lawrence Park None None None \n", "M1L Scarborough Clairlea Golden Mile Oakridge None \n", "M5G Downtown Toronto Central Bay Street None None None \n", "\n", " 5 6 7 8 \n", "PostalCode \n", "M3A None None None None \n", "M1G None None None None \n", "M4N None None None None \n", "M1L None None None None \n", "M5G None None None None " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#6.3 Assign row name to postcodes\n", "data.index.name = 'PostalCode'\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCode012345678
0M3ANorth YorkParkwoodsNoneNoneNoneNoneNoneNoneNone
1M1GScarboroughWoburnNoneNoneNoneNoneNoneNoneNone
2M4NCentral TorontoLawrence ParkNoneNoneNoneNoneNoneNoneNone
3M1LScarboroughClairleaGolden MileOakridgeNoneNoneNoneNoneNone
4M5GDowntown TorontoCentral Bay StreetNoneNoneNoneNoneNoneNoneNone
\n", "
" ], "text/plain": [ " PostalCode 0 1 2 3 \\\n", "0 M3A North York Parkwoods None None \n", "1 M1G Scarborough Woburn None None \n", "2 M4N Central Toronto Lawrence Park None None \n", "3 M1L Scarborough Clairlea Golden Mile Oakridge \n", "4 M5G Downtown Toronto Central Bay Street None None \n", "\n", " 4 5 6 7 8 \n", "0 None None None None None \n", "1 None None None None None \n", "2 None None None None None \n", "3 None None None None None \n", "4 None None None None None " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#6.4 Reset index\n", "data.reset_index(inplace= True)\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCodeBorough12345678
0M3ANorth YorkParkwoodsNoneNoneNoneNoneNoneNoneNone
1M1GScarboroughWoburnNoneNoneNoneNoneNoneNoneNone
2M4NCentral TorontoLawrence ParkNoneNoneNoneNoneNoneNoneNone
3M1LScarboroughClairleaGolden MileOakridgeNoneNoneNoneNoneNone
4M5GDowntown TorontoCentral Bay StreetNoneNoneNoneNoneNoneNoneNone
\n", "
" ], "text/plain": [ " PostalCode Borough 1 2 3 \\\n", "0 M3A North York Parkwoods None None \n", "1 M1G Scarborough Woburn None None \n", "2 M4N Central Toronto Lawrence Park None None \n", "3 M1L Scarborough Clairlea Golden Mile Oakridge \n", "4 M5G Downtown Toronto Central Bay Street None None \n", "\n", " 4 5 6 7 8 \n", "0 None None None None None \n", "1 None None None None None \n", "2 None None None None None \n", "3 None None None None None \n", "4 None None None None None " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#6.5 Set column 2 i.e. after Postcode to Borough\n", "data.columns.values[1] = 'Borough'#We need to drill down to the array, accessed with .values\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Borough
0North York
1Scarborough
2Central Toronto
3Scarborough
4Downtown Toronto
\n", "
" ], "text/plain": [ " Borough\n", "0 North York\n", "1 Scarborough\n", "2 Central Toronto\n", "3 Scarborough\n", "4 Downtown Toronto" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#6.6 Convert columns to list form so as to remove hidden characters that interfer with column extraction by name \n", "data.columns = data.columns.tolist()\n", "data[['Borough']].head() #data[[5]] for the column numbered 5" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCodeBorough12345678
0M3ANorth YorkParkwoods
1M1GScarboroughWoburn
2M4NCentral TorontoLawrence Park
3M1LScarboroughClairleaGolden MileOakridge
4M5GDowntown TorontoCentral Bay Street
\n", "
" ], "text/plain": [ " PostalCode Borough 1 2 3 4 5 \\\n", "0 M3A North York Parkwoods \n", "1 M1G Scarborough Woburn \n", "2 M4N Central Toronto Lawrence Park \n", "3 M1L Scarborough Clairlea Golden Mile Oakridge \n", "4 M5G Downtown Toronto Central Bay Street \n", "\n", " 6 7 8 \n", "0 \n", "1 \n", "2 \n", "3 \n", "4 " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#6.7 Remove None values\n", "## It is vital to try this in the beginning when only specific cells have None\n", "### It becomes a bit complex if we try to remove None after merging, as each cell then has several values \n", "\n", "#data_sorted = data.apply(sorted,key=pd.isnull)\n", "#data_sorted.head()\n", "data_clean = data[~pd.isnull(data).all(1)].fillna('')\n", "data_clean.head()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCodeBorough12345678Neighbourhood
0M3ANorth YorkParkwoodsParkwoods,,,,,,
1M1GScarboroughWoburnWoburn,,,,,,
2M4NCentral TorontoLawrence ParkLawrence Park,,,,,,
3M1LScarboroughClairleaGolden MileOakridgeClairlea,Golden Mile,Oakridge,,,,
4M5GDowntown TorontoCentral Bay StreetCentral Bay Street,,,,,,
\n", "
" ], "text/plain": [ " PostalCode Borough 1 2 3 4 5 \\\n", "0 M3A North York Parkwoods \n", "1 M1G Scarborough Woburn \n", "2 M4N Central Toronto Lawrence Park \n", "3 M1L Scarborough Clairlea Golden Mile Oakridge \n", "4 M5G Downtown Toronto Central Bay Street \n", "\n", " 6 7 8 Neighbourhood \n", "0 Parkwoods,,,,,, \n", "1 Woburn,,,,,, \n", "2 Lawrence Park,,,,,, \n", "3 Clairlea,Golden Mile,Oakridge,,,, \n", "4 Central Bay Street,,,,,, " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#6.8 Concatenate the neighbourhood columns\n", "## Obtain column position after which concatenation is to take place\n", "source_col_loc = data_clean.columns.get_loc('Borough') # column position starts from 0\n", "source_col_loc\n", "\n", "#Create new column that will merge all neighbourhoods\n", "data_clean['Neighbourhood'] = data_clean.iloc[:,source_col_loc+1:source_col_loc+8].apply(\n", " lambda x: \",\".join(x.astype(str)), axis=1)\n", "data_clean.head()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "#6.9 Drop columns 1 through 8\n", "import numpy as np\n", "cols_to_remove = np.arange(1,9)\n", "#print(cols_to_remove, '\\n')\n", "\n", "#Drop\n", "data_clean.drop(cols_to_remove, axis=1, inplace = True)\n", "#data_clean.head()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCodeBoroughNeighbourhood
0M3ANorth YorkParkwoods,,,,,,
1M1GScarboroughWoburn,,,,,,
2M4NCentral TorontoLawrence Park,,,,,,
3M1LScarboroughClairlea,Golden Mile,Oakridge,,,,
4M5GDowntown TorontoCentral Bay Street,,,,,,
\n", "
" ], "text/plain": [ " PostalCode Borough Neighbourhood\n", "0 M3A North York Parkwoods,,,,,,\n", "1 M1G Scarborough Woburn,,,,,,\n", "2 M4N Central Toronto Lawrence Park,,,,,,\n", "3 M1L Scarborough Clairlea,Golden Mile,Oakridge,,,,\n", "4 M5G Downtown Toronto Central Bay Street,,,,,," ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_clean.head()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCodeBoroughNeighbourhood
34M5ADowntown TorontoHarbourfront,Regent Park,,,,,
\n", "
" ], "text/plain": [ " PostalCode Borough Neighbourhood\n", "34 M5A Downtown Toronto Harbourfront,Regent Park,,,,," ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_clean[data_clean['PostalCode'] == 'M5A']" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(103, 3)" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_clean.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 7. Obtain geocoordinates of the postal codes" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "#import geocoder #Not installed\n", "#From terminal, install GeoPy\n", "#sudo python3 -m pip install geopy\n", "#sudo python3 -m pip install geocoder #NOTE: THIS IS ANOTHER PACKAGE!\n", "\n", "#from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values\n", "import folium #Plotting library\n", "#import geocoder" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "#g = geocoder.google('Mountain View, CA')\n", "#print(g.latlng)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# initialize your variable to None\n", "#lat_lng_coords = None\n", "\n", "# loop until you get the coordinates\n", "#while(lat_lng_coords is None):\n", " # g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))\n", " # lat_lng_coords = g.latlng\n", "\n", "#latitude = lat_lng_coords[0]\n", "#longitude = lat_lng_coords[1]" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "#Read direct from provided CSV as the package was not giving results\n", "import pandas as pd\n", "latlang = pd.read_csv('/Users/peaceful_warrior/Downloads/Geospatial_Coordinates.csv')" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Postal CodeLatitudeLongitude
0M1B43.806686-79.194353
1M1C43.784535-79.160497
2M1E43.763573-79.188711
3M1G43.770992-79.216917
4M1H43.773136-79.239476
\n", "
" ], "text/plain": [ " Postal Code Latitude Longitude\n", "0 M1B 43.806686 -79.194353\n", "1 M1C 43.784535 -79.160497\n", "2 M1E 43.763573 -79.188711\n", "3 M1G 43.770992 -79.216917\n", "4 M1H 43.773136 -79.239476" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "latlang.head()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "latlang.columns" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "latlang.columns.values[0] = 'PostalCode'" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCodeLatitudeLongitude
0M1B43.806686-79.194353
1M1C43.784535-79.160497
2M1E43.763573-79.188711
3M1G43.770992-79.216917
4M1H43.773136-79.239476
\n", "
" ], "text/plain": [ " PostalCode Latitude Longitude\n", "0 M1B 43.806686 -79.194353\n", "1 M1C 43.784535 -79.160497\n", "2 M1E 43.763573 -79.188711\n", "3 M1G 43.770992 -79.216917\n", "4 M1H 43.773136 -79.239476" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "latlang.head()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCodeBoroughNeighbourhood
0M3ANorth YorkParkwoods,,,,,,
1M1GScarboroughWoburn,,,,,,
2M4NCentral TorontoLawrence Park,,,,,,
3M1LScarboroughClairlea,Golden Mile,Oakridge,,,,
4M5GDowntown TorontoCentral Bay Street,,,,,,
\n", "
" ], "text/plain": [ " PostalCode Borough Neighbourhood\n", "0 M3A North York Parkwoods,,,,,,\n", "1 M1G Scarborough Woburn,,,,,,\n", "2 M4N Central Toronto Lawrence Park,,,,,,\n", "3 M1L Scarborough Clairlea,Golden Mile,Oakridge,,,,\n", "4 M5G Downtown Toronto Central Bay Street,,,,,," ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_clean.head()" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCodeBoroughNeighbourhood
6M1BScarboroughRouge,Malvern,,,,,
86M1CScarboroughHighland Creek,Rouge Hill,Port Union,,,,
49M1EScarboroughGuildwood,Morningside,West Hill,,,,
1M1GScarboroughWoburn,,,,,,
66M1HScarboroughCedarbrae,,,,,,
\n", "
" ], "text/plain": [ " PostalCode Borough Neighbourhood\n", "6 M1B Scarborough Rouge,Malvern,,,,,\n", "86 M1C Scarborough Highland Creek,Rouge Hill,Port Union,,,,\n", "49 M1E Scarborough Guildwood,Morningside,West Hill,,,,\n", "1 M1G Scarborough Woburn,,,,,,\n", "66 M1H Scarborough Cedarbrae,,,,,," ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_clean_sorted = data_clean.sort_values(by = 'PostalCode', ascending= True)\n", "data_clean_sorted.head()" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['PostalCode', 'Borough', 'Neighbourhood'], dtype='object')" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_clean_sorted.columns" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCode
6M1B
86M1C
49M1E
1M1G
66M1H
\n", "
" ], "text/plain": [ " PostalCode\n", "6 M1B\n", "86 M1C\n", "49 M1E\n", "1 M1G\n", "66 M1H" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_clean_sorted[['PostalCode']].head()" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "\"None of [Index(['PostalCode'], dtype='object')] are in the [columns]\"", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mlatlang\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PostalCode'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2932\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2933\u001b[0m indexer = self.loc._convert_to_indexer(key, axis=1,\n\u001b[0;32m-> 2934\u001b[0;31m raise_missing=True)\n\u001b[0m\u001b[1;32m 2935\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2936\u001b[0m \u001b[0;31m# take() does not accept boolean indexers\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_convert_to_indexer\u001b[0;34m(self, obj, axis, is_setter, raise_missing)\u001b[0m\n\u001b[1;32m 1352\u001b[0m kwargs = {'raise_missing': True if is_setter else\n\u001b[1;32m 1353\u001b[0m raise_missing}\n\u001b[0;32m-> 1354\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_listlike_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1355\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1356\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1159\u001b[0m self._validate_read_indexer(keyarr, indexer,\n\u001b[1;32m 1160\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_number\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1161\u001b[0;31m raise_missing=raise_missing)\n\u001b[0m\u001b[1;32m 1162\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1163\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1244\u001b[0m raise KeyError(\n\u001b[1;32m 1245\u001b[0m u\"None of [{key}] are in the [{axis}]\".format(\n\u001b[0;32m-> 1246\u001b[0;31m key=key, axis=self.obj._get_axis_name(axis)))\n\u001b[0m\u001b[1;32m 1247\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1248\u001b[0m \u001b[0;31m# We (temporarily) allow for some missing keys with .loc, except in\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyError\u001b[0m: \"None of [Index(['PostalCode'], dtype='object')] are in the [columns]\"" ] } ], "source": [ "#latlang[['PostalCode']].head()#Throws error, possibly due to hidden character, so convert to list" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['PostalCode', 'Latitude', 'Longitude'], dtype='object')" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "latlang.columns" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "#Convert to list\n", "latlang.columns = latlang.columns.tolist()" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCode
0M1B
1M1C
2M1E
3M1G
4M1H
\n", "
" ], "text/plain": [ " PostalCode\n", "0 M1B\n", "1 M1C\n", "2 M1E\n", "3 M1G\n", "4 M1H" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "latlang[['PostalCode']].head()#Now it displays properly" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "#Merge the two dataframes on PostalCode\n", "data_merged = pd.merge(data_clean_sorted, latlang, on='PostalCode')" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCodeBoroughNeighbourhoodLatitudeLongitude
0M1BScarboroughRouge,Malvern,,,,,43.806686-79.194353
1M1CScarboroughHighland Creek,Rouge Hill,Port Union,,,,43.784535-79.160497
2M1EScarboroughGuildwood,Morningside,West Hill,,,,43.763573-79.188711
3M1GScarboroughWoburn,,,,,,43.770992-79.216917
4M1HScarboroughCedarbrae,,,,,,43.773136-79.239476
\n", "
" ], "text/plain": [ " PostalCode Borough Neighbourhood \\\n", "0 M1B Scarborough Rouge,Malvern,,,,, \n", "1 M1C Scarborough Highland Creek,Rouge Hill,Port Union,,,, \n", "2 M1E Scarborough Guildwood,Morningside,West Hill,,,, \n", "3 M1G Scarborough Woburn,,,,,, \n", "4 M1H Scarborough Cedarbrae,,,,,, \n", "\n", " Latitude Longitude \n", "0 43.806686 -79.194353 \n", "1 43.784535 -79.160497 \n", "2 43.763573 -79.188711 \n", "3 43.770992 -79.216917 \n", "4 43.773136 -79.239476 " ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_merged.head()" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostalCodeBoroughNeighbourhoodLatitudeLongitude
53M5ADowntown TorontoHarbourfront,Regent Park,,,,,43.65426-79.360636
\n", "
" ], "text/plain": [ " PostalCode Borough Neighbourhood Latitude \\\n", "53 M5A Downtown Toronto Harbourfront,Regent Park,,,,, 43.65426 \n", "\n", " Longitude \n", "53 -79.360636 " ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_merged[data_merged['PostalCode'] == 'M5A']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.7" } }, "nbformat": 4, "nbformat_minor": 2 }