{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Building the baseline classifier\n", "\n", "We'll now do a basic round of supervised classification using scikit-learn. We start by loading the data. The dataset already contains the final classifications, which will let us measure our accuracy rate later, but we'll ignore them initially and pretend we're starting from scratch." ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = pd.read_csv('singapore-roadnames-final-classified.csv')" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | Unnamed: 0 | \n", "road_name | \n", "has_malay_road_tag | \n", "classification | \n", "comment | \n", "
---|---|---|---|---|---|
0 | \n", "0 | \n", "Abingdon | \n", "0 | \n", "British | \n", "NaN | \n", "
1 | \n", "1 | \n", "Abu Talib | \n", "1 | \n", "Malay | \n", "NaN | \n", "
2 | \n", "2 | \n", "Adam | \n", "0 | \n", "British | \n", "NaN | \n", "
3 | \n", "3 | \n", "Adat | \n", "1 | \n", "Malay | \n", "NaN | \n", "
4 | \n", "4 | \n", "Adis | \n", "0 | \n", "Other | \n", "Indian Jewish | \n", "
5 | \n", "5 | \n", "Admiralty | \n", "0 | \n", "British | \n", "NaN | \n", "
6 | \n", "6 | \n", "Ah Hood | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
7 | \n", "7 | \n", "Ah Soo | \n", "1 | \n", "Chinese | \n", "NaN | \n", "
8 | \n", "8 | \n", "Ahmad Ibrahim | \n", "1 | \n", "Malay | \n", "NaN | \n", "
9 | \n", "9 | \n", "Aida | \n", "0 | \n", "Other | \n", "NaN | \n", "
10 | \n", "10 | \n", "Airport | \n", "0 | \n", "Generic | \n", "NaN | \n", "
11 | \n", "11 | \n", "Alexandra | \n", "0 | \n", "British | \n", "NaN | \n", "
12 | \n", "12 | \n", "Aliwal | \n", "0 | \n", "Indian | \n", "Battle of Aliwal in the Indo-Sikh war | \n", "
13 | \n", "13 | \n", "Aljunied | \n", "0 | \n", "Other | \n", "Arab | \n", "
14 | \n", "14 | \n", "Allanbrooke | \n", "0 | \n", "British | \n", "NaN | \n", "
15 | \n", "15 | \n", "Allenby | \n", "0 | \n", "British | \n", "NaN | \n", "
16 | \n", "16 | \n", "Almond | \n", "0 | \n", "Generic | \n", "NaN | \n", "
17 | \n", "17 | \n", "Alnwick | \n", "0 | \n", "British | \n", "NaN | \n", "
18 | \n", "18 | \n", "Alps | \n", "0 | \n", "Other | \n", "NaN | \n", "
19 | \n", "19 | \n", "Ama Keng | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
20 | \n", "20 | \n", "Amber | \n", "0 | \n", "Other | \n", "after the Amber Trust fund established for poo... | \n", "
21 | \n", "21 | \n", "Amoy | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
22 | \n", "22 | \n", "Ampang | \n", "1 | \n", "Malay | \n", "NaN | \n", "
23 | \n", "23 | \n", "Ampas | \n", "1 | \n", "Malay | \n", "NaN | \n", "
24 | \n", "24 | \n", "Ampat | \n", "1 | \n", "Malay | \n", "NaN | \n", "
25 | \n", "25 | \n", "Anak Bukit | \n", "1 | \n", "Malay | \n", "NaN | \n", "
26 | \n", "26 | \n", "Anak Patong | \n", "1 | \n", "Malay | \n", "NaN | \n", "
27 | \n", "27 | \n", "Anamalai | \n", "0 | \n", "Indian | \n", "NaN | \n", "
28 | \n", "28 | \n", "Anchorvale | \n", "0 | \n", "Generic | \n", "marine theme | \n", "
29 | \n", "29 | \n", "Anderson | \n", "0 | \n", "British | \n", "NaN | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1721 | \n", "1721 | \n", "Woodgrove | \n", "0 | \n", "Generic | \n", "NaN | \n", "
1722 | \n", "1722 | \n", "Woodland | \n", "0 | \n", "Generic | \n", "NaN | \n", "
1723 | \n", "1723 | \n", "Woodlands | \n", "0 | \n", "Generic | \n", "NaN | \n", "
1724 | \n", "1724 | \n", "Woodleigh | \n", "0 | \n", "British | \n", "NaN | \n", "
1725 | \n", "1725 | \n", "Woodsville | \n", "0 | \n", "Generic | \n", "NaN | \n", "
1726 | \n", "1726 | \n", "Woollerton | \n", "0 | \n", "British | \n", "NaN | \n", "
1727 | \n", "1727 | \n", "Worthing | \n", "0 | \n", "British | \n", "NaN | \n", "
1728 | \n", "1728 | \n", "Xilin | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
1729 | \n", "1729 | \n", "Yan Kit | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
1730 | \n", "1730 | \n", "Yarrow | \n", "0 | \n", "British | \n", "NaN | \n", "
1731 | \n", "1731 | \n", "Yarwood | \n", "0 | \n", "British | \n", "NaN | \n", "
1732 | \n", "1732 | \n", "Yasin | \n", "1 | \n", "Malay | \n", "NaN | \n", "
1733 | \n", "1733 | \n", "Yio Chu Kang | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
1734 | \n", "1734 | \n", "Yishun | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
1735 | \n", "1735 | \n", "Yong Siak | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
1736 | \n", "1736 | \n", "York | \n", "0 | \n", "British | \n", "NaN | \n", "
1737 | \n", "1737 | \n", "Youngberg | \n", "0 | \n", "British | \n", "NaN | \n", "
1738 | \n", "1738 | \n", "Yuan Ching | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
1739 | \n", "1739 | \n", "Yuk Tong | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
1740 | \n", "1740 | \n", "Yung An | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
1741 | \n", "1741 | \n", "Yung Ho | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
1742 | \n", "1742 | \n", "Yung Kuang | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
1743 | \n", "1743 | \n", "Yung Sheng | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
1744 | \n", "1744 | \n", "Yunnan | \n", "0 | \n", "Chinese | \n", "NaN | \n", "
1745 | \n", "1745 | \n", "Zamrud | \n", "1 | \n", "Malay | \n", "NaN | \n", "
1746 | \n", "1746 | \n", "Zehnder | \n", "0 | \n", "Other | \n", "Eurasian | \n", "
1747 | \n", "1747 | \n", "Zion | \n", "0 | \n", "Other | \n", "NaN | \n", "
1748 | \n", "1748 | \n", "Zubir Said | \n", "0 | \n", "Malay | \n", "NaN | \n", "
1749 | \n", "1749 | \n", "kukoh | \n", "1 | \n", "Malay | \n", "NaN | \n", "
1750 | \n", "1750 | \n", "one-north Gateway | \n", "0 | \n", "Generic | \n", "NaN | \n", "
1751 rows × 5 columns
\n", "