{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"A quick python modelling pipeline","version":"0.3.2","provenance":[]},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"metadata":{"id":"Ppk8pcaVwWQP","colab_type":"text"},"cell_type":"markdown","source":["# Classification of Italian Wines\n","![alt text](https://viaverdimiami.com/wp-content/uploads/2017/07/Italian-Wine.jpg)\n","\n","In this notebook we will be using supervised learning to classify Italian wines. \n","The question is: Can we teach a machine to figure out which type of wine an observation belongs to?\n","\n","We will work with a famous but small dataset that can be found [here](https://archive.ics.uci.edu/ml/datasets/wine) (along with more information).\n","The data is clean, contains only numerical features and no missing values. We will not do any EDA but only focus on prediction. The only preprocessing step will be standardization of the physicochemical variables.\n","\n","We will be using Pandas and Scikit-Learn which are both parts of the Anaconda distribution."]},{"metadata":{"id":"6gm0-8FhTjez","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":204},"outputId":"f939f24f-31a8-4db7-fc3d-13ea731e843b","executionInfo":{"status":"ok","timestamp":1536914885576,"user_tz":-120,"elapsed":1980,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# Download the dataset using WGET.\n","# If this is not possible, then just paste the URL in your browser and download \n","# the file, or if you use GithubDesktop then it should be in the folder\n","# after a pull.\n","\n","\n","!wget https://cdn.rawgit.com/SDS-AAU/M1-2018/182abaa2/data/wine.csv"],"execution_count":42,"outputs":[{"output_type":"stream","text":["\n","Redirecting output to 
‘wget-log.1’.\n"],"name":"stdout"}]},{"metadata":{"id":"fgMrPUKMT--A","colab_type":"code","colab":{}},"cell_type":"code","source":["# Importing the libraries\n","\n","import numpy as np # for working with arrays\n","np.set_printoptions(suppress=True) # not a must but nice to avoid scientific notation\n","\n","\n","import pandas as pd # as usual for handling dataframes\n","pd.options.display.float_format = '{:.4f}'.format #same for pandas to turn off scientific notation"],"execution_count":0,"outputs":[]},{"metadata":{"id":"fScEVcTJUlR4","colab_type":"code","colab":{}},"cell_type":"code","source":["# Importing the dataset\n","dataset = pd.read_csv('wine.csv')"],"execution_count":0,"outputs":[]},{"metadata":{"id":"hM5ZBaXAVQRG","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":176},"outputId":"5621102b-dd91-45ad-b888-5a85fdbefbd5","executionInfo":{"status":"ok","timestamp":1536915056884,"user_tz":-120,"elapsed":639,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# Quick check of the dataframe proportions\n","dataset.shape"],"execution_count":45,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(178, 15)"]},"metadata":{"tags":[]},"execution_count":45}]},{"metadata":{"id":"NVU5xCe1UpI7","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":428},"outputId":"580ccf06-4209-47d4-ba60-dbc6af365bd7","executionInfo":{"status":"ok","timestamp":1536915081431,"user_tz":-120,"elapsed":635,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# Checking the first 5 rows to get familiar with the 
data\n","dataset.head()"],"execution_count":46,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>class_label</th>\n","      <th>class_name</th>\n","      <th>alcohol</th>\n","      <th>malic_acid</th>\n","      <th>ash</th>\n","      <th>alcalinity_of_ash</th>\n","      <th>magnesium</th>\n","      <th>total_phenols</th>\n","      <th>flavanoids</th>\n","      <th>nonflavanoid_phenols</th>\n","      <th>proanthocyanins</th>\n","      <th>color_intensity</th>\n","      <th>hue</th>\n","      <th>od280</th>\n","      <th>proline</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>1</td>\n","      <td>Barolo</td>\n","      <td>14.2300</td>\n","      <td>1.7100</td>\n","      <td>2.4300</td>\n","      <td>15.6000</td>\n","      <td>127</td>\n","      <td>2.8000</td>\n","      <td>3.0600</td>\n","      <td>0.2800</td>\n","      <td>2.2900</td>\n","      <td>5.6400</td>\n","      <td>1.0400</td>\n","      <td>3.9200</td>\n","      <td>1065</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>1</td>\n","      <td>Barolo</td>\n","      <td>13.2000</td>\n","      <td>1.7800</td>\n","      <td>2.1400</td>\n","      <td>11.2000</td>\n","      <td>100</td>\n","      <td>2.6500</td>\n","      <td>2.7600</td>\n","      <td>0.2600</td>\n","      <td>1.2800</td>\n","      <td>4.3800</td>\n","      <td>1.0500</td>\n","      <td>3.4000</td>\n","      <td>1050</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>1</td>\n","      <td>Barolo</td>\n","      <td>13.1600</td>\n","      
<td>2.3600</td>\n","      <td>2.6700</td>\n","      <td>18.6000</td>\n","      <td>101</td>\n","      <td>2.8000</td>\n","      <td>3.2400</td>\n","      <td>0.3000</td>\n","      <td>2.8100</td>\n","      <td>5.6800</td>\n","      <td>1.0300</td>\n","      <td>3.1700</td>\n","      <td>1185</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>1</td>\n","      <td>Barolo</td>\n","      <td>14.3700</td>\n","      <td>1.9500</td>\n","      <td>2.5000</td>\n","      <td>16.8000</td>\n","      <td>113</td>\n","      <td>3.8500</td>\n","      <td>3.4900</td>\n","      <td>0.2400</td>\n","      <td>2.1800</td>\n","      <td>7.8000</td>\n","      <td>0.8600</td>\n","      <td>3.4500</td>\n","      <td>1480</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>1</td>\n","      <td>Barolo</td>\n","      <td>13.2400</td>\n","      <td>2.5900</td>\n","      <td>2.8700</td>\n","      <td>21.0000</td>\n","      <td>118</td>\n","      <td>2.8000</td>\n","      <td>2.6900</td>\n","      <td>0.3900</td>\n","      <td>1.8200</td>\n","      <td>4.3200</td>\n","      <td>1.0400</td>\n","      <td>2.9300</td>\n","      <td>735</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["   class_label class_name  alcohol  malic_acid    ash  alcalinity_of_ash  \\\n","0            1     Barolo  14.2300      1.7100 2.4300            15.6000   \n","1            1     Barolo  13.2000      1.7800 2.1400            11.2000   \n","2            1     Barolo  13.1600      2.3600 2.6700            18.6000   \n","3            1     Barolo  14.3700      1.9500 2.5000            16.8000   \n","4            1     Barolo  13.2400      2.5900 2.8700            21.0000   \n","\n","   magnesium  total_phenols  flavanoids  nonflavanoid_phenols  \\\n","0        127         2.8000      3.0600                0.2800   \n","1        100         2.6500      2.7600                0.2600   \n","2        101         2.8000      3.2400                0.3000   \n","3        
113         3.8500      3.4900                0.2400   \n","4        118         2.8000      2.6900                0.3900   \n","\n","   proanthocyanins  color_intensity    hue  od280  proline  \n","0           2.2900           5.6400 1.0400 3.9200     1065  \n","1           1.2800           4.3800 1.0500 3.4000     1050  \n","2           2.8100           5.6800 1.0300 3.1700     1185  \n","3           2.1800           7.8000 0.8600 3.4500     1480  \n","4           1.8200           4.3200 1.0400 2.9300      735  "]},"metadata":{"tags":[]},"execution_count":46}]},{"metadata":{"id":"TiX9lDXBoR0H","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":524},"outputId":"97f5eacf-197f-4f0e-ed56-8467ca8fcb20","executionInfo":{"status":"ok","timestamp":1536915222928,"user_tz":-120,"elapsed":627,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# Getting basic descriptives for all nummerical variables\n","dataset.describe()"],"execution_count":47,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>class_label</th>\n","      <th>alcohol</th>\n","      <th>malic_acid</th>\n","      <th>ash</th>\n","      <th>alcalinity_of_ash</th>\n","      <th>magnesium</th>\n","      <th>total_phenols</th>\n","      <th>flavanoids</th>\n","      <th>nonflavanoid_phenols</th>\n","      <th>proanthocyanins</th>\n","      <th>color_intensity</th>\n","      <th>hue</th>\n","      
<th>od280</th>\n","      <th>proline</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>count</th>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","    </tr>\n","    <tr>\n","      <th>mean</th>\n","      <td>1.9382</td>\n","      <td>13.0006</td>\n","      <td>2.3363</td>\n","      <td>2.3665</td>\n","      <td>19.4949</td>\n","      <td>99.7416</td>\n","      <td>2.2951</td>\n","      <td>2.0293</td>\n","      <td>0.3619</td>\n","      <td>1.5909</td>\n","      <td>5.0581</td>\n","      <td>0.9574</td>\n","      <td>2.6117</td>\n","      <td>746.8933</td>\n","    </tr>\n","    <tr>\n","      <th>std</th>\n","      <td>0.7750</td>\n","      <td>0.8118</td>\n","      <td>1.1171</td>\n","      <td>0.2743</td>\n","      <td>3.3396</td>\n","      <td>14.2825</td>\n","      <td>0.6259</td>\n","      <td>0.9989</td>\n","      <td>0.1245</td>\n","      <td>0.5724</td>\n","      <td>2.3183</td>\n","      <td>0.2286</td>\n","      <td>0.7100</td>\n","      <td>314.9075</td>\n","    </tr>\n","    <tr>\n","      <th>min</th>\n","      <td>1.0000</td>\n","      <td>11.0300</td>\n","      <td>0.7400</td>\n","      <td>1.3600</td>\n","      <td>10.6000</td>\n","      <td>70.0000</td>\n","      <td>0.9800</td>\n","      <td>0.3400</td>\n","      <td>0.1300</td>\n","      <td>0.4100</td>\n","      <td>1.2800</td>\n","      <td>0.4800</td>\n","      <td>1.2700</td>\n","      <td>278.0000</td>\n","    </tr>\n","    <tr>\n","      <th>25%</th>\n","      <td>1.0000</td>\n","      <td>12.3625</td>\n","      <td>1.6025</td>\n","      <td>2.2100</td>\n","      <td>17.2000</td>\n","      <td>88.0000</td>\n","      
<td>1.7425</td>\n","      <td>1.2050</td>\n","      <td>0.2700</td>\n","      <td>1.2500</td>\n","      <td>3.2200</td>\n","      <td>0.7825</td>\n","      <td>1.9375</td>\n","      <td>500.5000</td>\n","    </tr>\n","    <tr>\n","      <th>50%</th>\n","      <td>2.0000</td>\n","      <td>13.0500</td>\n","      <td>1.8650</td>\n","      <td>2.3600</td>\n","      <td>19.5000</td>\n","      <td>98.0000</td>\n","      <td>2.3550</td>\n","      <td>2.1350</td>\n","      <td>0.3400</td>\n","      <td>1.5550</td>\n","      <td>4.6900</td>\n","      <td>0.9650</td>\n","      <td>2.7800</td>\n","      <td>673.5000</td>\n","    </tr>\n","    <tr>\n","      <th>75%</th>\n","      <td>3.0000</td>\n","      <td>13.6775</td>\n","      <td>3.0825</td>\n","      <td>2.5575</td>\n","      <td>21.5000</td>\n","      <td>107.0000</td>\n","      <td>2.8000</td>\n","      <td>2.8750</td>\n","      <td>0.4375</td>\n","      <td>1.9500</td>\n","      <td>6.2000</td>\n","      <td>1.1200</td>\n","      <td>3.1700</td>\n","      <td>985.0000</td>\n","    </tr>\n","    <tr>\n","      <th>max</th>\n","      <td>3.0000</td>\n","      <td>14.8300</td>\n","      <td>5.8000</td>\n","      <td>3.2300</td>\n","      <td>30.0000</td>\n","      <td>162.0000</td>\n","      <td>3.8800</td>\n","      <td>5.0800</td>\n","      <td>0.6600</td>\n","      <td>3.5800</td>\n","      <td>13.0000</td>\n","      <td>1.7100</td>\n","      <td>4.0000</td>\n","      <td>1680.0000</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["       class_label  alcohol  malic_acid      ash  alcalinity_of_ash  \\\n","count     178.0000 178.0000    178.0000 178.0000           178.0000   \n","mean        1.9382  13.0006      2.3363   2.3665            19.4949   \n","std         0.7750   0.8118      1.1171   0.2743             3.3396   \n","min         1.0000  11.0300      0.7400   1.3600            10.6000   \n","25%         1.0000  12.3625      1.6025   2.2100            17.2000   \n","50%         
2.0000  13.0500      1.8650   2.3600            19.5000   \n","75%         3.0000  13.6775      3.0825   2.5575            21.5000   \n","max         3.0000  14.8300      5.8000   3.2300            30.0000   \n","\n","       magnesium  total_phenols  flavanoids  nonflavanoid_phenols  \\\n","count   178.0000       178.0000    178.0000              178.0000   \n","mean     99.7416         2.2951      2.0293                0.3619   \n","std      14.2825         0.6259      0.9989                0.1245   \n","min      70.0000         0.9800      0.3400                0.1300   \n","25%      88.0000         1.7425      1.2050                0.2700   \n","50%      98.0000         2.3550      2.1350                0.3400   \n","75%     107.0000         2.8000      2.8750                0.4375   \n","max     162.0000         3.8800      5.0800                0.6600   \n","\n","       proanthocyanins  color_intensity      hue    od280   proline  \n","count         178.0000         178.0000 178.0000 178.0000  178.0000  \n","mean            1.5909           5.0581   0.9574   2.6117  746.8933  \n","std             0.5724           2.3183   0.2286   0.7100  314.9075  \n","min             0.4100           1.2800   0.4800   1.2700  278.0000  \n","25%             1.2500           3.2200   0.7825   1.9375  500.5000  \n","50%             1.5550           4.6900   0.9650   2.7800  673.5000  \n","75%             1.9500           6.2000   1.1200   3.1700  985.0000  \n","max             3.5800          13.0000   1.7100   4.0000 1680.0000  "]},"metadata":{"tags":[]},"execution_count":47}]},{"metadata":{"id":"6Wkz17J70IrB","colab_type":"text"},"cell_type":"markdown","source":["We can see here that means and spread (standard deviation) of the features is very different and thus we will need to standardize the dataset. 
\n","\n","> \"As a rule of thumb I’d say: When in doubt, just standardize the data, it shouldn’t hurt.\" [Sebastian Raschka](https://sebastianraschka.com/Articles/2014_about_feature_scaling.html)"]},{"metadata":{"id":"yJtZcJv_Unue","colab_type":"code","colab":{}},"cell_type":"code","source":["# Selecting the relevant data\n","# using the iloc selector allows us to grab the column slice 2:15 (columns 2-14)\n","# without having to call their names. That's practical\n","# Also, we ask for values only, as we are going to pass the data into\n","# the ML algorithms in the form of arrays rather than pandas DFs\n","\n","X = dataset.iloc[:, 2:15].values\n","y = dataset.iloc[:, 1].values"],"execution_count":0,"outputs":[]},{"metadata":{"id":"gjrUE6km2UVX","colab_type":"text"},"cell_type":"markdown","source":["Yes, there is a ```class_label``` in the dataset but for the sake of learning and because it is very simple, we are going to construct our class labels on our own. For this we will use the ```LabelEncoder``` from Scikit-Learn. Note that in contrast to Pandas, Scikit-Learn is more of a (HUGE!!!) library where you have to import different functionalities separately. You can find an index of all classes [here](http://scikit-learn.org/stable/modules/classes.html)."]},{"metadata":{"id":"I2W13RUmV9cr","colab_type":"code","colab":{}},"cell_type":"code","source":["# Encoding categorical data\n","from sklearn.preprocessing import LabelEncoder"],"execution_count":0,"outputs":[]},{"metadata":{"id":"YHBl_Jdb3OTb","colab_type":"text"},"cell_type":"markdown","source":["Classes such as the ```LabelEncoder``` or any model type that you import have several parameters that can (but don't have to be) specified. Also, you are usually fitting them to some data first before performing transformations. Thus, they are *custom-made* for each use case and therefore you will need to define an encoder object from the imported class. This is a general philosophy behind all Scikit-Learn classes. 
The good news: The syntax is the same across all classes.\n","\n","Below we first define a ```labelencoder_y``` and then use the ```fit_transform``` method (we could also first use ```fit``` and then ```transform```) to turn our wine-type names into numbers."]},{"metadata":{"id":"8XVJe409WbSK","colab_type":"code","colab":{}},"cell_type":"code","source":["# From labels to numbers\n","labelencoder_y = LabelEncoder()\n","y = labelencoder_y.fit_transform(y)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"tejI4qFe4X1X","colab_type":"text"},"cell_type":"markdown","source":["As you have seen from the descriptives above our variables lie on very different scales. Therefore, we will standardize them before going further. The procedure using the ```StandardScaler``` is exactly the same as before with the label encoder.\n","\n","This scaling will for each value subtract the mean (of the column) and divide it by the standard deviation, thus bringing them all on the same scale with a mean of 0 and a standard deviation of 1.\n","\n","Note: for simplicity the scaler is fitted on the full dataset here, before the train/test split further below. In a real project you would fit it on the training data only, so that no information from the test set leaks into the model."]},{"metadata":{"id":"2Wva3SqQfvFp","colab_type":"code","colab":{}},"cell_type":"code","source":["# Feature scaling\n","from sklearn.preprocessing import StandardScaler\n","\n","scaler = StandardScaler()\n","\n","X = scaler.fit_transform(X)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"0cRYCKVP45aH","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":464},"outputId":"16901b4f-c679-406e-df1e-49b8350d5674","executionInfo":{"status":"ok","timestamp":1536915713954,"user_tz":-120,"elapsed":660,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# We can check our transformed data using pandas describe\n","pd.DataFrame(X).describe()"],"execution_count":56,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe 
tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>0</th>\n","      <th>1</th>\n","      <th>2</th>\n","      <th>3</th>\n","      <th>4</th>\n","      <th>5</th>\n","      <th>6</th>\n","      <th>7</th>\n","      <th>8</th>\n","      <th>9</th>\n","      <th>10</th>\n","      <th>11</th>\n","      <th>12</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>count</th>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","      <td>178.0000</td>\n","    </tr>\n","    <tr>\n","      <th>mean</th>\n","      <td>-0.0000</td>\n","      <td>-0.0000</td>\n","      <td>-0.0000</td>\n","      <td>-0.0000</td>\n","      <td>-0.0000</td>\n","      <td>0.0000</td>\n","      <td>-0.0000</td>\n","      <td>0.0000</td>\n","      <td>-0.0000</td>\n","      <td>0.0000</td>\n","      <td>0.0000</td>\n","      <td>0.0000</td>\n","      <td>-0.0000</td>\n","    </tr>\n","    <tr>\n","      <th>std</th>\n","      <td>1.0028</td>\n","      <td>1.0028</td>\n","      <td>1.0028</td>\n","      <td>1.0028</td>\n","      <td>1.0028</td>\n","      <td>1.0028</td>\n","      <td>1.0028</td>\n","      <td>1.0028</td>\n","      <td>1.0028</td>\n","      <td>1.0028</td>\n","      <td>1.0028</td>\n","      <td>1.0028</td>\n","      <td>1.0028</td>\n","    </tr>\n","    <tr>\n","      <th>min</th>\n","      <td>-2.4342</td>\n","      <td>-1.4330</td>\n","      
<td>-3.6792</td>\n","      <td>-2.6710</td>\n","      <td>-2.0883</td>\n","      <td>-2.1072</td>\n","      <td>-1.6960</td>\n","      <td>-1.8682</td>\n","      <td>-2.0690</td>\n","      <td>-1.6343</td>\n","      <td>-2.0947</td>\n","      <td>-1.8951</td>\n","      <td>-1.4932</td>\n","    </tr>\n","    <tr>\n","      <th>25%</th>\n","      <td>-0.7882</td>\n","      <td>-0.6587</td>\n","      <td>-0.5721</td>\n","      <td>-0.6891</td>\n","      <td>-0.8244</td>\n","      <td>-0.8855</td>\n","      <td>-0.8275</td>\n","      <td>-0.7401</td>\n","      <td>-0.5973</td>\n","      <td>-0.7951</td>\n","      <td>-0.7676</td>\n","      <td>-0.9522</td>\n","      <td>-0.7846</td>\n","    </tr>\n","    <tr>\n","      <th>50%</th>\n","      <td>0.0610</td>\n","      <td>-0.4231</td>\n","      <td>-0.0238</td>\n","      <td>0.0015</td>\n","      <td>-0.1223</td>\n","      <td>0.0960</td>\n","      <td>0.1061</td>\n","      <td>-0.1761</td>\n","      <td>-0.0629</td>\n","      <td>-0.1592</td>\n","      <td>0.0331</td>\n","      <td>0.2377</td>\n","      <td>-0.2337</td>\n","    </tr>\n","    <tr>\n","      <th>75%</th>\n","      <td>0.8361</td>\n","      <td>0.6698</td>\n","      <td>0.6981</td>\n","      <td>0.6021</td>\n","      <td>0.5096</td>\n","      <td>0.8090</td>\n","      <td>0.8491</td>\n","      <td>0.6095</td>\n","      <td>0.6292</td>\n","      <td>0.4940</td>\n","      <td>0.7132</td>\n","      <td>0.7886</td>\n","      <td>0.7582</td>\n","    </tr>\n","    <tr>\n","      <th>max</th>\n","      <td>2.2598</td>\n","      <td>3.1092</td>\n","      <td>3.1563</td>\n","      <td>3.1545</td>\n","      <td>4.3714</td>\n","      <td>2.5395</td>\n","      <td>3.0628</td>\n","      <td>2.4024</td>\n","      <td>3.4851</td>\n","      <td>3.4354</td>\n","      <td>3.3017</td>\n","      <td>1.9609</td>\n","      <td>2.9715</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["            0        1        2        3        4        5        6    
    7   \\\n","count 178.0000 178.0000 178.0000 178.0000 178.0000 178.0000 178.0000 178.0000   \n","mean   -0.0000  -0.0000  -0.0000  -0.0000  -0.0000   0.0000  -0.0000   0.0000   \n","std     1.0028   1.0028   1.0028   1.0028   1.0028   1.0028   1.0028   1.0028   \n","min    -2.4342  -1.4330  -3.6792  -2.6710  -2.0883  -2.1072  -1.6960  -1.8682   \n","25%    -0.7882  -0.6587  -0.5721  -0.6891  -0.8244  -0.8855  -0.8275  -0.7401   \n","50%     0.0610  -0.4231  -0.0238   0.0015  -0.1223   0.0960   0.1061  -0.1761   \n","75%     0.8361   0.6698   0.6981   0.6021   0.5096   0.8090   0.8491   0.6095   \n","max     2.2598   3.1092   3.1563   3.1545   4.3714   2.5395   3.0628   2.4024   \n","\n","            8        9        10       11       12  \n","count 178.0000 178.0000 178.0000 178.0000 178.0000  \n","mean   -0.0000   0.0000   0.0000   0.0000  -0.0000  \n","std     1.0028   1.0028   1.0028   1.0028   1.0028  \n","min    -2.0690  -1.6343  -2.0947  -1.8951  -1.4932  \n","25%    -0.5973  -0.7951  -0.7676  -0.9522  -0.7846  \n","50%    -0.0629  -0.1592   0.0331   0.2377  -0.2337  \n","75%     0.6292   0.4940   0.7132   0.7886   0.7582  \n","max     3.4851   3.4354   3.3017   1.9609   2.9715  "]},"metadata":{"tags":[]},"execution_count":56}]},{"metadata":{"id":"ddiAjZSa57B8","colab_type":"text"},"cell_type":"markdown","source":["In the next step we split the data into a training and a test-set. Very often you will see a split of 80/20 %\n","\n","\n","![alt text](https://cdn-images-1.medium.com/max/1000/1*4G__SV580CxFj78o9yUXuQ.png)\n","\n","80% of the data will be used to fit a model, while we will keep 20% of the data for testing the models performance.\n","\n","The train_test_split class takes 4 parameters: (X, y, test_size = 0.2, random_state = 21)\n","\n","\n","1.   Input matrix: X\n","2.   Output matrix: y\n","3. The test size: We take 20%\n","4. 
A random state (optional): Some number for the random generator that will shuffle the values*\n","\n","*The whole random state thing is mostly for easier reproducibility and can also be left out. \n","\n","\n","\n"]},{"metadata":{"id":"dqJF3TsMfUoX","colab_type":"code","colab":{}},"cell_type":"code","source":["# Splitting the dataset into the Training set and Test set\n","\n","from sklearn.model_selection import train_test_split\n","\n","X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"LIFI4Qgs6zET","colab_type":"text"},"cell_type":"markdown","source":["![alt text](https://uproxx.files.wordpress.com/2015/12/bender-pointless-day.jpg?quality=95)\n","\n","Now it's time for the model to meet the wine data.\n","\n","We will be using 3 different models. We use 3 models because it is nice to see how easy it is to switch them around to see what works best. Since we can calculate a (kind of) objective quality measure, it is easy to compare and evaluate them against each other. \n","\n","*   Logistic Regression\n","*   Support Vector Classifier\n","* Random Forest Classifier\n","\n","Remember that this is a classification problem rather than a regression. The models will be estimating probabilities for some class vs. 
other classes."]},{"metadata":{"id":"XCu3pjo6ufDJ","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":228},"outputId":"1178008b-7e34-497f-ff59-b9f32075c056","executionInfo":{"status":"ok","timestamp":1536917333620,"user_tz":-120,"elapsed":555,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# We first import and train a Logistic Regression\n","\n","from sklearn.linear_model import LogisticRegression\n","\n","classifier = LogisticRegression(random_state = 22)\n","\n","classifier.fit(X_train, y_train)\n","\n","\n","# After training the model we should jump further down (over the next 2 models)\n","# To evaluate the results"],"execution_count":57,"outputs":[{"output_type":"execute_result","data":{"text/plain":["LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n","          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n","          penalty='l2', random_state=22, solver='liblinear', tol=0.0001,\n","          verbose=0, warm_start=False)"]},"metadata":{"tags":[]},"execution_count":57}]},{"metadata":{"id":"zPqsfMx-ioz2","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":264},"outputId":"7eb40d74-3c29-49d7-c6d4-275b8ce88793","executionInfo":{"status":"ok","timestamp":1536917526954,"user_tz":-120,"elapsed":710,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# Fitting Random Forest Classification to the Training set\n","from sklearn.ensemble import RandomForestClassifier\n","\n","classifier = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 22)\n","\n","classifier.fit(X_train, 
y_train)"],"execution_count":59,"outputs":[{"output_type":"execute_result","data":{"text/plain":["RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',\n","            max_depth=None, max_features='auto', max_leaf_nodes=None,\n","            min_impurity_decrease=0.0, min_impurity_split=None,\n","            min_samples_leaf=1, min_samples_split=2,\n","            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,\n","            oob_score=False, random_state=22, verbose=0, warm_start=False)"]},"metadata":{"tags":[]},"execution_count":59}]},{"metadata":{"id":"iUIvBW8LqZtg","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":228},"outputId":"d40710d5-d475-4708-d622-94a3c74747ac","executionInfo":{"status":"ok","timestamp":1536917795693,"user_tz":-120,"elapsed":667,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# Finally we train a Support Vector Classifier\n","from sklearn.svm import SVC\n","\n","classifier = SVC(kernel = 'linear', random_state = 21)\n","\n","classifier.fit(X_train, y_train)"],"execution_count":61,"outputs":[{"output_type":"execute_result","data":{"text/plain":["SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n","  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',\n","  max_iter=-1, probability=False, random_state=21, shrinking=True,\n","  tol=0.001, verbose=False)"]},"metadata":{"tags":[]},"execution_count":61}]},{"metadata":{"id":"BMFNTWpGA2lb","colab_type":"text"},"cell_type":"markdown","source":["Perhaps this time the algorithm was just lucky because of a random allocation of the data in the train-test split. To make sure which model is the most accurate, we can run a k-Fold Cross Validation dividing X_train into (here) 5 parts, training on 4 and testing on 1. 
This will be done 5 times, every time measuring the accuracy; finally we report the average accuracy and its standard deviation.\n","\n","![alt text](https://www.researchgate.net/profile/Kiret_Dhindsa/publication/323969239/figure/fig10/AS:607404244873216@1521827865007/The-K-fold-cross-validation-scheme-133-Each-of-the-K-partitions-is-used-as-a-test.png)"]},{"metadata":{"id":"18KOe_XJfOwc","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":196},"outputId":"af336168-e14f-4768-9b2b-a95b719b4d49","executionInfo":{"status":"ok","timestamp":1536917808660,"user_tz":-120,"elapsed":715,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# Applying k-Fold Cross Validation\n","from sklearn.model_selection import cross_val_score\n","\n","accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 5)\n","\n","print(accuracies.mean())\n","print(accuracies.std())"],"execution_count":62,"outputs":[{"output_type":"stream","text":["0.9856960408684546\n","0.017537311768860152\n"],"name":"stdout"}]},{"metadata":{"id":"KnyckARR-jC4","colab_type":"text"},"cell_type":"markdown","source":["Now that we have fitted (trained) a model we need to figure out how well it performs. This approach to evaluation is very different from what many of you are used to from econometrics. 
\n","\n","Here we are not interested in a model summary table; rather, we will be exploring predictive performance.\n","In the next cell we ask the classifier object (our trained model) to give us predictions for data it has never seen before.\n","\n","Then we will compare the predictions made against the real-world values that we actually know."]},{"metadata":{"id":"OrUb_pOuitwH","colab_type":"code","colab":{}},"cell_type":"code","source":["# Predicting the Test set results\n","y_pred = classifier.predict(X_test)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"9X5O8MgE_l2X","colab_type":"code","colab":{}},"cell_type":"code","source":["# Making a classification report\n","from sklearn.metrics import classification_report\n","\n","cm = classification_report(y_test, y_pred)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"0jxIpUVxjP6R","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":300},"outputId":"218cc916-06be-4e9c-d302-572b73b27d0e","executionInfo":{"status":"ok","timestamp":1536917968605,"user_tz":-120,"elapsed":568,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["print(cm)"],"execution_count":67,"outputs":[{"output_type":"stream","text":["             precision    recall  f1-score   support\n","\n","          0       0.92      1.00      0.96        11\n","          1       1.00      1.00      1.00        15\n","          2       1.00      0.90      0.95        10\n","\n","avg / total       0.97      0.97      0.97        36\n","\n"],"name":"stdout"}]},{"metadata":{"id":"-mJQZkZdAEwc","colab_type":"text"},"cell_type":"markdown","source":["There is also a slightly more intuitive way to evaluate our predictions in the case of a multiclass classification, where a plain confusion matrix of numeric class labels is harder to read. 
What we can do is use pandas to cross-tabulate our true against our predicted wines.\n","\n","To get the wine names, we will use the ```inverse_transform``` method of our ```labelencoder_y```"]},{"metadata":{"id":"VwokRUrSthsw","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":308},"outputId":"6dda574a-ea25-42ee-b0d1-95a232fc559b","executionInfo":{"status":"ok","timestamp":1536918021429,"user_tz":-120,"elapsed":619,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# Transforming numerical labels to wine types\n","\n","true_wines = labelencoder_y.inverse_transform(y_test)\n","\n","predicted_wines = labelencoder_y.inverse_transform(y_pred)"],"execution_count":68,"outputs":[{"output_type":"stream","text":["/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n","  if diff:\n","/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. 
Use `array.size > 0` to check that an array is not empty.\n","  if diff:\n"],"name":"stderr"}]},{"metadata":{"id":"nB__XG7cj4uz","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":316},"outputId":"a30b3757-49e3-4d48-e390-16ce0f9390ae","executionInfo":{"status":"ok","timestamp":1536918052143,"user_tz":-120,"elapsed":653,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# Creating a pandas DataFrame and cross-tabulation\n","\n","df = pd.DataFrame({'true_wines': true_wines, 'predicted_wines': predicted_wines}) \n","\n","pd.crosstab(df.true_wines, df.predicted_wines)"],"execution_count":71,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th>predicted_wines</th>\n","      <th>Barbera</th>\n","      <th>Barolo</th>\n","      <th>Grignolino</th>\n","    </tr>\n","    <tr>\n","      <th>true_wines</th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>Barbera</th>\n","      <td>11</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>Barolo</th>\n","      <td>0</td>\n","      <td>15</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>Grignolino</th>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>9</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["predicted_wines  Barbera  Barolo  Grignolino\n","true_wines              
                    \n","Barbera               11       0           0\n","Barolo                 0      15           0\n","Grignolino             1       0           9"]},"metadata":{"tags":[]},"execution_count":71}]},{"metadata":{"id":"5VUw_tzuMdqo","colab_type":"text"},"cell_type":"markdown","source":["**But is that not the same as PCA or some kind of clustering?**\n","\n","Well, let's try to use unsupervised learning on the same data-set. We will be using KMeans (because it is simple and nice for illustration).\n","\n","Just as before, we import a model class, define a model object and fit it. Same 3 steps as before."]},{"metadata":{"id":"V_nmcsb1MtJh","colab_type":"code","colab":{}},"cell_type":"code","source":["# We import KMeans and create a model object (we know that there are 3 wines...kind of cheating)\n","from sklearn.cluster import KMeans\n","\n","model = KMeans(n_clusters = 3)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"cgCR_MmeNCab","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":90},"outputId":"58f3e682-3145-407b-e05f-6b3dd8019c30","executionInfo":{"status":"ok","timestamp":1536920916624,"user_tz":-120,"elapsed":510,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# Fitting the model is super easy, just one line\n","model.fit(X_train)"],"execution_count":83,"outputs":[{"output_type":"execute_result","data":{"text/plain":["KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n","    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',\n","    random_state=None, tol=0.0001, verbose=0)"]},"metadata":{"tags":[]},"execution_count":83}]},{"metadata":{"id":"RWU1hnQnOmcQ","colab_type":"code","colab":{}},"cell_type":"code","source":["# Prediction is easy, too\n","\n","predicted_wine_clusters = 
model.predict(X_train)\n","\n","predicted_new_wine_clusters = model.predict(X_test)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"sD_o5atxOxRu","colab_type":"text"},"cell_type":"markdown","source":["Note that the clustering model never saw any y-values, only X values. The cluster numbers are therefore arbitrary ids and need not match the encoded wine labels."]},{"metadata":{"id":"81fPkaiWNCwK","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":160},"outputId":"58c791f1-2ed5-4a6a-e325-fb9ffd9e068b","executionInfo":{"status":"ok","timestamp":1536920918246,"user_tz":-120,"elapsed":732,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# Quick print out of the labels\n","\n","predicted_wine_clusters"],"execution_count":85,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([0, 2, 0, 1, 2, 2, 1, 2, 1, 1, 0, 0, 2, 2, 1, 0, 0, 0, 2, 2, 2, 2,\n","       1, 0, 0, 1, 0, 2, 2, 0, 1, 0, 0, 0, 0, 1, 2, 2, 1, 1, 1, 1, 1, 1,\n","       1, 2, 0, 1, 2, 1, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 1, 1, 2, 1, 2, 1,\n","       1, 2, 1, 1, 0, 2, 1, 0, 2, 2, 0, 1, 0, 1, 1, 1, 2, 0, 2, 2, 1, 1,\n","       2, 2, 0, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2, 2, 0,\n","       1, 2, 0, 1, 2, 1, 0, 1, 2, 1, 1, 0, 1, 1, 0, 2, 0, 2, 2, 2, 0, 2,\n","       0, 2, 2, 2, 0, 2, 2, 1, 1, 1], dtype=int32)"]},"metadata":{"tags":[]},"execution_count":85}]},{"metadata":{"id":"LTCxwcrXNPqy","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":290},"outputId":"34e3c90f-dad0-47a4-9950-2f19c0386e13","executionInfo":{"status":"ok","timestamp":1536920918954,"user_tz":-120,"elapsed":505,"user":{"displayName":"Roman Jurowetzki","photoUrl":"//lh6.googleusercontent.com/-PYkMKYnWWKc/AAAAAAAAAAI/AAAAAAAAGi4/VeHA9Eiq9XY/s50-c-k-no/photo.jpg","userId":"108675243397717376404"}}},"cell_type":"code","source":["# Transforming numerical labels to wine types\n","\n","true_wines 
= labelencoder_y.inverse_transform(y_train)\n","\n","df = pd.DataFrame({'true_wines': true_wines, 'predicted_wines': predicted_wine_clusters}) \n","pd.crosstab(df.true_wines, df.predicted_wines)"],"execution_count":86,"outputs":[{"output_type":"stream","text":["/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n","  if diff:\n"],"name":"stderr"},{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th>predicted_wines</th>\n","      <th>0</th>\n","      <th>1</th>\n","      <th>2</th>\n","    </tr>\n","    <tr>\n","      <th>true_wines</th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>Barbera</th>\n","      <td>37</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>Barolo</th>\n","      <td>0</td>\n","      <td>44</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>Grignolino</th>\n","      <td>3</td>\n","      <td>4</td>\n","      <td>54</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["predicted_wines   0   1   2\n","true_wines                 \n","Barbera          37   0   0\n","Barolo            0  44   0\n","Grignolino        3   4  
54"]},"metadata":{"tags":[]},"execution_count":86}]},{"metadata":{"id":"tv1p9InDnuPR","colab_type":"code","colab":{}},"cell_type":"code","source":[""],"execution_count":0,"outputs":[]}]}