{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Most of the time, real-world data is damaged or has missing values. We need to take care of this because most Machine Learning models don't work when values are missing or not a number. " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.preprocessing import Imputer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imputing missing values using Imputer\n", "\n", "Note: `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22. On newer versions, use `sklearn.impute.SimpleImputer`, which always imputes column-wise and has no `axis` argument." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryAgeSalaryPurchased
0France44.072000.0No
1Spain27.048000.0Yes
2Germany30.054000.0No
3Spain38.061000.0No
4Germany40.0NaNYes
\n", "
" ], "text/plain": [ " Country Age Salary Purchased\n", "0 France 44.0 72000.0 No\n", "1 Spain 27.0 48000.0 Yes\n", "2 Germany 30.0 54000.0 No\n", "3 Spain 38.0 61000.0 No\n", "4 Germany 40.0 NaN Yes" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('Data.csv')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryAgeSalaryPurchased
0France44.072000.000000No
1Spain27.048000.000000Yes
2Germany30.054000.000000No
3Spain38.061000.000000No
4Germany40.063777.777778Yes
\n", "
" ], "text/plain": [ " Country Age Salary Purchased\n", "0 France 44.0 72000.000000 No\n", "1 Spain 27.0 48000.000000 Yes\n", "2 Germany 30.0 54000.000000 No\n", "3 Spain 38.0 61000.000000 No\n", "4 Germany 40.0 63777.777778 Yes" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Replace every occurrence of missing_values with a value chosen by strategy,\n", "# which can be 'mean', 'median' or 'most_frequent' (the mode).\n", "# axis=0 imputes along columns (each column's own statistic); axis=1 imputes along rows.\n", "\n", "imputer = Imputer(missing_values='NaN', strategy='mean', axis = 0)\n", "df.iloc[:, 1:3] = imputer.fit_transform(df.iloc[:, 1:3])\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Encoding categorical data " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# LabelEncoder replaces every category with an integer code. Useful for replacing yes by 1, no by 0.\n", "# OneHotEncoder creates a separate column for every category and puts a 1 in the column of the category that is present\n", "from sklearn.preprocessing import LabelEncoder, OneHotEncoder" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryAgeSalaryPurchased
0044.072000.000000No
1227.048000.000000Yes
2130.054000.000000No
3238.061000.000000No
4140.063777.777778Yes
\n", "
" ], "text/plain": [ " Country Age Salary Purchased\n", "0 0 44.0 72000.000000 No\n", "1 2 27.0 48000.000000 Yes\n", "2 1 30.0 54000.000000 No\n", "3 2 38.0 61000.000000 No\n", "4 1 40.0 63777.777778 Yes" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lable_encoder = LabelEncoder()\n", "temp = df.copy()\n", "temp.iloc[:, 0] = lable_encoder.fit_transform(df.iloc[:, 0])\n", "temp.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeSalaryCountry_FranceCountry_GermanyCountry_Spain
044.00000072000.000000100
127.00000048000.000000001
230.00000054000.000000010
338.00000061000.000000001
440.00000063777.777778010
535.00000058000.000000100
638.77777852000.000000001
748.00000079000.000000100
850.00000083000.000000010
937.00000067000.000000100
\n", "
" ], "text/plain": [ " Age Salary Country_France Country_Germany Country_Spain\n", "0 44.000000 72000.000000 1 0 0\n", "1 27.000000 48000.000000 0 0 1\n", "2 30.000000 54000.000000 0 1 0\n", "3 38.000000 61000.000000 0 0 1\n", "4 40.000000 63777.777778 0 1 0\n", "5 35.000000 58000.000000 1 0 0\n", "6 38.777778 52000.000000 0 0 1\n", "7 48.000000 79000.000000 1 0 0\n", "8 50.000000 83000.000000 0 1 0\n", "9 37.000000 67000.000000 1 0 0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# you can pass an array of indices of categorical features\n", "# one_hot_encoder = OneHotEncoder(categorical_features=[0])\n", "# temp = df.copy()\n", "# temp.iloc[:, 0] = one_hot_encoder.fit_transform(df.iloc[:, 0])\n", "\n", "# you can achieve the same thing using get_dummies\n", "pd.get_dummies(df.iloc[:, :-1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Binarizing\n", "\n", "Often we need to do the reverse of what we've done above: convert continuous features to discrete values. For instance, we may want to convert a value to 0 or 1 depending on whether it exceeds a threshold. " ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import load_iris\n", "\n", "iris_dataset = load_iris()\n", "X = iris_dataset.data\n", "y = iris_dataset.target\n", "feature_names = iris_dataset.feature_names\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we'll binarize the sepal width using its mean as the threshold: a value becomes 1 if it is above the mean and 0 if it is equal to or below the mean. " ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 3.5, 3. , 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7,\n", " 3.4, 3. , 3. , 4. , 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7,\n", " 3.6, 3.3, 3.4, 3. , 3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1,\n", " 4.2, 3.1, 3.2, 3.5, 3.1, 3. , 3.4, 3.5, 2.3, 3.2, 3.5,\n", " 3.8, 3. 
, 3.8, 3.2, 3.7, 3.3, 3.2, 3.2, 3.1, 2.3, 2.8,\n", " 2.8, 3.3, 2.4, 2.9, 2.7, 2. , 3. , 2.2, 2.9, 2.9, 3.1,\n", " 3. , 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3. , 2.8,\n", " 3. , 2.9, 2.6, 2.4, 2.4, 2.7, 2.7, 3. , 3.4, 3.1, 2.3,\n", " 3. , 2.5, 2.6, 3. , 2.6, 2.3, 2.7, 3. , 2.9, 2.9, 2.5,\n", " 2.8, 3.3, 2.7, 3. , 2.9, 3. , 3. , 2.5, 2.9, 2.5, 3.6,\n", " 3.2, 2.7, 3. , 2.5, 2.8, 3.2, 3. , 3.8, 2.6, 2.2, 3.2,\n", " 2.8, 2.8, 2.7, 3.3, 3.2, 2.8, 3. , 2.8, 3. , 2.8, 3.8,\n", " 2.8, 2.8, 2.6, 3. , 3.4, 3.1, 3. , 3.1, 3.1, 3.1, 2.7,\n", " 3.2, 3.3, 3. , 2.5, 3. , 3.4, 3. ])" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X[:, 1]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,\n", " 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,\n", " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,\n", " 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,\n", " 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,\n", " 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,\n", " 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0.,\n", " 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0.,\n", " 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0.,\n", " 1., 1., 0., 0., 0., 1., 0.])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import Binarizer\n", "X[:, 1:2] = Binarizer(threshold=X[:, 1].mean()).fit_transform(X[:, 1].reshape(-1, 1))\n", "X[:, 1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, 
"file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.14" } }, "nbformat": 4, "nbformat_minor": 1 }