{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Most of the times, the data is damaged, or missing, we need to take care of it since Machine Learning models don't work when the data is missing or not a number. "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import Imputer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imputing missing values using Imputer"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Country | \n",
" Age | \n",
" Salary | \n",
" Purchased | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" France | \n",
" 44.0 | \n",
" 72000.0 | \n",
" No | \n",
"
\n",
" \n",
" 1 | \n",
" Spain | \n",
" 27.0 | \n",
" 48000.0 | \n",
" Yes | \n",
"
\n",
" \n",
" 2 | \n",
" Germany | \n",
" 30.0 | \n",
" 54000.0 | \n",
" No | \n",
"
\n",
" \n",
" 3 | \n",
" Spain | \n",
" 38.0 | \n",
" 61000.0 | \n",
" No | \n",
"
\n",
" \n",
" 4 | \n",
" Germany | \n",
" 40.0 | \n",
" NaN | \n",
" Yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Country Age Salary Purchased\n",
"0 France 44.0 72000.0 No\n",
"1 Spain 27.0 48000.0 Yes\n",
"2 Germany 30.0 54000.0 No\n",
"3 Spain 38.0 61000.0 No\n",
"4 Germany 40.0 NaN Yes"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('Data.csv')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Country | \n",
" Age | \n",
" Salary | \n",
" Purchased | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" France | \n",
" 44.0 | \n",
" 72000.000000 | \n",
" No | \n",
"
\n",
" \n",
" 1 | \n",
" Spain | \n",
" 27.0 | \n",
" 48000.000000 | \n",
" Yes | \n",
"
\n",
" \n",
" 2 | \n",
" Germany | \n",
" 30.0 | \n",
" 54000.000000 | \n",
" No | \n",
"
\n",
" \n",
" 3 | \n",
" Spain | \n",
" 38.0 | \n",
" 61000.000000 | \n",
" No | \n",
"
\n",
" \n",
" 4 | \n",
" Germany | \n",
" 40.0 | \n",
" 63777.777778 | \n",
" Yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Country Age Salary Purchased\n",
"0 France 44.0 72000.000000 No\n",
"1 Spain 27.0 48000.000000 Yes\n",
"2 Germany 30.0 54000.000000 No\n",
"3 Spain 38.0 61000.000000 No\n",
"4 Germany 40.0 63777.777778 Yes"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# replace every occurrence of missing_values to one defined by strategy\n",
"# which can be mean, median, mode. Axis = 0 means rows, 1 means column\n",
"\n",
"imputer = Imputer(missing_values='NaN', strategy='mean', axis = 0)\n",
"df.iloc[:, 1:3] = imputer.fit_transform(df.iloc[:, 1:3])\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Encoding categorical data "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Label Encoder will replace every categorical variable with number. Useful for replacing yes by 1, no by 0.\n",
"# One Hot Encoder will create a separate column for every variable and give a value of 1 where the variable is present\n",
"from sklearn.preprocessing import LabelEncoder, OneHotEncoder"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Country | \n",
" Age | \n",
" Salary | \n",
" Purchased | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 44.0 | \n",
" 72000.000000 | \n",
" No | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 27.0 | \n",
" 48000.000000 | \n",
" Yes | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 30.0 | \n",
" 54000.000000 | \n",
" No | \n",
"
\n",
" \n",
" 3 | \n",
" 2 | \n",
" 38.0 | \n",
" 61000.000000 | \n",
" No | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 40.0 | \n",
" 63777.777778 | \n",
" Yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Country Age Salary Purchased\n",
"0 0 44.0 72000.000000 No\n",
"1 2 27.0 48000.000000 Yes\n",
"2 1 30.0 54000.000000 No\n",
"3 2 38.0 61000.000000 No\n",
"4 1 40.0 63777.777778 Yes"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lable_encoder = LabelEncoder()\n",
"temp = df.copy()\n",
"temp.iloc[:, 0] = lable_encoder.fit_transform(df.iloc[:, 0])\n",
"temp.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Age | \n",
" Salary | \n",
" Country_France | \n",
" Country_Germany | \n",
" Country_Spain | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 44.000000 | \n",
" 72000.000000 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 27.000000 | \n",
" 48000.000000 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 30.000000 | \n",
" 54000.000000 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 38.000000 | \n",
" 61000.000000 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 40.000000 | \n",
" 63777.777778 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" 35.000000 | \n",
" 58000.000000 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 6 | \n",
" 38.777778 | \n",
" 52000.000000 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" 48.000000 | \n",
" 79000.000000 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 8 | \n",
" 50.000000 | \n",
" 83000.000000 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 9 | \n",
" 37.000000 | \n",
" 67000.000000 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age Salary Country_France Country_Germany Country_Spain\n",
"0 44.000000 72000.000000 1 0 0\n",
"1 27.000000 48000.000000 0 0 1\n",
"2 30.000000 54000.000000 0 1 0\n",
"3 38.000000 61000.000000 0 0 1\n",
"4 40.000000 63777.777778 0 1 0\n",
"5 35.000000 58000.000000 1 0 0\n",
"6 38.777778 52000.000000 0 0 1\n",
"7 48.000000 79000.000000 1 0 0\n",
"8 50.000000 83000.000000 0 1 0\n",
"9 37.000000 67000.000000 1 0 0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# you can pass an array of indices of categorical features\n",
"# one_hot_encoder = OneHotEncoder(categorical_features=[0])\n",
"# temp = df.copy()\n",
"# temp.iloc[:, 0] = one_hot_encoder.fit_transform(df.iloc[:, 0])\n",
"\n",
"# you can achieve the same thing using get_dummies\n",
"pd.get_dummies(df.iloc[:, :-1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Binarizing\n",
"\n",
"Often we need to do the reverse of what we've done above. That is, convert continuous features to discrete values. For instance, we want to convert the output to 0 or 1 depending on the threshold. "
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_iris\n",
"\n",
"iris_dataset = load_iris()\n",
"X = iris_dataset.data\n",
"y = iris_dataset.target\n",
"feature_names = iris_dataset.feature_names\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we'll binarize the sepal width with 0 or 1 indicating whether the current value is below or above mean. "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 3.5, 3. , 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7,\n",
" 3.4, 3. , 3. , 4. , 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7,\n",
" 3.6, 3.3, 3.4, 3. , 3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1,\n",
" 4.2, 3.1, 3.2, 3.5, 3.1, 3. , 3.4, 3.5, 2.3, 3.2, 3.5,\n",
" 3.8, 3. , 3.8, 3.2, 3.7, 3.3, 3.2, 3.2, 3.1, 2.3, 2.8,\n",
" 2.8, 3.3, 2.4, 2.9, 2.7, 2. , 3. , 2.2, 2.9, 2.9, 3.1,\n",
" 3. , 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3. , 2.8,\n",
" 3. , 2.9, 2.6, 2.4, 2.4, 2.7, 2.7, 3. , 3.4, 3.1, 2.3,\n",
" 3. , 2.5, 2.6, 3. , 2.6, 2.3, 2.7, 3. , 2.9, 2.9, 2.5,\n",
" 2.8, 3.3, 2.7, 3. , 2.9, 3. , 3. , 2.5, 2.9, 2.5, 3.6,\n",
" 3.2, 2.7, 3. , 2.5, 2.8, 3.2, 3. , 3.8, 2.6, 2.2, 3.2,\n",
" 2.8, 2.8, 2.7, 3.3, 3.2, 2.8, 3. , 2.8, 3. , 2.8, 3.8,\n",
" 2.8, 2.8, 2.6, 3. , 3.4, 3.1, 3. , 3.1, 3.1, 3.1, 2.7,\n",
" 3.2, 3.3, 3. , 2.5, 3. , 3.4, 3. ])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X[:, 1]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,\n",
" 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,\n",
" 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,\n",
" 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0.,\n",
" 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0.,\n",
" 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0.,\n",
" 1., 1., 0., 0., 0., 1., 0.])"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import Binarizer\n",
"X[:, 1:2] = Binarizer(threshold=X[:, 1].mean()).fit_transform(X[:, 1].reshape(-1, 1))\n",
"X[:, 1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.14"
}
},
"nbformat": 4,
"nbformat_minor": 1
}