{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.14.2\n",
"0.20.3\n"
]
}
],
"source": [
"import pandas as pd\n",
"import os\n",
"import numpy as np\n",
"\n",
"print np.__version__\n",
"print pd.__version__"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" b | \n",
" 30.83 | \n",
" 0.000 | \n",
" u | \n",
" g | \n",
" w | \n",
" v | \n",
" 1.25 | \n",
" t | \n",
" t | \n",
" 1 | \n",
" f | \n",
" g | \n",
" 202.0 | \n",
" 0 | \n",
" + | \n",
"
\n",
" \n",
" | 1 | \n",
" a | \n",
" 58.67 | \n",
" 4.460 | \n",
" u | \n",
" g | \n",
" q | \n",
" h | \n",
" 3.04 | \n",
" t | \n",
" t | \n",
" 6 | \n",
" f | \n",
" g | \n",
" 43.0 | \n",
" 560 | \n",
" + | \n",
"
\n",
" \n",
" | 2 | \n",
" a | \n",
" 24.50 | \n",
" 0.500 | \n",
" u | \n",
" g | \n",
" q | \n",
" h | \n",
" 1.50 | \n",
" t | \n",
" f | \n",
" 0 | \n",
" f | \n",
" g | \n",
" 280.0 | \n",
" 824 | \n",
" + | \n",
"
\n",
" \n",
" | 3 | \n",
" b | \n",
" 27.83 | \n",
" 1.540 | \n",
" u | \n",
" g | \n",
" w | \n",
" v | \n",
" 3.75 | \n",
" t | \n",
" t | \n",
" 5 | \n",
" t | \n",
" g | \n",
" 100.0 | \n",
" 3 | \n",
" + | \n",
"
\n",
" \n",
" | 4 | \n",
" b | \n",
" 20.17 | \n",
" 5.625 | \n",
" u | \n",
" g | \n",
" w | \n",
" v | \n",
" 1.71 | \n",
" t | \n",
" f | \n",
" 0 | \n",
" f | \n",
" s | \n",
" 120.0 | \n",
" 0 | \n",
" + | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15\n",
"0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202.0 0 +\n",
"1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43.0 560 +\n",
"2 a 24.50 0.500 u g q h 1.50 t f 0 f g 280.0 824 +\n",
"3 b 27.83 1.540 u g w v 3.75 t t 5 t g 100.0 3 +\n",
"4 b 20.17 5.625 u g w v 1.71 t f 0 f s 120.0 0 +"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# reading data into pandas dataframe\n",
"# also we know that the missing values in this dataset is denoted by '?', so we are telling pandas beforehand \n",
"# to treat '?' as NaN values\n",
"DATA_DIR = '../data'\n",
"\n",
"df = pd.read_table(\n",
" os.path.abspath(os.path.join(DATA_DIR, 'day11/credit.csv')),\n",
" sep = ',',\n",
" header=None,\n",
" na_values = '?'\n",
" )\n",
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 12\n",
"1 12\n",
"2 0\n",
"3 6\n",
"4 6\n",
"5 9\n",
"6 9\n",
"7 0\n",
"8 0\n",
"9 0\n",
"10 0\n",
"11 0\n",
"12 0\n",
"13 13\n",
"14 0\n",
"15 0\n",
"dtype: int64"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# list out number of missing value in each of the series of the df\n",
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 object\n",
"1 float64\n",
"2 float64\n",
"3 object\n",
"4 object\n",
"5 object\n",
"6 object\n",
"7 float64\n",
"8 object\n",
"9 object\n",
"10 int64\n",
"11 object\n",
"12 object\n",
"13 float64\n",
"14 int64\n",
"15 object\n",
"dtype: object"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# just have look at the data types to get an idea of what imputation to make for any particular series\n",
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deleting Rows that have missing values"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 0\n",
"1 0\n",
"2 0\n",
"3 0\n",
"4 0\n",
"5 0\n",
"6 0\n",
"7 0\n",
"8 0\n",
"9 0\n",
"10 0\n",
"11 0\n",
"12 0\n",
"13 0\n",
"14 0\n",
"15 0\n",
"dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ = df.dropna(inplace=False)\n",
"df_.isnull().sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imputing with Mean value - (For Continuous Data)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 12\n",
"1 0\n",
"2 0\n",
"3 6\n",
"4 6\n",
"5 9\n",
"6 9\n",
"7 0\n",
"8 0\n",
"9 0\n",
"10 0\n",
"11 0\n",
"12 0\n",
"13 13\n",
"14 0\n",
"15 0\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# we will just show example of doing it with column number 1; this can be scaled to 'n' number of columns\n",
"df[1].fillna(df[1].mean(), inplace=True)\n",
"df.isnull().sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imputing with Mode value - (For Categorical Data)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 0\n",
"1 0\n",
"2 0\n",
"3 6\n",
"4 6\n",
"5 9\n",
"6 9\n",
"7 0\n",
"8 0\n",
"9 0\n",
"10 0\n",
"11 0\n",
"12 0\n",
"13 13\n",
"14 0\n",
"15 0\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[0].fillna(df[0].mode()[0], inplace=True)\n",
"df.isnull().sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imputing by adding one more category - (For Categorical Data)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array(['u', 'y', nan, 'l'], dtype=object)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 3rd column can have these number of possibilities\n",
"df[3].unique()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 0\n",
"1 0\n",
"2 0\n",
"3 0\n",
"4 6\n",
"5 9\n",
"6 9\n",
"7 0\n",
"8 0\n",
"9 0\n",
"10 0\n",
"11 0\n",
"12 0\n",
"13 13\n",
"14 0\n",
"15 0\n",
"dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# treating it as new category all together tells model to learn it's dependency with other features for \n",
"# making prediction; adds to one more column; if one hot representation; else another integer value if label encoding\n",
"df[3].fillna('UNK', inplace=True)\n",
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 206 | \n",
" a | \n",
" 71.58 | \n",
" 0.0 | \n",
" UNK | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0.0 | \n",
" f | \n",
" f | \n",
" 0 | \n",
" f | \n",
" p | \n",
" NaN | \n",
" 0 | \n",
" + | \n",
"
\n",
" \n",
" | 270 | \n",
" b | \n",
" 37.58 | \n",
" 0.0 | \n",
" UNK | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0.0 | \n",
" f | \n",
" f | \n",
" 0 | \n",
" f | \n",
" p | \n",
" NaN | \n",
" 0 | \n",
" + | \n",
"
\n",
" \n",
" | 330 | \n",
" b | \n",
" 20.42 | \n",
" 0.0 | \n",
" UNK | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0.0 | \n",
" f | \n",
" f | \n",
" 0 | \n",
" f | \n",
" p | \n",
" NaN | \n",
" 0 | \n",
" - | \n",
"
\n",
" \n",
" | 456 | \n",
" b | \n",
" 34.58 | \n",
" 0.0 | \n",
" UNK | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0.0 | \n",
" f | \n",
" f | \n",
" 0 | \n",
" f | \n",
" p | \n",
" NaN | \n",
" 0 | \n",
" - | \n",
"
\n",
" \n",
" | 592 | \n",
" b | \n",
" 23.17 | \n",
" 0.0 | \n",
" UNK | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0.0 | \n",
" f | \n",
" f | \n",
" 0 | \n",
" f | \n",
" p | \n",
" NaN | \n",
" 0 | \n",
" + | \n",
"
\n",
" \n",
" | 622 | \n",
" a | \n",
" 25.58 | \n",
" 0.0 | \n",
" UNK | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0.0 | \n",
" f | \n",
" f | \n",
" 0 | \n",
" f | \n",
" p | \n",
" NaN | \n",
" 0 | \n",
" + | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15\n",
"206 a 71.58 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 +\n",
"270 b 37.58 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 +\n",
"330 b 20.42 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 -\n",
"456 b 34.58 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 -\n",
"592 b 23.17 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 +\n",
"622 a 25.58 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 +"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df[3] == 'UNK']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imputing by back filling "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 0\n",
"1 0\n",
"2 0\n",
"3 0\n",
"4 6\n",
"5 9\n",
"6 9\n",
"7 0\n",
"8 0\n",
"9 0\n",
"10 0\n",
"11 0\n",
"12 0\n",
"13 0\n",
"14 0\n",
"15 0\n",
"dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# we can specify a back-fill to propagate the next values backward\n",
"df[13].fillna(method='bfill', inplace=True)\n",
"df.isnull().sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imputing by forward filling "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 0\n",
"1 0\n",
"2 0\n",
"3 0\n",
"4 0\n",
"5 9\n",
"6 9\n",
"7 0\n",
"8 0\n",
"9 0\n",
"10 0\n",
"11 0\n",
"12 0\n",
"13 0\n",
"14 0\n",
"15 0\n",
"dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We can specify a forward-fill to propagate the previous value forward\n",
"df[4].fillna(method='ffill', inplace=True)\n",
"df.isnull().sum()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}