{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.14.2\n", "0.20.3\n" ] } ], "source": [ "import pandas as pd\n", "import os\n", "import numpy as np\n", "\n", "print np.__version__\n", "print pd.__version__" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789101112131415
0b30.830.000ugwv1.25tt1fg202.00+
1a58.674.460ugqh3.04tt6fg43.0560+
2a24.500.500ugqh1.50tf0fg280.0824+
3b27.831.540ugwv3.75tt5tg100.03+
4b20.175.625ugwv1.71tf0fs120.00+
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15\n", "0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202.0 0 +\n", "1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43.0 560 +\n", "2 a 24.50 0.500 u g q h 1.50 t f 0 f g 280.0 824 +\n", "3 b 27.83 1.540 u g w v 3.75 t t 5 t g 100.0 3 +\n", "4 b 20.17 5.625 u g w v 1.71 t f 0 f s 120.0 0 +" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reading data into pandas dataframe\n", "# also we know that the missing values in this dataset is denoted by '?', so we are telling pandas beforehand \n", "# to treat '?' as NaN values\n", "DATA_DIR = '../data'\n", "\n", "df = pd.read_table(\n", " os.path.abspath(os.path.join(DATA_DIR, 'day11/credit.csv')),\n", " sep = ',',\n", " header=None,\n", " na_values = '?'\n", " )\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 12\n", "1 12\n", "2 0\n", "3 6\n", "4 6\n", "5 9\n", "6 9\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 13\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# list out number of missing value in each of the series of the df\n", "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 object\n", "1 float64\n", "2 float64\n", "3 object\n", "4 object\n", "5 object\n", "6 object\n", "7 float64\n", "8 object\n", "9 object\n", "10 int64\n", "11 object\n", "12 object\n", "13 float64\n", "14 int64\n", "15 object\n", "dtype: object" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# just have look at the data types to get an idea of what imputation to make for any particular series\n", "df.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Deleting Rows that have missing values" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 0\n", "2 0\n", "3 0\n", "4 0\n", "5 0\n", "6 0\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 0\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ = df.dropna(inplace=False)\n", "df_.isnull().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imputing with Mean value - (For Continuous Data)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 12\n", "1 0\n", "2 0\n", "3 6\n", "4 6\n", "5 9\n", "6 9\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 13\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we will just show example of doing it with column number 1; this can be scaled to 'n' number of columns\n", "df[1].fillna(df[1].mean(), inplace=True)\n", "df.isnull().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imputing with Mode value - (For Categorical Data)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 0\n", "2 0\n", "3 6\n", "4 6\n", "5 9\n", "6 9\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 13\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[0].fillna(df[0].mode()[0], inplace=True)\n", "df.isnull().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imputing by adding one more category - (For Categorical Data)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['u', 'y', nan, 'l'], dtype=object)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 3rd column can have these number of possibilities\n", "df[3].unique()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 0\n", "2 0\n", "3 0\n", "4 6\n", "5 9\n", "6 9\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 13\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# treating it as new category all together tells model to learn it's dependency with other features for \n", "# making prediction; adds to one more column; if one hot representation; else another integer value if label encoding\n", "df[3].fillna('UNK', inplace=True)\n", "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789101112131415
206a71.580.0UNKNaNNaNNaN0.0ff0fpNaN0+
270b37.580.0UNKNaNNaNNaN0.0ff0fpNaN0+
330b20.420.0UNKNaNNaNNaN0.0ff0fpNaN0-
456b34.580.0UNKNaNNaNNaN0.0ff0fpNaN0-
592b23.170.0UNKNaNNaNNaN0.0ff0fpNaN0+
622a25.580.0UNKNaNNaNNaN0.0ff0fpNaN0+
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15\n", "206 a 71.58 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 +\n", "270 b 37.58 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 +\n", "330 b 20.42 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 -\n", "456 b 34.58 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 -\n", "592 b 23.17 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 +\n", "622 a 25.58 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 +" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df[3] == 'UNK']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imputing by back filling " ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 0\n", "2 0\n", "3 0\n", "4 6\n", "5 9\n", "6 9\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 0\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can specify a back-fill to propagate the next values backward\n", "df[13].fillna(method='bfill', inplace=True)\n", "df.isnull().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imputing by forward filling " ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 0\n", "2 0\n", "3 0\n", "4 0\n", "5 9\n", "6 9\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 0\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We can specify a forward-fill to propagate the previous value forward\n", "df[4].fillna(method='ffill', inplace=True)\n", "df.isnull().sum()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }