{ "metadata": { "name": "", "signature": "sha256:6af55902e41f36273af92bed94f8b56edb3f4439038f686f00e0f040fbb8553f" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Missing Data In Pandas Dataframe\n", "\n", "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n", "- **Date:** -\n", "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n", "- **Note:**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### import modules" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import numpy as np" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create dataframe with missing values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "raw_data = {'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'], \n", " 'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'], \n", " 'age': [42, np.nan, 36, 24, 73], \n", " 'sex': ['m', np.nan, 'f', 'm', 'f'], \n", " 'preTestScore': [4, np.nan, np.nan, 2, 3],\n", " 'postTestScore': [25, np.nan, np.nan, 62, 70]}\n", "df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore'])\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameagesexpreTestScorepostTestScore
0 Jason Miller 42 m 4 25
1 NaN NaNNaN NaNNaNNaN
2 Tina Ali 36 fNaNNaN
3 Jake Milner 24 m 2 62
4 Amy Cooze 73 f 3 70
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 2, "text": [ " first_name last_name age sex preTestScore postTestScore\n", "0 Jason Miller 42 m 4 25\n", "1 NaN NaN NaN NaN NaN NaN\n", "2 Tina Ali 36 f NaN NaN\n", "3 Jake Milner 24 m 2 62\n", "4 Amy Cooze 73 f 3 70" ] } ], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Drop missing observations" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df_no_missing = df.dropna()\n", "df_no_missing" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameagesexpreTestScorepostTestScore
0 Jason Miller 42 m 4 25
3 Jake Milner 24 m 2 62
4 Amy Cooze 73 f 3 70
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ " first_name last_name age sex preTestScore postTestScore\n", "0 Jason Miller 42 m 4 25\n", "3 Jake Milner 24 m 2 62\n", "4 Amy Cooze 73 f 3 70" ] } ], "prompt_number": 3 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Drop rows where all cells in that row is NA" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df_cleaned = df.dropna(how='all')\n", "df_cleaned" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameagesexpreTestScorepostTestScore
0 Jason Miller 42 m 4 25
2 Tina Ali 36 fNaNNaN
3 Jake Milner 24 m 2 62
4 Amy Cooze 73 f 3 70
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ " first_name last_name age sex preTestScore postTestScore\n", "0 Jason Miller 42 m 4 25\n", "2 Tina Ali 36 f NaN NaN\n", "3 Jake Milner 24 m 2 62\n", "4 Amy Cooze 73 f 3 70" ] } ], "prompt_number": 4 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a new column full of missing values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['location'] = [np.nan, np.nan, np.nan, np.nan, np.nan]\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameagesexpreTestScorepostTestScorelocation
0 Jason Miller 42 m 4 25NaN
1 NaN NaNNaN NaNNaNNaNNaN
2 Tina Ali 36 fNaNNaNNaN
3 Jake Milner 24 m 2 62NaN
4 Amy Cooze 73 f 3 70NaN
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 5, "text": [ " first_name last_name age sex preTestScore postTestScore location\n", "0 Jason Miller 42 m 4 25 NaN\n", "1 NaN NaN NaN NaN NaN NaN NaN\n", "2 Tina Ali 36 f NaN NaN NaN\n", "3 Jake Milner 24 m 2 62 NaN\n", "4 Amy Cooze 73 f 3 70 NaN" ] } ], "prompt_number": 5 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Drop column if they only contain missing values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df.dropna(axis=1, how='all')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameagesexpreTestScorepostTestScore
0 Jason Miller 42 m 4 25
1 NaN NaNNaN NaNNaNNaN
2 Tina Ali 36 fNaNNaN
3 Jake Milner 24 m 2 62
4 Amy Cooze 73 f 3 70
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ " first_name last_name age sex preTestScore postTestScore\n", "0 Jason Miller 42 m 4 25\n", "1 NaN NaN NaN NaN NaN NaN\n", "2 Tina Ali 36 f NaN NaN\n", "3 Jake Milner 24 m 2 62\n", "4 Amy Cooze 73 f 3 70" ] } ], "prompt_number": 6 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Drop rows that contain less than five observations\n", "\n", "This is really mostly useful for time series" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df.dropna(thresh=5)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameagesexpreTestScorepostTestScorelocation
0 Jason Miller 42 m 4 25NaN
3 Jake Milner 24 m 2 62NaN
4 Amy Cooze 73 f 3 70NaN
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ " first_name last_name age sex preTestScore postTestScore location\n", "0 Jason Miller 42 m 4 25 NaN\n", "3 Jake Milner 24 m 2 62 NaN\n", "4 Amy Cooze 73 f 3 70 NaN" ] } ], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Fill in missing data with zeros" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df.fillna(0)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameagesexpreTestScorepostTestScorelocation
0 Jason Miller 42 m 4 25 0
1 0 0 0 0 0 0 0
2 Tina Ali 36 f 0 0 0
3 Jake Milner 24 m 2 62 0
4 Amy Cooze 73 f 3 70 0
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ " first_name last_name age sex preTestScore postTestScore location\n", "0 Jason Miller 42 m 4 25 0\n", "1 0 0 0 0 0 0 0\n", "2 Tina Ali 36 f 0 0 0\n", "3 Jake Milner 24 m 2 62 0\n", "4 Amy Cooze 73 f 3 70 0" ] } ], "prompt_number": 8 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Fill in missing in preTestScore with the mean value of preTestScore\n", "\n", "inplace=True means that the changes are saved to the df right away" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df[\"preTestScore\"].fillna(df[\"preTestScore\"].mean(), inplace=True)\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameagesexpreTestScorepostTestScorelocation
0 Jason Miller 42 m 4 25NaN
1 NaN NaNNaN NaN 3NaNNaN
2 Tina Ali 36 f 3NaNNaN
3 Jake Milner 24 m 2 62NaN
4 Amy Cooze 73 f 3 70NaN
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ " first_name last_name age sex preTestScore postTestScore location\n", "0 Jason Miller 42 m 4 25 NaN\n", "1 NaN NaN NaN NaN 3 NaN NaN\n", "2 Tina Ali 36 f 3 NaN NaN\n", "3 Jake Milner 24 m 2 62 NaN\n", "4 Amy Cooze 73 f 3 70 NaN" ] } ], "prompt_number": 9 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Fill in missing in postTestScore with each sex's mean value of postTestScore" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df[\"postTestScore\"].fillna(df.groupby(\"sex\")[\"postTestScore\"].transform(\"mean\"), inplace=True)\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameagesexpreTestScorepostTestScorelocation
0 Jason Miller 42 m 4 25NaN
1 NaN NaNNaN NaN 3NaNNaN
2 Tina Ali 36 f 3 70NaN
3 Jake Milner 24 m 2 62NaN
4 Amy Cooze 73 f 3 70NaN
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ " first_name last_name age sex preTestScore postTestScore location\n", "0 Jason Miller 42 m 4 25 NaN\n", "1 NaN NaN NaN NaN 3 NaN NaN\n", "2 Tina Ali 36 f 3 70 NaN\n", "3 Jake Milner 24 m 2 62 NaN\n", "4 Amy Cooze 73 f 3 70 NaN" ] } ], "prompt_number": 10 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Select some raws but ignore the missing data points" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Select the rows of df where age is not NaN and sex is not NaN\n", "df[df['age'].notnull() & df['sex'].notnull()]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameagesexpreTestScorepostTestScorelocation
0 Jason Miller 42 m 4 25NaN
2 Tina Ali 36 f 3 70NaN
3 Jake Milner 24 m 2 62NaN
4 Amy Cooze 73 f 3 70NaN
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 12, "text": [ " first_name last_name age sex preTestScore postTestScore location\n", "0 Jason Miller 42 m 4 25 NaN\n", "2 Tina Ali 36 f 3 70 NaN\n", "3 Jake Milner 24 m 2 62 NaN\n", "4 Amy Cooze 73 f 3 70 NaN" ] } ], "prompt_number": 12 } ], "metadata": {} } ] }