{ "cells": [ { "cell_type": "markdown", "id": "852c361d", "metadata": {}, "source": [ "<img src=\"./images/shouke_logo.png\"\n", " style=\"float: right\"\n", " width=100\n", " style=\"padding-bottom:100px;\"/>\n", "<br>\n", "<br>\n", "\n", "<table style=\"float:center;\">\n", " <tr>\n", " <td>\n", " <img src='./images/python-logo.png'width=130>\n", " </td>\n", " <td>\n", " <img src='./images/pandas-logo.png'width=150>\n", " </td>\n", " </tr>\n", "</table>\n", "\n", "<h1 style='text-align: center;'>Imputing Missing Values of DataFrame</h1>\n", "<h3 style='text-align: center;'>Shouke Wei, Ph.D. Professor</h3>\n", "<h4 style='text-align: center;'>Email: shouke.wei@gmail.com</h4>" ] }, { "cell_type": "markdown", "id": "4127aa60", "metadata": {}, "source": [ "## Objective\n", "- learn how to drop, impute or fill the missing values of the dataframe" ] }, { "cell_type": "code", "execution_count": 18, "id": "1d8f5194", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['prov', 'gdpr', 'year', 'gdp', 'pop', 'finv', 'trade', 'fexpen',\n", " 'uinc'],\n", " dtype='object')" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import required packages\n", "import pandas as pd\n", "\n", "# read data \n", "df = pd.read_csv('./data/gdp_china_renamed.csv')\n", "\n", "# display names of the columns\n", "df.columns" ] }, { "cell_type": "markdown", "id": "b45eba63", "metadata": {}, "source": [ "## 1. Drop missing values" ] }, { "cell_type": "code", "execution_count": 19, "id": "b85bd1b6", "metadata": { "scrolled": true }, "outputs": [], "source": [ "df_new = df.dropna()" ] }, { "cell_type": "code", "execution_count": 21, "id": "fa6a6895", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_new.isna().sum().sum()" ] }, { "cell_type": "markdown", "id": "c4f157d8", "metadata": {}, "source": [ "## 2. Fill missing values" ] }, { "cell_type": "code", "execution_count": 23, "id": "5b3e93ac", "metadata": {}, "outputs": [], "source": [ "idx_l = [3,4,5,22,23,24]" ] }, { "cell_type": "code", "execution_count": 24, "id": "39754d36", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>pop</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>3</th>\n", " <td>8.963</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>9.194</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>7.458</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>7.588</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " pop\n", "3 8.963\n", "4 NaN\n", "5 9.194\n", "22 7.458\n", "23 NaN\n", "24 7.588" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[idx_l,['pop']]" ] }, { "cell_type": "markdown", "id": "f47c8120", "metadata": {}, "source": [ "### (1) fill missing values with the mean value(s)" ] }, { "cell_type": "code", "execution_count": 26, "id": "bd2b530c", "metadata": { "scrolled": false }, "outputs": [], "source": [ "df_num = df.drop(['prov','gdpr'],axis=1)\n", "df_new = df_num.fillna(df_num.mean())" ] }, { "cell_type": "code", "execution_count": 27, "id": "52d65d3f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>pop</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>3</th>\n", " <td>8.963000</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>8.321032</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>9.194000</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>7.458000</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>8.321032</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>7.588000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " pop\n", "3 8.963000\n", "4 8.321032\n", "5 9.194000\n", "22 7.458000\n", "23 8.321032\n", "24 7.588000" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_new.loc[idx_l,['pop']]" ] }, { "cell_type": "markdown", "id": "fd908c8d", "metadata": {}, "source": [ "### (2) fill with build-in methods" ] }, { "cell_type": "code", "execution_count": 29, "id": "96b02478", "metadata": {}, "outputs": [], "source": [ "# foward fill\n", "df_new = df.fillna(method='ffill')" ] }, { "cell_type": "code", "execution_count": 30, "id": "ecfde378", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>pop</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>3</th>\n", " <td>8.963</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>8.963</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>9.194</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>7.458</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>7.458</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>7.588</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " pop\n", "3 8.963\n", "4 8.963\n", "5 9.194\n", "22 7.458\n", "23 7.458\n", "24 7.588" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_new.loc[idx_l,['pop']]" ] }, { "cell_type": "code", "execution_count": 31, "id": "10ed2875", "metadata": {}, "outputs": [], "source": [ "# backward fill\n", "df_new = df.fillna(method='bfill')" ] }, { "cell_type": "code", "execution_count": 32, "id": "04ba5525", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>pop</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>3</th>\n", " <td>8.963</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>9.194</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>9.194</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>7.458</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>7.588</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>7.588</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " pop\n", "3 8.963\n", "4 9.194\n", "5 9.194\n", "22 7.458\n", "23 7.588\n", "24 7.588" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_new.loc[idx_l,['pop']]" ] }, { "cell_type": "markdown", "id": "23e9708f", "metadata": {}, "source": [ "Refer to: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html" ] }, { "cell_type": "markdown", "id": "780eebb6", "metadata": {}, "source": [ "### 3. fill with interpolate" ] }, { "cell_type": "code", "execution_count": 33, "id": "c510d918", "metadata": {}, "outputs": [], "source": [ "# default is linear\n", "df_new =df.interpolate() # method='linear'" ] }, { "cell_type": "code", "execution_count": 34, "id": "6df1a71c", "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>pop</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>3</th>\n", " <td>8.9630</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>9.0785</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>9.1940</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>7.4580</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>7.5230</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>7.5880</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " pop\n", "3 8.9630\n", "4 9.0785\n", "5 9.1940\n", "22 7.4580\n", "23 7.5230\n", "24 7.5880" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_new.loc[idx_l,['pop']]" ] }, { "cell_type": "code", "execution_count": 35, "id": "835bb2a5", "metadata": {}, "outputs": [], "source": [ "# polynomial method\n", "df_new =df.interpolate(method='polynomial',order=2)" ] }, { "cell_type": "code", "execution_count": 36, "id": "2586dc53", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>pop</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>3</th>\n", " <td>8.963000</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>9.054257</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>9.194000</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>7.458000</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>7.522919</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>7.588000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " pop\n", "3 8.963000\n", "4 9.054257\n", "5 9.194000\n", "22 7.458000\n", "23 7.522919\n", "24 7.588000" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_new.loc[idx_l,['pop']]" ] }, { "cell_type": "code", "execution_count": 37, "id": "b1ff778f", "metadata": {}, "outputs": [], "source": [ "# cubicspline\n", "df_new =df.interpolate(method='cubicspline',order=2)" ] }, { "cell_type": "code", "execution_count": 38, "id": "d00ed97d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>pop</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>3</th>\n", " <td>8.963000</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>9.052298</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>9.194000</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>7.458000</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>7.532276</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>7.588000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " pop\n", "3 8.963000\n", "4 9.052298\n", "5 9.194000\n", "22 7.458000\n", "23 7.532276\n", "24 7.588000" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_new.loc[idx_l,['pop']]" ] }, { "cell_type": "markdown", "id": "cb6b1e70", "metadata": {}, "source": [ "Refer to:\n", "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html\n", "\n", "https://www.w3schools.com/python/pandas/ref_df_interpolate.asp" ] }, { "cell_type": "markdown", "id": "84a79d76", "metadata": {}, "source": [ "## 4. save data as csv file" ] }, { "cell_type": "code", "execution_count": 40, "id": "d516923f", "metadata": {}, "outputs": [], "source": [ "df_new.to_csv('./data/gdp_china_mis_cl.csv',index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.9" } }, "nbformat": 4, "nbformat_minor": 5 }