{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "852c361d",
   "metadata": {},
   "source": [
    "<img src=\"./images/shouke_logo.png\"\n",
    "     style=\"float: right\"\n",
    "     width=100\n",
    "     style=\"padding-bottom:100px;\"/>\n",
    "<br>\n",
    "<br>\n",
    "\n",
    "<table style=\"float:center;\">\n",
    "    <tr>\n",
    "        <td>\n",
    "            <img src='./images/python-logo.png'width=130>\n",
    "        </td>\n",
    "        <td>\n",
    "            <img src='./images/pandas-logo.png'width=150>\n",
    "        </td>\n",
    "    </tr>\n",
    "</table>\n",
    "\n",
    "<h1 style='text-align: center;'>Imputing Missing Values of DataFrame</h1>\n",
    "<h3 style='text-align: center;'>Shouke Wei, Ph.D. Professor</h3>\n",
    "<h4 style='text-align: center;'>Email: shouke.wei@gmail.com</h4>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4127aa60",
   "metadata": {},
   "source": [
    "## Objective\n",
    "- learn how to drop, impute or fill the missing values of the dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "1d8f5194",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['prov', 'gdpr', 'year', 'gdp', 'pop', 'finv', 'trade', 'fexpen',\n",
       "       'uinc'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# import required packages\n",
    "import pandas as pd\n",
    "\n",
    "# read data \n",
    "df = pd.read_csv('./data/gdp_china_renamed.csv')\n",
    "\n",
    "# display names of the columns\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b45eba63",
   "metadata": {},
   "source": [
    "## 1. Drop missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "b85bd1b6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_new = df.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "fa6a6895",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_new.isna().sum().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4f157d8",
   "metadata": {},
   "source": [
    "## 2. Fill missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "5b3e93ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "idx_l = [3,4,5,22,23,24]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "39754d36",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pop</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8.963</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>9.194</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>7.458</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>7.588</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      pop\n",
       "3   8.963\n",
       "4     NaN\n",
       "5   9.194\n",
       "22  7.458\n",
       "23    NaN\n",
       "24  7.588"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[idx_l,['pop']]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f47c8120",
   "metadata": {},
   "source": [
    "### (1) fill missing values with the mean value(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "bd2b530c",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "df_num = df.drop(['prov','gdpr'],axis=1)\n",
    "df_new = df_num.fillna(df_num.mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "52d65d3f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pop</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8.963000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8.321032</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>9.194000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>7.458000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>8.321032</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>7.588000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         pop\n",
       "3   8.963000\n",
       "4   8.321032\n",
       "5   9.194000\n",
       "22  7.458000\n",
       "23  8.321032\n",
       "24  7.588000"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_new.loc[idx_l,['pop']]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd908c8d",
   "metadata": {},
   "source": [
    "### (2) fill with build-in methods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "96b02478",
   "metadata": {},
   "outputs": [],
   "source": [
    "# foward fill\n",
    "df_new = df.fillna(method='ffill')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "ecfde378",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pop</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8.963</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8.963</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>9.194</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>7.458</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>7.458</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>7.588</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      pop\n",
       "3   8.963\n",
       "4   8.963\n",
       "5   9.194\n",
       "22  7.458\n",
       "23  7.458\n",
       "24  7.588"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_new.loc[idx_l,['pop']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "10ed2875",
   "metadata": {},
   "outputs": [],
   "source": [
    "# backward fill\n",
    "df_new = df.fillna(method='bfill')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "04ba5525",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pop</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8.963</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9.194</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>9.194</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>7.458</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>7.588</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>7.588</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      pop\n",
       "3   8.963\n",
       "4   9.194\n",
       "5   9.194\n",
       "22  7.458\n",
       "23  7.588\n",
       "24  7.588"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_new.loc[idx_l,['pop']]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "23e9708f",
   "metadata": {},
   "source": [
    "Refer to: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "780eebb6",
   "metadata": {},
   "source": [
    "### 3. fill with interpolate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "c510d918",
   "metadata": {},
   "outputs": [],
   "source": [
    "# default is linear\n",
    "df_new =df.interpolate() # method='linear'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "6df1a71c",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pop</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8.9630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9.0785</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>9.1940</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>7.4580</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>7.5230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>7.5880</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       pop\n",
       "3   8.9630\n",
       "4   9.0785\n",
       "5   9.1940\n",
       "22  7.4580\n",
       "23  7.5230\n",
       "24  7.5880"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_new.loc[idx_l,['pop']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "835bb2a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# polynomial method\n",
    "df_new =df.interpolate(method='polynomial',order=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "2586dc53",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pop</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8.963000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9.054257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>9.194000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>7.458000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>7.522919</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>7.588000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         pop\n",
       "3   8.963000\n",
       "4   9.054257\n",
       "5   9.194000\n",
       "22  7.458000\n",
       "23  7.522919\n",
       "24  7.588000"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_new.loc[idx_l,['pop']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "b1ff778f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# cubicspline\n",
    "df_new =df.interpolate(method='cubicspline',order=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "d00ed97d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pop</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8.963000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9.052298</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>9.194000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>7.458000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>7.532276</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>7.588000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         pop\n",
       "3   8.963000\n",
       "4   9.052298\n",
       "5   9.194000\n",
       "22  7.458000\n",
       "23  7.532276\n",
       "24  7.588000"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_new.loc[idx_l,['pop']]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cb6b1e70",
   "metadata": {},
   "source": [
    "Refer to:\n",
    "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html\n",
    "\n",
    "https://www.w3schools.com/python/pandas/ref_df_interpolate.asp"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "84a79d76",
   "metadata": {},
   "source": [
    "## 4. save data as csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "d516923f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_new.to_csv('./data/gdp_china_mis_cl.csv',index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}