{
"cells": [
{
"cell_type": "markdown",
"id": "852c361d",
"metadata": {},
"source": [
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"\n",
"Imputing Missing Values of DataFrame
\n",
"Shouke Wei, Ph.D. Professor
\n",
"Email: shouke.wei@gmail.com
"
]
},
{
"cell_type": "markdown",
"id": "4127aa60",
"metadata": {},
"source": [
"## Objective\n",
"- learn how to drop, impute or fill the missing values of the dataframe"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "1d8f5194",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['prov', 'gdpr', 'year', 'gdp', 'pop', 'finv', 'trade', 'fexpen',\n",
" 'uinc'],\n",
" dtype='object')"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import required packages\n",
"import pandas as pd\n",
"\n",
"# read data \n",
"df = pd.read_csv('./data/gdp_china_renamed.csv')\n",
"\n",
"# display names of the columns\n",
"df.columns"
]
},
{
"cell_type": "markdown",
"id": "b45eba63",
"metadata": {},
"source": [
"## 1. Drop missing values"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b85bd1b6",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df_new = df.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "fa6a6895",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.isna().sum().sum()"
]
},
{
"cell_type": "markdown",
"id": "c4f157d8",
"metadata": {},
"source": [
"## 2. Fill missing values"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "5b3e93ac",
"metadata": {},
"outputs": [],
"source": [
"idx_l = [3,4,5,22,23,24]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "39754d36",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pop | \n",
"
\n",
" \n",
" \n",
" \n",
" | 3 | \n",
" 8.963 | \n",
"
\n",
" \n",
" | 4 | \n",
" NaN | \n",
"
\n",
" \n",
" | 5 | \n",
" 9.194 | \n",
"
\n",
" \n",
" | 22 | \n",
" 7.458 | \n",
"
\n",
" \n",
" | 23 | \n",
" NaN | \n",
"
\n",
" \n",
" | 24 | \n",
" 7.588 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pop\n",
"3 8.963\n",
"4 NaN\n",
"5 9.194\n",
"22 7.458\n",
"23 NaN\n",
"24 7.588"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[idx_l,['pop']]"
]
},
{
"cell_type": "markdown",
"id": "f47c8120",
"metadata": {},
"source": [
"### (1) fill missing values with the mean value(s)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "bd2b530c",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"df_num = df.drop(['prov','gdpr'],axis=1)\n",
"df_new = df_num.fillna(df_num.mean())"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "52d65d3f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pop | \n",
"
\n",
" \n",
" \n",
" \n",
" | 3 | \n",
" 8.963000 | \n",
"
\n",
" \n",
" | 4 | \n",
" 8.321032 | \n",
"
\n",
" \n",
" | 5 | \n",
" 9.194000 | \n",
"
\n",
" \n",
" | 22 | \n",
" 7.458000 | \n",
"
\n",
" \n",
" | 23 | \n",
" 8.321032 | \n",
"
\n",
" \n",
" | 24 | \n",
" 7.588000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pop\n",
"3 8.963000\n",
"4 8.321032\n",
"5 9.194000\n",
"22 7.458000\n",
"23 8.321032\n",
"24 7.588000"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.loc[idx_l,['pop']]"
]
},
{
"cell_type": "markdown",
"id": "fd908c8d",
"metadata": {},
"source": [
"### (2) fill with build-in methods"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "96b02478",
"metadata": {},
"outputs": [],
"source": [
"# foward fill\n",
"df_new = df.fillna(method='ffill')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "ecfde378",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pop | \n",
"
\n",
" \n",
" \n",
" \n",
" | 3 | \n",
" 8.963 | \n",
"
\n",
" \n",
" | 4 | \n",
" 8.963 | \n",
"
\n",
" \n",
" | 5 | \n",
" 9.194 | \n",
"
\n",
" \n",
" | 22 | \n",
" 7.458 | \n",
"
\n",
" \n",
" | 23 | \n",
" 7.458 | \n",
"
\n",
" \n",
" | 24 | \n",
" 7.588 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pop\n",
"3 8.963\n",
"4 8.963\n",
"5 9.194\n",
"22 7.458\n",
"23 7.458\n",
"24 7.588"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.loc[idx_l,['pop']]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "10ed2875",
"metadata": {},
"outputs": [],
"source": [
"# backward fill\n",
"df_new = df.fillna(method='bfill')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "04ba5525",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pop | \n",
"
\n",
" \n",
" \n",
" \n",
" | 3 | \n",
" 8.963 | \n",
"
\n",
" \n",
" | 4 | \n",
" 9.194 | \n",
"
\n",
" \n",
" | 5 | \n",
" 9.194 | \n",
"
\n",
" \n",
" | 22 | \n",
" 7.458 | \n",
"
\n",
" \n",
" | 23 | \n",
" 7.588 | \n",
"
\n",
" \n",
" | 24 | \n",
" 7.588 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pop\n",
"3 8.963\n",
"4 9.194\n",
"5 9.194\n",
"22 7.458\n",
"23 7.588\n",
"24 7.588"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.loc[idx_l,['pop']]"
]
},
{
"cell_type": "markdown",
"id": "23e9708f",
"metadata": {},
"source": [
"Refer to: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html"
]
},
{
"cell_type": "markdown",
"id": "780eebb6",
"metadata": {},
"source": [
"### 3. fill with interpolate"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "c510d918",
"metadata": {},
"outputs": [],
"source": [
"# default is linear\n",
"df_new =df.interpolate() # method='linear'"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "6df1a71c",
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pop | \n",
"
\n",
" \n",
" \n",
" \n",
" | 3 | \n",
" 8.9630 | \n",
"
\n",
" \n",
" | 4 | \n",
" 9.0785 | \n",
"
\n",
" \n",
" | 5 | \n",
" 9.1940 | \n",
"
\n",
" \n",
" | 22 | \n",
" 7.4580 | \n",
"
\n",
" \n",
" | 23 | \n",
" 7.5230 | \n",
"
\n",
" \n",
" | 24 | \n",
" 7.5880 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pop\n",
"3 8.9630\n",
"4 9.0785\n",
"5 9.1940\n",
"22 7.4580\n",
"23 7.5230\n",
"24 7.5880"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.loc[idx_l,['pop']]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "835bb2a5",
"metadata": {},
"outputs": [],
"source": [
"# polynomial method\n",
"df_new =df.interpolate(method='polynomial',order=2)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "2586dc53",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pop | \n",
"
\n",
" \n",
" \n",
" \n",
" | 3 | \n",
" 8.963000 | \n",
"
\n",
" \n",
" | 4 | \n",
" 9.054257 | \n",
"
\n",
" \n",
" | 5 | \n",
" 9.194000 | \n",
"
\n",
" \n",
" | 22 | \n",
" 7.458000 | \n",
"
\n",
" \n",
" | 23 | \n",
" 7.522919 | \n",
"
\n",
" \n",
" | 24 | \n",
" 7.588000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pop\n",
"3 8.963000\n",
"4 9.054257\n",
"5 9.194000\n",
"22 7.458000\n",
"23 7.522919\n",
"24 7.588000"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.loc[idx_l,['pop']]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "b1ff778f",
"metadata": {},
"outputs": [],
"source": [
"# cubicspline\n",
"df_new =df.interpolate(method='cubicspline',order=2)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "d00ed97d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pop | \n",
"
\n",
" \n",
" \n",
" \n",
" | 3 | \n",
" 8.963000 | \n",
"
\n",
" \n",
" | 4 | \n",
" 9.052298 | \n",
"
\n",
" \n",
" | 5 | \n",
" 9.194000 | \n",
"
\n",
" \n",
" | 22 | \n",
" 7.458000 | \n",
"
\n",
" \n",
" | 23 | \n",
" 7.532276 | \n",
"
\n",
" \n",
" | 24 | \n",
" 7.588000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pop\n",
"3 8.963000\n",
"4 9.052298\n",
"5 9.194000\n",
"22 7.458000\n",
"23 7.532276\n",
"24 7.588000"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.loc[idx_l,['pop']]"
]
},
{
"cell_type": "markdown",
"id": "cb6b1e70",
"metadata": {},
"source": [
"Refer to:\n",
"https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html\n",
"\n",
"https://www.w3schools.com/python/pandas/ref_df_interpolate.asp"
]
},
{
"cell_type": "markdown",
"id": "84a79d76",
"metadata": {},
"source": [
"## 4. save data as csv file"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "d516923f",
"metadata": {},
"outputs": [],
"source": [
"df_new.to_csv('./data/gdp_china_mis_cl.csv',index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}