{
"cells": [
{
"cell_type": "markdown",
"id": "08a1d994",
"metadata": {},
"source": [
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" | \n",
" \n",
" \n",
" | \n",
" \n",
" \n",
" | \n",
"
\n",
"
\n",
"\n",
"Normalizing the Data
\n",
"Shouke Wei, Ph.D. Professor
\n",
"Email: shouke.wei@gmail.com
"
]
},
{
"cell_type": "markdown",
"id": "f8ae5c5d",
"metadata": {},
"source": [
"## Objective \n",
"- learn how to normalize the features, save and load the normalization scaler for new data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "94abd9b5",
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" year | \n",
" gdp | \n",
" pop | \n",
" finv | \n",
" trade | \n",
" fexpen | \n",
" uinc | \n",
" prov_hn | \n",
" prov_js | \n",
" prov_sd | \n",
" prov_zj | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 2000 | \n",
" 1.074125 | \n",
" 8.650000 | \n",
" 0.314513 | \n",
" 1.408147 | \n",
" 0.108032 | \n",
" 0.976157 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2001 | \n",
" 1.203925 | \n",
" 8.733000 | \n",
" 0.348443 | \n",
" 1.501391 | \n",
" 0.132133 | \n",
" 1.041519 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2002 | \n",
" 1.350242 | \n",
" 8.842000 | \n",
" 0.385078 | \n",
" 1.830169 | \n",
" 0.152108 | \n",
" 1.113720 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2003 | \n",
" 1.584464 | \n",
" 8.963000 | \n",
" 0.481320 | \n",
" 2.346735 | \n",
" 0.169563 | \n",
" 1.238043 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 2004 | \n",
" 1.886462 | \n",
" 9.052298 | \n",
" 0.587002 | \n",
" 2.955899 | \n",
" 0.185295 | \n",
" 1.362765 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" year gdp pop finv trade fexpen uinc prov_hn \\\n",
"0 2000 1.074125 8.650000 0.314513 1.408147 0.108032 0.976157 0.0 \n",
"1 2001 1.203925 8.733000 0.348443 1.501391 0.132133 1.041519 0.0 \n",
"2 2002 1.350242 8.842000 0.385078 1.830169 0.152108 1.113720 0.0 \n",
"3 2003 1.584464 8.963000 0.481320 2.346735 0.169563 1.238043 0.0 \n",
"4 2004 1.886462 9.052298 0.587002 2.955899 0.185295 1.362765 0.0 \n",
"\n",
" prov_js prov_sd prov_zj \n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import required packages\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# read data\n",
"df = pd.read_csv('./data/gdp_china_encoded.csv')\n",
"\n",
"# show the first 5 rows\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "c46350e0",
"metadata": {},
"source": [
"### Slice data into features X and target y"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4fadde18",
"metadata": {},
"outputs": [],
"source": [
"X = df.drop(['gdp'],axis=1)\n",
"y = df['gdp']"
]
},
{
"cell_type": "markdown",
"id": "913af9d8",
"metadata": {},
"source": [
"### Split train and test data"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "06608307",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.30, random_state=1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7fbd1ef3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" year | \n",
" pop | \n",
" finv | \n",
" trade | \n",
" fexpen | \n",
" uinc | \n",
" prov_hn | \n",
" prov_js | \n",
" prov_sd | \n",
" prov_zj | \n",
"
\n",
" \n",
" \n",
" \n",
" | 66 | \n",
" 2009 | \n",
" 5.276 | \n",
" 1.074232 | \n",
" 1.282390 | \n",
" 0.265335 | \n",
" 2.461081 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 54 | \n",
" 2016 | \n",
" 9.947 | \n",
" 5.332294 | \n",
" 1.547657 | \n",
" 0.875521 | \n",
" 3.401208 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 36 | \n",
" 2017 | \n",
" 7.656 | \n",
" 5.327700 | \n",
" 3.999750 | \n",
" 1.062103 | \n",
" 4.362180 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 45 | \n",
" 2007 | \n",
" 9.367 | \n",
" 1.253770 | \n",
" 0.931296 | \n",
" 0.226185 | \n",
" 1.426470 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 52 | \n",
" 2014 | \n",
" 9.789 | \n",
" 4.249555 | \n",
" 1.701122 | \n",
" 0.717731 | \n",
" 2.922194 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 75 | \n",
" 2018 | \n",
" 5.155 | \n",
" 3.169770 | \n",
" 2.851160 | \n",
" 0.862953 | \n",
" 5.557430 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 9 | \n",
" 2009 | \n",
" 10.130 | \n",
" 1.293312 | \n",
" 4.174383 | \n",
" 0.433437 | \n",
" 2.157472 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 72 | \n",
" 2015 | \n",
" 5.539 | \n",
" 2.732332 | \n",
" 2.159908 | \n",
" 0.664598 | \n",
" 4.371448 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 12 | \n",
" 2012 | \n",
" 10.594 | \n",
" 1.875150 | \n",
" 6.211629 | \n",
" 0.738786 | \n",
" 3.022671 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 37 | \n",
" 2018 | \n",
" 7.723 | \n",
" 5.327680 | \n",
" 4.379350 | \n",
" 1.165735 | \n",
" 4.720000 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
66 rows × 10 columns
\n",
"
"
],
"text/plain": [
" year pop finv trade fexpen uinc prov_hn prov_js \\\n",
"66 2009 5.276 1.074232 1.282390 0.265335 2.461081 0.0 0.0 \n",
"54 2016 9.947 5.332294 1.547657 0.875521 3.401208 0.0 0.0 \n",
"36 2017 7.656 5.327700 3.999750 1.062103 4.362180 0.0 1.0 \n",
"45 2007 9.367 1.253770 0.931296 0.226185 1.426470 0.0 0.0 \n",
"52 2014 9.789 4.249555 1.701122 0.717731 2.922194 0.0 0.0 \n",
".. ... ... ... ... ... ... ... ... \n",
"75 2018 5.155 3.169770 2.851160 0.862953 5.557430 0.0 0.0 \n",
"9 2009 10.130 1.293312 4.174383 0.433437 2.157472 0.0 0.0 \n",
"72 2015 5.539 2.732332 2.159908 0.664598 4.371448 0.0 0.0 \n",
"12 2012 10.594 1.875150 6.211629 0.738786 3.022671 0.0 0.0 \n",
"37 2018 7.723 5.327680 4.379350 1.165735 4.720000 0.0 1.0 \n",
"\n",
" prov_sd prov_zj \n",
"66 0.0 1.0 \n",
"54 1.0 0.0 \n",
"36 0.0 0.0 \n",
"45 1.0 0.0 \n",
"52 1.0 0.0 \n",
".. ... ... \n",
"75 0.0 1.0 \n",
"9 0.0 0.0 \n",
"72 0.0 1.0 \n",
"12 0.0 0.0 \n",
"37 0.0 0.0 \n",
"\n",
"[66 rows x 10 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train"
]
},
{
"cell_type": "markdown",
"id": "4dbaa7bf",
"metadata": {},
"source": [
"## 1. Normalization and Standardization"
]
},
{
"cell_type": "markdown",
"id": "337e30a2",
"metadata": {},
"source": [
"The terms standardize and normalize are used interchangeably in data preprocessing, although in statistics, the latter term also has other connotations.\n",
"\n",
"The process of normalization involves transforming the data to a smaller or common range such as [−1,1] or [0, 1]."
]
},
{
"cell_type": "markdown",
"id": "9950ac4c",
"metadata": {},
"source": [
"## 2. Why data normalization?\n",
"\n",
"Normalization:\n",
" \n",
"- gives all attributes an equal weight\n",
"- avoids dependence on the measurement units\n",
"- particularly useful for machine learning training or\n",
"- helps speed up the learning phase\n",
"\n",
"In a linear regression model, it can help too though it is not necessary."
]
},
{
"cell_type": "markdown",
"id": "d4099149",
"metadata": {},
"source": [
"## 3. Methods for data normalization\n",
"#### Min-max normalization: \n",
"$$x'=\\frac{x - min(x)}{max(x) - min(x)}$$\n",
"\n",
"$$x'=\\frac{x - min(x)}{max(x) - min(x)}(new\\_max(x)-new\\_min(x)) + new\\_min(x)$$"
]
},
{
"cell_type": "markdown",
"id": "4942006e",
"metadata": {},
"source": [
"#### Mean normalization\n",
"\n",
"$$x'=\\frac{x - mean(x)}{max(x) - min(x)}$$ \n",
"\n",
"#### Z-score normalization / Standardization\n",
"\n",
"$$x'=\\frac{x - \\mu}{\\sigma}$$\n",
"\n",
"$$μ: \\text{the mean of the variable,}$$\n",
"$$σ: \\text{is the standard deviation of the variable.}$$\n",
"\n",
"#### Scaling to unit length\n",
"\n",
"$$x'=\\frac{x}{||x||}$$\n",
"$$||x||: \\text{the Euclidean length of the variable}.$$\n",
"\n",
"#### Decimal scaling \n",
"$$x'=\\frac{x}{10^j}$$\n",
"\n",
"$$ j: \\text{the smallest integer such that max(|x'|)<1}$$"
]
},
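{
"cell_type": "markdown",
"id": "a3f2b7c1",
"metadata": {},
"source": [
"The short sketch below applies the formulas above to a small made-up pandas Series, purely to illustrate how each normalization method behaves. The values are hypothetical and are not part of the GDP dataset used in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8d4e6f2",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# a small hypothetical series to illustrate the formulas above\n",
"x = pd.Series([2.0, 4.0, 6.0, 8.0, 10.0])\n",
"\n",
"# min-max normalization to [0, 1]\n",
"x_minmax = (x - x.min()) / (x.max() - x.min())\n",
"\n",
"# mean normalization\n",
"x_mean = (x - x.mean()) / (x.max() - x.min())\n",
"\n",
"# z-score normalization (pandas uses the sample standard deviation by default)\n",
"x_zscore = (x - x.mean()) / x.std()\n",
"\n",
"# scaling to unit length (divide by the Euclidean norm)\n",
"x_unit = x / np.sqrt((x ** 2).sum())\n",
"\n",
"# decimal scaling: j is the smallest integer such that max(|x'|) < 1\n",
"j = int(np.floor(np.log10(x.abs().max()))) + 1\n",
"x_decimal = x / 10 ** j\n",
"\n",
"pd.DataFrame({'minmax': x_minmax, 'mean': x_mean, 'zscore': x_zscore,\n",
"              'unit': x_unit, 'decimal': x_decimal})"
]
},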
{
"cell_type": "markdown",
"id": "d1d00a4d",
"metadata": {},
"source": [
"## 4. Sklearn built-in methods for data normalization"
]
},
{
"cell_type": "markdown",
"id": "66f5d61e",
"metadata": {},
"source": [
"### (1) MinMaxScaler\n",
"- Transform features by scaling each feature to a given range\n",
"\n",
"### (2) MaxAbsScaler \n",
"- Scale each feature by its maximum absolute value [-1, 1] by dividing through the largest maximum value\n",
"\n",
"### (3) RobustScaler\n",
"- Scale features using statistics that are robust to outliers.It subtracts the column median and divides by the interquartile range.\n",
"\n",
"### (4) StandardScaler\n",
"- StandardScaler scales each column to have 0 mean and unit variance.\n",
"\n",
"### (5) Normalizer\n",
"Normalize samples individually to unit norm. The normalizer operates on the rows rather than the columns. It applies l2 normalization by default.\n",
"\n",
"Reference: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing"
]
},
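{
"cell_type": "markdown",
"id": "c5a7e9d3",
"metadata": {},
"source": [
"As a rough comparison of the scalers listed above, the sketch below fits each of them on a tiny made-up array. The toy values are hypothetical and chosen only to show how the transformed outputs differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6b8f0e4",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.preprocessing import (MinMaxScaler, MaxAbsScaler, RobustScaler,\n",
"                                   StandardScaler, Normalizer)\n",
"\n",
"# a tiny made-up matrix: 3 samples (rows) x 2 features (columns)\n",
"toy = np.array([[1.0, 10.0],\n",
"                [2.0, 20.0],\n",
"                [3.0, 60.0]])\n",
"\n",
"# fit each scaler on the toy data and print the transformed result\n",
"for scaler in (MinMaxScaler(), MaxAbsScaler(), RobustScaler(),\n",
"               StandardScaler(), Normalizer()):\n",
"    print(type(scaler).__name__)\n",
"    print(scaler.fit_transform(toy))\n",
"    print()"
]
},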
{
"cell_type": "markdown",
"id": "bc734084",
"metadata": {},
"source": [
"### `MinMaxScaler` Example:"
]
},
{
"cell_type": "markdown",
"id": "830de60c",
"metadata": {},
"source": [
"#### (1) Normaliz the trainning dataset"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "8d211be1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" year | \n",
" pop | \n",
" finv | \n",
" trade | \n",
" fexpen | \n",
" uinc | \n",
"
\n",
" \n",
" \n",
" \n",
" | 66 | \n",
" 2009 | \n",
" 5.276 | \n",
" 1.074232 | \n",
" 1.282390 | \n",
" 0.265335 | \n",
" 2.461081 | \n",
"
\n",
" \n",
" | 54 | \n",
" 2016 | \n",
" 9.947 | \n",
" 5.332294 | \n",
" 1.547657 | \n",
" 0.875521 | \n",
" 3.401208 | \n",
"
\n",
" \n",
" | 36 | \n",
" 2017 | \n",
" 7.656 | \n",
" 5.327700 | \n",
" 3.999750 | \n",
" 1.062103 | \n",
" 4.362180 | \n",
"
\n",
" \n",
" | 45 | \n",
" 2007 | \n",
" 9.367 | \n",
" 1.253770 | \n",
" 0.931296 | \n",
" 0.226185 | \n",
" 1.426470 | \n",
"
\n",
" \n",
" | 52 | \n",
" 2014 | \n",
" 9.789 | \n",
" 4.249555 | \n",
" 1.701122 | \n",
" 0.717731 | \n",
" 2.922194 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 75 | \n",
" 2018 | \n",
" 5.155 | \n",
" 3.169770 | \n",
" 2.851160 | \n",
" 0.862953 | \n",
" 5.557430 | \n",
"
\n",
" \n",
" | 9 | \n",
" 2009 | \n",
" 10.130 | \n",
" 1.293312 | \n",
" 4.174383 | \n",
" 0.433437 | \n",
" 2.157472 | \n",
"
\n",
" \n",
" | 72 | \n",
" 2015 | \n",
" 5.539 | \n",
" 2.732332 | \n",
" 2.159908 | \n",
" 0.664598 | \n",
" 4.371448 | \n",
"
\n",
" \n",
" | 12 | \n",
" 2012 | \n",
" 10.594 | \n",
" 1.875150 | \n",
" 6.211629 | \n",
" 0.738786 | \n",
" 3.022671 | \n",
"
\n",
" \n",
" | 37 | \n",
" 2018 | \n",
" 7.723 | \n",
" 5.327680 | \n",
" 4.379350 | \n",
" 1.165735 | \n",
" 4.720000 | \n",
"
\n",
" \n",
"
\n",
"
66 rows × 6 columns
\n",
"
"
],
"text/plain": [
" year pop finv trade fexpen uinc\n",
"66 2009 5.276 1.074232 1.282390 0.265335 2.461081\n",
"54 2016 9.947 5.332294 1.547657 0.875521 3.401208\n",
"36 2017 7.656 5.327700 3.999750 1.062103 4.362180\n",
"45 2007 9.367 1.253770 0.931296 0.226185 1.426470\n",
"52 2014 9.789 4.249555 1.701122 0.717731 2.922194\n",
".. ... ... ... ... ... ...\n",
"75 2018 5.155 3.169770 2.851160 0.862953 5.557430\n",
"9 2009 10.130 1.293312 4.174383 0.433437 2.157472\n",
"72 2015 5.539 2.732332 2.159908 0.664598 4.371448\n",
"12 2012 10.594 1.875150 6.211629 0.738786 3.022671\n",
"37 2018 7.723 5.327680 4.379350 1.165735 4.720000\n",
"\n",
"[66 rows x 6 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# slice the continous features from the training data\n",
"X_train_continuous = X_train.loc[:,'year':'uinc']\n",
"X_train_continuous"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "329a6000",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" year | \n",
" pop | \n",
" finv | \n",
" trade | \n",
" fexpen | \n",
" uinc | \n",
"
\n",
" \n",
" \n",
" \n",
" | 66 | \n",
" 0.500000 | \n",
" 0.094319 | \n",
" 0.173947 | \n",
" 0.176927 | \n",
" 0.145251 | \n",
" 0.390579 | \n",
"
\n",
" \n",
" | 54 | \n",
" 0.888889 | \n",
" 0.833518 | \n",
" 0.964883 | \n",
" 0.214073 | \n",
" 0.544119 | \n",
" 0.575614 | \n",
"
\n",
" \n",
" | 36 | \n",
" 0.944444 | \n",
" 0.470961 | \n",
" 0.964029 | \n",
" 0.557440 | \n",
" 0.666084 | \n",
" 0.764752 | \n",
"
\n",
" \n",
" | 45 | \n",
" 0.388889 | \n",
" 0.741731 | \n",
" 0.207296 | \n",
" 0.127763 | \n",
" 0.119660 | \n",
" 0.186948 | \n",
"
\n",
" \n",
" | 52 | \n",
" 0.777778 | \n",
" 0.808514 | \n",
" 0.763764 | \n",
" 0.235562 | \n",
" 0.440974 | \n",
" 0.481335 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 75 | \n",
" 1.000000 | \n",
" 0.075170 | \n",
" 0.563194 | \n",
" 0.396602 | \n",
" 0.535903 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" | 9 | \n",
" 0.500000 | \n",
" 0.862478 | \n",
" 0.214641 | \n",
" 0.581894 | \n",
" 0.255137 | \n",
" 0.330823 | \n",
"
\n",
" \n",
" | 72 | \n",
" 0.833333 | \n",
" 0.135939 | \n",
" 0.481940 | \n",
" 0.299806 | \n",
" 0.406242 | \n",
" 0.766576 | \n",
"
\n",
" \n",
" | 12 | \n",
" 0.666667 | \n",
" 0.935908 | \n",
" 0.322718 | \n",
" 0.867170 | \n",
" 0.454738 | \n",
" 0.501111 | \n",
"
\n",
" \n",
" | 37 | \n",
" 1.000000 | \n",
" 0.481564 | \n",
" 0.964026 | \n",
" 0.610595 | \n",
" 0.733827 | \n",
" 0.835178 | \n",
"
\n",
" \n",
"
\n",
"
66 rows × 6 columns
\n",
"
"
],
"text/plain": [
" year pop finv trade fexpen uinc\n",
"66 0.500000 0.094319 0.173947 0.176927 0.145251 0.390579\n",
"54 0.888889 0.833518 0.964883 0.214073 0.544119 0.575614\n",
"36 0.944444 0.470961 0.964029 0.557440 0.666084 0.764752\n",
"45 0.388889 0.741731 0.207296 0.127763 0.119660 0.186948\n",
"52 0.777778 0.808514 0.763764 0.235562 0.440974 0.481335\n",
".. ... ... ... ... ... ...\n",
"75 1.000000 0.075170 0.563194 0.396602 0.535903 1.000000\n",
"9 0.500000 0.862478 0.214641 0.581894 0.255137 0.330823\n",
"72 0.833333 0.135939 0.481940 0.299806 0.406242 0.766576\n",
"12 0.666667 0.935908 0.322718 0.867170 0.454738 0.501111\n",
"37 1.000000 0.481564 0.964026 0.610595 0.733827 0.835178\n",
"\n",
"[66 rows x 6 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# to learn the underlying parameters of the scaler from the training data-set\n",
"min_max_scaler = MinMaxScaler().fit(X_train_continuous)\n",
"\n",
"# transform the training data-set to range [0,1]\n",
"X_train_continuous_scaled = min_max_scaler.transform(X_train_continuous)\n",
"\n",
"# convert it into dataframe\n",
"X_train_continuous_scaled = pd.DataFrame(X_train_continuous_scaled,index=X_train_continuous.index,\n",
" columns=X_train_continuous.columns)\n",
"X_train_continuous_scaled"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "96d85fb7",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" year | \n",
" pop | \n",
" finv | \n",
" trade | \n",
" fexpen | \n",
" uinc | \n",
" prov_hn | \n",
" prov_js | \n",
" prov_sd | \n",
" prov_zj | \n",
"
\n",
" \n",
" \n",
" \n",
" | 66 | \n",
" 0.500000 | \n",
" 0.094319 | \n",
" 0.173947 | \n",
" 0.176927 | \n",
" 0.145251 | \n",
" 0.390579 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 54 | \n",
" 0.888889 | \n",
" 0.833518 | \n",
" 0.964883 | \n",
" 0.214073 | \n",
" 0.544119 | \n",
" 0.575614 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 36 | \n",
" 0.944444 | \n",
" 0.470961 | \n",
" 0.964029 | \n",
" 0.557440 | \n",
" 0.666084 | \n",
" 0.764752 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 45 | \n",
" 0.388889 | \n",
" 0.741731 | \n",
" 0.207296 | \n",
" 0.127763 | \n",
" 0.119660 | \n",
" 0.186948 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 52 | \n",
" 0.777778 | \n",
" 0.808514 | \n",
" 0.763764 | \n",
" 0.235562 | \n",
" 0.440974 | \n",
" 0.481335 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 75 | \n",
" 1.000000 | \n",
" 0.075170 | \n",
" 0.563194 | \n",
" 0.396602 | \n",
" 0.535903 | \n",
" 1.000000 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 9 | \n",
" 0.500000 | \n",
" 0.862478 | \n",
" 0.214641 | \n",
" 0.581894 | \n",
" 0.255137 | \n",
" 0.330823 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 72 | \n",
" 0.833333 | \n",
" 0.135939 | \n",
" 0.481940 | \n",
" 0.299806 | \n",
" 0.406242 | \n",
" 0.766576 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 12 | \n",
" 0.666667 | \n",
" 0.935908 | \n",
" 0.322718 | \n",
" 0.867170 | \n",
" 0.454738 | \n",
" 0.501111 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 37 | \n",
" 1.000000 | \n",
" 0.481564 | \n",
" 0.964026 | \n",
" 0.610595 | \n",
" 0.733827 | \n",
" 0.835178 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
66 rows × 10 columns
\n",
"
"
],
"text/plain": [
" year pop finv trade fexpen uinc prov_hn \\\n",
"66 0.500000 0.094319 0.173947 0.176927 0.145251 0.390579 0.0 \n",
"54 0.888889 0.833518 0.964883 0.214073 0.544119 0.575614 0.0 \n",
"36 0.944444 0.470961 0.964029 0.557440 0.666084 0.764752 0.0 \n",
"45 0.388889 0.741731 0.207296 0.127763 0.119660 0.186948 0.0 \n",
"52 0.777778 0.808514 0.763764 0.235562 0.440974 0.481335 0.0 \n",
".. ... ... ... ... ... ... ... \n",
"75 1.000000 0.075170 0.563194 0.396602 0.535903 1.000000 0.0 \n",
"9 0.500000 0.862478 0.214641 0.581894 0.255137 0.330823 0.0 \n",
"72 0.833333 0.135939 0.481940 0.299806 0.406242 0.766576 0.0 \n",
"12 0.666667 0.935908 0.322718 0.867170 0.454738 0.501111 0.0 \n",
"37 1.000000 0.481564 0.964026 0.610595 0.733827 0.835178 0.0 \n",
"\n",
" prov_js prov_sd prov_zj \n",
"66 0.0 0.0 1.0 \n",
"54 0.0 1.0 0.0 \n",
"36 1.0 0.0 0.0 \n",
"45 0.0 1.0 0.0 \n",
"52 0.0 1.0 0.0 \n",
".. ... ... ... \n",
"75 0.0 0.0 1.0 \n",
"9 0.0 0.0 0.0 \n",
"72 0.0 0.0 1.0 \n",
"12 0.0 0.0 0.0 \n",
"37 1.0 0.0 0.0 \n",
"\n",
"[66 rows x 10 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# diplay the full scaled train dataset \n",
"X_train_scaled = X_train.copy()\n",
"X_train_scaled.loc[:,'year':'uinc'] = X_train_continuous_scaled\n",
"X_train_scaled"
]
},
{
"cell_type": "markdown",
"id": "c61f9138",
"metadata": {},
"source": [
"#### (2) Normaliz the testing dataset"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "965240a1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" year | \n",
" pop | \n",
" finv | \n",
" trade | \n",
" fexpen | \n",
" uinc | \n",
"
\n",
" \n",
" \n",
" \n",
" | 40 | \n",
" 0.111111 | \n",
" 0.696629 | \n",
" 0.039111 | \n",
" 0.036688 | \n",
" 0.028066 | \n",
" 0.056056 | \n",
"
\n",
" \n",
" | 31 | \n",
" 0.666667 | \n",
" 0.512739 | \n",
" 0.547526 | \n",
" 0.481719 | \n",
" 0.431193 | \n",
" 0.490291 | \n",
"
\n",
" \n",
" | 46 | \n",
" 0.444444 | \n",
" 0.749644 | \n",
" 0.261131 | \n",
" 0.151409 | \n",
" 0.148605 | \n",
" 0.227113 | \n",
"
\n",
" \n",
" | 58 | \n",
" 0.055556 | \n",
" 0.007754 | \n",
" 0.027068 | \n",
" 0.035371 | \n",
" 0.010851 | \n",
" 0.112156 | \n",
"
\n",
" \n",
" | 77 | \n",
" 0.055556 | \n",
" 0.771483 | \n",
" 0.003089 | \n",
" 0.000578 | \n",
" 0.005052 | \n",
" 0.009864 | \n",
"
\n",
" \n",
" | 49 | \n",
" 0.611111 | \n",
" 0.784460 | \n",
" 0.471284 | \n",
" 0.210696 | \n",
" 0.298783 | \n",
" 0.354778 | \n",
"
\n",
" \n",
" | 87 | \n",
" 0.611111 | \n",
" 0.745055 | \n",
" 0.304467 | \n",
" 0.026858 | \n",
" 0.249544 | \n",
" 0.264300 | \n",
"
\n",
" \n",
" | 44 | \n",
" 0.333333 | \n",
" 0.732553 | \n",
" 0.180803 | \n",
" 0.103640 | \n",
" 0.091655 | \n",
" 0.146158 | \n",
"
\n",
" \n",
" | 88 | \n",
" 0.666667 | \n",
" 0.747903 | \n",
" 0.372843 | \n",
" 0.043088 | \n",
" 0.299066 | \n",
" 0.308541 | \n",
"
\n",
" \n",
" | 90 | \n",
" 0.777778 | \n",
" 0.752651 | \n",
" 0.546188 | \n",
" 0.053241 | \n",
" 0.365891 | \n",
" 0.372103 | \n",
"
\n",
" \n",
" | 67 | \n",
" 0.555556 | \n",
" 0.121380 | \n",
" 0.204294 | \n",
" 0.237688 | \n",
" 0.181500 | \n",
" 0.444669 | \n",
"
\n",
" \n",
" | 27 | \n",
" 0.444444 | \n",
" 0.487735 | \n",
" 0.258616 | \n",
" 0.378848 | \n",
" 0.184089 | \n",
" 0.273840 | \n",
"
\n",
" \n",
" | 74 | \n",
" 0.944444 | \n",
" 0.062035 | \n",
" 0.563162 | \n",
" 0.355903 | \n",
" 0.464050 | \n",
" 0.915100 | \n",
"
\n",
" \n",
" | 84 | \n",
" 0.444444 | \n",
" 0.751543 | \n",
" 0.169272 | \n",
" 0.014353 | \n",
" 0.120951 | \n",
" 0.166605 | \n",
"
\n",
" \n",
" | 32 | \n",
" 0.722222 | \n",
" 0.515746 | \n",
" 0.650043 | \n",
" 0.475029 | \n",
" 0.481579 | \n",
" 0.527854 | \n",
"
\n",
" \n",
" | 55 | \n",
" 0.944444 | \n",
" 0.732553 | \n",
" 0.999799 | \n",
" 0.248336 | \n",
" 0.577012 | \n",
" 0.630277 | \n",
"
\n",
" \n",
" | 39 | \n",
" 0.055556 | \n",
" 0.690141 | \n",
" 0.026208 | \n",
" 0.030915 | \n",
" 0.021080 | \n",
" 0.045954 | \n",
"
\n",
" \n",
" | 10 | \n",
" 0.555556 | \n",
" 0.911695 | \n",
" 0.264619 | \n",
" 0.741384 | \n",
" 0.326203 | \n",
" 0.376546 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.111111 | \n",
" 0.658649 | \n",
" 0.045937 | \n",
" 0.253633 | \n",
" 0.071237 | \n",
" 0.125392 | \n",
"
\n",
" \n",
" | 38 | \n",
" 0.000000 | \n",
" 0.683336 | \n",
" 0.021424 | \n",
" 0.026322 | \n",
" 0.011883 | \n",
" 0.033926 | \n",
"
\n",
" \n",
" | 53 | \n",
" 0.833333 | \n",
" 0.817693 | \n",
" 0.871813 | \n",
" 0.207203 | \n",
" 0.511095 | \n",
" 0.527062 | \n",
"
\n",
" \n",
" | 73 | \n",
" 0.888889 | \n",
" 0.144010 | \n",
" 0.536787 | \n",
" 0.308322 | \n",
" 0.427701 | \n",
" 0.835909 | \n",
"
\n",
" \n",
" | 19 | \n",
" 0.000000 | \n",
" 0.418895 | \n",
" 0.022146 | \n",
" 0.050256 | \n",
" 0.010458 | \n",
" 0.040032 | \n",
"
\n",
" \n",
" | 89 | \n",
" 0.722222 | \n",
" 0.749011 | \n",
" 0.458983 | \n",
" 0.049350 | \n",
" 0.336712 | \n",
" 0.334089 | \n",
"
\n",
" \n",
" | 94 | \n",
" 1.000000 | \n",
" 0.740624 | \n",
" 0.801175 | \n",
" 0.074534 | \n",
" 0.574353 | \n",
" 0.533536 | \n",
"
\n",
" \n",
" | 35 | \n",
" 0.888889 | \n",
" 0.525241 | \n",
" 0.896903 | \n",
" 0.468056 | \n",
" 0.624309 | \n",
" 0.696451 | \n",
"
\n",
" \n",
" | 33 | \n",
" 0.777778 | \n",
" 0.519069 | \n",
" 0.753419 | \n",
" 0.482110 | \n",
" 0.525635 | \n",
" 0.582191 | \n",
"
\n",
" \n",
" | 48 | \n",
" 0.555556 | \n",
" 0.776705 | \n",
" 0.406844 | \n",
" 0.176662 | \n",
" 0.242760 | \n",
" 0.298763 | \n",
"
\n",
" \n",
" | 70 | \n",
" 0.722222 | \n",
" 0.129451 | \n",
" 0.360436 | \n",
" 0.288562 | \n",
" 0.281029 | \n",
" 0.635990 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" year pop finv trade fexpen uinc\n",
"40 0.111111 0.696629 0.039111 0.036688 0.028066 0.056056\n",
"31 0.666667 0.512739 0.547526 0.481719 0.431193 0.490291\n",
"46 0.444444 0.749644 0.261131 0.151409 0.148605 0.227113\n",
"58 0.055556 0.007754 0.027068 0.035371 0.010851 0.112156\n",
"77 0.055556 0.771483 0.003089 0.000578 0.005052 0.009864\n",
"49 0.611111 0.784460 0.471284 0.210696 0.298783 0.354778\n",
"87 0.611111 0.745055 0.304467 0.026858 0.249544 0.264300\n",
"44 0.333333 0.732553 0.180803 0.103640 0.091655 0.146158\n",
"88 0.666667 0.747903 0.372843 0.043088 0.299066 0.308541\n",
"90 0.777778 0.752651 0.546188 0.053241 0.365891 0.372103\n",
"67 0.555556 0.121380 0.204294 0.237688 0.181500 0.444669\n",
"27 0.444444 0.487735 0.258616 0.378848 0.184089 0.273840\n",
"74 0.944444 0.062035 0.563162 0.355903 0.464050 0.915100\n",
"84 0.444444 0.751543 0.169272 0.014353 0.120951 0.166605\n",
"32 0.722222 0.515746 0.650043 0.475029 0.481579 0.527854\n",
"55 0.944444 0.732553 0.999799 0.248336 0.577012 0.630277\n",
"39 0.055556 0.690141 0.026208 0.030915 0.021080 0.045954\n",
"10 0.555556 0.911695 0.264619 0.741384 0.326203 0.376546\n",
"2 0.111111 0.658649 0.045937 0.253633 0.071237 0.125392\n",
"38 0.000000 0.683336 0.021424 0.026322 0.011883 0.033926\n",
"53 0.833333 0.817693 0.871813 0.207203 0.511095 0.527062\n",
"73 0.888889 0.144010 0.536787 0.308322 0.427701 0.835909\n",
"19 0.000000 0.418895 0.022146 0.050256 0.010458 0.040032\n",
"89 0.722222 0.749011 0.458983 0.049350 0.336712 0.334089\n",
"94 1.000000 0.740624 0.801175 0.074534 0.574353 0.533536\n",
"35 0.888889 0.525241 0.896903 0.468056 0.624309 0.696451\n",
"33 0.777778 0.519069 0.753419 0.482110 0.525635 0.582191\n",
"48 0.555556 0.776705 0.406844 0.176662 0.242760 0.298763\n",
"70 0.722222 0.129451 0.360436 0.288562 0.281029 0.635990"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# slice the continous features from the testing data\n",
"X_test_continuous = X_test.loc[:,'year':'uinc']\n",
"# transforme the testing data-set to range [0,1] using the training scaler\n",
"X_test_continuous_scaled = min_max_scaler.transform(X_test_continuous)\n",
"\n",
"# convert it into dataframe\n",
"X_test_continuous_scaled = pd.DataFrame(X_test_continuous_scaled,index=X_test_continuous.index,\n",
" columns=X_test_continuous.columns)\n",
"X_test_continuous_scaled"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "51fecee3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" year | \n",
" pop | \n",
" finv | \n",
" trade | \n",
" fexpen | \n",
" uinc | \n",
" prov_hn | \n",
" prov_js | \n",
" prov_sd | \n",
" prov_zj | \n",
"
\n",
" \n",
" \n",
" \n",
" | 40 | \n",
" 0.111111 | \n",
" 0.696629 | \n",
" 0.039111 | \n",
" 0.036688 | \n",
" 0.028066 | \n",
" 0.056056 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 31 | \n",
" 0.666667 | \n",
" 0.512739 | \n",
" 0.547526 | \n",
" 0.481719 | \n",
" 0.431193 | \n",
" 0.490291 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 46 | \n",
" 0.444444 | \n",
" 0.749644 | \n",
" 0.261131 | \n",
" 0.151409 | \n",
" 0.148605 | \n",
" 0.227113 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 58 | \n",
" 0.055556 | \n",
" 0.007754 | \n",
" 0.027068 | \n",
" 0.035371 | \n",
" 0.010851 | \n",
" 0.112156 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 77 | \n",
" 0.055556 | \n",
" 0.771483 | \n",
" 0.003089 | \n",
" 0.000578 | \n",
" 0.005052 | \n",
" 0.009864 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 49 | \n",
" 0.611111 | \n",
" 0.784460 | \n",
" 0.471284 | \n",
" 0.210696 | \n",
" 0.298783 | \n",
" 0.354778 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 87 | \n",
" 0.611111 | \n",
" 0.745055 | \n",
" 0.304467 | \n",
" 0.026858 | \n",
" 0.249544 | \n",
" 0.264300 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 44 | \n",
" 0.333333 | \n",
" 0.732553 | \n",
" 0.180803 | \n",
" 0.103640 | \n",
" 0.091655 | \n",
" 0.146158 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 88 | \n",
" 0.666667 | \n",
" 0.747903 | \n",
" 0.372843 | \n",
" 0.043088 | \n",
" 0.299066 | \n",
" 0.308541 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 90 | \n",
" 0.777778 | \n",
" 0.752651 | \n",
" 0.546188 | \n",
" 0.053241 | \n",
" 0.365891 | \n",
" 0.372103 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 67 | \n",
" 0.555556 | \n",
" 0.121380 | \n",
" 0.204294 | \n",
" 0.237688 | \n",
" 0.181500 | \n",
" 0.444669 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 27 | \n",
" 0.444444 | \n",
" 0.487735 | \n",
" 0.258616 | \n",
" 0.378848 | \n",
" 0.184089 | \n",
" 0.273840 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 74 | \n",
" 0.944444 | \n",
" 0.062035 | \n",
" 0.563162 | \n",
" 0.355903 | \n",
" 0.464050 | \n",
" 0.915100 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 84 | \n",
" 0.444444 | \n",
" 0.751543 | \n",
" 0.169272 | \n",
" 0.014353 | \n",
" 0.120951 | \n",
" 0.166605 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 32 | \n",
" 0.722222 | \n",
" 0.515746 | \n",
" 0.650043 | \n",
" 0.475029 | \n",
" 0.481579 | \n",
" 0.527854 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 55 | \n",
" 0.944444 | \n",
" 0.732553 | \n",
" 0.999799 | \n",
" 0.248336 | \n",
" 0.577012 | \n",
" 0.630277 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 39 | \n",
" 0.055556 | \n",
" 0.690141 | \n",
" 0.026208 | \n",
" 0.030915 | \n",
" 0.021080 | \n",
" 0.045954 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 10 | \n",
" 0.555556 | \n",
" 0.911695 | \n",
" 0.264619 | \n",
" 0.741384 | \n",
" 0.326203 | \n",
" 0.376546 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.111111 | \n",
" 0.658649 | \n",
" 0.045937 | \n",
" 0.253633 | \n",
" 0.071237 | \n",
" 0.125392 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 38 | \n",
" 0.000000 | \n",
" 0.683336 | \n",
" 0.021424 | \n",
" 0.026322 | \n",
" 0.011883 | \n",
" 0.033926 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 53 | \n",
" 0.833333 | \n",
" 0.817693 | \n",
" 0.871813 | \n",
" 0.207203 | \n",
" 0.511095 | \n",
" 0.527062 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 73 | \n",
" 0.888889 | \n",
" 0.144010 | \n",
" 0.536787 | \n",
" 0.308322 | \n",
" 0.427701 | \n",
" 0.835909 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 19 | \n",
" 0.000000 | \n",
" 0.418895 | \n",
" 0.022146 | \n",
" 0.050256 | \n",
" 0.010458 | \n",
" 0.040032 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 89 | \n",
" 0.722222 | \n",
" 0.749011 | \n",
" 0.458983 | \n",
" 0.049350 | \n",
" 0.336712 | \n",
" 0.334089 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 94 | \n",
" 1.000000 | \n",
" 0.740624 | \n",
" 0.801175 | \n",
" 0.074534 | \n",
" 0.574353 | \n",
" 0.533536 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 35 | \n",
" 0.888889 | \n",
" 0.525241 | \n",
" 0.896903 | \n",
" 0.468056 | \n",
" 0.624309 | \n",
" 0.696451 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 33 | \n",
" 0.777778 | \n",
" 0.519069 | \n",
" 0.753419 | \n",
" 0.482110 | \n",
" 0.525635 | \n",
" 0.582191 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 48 | \n",
" 0.555556 | \n",
" 0.776705 | \n",
" 0.406844 | \n",
" 0.176662 | \n",
" 0.242760 | \n",
" 0.298763 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 70 | \n",
" 0.722222 | \n",
" 0.129451 | \n",
" 0.360436 | \n",
" 0.288562 | \n",
" 0.281029 | \n",
" 0.635990 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" year pop finv trade fexpen uinc prov_hn \\\n",
"40 0.111111 0.696629 0.039111 0.036688 0.028066 0.056056 0.0 \n",
"31 0.666667 0.512739 0.547526 0.481719 0.431193 0.490291 0.0 \n",
"46 0.444444 0.749644 0.261131 0.151409 0.148605 0.227113 0.0 \n",
"58 0.055556 0.007754 0.027068 0.035371 0.010851 0.112156 0.0 \n",
"77 0.055556 0.771483 0.003089 0.000578 0.005052 0.009864 1.0 \n",
"49 0.611111 0.784460 0.471284 0.210696 0.298783 0.354778 0.0 \n",
"87 0.611111 0.745055 0.304467 0.026858 0.249544 0.264300 1.0 \n",
"44 0.333333 0.732553 0.180803 0.103640 0.091655 0.146158 0.0 \n",
"88 0.666667 0.747903 0.372843 0.043088 0.299066 0.308541 1.0 \n",
"90 0.777778 0.752651 0.546188 0.053241 0.365891 0.372103 1.0 \n",
"67 0.555556 0.121380 0.204294 0.237688 0.181500 0.444669 0.0 \n",
"27 0.444444 0.487735 0.258616 0.378848 0.184089 0.273840 0.0 \n",
"74 0.944444 0.062035 0.563162 0.355903 0.464050 0.915100 0.0 \n",
"84 0.444444 0.751543 0.169272 0.014353 0.120951 0.166605 1.0 \n",
"32 0.722222 0.515746 0.650043 0.475029 0.481579 0.527854 0.0 \n",
"55 0.944444 0.732553 0.999799 0.248336 0.577012 0.630277 0.0 \n",
"39 0.055556 0.690141 0.026208 0.030915 0.021080 0.045954 0.0 \n",
"10 0.555556 0.911695 0.264619 0.741384 0.326203 0.376546 0.0 \n",
"2 0.111111 0.658649 0.045937 0.253633 0.071237 0.125392 0.0 \n",
"38 0.000000 0.683336 0.021424 0.026322 0.011883 0.033926 0.0 \n",
"53 0.833333 0.817693 0.871813 0.207203 0.511095 0.527062 0.0 \n",
"73 0.888889 0.144010 0.536787 0.308322 0.427701 0.835909 0.0 \n",
"19 0.000000 0.418895 0.022146 0.050256 0.010458 0.040032 0.0 \n",
"89 0.722222 0.749011 0.458983 0.049350 0.336712 0.334089 1.0 \n",
"94 1.000000 0.740624 0.801175 0.074534 0.574353 0.533536 1.0 \n",
"35 0.888889 0.525241 0.896903 0.468056 0.624309 0.696451 0.0 \n",
"33 0.777778 0.519069 0.753419 0.482110 0.525635 0.582191 0.0 \n",
"48 0.555556 0.776705 0.406844 0.176662 0.242760 0.298763 0.0 \n",
"70 0.722222 0.129451 0.360436 0.288562 0.281029 0.635990 0.0 \n",
"\n",
" prov_js prov_sd prov_zj \n",
"40 0.0 1.0 0.0 \n",
"31 1.0 0.0 0.0 \n",
"46 0.0 1.0 0.0 \n",
"58 0.0 0.0 1.0 \n",
"77 0.0 0.0 0.0 \n",
"49 0.0 1.0 0.0 \n",
"87 0.0 0.0 0.0 \n",
"44 0.0 1.0 0.0 \n",
"88 0.0 0.0 0.0 \n",
"90 0.0 0.0 0.0 \n",
"67 0.0 0.0 1.0 \n",
"27 1.0 0.0 0.0 \n",
"74 0.0 0.0 1.0 \n",
"84 0.0 0.0 0.0 \n",
"32 1.0 0.0 0.0 \n",
"55 0.0 1.0 0.0 \n",
"39 0.0 1.0 0.0 \n",
"10 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"38 0.0 1.0 0.0 \n",
"53 0.0 1.0 0.0 \n",
"73 0.0 0.0 1.0 \n",
"19 1.0 0.0 0.0 \n",
"89 0.0 0.0 0.0 \n",
"94 0.0 0.0 0.0 \n",
"35 1.0 0.0 0.0 \n",
"33 1.0 0.0 0.0 \n",
"48 0.0 1.0 0.0 \n",
"70 0.0 0.0 1.0 "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# diplay the full scaled train dataset \n",
"X_test_scaled = X_test.copy()\n",
"X_test_scaled.loc[:,'year':'uinc'] = X_test_continuous_scaled\n",
"X_test_scaled"
]
},
{
"cell_type": "markdown",
"id": "7aac1d46",
"metadata": {},
"source": [
"## 7. Save and load the training scaler"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "e4937531",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['mm_scaler']"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import joblib\n",
"joblib.dump(min_max_scaler,'mm_scaler')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "d0a0c572",
"metadata": {},
"outputs": [],
"source": [
"import joblib\n",
"mm_scaler = joblib.load('mm_scaler')"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "1332cbc1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1.11111111e-01, 6.96629213e-01, 3.91109924e-02, 3.66880369e-02,\n",
" 2.80658336e-02, 5.60560888e-02],\n",
" [6.66666667e-01, 5.12739357e-01, 5.47525660e-01, 4.81719398e-01,\n",
" 4.31192786e-01, 4.90290710e-01],\n",
" [4.44444444e-01, 7.49643931e-01, 2.61131077e-01, 1.51408782e-01,\n",
" 1.48605435e-01, 2.27112677e-01],\n",
" [5.55555556e-02, 7.75439152e-03, 2.70675105e-02, 3.53714159e-02,\n",
" 1.08511200e-02, 1.12155675e-01],\n",
" [5.55555556e-02, 7.71482830e-01, 3.08939634e-03, 5.78018498e-04,\n",
" 5.05165395e-03, 9.86379321e-03],\n",
" [6.11111111e-01, 7.84459566e-01, 4.71284143e-01, 2.10695512e-01,\n",
" 2.98782975e-01, 3.54778102e-01],\n",
" [6.11111111e-01, 7.45054597e-01, 3.04466957e-01, 2.68583659e-02,\n",
" 2.49544384e-01, 2.64299509e-01],\n",
" [3.33333333e-01, 7.32552619e-01, 1.80803243e-01, 1.03640173e-01,\n",
" 9.16553580e-02, 1.46157577e-01],\n",
" [6.66666667e-01, 7.47903149e-01, 3.72842512e-01, 4.30876725e-02,\n",
" 2.99066019e-01, 3.08540932e-01],\n",
" [7.77777778e-01, 7.52650736e-01, 5.46187701e-01, 5.32412770e-02,\n",
" 3.65891269e-01, 3.72102526e-01],\n",
" [5.55555556e-01, 1.21379965e-01, 2.04293577e-01, 2.37688015e-01,\n",
" 1.81500017e-01, 4.44668993e-01],\n",
" [4.44444444e-01, 4.87735401e-01, 2.58616392e-01, 3.78847655e-01,\n",
" 1.84089251e-01, 2.73839731e-01],\n",
" [9.44444444e-01, 6.20351321e-02, 5.63162106e-01, 3.55902600e-01,\n",
" 4.64050109e-01, 9.15100051e-01],\n",
" [4.44444444e-01, 7.51542966e-01, 1.69272246e-01, 1.43526835e-02,\n",
" 1.20951421e-01, 1.66604537e-01],\n",
" [7.22222222e-01, 5.15746162e-01, 6.50043391e-01, 4.75028989e-01,\n",
" 4.81578590e-01, 5.27853859e-01],\n",
" [9.44444444e-01, 7.32552619e-01, 9.99799390e-01, 2.48335520e-01,\n",
" 5.77011575e-01, 6.30277019e-01],\n",
" [5.55555556e-02, 6.90140845e-01, 2.62082304e-02, 3.09146826e-02,\n",
" 2.10799348e-02, 4.59537506e-02],\n",
" [5.55555556e-01, 9.11694888e-01, 2.64618908e-01, 7.41384232e-01,\n",
" 3.26202971e-01, 3.76545523e-01],\n",
" [1.11111111e-01, 6.58648520e-01, 4.59367528e-02, 2.53632717e-01,\n",
" 7.12369492e-02, 1.25392359e-01],\n",
" [0.00000000e+00, 6.83335971e-01, 2.14236782e-02, 2.63224027e-02,\n",
" 1.18826301e-02, 3.39259298e-02],\n",
" [8.33333333e-01, 8.17692673e-01, 8.71812713e-01, 2.07203245e-01,\n",
" 5.11094943e-01, 5.27062449e-01],\n",
" [8.88888889e-01, 1.44010128e-01, 5.36786887e-01, 3.08322356e-01,\n",
" 4.27701471e-01, 8.35909435e-01],\n",
" [0.00000000e+00, 4.18895395e-01, 2.21456890e-02, 5.02564960e-02,\n",
" 1.04576035e-02, 4.00324437e-02],\n",
" [7.22222222e-01, 7.49010919e-01, 4.58983397e-01, 4.93503389e-02,\n",
" 3.36712215e-01, 3.34089054e-01],\n",
" [1.00000000e+00, 7.40623516e-01, 8.01174907e-01, 7.45341047e-02,\n",
" 5.74353051e-01, 5.33536425e-01],\n",
" [8.88888889e-01, 5.25241336e-01, 8.96903285e-01, 4.68055899e-01,\n",
" 6.24309385e-01, 6.96451388e-01],\n",
" [7.77777778e-01, 5.19069473e-01, 7.53418917e-01, 4.82109655e-01,\n",
" 5.25635444e-01, 5.82191322e-01],\n",
" [5.55555556e-01, 7.76705175e-01, 4.06844447e-01, 1.76661501e-01,\n",
" 2.42759819e-01, 2.98763149e-01],\n",
" [7.22222222e-01, 1.29450862e-01, 3.60436446e-01, 2.88561556e-01,\n",
" 2.81028974e-01, 6.35990288e-01]])"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_continuous_scaled2 = mm_scaler.transform(X_test_continuous)\n",
"X_test_continuous_scaled2"
]
},
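{
"cell_type": "markdown",
"id": "e7c9a1b5",
"metadata": {},
"source": [
"The reloaded scaler can also be applied to brand-new observations, and its `inverse_transform` method maps scaled values back to the original units. The minimal sketch below assumes the variables defined earlier in this notebook; the new observation values are made up purely for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f8d0b2c6",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# a single hypothetical new observation with the same six continuous features\n",
"new_obs = pd.DataFrame([[2019, 8.0, 4.5, 3.0, 1.0, 4.0]],\n",
"                       columns=X_test_continuous.columns)\n",
"\n",
"# scale the new observation with the reloaded training scaler\n",
"print(mm_scaler.transform(new_obs))\n",
"\n",
"# map the scaled testing features back to their original units\n",
"print(mm_scaler.inverse_transform(X_test_continuous_scaled2))"
]
},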
{
"cell_type": "code",
"execution_count": null,
"id": "de1f1e92",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}