{ "cells": [ { "cell_type": "markdown", "id": "08a1d994", "metadata": {}, "source": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", "\n", "

Normalizing the Data

\n", "

Shouke Wei, Ph.D. Professor

\n", "

Email: shouke.wei@gmail.com

" ] }, { "cell_type": "markdown", "id": "f8ae5c5d", "metadata": {}, "source": [ "## Objective \n", "- learn how to normalize the features, save and load the normalization scaler for new data" ] }, { "cell_type": "code", "execution_count": 5, "id": "94abd9b5", "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yeargdppopfinvtradefexpenuincprov_hnprov_jsprov_sdprov_zj
020001.0741258.6500000.3145131.4081470.1080320.9761570.00.00.00.0
120011.2039258.7330000.3484431.5013910.1321331.0415190.00.00.00.0
220021.3502428.8420000.3850781.8301690.1521081.1137200.00.00.00.0
320031.5844648.9630000.4813202.3467350.1695631.2380430.00.00.00.0
420041.8864629.0522980.5870022.9558990.1852951.3627650.00.00.00.0
\n", "
" ], "text/plain": [ " year gdp pop finv trade fexpen uinc prov_hn \\\n", "0 2000 1.074125 8.650000 0.314513 1.408147 0.108032 0.976157 0.0 \n", "1 2001 1.203925 8.733000 0.348443 1.501391 0.132133 1.041519 0.0 \n", "2 2002 1.350242 8.842000 0.385078 1.830169 0.152108 1.113720 0.0 \n", "3 2003 1.584464 8.963000 0.481320 2.346735 0.169563 1.238043 0.0 \n", "4 2004 1.886462 9.052298 0.587002 2.955899 0.185295 1.362765 0.0 \n", "\n", " prov_js prov_sd prov_zj \n", "0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import required packages\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import MinMaxScaler\n", "\n", "# read data\n", "df = pd.read_csv('./data/gdp_china_encoded.csv')\n", "\n", "# show the first 5 rows\n", "df.head()" ] }, { "cell_type": "markdown", "id": "c46350e0", "metadata": {}, "source": [ "### Slice data into features X and target y" ] }, { "cell_type": "code", "execution_count": 6, "id": "4fadde18", "metadata": {}, "outputs": [], "source": [ "X = df.drop(['gdp'],axis=1)\n", "y = df['gdp']" ] }, { "cell_type": "markdown", "id": "913af9d8", "metadata": {}, "source": [ "### Split train and test data" ] }, { "cell_type": "code", "execution_count": 7, "id": "06608307", "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.30, random_state=1)" ] }, { "cell_type": "code", "execution_count": 8, "id": "7fbd1ef3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearpopfinvtradefexpenuincprov_hnprov_jsprov_sdprov_zj
6620095.2761.0742321.2823900.2653352.4610810.00.00.01.0
5420169.9475.3322941.5476570.8755213.4012080.00.01.00.0
3620177.6565.3277003.9997501.0621034.3621800.01.00.00.0
4520079.3671.2537700.9312960.2261851.4264700.00.01.00.0
5220149.7894.2495551.7011220.7177312.9221940.00.01.00.0
.................................
7520185.1553.1697702.8511600.8629535.5574300.00.00.01.0
9200910.1301.2933124.1743830.4334372.1574720.00.00.00.0
7220155.5392.7323322.1599080.6645984.3714480.00.00.01.0
12201210.5941.8751506.2116290.7387863.0226710.00.00.00.0
3720187.7235.3276804.3793501.1657354.7200000.01.00.00.0
\n", "

66 rows × 10 columns

\n", "
" ], "text/plain": [ " year pop finv trade fexpen uinc prov_hn prov_js \\\n", "66 2009 5.276 1.074232 1.282390 0.265335 2.461081 0.0 0.0 \n", "54 2016 9.947 5.332294 1.547657 0.875521 3.401208 0.0 0.0 \n", "36 2017 7.656 5.327700 3.999750 1.062103 4.362180 0.0 1.0 \n", "45 2007 9.367 1.253770 0.931296 0.226185 1.426470 0.0 0.0 \n", "52 2014 9.789 4.249555 1.701122 0.717731 2.922194 0.0 0.0 \n", ".. ... ... ... ... ... ... ... ... \n", "75 2018 5.155 3.169770 2.851160 0.862953 5.557430 0.0 0.0 \n", "9 2009 10.130 1.293312 4.174383 0.433437 2.157472 0.0 0.0 \n", "72 2015 5.539 2.732332 2.159908 0.664598 4.371448 0.0 0.0 \n", "12 2012 10.594 1.875150 6.211629 0.738786 3.022671 0.0 0.0 \n", "37 2018 7.723 5.327680 4.379350 1.165735 4.720000 0.0 1.0 \n", "\n", " prov_sd prov_zj \n", "66 0.0 1.0 \n", "54 1.0 0.0 \n", "36 0.0 0.0 \n", "45 1.0 0.0 \n", "52 1.0 0.0 \n", ".. ... ... \n", "75 0.0 1.0 \n", "9 0.0 0.0 \n", "72 0.0 1.0 \n", "12 0.0 0.0 \n", "37 0.0 0.0 \n", "\n", "[66 rows x 10 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train" ] }, { "cell_type": "markdown", "id": "4dbaa7bf", "metadata": {}, "source": [ "## 1. Normalization and Standardization" ] }, { "cell_type": "markdown", "id": "337e30a2", "metadata": {}, "source": [ "The terms standardize and normalize are used interchangeably in data preprocessing, although in statistics, the latter term also has other connotations.\n", "\n", "The process of normalization involves transforming the data to a smaller or common range such as [−1,1] or [0, 1]." ] }, { "cell_type": "markdown", "id": "9950ac4c", "metadata": {}, "source": [ "## 2. 
Why data normalization?\n", "\n", "Normalization:\n", " \n", "- gives all attributes an equal weight\n", "- avoids dependence on the measurement units\n", "- is particularly useful for machine learning training\n", "- helps speed up the learning phase\n", "\n", "In a linear regression model, it can help too, though it is not necessary." ] }, { "cell_type": "markdown", "id": "d4099149", "metadata": {}, "source": [ "## 3. Methods for data normalization\n", "#### Min-max normalization: \n", "$$x'=\\frac{x - min(x)}{max(x) - min(x)}$$\n", "\n", "$$x'=\\frac{x - min(x)}{max(x) - min(x)}(new\\_max(x)-new\\_min(x)) + new\\_min(x)$$" ] }, { "cell_type": "markdown", "id": "4942006e", "metadata": {}, "source": [ "#### Mean normalization\n", "\n", "$$x'=\\frac{x - mean(x)}{max(x) - min(x)}$$ \n", "\n", "#### Z-score normalization / Standardization\n", "\n", "$$x'=\\frac{x - \\mu}{\\sigma}$$\n", "\n", "$$\\mu: \\text{the mean of the variable,}$$\n", "$$\\sigma: \\text{the standard deviation of the variable.}$$\n", "\n", "#### Scaling to unit length\n", "\n", "$$x'=\\frac{x}{||x||}$$\n", "$$||x||: \\text{the Euclidean length of the variable}.$$\n", "\n", "#### Decimal scaling \n", "$$x'=\\frac{x}{10^j}$$\n", "\n", "$$ j: \\text{the smallest integer such that max(|x'|)<1}$$" ] }, { "cell_type": "markdown", "id": "d1d00a4d", "metadata": {}, "source": [ "## 4. 
Sklearn built-in methods for data normalization" ] }, { "cell_type": "markdown", "id": "66f5d61e", "metadata": {}, "source": [ "### (1) MinMaxScaler\n", "- Transform features by scaling each feature to a given range\n", "\n", "### (2) MaxAbsScaler \n", "- Scale each feature to the range [-1, 1] by dividing it by its maximum absolute value\n", "\n", "### (3) RobustScaler\n", "- Scale features using statistics that are robust to outliers. It subtracts the column median and divides by the interquartile range.\n", "\n", "### (4) StandardScaler\n", "- StandardScaler scales each column to have 0 mean and unit variance.\n", "\n", "### (5) Normalizer\n", "- Normalize samples individually to unit norm. The normalizer operates on the rows rather than the columns. It applies L2 normalization by default.\n", "\n", "Reference: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing" ] }, { "cell_type": "markdown", "id": "bc734084", "metadata": {}, "source": [ "### `MinMaxScaler` Example:" ] }, { "cell_type": "markdown", "id": "830de60c", "metadata": {}, "source": [ "#### (1) Normalize the training dataset" ] }, { "cell_type": "code", "execution_count": 16, "id": "8d211be1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearpopfinvtradefexpenuinc
6620095.2761.0742321.2823900.2653352.461081
5420169.9475.3322941.5476570.8755213.401208
3620177.6565.3277003.9997501.0621034.362180
4520079.3671.2537700.9312960.2261851.426470
5220149.7894.2495551.7011220.7177312.922194
.....................
7520185.1553.1697702.8511600.8629535.557430
9200910.1301.2933124.1743830.4334372.157472
7220155.5392.7323322.1599080.6645984.371448
12201210.5941.8751506.2116290.7387863.022671
3720187.7235.3276804.3793501.1657354.720000
\n", "

66 rows × 6 columns

\n", "
" ], "text/plain": [ " year pop finv trade fexpen uinc\n", "66 2009 5.276 1.074232 1.282390 0.265335 2.461081\n", "54 2016 9.947 5.332294 1.547657 0.875521 3.401208\n", "36 2017 7.656 5.327700 3.999750 1.062103 4.362180\n", "45 2007 9.367 1.253770 0.931296 0.226185 1.426470\n", "52 2014 9.789 4.249555 1.701122 0.717731 2.922194\n", ".. ... ... ... ... ... ...\n", "75 2018 5.155 3.169770 2.851160 0.862953 5.557430\n", "9 2009 10.130 1.293312 4.174383 0.433437 2.157472\n", "72 2015 5.539 2.732332 2.159908 0.664598 4.371448\n", "12 2012 10.594 1.875150 6.211629 0.738786 3.022671\n", "37 2018 7.723 5.327680 4.379350 1.165735 4.720000\n", "\n", "[66 rows x 6 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# slice the continuous features from the training data\n", "X_train_continuous = X_train.loc[:,'year':'uinc']\n", "X_train_continuous" ] }, { "cell_type": "code", "execution_count": 18, "id": "329a6000", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearpopfinvtradefexpenuinc
660.5000000.0943190.1739470.1769270.1452510.390579
540.8888890.8335180.9648830.2140730.5441190.575614
360.9444440.4709610.9640290.5574400.6660840.764752
450.3888890.7417310.2072960.1277630.1196600.186948
520.7777780.8085140.7637640.2355620.4409740.481335
.....................
751.0000000.0751700.5631940.3966020.5359031.000000
90.5000000.8624780.2146410.5818940.2551370.330823
720.8333330.1359390.4819400.2998060.4062420.766576
120.6666670.9359080.3227180.8671700.4547380.501111
371.0000000.4815640.9640260.6105950.7338270.835178
\n", "

66 rows × 6 columns

\n", "
" ], "text/plain": [ " year pop finv trade fexpen uinc\n", "66 0.500000 0.094319 0.173947 0.176927 0.145251 0.390579\n", "54 0.888889 0.833518 0.964883 0.214073 0.544119 0.575614\n", "36 0.944444 0.470961 0.964029 0.557440 0.666084 0.764752\n", "45 0.388889 0.741731 0.207296 0.127763 0.119660 0.186948\n", "52 0.777778 0.808514 0.763764 0.235562 0.440974 0.481335\n", ".. ... ... ... ... ... ...\n", "75 1.000000 0.075170 0.563194 0.396602 0.535903 1.000000\n", "9 0.500000 0.862478 0.214641 0.581894 0.255137 0.330823\n", "72 0.833333 0.135939 0.481940 0.299806 0.406242 0.766576\n", "12 0.666667 0.935908 0.322718 0.867170 0.454738 0.501111\n", "37 1.000000 0.481564 0.964026 0.610595 0.733827 0.835178\n", "\n", "[66 rows x 6 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# to learn the underlying parameters of the scaler from the training data-set\n", "min_max_scaler = MinMaxScaler().fit(X_train_continuous)\n", "\n", "# transform the training data-set to range [0,1]\n", "X_train_continuous_scaled = min_max_scaler.transform(X_train_continuous)\n", "\n", "# convert it into dataframe\n", "X_train_continuous_scaled = pd.DataFrame(X_train_continuous_scaled,index=X_train_continuous.index,\n", " columns=X_train_continuous.columns)\n", "X_train_continuous_scaled" ] }, { "cell_type": "code", "execution_count": 19, "id": "96d85fb7", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearpopfinvtradefexpenuincprov_hnprov_jsprov_sdprov_zj
660.5000000.0943190.1739470.1769270.1452510.3905790.00.00.01.0
540.8888890.8335180.9648830.2140730.5441190.5756140.00.01.00.0
360.9444440.4709610.9640290.5574400.6660840.7647520.01.00.00.0
450.3888890.7417310.2072960.1277630.1196600.1869480.00.01.00.0
520.7777780.8085140.7637640.2355620.4409740.4813350.00.01.00.0
.................................
751.0000000.0751700.5631940.3966020.5359031.0000000.00.00.01.0
90.5000000.8624780.2146410.5818940.2551370.3308230.00.00.00.0
720.8333330.1359390.4819400.2998060.4062420.7665760.00.00.01.0
120.6666670.9359080.3227180.8671700.4547380.5011110.00.00.00.0
371.0000000.4815640.9640260.6105950.7338270.8351780.01.00.00.0
\n", "

66 rows × 10 columns

\n", "
" ], "text/plain": [ " year pop finv trade fexpen uinc prov_hn \\\n", "66 0.500000 0.094319 0.173947 0.176927 0.145251 0.390579 0.0 \n", "54 0.888889 0.833518 0.964883 0.214073 0.544119 0.575614 0.0 \n", "36 0.944444 0.470961 0.964029 0.557440 0.666084 0.764752 0.0 \n", "45 0.388889 0.741731 0.207296 0.127763 0.119660 0.186948 0.0 \n", "52 0.777778 0.808514 0.763764 0.235562 0.440974 0.481335 0.0 \n", ".. ... ... ... ... ... ... ... \n", "75 1.000000 0.075170 0.563194 0.396602 0.535903 1.000000 0.0 \n", "9 0.500000 0.862478 0.214641 0.581894 0.255137 0.330823 0.0 \n", "72 0.833333 0.135939 0.481940 0.299806 0.406242 0.766576 0.0 \n", "12 0.666667 0.935908 0.322718 0.867170 0.454738 0.501111 0.0 \n", "37 1.000000 0.481564 0.964026 0.610595 0.733827 0.835178 0.0 \n", "\n", " prov_js prov_sd prov_zj \n", "66 0.0 0.0 1.0 \n", "54 0.0 1.0 0.0 \n", "36 1.0 0.0 0.0 \n", "45 0.0 1.0 0.0 \n", "52 0.0 1.0 0.0 \n", ".. ... ... ... \n", "75 0.0 0.0 1.0 \n", "9 0.0 0.0 0.0 \n", "72 0.0 0.0 1.0 \n", "12 0.0 0.0 0.0 \n", "37 1.0 0.0 0.0 \n", "\n", "[66 rows x 10 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# display the full scaled train dataset \n", "X_train_scaled = X_train.copy()\n", "X_train_scaled.loc[:,'year':'uinc'] = X_train_continuous_scaled\n", "X_train_scaled" ] }, { "cell_type": "markdown", "id": "c61f9138", "metadata": {}, "source": [ "#### (2) Normalize the testing dataset" ] }, { "cell_type": "code", "execution_count": 20, "id": "965240a1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearpopfinvtradefexpenuinc
400.1111110.6966290.0391110.0366880.0280660.056056
310.6666670.5127390.5475260.4817190.4311930.490291
460.4444440.7496440.2611310.1514090.1486050.227113
580.0555560.0077540.0270680.0353710.0108510.112156
770.0555560.7714830.0030890.0005780.0050520.009864
490.6111110.7844600.4712840.2106960.2987830.354778
870.6111110.7450550.3044670.0268580.2495440.264300
440.3333330.7325530.1808030.1036400.0916550.146158
880.6666670.7479030.3728430.0430880.2990660.308541
900.7777780.7526510.5461880.0532410.3658910.372103
670.5555560.1213800.2042940.2376880.1815000.444669
270.4444440.4877350.2586160.3788480.1840890.273840
740.9444440.0620350.5631620.3559030.4640500.915100
840.4444440.7515430.1692720.0143530.1209510.166605
320.7222220.5157460.6500430.4750290.4815790.527854
550.9444440.7325530.9997990.2483360.5770120.630277
390.0555560.6901410.0262080.0309150.0210800.045954
100.5555560.9116950.2646190.7413840.3262030.376546
20.1111110.6586490.0459370.2536330.0712370.125392
380.0000000.6833360.0214240.0263220.0118830.033926
530.8333330.8176930.8718130.2072030.5110950.527062
730.8888890.1440100.5367870.3083220.4277010.835909
190.0000000.4188950.0221460.0502560.0104580.040032
890.7222220.7490110.4589830.0493500.3367120.334089
941.0000000.7406240.8011750.0745340.5743530.533536
350.8888890.5252410.8969030.4680560.6243090.696451
330.7777780.5190690.7534190.4821100.5256350.582191
480.5555560.7767050.4068440.1766620.2427600.298763
700.7222220.1294510.3604360.2885620.2810290.635990
\n", "
" ], "text/plain": [ " year pop finv trade fexpen uinc\n", "40 0.111111 0.696629 0.039111 0.036688 0.028066 0.056056\n", "31 0.666667 0.512739 0.547526 0.481719 0.431193 0.490291\n", "46 0.444444 0.749644 0.261131 0.151409 0.148605 0.227113\n", "58 0.055556 0.007754 0.027068 0.035371 0.010851 0.112156\n", "77 0.055556 0.771483 0.003089 0.000578 0.005052 0.009864\n", "49 0.611111 0.784460 0.471284 0.210696 0.298783 0.354778\n", "87 0.611111 0.745055 0.304467 0.026858 0.249544 0.264300\n", "44 0.333333 0.732553 0.180803 0.103640 0.091655 0.146158\n", "88 0.666667 0.747903 0.372843 0.043088 0.299066 0.308541\n", "90 0.777778 0.752651 0.546188 0.053241 0.365891 0.372103\n", "67 0.555556 0.121380 0.204294 0.237688 0.181500 0.444669\n", "27 0.444444 0.487735 0.258616 0.378848 0.184089 0.273840\n", "74 0.944444 0.062035 0.563162 0.355903 0.464050 0.915100\n", "84 0.444444 0.751543 0.169272 0.014353 0.120951 0.166605\n", "32 0.722222 0.515746 0.650043 0.475029 0.481579 0.527854\n", "55 0.944444 0.732553 0.999799 0.248336 0.577012 0.630277\n", "39 0.055556 0.690141 0.026208 0.030915 0.021080 0.045954\n", "10 0.555556 0.911695 0.264619 0.741384 0.326203 0.376546\n", "2 0.111111 0.658649 0.045937 0.253633 0.071237 0.125392\n", "38 0.000000 0.683336 0.021424 0.026322 0.011883 0.033926\n", "53 0.833333 0.817693 0.871813 0.207203 0.511095 0.527062\n", "73 0.888889 0.144010 0.536787 0.308322 0.427701 0.835909\n", "19 0.000000 0.418895 0.022146 0.050256 0.010458 0.040032\n", "89 0.722222 0.749011 0.458983 0.049350 0.336712 0.334089\n", "94 1.000000 0.740624 0.801175 0.074534 0.574353 0.533536\n", "35 0.888889 0.525241 0.896903 0.468056 0.624309 0.696451\n", "33 0.777778 0.519069 0.753419 0.482110 0.525635 0.582191\n", "48 0.555556 0.776705 0.406844 0.176662 0.242760 0.298763\n", "70 0.722222 0.129451 0.360436 0.288562 0.281029 0.635990" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# slice the continous features from the testing 
data\n", "X_test_continuous = X_test.loc[:,'year':'uinc']\n", "# transform the testing data-set using the scaler fitted on the training data\n", "X_test_continuous_scaled = min_max_scaler.transform(X_test_continuous)\n", "\n", "# convert it into dataframe\n", "X_test_continuous_scaled = pd.DataFrame(X_test_continuous_scaled,index=X_test_continuous.index,\n", " columns=X_test_continuous.columns)\n", "X_test_continuous_scaled" ] }, { "cell_type": "code", "execution_count": 21, "id": "51fecee3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearpopfinvtradefexpenuincprov_hnprov_jsprov_sdprov_zj
400.1111110.6966290.0391110.0366880.0280660.0560560.00.01.00.0
310.6666670.5127390.5475260.4817190.4311930.4902910.01.00.00.0
460.4444440.7496440.2611310.1514090.1486050.2271130.00.01.00.0
580.0555560.0077540.0270680.0353710.0108510.1121560.00.00.01.0
770.0555560.7714830.0030890.0005780.0050520.0098641.00.00.00.0
490.6111110.7844600.4712840.2106960.2987830.3547780.00.01.00.0
870.6111110.7450550.3044670.0268580.2495440.2643001.00.00.00.0
440.3333330.7325530.1808030.1036400.0916550.1461580.00.01.00.0
880.6666670.7479030.3728430.0430880.2990660.3085411.00.00.00.0
900.7777780.7526510.5461880.0532410.3658910.3721031.00.00.00.0
670.5555560.1213800.2042940.2376880.1815000.4446690.00.00.01.0
270.4444440.4877350.2586160.3788480.1840890.2738400.01.00.00.0
740.9444440.0620350.5631620.3559030.4640500.9151000.00.00.01.0
840.4444440.7515430.1692720.0143530.1209510.1666051.00.00.00.0
320.7222220.5157460.6500430.4750290.4815790.5278540.01.00.00.0
550.9444440.7325530.9997990.2483360.5770120.6302770.00.01.00.0
390.0555560.6901410.0262080.0309150.0210800.0459540.00.01.00.0
100.5555560.9116950.2646190.7413840.3262030.3765460.00.00.00.0
20.1111110.6586490.0459370.2536330.0712370.1253920.00.00.00.0
380.0000000.6833360.0214240.0263220.0118830.0339260.00.01.00.0
530.8333330.8176930.8718130.2072030.5110950.5270620.00.01.00.0
730.8888890.1440100.5367870.3083220.4277010.8359090.00.00.01.0
190.0000000.4188950.0221460.0502560.0104580.0400320.01.00.00.0
890.7222220.7490110.4589830.0493500.3367120.3340891.00.00.00.0
941.0000000.7406240.8011750.0745340.5743530.5335361.00.00.00.0
350.8888890.5252410.8969030.4680560.6243090.6964510.01.00.00.0
330.7777780.5190690.7534190.4821100.5256350.5821910.01.00.00.0
480.5555560.7767050.4068440.1766620.2427600.2987630.00.01.00.0
700.7222220.1294510.3604360.2885620.2810290.6359900.00.00.01.0
\n", "
" ], "text/plain": [ " year pop finv trade fexpen uinc prov_hn \\\n", "40 0.111111 0.696629 0.039111 0.036688 0.028066 0.056056 0.0 \n", "31 0.666667 0.512739 0.547526 0.481719 0.431193 0.490291 0.0 \n", "46 0.444444 0.749644 0.261131 0.151409 0.148605 0.227113 0.0 \n", "58 0.055556 0.007754 0.027068 0.035371 0.010851 0.112156 0.0 \n", "77 0.055556 0.771483 0.003089 0.000578 0.005052 0.009864 1.0 \n", "49 0.611111 0.784460 0.471284 0.210696 0.298783 0.354778 0.0 \n", "87 0.611111 0.745055 0.304467 0.026858 0.249544 0.264300 1.0 \n", "44 0.333333 0.732553 0.180803 0.103640 0.091655 0.146158 0.0 \n", "88 0.666667 0.747903 0.372843 0.043088 0.299066 0.308541 1.0 \n", "90 0.777778 0.752651 0.546188 0.053241 0.365891 0.372103 1.0 \n", "67 0.555556 0.121380 0.204294 0.237688 0.181500 0.444669 0.0 \n", "27 0.444444 0.487735 0.258616 0.378848 0.184089 0.273840 0.0 \n", "74 0.944444 0.062035 0.563162 0.355903 0.464050 0.915100 0.0 \n", "84 0.444444 0.751543 0.169272 0.014353 0.120951 0.166605 1.0 \n", "32 0.722222 0.515746 0.650043 0.475029 0.481579 0.527854 0.0 \n", "55 0.944444 0.732553 0.999799 0.248336 0.577012 0.630277 0.0 \n", "39 0.055556 0.690141 0.026208 0.030915 0.021080 0.045954 0.0 \n", "10 0.555556 0.911695 0.264619 0.741384 0.326203 0.376546 0.0 \n", "2 0.111111 0.658649 0.045937 0.253633 0.071237 0.125392 0.0 \n", "38 0.000000 0.683336 0.021424 0.026322 0.011883 0.033926 0.0 \n", "53 0.833333 0.817693 0.871813 0.207203 0.511095 0.527062 0.0 \n", "73 0.888889 0.144010 0.536787 0.308322 0.427701 0.835909 0.0 \n", "19 0.000000 0.418895 0.022146 0.050256 0.010458 0.040032 0.0 \n", "89 0.722222 0.749011 0.458983 0.049350 0.336712 0.334089 1.0 \n", "94 1.000000 0.740624 0.801175 0.074534 0.574353 0.533536 1.0 \n", "35 0.888889 0.525241 0.896903 0.468056 0.624309 0.696451 0.0 \n", "33 0.777778 0.519069 0.753419 0.482110 0.525635 0.582191 0.0 \n", "48 0.555556 0.776705 0.406844 0.176662 0.242760 0.298763 0.0 \n", "70 0.722222 0.129451 0.360436 0.288562 0.281029 
0.635990 0.0 \n", "\n", " prov_js prov_sd prov_zj \n", "40 0.0 1.0 0.0 \n", "31 1.0 0.0 0.0 \n", "46 0.0 1.0 0.0 \n", "58 0.0 0.0 1.0 \n", "77 0.0 0.0 0.0 \n", "49 0.0 1.0 0.0 \n", "87 0.0 0.0 0.0 \n", "44 0.0 1.0 0.0 \n", "88 0.0 0.0 0.0 \n", "90 0.0 0.0 0.0 \n", "67 0.0 0.0 1.0 \n", "27 1.0 0.0 0.0 \n", "74 0.0 0.0 1.0 \n", "84 0.0 0.0 0.0 \n", "32 1.0 0.0 0.0 \n", "55 0.0 1.0 0.0 \n", "39 0.0 1.0 0.0 \n", "10 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 \n", "38 0.0 1.0 0.0 \n", "53 0.0 1.0 0.0 \n", "73 0.0 0.0 1.0 \n", "19 1.0 0.0 0.0 \n", "89 0.0 0.0 0.0 \n", "94 0.0 0.0 0.0 \n", "35 1.0 0.0 0.0 \n", "33 1.0 0.0 0.0 \n", "48 0.0 1.0 0.0 \n", "70 0.0 0.0 1.0 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# display the full scaled test dataset \n", "X_test_scaled = X_test.copy()\n", "X_test_scaled.loc[:,'year':'uinc'] = X_test_continuous_scaled\n", "X_test_scaled" ] }, { "cell_type": "markdown", "id": "7aac1d46", "metadata": {}, "source": [ "## 5. Save and load the training scaler" ] }, { "cell_type": "code", "execution_count": 22, "id": "e4937531", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['mm_scaler']" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import joblib\n", "joblib.dump(min_max_scaler,'mm_scaler')" ] }, { "cell_type": "code", "execution_count": 25, "id": "d0a0c572", "metadata": {}, "outputs": [], "source": [ "import joblib\n", "mm_scaler = joblib.load('mm_scaler')" ] }, { "cell_type": "code", "execution_count": 26, "id": "1332cbc1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1.11111111e-01, 6.96629213e-01, 3.91109924e-02, 3.66880369e-02,\n", " 2.80658336e-02, 5.60560888e-02],\n", " [6.66666667e-01, 5.12739357e-01, 5.47525660e-01, 4.81719398e-01,\n", " 4.31192786e-01, 4.90290710e-01],\n", " [4.44444444e-01, 7.49643931e-01, 2.61131077e-01, 1.51408782e-01,\n", " 1.48605435e-01, 2.27112677e-01],\n", " [5.55555556e-02, 7.75439152e-03, 
2.70675105e-02, 3.53714159e-02,\n", " 1.08511200e-02, 1.12155675e-01],\n", " [5.55555556e-02, 7.71482830e-01, 3.08939634e-03, 5.78018498e-04,\n", " 5.05165395e-03, 9.86379321e-03],\n", " [6.11111111e-01, 7.84459566e-01, 4.71284143e-01, 2.10695512e-01,\n", " 2.98782975e-01, 3.54778102e-01],\n", " [6.11111111e-01, 7.45054597e-01, 3.04466957e-01, 2.68583659e-02,\n", " 2.49544384e-01, 2.64299509e-01],\n", " [3.33333333e-01, 7.32552619e-01, 1.80803243e-01, 1.03640173e-01,\n", " 9.16553580e-02, 1.46157577e-01],\n", " [6.66666667e-01, 7.47903149e-01, 3.72842512e-01, 4.30876725e-02,\n", " 2.99066019e-01, 3.08540932e-01],\n", " [7.77777778e-01, 7.52650736e-01, 5.46187701e-01, 5.32412770e-02,\n", " 3.65891269e-01, 3.72102526e-01],\n", " [5.55555556e-01, 1.21379965e-01, 2.04293577e-01, 2.37688015e-01,\n", " 1.81500017e-01, 4.44668993e-01],\n", " [4.44444444e-01, 4.87735401e-01, 2.58616392e-01, 3.78847655e-01,\n", " 1.84089251e-01, 2.73839731e-01],\n", " [9.44444444e-01, 6.20351321e-02, 5.63162106e-01, 3.55902600e-01,\n", " 4.64050109e-01, 9.15100051e-01],\n", " [4.44444444e-01, 7.51542966e-01, 1.69272246e-01, 1.43526835e-02,\n", " 1.20951421e-01, 1.66604537e-01],\n", " [7.22222222e-01, 5.15746162e-01, 6.50043391e-01, 4.75028989e-01,\n", " 4.81578590e-01, 5.27853859e-01],\n", " [9.44444444e-01, 7.32552619e-01, 9.99799390e-01, 2.48335520e-01,\n", " 5.77011575e-01, 6.30277019e-01],\n", " [5.55555556e-02, 6.90140845e-01, 2.62082304e-02, 3.09146826e-02,\n", " 2.10799348e-02, 4.59537506e-02],\n", " [5.55555556e-01, 9.11694888e-01, 2.64618908e-01, 7.41384232e-01,\n", " 3.26202971e-01, 3.76545523e-01],\n", " [1.11111111e-01, 6.58648520e-01, 4.59367528e-02, 2.53632717e-01,\n", " 7.12369492e-02, 1.25392359e-01],\n", " [0.00000000e+00, 6.83335971e-01, 2.14236782e-02, 2.63224027e-02,\n", " 1.18826301e-02, 3.39259298e-02],\n", " [8.33333333e-01, 8.17692673e-01, 8.71812713e-01, 2.07203245e-01,\n", " 5.11094943e-01, 5.27062449e-01],\n", " [8.88888889e-01, 1.44010128e-01, 5.36786887e-01, 
3.08322356e-01,\n", " 4.27701471e-01, 8.35909435e-01],\n", " [0.00000000e+00, 4.18895395e-01, 2.21456890e-02, 5.02564960e-02,\n", " 1.04576035e-02, 4.00324437e-02],\n", " [7.22222222e-01, 7.49010919e-01, 4.58983397e-01, 4.93503389e-02,\n", " 3.36712215e-01, 3.34089054e-01],\n", " [1.00000000e+00, 7.40623516e-01, 8.01174907e-01, 7.45341047e-02,\n", " 5.74353051e-01, 5.33536425e-01],\n", " [8.88888889e-01, 5.25241336e-01, 8.96903285e-01, 4.68055899e-01,\n", " 6.24309385e-01, 6.96451388e-01],\n", " [7.77777778e-01, 5.19069473e-01, 7.53418917e-01, 4.82109655e-01,\n", " 5.25635444e-01, 5.82191322e-01],\n", " [5.55555556e-01, 7.76705175e-01, 4.06844447e-01, 1.76661501e-01,\n", " 2.42759819e-01, 2.98763149e-01],\n", " [7.22222222e-01, 1.29450862e-01, 3.60436446e-01, 2.88561556e-01,\n", " 2.81028974e-01, 6.35990288e-01]])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test_continuous_scaled2 = mm_scaler.transform(X_test_continuous)\n", "X_test_continuous_scaled2" ] }, { "cell_type": "code", "execution_count": null, "id": "de1f1e92", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.9" } }, "nbformat": 4, "nbformat_minor": 5 }