{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1 数据的标准化"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.1 scale:sklearn.preprocessing.scale(X, axis=0, with_mean=True, with_std=True, copy=True)\n",
"- with_mean : boolean, True by default. If True, center the data before scaling. 即使得对应axis上的均值为0\n",
"- with_std : boolean, True by default. If True, scale the data to unit variance. 即使得对应axis上的方差为1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean: [0 0 0], \n",
"Std: [1. 1. 1.]\n"
]
}
],
"source": [
"import numpy as np\n",
"# Explicit imports instead of `import *`: these are the names the notebook uses.\n",
"from sklearn.preprocessing import (\n",
"    MaxAbsScaler,\n",
"    MinMaxScaler,\n",
"    RobustScaler,\n",
"    StandardScaler,\n",
"    normalize,\n",
"    scale,\n",
")\n",
"\n",
"# Reproducible 4x3 sample drawn uniformly from [0, 5).\n",
"rg = np.random.RandomState(2017)\n",
"X_train = rg.uniform(0, 5, (4,3))\n",
"X_scaled = scale(X_train)\n",
"# `np.int` was removed in NumPy 1.24; it was an alias of the builtin `int`.\n",
"# The integer dtype keeps the printed per-column mean at exactly 0.\n",
"print('Mean: {}, \\nStd: {}'.format(X_scaled.mean(axis=0, dtype=int), X_scaled.std(axis=0)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### scale的参数axis=0,表示对每列进行标准化,即每列减去此列均值再除以其标准差"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def f(array):\n",
"    \"\"\"Standardize a 1-D array: subtract its mean, then divide by its std.\"\"\"\n",
"    centered = array - np.mean(array)\n",
"    # ddof=0 is NumPy's default (population standard deviation).\n",
"    return centered / np.std(array, ddof=0)\n",
"\n",
"# Apply f to every column (axis=0) and compare with the result of scale().\n",
"scale_result = np.apply_along_axis(f, axis=0, arr=X_train)\n",
"assert np.allclose(X_scaled, scale_result)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2 StandardScaler:sklearn.preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)\n",
"\n",
"可通过fit方法获取训练数据各特征的均值与方差,再运用transform方法用同样的参数标准化其他数据(例如测试集)\n",
"\n",
"#### 优点:\n",
"1)提升模型的收敛速度\n",
"\n",
"2)使得各指标值都处于同一个量纲上,提升模型的精度"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal length (cm) | \n",
" sepal width (cm) | \n",
" petal length (cm) | \n",
" petal width (cm) | \n",
"
\n",
" \n",
" \n",
" \n",
" | 143 | \n",
" 6.8 | \n",
" 3.2 | \n",
" 5.9 | \n",
" 2.3 | \n",
"
\n",
" \n",
" | 115 | \n",
" 6.4 | \n",
" 3.2 | \n",
" 5.3 | \n",
" 2.3 | \n",
"
\n",
" \n",
" | 102 | \n",
" 7.1 | \n",
" 3.0 | \n",
" 5.9 | \n",
" 2.1 | \n",
"
\n",
" \n",
" | 51 | \n",
" 6.4 | \n",
" 3.2 | \n",
" 4.5 | \n",
" 1.5 | \n",
"
\n",
" \n",
" | 76 | \n",
" 6.8 | \n",
" 2.8 | \n",
" 4.8 | \n",
" 1.4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)\n",
"143 6.8 3.2 5.9 2.3\n",
"115 6.4 3.2 5.3 2.3\n",
"102 7.1 3.0 5.9 2.1\n",
"51 6.4 3.2 4.5 1.5\n",
"76 6.8 2.8 4.8 1.4"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.datasets import load_iris \n",
"\n",
"# sample() is called without random_state, so it relies on this global NumPy seed.\n",
"np.random.seed(2017)\n",
"dataset = load_iris()\n",
"iris = pd.DataFrame(data=dataset.data, columns=dataset.feature_names).sample(n=5)\n",
"iris"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 对某数据框直接调用fit_transform时,等价于单独对每列分别进行scale操作"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.372678 , 0.75 , 1.0932857 , 0.96958969],\n",
" [-1.11803399, 0.75 , 0.03526728, 0.96958969],\n",
" [ 1.49071198, -0.5 , 1.0932857 , 0.45927933],\n",
" [-1.11803399, 0.75 , -1.37542395, -1.07165176],\n",
" [ 0.372678 , -1.75 , -0.84641474, -1.32680694]])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the scaler on the sampled iris frame and standardize it in one call.\n",
"scaler = StandardScaler()\n",
"iris_scaled = scaler.fit_transform(X=iris)\n",
"iris_scaled"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([0, 0, 0, 0]), array([1., 1., 1., 1.]))"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# `np.int` was removed in NumPy 1.24; it was an alias of the builtin `int`.\n",
"# Per-column mean (integer dtype, so it displays as 0) and standard deviation.\n",
"iris_scaled.mean(axis=0, dtype=int), iris_scaled.std(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# StandardScaler.fit_transform gives the same result as calling scale() on the frame.\n",
"np.allclose(a=scaler.fit_transform(iris), b=scale(iris))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2 数据的归一化:将数据映射到指定的范围,用于去除不同维度数据的量纲以及量纲单位"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### sklearn.preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 转换过程\n",
"`X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))`\n",
"\n",
"`X_scaled = X_std * (max - min) + min`,其中 min、max 分别为参数 feature_range 的下界与上界"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"MinMaxScaler(copy=True, feature_range=(0, 1))"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Four samples, two features on different scales.\n",
"data = [\n",
"    [-1, 2],\n",
"    [-0.5, 6],\n",
"    [0, 10],\n",
"    [1, 18],\n",
"]\n",
"scaler = MinMaxScaler()\n",
"scaler.fit(data)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0. , 0. ],\n",
" [0.25, 0.25],\n",
" [0.5 , 0.5 ],\n",
" [1. , 1. ]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rescale the data with the parameters learned by fit() in the previous cell.\n",
"scaler.transform(data)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[0. , 0. ],\n",
" [0.25, 0.25],\n",
" [0.5 , 0.5 ],\n",
" [1. , 1. ]])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Alternatively, fit and transform in a single call.\n",
"scaler.fit_transform(data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.3 MaxAbsScaler"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.5, -1. , 1. ],\n",
" [ 1. , 0. , 0. ],\n",
" [ 0. , 1. , -0.5]])"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Small 3x3 training matrix for the MaxAbsScaler demo.\n",
"X_train = np.array([\n",
"    [1., -1.,  2.],\n",
"    [2.,  0.,  0.],\n",
"    [0.,  1., -1.],\n",
"])\n",
"\n",
"max_abs_scaler = MaxAbsScaler()\n",
"X_train_maxabs = max_abs_scaler.fit_transform(X_train)\n",
"X_train_maxabs"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([2., 1., 2.])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-column scaling factor learned during fit (here the column-wise max |x|).\n",
"max_abs_scaler.scale_"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[-1.5, -1. , 2. ]])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Transform a new row with the factors learned from X_train;\n",
"# values may fall outside [-1, 1] when the new data exceeds the training range.\n",
"X_test = np.array([[-3., -1., 4.]])\n",
"X_test_maxabs = max_abs_scaler.transform(X_test)\n",
"X_test_maxabs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.4 RobustScaler"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 转化过程:(x-median) / IQR, IQR等于75分位点减去25分位点处的值"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.12669367, 0.61018033, 1.22235048],\n",
" [-2.02310438, -0.03205827, 0.34615926],\n",
" [ 0.12669367, -3.19747007, -0.70069397],\n",
" [ 1.2167336 , 0.03205827, -0.34615926]])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.seed(2018)\n",
"X_train = np.random.randn(4,3)\n",
"\n",
"# Name the objects after the estimator actually used: this is a RobustScaler,\n",
"# not a MaxAbsScaler.\n",
"robust_scaler = RobustScaler()\n",
"X_train_robust = robust_scaler.fit_transform(X_train)\n",
"# Backward-compatible alias: the cells below still read `max_abs_scaler`.\n",
"max_abs_scaler = robust_scaler\n",
"X_train_robust"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.20977884, 0.50624895, 0.34544916])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Median of each column, as stored by the fitted RobustScaler.\n",
"max_abs_scaler.center_"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0.52874591, 0.12390117, 1.47498622])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# IQR of each column, as stored by the fitted RobustScaler.\n",
"max_abs_scaler.scale_"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check that scale_ equals the IQR (75th percentile minus 25th percentile) per column.\n",
"q25, q75 = np.percentile(X_train, [25, 75], axis=0)\n",
"IQR = q75 - q25\n",
"np.allclose(max_abs_scaler.scale_, IQR)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2 正则化"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1 L1正则化:每行各元素除以每行的L1范数"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L1正则化:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.25 | \n",
" -0.25 | \n",
" 0.5 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1.00 | \n",
" 0.00 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.00 | \n",
" 0.50 | \n",
" -0.5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2\n",
"0 0.25 -0.25 0.5\n",
"1 1.00 0.00 0.0\n",
"2 0.00 0.50 -0.5"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = [\n",
"    [1, -1, 2],\n",
"    [2, 0, 0],\n",
"    [0, 1, -1],\n",
"]\n",
"# Labelled copy of x, used by the manual computation in the next cell.\n",
"df = pd.DataFrame(data=x, columns=['A', 'B', 'C'])\n",
"\n",
"# normalize() rescales each row so that its L1 norm equals 1.\n",
"x_norm1 = normalize(x, norm='l1')\n",
"df_norm1 = pd.DataFrame(data=x_norm1)\n",
"print('L1正则化:')\n",
"df_norm1"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.25 | \n",
" -0.25 | \n",
" 0.5 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1.00 | \n",
" 0.00 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.00 | \n",
" 0.50 | \n",
" -0.5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C\n",
"0 0.25 -0.25 0.5\n",
"1 1.00 0.00 0.0\n",
"2 0.00 0.50 -0.5"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Manual L1 normalization: divide every row by the sum of its absolute values.\n",
"# Vectorized on purpose: the row-by-row loop wrote float rows into the integer\n",
"# columns of df.copy() and used index labels as positional offsets.\n",
"l1_norms = df.abs().sum(axis=1)\n",
"df_norm1 = df.div(l1_norms, axis=0)\n",
"df_norm1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.2 L2正则化:每行各元素除以每行的L2范数"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" -1 | \n",
" 2 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 1 | \n",
" -1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C\n",
"0 1 -1 2\n",
"1 2 0 0\n",
"2 0 1 -1"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same 3x3 example as in the L1 section, rebuilt here for the L2 demo.\n",
"x = [\n",
"    [1, -1, 2],\n",
"    [2, 0, 0],\n",
"    [0, 1, -1],\n",
"]\n",
"df = pd.DataFrame(data=x, columns=['A', 'B', 'C'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.408248 | \n",
" -0.408248 | \n",
" 0.816497 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.000000 | \n",
" 0.707107 | \n",
" -0.707107 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2\n",
"0 0.408248 -0.408248 0.816497\n",
"1 1.000000 0.000000 0.000000\n",
"2 0.000000 0.707107 -0.707107"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# normalize() rescales each row of x so that its L2 norm equals 1.\n",
"x_norm2 = normalize(x, norm='l2')\n",
"df_norm2 = pd.DataFrame(data=x_norm2)\n",
"df_norm2"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.408248 | \n",
" -0.408248 | \n",
" 0.816497 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.000000 | \n",
" 0.707107 | \n",
" -0.707107 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C\n",
"0 0.408248 -0.408248 0.816497\n",
"1 1.000000 0.000000 0.000000\n",
"2 0.000000 0.707107 -0.707107"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Manual L2 normalization: divide every row by the square root of its sum of squares.\n",
"# Vectorized on purpose: the row-by-row loop wrote float rows into the integer\n",
"# columns of df.copy() and used index labels as positional offsets.\n",
"l2_norms = np.sqrt(np.square(df).sum(axis=1))\n",
"df_norm2 = df.div(l2_norms, axis=0)\n",
"df_norm2"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}