{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Author:马肖\n", "#### E-Mail:maxiaoscut@aliyun.com\n", "#### GitHub:https://github.com/Albertsr" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1 偏态系数" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 若偏态系数绝对值大于1,称为高度偏态分布\n", "- 若绝对值在[0.5, 1]区间内,称为中等偏态分布\n", "- 偏态系数小于0,称为左偏(概率密度曲线左侧尾部偏长);大于0,称为右偏(概率密度曲线右侧尾部偏长);接近0则分布近似对称" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2 np.log1p或np.sqrt能降低数值型特征的偏度\n", "StandardScaler,MinMaxScaler等标准化操作对偏度无影响" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "from matplotlib import pyplot as plt\n", "sns.set(style=\"white\", color_codes=True)\n", "\n", "# Silence every warning for the rest of the session; this also hides\n", "# scikit-learn's deprecation warning for load_boston.\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,\n", "# so this notebook needs scikit-learn < 1.2 to run.\n", "from sklearn.datasets import load_boston\n", "from sklearn.preprocessing import StandardScaler,MinMaxScaler\n", "\n", "# Load the dataset once and reuse the returned Bunch instead of calling load_boston twice.\n", "dataset = load_boston()\n", "feature_names = dataset.feature_names\n", "X, y = dataset.data, dataset.target" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(X, columns=feature_names)\n", "\n", "# Scaled copies of the features, rebuilt as DataFrames so the column names are kept.\n", "scaler_1 = StandardScaler()\n", "df_StandardScaled = pd.DataFrame(scaler_1.fit_transform(df), columns=feature_names)\n", "\n", "scaler_2 = MinMaxScaler()\n", "df_MinMaxScaled = pd.DataFrame(scaler_2.fit_transform(df), columns=feature_names)\n", "\n", "# NumPy ufuncs apply element-wise to a DataFrame and return a DataFrame, so the\n", "# per-element applymap/lambda (deprecated since pandas 2.1) is not needed.\n", "# Both transforms are applied to the min-max scaled frame.\n", "df_lpg1p = np.log1p(df_MinMaxScaled)\n", "df_sqrt = np.sqrt(df_MinMaxScaled)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | skew_original | \n", "skew_StandardScaled | \n", "skew_MinMaxScaled | \n", "skew_logeded | \n", "skew_sqrt | \n", "
|---|---|---|---|---|---|
| AGE | \n", "-0.598963 | \n", "-0.598963 | \n", "-0.598963 | \n", "-0.806758 | \n", "-1.055553 | \n", "
| B | \n", "-2.890374 | \n", "-2.890374 | \n", "-2.890374 | \n", "-3.084347 | \n", "-3.366608 | \n", "
| CHAS | \n", "3.405904 | \n", "3.405904 | \n", "3.405904 | \n", "3.405904 | \n", "3.405904 | \n", "
| CRIM | \n", "5.223149 | \n", "5.223149 | \n", "5.223149 | \n", "4.128753 | \n", "2.014322 | \n", "
| DIS | \n", "1.011781 | \n", "1.011781 | \n", "1.011781 | \n", "0.734378 | \n", "0.293500 | \n", "
| INDUS | \n", "0.295022 | \n", "0.295022 | \n", "0.295022 | \n", "0.123367 | \n", "-0.121184 | \n", "
| LSTAT | \n", "0.906460 | \n", "0.906460 | \n", "0.906460 | \n", "0.578045 | \n", "0.140307 | \n", "
| NOX | \n", "0.729308 | \n", "0.729308 | \n", "0.729308 | \n", "0.409697 | \n", "0.037881 | \n", "
| PTRATIO | \n", "-0.802325 | \n", "-0.802325 | \n", "-0.802325 | \n", "-1.070027 | \n", "-1.532034 | \n", "
| RAD | \n", "1.004815 | \n", "1.004815 | \n", "1.004815 | \n", "0.938857 | \n", "0.637851 | \n", "
| RM | \n", "0.403612 | \n", "0.403612 | \n", "0.403612 | \n", "-0.127789 | \n", "-0.990823 | \n", "
| TAX | \n", "0.669956 | \n", "0.669956 | \n", "0.669956 | \n", "0.511587 | \n", "0.182230 | \n", "
| ZN | \n", "2.225666 | \n", "2.225666 | \n", "2.225666 | \n", "1.969749 | \n", "1.476293 | \n", "
| SKEW_SUM | \n", "20.167333 | \n", "20.167333 | \n", "20.167333 | \n", "17.889259 | \n", "15.254490 | \n", "