{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Author：马肖\n",
    "#### E-Mail：maxiaoscut@aliyun.com\n",
    "#### GitHub：https://github.com/Albertsr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd \n",
    "\n",
    "\n",
    "rdg = np.random.RandomState(2017)\n",
    "age = rdg.randint(1, 78, 20)\n",
    "fare = rdg.uniform(10, 100, 20)\n",
    "df = pd.DataFrame({'Age':age, 'Fare':fare}).round(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 离散化后的特征对异常数据有很强的鲁棒性：比如一个特征是年龄>30是1，否则0。\n",
    "- 如果特征没有离散化，一个异常数据“年龄300岁”会给模型造成很大的干扰；"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 等距分箱"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 方法一：运用pd.cut()\n",
    "- [pandas.cut官方文档](http://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.cut.html)\n",
    "- pandas.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.928, 19.0]    8\n",
       "(55.0, 73.0]     5\n",
       "(37.0, 55.0]     5\n",
       "(19.0, 37.0]     2\n",
       "Name: Age, dtype: int64"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.cut(df['Age'], 4).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "中年    8\n",
       "少年    8\n",
       "老年    3\n",
       "青年    1\n",
       "Name: Age, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "age_catogary = pd.cut(df['Age'], bins=[0, 17, 35, 59, 100], labels=['少年', '青年', '中年', '老年'])\n",
    "age_catogary.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Age_少年</th>\n",
       "      <th>Age_青年</th>\n",
       "      <th>Age_中年</th>\n",
       "      <th>Age_老年</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>60</td>\n",
       "      <td>76.21</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10</td>\n",
       "      <td>71.77</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>71</td>\n",
       "      <td>24.76</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>14</td>\n",
       "      <td>71.50</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>43</td>\n",
       "      <td>43.19</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Age   Fare  Age_少年  Age_青年  Age_中年  Age_老年\n",
       "0   60  76.21       0       0       0       1\n",
       "1   10  71.77       1       0       0       0\n",
       "2   71  24.76       0       0       0       1\n",
       "3   14  71.50       1       0       0       0\n",
       "4   43  43.19       0       0       1       0"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "age_dummies = pd.get_dummies(age_catogary, prefix='Age')\n",
    "df = df.join(age_dummies)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 方法二：运用np.digitize进行等距分段"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1.        , 25.33333333, 49.66666667, 74.        ])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将年龄分为3个区间\n",
    "bins = np.linspace(df['Age'].min(), df['Age'].max()+1, 4)\n",
    "bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3, 1, 3, 1, 2, 1, 3, 3, 1, 2, 1, 1, 3, 1, 2, 2, 2, 1, 3, 2],\n",
       "      dtype=int64)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "age_bins = np.digitize(df['Age'], bins)\n",
    "age_bins"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  2. 等频分箱qcut\n",
    "- [pandas.qcut官方文档](http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html)\n",
    "- Discretize variable into equal-sized buckets based on rank or based on sample quantiles.\n",
    "- cut将根据值本身来选择箱子均匀间隔，qcut是根据这些值的频率来选择箱子的均匀间隔"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_bins = pd.qcut(df['Age'], 4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(53.0, 73.0]     5\n",
       "(37.5, 53.0]     5\n",
       "(9.75, 37.5]     5\n",
       "(0.999, 9.75]    5\n",
       "Name: Age, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.qcut(df['Age'], 4).value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. 自定义区间对费用分段"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Age_少年</th>\n",
       "      <th>Age_青年</th>\n",
       "      <th>Age_中年</th>\n",
       "      <th>Age_老年</th>\n",
       "      <th>Fare_high</th>\n",
       "      <th>Fare_low</th>\n",
       "      <th>Fare_middle</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>60</td>\n",
       "      <td>76.21</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10</td>\n",
       "      <td>71.77</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>71</td>\n",
       "      <td>24.76</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>14</td>\n",
       "      <td>71.50</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>43</td>\n",
       "      <td>43.19</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Age   Fare  Age_少年  Age_青年  Age_中年  Age_老年  Fare_high  Fare_low  \\\n",
       "0   60  76.21       0       0       0       1          0         1   \n",
       "1   10  71.77       1       0       0       0          0         0   \n",
       "2   71  24.76       0       0       0       1          1         0   \n",
       "3   14  71.50       1       0       0       0          0         0   \n",
       "4   43  43.19       0       0       1       0          1         0   \n",
       "\n",
       "   Fare_middle  \n",
       "0            0  \n",
       "1            1  \n",
       "2            0  \n",
       "3            1  \n",
       "4            0  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def fare_rate_func(x):\n",
    "    if x <= np.percentile(df['Fare'], 25):\n",
    "        return 'high'\n",
    "    elif np.percentile(df['Fare'],25) < x <= np.percentile(df['Fare'], 75):\n",
    "        return 'middle'\n",
    "    else:\n",
    "        return 'low'\n",
    "    \n",
    "df['fare_rate'] = df['Fare'].apply(fare_rate_func)\n",
    "# df['fare_rate'] = df['Fare'].map(fare_rate_func)\n",
    "fare_dummies = pd.get_dummies(df['fare_rate'], prefix='Fare')\n",
    "df.drop(['fare_rate'], axis=1, inplace=True)\n",
    "df = df.join(fare_dummies)\n",
    "df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}