{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing範例"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Preparation (拆分訓練集跟測試集)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = np.array(range(25)).reshape(5, 5)\n",
    "Y = np.array(range(5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y_train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 缺失值處理 (Dealing with Missing Data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(X)\n",
    "df.iloc[1, 2] = np.nan\n",
    "df.iloc[2, 3] = np.nan\n",
    "df.iloc[4, 1] = np.nan\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import Imputer\n",
    "\n",
    "imr = Imputer(missing_values='NaN', strategy='mean', axis=0)\n",
    "imr = imr.fit(df.values)\n",
    "imputed_data = imr.transform(df.values)\n",
    "imputed_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特徵縮放 (Feature Scaling)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standardization\n",
    "# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "data = [[0, 0], [0, 0], [1, 1], [1, 1]]\n",
    "scaler = StandardScaler()\n",
    "scaler.fit(data)\n",
    "scaler.transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler.mean_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler.var_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler.transform([[2, 2]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalization\n",
    "# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]\n",
    "scaler = MinMaxScaler()\n",
    "scaler.fit(data)\n",
    "scaler.transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler.data_max_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler.data_min_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler.transform([[2, 2]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Label Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.DataFrame([['green', 'M', 10.1, 'class1'],\n",
    "                   ['red', 'L', 13.5, 'class2'],\n",
    "                   ['blue', 'XL', 15.3, 'class1']])\n",
    "\n",
    "df.columns = ['color', 'size', 'price', 'classlabel']\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# Label encoding with sklearn's LabelEncoder\n",
    "class_le = LabelEncoder()\n",
    "y = class_le.fit_transform(df['classlabel'].values)\n",
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reverse mapping\n",
    "class_le.inverse_transform(y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## One-Hot Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df[['color', 'size', 'price']].values\n",
    "\n",
    "color_le = LabelEncoder()\n",
    "X[:, 0] = color_le.fit_transform(X[:, 0])\n",
    "\n",
    "size_le = LabelEncoder()\n",
    "X[:, 1] = size_le.fit_transform(X[:, 1])\n",
    "\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "\n",
    "ohe = OneHotEncoder(categorical_features=[0])\n",
    "ohe.fit_transform(X).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# return dense array so that we can skip\n",
    "# the toarray step\n",
    "\n",
    "ohe = OneHotEncoder(categorical_features=[0], sparse=False)\n",
    "ohe.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# one-hot encoding via pandas\n",
    "\n",
    "pd.get_dummies(df[['price', 'color', 'size']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# multicollinearity guard for the OneHotEncoder\n",
    "\n",
    "ohe = OneHotEncoder(categorical_features=[0])\n",
    "ohe.fit_transform(X).toarray()[:, 1:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)\n",
    "\n",
    "df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',\n",
    "                   'Alcalinity of ash', 'Magnesium', 'Total phenols',\n",
    "                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',\n",
    "                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',\n",
    "                   'Proline']\n",
    "\n",
    "print('Class labels', np.unique(df_wine['Class label']))\n",
    "df_wine.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "feat_labels = df_wine.columns[1:]\n",
    "\n",
    "forest = RandomForestClassifier(n_estimators=500,\n",
    "                                random_state=1)\n",
    "\n",
    "forest.fit(X_train, y_train)\n",
    "importances = forest.feature_importances_\n",
    "\n",
    "indices = np.argsort(importances)[::-1]\n",
    "\n",
    "for f in range(X_train.shape[1]):\n",
    "    print(\"%2d) %-*s %f\" % (f + 1, 30, \n",
    "                            feat_labels[indices[f]], \n",
    "                            importances[indices[f]]))\n",
    "\n",
    "plt.title('Feature Importance')\n",
    "plt.bar(range(X_train.shape[1]), \n",
    "        importances[indices],\n",
    "        align='center')\n",
    "\n",
    "plt.xticks(range(X_train.shape[1]), \n",
    "           feat_labels[indices], rotation=90)\n",
    "plt.xlim([-1, X_train.shape[1]])\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_selection import SelectFromModel\n",
    "\n",
    "sfm = SelectFromModel(forest, threshold=0.1, prefit=True)\n",
    "X_selected = sfm.transform(X_train)\n",
    "print('Number of samples that meet this criterion:', \n",
    "      X_selected.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for f in range(X_selected.shape[1]):\n",
    "    print(\"%2d) %-*s %f\" % (f + 1, 30, \n",
    "                            feat_labels[indices[f]], \n",
    "                            importances[indices[f]]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}