{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing範例"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "# All imports live in this one cell so a fresh kernel can Run All top to bottom.\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import (LabelEncoder, MinMaxScaler,\n",
    "                                   OneHotEncoder, StandardScaler)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Preparation (拆分訓練集跟測試集)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# toy data: 5 samples x 5 features, one integer target per sample\n",
    "X = np.array(range(25)).reshape(5, 5)\n",
    "Y = np.array(range(5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# random_state pins the shuffle so the split is identical on every run\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y_train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 缺失值處理 (Dealing with Missing Data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the frame as float up front: NaN cannot be stored in an integer column,\n",
    "# and recent pandas versions warn about the implicit int -> float upcast.\n",
    "df = pd.DataFrame(X, dtype=float)\n",
    "df.iloc[1, 2] = np.nan\n",
    "df.iloc[2, 3] = np.nan\n",
    "df.iloc[4, 1] = np.nan\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22.\n",
    "# SimpleImputer always imputes column-wise, which is what axis=0 used to request.\n",
    "imr = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
    "imr = imr.fit(df.values)\n",
    "imputed_data = imr.transform(df.values)\n",
    "imputed_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特徵縮放 (Feature Scaling)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standardization\n",
    "# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html\n",
    "data = [[0, 0], [0, 0], [1, 1], [1, 1]]\n",
    "scaler = StandardScaler()\n",
    "scaler.fit(data)\n",
    "scaler.transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler.mean_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler.var_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler.transform([[2, 2]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalization\n",
    "# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html\n",
    "data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]\n",
    "scaler = MinMaxScaler()\n",
    "scaler.fit(data)\n",
    "scaler.transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler.data_max_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler.data_min_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler.transform([[2, 2]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Label Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame([['green', 'M', 10.1, 'class1'],\n",
    "                   ['red', 'L', 13.5, 'class2'],\n",
    "                   ['blue', 'XL', 15.3, 'class1']])\n",
    "\n",
    "df.columns = ['color', 'size', 'price', 'classlabel']\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label encoding with sklearn's LabelEncoder\n",
    "class_le = LabelEncoder()\n",
    "y = class_le.fit_transform(df['classlabel'].values)\n",
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reverse mapping\n",
    "class_le.inverse_transform(y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## One-Hot Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df[['color', 'size', 'price']].values\n",
    "\n",
    "# replace the string categories in columns 0 and 1 with integer codes\n",
    "color_le = LabelEncoder()\n",
    "X[:, 0] = color_le.fit_transform(X[:, 0])\n",
    "\n",
    "size_le = LabelEncoder()\n",
    "X[:, 1] = size_le.fit_transform(X[:, 1])\n",
    "\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# OneHotEncoder lost its `categorical_features` argument in scikit-learn 0.22.\n",
    "# A ColumnTransformer now chooses the columns to encode (column 0, the color)\n",
    "# and appends the untouched remaining columns after the one-hot columns.\n",
    "ohe = ColumnTransformer([('onehot', OneHotEncoder(), [0])],\n",
    "                        remainder='passthrough')\n",
    "\n",
    "# The stacked output is sparse only when its density is below sparse_threshold\n",
    "# (default 0.3); cast to float to get a uniform numeric result.\n",
    "ohe.fit_transform(X).astype(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sparse_threshold=0 makes the ColumnTransformer always return a dense array,\n",
    "# whatever the density of the encoded data\n",
    "\n",
    "ohe = ColumnTransformer([('onehot', OneHotEncoder(), [0])],\n",
    "                        remainder='passthrough',\n",
    "                        sparse_threshold=0)\n",
    "ohe.fit_transform(X).astype(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# one-hot encoding via pandas\n",
    "\n",
    "pd.get_dummies(df[['price', 'color', 'size']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# multicollinearity guard for the OneHotEncoder:\n",
    "# drop='first' omits the first dummy column of the encoded feature\n",
    "\n",
    "ohe = ColumnTransformer([('onehot', OneHotEncoder(drop='first'), [0])],\n",
    "                        remainder='passthrough')\n",
    "ohe.fit_transform(X).astype(float)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# UCI wine data set: the first column is the class label, the rest are features\n",
    "df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)\n",
    "\n",
    "df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',\n",
    "                   'Alcalinity of ash', 'Magnesium', 'Total phenols',\n",
    "                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',\n",
    "                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',\n",
    "                   'Proline']\n",
    "\n",
    "print('Class labels', np.unique(df_wine['Class label']))\n",
    "df_wine.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values\n",
    "\n",
    "# stratify=y keeps the class proportions equal in the train and test parts\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feat_labels = df_wine.columns[1:]\n",
    "\n",
    "forest = RandomForestClassifier(n_estimators=500,\n",
    "                                random_state=1)\n",
    "\n",
    "forest.fit(X_train, y_train)\n",
    "importances = forest.feature_importances_\n",
    "\n",
    "# feature positions ordered from most to least important\n",
    "indices = np.argsort(importances)[::-1]\n",
    "\n",
    "for f in range(X_train.shape[1]):\n",
    "    print(\"%2d) %-*s %f\" % (f + 1, 30,\n",
    "                            feat_labels[indices[f]],\n",
    "                            importances[indices[f]]))\n",
    "\n",
    "plt.title('Feature Importance')\n",
    "plt.bar(range(X_train.shape[1]),\n",
    "        importances[indices],\n",
    "        align='center')\n",
    "\n",
    "plt.xticks(range(X_train.shape[1]),\n",
    "           feat_labels[indices], rotation=90)\n",
    "plt.xlim([-1, X_train.shape[1]])\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# keep only the features whose importance reaches the threshold\n",
    "sfm = SelectFromModel(forest, threshold=0.1, prefit=True)\n",
    "X_selected = sfm.transform(X_train)\n",
    "\n",
    "# columns are features, so the number of selected features is shape[1]\n",
    "print('Number of features that meet this threshold criterion:',\n",
    "      X_selected.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the selected features are exactly the top-ranked ones, so walk the ranking\n",
    "for f in range(X_selected.shape[1]):\n",
    "    print(\"%2d) %-*s %f\" % (f + 1, 30,\n",
    "                            feat_labels[indices[f]],\n",
    "                            importances[indices[f]]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}