# Preprocessing範例

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Preparation (拆分訓練集跟測試集)

In [None]:
# Toy dataset: a 5x5 feature matrix holding 0..24 row by row,
# plus one integer label (0..4) per row.
X = np.arange(25).reshape(5, 5)
Y = np.arange(5)

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the label vector.
Y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. The split is shuffled, so the
# seed is pinned to make the rows shown in the following cells the same on
# every run (the wine-data split later in this notebook pins it too).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0)

In [None]:
# Display the training portion of the feature matrix.
X_train

In [None]:
# Display the training portion of the labels.
Y_train

## 缺失值處理 (Dealing with Missing Data)

In [None]:
# Wrap the feature matrix in a DataFrame and blank out three cells so there
# is something to impute.
df = pd.DataFrame(X)
for row, col in ((1, 2), (2, 3), (4, 1)):
    df.iloc[row, col] = np.nan
df

In [None]:
from sklearn.impute import SimpleImputer

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22. SimpleImputer is its replacement: it always imputes
# column-wise (the old axis=0) and takes np.nan, not the string 'NaN', as
# the missing-value marker.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df.values)
# Each NaN is replaced by the mean of the non-missing values in its column.
imputed_data = imr.transform(df.values)
imputed_data

## 特徵縮放 (Feature Scaling)

In [None]:
# Standardization: rescale every column to zero mean and unit variance.
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from sklearn.preprocessing import StandardScaler

data = [[0, 0], [0, 0], [1, 1], [1, 1]]
# fit() returns the estimator itself, so construction and fitting can be chained.
scaler = StandardScaler().fit(data)
scaler.transform(data)

In [None]:
# Per-column mean learned by the fitted scaler.
scaler.mean_

In [None]:
# Per-column variance learned by the fitted scaler.
scaler.var_

In [None]:
# Apply the already-fitted scaler to a new sample; the statistics learned
# during fit are reused, nothing is re-estimated.
scaler.transform([[2, 2]])

In [None]:
# Normalization: rescale every column linearly onto the [0, 1] range.
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
from sklearn.preprocessing import MinMaxScaler

data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
# fit() returns the estimator itself, so construction and fitting can be chained.
scaler = MinMaxScaler().fit(data)
scaler.transform(data)

In [None]:
# Per-column maximum seen during fit.
scaler.data_max_

In [None]:
# Per-column minimum seen during fit.
scaler.data_min_

In [None]:
# Apply the already-fitted scaler to a new sample. Values outside the range
# seen during fit are not clipped by default, so the result can fall
# outside [0, 1].
scaler.transform([[2, 2]])

## Label Encoding

In [None]:
import pandas as pd

# Small example table: two categorical features, one numeric feature and a
# string class label.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the string class labels to integer codes. The fitted encoder is kept
# so the mapping can be inverted afterwards.
class_le = LabelEncoder().fit(df['classlabel'].values)
y = class_le.transform(df['classlabel'].values)
y

In [None]:
# Reverse mapping: recover the original class-label strings from the
# integer codes.
class_le.inverse_transform(y)

## One-Hot Encoding

In [None]:
# Pull the three feature columns out as an array, then overwrite the two
# string columns (0: color, 1: size) in place with integer codes.
X = df[['color', 'size', 'price']].values

color_le, size_le = LabelEncoder(), LabelEncoder()
for col, encoder in enumerate((color_le, size_le)):
    X[:, col] = encoder.fit_transform(X[:, col])

X

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22; selecting which columns to encode
# is now the job of ColumnTransformer.
# Column 0 (color) is one-hot encoded and emitted first; the remaining
# columns are passed through unchanged after it, which matches the layout
# the old argument produced.
# sparse_threshold=1.0 keeps the stacked result sparse (as the bare encoder
# returned before), so the explicit toarray step below still applies.
ohe = ColumnTransformer([('onehot', OneHotEncoder(), [0])],
                        remainder='passthrough',
                        sparse_threshold=1.0)
ohe.fit_transform(X).toarray()

In [None]:
# Return a dense array directly so that the toarray step can be skipped.
from sklearn.compose import ColumnTransformer

# `categorical_features` was removed from OneHotEncoder in scikit-learn 0.22
# and its `sparse` flag was later renamed, so the column selection and the
# dense/sparse choice are both delegated to ColumnTransformer:
# sparse_threshold=0 always yields a dense array.
ohe = ColumnTransformer([('onehot', OneHotEncoder(), [0])],
                        remainder='passthrough',
                        sparse_threshold=0)
# X is an object array, so the stacked result is too; cast it to float to
# get the numeric output the old encoder produced.
ohe.fit_transform(X).astype(float)

In [None]:
# One-hot encoding via pandas: get_dummies expands the string columns into
# indicator columns and leaves the numeric price column as it is.

pd.get_dummies(df[['price', 'color', 'size']])

In [None]:
# Multicollinearity guard for the OneHotEncoder: one of the indicator
# columns is redundant (it is implied by the others), so the first one is
# dropped.
from sklearn.compose import ColumnTransformer

# `categorical_features` was removed from OneHotEncoder in scikit-learn
# 0.22, so the column is selected through ColumnTransformer. drop='first'
# removes the first indicator column inside the encoder, replacing the
# manual [:, 1:] slice of the stacked output.
ohe = ColumnTransformer([('onehot', OneHotEncoder(drop='first'), [0])],
                        remainder='passthrough',
                        sparse_threshold=0)
# X is an object array, so cast the stacked result to float.
ohe.fit_transform(X).astype(float)

## Feature Selection

In [None]:
# Download the UCI wine dataset. The file has no header row, so the column
# names are supplied here; the first column is the class label.
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=[
        'Class label', 'Alcohol', 'Malic acid', 'Ash',
        'Alcalinity of ash', 'Magnesium', 'Total phenols',
        'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
        'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
        'Proline',
    ],
)

print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()

In [None]:
from sklearn.model_selection import train_test_split

# Column 0 is the class label; every other column is a feature.
X = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values

# 70/30 split with a fixed seed, stratified so both parts keep the class
# proportions of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

feat_labels = df_wine.columns[1:]

# Fit a 500-tree forest and read off its per-feature importance scores.
forest = RandomForestClassifier(n_estimators=500, random_state=1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Ranked text listing: rank, feature name padded to 30 characters, score.
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))

# Bar chart of the same ranking, one bar per feature.
n_features = X_train.shape[1]
positions = range(n_features)

plt.title('Feature Importance')
plt.bar(positions, importances[indices], align='center')
plt.xticks(positions, feat_labels[indices], rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()
plt.show()

In [None]:
from sklearn.feature_selection import SelectFromModel

# Reuse the already-fitted forest (prefit=True) and keep only the features
# whose importance is at least 0.1.
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)

# transform() keeps every row and drops columns, so the count of features
# that passed the threshold is the column count, shape[1]. shape[0] is just
# the unchanged number of training samples.
print('Number of features that meet this threshold criterion:',
      X_selected.shape[1])

In [None]:
# List the selected features. `indices` is sorted by descending importance,
# so the first X_selected.shape[1] entries are the ones that were kept.
for rank, idx in enumerate(indices[:X_selected.shape[1]], start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))