{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Scikit-learn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Scikit-learn contains simple and efficient tools for data mining and data analysis. It implements a wide variety of machine learning algorithms and processes to conduct advanced analytics.\n", "\n", "Library documentation: http://scikit-learn.org/stable/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### General" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "from sklearn import datasets\n", "from sklearn import svm" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 0. 0. 5. ..., 0. 0. 0.]\n", " [ 0. 0. 0. ..., 10. 0. 0.]\n", " [ 0. 0. 0. ..., 16. 9. 0.]\n", " ..., \n", " [ 0. 0. 1. ..., 6. 0. 0.]\n", " [ 0. 0. 2. ..., 12. 0. 0.]\n", " [ 0. 0. 10. ..., 12. 1. 
0.]]\n" ] } ], "source": [ "# import a sample dataset and view the data\n", "digits = datasets.load_digits()\n", "print(digits.data)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([0, 1, 2, ..., 8, 9, 8])" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# view the target variable\n", "digits.target" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,\n", " gamma=0.001, kernel='rbf', max_iter=-1, probability=False,\n", " random_state=None, shrinking=True, tol=0.001, verbose=False)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# train a support vector machine using everything but the last example \n", "classifier = svm.SVC(gamma=0.001, C=100.)\n", "classifier.fit(digits.data[:-1], digits.target[:-1])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([8])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# predict the target of the last example\n", "# (slice with [-1:] so predict() receives a 2-D array holding one sample)\n", "classifier.predict(digits.data[-1:])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([8])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# persist the model and reload\n", "import pickle\n", "try:\n", "    import joblib  # standalone package used by current scikit-learn\n", "except ImportError:\n", "    from sklearn.externals import joblib  # vendored copy shipped with old releases\n", "joblib.dump(classifier, 'model.pkl')\n", "classifier2 = joblib.load('model.pkl')\n", "classifier2.predict(digits.data[-1:])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import os\n", "os.remove('model.pkl')" ] }, { "cell_type": "code", 
"execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.97999999999999998" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# another example with the digits data set\n", "svc = svm.SVC(C=1, kernel='linear')\n", "svc.fit(digits.data[:-100], digits.target[:-100]).score(digits.data[-100:], digits.target[-100:])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train: [2 3 4 5] | test: [0 1]\n", "Train: [0 1 4 5] | test: [2 3]\n", "Train: [0 1 2 3] | test: [4 5]\n" ] } ], "source": [ "# perform cross-validation on the estimator's predictions\n", "from sklearn import cross_validation\n", "k_fold = cross_validation.KFold(n=6, n_folds=3)\n", "for train_indices, test_indices in k_fold:\n", " print('Train: %s | test: %s' % (train_indices, test_indices))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 0.93489149, 0.95659432, 0.93989983])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# apply to the model\n", "kfold = cross_validation.KFold(len(digits.data), n_folds=3)\n", "cross_validation.cross_val_score(svc, digits.data, digits.target, cv=kfold, n_jobs=-1)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=None,\n", " estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n", " kernel='linear', max_iter=-1, probability=False, random_state=None,\n", " shrinking=True, tol=0.001, verbose=False),\n", " fit_params={}, iid=True, loss_func=None, n_jobs=-1,\n", " param_grid={'gamma': array([ 1.00000e-06, 3.59381e-06, 1.29155e-05, 4.64159e-05,\n", " 1.66810e-04, 5.99484e-04, 2.15443e-03, 7.74264e-03,\n", " 2.78256e-02, 
1.00000e-01])},\n", " pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,\n", " verbose=0)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use the grid search module to optimize model parameters\n", "from sklearn.grid_search import GridSearchCV\n", "gammas = np.logspace(-6, -1, 10)\n", "classifier = GridSearchCV(estimator=svc, param_grid=dict(gamma=gammas), n_jobs=-1)\n", "classifier.fit(digits.data[:1000], digits.target[:1000])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.92400000000000004" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier.best_score_" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "9.9999999999999995e-07" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier.best_estimator_.gamma" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.94228356336260977" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# run against the test set\n", "classifier.score(digits.data[1000:], digits.target[1000:])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 0.93521595, 0.95826377, 0.93791946])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# nested cross-validation example\n", "cross_validation.cross_val_score(classifier, digits.data, digits.target)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Other Classifiers" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# import the iris dataset\n", 
"iris = datasets.load_iris()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", " metric_params=None, n_neighbors=5, p=2, weights='uniform')" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# k nearest neighbors\n", "from sklearn.neighbors import KNeighborsClassifier\n", "knn = KNeighborsClassifier()\n", "knn.fit(iris.data, iris.target)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "DecisionTreeClassifier(compute_importances=None, criterion='gini',\n", " max_depth=None, max_features=None, max_leaf_nodes=None,\n", " min_density=None, min_samples_leaf=1, min_samples_split=2,\n", " random_state=None, splitter='best')" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# decision tree\n", "from sklearn.tree import DecisionTreeClassifier\n", "dtree = DecisionTreeClassifier()\n", "dtree.fit(iris.data, iris.target)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,\n", " fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',\n", " loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,\n", " random_state=None, shuffle=False, verbose=0, warm_start=False)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# stochastic gradient descent\n", "from sklearn.linear_model import SGDClassifier\n", "sgd = SGDClassifier(loss=\"hinge\", penalty=\"l2\")\n", "sgd.fit(iris.data, iris.target)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of mislabeled 
points : 6\n" ] } ], "source": [ "# naive bayes: fit on iris, then count the training points it mislabels\n", "from sklearn.naive_bayes import GaussianNB\n", "gnb = GaussianNB()\n", "gnb.fit(iris.data, iris.target)\n", "y_pred = gnb.predict(iris.data)\n", "print(\"Number of mislabeled points : %d\" % (iris.target != y_pred).sum())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Regression" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# load another sample dataset\n", "diabetes = datasets.load_diabetes()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "LinearRegression(copy_X=True, fit_intercept=True, normalize=False)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# linear regression fitted on the diabetes data\n", "from sklearn import linear_model\n", "regr = linear_model.LinearRegression().fit(diabetes.data, diabetes.target)\n", "regr" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ -10.01219782 -239.81908937 519.83978679 324.39042769 -792.18416163\n", " 476.74583782 101.04457032 177.06417623 751.27932109 67.62538639]\n" ] } ], "source": [ "# regression coefficients\n", "print(regr.coef_)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2859.6903987680657" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# mean squared error on the training data\n", "((regr.predict(diabetes.data) - diabetes.target) ** 2).mean()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.51774942541329338" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# R^2 score (coefficient of determination) on the training data\n", "regr.score(diabetes.data, diabetes.target)" ] }, { 
"cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,\n", " normalize=False, solver='auto', tol=0.001)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# ridge regression\n", "regr = linear_model.Ridge(alpha=.1)\n", "regr.fit(diabetes.data, diabetes.target)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n", " normalize=False, positive=False, precompute='auto', tol=0.0001,\n", " warm_start=False)" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# lasso regression\n", "regr = linear_model.Lasso()\n", "regr.fit(diabetes.data, diabetes.target)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "LogisticRegression(C=100000.0, class_weight=None, dual=False,\n", " fit_intercept=True, intercept_scaling=1, penalty='l2',\n", " random_state=None, tol=0.0001)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# logistic regression (this is actually a classifier)\n", "iris = datasets.load_iris()\n", "logistic = linear_model.LogisticRegression(C=1e5)\n", "logistic.fit(iris.data, iris.target)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preprocessing" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# feature scaling\n", "from sklearn import preprocessing\n", "X = np.array([[ 1., -1., 2.],\n", " [ 2., 0., 0.],\n", " [ 0., 1., -1.]])\n", "X_scaled = preprocessing.scale(X)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ 
"StandardScaler(copy=True, with_mean=True, with_std=True)" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# save the scaling transform to apply to new data later\n", "scaler = preprocessing.StandardScaler().fit(X)\n", "scaler" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 0. , -1.22474487, 1.33630621],\n", " [ 1.22474487, 0. , -0.26726124],\n", " [-1.22474487, 1.22474487, -1.06904497]])" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scaler.transform(X)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 0.5 , 0. , 1. ],\n", " [ 1. , 0.5 , 0.33333333],\n", " [ 0. , 1. , 0. ]])" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# range scaling\n", "min_max_scaler = preprocessing.MinMaxScaler()\n", "X_minmax = min_max_scaler.fit_transform(X)\n", "X_minmax" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 0.40824829, -0.40824829, 0.81649658],\n", " [ 1. , 0. , 0. ],\n", " [ 0. 
, 0.70710678, -0.70710678]])" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# instance normalization using L2 norm\n", "X_normalized = preprocessing.normalize(X, norm='l2')\n", "X_normalized" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]])" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# category encoding\n", "enc = preprocessing.OneHotEncoder()\n", "enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])\n", "enc.transform([[0, 1, 3]]).toarray()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 1., 0., 1.],\n", " [ 1., 0., 0.],\n", " [ 0., 1., 0.]])" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# binarization: map each feature to 1 if it is above the threshold (default 0.0), else 0\n", "binarizer = preprocessing.Binarizer().fit(X)\n", "binarizer.transform(X)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Clustering" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,\n", " n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,\n", " verbose=0)" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# k means clustering\n", "from sklearn import cluster\n", "k_means = cluster.KMeans(n_clusters=3)\n", "k_means.fit(iris.data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Decomposition" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# create a signal with 2 useful dimensions\n", "np.random.seed(0)  # seed the global RNG so the random draws below are reproducible\n", "x1 = np.random.normal(size=100)\n", "x2 = np.random.normal(size=100)\n", "x3 = x1 + x2\n", "X = np.c_[x1, 
x2, x3]" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "PCA(copy=True, n_components=None, whiten=False)" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# principal component analysis, keeping every component for now\n", "from sklearn import decomposition\n", "pca = decomposition.PCA().fit(X)\n", "pca" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 2.77625101e+00, 9.03048616e-01, 3.02456658e-31])" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca.explained_variance_" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(100L, 2L)" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# only the first 2 components are useful (x3 is just x1 + x2), so keep 2 and refit\n", "pca.n_components = 2\n", "X_reduced = pca.fit_transform(X)\n", "X_reduced.shape" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# generate more sample data: two source signals sampled at 2000 points\n", "time = np.linspace(0, 10, 2000)\n", "# signal 1 : sinusoidal signal\n", "s1 = np.sin(2 * time)\n", "# signal 2 : square signal\n", "s2 = np.sign(np.sin(3 * time))\n", "S = np.c_[s1, s2]\n", "# add noise\n", "S += 0.2 * np.random.normal(size=S.shape)\n", "# standardize data\n", "S /= S.std(axis=0)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# mix data: observations are the sources multiplied by the mixing matrix\n", "A = np.array([[1, 1], [0.5, 2]])\n", "X = S.dot(A.T)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 43, 