{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Pipelines In SkLearn" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import load_iris\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.decomposition import PCA\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.ensemble import RandomForestClassifier" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "iris_df=load_iris()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[5.1, 3.5, 1.4, 0.2],\n", " [4.9, 3. , 1.4, 0.2],\n", " [4.7, 3.2, 1.3, 0.2],\n", " [4.6, 3.1, 1.5, 0.2],\n", " [5. , 3.6, 1.4, 0.2],\n", " [5.4, 3.9, 1.7, 0.4],\n", " [4.6, 3.4, 1.4, 0.3],\n", " [5. , 3.4, 1.5, 0.2],\n", " [4.4, 2.9, 1.4, 0.2],\n", " [4.9, 3.1, 1.5, 0.1],\n", " [5.4, 3.7, 1.5, 0.2],\n", " [4.8, 3.4, 1.6, 0.2],\n", " [4.8, 3. , 1.4, 0.1],\n", " [4.3, 3. , 1.1, 0.1],\n", " [5.8, 4. , 1.2, 0.2],\n", " [5.7, 4.4, 1.5, 0.4],\n", " [5.4, 3.9, 1.3, 0.4],\n", " [5.1, 3.5, 1.4, 0.3],\n", " [5.7, 3.8, 1.7, 0.3],\n", " [5.1, 3.8, 1.5, 0.3],\n", " [5.4, 3.4, 1.7, 0.2],\n", " [5.1, 3.7, 1.5, 0.4],\n", " [4.6, 3.6, 1. , 0.2],\n", " [5.1, 3.3, 1.7, 0.5],\n", " [4.8, 3.4, 1.9, 0.2],\n", " [5. , 3. , 1.6, 0.2],\n", " [5. , 3.4, 1.6, 0.4],\n", " [5.2, 3.5, 1.5, 0.2],\n", " [5.2, 3.4, 1.4, 0.2],\n", " [4.7, 3.2, 1.6, 0.2],\n", " [4.8, 3.1, 1.6, 0.2],\n", " [5.4, 3.4, 1.5, 0.4],\n", " [5.2, 4.1, 1.5, 0.1],\n", " [5.5, 4.2, 1.4, 0.2],\n", " [4.9, 3.1, 1.5, 0.2],\n", " [5. , 3.2, 1.2, 0.2],\n", " [5.5, 3.5, 1.3, 0.2],\n", " [4.9, 3.6, 1.4, 0.1],\n", " [4.4, 3. , 1.3, 0.2],\n", " [5.1, 3.4, 1.5, 0.2],\n", " [5. , 3.5, 1.3, 0.3],\n", " [4.5, 2.3, 1.3, 0.3],\n", " [4.4, 3.2, 1.3, 0.2],\n", " [5. , 3.5, 1.6, 0.6],\n", " [5.1, 3.8, 1.9, 0.4],\n", " [4.8, 3. , 1.4, 0.3],\n", " [5.1, 3.8, 1.6, 0.2],\n", " [4.6, 3.2, 1.4, 0.2],\n", " [5.3, 3.7, 1.5, 0.2],\n", " [5. , 3.3, 1.4, 0.2],\n", " [7. , 3.2, 4.7, 1.4],\n", " [6.4, 3.2, 4.5, 1.5],\n", " [6.9, 3.1, 4.9, 1.5],\n", " [5.5, 2.3, 4. , 1.3],\n", " [6.5, 2.8, 4.6, 1.5],\n", " [5.7, 2.8, 4.5, 1.3],\n", " [6.3, 3.3, 4.7, 1.6],\n", " [4.9, 2.4, 3.3, 1. ],\n", " [6.6, 2.9, 4.6, 1.3],\n", " [5.2, 2.7, 3.9, 1.4],\n", " [5. , 2. , 3.5, 1. ],\n", " [5.9, 3. , 4.2, 1.5],\n", " [6. , 2.2, 4. , 1. ],\n", " [6.1, 2.9, 4.7, 1.4],\n", " [5.6, 2.9, 3.6, 1.3],\n", " [6.7, 3.1, 4.4, 1.4],\n", " [5.6, 3. , 4.5, 1.5],\n", " [5.8, 2.7, 4.1, 1. ],\n", " [6.2, 2.2, 4.5, 1.5],\n", " [5.6, 2.5, 3.9, 1.1],\n", " [5.9, 3.2, 4.8, 1.8],\n", " [6.1, 2.8, 4. , 1.3],\n", " [6.3, 2.5, 4.9, 1.5],\n", " [6.1, 2.8, 4.7, 1.2],\n", " [6.4, 2.9, 4.3, 1.3],\n", " [6.6, 3. , 4.4, 1.4],\n", " [6.8, 2.8, 4.8, 1.4],\n", " [6.7, 3. , 5. , 1.7],\n", " [6. , 2.9, 4.5, 1.5],\n", " [5.7, 2.6, 3.5, 1. ],\n", " [5.5, 2.4, 3.8, 1.1],\n", " [5.5, 2.4, 3.7, 1. ],\n", " [5.8, 2.7, 3.9, 1.2],\n", " [6. , 2.7, 5.1, 1.6],\n", " [5.4, 3. , 4.5, 1.5],\n", " [6. , 3.4, 4.5, 1.6],\n", " [6.7, 3.1, 4.7, 1.5],\n", " [6.3, 2.3, 4.4, 1.3],\n", " [5.6, 3. , 4.1, 1.3],\n", " [5.5, 2.5, 4. , 1.3],\n", " [5.5, 2.6, 4.4, 1.2],\n", " [6.1, 3. , 4.6, 1.4],\n", " [5.8, 2.6, 4. , 1.2],\n", " [5. , 2.3, 3.3, 1. ],\n", " [5.6, 2.7, 4.2, 1.3],\n", " [5.7, 3. , 4.2, 1.2],\n", " [5.7, 2.9, 4.2, 1.3],\n", " [6.2, 2.9, 4.3, 1.3],\n", " [5.1, 2.5, 3. , 1.1],\n", " [5.7, 2.8, 4.1, 1.3],\n", " [6.3, 3.3, 6. , 2.5],\n", " [5.8, 2.7, 5.1, 1.9],\n", " [7.1, 3. , 5.9, 2.1],\n", " [6.3, 2.9, 5.6, 1.8],\n", " [6.5, 3. , 5.8, 2.2],\n", " [7.6, 3. , 6.6, 2.1],\n", " [4.9, 2.5, 4.5, 1.7],\n", " [7.3, 2.9, 6.3, 1.8],\n", " [6.7, 2.5, 5.8, 1.8],\n", " [7.2, 3.6, 6.1, 2.5],\n", " [6.5, 3.2, 5.1, 2. ],\n", " [6.4, 2.7, 5.3, 1.9],\n", " [6.8, 3. , 5.5, 2.1],\n", " [5.7, 2.5, 5. , 2. ],\n", " [5.8, 2.8, 5.1, 2.4],\n", " [6.4, 3.2, 5.3, 2.3],\n", " [6.5, 3. , 5.5, 1.8],\n", " [7.7, 3.8, 6.7, 2.2],\n", " [7.7, 2.6, 6.9, 2.3],\n", " [6. , 2.2, 5. , 1.5],\n", " [6.9, 3.2, 5.7, 2.3],\n", " [5.6, 2.8, 4.9, 2. ],\n", " [7.7, 2.8, 6.7, 2. ],\n", " [6.3, 2.7, 4.9, 1.8],\n", " [6.7, 3.3, 5.7, 2.1],\n", " [7.2, 3.2, 6. , 1.8],\n", " [6.2, 2.8, 4.8, 1.8],\n", " [6.1, 3. , 4.9, 1.8],\n", " [6.4, 2.8, 5.6, 2.1],\n", " [7.2, 3. , 5.8, 1.6],\n", " [7.4, 2.8, 6.1, 1.9],\n", " [7.9, 3.8, 6.4, 2. ],\n", " [6.4, 2.8, 5.6, 2.2],\n", " [6.3, 2.8, 5.1, 1.5],\n", " [6.1, 2.6, 5.6, 1.4],\n", " [7.7, 3. , 6.1, 2.3],\n", " [6.3, 3.4, 5.6, 2.4],\n", " [6.4, 3.1, 5.5, 1.8],\n", " [6. , 3. , 4.8, 1.8],\n", " [6.9, 3.1, 5.4, 2.1],\n", " [6.7, 3.1, 5.6, 2.4],\n", " [6.9, 3.1, 5.1, 2.3],\n", " [5.8, 2.7, 5.1, 1.9],\n", " [6.8, 3.2, 5.9, 2.3],\n", " [6.7, 3.3, 5.7, 2.5],\n", " [6.7, 3. , 5.2, 2.3],\n", " [6.3, 2.5, 5. , 1.9],\n", " [6.5, 3. , 5.2, 2. ],\n", " [6.2, 3.4, 5.4, 2.3],\n", " [5.9, 3. , 5.1, 1.8]])" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris_df.data" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "X_train,X_test,y_train,y_test=train_test_split(iris_df.data,iris_df.target,test_size=0.3,random_state=0)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "## Pipelines Creation\n", "## 1. Data Preprocessing by using Standard Scaler\n", "## 2. Reduce Dimension using PCA\n", "## 3. Apply Classifier" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "pipeline_lr=Pipeline([('scalar1',StandardScaler()),\n", " ('pca1',PCA(n_components=2)),\n", " ('lr_classifier',LogisticRegression(random_state=0))])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "pipeline_dt=Pipeline([('scalar2',StandardScaler()),\n", " ('pca2',PCA(n_components=2)),\n", " ('dt_classifier',DecisionTreeClassifier())])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "pipeline_randomforest=Pipeline([('scalar3',StandardScaler()),\n", " ('pca3',PCA(n_components=2)),\n", " ('rf_classifier',RandomForestClassifier())])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "## LEts make the list of pipelines\n", "pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "best_accuracy=0.0\n", "best_classifier=0\n", "best_pipeline=\"\"" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\krish.naik\\AppData\\Local\\Continuum\\anaconda3\\envs\\myenv\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", " FutureWarning)\n", "C:\\Users\\krish.naik\\AppData\\Local\\Continuum\\anaconda3\\envs\\myenv\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n", " \"this warning.\", FutureWarning)\n", "C:\\Users\\krish.naik\\AppData\\Local\\Continuum\\anaconda3\\envs\\myenv\\lib\\site-packages\\sklearn\\ensemble\\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" ] } ], "source": [ "# Dictionary of pipelines and classifier types for ease of reference\n", "pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}\n", "\n", "# Fit the pipelines\n", "for pipe in pipelines:\n", "\tpipe.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Logistic Regression Test Accuracy: 0.8666666666666667\n", "Decision Tree Test Accuracy: 0.9111111111111111\n", "RandomForest Test Accuracy: 0.9111111111111111\n" ] } ], "source": [ "for i,model in enumerate(pipelines):\n", " print(\"{} Test Accuracy: {}\".format(pipe_dict[i],model.score(X_test,y_test)))" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Classifier with best accuracy:Decision Tree\n" ] } ], "source": [ "for i,model in enumerate(pipelines):\n", " if model.score(X_test,y_test)>best_accuracy:\n", " best_accuracy=model.score(X_test,y_test)\n", " best_pipeline=model\n", " best_classifier=i\n", "print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pipelines Perform Hyperparameter Tuning Using Grid SearchCV" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "\n", "from sklearn.model_selection import GridSearchCV" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\krish.naik\\AppData\\Local\\Continuum\\anaconda3\\envs\\myenv\\lib\\site-packages\\sklearn\\model_selection\\_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n", " DeprecationWarning)\n", "C:\\Users\\krish.naik\\AppData\\Local\\Continuum\\anaconda3\\envs\\myenv\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", " FutureWarning)\n", "C:\\Users\\krish.naik\\AppData\\Local\\Continuum\\anaconda3\\envs\\myenv\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n", " \"this warning.\", FutureWarning)\n" ] } ], "source": [ "# Create a pipeline\n", "pipe = Pipeline([(\"classifier\", RandomForestClassifier())])\n", "# Create dictionary with candidate learning algorithms and their hyperparameters\n", "grid_param = [\n", " {\"classifier\": [LogisticRegression()],\n", " \"classifier__penalty\": ['l2','l1'],\n", " \"classifier__C\": np.logspace(0, 4, 10)\n", " },\n", " {\"classifier\": [LogisticRegression()],\n", " \"classifier__penalty\": ['l2'],\n", " \"classifier__C\": np.logspace(0, 4, 10),\n", " \"classifier__solver\":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty\n", " },\n", " {\"classifier\": [RandomForestClassifier()],\n", " \"classifier__n_estimators\": [10, 100, 1000],\n", " \"classifier__max_depth\":[5,8,15,25,30,None],\n", " \"classifier__min_samples_leaf\":[1,2,5,10,15,100],\n", " \"classifier__max_leaf_nodes\": [2, 5,10]}]\n", "# create a gridsearch of the pipeline, the fit the best model\n", "gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search\n", "best_model = gridsearch.fit(X_train,y_train)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline(memory=None,\n", " steps=[('classifier', LogisticRegression(C=59.94842503189409, class_weight=None, dual=False,\n", " fit_intercept=True, intercept_scaling=1, max_iter=100,\n", " multi_class='warn', n_jobs=None, penalty='l2', random_state=None,\n", " solver='warn', tol=0.0001, verbose=0, warm_start=False))])\n", "The mean accuracy of the model is: 0.9555555555555556\n" ] } ], "source": [ "print(best_model.best_estimator_)\n", "print(\"The mean accuracy of the model is:\",best_model.score(X_test,y_test))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## MakePipelines In SKLearn" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import make_pipeline" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\krish.naik\\AppData\\Local\\Continuum\\anaconda3\\envs\\myenv\\lib\\site-packages\\sklearn\\model_selection\\_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n", " DeprecationWarning)\n" ] } ], "source": [ "# Create a pipeline\n", "pipe = make_pipeline((RandomForestClassifier()))\n", "# Create dictionary with candidate learning algorithms and their hyperparameters\n", "grid_param = [\n", " {\"randomforestclassifier\": [RandomForestClassifier()],\n", " \"randomforestclassifier__n_estimators\": [10, 100, 1000],\n", " \"randomforestclassifier__max_depth\":[5,8,15,25,30,None],\n", " \"randomforestclassifier__min_samples_leaf\":[1,2,5,10,15,100],\n", " \"randomforestclassifier__max_leaf_nodes\": [2, 5,10]}]\n", "# create a gridsearch of the pipeline, the fit the best model\n", "gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search\n", "best_model = gridsearch.fit(X_train,y_train)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9555555555555556" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "best_model.score(X_test,y_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }