{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "[![Open in Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/justmarkham/scikit-learn-tips/master?filepath=notebooks%2F37_pipeline_diagram.ipynb)\n", "\n", "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/justmarkham/scikit-learn-tips/blob/master/notebooks/37_pipeline_diagram.ipynb)\n", "\n", "# 🤖⚡ scikit-learn tip #37 ([video](https://www.youtube.com/watch?v=_UKYxucD1Io&list=PL5-da3qGB5ID7YYAqireYEew2mWVvgmj6&index=37))\n", "\n", "New in version 0.23: Create interactive diagrams of Pipelines (and other estimators) in Jupyter!\n", "\n", "Click on any element to see more details. You can even export the diagram to an HTML file!\n", "\n", "See example 👇" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.feature_selection import SelectPercentile, chi2\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.compose import make_column_transformer\n", "from sklearn.pipeline import make_pipeline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# fetch the training CSV over HTTPS rather than plain HTTP\n", "df = pd.read_csv('https://bit.ly/kaggletrain')\n", "# X: the columns the ColumnTransformer below refers to; y: the 'Survived' column\n", "X = df[['Parch', 'Fare', 'Embarked', 'Sex', 'Name', 'Age']]\n", "y = df['Survived']" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# constant-value imputer and one-hot encoder (chained together in the next cell)\n", "imp_constant = SimpleImputer(strategy='constant')\n", "ohe = OneHotEncoder()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# imp_ohe: impute first, then one-hot encode\n", "imp_ohe = make_pipeline(imp_constant, ohe)\n", "# vect is applied to 'Name' and imp to 'Age'/'Fare' in the ColumnTransformer below\n", "vect = CountVectorizer()\n", "imp = SimpleImputer()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, 
"outputs": [], "source": [ "# pipeline step 1: apply a different transformer to each group of columns\n", "ct = make_column_transformer(\n", "    (imp_ohe, ['Embarked', 'Sex']),\n", "    (vect, 'Name'),  # a string (not a list) so CountVectorizer receives a 1-D column\n", "    (imp, ['Age', 'Fare']),\n", "    ('passthrough', ['Parch']))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# pipeline step 2: keep the top 50% of features ranked by chi2 score\n", "selection = SelectPercentile(chi2, percentile=50)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# pipeline step 3: the final estimator\n", "logreg = LogisticRegression(solver='liblinear')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# display estimators as diagrams\n", "from sklearn import set_config\n", "set_config(display='diagram')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('columntransformer',\n",
       "                 ColumnTransformer(transformers=[('pipeline',\n",
       "                                                  Pipeline(steps=[('simpleimputer',\n",
       "                                                                   SimpleImputer(strategy='constant')),\n",
       "                                                                  ('onehotencoder',\n",
       "                                                                   OneHotEncoder())]),\n",
       "                                                  ['Embarked', 'Sex']),\n",
       "                                                 ('countvectorizer',\n",
       "                                                  CountVectorizer(), 'Name'),\n",
       "                                                 ('simpleimputer',\n",
       "                                                  SimpleImputer(),\n",
       "                                                  ['Age', 'Fare']),\n",
       "                                                 ('passthrough', 'passthrough',\n",
       "                                                  ['Parch'])])),\n",
       "                ('selectpercentile',\n",
       "                 SelectPercentile(percentile=50,\n",
       "                                  score_func=)),\n",
       "                ('logisticregression', LogisticRegression(solver='liblinear'))])
ColumnTransformer(transformers=[('pipeline',\n",
       "                                 Pipeline(steps=[('simpleimputer',\n",
       "                                                  SimpleImputer(strategy='constant')),\n",
       "                                                 ('onehotencoder',\n",
       "                                                  OneHotEncoder())]),\n",
       "                                 ['Embarked', 'Sex']),\n",
       "                                ('countvectorizer', CountVectorizer(), 'Name'),\n",
       "                                ('simpleimputer', SimpleImputer(),\n",
       "                                 ['Age', 'Fare']),\n",
       "                                ('passthrough', 'passthrough', ['Parch'])])
['Embarked', 'Sex']
SimpleImputer(strategy='constant')
OneHotEncoder()
Name
CountVectorizer()
['Age', 'Fare']
SimpleImputer()
['Parch']
passthrough
SelectPercentile(percentile=50, score_func=)
LogisticRegression(solver='liblinear')
" ], "text/plain": [ "Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(transformers=[('pipeline',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder())]),\n", " ['Embarked', 'Sex']),\n", " ('countvectorizer',\n", " CountVectorizer(), 'Name'),\n", " ('simpleimputer',\n", " SimpleImputer(),\n", " ['Age', 'Fare']),\n", " ('passthrough', 'passthrough',\n", " ['Parch'])])),\n", " ('selectpercentile',\n", " SelectPercentile(percentile=50,\n", " score_func=)),\n", " ('logisticregression', LogisticRegression(solver='liblinear'))])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# chain the three steps; the bare last expression displays the pipeline\n", "pipe = make_pipeline(ct, selection, logreg)\n", "pipe" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# export the diagram to a file\n", "from sklearn.utils import estimator_html_repr\n", "\n", "# explicit encoding: open()'s default text encoding is platform-dependent\n", "with open('pipeline.html', 'w', encoding='utf-8') as f:\n", "    f.write(estimator_html_repr(pipe))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Want more tips? [View all tips on GitHub](https://github.com/justmarkham/scikit-learn-tips) or [Sign up to receive 2 tips by email every week](https://scikit-learn.tips) 💌\n", "\n", "© 2020 [Data School](https://www.dataschool.io). All rights reserved." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.4" } }, "nbformat": 4, "nbformat_minor": 4 }