{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Cheat sheet\n",
    "\n",
    "## Table of contents:\n",
    "\n",
    "- Pipeline\n",
    "- FeatureUnion\n",
    "- Custom transformer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pipeline\n",
    "- Create"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.dummy import DummyClassifier\n",
    "\n",
    "pipe = Pipeline(steps=[\n",
    "    # ('estimator_name', estimator_object)\n",
    "    ('pred', DummyClassifier())\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_iris\n",
    "data, target = load_iris(return_X_y=True)\n",
    "\n",
    "pipe.fit(data, target)\n",
    "pipe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe.predict(data[-1:])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- get parameters of pipeline steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe.get_params()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- set parameters of pipeline steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe.set_params(\n",
    "    # stepname__parametername=newvalue\n",
    "    pred__random_state=42\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# dict with stepname__parametername - newvalue pairs\n",
    "newvalues = {'pred__random_state': 42}\n",
    "# using the keyword argument unpacking operator **\n",
    "pipe.set_params(**newvalues)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## FeatureUnion\n",
    "- Create"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import FeatureUnion\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "\n",
    "union = FeatureUnion(transformer_list=[\n",
    "    # ('transformer_name', transformer_object)\n",
    "    ('pca', PCA(n_components=2)),\n",
    "    ('univ_select', SelectKBest(k=1))\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- fit and transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-2.68420713, 0.32660731, 1.4 ],\n",
       " [-2.71539062, -0.16955685, 1.4 ],\n",
       " [-2.88981954, -0.13734561, 1.3 ],\n",
       " [-2.7464372 , -0.31112432, 1.5 ],\n",
       " [-2.72859298, 0.33392456, 1.4 ]])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.datasets import load_iris\n",
    "data, target = load_iris(return_X_y=True)\n",
    "\n",
    "transformed = union.fit_transform(data, target)\n",
    "transformed[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Custom transformer\n",
    "- Define"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "\n",
    "\n",
    "class ItemSelector(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Stateless transformer that returns ``X[keys]`` from its input.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    keys : object\n",
    "        Index forwarded unchanged to ``X[...]`` in ``transform``\n",
    "        (a DataFrame column name in the example further down).\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, keys):\n",
    "        # Stored under the same name as the argument, with no processing.\n",
    "        self.keys = keys\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Learn nothing; return ``self`` so calls can be chained.\"\"\"\n",
    "        return self\n",
    "\n",
    "    def transform(self, X, y=None):\n",
    "        \"\"\"Return ``X[self.keys]``; ``y`` is accepted but not used.\"\"\"\n",
    "        return X[self.keys]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- Create"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "sel = ItemSelector('A')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- fit and transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0 1\n",
       "1 2\n",
       "2 3\n",
       "3 4\n",
       "4 5\n",
       "Name: A, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.DataFrame([\n",
    "    {'A': 1, 'B': 2},\n",
    "    {'A': 2, 'B': 4},\n",
    "    {'A': 3, 'B': 8},\n",
    "    {'A': 4, 'B': 16},\n",
    "    {'A': 5, 'B': 32},\n",
    "])\n",
    "\n",
    "sel.fit_transform(df)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}