{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cheat sheet\n",
"\n",
"## Table of contents\n",
"\n",
"- Pipeline\n",
"- FeatureUnion\n",
"- Custom transformer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pipeline\n",
"- Create"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline\n",
"from sklearn.dummy import DummyClassifier\n",
"\n",
"# every step is a ('estimator_name', estimator_object) tuple\n",
"steps = [\n",
"    ('pred', DummyClassifier()),\n",
"]\n",
"pipe = Pipeline(steps=steps)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- fit"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_iris\n",
"\n",
"# return_X_y=True yields a (data, target) tuple\n",
"data, target = load_iris(return_X_y=True)\n",
"\n",
"# fit() returns the fitted pipeline, so the call itself is the cell output\n",
"pipe.fit(X=data, y=target)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- predict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# slicing (rather than indexing) keeps the single sample two-dimensional\n",
"last_sample = data[-1:]\n",
"pipe.predict(last_sample)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- get parameters of pipeline steps"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# deep=True is the default: parameters of the nested steps are listed too\n",
"pipe.get_params(deep=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- set parameters of pipeline steps"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# keyword form: stepname__parametername=newvalue\n",
"pipe.set_params(pred__random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# dict form: keys are 'stepname__parametername', values are the new values\n",
"newvalues = {\n",
"    'pred__random_state': 42,\n",
"}\n",
"# the ** operator unpacks the dict into keyword arguments\n",
"pipe.set_params(**newvalues)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## FeatureUnion\n",
"- Create"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.pipeline import FeatureUnion\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.feature_selection import SelectKBest\n",
"\n",
"# each entry is a ('transformer_name', transformer_object) tuple\n",
"transformers = [\n",
"    ('pca', PCA(n_components=2)),\n",
"    ('univ_select', SelectKBest(k=1)),\n",
"]\n",
"union = FeatureUnion(transformer_list=transformers)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- fit and transform"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[-2.68420713, 0.32660731, 1.4 ],\n",
" [-2.71539062, -0.16955685, 1.4 ],\n",
" [-2.88981954, -0.13734561, 1.3 ],\n",
" [-2.7464372 , -0.31112432, 1.5 ],\n",
" [-2.72859298, 0.33392456, 1.4 ]])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.datasets import load_iris\n",
"\n",
"data, target = load_iris(return_X_y=True)\n",
"\n",
"# fit every transformer of the union and stack their outputs side by side\n",
"transformed = union.fit_transform(X=data, y=target)\n",
"# show only the first five rows\n",
"transformed[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Custom transformer\n",
"- Define"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"\n",
"class ItemSelector(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Stateless transformer that returns ``X[keys]``.\n",
"\n",
"    Typical use: pick one column out of a pandas DataFrame.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    keys : object\n",
"        Whatever ``X`` accepts inside square brackets; it is passed\n",
"        unchanged to ``X[keys]`` in ``transform``.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, keys):\n",
"        # stored under the same name as the argument so that the\n",
"        # get_params / set_params machinery of BaseEstimator can find it\n",
"        self.keys = keys\n",
"\n",
"    def fit(self, X, y=None):\n",
"        \"\"\"Learn nothing from the data and return the selector itself.\"\"\"\n",
"        return self\n",
"\n",
"    def transform(self, X, y=None):\n",
"        \"\"\"Return ``X[self.keys]``; ``y`` is ignored.\"\"\"\n",
"        return X[self.keys]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Create"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# selector for the item stored under the key 'A'\n",
"sel = ItemSelector(keys='A')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- fit and transform"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1\n",
"1 2\n",
"2 3\n",
"3 4\n",
"4 5\n",
"Name: A, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# small demo frame, built column by column\n",
"df = pd.DataFrame({\n",
"    'A': [1, 2, 3, 4, 5],\n",
"    'B': [2, 4, 8, 16, 32],\n",
"})\n",
"\n",
"# fit_transform comes from TransformerMixin: fit, then transform\n",
"sel.fit_transform(df)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}