{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import sklearn\n", "import pandas as pd\n", "import numpy as np\n", "import scipy" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('0.23.1', '1.0.5', '1.19.0', '1.5.1')" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sklearn.__version__, pd.__version__, np.__version__, scipy.__version__" ] }, { "cell_type": "markdown", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "## Custom dataframe transformer" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", "\n", "class DataframeFunctionTransformer():\n", " def __init__(self, func):\n", " self.func = func\n", "\n", " def transform(self, input_df, **transform_params):\n", " return self.func(input_df)\n", "\n", " def fit(self, X, y=None, **fit_params):\n", " return self" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def process_dataframe(input_df):\n", " \n", " input_df[\"text\"] = input_df[\"text\"].map(lambda t: t.upper())\n", " \n", " return input_df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame({\n", " \"id\":[1,2,3,4],\n", " \"text\":[\"foo\",\"Bar\",\"BAz\",\"quux\"]\n", "})" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtext
01foo
12Bar
23BAz
34quux
\n", "
" ], "text/plain": [ " id text\n", "0 1 foo\n", "1 2 Bar\n", "2 3 BAz\n", "3 4 quux" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline([\n", " (\"lowercase\", DataframeFunctionTransformer(process_dataframe))\n", "])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtext
01FOO
12BAR
23BAZ
34QUUX
\n", "
" ], "text/plain": [ " id text\n", "0 1 FOO\n", "1 2 BAR\n", "2 3 BAZ\n", "3 4 QUUX" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline.fit_transform(df)" ] }, { "cell_type": "markdown", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "## sparse to dense matrix" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", "\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.base import TransformerMixin,BaseEstimator\n", "\n", "from sklearn.pipeline import Pipeline" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "data = scipy.sparse.csr_matrix([\n", " [1.,0.,0.,0.,0.,0.],\n", " [0.,1.,0.,0.,0.,0.],\n", " [1.,0.,0.,0.,0.,0.],\n", " [0.,0.,0.,0.,1.,0.],\n", " [0.,0.,0.,1.,0.,0.],\n", " [1.,0.,0.,0.,0.,0.],\n", " [1.,1.,0.,0.,0.,0.],\n", " [1.,1.,0.,0.,0.,0.],\n", "])\n", "\n", "target = np.array([1,1,1,0,0,0,1,1])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 1, 1, 0, 0, 1, 1, 1])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "class ToDenseTransformer():\n", " \n", " # here you define the operation it should perform\n", " def transform(self, X, y=None, **fit_params):\n", " return X.todense()\n", "\n", " # just return self\n", " def fit(self, X, y=None, **fit_params):\n", " return self\n", "\n", "# need to make matrices dense because PCA does not work with sparse vectors.\n", "pipeline = Pipeline([\n", " ('to_dense',ToDenseTransformer()),\n", " ('pca',PCA()),\n", " ('clf',DecisionTreeClassifier())\n", "])\n", "\n", "pipeline.fit(data,target)\n", "pipeline.predict(data)" ] }, { "cell_type": "markdown", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "## missing imputation" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.impute import SimpleImputer" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameage
0alice24.0
1bob32.0
2charlieNaN
3david38.0
4edward20.0
\n", "
" ], "text/plain": [ " name age\n", "0 alice 24.0\n", "1 bob 32.0\n", "2 charlie NaN\n", "3 david 38.0\n", "4 edward 20.0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " 'name':['alice','bob','charlie','david','edward'],\n", " 'age':[24,32,np.nan,38,20]\n", "})\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "transformer_step = ColumnTransformer([\n", " ('impute_mean', SimpleImputer(strategy='mean'), ['age'])\n", " ], remainder='passthrough')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "pipe = Pipeline([\n", " ('transformer', transformer_step)\n", "])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameage
0alice24
1bob32
2charlie28.5
3david38
4edward20
\n", "
" ], "text/plain": [ " name age\n", "0 alice 24\n", "1 bob 32\n", "2 charlie 28.5\n", "3 david 38\n", "4 edward 20" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe.fit(df)\n", "\n", "pd.DataFrame(\n", " data=pipe.transform(df),\n", " columns=['age', 'name']\n", ")[[\"name\",\"age\"]]" ] }, { "cell_type": "markdown", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "## ColumnTransformer with OneHotEncoder" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.tree import DecisionTreeClassifier" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame({\n", " 'favorite_color':['blue','green','red','green','blue'],\n", " 'age': [10,15,10,np.nan,10],\n", " 'target':[1,0,1,0,1]\n", "})" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
favorite_coloragetarget
0blue10.01
1green15.00
2red10.01
3greenNaN0
4blue10.01
\n", "
" ], "text/plain": [ " favorite_color age target\n", "0 blue 10.0 1\n", "1 green 15.0 0\n", "2 red 10.0 1\n", "3 green NaN 0\n", "4 blue 10.0 1" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(steps=[('preprocess',\n", " ColumnTransformer(transformers=[('categorical_preprocessing',\n", " Pipeline(steps=[('ohe',\n", " OneHotEncoder())]),\n", " ['favorite_color']),\n", " ('numerical_preprocessing',\n", " Pipeline(steps=[('imputation',\n", " SimpleImputer())]),\n", " ['age'])])),\n", " ('clf', DecisionTreeClassifier())])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# define individual transformers in a pipeline\n", "categorical_preprocessing = Pipeline([('ohe', OneHotEncoder())])\n", "numerical_preprocessing = Pipeline([('imputation', SimpleImputer())])\n", "\n", "# define which transformer applies to which columns\n", "preprocess = ColumnTransformer([\n", " ('categorical_preprocessing', categorical_preprocessing, ['favorite_color']),\n", " ('numerical_preprocessing', numerical_preprocessing, ['age'])\n", "])\n", "\n", "# create the final pipeline with preprocessing steps and \n", "# the final classifier step\n", "pipeline = Pipeline([\n", " ('preprocess', preprocess),\n", " ('clf', DecisionTreeClassifier())\n", "])\n", "\n", "# now fit the pipeline using the whole dataframe\n", "df_features = df[['favorite_color','age']]\n", "df_target = df['target']\n", "\n", "pipeline.fit(df_features, df_target)" ] }, { "cell_type": "markdown", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "## Select columns with Custom Transformer" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "from sklearn.base import TransformerMixin,BaseEstimator\n", "from sklearn.pipeline import Pipeline" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "class SelectColumnsTransfomer():\n", " def __init__(self, columns=None):\n", " self.columns = columns\n", "\n", " def transform(self, X, **transform_params):\n", " cpy_df = X[self.columns].copy()\n", " return cpy_df\n", "\n", " def fit(self, X, y=None, **fit_params):\n", " return self" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameage
0alice24.0
1bob32.0
2charlieNaN
3david38.0
4edward20.0
\n", "
" ], "text/plain": [ " name age\n", "0 alice 24.0\n", "1 bob 32.0\n", "2 charlie NaN\n", "3 david 38.0\n", "4 edward 20.0" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " 'name':['alice','bob','charlie','david','edward'],\n", " 'age':[24,32,np.nan,38,20]\n", "})\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "pipe = Pipeline([\n", " ('selector', SelectColumnsTransfomer([\"name\"]))\n", "])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name
0alice
1bob
2charlie
3david
4edward
\n", "
" ], "text/plain": [ " name\n", "0 alice\n", "1 bob\n", "2 charlie\n", "3 david\n", "4 edward" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe.fit_transform(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Function Transformer with Parameters" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "from nltk.stem import RSLPStemmer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.preprocessing import FunctionTransformer\n", "\n", "def stem_str(input_series, stemmer):\n", " \n", " def stem(input_str):\n", " return \" \".join([stemmer.stem(t) for t in input_str.split(\" \")]).strip()\n", " \n", " return input_series.apply(stem)\n", "\n", "pipeline = Pipeline([\n", " ('stemmer', FunctionTransformer(\n", " func=stem_str, \n", " kw_args={'stemmer': RSLPStemmer()})),\n", " ('vect', TfidfVectorizer()),\n", " ('clf', LogisticRegression())\n", "])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame({\n", " 'text':[\n", " 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',\n", " 'Sed accumsan congue enim non pretium.',\n", " 'In hac habitasse platea dictumst.',\n", " 'Sed tincidunt ipsum nec urna vulputate luctus.'\n", " ],\n", " 'target':[0, 1, 0, 1]\n", "})" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
texttarget
0Lorem ipsum dolor sit amet, consectetur adipis...0
1Sed accumsan congue enim non pretium.1
2In hac habitasse platea dictumst.0
3Sed tincidunt ipsum nec urna vulputate luctus.1
\n", "
" ], "text/plain": [ " text target\n", "0 Lorem ipsum dolor sit amet, consectetur adipis... 0\n", "1 Sed accumsan congue enim non pretium. 1\n", "2 In hac habitasse platea dictumst. 0\n", "3 Sed tincidunt ipsum nec urna vulputate luctus. 1" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(steps=[('stemmer',\n", " FunctionTransformer(func=,\n", " kw_args={'stemmer': })),\n", " ('vect', TfidfVectorizer()), ('clf', LogisticRegression())])" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline.fit(df['text'],df['target'])" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 1, 0, 1])" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline.predict(df['text'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "toc-autonumbering": false, "toc-showcode": false, "toc-showmarkdowntxt": false }, "nbformat": 4, "nbformat_minor": 4 }