{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sklearn\n",
"import pandas as pd\n",
"import numpy as np\n",
"import scipy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('0.23.1', '1.0.5', '1.19.0', '1.5.1')"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sklearn.__version__, pd.__version__, np.__version__, scipy.__version__"
]
},
{
"cell_type": "markdown",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"## Custom dataframe transformer"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline\n",
"\n",
"class DataframeFunctionTransformer():\n",
" def __init__(self, func):\n",
" self.func = func\n",
"\n",
" def transform(self, input_df, **transform_params):\n",
" return self.func(input_df)\n",
"\n",
" def fit(self, X, y=None, **fit_params):\n",
" return self"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def process_dataframe(input_df):\n",
" \n",
" input_df[\"text\"] = input_df[\"text\"].map(lambda t: t.upper())\n",
" \n",
" return input_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\n",
" \"id\":[1,2,3,4],\n",
" \"text\":[\"foo\",\"Bar\",\"BAz\",\"quux\"]\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" text | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" foo | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Bar | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" BAz | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" quux | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id text\n",
"0 1 foo\n",
"1 2 Bar\n",
"2 3 BAz\n",
"3 4 quux"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"pipeline = Pipeline([\n",
" (\"lowercase\", DataframeFunctionTransformer(process_dataframe))\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" text | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" FOO | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" BAR | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" BAZ | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" QUUX | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id text\n",
"0 1 FOO\n",
"1 2 BAR\n",
"2 3 BAZ\n",
"3 4 QUUX"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline.fit_transform(df)"
]
},
{
"cell_type": "markdown",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"## sparse to dense matrix"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA\n",
"\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.base import TransformerMixin,BaseEstimator\n",
"\n",
"from sklearn.pipeline import Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"data = scipy.sparse.csr_matrix([\n",
" [1.,0.,0.,0.,0.,0.],\n",
" [0.,1.,0.,0.,0.,0.],\n",
" [1.,0.,0.,0.,0.,0.],\n",
" [0.,0.,0.,0.,1.,0.],\n",
" [0.,0.,0.,1.,0.,0.],\n",
" [1.,0.,0.,0.,0.,0.],\n",
" [1.,1.,0.,0.,0.,0.],\n",
" [1.,1.,0.,0.,0.,0.],\n",
"])\n",
"\n",
"target = np.array([1,1,1,0,0,0,1,1])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 1, 1, 0, 0, 1, 1, 1])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class ToDenseTransformer():\n",
" \n",
" # here you define the operation it should perform\n",
" def transform(self, X, y=None, **fit_params):\n",
" return X.todense()\n",
"\n",
" # just return self\n",
" def fit(self, X, y=None, **fit_params):\n",
" return self\n",
"\n",
"# need to make matrices dense because PCA does not work with sparse vectors.\n",
"pipeline = Pipeline([\n",
" ('to_dense',ToDenseTransformer()),\n",
" ('pca',PCA()),\n",
" ('clf',DecisionTreeClassifier())\n",
"])\n",
"\n",
"pipeline.fit(data,target)\n",
"pipeline.predict(data)"
]
},
{
"cell_type": "markdown",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"## missing imputation"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" alice | \n",
" 24.0 | \n",
"
\n",
" \n",
" 1 | \n",
" bob | \n",
" 32.0 | \n",
"
\n",
" \n",
" 2 | \n",
" charlie | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" david | \n",
" 38.0 | \n",
"
\n",
" \n",
" 4 | \n",
" edward | \n",
" 20.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age\n",
"0 alice 24.0\n",
"1 bob 32.0\n",
"2 charlie NaN\n",
"3 david 38.0\n",
"4 edward 20.0"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" 'name':['alice','bob','charlie','david','edward'],\n",
" 'age':[24,32,np.nan,38,20]\n",
"})\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"transformer_step = ColumnTransformer([\n",
" ('impute_mean', SimpleImputer(strategy='mean'), ['age'])\n",
" ], remainder='passthrough')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"pipe = Pipeline([\n",
" ('transformer', transformer_step)\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" alice | \n",
" 24 | \n",
"
\n",
" \n",
" 1 | \n",
" bob | \n",
" 32 | \n",
"
\n",
" \n",
" 2 | \n",
" charlie | \n",
" 28.5 | \n",
"
\n",
" \n",
" 3 | \n",
" david | \n",
" 38 | \n",
"
\n",
" \n",
" 4 | \n",
" edward | \n",
" 20 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age\n",
"0 alice 24\n",
"1 bob 32\n",
"2 charlie 28.5\n",
"3 david 38\n",
"4 edward 20"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipe.fit(df)\n",
"\n",
"pd.DataFrame(\n",
" data=pipe.transform(df),\n",
" columns=['age', 'name']\n",
")[[\"name\",\"age\"]]"
]
},
{
"cell_type": "markdown",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"## ColumnTransformer with OneHotEncoder"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.tree import DecisionTreeClassifier"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\n",
" 'favorite_color':['blue','green','red','green','blue'],\n",
" 'age': [10,15,10,np.nan,10],\n",
" 'target':[1,0,1,0,1]\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" favorite_color | \n",
" age | \n",
" target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" blue | \n",
" 10.0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" green | \n",
" 15.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" red | \n",
" 10.0 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" green | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" blue | \n",
" 10.0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" favorite_color age target\n",
"0 blue 10.0 1\n",
"1 green 15.0 0\n",
"2 red 10.0 1\n",
"3 green NaN 0\n",
"4 blue 10.0 1"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(steps=[('preprocess',\n",
" ColumnTransformer(transformers=[('categorical_preprocessing',\n",
" Pipeline(steps=[('ohe',\n",
" OneHotEncoder())]),\n",
" ['favorite_color']),\n",
" ('numerical_preprocessing',\n",
" Pipeline(steps=[('imputation',\n",
" SimpleImputer())]),\n",
" ['age'])])),\n",
" ('clf', DecisionTreeClassifier())])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# define individual transformers in a pipeline\n",
"categorical_preprocessing = Pipeline([('ohe', OneHotEncoder())])\n",
"numerical_preprocessing = Pipeline([('imputation', SimpleImputer())])\n",
"\n",
"# define which transformer applies to which columns\n",
"preprocess = ColumnTransformer([\n",
" ('categorical_preprocessing', categorical_preprocessing, ['favorite_color']),\n",
" ('numerical_preprocessing', numerical_preprocessing, ['age'])\n",
"])\n",
"\n",
"# create the final pipeline with preprocessing steps and \n",
"# the final classifier step\n",
"pipeline = Pipeline([\n",
" ('preprocess', preprocess),\n",
" ('clf', DecisionTreeClassifier())\n",
"])\n",
"\n",
"# now fit the pipeline using the whole dataframe\n",
"df_features = df[['favorite_color','age']]\n",
"df_target = df['target']\n",
"\n",
"pipeline.fit(df_features, df_target)"
]
},
{
"cell_type": "markdown",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"## Select columns with Custom Transformer"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from sklearn.base import TransformerMixin,BaseEstimator\n",
"from sklearn.pipeline import Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"class SelectColumnsTransfomer():\n",
" def __init__(self, columns=None):\n",
" self.columns = columns\n",
"\n",
" def transform(self, X, **transform_params):\n",
" cpy_df = X[self.columns].copy()\n",
" return cpy_df\n",
"\n",
" def fit(self, X, y=None, **fit_params):\n",
" return self"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" alice | \n",
" 24.0 | \n",
"
\n",
" \n",
" 1 | \n",
" bob | \n",
" 32.0 | \n",
"
\n",
" \n",
" 2 | \n",
" charlie | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" david | \n",
" 38.0 | \n",
"
\n",
" \n",
" 4 | \n",
" edward | \n",
" 20.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age\n",
"0 alice 24.0\n",
"1 bob 32.0\n",
"2 charlie NaN\n",
"3 david 38.0\n",
"4 edward 20.0"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" 'name':['alice','bob','charlie','david','edward'],\n",
" 'age':[24,32,np.nan,38,20]\n",
"})\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"pipe = Pipeline([\n",
" ('selector', SelectColumnsTransfomer([\"name\"]))\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" alice | \n",
"
\n",
" \n",
" 1 | \n",
" bob | \n",
"
\n",
" \n",
" 2 | \n",
" charlie | \n",
"
\n",
" \n",
" 3 | \n",
" david | \n",
"
\n",
" \n",
" 4 | \n",
" edward | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name\n",
"0 alice\n",
"1 bob\n",
"2 charlie\n",
"3 david\n",
"4 edward"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipe.fit_transform(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Function Transformer with Parameters"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from nltk.stem import RSLPStemmer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.preprocessing import FunctionTransformer\n",
"\n",
"def stem_str(input_series, stemmer):\n",
" \n",
" def stem(input_str):\n",
" return \" \".join([stemmer.stem(t) for t in input_str.split(\" \")]).strip()\n",
" \n",
" return input_series.apply(stem)\n",
"\n",
"pipeline = Pipeline([\n",
" ('stemmer', FunctionTransformer(\n",
" func=stem_str, \n",
" kw_args={'stemmer': RSLPStemmer()})),\n",
" ('vect', TfidfVectorizer()),\n",
" ('clf', LogisticRegression())\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\n",
" 'text':[\n",
" 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',\n",
" 'Sed accumsan congue enim non pretium.',\n",
" 'In hac habitasse platea dictumst.',\n",
" 'Sed tincidunt ipsum nec urna vulputate luctus.'\n",
" ],\n",
" 'target':[0, 1, 0, 1]\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Lorem ipsum dolor sit amet, consectetur adipis... | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" Sed accumsan congue enim non pretium. | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" In hac habitasse platea dictumst. | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" Sed tincidunt ipsum nec urna vulputate luctus. | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text target\n",
"0 Lorem ipsum dolor sit amet, consectetur adipis... 0\n",
"1 Sed accumsan congue enim non pretium. 1\n",
"2 In hac habitasse platea dictumst. 0\n",
"3 Sed tincidunt ipsum nec urna vulputate luctus. 1"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(steps=[('stemmer',\n",
" FunctionTransformer(func=,\n",
" kw_args={'stemmer': })),\n",
" ('vect', TfidfVectorizer()), ('clf', LogisticRegression())])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline.fit(df['text'],df['target'])"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 0, 1])"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline.predict(df['text'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"toc-autonumbering": false,
"toc-showcode": false,
"toc-showmarkdowntxt": false
},
"nbformat": 4,
"nbformat_minor": 4
}