{ "cells": [ { "cell_type": "markdown", "id": "f5afa212", "metadata": {}, "source": [ "## ML Pipeline with Sklearn" ] }, { "cell_type": "code", "execution_count": 1, "id": "29d5c7fe", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(768, 9)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of times pregnantPlasma glucose concentration a 2 hours in an oral glucose tolerance testDiastolic blood pressure (mm Hg)Triceps skin fold thickness (mm)2-Hour serum insulin (mu U/ml)Body mass index (weight in kg/(height in m)^2)Diabetes pedigree functionAge (years)Class variable
061487235033.60.627501
11856629026.60.351310
28183640023.30.672321
318966239428.10.167210
40137403516843.12.288331
..............................
76310101764818032.90.171630
76421227027036.80.340270
7655121722311226.20.245300
7661126600030.10.349471
7671937031030.40.315230
\n", "

768 rows × 9 columns

\n", "
" ], "text/plain": [ " Number of times pregnant \\\n", "0 6 \n", "1 1 \n", "2 8 \n", "3 1 \n", "4 0 \n", ".. ... \n", "763 10 \n", "764 2 \n", "765 5 \n", "766 1 \n", "767 1 \n", "\n", " Plasma glucose concentration a 2 hours in an oral glucose tolerance test \\\n", "0 148 \n", "1 85 \n", "2 183 \n", "3 89 \n", "4 137 \n", ".. ... \n", "763 101 \n", "764 122 \n", "765 121 \n", "766 126 \n", "767 93 \n", "\n", " Diastolic blood pressure (mm Hg) Triceps skin fold thickness (mm) \\\n", "0 72 35 \n", "1 66 29 \n", "2 64 0 \n", "3 66 23 \n", "4 40 35 \n", ".. ... ... \n", "763 76 48 \n", "764 70 27 \n", "765 72 23 \n", "766 60 0 \n", "767 70 31 \n", "\n", " 2-Hour serum insulin (mu U/ml) \\\n", "0 0 \n", "1 0 \n", "2 0 \n", "3 94 \n", "4 168 \n", ".. ... \n", "763 180 \n", "764 0 \n", "765 112 \n", "766 0 \n", "767 0 \n", "\n", " Body mass index (weight in kg/(height in m)^2) \\\n", "0 33.6 \n", "1 26.6 \n", "2 23.3 \n", "3 28.1 \n", "4 43.1 \n", ".. ... \n", "763 32.9 \n", "764 36.8 \n", "765 26.2 \n", "766 30.1 \n", "767 30.4 \n", "\n", " Diabetes pedigree function Age (years) Class variable \n", "0 0.627 50 1 \n", "1 0.351 31 0 \n", "2 0.672 32 1 \n", "3 0.167 21 0 \n", "4 2.288 33 1 \n", ".. ... ... ... 
\n", "763 0.171 63 0 \n", "764 0.340 27 0 \n", "765 0.245 30 0 \n", "766 0.349 47 1 \n", "767 0.315 23 0 \n", "\n", "[768 rows x 9 columns]" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load sample dataset\n", "import pandas as pd\n", "import seaborn as sns\n", "\n", "from ydata_profiling import ProfileReport\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.datasets import make_regression\n", "from sklearn.ensemble import GradientBoostingRegressor\n", "from sklearn.pipeline import make_pipeline, Pipeline\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.metrics import r2_score\n", "from sklearn.metrics import mean_absolute_error\n", "from sklearn.metrics import mean_absolute_percentage_error\n", "\n", "from yellowbrick.regressor import PredictionError\n", "\n", "df = pd.read_csv('./diabetes.csv')\n", "print(df.shape)\n", "df" ] }, { "cell_type": "code", "execution_count": 2, "id": "a2b4bf64", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of times pregnantPlasma glucose concentration a 2 hours in an oral glucose tolerance testDiastolic blood pressure (mm Hg)Triceps skin fold thickness (mm)2-Hour serum insulin (mu U/ml)Body mass index (weight in kg/(height in m)^2)Diabetes pedigree functionAge (years)Class_variable
061487235033.60.627501
11856629026.60.351310
28183640023.30.672321
318966239428.10.167210
40137403516843.12.288331
..............................
76310101764818032.90.171630
76421227027036.80.340270
7655121722311226.20.245300
7661126600030.10.349471
7671937031030.40.315230
\n", "

768 rows × 9 columns

\n", "
" ], "text/plain": [ " Number of times pregnant \\\n", "0 6 \n", "1 1 \n", "2 8 \n", "3 1 \n", "4 0 \n", ".. ... \n", "763 10 \n", "764 2 \n", "765 5 \n", "766 1 \n", "767 1 \n", "\n", " Plasma glucose concentration a 2 hours in an oral glucose tolerance test \\\n", "0 148 \n", "1 85 \n", "2 183 \n", "3 89 \n", "4 137 \n", ".. ... \n", "763 101 \n", "764 122 \n", "765 121 \n", "766 126 \n", "767 93 \n", "\n", " Diastolic blood pressure (mm Hg) Triceps skin fold thickness (mm) \\\n", "0 72 35 \n", "1 66 29 \n", "2 64 0 \n", "3 66 23 \n", "4 40 35 \n", ".. ... ... \n", "763 76 48 \n", "764 70 27 \n", "765 72 23 \n", "766 60 0 \n", "767 70 31 \n", "\n", " 2-Hour serum insulin (mu U/ml) \\\n", "0 0 \n", "1 0 \n", "2 0 \n", "3 94 \n", "4 168 \n", ".. ... \n", "763 180 \n", "764 0 \n", "765 112 \n", "766 0 \n", "767 0 \n", "\n", " Body mass index (weight in kg/(height in m)^2) \\\n", "0 33.6 \n", "1 26.6 \n", "2 23.3 \n", "3 28.1 \n", "4 43.1 \n", ".. ... \n", "763 32.9 \n", "764 36.8 \n", "765 26.2 \n", "766 30.1 \n", "767 30.4 \n", "\n", " Diabetes pedigree function Age (years) Class_variable \n", "0 0.627 50 1 \n", "1 0.351 31 0 \n", "2 0.672 32 1 \n", "3 0.167 21 0 \n", "4 2.288 33 1 \n", ".. ... ... ... 
\n", "763 0.171 63 0 \n", "764 0.340 27 0 \n", "765 0.245 30 0 \n", "766 0.349 47 1 \n", "767 0.315 23 0 \n", "\n", "[768 rows x 9 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Renaming the column Class variable\n", "\n", "df = df.rename(columns={'Class variable': 'Class_variable'})\n", "df" ] }, { "cell_type": "code", "execution_count": 3, "id": "8fedd044", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Series([], dtype: int64)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# simple check for nulls\n", "df.isna().sum()[df.isna().sum() > 0]" ] }, { "cell_type": "code", "execution_count": 9, "id": "6b6c08be", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1b693f84fe8e429a84381a2e72748c6c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Summarize dataset: 0%| | 0/5 [00:00" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "53170e4f8beb42efa6a76d25ca294fbf", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Export report to file: 0%| | 0/1 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of times pregnantPlasma glucose concentration a 2 hours in an oral glucose tolerance testDiastolic blood pressure (mm Hg)Triceps skin fold thickness (mm)2-Hour serum insulin (mu U/ml)Body mass index (weight in kg/(height in m)^2)Diabetes pedigree functionAge (years)
061487235033.60.62750
11856629026.60.35131
28183640023.30.67232
318966239428.10.16721
40137403516843.12.28833
...........................
762989620022.50.14233
76310101764818032.90.17163
7655121722311226.20.24530
7661126600030.10.34947
7671937031030.40.31523
\n", "

668 rows × 8 columns

\n", "" ], "text/plain": [ " Number of times pregnant \\\n", "0 6 \n", "1 1 \n", "2 8 \n", "3 1 \n", "4 0 \n", ".. ... \n", "762 9 \n", "763 10 \n", "765 5 \n", "766 1 \n", "767 1 \n", "\n", " Plasma glucose concentration a 2 hours in an oral glucose tolerance test \\\n", "0 148 \n", "1 85 \n", "2 183 \n", "3 89 \n", "4 137 \n", ".. ... \n", "762 89 \n", "763 101 \n", "765 121 \n", "766 126 \n", "767 93 \n", "\n", " Diastolic blood pressure (mm Hg) Triceps skin fold thickness (mm) \\\n", "0 72 35 \n", "1 66 29 \n", "2 64 0 \n", "3 66 23 \n", "4 40 35 \n", ".. ... ... \n", "762 62 0 \n", "763 76 48 \n", "765 72 23 \n", "766 60 0 \n", "767 70 31 \n", "\n", " 2-Hour serum insulin (mu U/ml) \\\n", "0 0 \n", "1 0 \n", "2 0 \n", "3 94 \n", "4 168 \n", ".. ... \n", "762 0 \n", "763 180 \n", "765 112 \n", "766 0 \n", "767 0 \n", "\n", " Body mass index (weight in kg/(height in m)^2) \\\n", "0 33.6 \n", "1 26.6 \n", "2 23.3 \n", "3 28.1 \n", "4 43.1 \n", ".. ... \n", "762 22.5 \n", "763 32.9 \n", "765 26.2 \n", "766 30.1 \n", "767 30.4 \n", "\n", " Diabetes pedigree function Age (years) \n", "0 0.627 50 \n", "1 0.351 31 \n", "2 0.672 32 \n", "3 0.167 21 \n", "4 2.288 33 \n", ".. ... ... 
\n", "762 0.142 33 \n", "763 0.171 63 \n", "765 0.245 30 \n", "766 0.349 47 \n", "767 0.315 23 \n", "\n", "[668 rows x 8 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 15, "id": "1a792824", "metadata": {}, "outputs": [], "source": [ "# split the data into training and test set\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 16, "id": "8a5d6bf1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)'] \n", " []\n" ] } ], "source": [ "# encoding \n", "# get the categorical and numeric column names\n", "num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()\n", "cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()\n", "print(num_cols, '\\n', cat_cols)" ] }, { "cell_type": "code", "execution_count": 17, "id": "40eac5c1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),\n",
       "                ('standardscaler', StandardScaler())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),\n", " ('standardscaler', StandardScaler())])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pipeline for numerical columns\n", "num_pipe = make_pipeline(\n", " SimpleImputer(strategy='median'),\n", " StandardScaler()\n", ")\n", "num_pipe" ] }, { "cell_type": "code", "execution_count": 18, "id": "01e569a6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('simpleimputer',\n",
       "                 SimpleImputer(fill_value='N/A', strategy='constant')),\n",
       "                ('onehotencoder',\n",
       "                 OneHotEncoder(handle_unknown='ignore', sparse=False))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A', strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore', sparse=False))])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pipeline for categorical columns:\n", "# fill missing categories with the constant 'N/A', then one-hot encode to a dense array\n", "# scikit-learn 1.2 renamed `sparse` to `sparse_output`; the old name was removed in 1.4\n", "cat_pipe = make_pipeline(\n", " SimpleImputer(strategy='constant', fill_value='N/A'),\n", " OneHotEncoder(handle_unknown='ignore', sparse_output=False)\n", ")\n", "cat_pipe" ] }, { "cell_type": "code", "execution_count": 19, "id": "fd8ab25d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
ColumnTransformer(transformers=[('num',\n",
       "                                 Pipeline(steps=[('simpleimputer',\n",
       "                                                  SimpleImputer(strategy='median')),\n",
       "                                                 ('standardscaler',\n",
       "                                                  StandardScaler())]),\n",
       "                                 ['Number of times pregnant',\n",
       "                                  'Plasma glucose concentration a 2 hours in '\n",
       "                                  'an oral glucose tolerance test',\n",
       "                                  'Diastolic blood pressure (mm Hg)',\n",
       "                                  'Triceps skin fold thickness (mm)',\n",
       "                                  '2-Hour serum insulin (mu U/ml)',\n",
       "                                  'Body mass index (weight in kg/(height in '\n",
       "                                  'm)^2)',\n",
       "                                  'Diabetes pedigree function',\n",
       "                                  'Age (years)']),\n",
       "                                ('cat',\n",
       "                                 Pipeline(steps=[('simpleimputer',\n",
       "                                                  SimpleImputer(fill_value='N/A',\n",
       "                                                                strategy='constant')),\n",
       "                                                 ('onehotencoder',\n",
       "                                                  OneHotEncoder(handle_unknown='ignore',\n",
       "                                                                sparse=False))]),\n",
       "                                 [])])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['Number of times pregnant',\n", " 'Plasma glucose concentration a 2 hours in '\n", " 'an oral glucose tolerance test',\n", " 'Diastolic blood pressure (mm Hg)',\n", " 'Triceps skin fold thickness (mm)',\n", " '2-Hour serum insulin (mu U/ml)',\n", " 'Body mass index (weight in kg/(height in '\n", " 'm)^2)',\n", " 'Diabetes pedigree function',\n", " 'Age (years)']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A',\n", " strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse=False))]),\n", " [])])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# combine both the pipelines\n", "full_pipe = ColumnTransformer([\n", " ('num', num_pipe, num_cols),\n", " ('cat', cat_pipe, cat_cols)\n", "])\n", "full_pipe" ] }, { "cell_type": "code", "execution_count": 20, "id": "04f066f5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('columntransformer',\n",
       "                 ColumnTransformer(transformers=[('num',\n",
       "                                                  Pipeline(steps=[('simpleimputer',\n",
       "                                                                   SimpleImputer(strategy='median')),\n",
       "                                                                  ('standardscaler',\n",
       "                                                                   StandardScaler())]),\n",
       "                                                  ['Number of times pregnant',\n",
       "                                                   'Plasma glucose '\n",
       "                                                   'concentration a 2 hours in '\n",
       "                                                   'an oral glucose tolerance '\n",
       "                                                   'test',\n",
       "                                                   'Diastolic blood pressure '\n",
       "                                                   '(mm Hg)',\n",
       "                                                   'Triceps skin fold '\n",
       "                                                   'thickness (mm)',\n",
       "                                                   '2-Hour s...nsulin (mu '\n",
       "                                                   'U/ml)',\n",
       "                                                   'Body mass index (weight in '\n",
       "                                                   'kg/(height in m)^2)',\n",
       "                                                   'Diabetes pedigree function',\n",
       "                                                   'Age (years)']),\n",
       "                                                 ('cat',\n",
       "                                                  Pipeline(steps=[('simpleimputer',\n",
       "                                                                   SimpleImputer(fill_value='N/A',\n",
       "                                                                                 strategy='constant')),\n",
       "                                                                  ('onehotencoder',\n",
       "                                                                   OneHotEncoder(handle_unknown='ignore',\n",
       "                                                                                 sparse=False))]),\n",
       "                                                  [])])),\n",
       "                ('gradientboostingregressor',\n",
       "                 GradientBoostingRegressor(random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['Number of times pregnant',\n", " 'Plasma glucose '\n", " 'concentration a 2 hours in '\n", " 'an oral glucose tolerance '\n", " 'test',\n", " 'Diastolic blood pressure '\n", " '(mm Hg)',\n", " 'Triceps skin fold '\n", " 'thickness (mm)',\n", " '2-Hour s...nsulin (mu '\n", " 'U/ml)',\n", " 'Body mass index (weight in '\n", " 'kg/(height in m)^2)',\n", " 'Diabetes pedigree function',\n", " 'Age (years)']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A',\n", " strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse=False))]),\n", " [])])),\n", " ('gradientboostingregressor',\n", " GradientBoostingRegressor(random_state=42))])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# build the model\n", "gbr_diabetes = make_pipeline(full_pipe, GradientBoostingRegressor(random_state=42))\n", "gbr_diabetes" ] }, { "cell_type": "code", "execution_count": 22, "id": "0788a4cd", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\owner\\miniconda3\\envs\\pc3\\lib\\site-packages\\sklearn\\ensemble\\_gb.py:437: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", " y = column_or_1d(y, warn=True)\n" ] }, { "data": { "text/html": [ "
Pipeline(steps=[('columntransformer',\n",
       "                 ColumnTransformer(transformers=[('num',\n",
       "                                                  Pipeline(steps=[('simpleimputer',\n",
       "                                                                   SimpleImputer(strategy='median')),\n",
       "                                                                  ('standardscaler',\n",
       "                                                                   StandardScaler())]),\n",
       "                                                  ['Number of times pregnant',\n",
       "                                                   'Plasma glucose '\n",
       "                                                   'concentration a 2 hours in '\n",
       "                                                   'an oral glucose tolerance '\n",
       "                                                   'test',\n",
       "                                                   'Diastolic blood pressure '\n",
       "                                                   '(mm Hg)',\n",
       "                                                   'Triceps skin fold '\n",
       "                                                   'thickness (mm)',\n",
       "                                                   '2-Hour s...nsulin (mu '\n",
       "                                                   'U/ml)',\n",
       "                                                   'Body mass index (weight in '\n",
       "                                                   'kg/(height in m)^2)',\n",
       "                                                   'Diabetes pedigree function',\n",
       "                                                   'Age (years)']),\n",
       "                                                 ('cat',\n",
       "                                                  Pipeline(steps=[('simpleimputer',\n",
       "                                                                   SimpleImputer(fill_value='N/A',\n",
       "                                                                                 strategy='constant')),\n",
       "                                                                  ('onehotencoder',\n",
       "                                                                   OneHotEncoder(handle_unknown='ignore',\n",
       "                                                                                 sparse=False))]),\n",
       "                                                  [])])),\n",
       "                ('gradientboostingregressor',\n",
       "                 GradientBoostingRegressor(random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['Number of times pregnant',\n", " 'Plasma glucose '\n", " 'concentration a 2 hours in '\n", " 'an oral glucose tolerance '\n", " 'test',\n", " 'Diastolic blood pressure '\n", " '(mm Hg)',\n", " 'Triceps skin fold '\n", " 'thickness (mm)',\n", " '2-Hour s...nsulin (mu '\n", " 'U/ml)',\n", " 'Body mass index (weight in '\n", " 'kg/(height in m)^2)',\n", " 'Diabetes pedigree function',\n", " 'Age (years)']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A',\n", " strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse=False))]),\n", " [])])),\n", " ('gradientboostingregressor',\n", " GradientBoostingRegressor(random_state=42))])" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# train the model\n", "# y_train is a single-column frame; flatten it to 1-D so the estimator\n", "# does not emit a DataConversionWarning about a column-vector y\n", "gbr_diabetes.fit(X_train, y_train.to_numpy().ravel())\n" ] }, { "cell_type": "code", "execution_count": 24, "id": "17e62e7e", "metadata": {}, "outputs": [], "source": [ "# make predictions on the test set\n", "y_pred = gbr_diabetes.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 25, "id": "bc0c2276", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "R2: 0.22415907844020777\n" ] } ], "source": [ "# measure goodness of fit on the held-out set (R^2, not classification accuracy)\n", "print('R2:', r2_score(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 26, "id": "aceb7cba", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Class_variabley_pred
41800.068579
1800-0.010718
55600.107478
6010-0.013441
31710.827088
.........
62200.469301
60800.374756
63810.308118
24700.356902
1910.490692
\n", "

134 rows × 2 columns

\n", "
" ], "text/plain": [ " Class_variable y_pred\n", "418 0 0.068579\n", "180 0 -0.010718\n", "556 0 0.107478\n", "601 0 -0.013441\n", "317 1 0.827088\n", ".. ... ...\n", "622 0 0.469301\n", "608 0 0.374756\n", "638 1 0.308118\n", "247 0 0.356902\n", "19 1 0.490692\n", "\n", "[134 rows x 2 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# done manually to break out the example above\n", "y_test['y_pred'] = y_pred\n", "test_scores = y_test.copy()\n", "test_scores" ] }, { "cell_type": "code", "execution_count": 27, "id": "42ad8e77", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "R2: 0.22415907844020777\n", "mae: 0.33678577565359047\n", "act_mean: 0.39552238805970147\n", "pred_mean: 0.3679732831419842\n", "mape: 691517409333630.6\n" ] } ], "source": [ "r2 = r2_score(test_scores['Class_variable'], test_scores['y_pred'])\n", "mae = mean_absolute_error(test_scores['Class_variable'], test_scores['y_pred'])\n", "mean_act = test_scores['Class_variable'].mean()\n", "mean_pred = test_scores['y_pred'].mean()\n", "mape = mean_absolute_percentage_error(test_scores['Class_variable'], test_scores['y_pred'])\n", "print(f'R2: {r2}\\nmae: {mae}\\nact_mean: {mean_act}\\npred_mean: {mean_pred}\\nmape: {mape}')" ] }, { "cell_type": "code", "execution_count": 28, "id": "3f725c21", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['Number of times pregnant',\n", " 'Plasma glucose '\n", " 'concentration a 2 hours in '\n", " 'an oral glucose tolerance '\n", " 'test',\n", " 'Diastolic blood pressure '\n", " '(mm Hg)',\n", " 'Triceps skin fold '\n", " 'thickness (mm)',\n", " '2-Hour s...nsulin (mu '\n", " 'U/ml)',\n", " 'Body mass index (weight in 
'\n", " 'kg/(height in m)^2)',\n", " 'Diabetes pedigree function',\n", " 'Age (years)']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A',\n", " strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse=False))]),\n", " [])])),\n", " ('gradientboostingregressor',\n", " GradientBoostingRegressor(random_state=42))])\n" ] } ], "source": [ "import joblib\n", "joblib.dump(gbr_diabetes, './diabetes.pkl')\n", "print(gbr_diabetes)" ] }, { "cell_type": "code", "execution_count": null, "id": "6d5c646a", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 }