## ML Pipeline with Sklearn

In [1]:
# load sample dataset
import pandas as pd
import seaborn as sns

from ydata_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from yellowbrick.regressor import PredictionError

df = pd.read_csv('./diabetes.csv')
print(df.shape)
df

(768, 9)


Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [2]:
# Renaming the column Class variable

df = df.rename(columns={'Class variable': 'Class_variable'})
df

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class_variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
# simple check for nulls
df.isna().sum()[df.isna().sum() > 0]

Series([], dtype: int64)

In [9]:
# eda (automated)
# profile.to_widgets() -- research to fix...

profile = ProfileReport(df)
profile.to_notebook_iframe()
profile.to_file('./reg_diabetes.html')


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# take care of any missing values 
# n/a in this case

In [10]:
# set aside and save unseen data set
data_unseen = df.sample(n=100, random_state=42)
data        = df.drop(data_unseen.index)
print(f'Data for model: {data.shape},\nData for unseen predictions: {data_unseen.shape}')
data_unseen.to_csv('./diabetes_unseen.csv', index=False)

Data for model: (668, 9),
Data for unseen predictions: (100, 9)


In [11]:
# data.columns!='Class_variable'
X = data.loc[: , data.columns!='Class_variable']
y = data.loc[: , data.columns=='Class_variable']

In [12]:
X

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years)
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
762,9,89,62,0,0,22.5,0.142,33
763,10,101,76,48,180,32.9,0.171,63
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [15]:
# split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# encoding 
# get the categorical and numeric column names
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print(num_cols, '\n', cat_cols)

['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)'] 
 []


In [17]:
# pipeline for numerical columns
num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)
num_pipe

In [18]:
# pipeline for categorical columns
cat_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='N/A'),
    OneHotEncoder(handle_unknown='ignore', sparse=False)
)
cat_pipe

In [19]:
# combine both the pipelines
full_pipe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])
full_pipe

In [20]:
# build the model
gbr_diabetes = make_pipeline(full_pipe, GradientBoostingRegressor(random_state=42))
gbr_diabetes

In [22]:
# train the model
gbr_diabetes.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)


In [24]:
# make predictions on the test set
y_pred = gbr_diabetes.predict(X_test)

In [25]:
# measure accuracy
print('R2:', r2_score(y_test, y_pred))

R2: 0.22415907844020777


In [26]:
# done manually to break out the example above
y_test['y_pred'] = y_pred
test_scores = y_test.copy()
test_scores

Unnamed: 0,Class_variable,y_pred
418,0,0.068579
180,0,-0.010718
556,0,0.107478
601,0,-0.013441
317,1,0.827088
...,...,...
622,0,0.469301
608,0,0.374756
638,1,0.308118
247,0,0.356902


In [27]:
r2 = r2_score(test_scores['Class_variable'], test_scores['y_pred'])
mae = mean_absolute_error(test_scores['Class_variable'], test_scores['y_pred'])
mean_act = test_scores['Class_variable'].mean()
mean_pred = test_scores['y_pred'].mean()
mape = mean_absolute_percentage_error(test_scores['Class_variable'], test_scores['y_pred'])
print(f'R2: {r2}\nmae: {mae}\nact_mean: {mean_act}\npred_mean: {mean_pred}\nmape: {mape}')

R2: 0.22415907844020777
mae: 0.33678577565359047
act_mean: 0.39552238805970147
pred_mean: 0.3679732831419842
mape: 691517409333630.6


In [28]:
import joblib
joblib.dump(gbr_diabetes, './diabetes.pkl')
print(gbr_diabetes)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Number of times pregnant',
                                                   'Plasma glucose '
                                                   'concentration a 2 hours in '
                                                   'an oral glucose tolerance '
                                                   'test',
                                                   'Diastolic blood pressure '
                                                   '(mm Hg)',
                                     