{ "cells": [ { "cell_type": "markdown", "id": "f5afa212", "metadata": {}, "source": [ "## ML Pipeline with Sklearn" ] }, { "cell_type": "code", "execution_count": 1, "id": "29d5c7fe", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(768, 9)\n" ] }, { "data": { "text/html": [ "
| \n", " | Number of times pregnant | \n", "Plasma glucose concentration a 2 hours in an oral glucose tolerance test | \n", "Diastolic blood pressure (mm Hg) | \n", "Triceps skin fold thickness (mm) | \n", "2-Hour serum insulin (mu U/ml) | \n", "Body mass index (weight in kg/(height in m)^2) | \n", "Diabetes pedigree function | \n", "Age (years) | \n", "Class variable | \n", "
|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "6 | \n", "148 | \n", "72 | \n", "35 | \n", "0 | \n", "33.6 | \n", "0.627 | \n", "50 | \n", "1 | \n", "
| 1 | \n", "1 | \n", "85 | \n", "66 | \n", "29 | \n", "0 | \n", "26.6 | \n", "0.351 | \n", "31 | \n", "0 | \n", "
| 2 | \n", "8 | \n", "183 | \n", "64 | \n", "0 | \n", "0 | \n", "23.3 | \n", "0.672 | \n", "32 | \n", "1 | \n", "
| 3 | \n", "1 | \n", "89 | \n", "66 | \n", "23 | \n", "94 | \n", "28.1 | \n", "0.167 | \n", "21 | \n", "0 | \n", "
| 4 | \n", "0 | \n", "137 | \n", "40 | \n", "35 | \n", "168 | \n", "43.1 | \n", "2.288 | \n", "33 | \n", "1 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 763 | \n", "10 | \n", "101 | \n", "76 | \n", "48 | \n", "180 | \n", "32.9 | \n", "0.171 | \n", "63 | \n", "0 | \n", "
| 764 | \n", "2 | \n", "122 | \n", "70 | \n", "27 | \n", "0 | \n", "36.8 | \n", "0.340 | \n", "27 | \n", "0 | \n", "
| 765 | \n", "5 | \n", "121 | \n", "72 | \n", "23 | \n", "112 | \n", "26.2 | \n", "0.245 | \n", "30 | \n", "0 | \n", "
| 766 | \n", "1 | \n", "126 | \n", "60 | \n", "0 | \n", "0 | \n", "30.1 | \n", "0.349 | \n", "47 | \n", "1 | \n", "
| 767 | \n", "1 | \n", "93 | \n", "70 | \n", "31 | \n", "0 | \n", "30.4 | \n", "0.315 | \n", "23 | \n", "0 | \n", "
768 rows × 9 columns
\n", "| \n", " | Number of times pregnant | \n", "Plasma glucose concentration a 2 hours in an oral glucose tolerance test | \n", "Diastolic blood pressure (mm Hg) | \n", "Triceps skin fold thickness (mm) | \n", "2-Hour serum insulin (mu U/ml) | \n", "Body mass index (weight in kg/(height in m)^2) | \n", "Diabetes pedigree function | \n", "Age (years) | \n", "Class_variable | \n", "
|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "6 | \n", "148 | \n", "72 | \n", "35 | \n", "0 | \n", "33.6 | \n", "0.627 | \n", "50 | \n", "1 | \n", "
| 1 | \n", "1 | \n", "85 | \n", "66 | \n", "29 | \n", "0 | \n", "26.6 | \n", "0.351 | \n", "31 | \n", "0 | \n", "
| 2 | \n", "8 | \n", "183 | \n", "64 | \n", "0 | \n", "0 | \n", "23.3 | \n", "0.672 | \n", "32 | \n", "1 | \n", "
| 3 | \n", "1 | \n", "89 | \n", "66 | \n", "23 | \n", "94 | \n", "28.1 | \n", "0.167 | \n", "21 | \n", "0 | \n", "
| 4 | \n", "0 | \n", "137 | \n", "40 | \n", "35 | \n", "168 | \n", "43.1 | \n", "2.288 | \n", "33 | \n", "1 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 763 | \n", "10 | \n", "101 | \n", "76 | \n", "48 | \n", "180 | \n", "32.9 | \n", "0.171 | \n", "63 | \n", "0 | \n", "
| 764 | \n", "2 | \n", "122 | \n", "70 | \n", "27 | \n", "0 | \n", "36.8 | \n", "0.340 | \n", "27 | \n", "0 | \n", "
| 765 | \n", "5 | \n", "121 | \n", "72 | \n", "23 | \n", "112 | \n", "26.2 | \n", "0.245 | \n", "30 | \n", "0 | \n", "
| 766 | \n", "1 | \n", "126 | \n", "60 | \n", "0 | \n", "0 | \n", "30.1 | \n", "0.349 | \n", "47 | \n", "1 | \n", "
| 767 | \n", "1 | \n", "93 | \n", "70 | \n", "31 | \n", "0 | \n", "30.4 | \n", "0.315 | \n", "23 | \n", "0 | \n", "
768 rows × 9 columns
\n", "| \n", " | Number of times pregnant | \n", "Plasma glucose concentration a 2 hours in an oral glucose tolerance test | \n", "Diastolic blood pressure (mm Hg) | \n", "Triceps skin fold thickness (mm) | \n", "2-Hour serum insulin (mu U/ml) | \n", "Body mass index (weight in kg/(height in m)^2) | \n", "Diabetes pedigree function | \n", "Age (years) | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "6 | \n", "148 | \n", "72 | \n", "35 | \n", "0 | \n", "33.6 | \n", "0.627 | \n", "50 | \n", "
| 1 | \n", "1 | \n", "85 | \n", "66 | \n", "29 | \n", "0 | \n", "26.6 | \n", "0.351 | \n", "31 | \n", "
| 2 | \n", "8 | \n", "183 | \n", "64 | \n", "0 | \n", "0 | \n", "23.3 | \n", "0.672 | \n", "32 | \n", "
| 3 | \n", "1 | \n", "89 | \n", "66 | \n", "23 | \n", "94 | \n", "28.1 | \n", "0.167 | \n", "21 | \n", "
| 4 | \n", "0 | \n", "137 | \n", "40 | \n", "35 | \n", "168 | \n", "43.1 | \n", "2.288 | \n", "33 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 762 | \n", "9 | \n", "89 | \n", "62 | \n", "0 | \n", "0 | \n", "22.5 | \n", "0.142 | \n", "33 | \n", "
| 763 | \n", "10 | \n", "101 | \n", "76 | \n", "48 | \n", "180 | \n", "32.9 | \n", "0.171 | \n", "63 | \n", "
| 765 | \n", "5 | \n", "121 | \n", "72 | \n", "23 | \n", "112 | \n", "26.2 | \n", "0.245 | \n", "30 | \n", "
| 766 | \n", "1 | \n", "126 | \n", "60 | \n", "0 | \n", "0 | \n", "30.1 | \n", "0.349 | \n", "47 | \n", "
| 767 | \n", "1 | \n", "93 | \n", "70 | \n", "31 | \n", "0 | \n", "30.4 | \n", "0.315 | \n", "23 | \n", "
668 rows × 8 columns
\n", "Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),\n",
" ('standardscaler', StandardScaler())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),\n",
" ('standardscaler', StandardScaler())])SimpleImputer(strategy='median')
StandardScaler()
Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(fill_value='N/A', strategy='constant')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore', sparse=False))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(fill_value='N/A', strategy='constant')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore', sparse=False))])SimpleImputer(fill_value='N/A', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" ['Number of times pregnant',\n",
" 'Plasma glucose concentration a 2 hours in '\n",
" 'an oral glucose tolerance test',\n",
" 'Diastolic blood pressure (mm Hg)',\n",
" 'Triceps skin fold thickness (mm)',\n",
" '2-Hour serum insulin (mu U/ml)',\n",
" 'Body mass index (weight in kg/(height in '\n",
" 'm)^2)',\n",
" 'Diabetes pedigree function',\n",
" 'Age (years)']),\n",
" ('cat',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(fill_value='N/A',\n",
" strategy='constant')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse=False))]),\n",
" [])])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" ['Number of times pregnant',\n",
" 'Plasma glucose concentration a 2 hours in '\n",
" 'an oral glucose tolerance test',\n",
" 'Diastolic blood pressure (mm Hg)',\n",
" 'Triceps skin fold thickness (mm)',\n",
" '2-Hour serum insulin (mu U/ml)',\n",
" 'Body mass index (weight in kg/(height in '\n",
" 'm)^2)',\n",
" 'Diabetes pedigree function',\n",
" 'Age (years)']),\n",
" ('cat',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(fill_value='N/A',\n",
" strategy='constant')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse=False))]),\n",
" [])])['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']
SimpleImputer(strategy='median')
StandardScaler()
[]
SimpleImputer(fill_value='N/A', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
Pipeline(steps=[('columntransformer',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" ['Number of times pregnant',\n",
" 'Plasma glucose '\n",
" 'concentration a 2 hours in '\n",
" 'an oral glucose tolerance '\n",
" 'test',\n",
" 'Diastolic blood pressure '\n",
" '(mm Hg)',\n",
" 'Triceps skin fold '\n",
" 'thickness (mm)',\n",
" '2-Hour s...nsulin (mu '\n",
" 'U/ml)',\n",
" 'Body mass index (weight in '\n",
" 'kg/(height in m)^2)',\n",
" 'Diabetes pedigree function',\n",
" 'Age (years)']),\n",
" ('cat',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(fill_value='N/A',\n",
" strategy='constant')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse=False))]),\n",
" [])])),\n",
" ('gradientboostingregressor',\n",
" GradientBoostingRegressor(random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('columntransformer',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" ['Number of times pregnant',\n",
" 'Plasma glucose '\n",
" 'concentration a 2 hours in '\n",
" 'an oral glucose tolerance '\n",
" 'test',\n",
" 'Diastolic blood pressure '\n",
" '(mm Hg)',\n",
" 'Triceps skin fold '\n",
" 'thickness (mm)',\n",
" '2-Hour s...nsulin (mu '\n",
" 'U/ml)',\n",
" 'Body mass index (weight in '\n",
" 'kg/(height in m)^2)',\n",
" 'Diabetes pedigree function',\n",
" 'Age (years)']),\n",
" ('cat',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(fill_value='N/A',\n",
" strategy='constant')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse=False))]),\n",
" [])])),\n",
" ('gradientboostingregressor',\n",
" GradientBoostingRegressor(random_state=42))])ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" ['Number of times pregnant',\n",
" 'Plasma glucose concentration a 2 hours in '\n",
" 'an oral glucose tolerance test',\n",
" 'Diastolic blood pressure (mm Hg)',\n",
" 'Triceps skin fold thickness (mm)',\n",
" '2-Hour serum insulin (mu U/ml)',\n",
" 'Body mass index (weight in kg/(height in '\n",
" 'm)^2)',\n",
" 'Diabetes pedigree function',\n",
" 'Age (years)']),\n",
" ('cat',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(fill_value='N/A',\n",
" strategy='constant')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse=False))]),\n",
" [])])['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']
SimpleImputer(strategy='median')
StandardScaler()
[]
SimpleImputer(fill_value='N/A', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
GradientBoostingRegressor(random_state=42)
Pipeline(steps=[('columntransformer',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" ['Number of times pregnant',\n",
" 'Plasma glucose '\n",
" 'concentration a 2 hours in '\n",
" 'an oral glucose tolerance '\n",
" 'test',\n",
" 'Diastolic blood pressure '\n",
" '(mm Hg)',\n",
" 'Triceps skin fold '\n",
" 'thickness (mm)',\n",
" '2-Hour s...nsulin (mu '\n",
" 'U/ml)',\n",
" 'Body mass index (weight in '\n",
" 'kg/(height in m)^2)',\n",
" 'Diabetes pedigree function',\n",
" 'Age (years)']),\n",
" ('cat',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(fill_value='N/A',\n",
" strategy='constant')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse=False))]),\n",
" [])])),\n",
" ('gradientboostingregressor',\n",
" GradientBoostingRegressor(random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('columntransformer',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" ['Number of times pregnant',\n",
" 'Plasma glucose '\n",
" 'concentration a 2 hours in '\n",
" 'an oral glucose tolerance '\n",
" 'test',\n",
" 'Diastolic blood pressure '\n",
" '(mm Hg)',\n",
" 'Triceps skin fold '\n",
" 'thickness (mm)',\n",
" '2-Hour s...nsulin (mu '\n",
" 'U/ml)',\n",
" 'Body mass index (weight in '\n",
" 'kg/(height in m)^2)',\n",
" 'Diabetes pedigree function',\n",
" 'Age (years)']),\n",
" ('cat',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(fill_value='N/A',\n",
" strategy='constant')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse=False))]),\n",
" [])])),\n",
" ('gradientboostingregressor',\n",
" GradientBoostingRegressor(random_state=42))])ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" ['Number of times pregnant',\n",
" 'Plasma glucose concentration a 2 hours in '\n",
" 'an oral glucose tolerance test',\n",
" 'Diastolic blood pressure (mm Hg)',\n",
" 'Triceps skin fold thickness (mm)',\n",
" '2-Hour serum insulin (mu U/ml)',\n",
" 'Body mass index (weight in kg/(height in '\n",
" 'm)^2)',\n",
" 'Diabetes pedigree function',\n",
" 'Age (years)']),\n",
" ('cat',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(fill_value='N/A',\n",
" strategy='constant')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse=False))]),\n",
" [])])['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']
SimpleImputer(strategy='median')
StandardScaler()
[]
SimpleImputer(fill_value='N/A', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
GradientBoostingRegressor(random_state=42)
| \n", " | Class_variable | \n", "y_pred | \n", "
|---|---|---|
| 418 | \n", "0 | \n", "0.068579 | \n", "
| 180 | \n", "0 | \n", "-0.010718 | \n", "
| 556 | \n", "0 | \n", "0.107478 | \n", "
| 601 | \n", "0 | \n", "-0.013441 | \n", "
| 317 | \n", "1 | \n", "0.827088 | \n", "
| ... | \n", "... | \n", "... | \n", "
| 622 | \n", "0 | \n", "0.469301 | \n", "
| 608 | \n", "0 | \n", "0.374756 | \n", "
| 638 | \n", "1 | \n", "0.308118 | \n", "
| 247 | \n", "0 | \n", "0.356902 | \n", "
| 19 | \n", "1 | \n", "0.490692 | \n", "
134 rows × 2 columns
\n", "