{ "cells": [ { "cell_type": "markdown", "id": "f5afa212", "metadata": {}, "source": [ "## ML Pipeline with Sklearn" ] }, { "cell_type": "code", "execution_count": 1, "id": "29d5c7fe", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(768, 9)\n" ] }, { "data": { "text/html": [ "
\n", " | Number of times pregnant | \n", "Plasma glucose concentration a 2 hours in an oral glucose tolerance test | \n", "Diastolic blood pressure (mm Hg) | \n", "Triceps skin fold thickness (mm) | \n", "2-Hour serum insulin (mu U/ml) | \n", "Body mass index (weight in kg/(height in m)^2) | \n", "Diabetes pedigree function | \n", "Age (years) | \n", "Class variable | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "6 | \n", "148 | \n", "72 | \n", "35 | \n", "0 | \n", "33.6 | \n", "0.627 | \n", "50 | \n", "1 | \n", "
1 | \n", "1 | \n", "85 | \n", "66 | \n", "29 | \n", "0 | \n", "26.6 | \n", "0.351 | \n", "31 | \n", "0 | \n", "
2 | \n", "8 | \n", "183 | \n", "64 | \n", "0 | \n", "0 | \n", "23.3 | \n", "0.672 | \n", "32 | \n", "1 | \n", "
3 | \n", "1 | \n", "89 | \n", "66 | \n", "23 | \n", "94 | \n", "28.1 | \n", "0.167 | \n", "21 | \n", "0 | \n", "
4 | \n", "0 | \n", "137 | \n", "40 | \n", "35 | \n", "168 | \n", "43.1 | \n", "2.288 | \n", "33 | \n", "1 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
763 | \n", "10 | \n", "101 | \n", "76 | \n", "48 | \n", "180 | \n", "32.9 | \n", "0.171 | \n", "63 | \n", "0 | \n", "
764 | \n", "2 | \n", "122 | \n", "70 | \n", "27 | \n", "0 | \n", "36.8 | \n", "0.340 | \n", "27 | \n", "0 | \n", "
765 | \n", "5 | \n", "121 | \n", "72 | \n", "23 | \n", "112 | \n", "26.2 | \n", "0.245 | \n", "30 | \n", "0 | \n", "
766 | \n", "1 | \n", "126 | \n", "60 | \n", "0 | \n", "0 | \n", "30.1 | \n", "0.349 | \n", "47 | \n", "1 | \n", "
767 | \n", "1 | \n", "93 | \n", "70 | \n", "31 | \n", "0 | \n", "30.4 | \n", "0.315 | \n", "23 | \n", "0 | \n", "
768 rows × 9 columns
\n", "\n", " | Number of times pregnant | \n", "Plasma glucose concentration a 2 hours in an oral glucose tolerance test | \n", "Diastolic blood pressure (mm Hg) | \n", "Triceps skin fold thickness (mm) | \n", "2-Hour serum insulin (mu U/ml) | \n", "Body mass index (weight in kg/(height in m)^2) | \n", "Diabetes pedigree function | \n", "Age (years) | \n", "Class_variable | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "6 | \n", "148 | \n", "72 | \n", "35 | \n", "0 | \n", "33.6 | \n", "0.627 | \n", "50 | \n", "1 | \n", "
1 | \n", "1 | \n", "85 | \n", "66 | \n", "29 | \n", "0 | \n", "26.6 | \n", "0.351 | \n", "31 | \n", "0 | \n", "
2 | \n", "8 | \n", "183 | \n", "64 | \n", "0 | \n", "0 | \n", "23.3 | \n", "0.672 | \n", "32 | \n", "1 | \n", "
3 | \n", "1 | \n", "89 | \n", "66 | \n", "23 | \n", "94 | \n", "28.1 | \n", "0.167 | \n", "21 | \n", "0 | \n", "
4 | \n", "0 | \n", "137 | \n", "40 | \n", "35 | \n", "168 | \n", "43.1 | \n", "2.288 | \n", "33 | \n", "1 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
763 | \n", "10 | \n", "101 | \n", "76 | \n", "48 | \n", "180 | \n", "32.9 | \n", "0.171 | \n", "63 | \n", "0 | \n", "
764 | \n", "2 | \n", "122 | \n", "70 | \n", "27 | \n", "0 | \n", "36.8 | \n", "0.340 | \n", "27 | \n", "0 | \n", "
765 | \n", "5 | \n", "121 | \n", "72 | \n", "23 | \n", "112 | \n", "26.2 | \n", "0.245 | \n", "30 | \n", "0 | \n", "
766 | \n", "1 | \n", "126 | \n", "60 | \n", "0 | \n", "0 | \n", "30.1 | \n", "0.349 | \n", "47 | \n", "1 | \n", "
767 | \n", "1 | \n", "93 | \n", "70 | \n", "31 | \n", "0 | \n", "30.4 | \n", "0.315 | \n", "23 | \n", "0 | \n", "
768 rows × 9 columns
\n", "\n", " | Number of times pregnant | \n", "Plasma glucose concentration a 2 hours in an oral glucose tolerance test | \n", "Diastolic blood pressure (mm Hg) | \n", "Triceps skin fold thickness (mm) | \n", "2-Hour serum insulin (mu U/ml) | \n", "Body mass index (weight in kg/(height in m)^2) | \n", "Diabetes pedigree function | \n", "Age (years) | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "6 | \n", "148 | \n", "72 | \n", "35 | \n", "0 | \n", "33.6 | \n", "0.627 | \n", "50 | \n", "
1 | \n", "1 | \n", "85 | \n", "66 | \n", "29 | \n", "0 | \n", "26.6 | \n", "0.351 | \n", "31 | \n", "
2 | \n", "8 | \n", "183 | \n", "64 | \n", "0 | \n", "0 | \n", "23.3 | \n", "0.672 | \n", "32 | \n", "
3 | \n", "1 | \n", "89 | \n", "66 | \n", "23 | \n", "94 | \n", "28.1 | \n", "0.167 | \n", "21 | \n", "
4 | \n", "0 | \n", "137 | \n", "40 | \n", "35 | \n", "168 | \n", "43.1 | \n", "2.288 | \n", "33 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
762 | \n", "9 | \n", "89 | \n", "62 | \n", "0 | \n", "0 | \n", "22.5 | \n", "0.142 | \n", "33 | \n", "
763 | \n", "10 | \n", "101 | \n", "76 | \n", "48 | \n", "180 | \n", "32.9 | \n", "0.171 | \n", "63 | \n", "
765 | \n", "5 | \n", "121 | \n", "72 | \n", "23 | \n", "112 | \n", "26.2 | \n", "0.245 | \n", "30 | \n", "
766 | \n", "1 | \n", "126 | \n", "60 | \n", "0 | \n", "0 | \n", "30.1 | \n", "0.349 | \n", "47 | \n", "
767 | \n", "1 | \n", "93 | \n", "70 | \n", "31 | \n", "0 | \n", "30.4 | \n", "0.315 | \n", "23 | \n", "
668 rows × 8 columns
\n", "Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),\n", " ('standardscaler', StandardScaler())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),\n", " ('standardscaler', StandardScaler())])
SimpleImputer(strategy='median')
StandardScaler()
Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A', strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore', sparse=False))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A', strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore', sparse=False))])
SimpleImputer(fill_value='N/A', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['Number of times pregnant',\n", " 'Plasma glucose concentration a 2 hours in '\n", " 'an oral glucose tolerance test',\n", " 'Diastolic blood pressure (mm Hg)',\n", " 'Triceps skin fold thickness (mm)',\n", " '2-Hour serum insulin (mu U/ml)',\n", " 'Body mass index (weight in kg/(height in '\n", " 'm)^2)',\n", " 'Diabetes pedigree function',\n", " 'Age (years)']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A',\n", " strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse=False))]),\n", " [])])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['Number of times pregnant',\n", " 'Plasma glucose concentration a 2 hours in '\n", " 'an oral glucose tolerance test',\n", " 'Diastolic blood pressure (mm Hg)',\n", " 'Triceps skin fold thickness (mm)',\n", " '2-Hour serum insulin (mu U/ml)',\n", " 'Body mass index (weight in kg/(height in '\n", " 'm)^2)',\n", " 'Diabetes pedigree function',\n", " 'Age (years)']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A',\n", " strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse=False))]),\n", " [])])
['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']
SimpleImputer(strategy='median')
StandardScaler()
[]
SimpleImputer(fill_value='N/A', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['Number of times pregnant',\n", " 'Plasma glucose '\n", " 'concentration a 2 hours in '\n", " 'an oral glucose tolerance '\n", " 'test',\n", " 'Diastolic blood pressure '\n", " '(mm Hg)',\n", " 'Triceps skin fold '\n", " 'thickness (mm)',\n", " '2-Hour s...nsulin (mu '\n", " 'U/ml)',\n", " 'Body mass index (weight in '\n", " 'kg/(height in m)^2)',\n", " 'Diabetes pedigree function',\n", " 'Age (years)']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A',\n", " strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse=False))]),\n", " [])])),\n", " ('gradientboostingregressor',\n", " GradientBoostingRegressor(random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['Number of times pregnant',\n", " 'Plasma glucose '\n", " 'concentration a 2 hours in '\n", " 'an oral glucose tolerance '\n", " 'test',\n", " 'Diastolic blood pressure '\n", " '(mm Hg)',\n", " 'Triceps skin fold '\n", " 'thickness (mm)',\n", " '2-Hour s...nsulin (mu '\n", " 'U/ml)',\n", " 'Body mass index (weight in '\n", " 'kg/(height in m)^2)',\n", " 'Diabetes pedigree function',\n", " 'Age (years)']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A',\n", " strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse=False))]),\n", " [])])),\n", " ('gradientboostingregressor',\n", " GradientBoostingRegressor(random_state=42))])
ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['Number of times pregnant',\n", " 'Plasma glucose concentration a 2 hours in '\n", " 'an oral glucose tolerance test',\n", " 'Diastolic blood pressure (mm Hg)',\n", " 'Triceps skin fold thickness (mm)',\n", " '2-Hour serum insulin (mu U/ml)',\n", " 'Body mass index (weight in kg/(height in '\n", " 'm)^2)',\n", " 'Diabetes pedigree function',\n", " 'Age (years)']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A',\n", " strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse=False))]),\n", " [])])
['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']
SimpleImputer(strategy='median')
StandardScaler()
[]
SimpleImputer(fill_value='N/A', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
GradientBoostingRegressor(random_state=42)
Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['Number of times pregnant',\n", " 'Plasma glucose '\n", " 'concentration a 2 hours in '\n", " 'an oral glucose tolerance '\n", " 'test',\n", " 'Diastolic blood pressure '\n", " '(mm Hg)',\n", " 'Triceps skin fold '\n", " 'thickness (mm)',\n", " '2-Hour s...nsulin (mu '\n", " 'U/ml)',\n", " 'Body mass index (weight in '\n", " 'kg/(height in m)^2)',\n", " 'Diabetes pedigree function',\n", " 'Age (years)']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A',\n", " strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse=False))]),\n", " [])])),\n", " ('gradientboostingregressor',\n", " GradientBoostingRegressor(random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['Number of times pregnant',\n", " 'Plasma glucose '\n", " 'concentration a 2 hours in '\n", " 'an oral glucose tolerance '\n", " 'test',\n", " 'Diastolic blood pressure '\n", " '(mm Hg)',\n", " 'Triceps skin fold '\n", " 'thickness (mm)',\n", " '2-Hour s...nsulin (mu '\n", " 'U/ml)',\n", " 'Body mass index (weight in '\n", " 'kg/(height in m)^2)',\n", " 'Diabetes pedigree function',\n", " 'Age (years)']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A',\n", " strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse=False))]),\n", " [])])),\n", " ('gradientboostingregressor',\n", " GradientBoostingRegressor(random_state=42))])
ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['Number of times pregnant',\n", " 'Plasma glucose concentration a 2 hours in '\n", " 'an oral glucose tolerance test',\n", " 'Diastolic blood pressure (mm Hg)',\n", " 'Triceps skin fold thickness (mm)',\n", " '2-Hour serum insulin (mu U/ml)',\n", " 'Body mass index (weight in kg/(height in '\n", " 'm)^2)',\n", " 'Diabetes pedigree function',\n", " 'Age (years)']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(fill_value='N/A',\n", " strategy='constant')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse=False))]),\n", " [])])
['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']
SimpleImputer(strategy='median')
StandardScaler()
[]
SimpleImputer(fill_value='N/A', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
GradientBoostingRegressor(random_state=42)
\n", " | Class_variable | \n", "y_pred | \n", "
---|---|---|
418 | \n", "0 | \n", "0.068579 | \n", "
180 | \n", "0 | \n", "-0.010718 | \n", "
556 | \n", "0 | \n", "0.107478 | \n", "
601 | \n", "0 | \n", "-0.013441 | \n", "
317 | \n", "1 | \n", "0.827088 | \n", "
... | \n", "... | \n", "... | \n", "
622 | \n", "0 | \n", "0.469301 | \n", "
608 | \n", "0 | \n", "0.374756 | \n", "
638 | \n", "1 | \n", "0.308118 | \n", "
247 | \n", "0 | \n", "0.356902 | \n", "
19 | \n", "1 | \n", "0.490692 | \n", "
134 rows × 2 columns
\n", "