{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import datetime\n", "import cudf\n", "import pandas as pd\n", "import numpy as np\n", "from cuml.ensemble import RandomForestClassifier\n", "from cuml.metrics import accuracy_score\n", "\n", "import whylogs\n", "from whylogs.viz import ProfileVisualizer\n", "\n", "import warnings\n", "warnings.simplefilter('ignore')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# The raw file has no header row: header=None keeps the first record as data\n", "# instead of consuming it as column names. na_values=\"?\" parses the dataset's\n", "# missing-value marker as NaN so the affected columns load as numeric.\n", "heart_data = pd.read_csv(\n", "    \"https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data\",\n", "    header=None,\n", "    na_values=\"?\",\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This dataset encodes missing values as `?`. They were parsed as `NaN` when the file was read, so here we fill each of them with the median value of its column." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Fill missing entries with the per-column median, then downcast to float32.\n", "heart_data = heart_data.fillna(heart_data.median()).astype('float32')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The dataset does not come packaged with a header, so we will add that next.\n", "\n", "More information about the data (including the header labels we'll add) can be found in `heart-disease.names`: \n", "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/heart-disease.names" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "heart_data.columns = ['age', 'sex', 'cp', 'trestbps', 'chol',\n", " 'fbs', 'restecg', 'thalach', 'exang', \n", " 'oldpeak', 'slope', 'ca', 'thal', 'target']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally, we will convert this pandas DataFrame to an NVIDIA cuDF DataFrame, which is stored and processed on the GPU." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | age | \n", "sex | \n", "cp | \n", "trestbps | \n", "chol | \n", "fbs | \n", "restecg | \n", "thalach | \n", "exang | \n", "oldpeak | \n", "slope | \n", "ca | \n", "thal | \n", "target | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "29.0 | \n", "1.0 | \n", "2.0 | \n", "120.0 | \n", "243.0 | \n", "0.0 | \n", "0.0 | \n", "160.0 | \n", "0.0 | \n", "0.0 | \n", "2.0 | \n", "0.0 | \n", "6.0 | \n", "0.0 | \n", "
1 | \n", "29.0 | \n", "1.0 | \n", "2.0 | \n", "140.0 | \n", "244.0 | \n", "0.0 | \n", "0.0 | \n", "170.0 | \n", "0.0 | \n", "0.0 | \n", "2.0 | \n", "0.0 | \n", "6.0 | \n", "0.0 | \n", "
2 | \n", "30.0 | \n", "0.0 | \n", "1.0 | \n", "170.0 | \n", "237.0 | \n", "0.0 | \n", "1.0 | \n", "170.0 | \n", "0.0 | \n", "0.0 | \n", "2.0 | \n", "0.0 | \n", "6.0 | \n", "0.0 | \n", "
3 | \n", "31.0 | \n", "0.0 | \n", "2.0 | \n", "100.0 | \n", "219.0 | \n", "0.0 | \n", "1.0 | \n", "150.0 | \n", "0.0 | \n", "0.0 | \n", "2.0 | \n", "0.0 | \n", "6.0 | \n", "0.0 | \n", "
4 | \n", "32.0 | \n", "0.0 | \n", "2.0 | \n", "105.0 | \n", "198.0 | \n", "0.0 | \n", "0.0 | \n", "165.0 | \n", "0.0 | \n", "0.0 | \n", "2.0 | \n", "0.0 | \n", "6.0 | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
288 | \n", "52.0 | \n", "1.0 | \n", "4.0 | \n", "160.0 | \n", "331.0 | \n", "0.0 | \n", "0.0 | \n", "94.0 | \n", "1.0 | \n", "2.5 | \n", "2.0 | \n", "0.0 | \n", "6.0 | \n", "1.0 | \n", "
289 | \n", "54.0 | \n", "0.0 | \n", "3.0 | \n", "130.0 | \n", "294.0 | \n", "0.0 | \n", "1.0 | \n", "100.0 | \n", "1.0 | \n", "0.0 | \n", "2.0 | \n", "0.0 | \n", "6.0 | \n", "1.0 | \n", "
290 | \n", "56.0 | \n", "1.0 | \n", "4.0 | \n", "155.0 | \n", "342.0 | \n", "1.0 | \n", "0.0 | \n", "150.0 | \n", "1.0 | \n", "3.0 | \n", "2.0 | \n", "0.0 | \n", "6.0 | \n", "1.0 | \n", "
291 | \n", "58.0 | \n", "0.0 | \n", "2.0 | \n", "180.0 | \n", "393.0 | \n", "0.0 | \n", "0.0 | \n", "110.0 | \n", "1.0 | \n", "1.0 | \n", "2.0 | \n", "0.0 | \n", "7.0 | \n", "1.0 | \n", "
292 | \n", "65.0 | \n", "1.0 | \n", "4.0 | \n", "130.0 | \n", "275.0 | \n", "0.0 | \n", "1.0 | \n", "115.0 | \n", "1.0 | \n", "1.0 | \n", "2.0 | \n", "0.0 | \n", "6.0 | \n", "1.0 | \n", "
293 rows × 14 columns
\n", "\n", " | column | \n", "count | \n", "null_count | \n", "bool_count | \n", "numeric_count | \n", "max | \n", "mean | \n", "min | \n", "stddev | \n", "nunique_numbers | \n", "... | \n", "ununique_str_upper | \n", "quantile_0.0000 | \n", "quantile_0.0100 | \n", "quantile_0.0500 | \n", "quantile_0.2500 | \n", "quantile_0.5000 | \n", "quantile_0.7500 | \n", "quantile_0.9500 | \n", "quantile_0.9900 | \n", "quantile_1.0000 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "age | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "66.0 | \n", "47.894198 | \n", "29.0 | \n", "7.738385 | \n", "37.0 | \n", "... | \n", "0.0 | \n", "29.0 | \n", "31.0 | \n", "35.0 | \n", "42.0 | \n", "49.0 | \n", "54.0 | \n", "59.0 | \n", "65.0 | \n", "66.0 | \n", "
1 | \n", "restecg | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "2.0 | \n", "0.211604 | \n", "0.0 | \n", "0.449050 | \n", "3.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "2.0 | \n", "2.0 | \n", "
2 | \n", "slope | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "3.0 | \n", "1.962457 | \n", "1.0 | \n", "0.207620 | \n", "3.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "2.0 | \n", "2.0 | \n", "2.0 | \n", "2.0 | \n", "2.0 | \n", "2.0 | \n", "3.0 | \n", "
3 | \n", "ca | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "1.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
4 | \n", "exang | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "1.0 | \n", "0.303754 | \n", "0.0 | \n", "0.460665 | \n", "2.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "
5 | \n", "sex | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "1.0 | \n", "0.723549 | \n", "0.0 | \n", "0.448007 | \n", "2.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "
6 | \n", "thalach | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "190.0 | \n", "138.976109 | \n", "82.0 | \n", "23.436071 | \n", "71.0 | \n", "... | \n", "0.0 | \n", "82.0 | \n", "87.0 | \n", "98.0 | \n", "122.0 | \n", "140.0 | \n", "155.0 | \n", "175.0 | \n", "185.0 | \n", "190.0 | \n", "
7 | \n", "target | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "1.0 | \n", "0.361775 | \n", "0.0 | \n", "0.481336 | \n", "2.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "
8 | \n", "trestbps | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "200.0 | \n", "132.583618 | \n", "92.0 | \n", "17.626568 | \n", "31.0 | \n", "... | \n", "0.0 | \n", "92.0 | \n", "100.0 | \n", "110.0 | \n", "120.0 | \n", "130.0 | \n", "140.0 | \n", "160.0 | \n", "180.0 | \n", "200.0 | \n", "
9 | \n", "chol | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "603.0 | \n", "250.716724 | \n", "85.0 | \n", "64.714639 | \n", "153.0 | \n", "... | \n", "0.0 | \n", "85.0 | \n", "129.0 | \n", "167.0 | \n", "212.0 | \n", "244.0 | \n", "277.0 | \n", "358.0 | \n", "518.0 | \n", "603.0 | \n", "
10 | \n", "cp | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "4.0 | \n", "2.986348 | \n", "1.0 | \n", "0.965049 | \n", "4.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "2.0 | \n", "2.0 | \n", "3.0 | \n", "4.0 | \n", "4.0 | \n", "4.0 | \n", "4.0 | \n", "
11 | \n", "oldpeak | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "5.0 | \n", "0.588055 | \n", "0.0 | \n", "0.909554 | \n", "6.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "2.5 | \n", "3.0 | \n", "5.0 | \n", "
12 | \n", "fbs | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "1.0 | \n", "0.068259 | \n", "0.0 | \n", "0.252622 | \n", "2.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "
13 | \n", "thal | \n", "293.0 | \n", "0.0 | \n", "0.0 | \n", "293.0 | \n", "7.0 | \n", "5.965870 | \n", "3.0 | \n", "0.502251 | \n", "3.0 | \n", "... | \n", "0.0 | \n", "3.0 | \n", "3.0 | \n", "6.0 | \n", "6.0 | \n", "6.0 | \n", "6.0 | \n", "6.0 | \n", "7.0 | \n", "7.0 | \n", "
14 rows × 32 columns
\n", "