{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load the Dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: ucimlrepo in c:\\users\\moni2\\anaconda3\\lib\\site-packages (0.0.3)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AreaPerimeterMajorAxisLengthMinorAxisLengthAspectRatioEccentricityConvexAreaEquivDiameterExtentSolidityRoundnessCompactnessShapeFactor1ShapeFactor2ShapeFactor3ShapeFactor4
028395610.291208.178117173.8887471.1971910.54981228715190.1410970.7639230.9888560.9580270.9133580.0073320.0031470.8342220.998724
128734638.018200.524796182.7344191.0973560.41178529172191.2727510.7839680.9849860.8870340.9538610.0069790.0035640.9098510.998430
229380624.110212.826130175.9311431.2097130.56272729690193.4109040.7781130.9895590.9478490.9087740.0072440.0030480.8258710.999066
330008645.884210.557999182.5165161.1536380.49861630724195.4670620.7826810.9766960.9039360.9283290.0070170.0032150.8617940.994199
430140620.134201.847882190.2792791.0607980.33368030417195.8965030.7730980.9908930.9848770.9705160.0066970.0036650.9419000.999166
...................................................
1360642097759.696288.721612185.9447051.5527280.76500242508231.5157990.7145740.9903310.9166030.8018650.0068580.0017490.6429880.998385
1360742101757.499281.576392190.7131361.4764390.73570242494231.5267980.7999430.9907520.9220150.8222520.0066880.0018860.6760990.998219
1360842139759.321281.539928191.1879791.4725820.73406542569231.6312610.7299320.9898990.9184240.8227300.0066810.0018880.6768840.996767
1360942147763.779283.382636190.2757311.4893260.74105542667231.6532470.7053890.9878130.9079060.8174570.0067240.0018520.6682370.995222
1361042159772.237295.142741182.2047161.6198410.78669342600231.6862230.7889620.9896480.8883800.7849970.0070010.0016400.6162210.998180
\n", "

13611 rows × 16 columns

\n", "
" ], "text/plain": [ " Area Perimeter MajorAxisLength MinorAxisLength AspectRatio \\\n", "0 28395 610.291 208.178117 173.888747 1.197191 \n", "1 28734 638.018 200.524796 182.734419 1.097356 \n", "2 29380 624.110 212.826130 175.931143 1.209713 \n", "3 30008 645.884 210.557999 182.516516 1.153638 \n", "4 30140 620.134 201.847882 190.279279 1.060798 \n", "... ... ... ... ... ... \n", "13606 42097 759.696 288.721612 185.944705 1.552728 \n", "13607 42101 757.499 281.576392 190.713136 1.476439 \n", "13608 42139 759.321 281.539928 191.187979 1.472582 \n", "13609 42147 763.779 283.382636 190.275731 1.489326 \n", "13610 42159 772.237 295.142741 182.204716 1.619841 \n", "\n", " Eccentricity ConvexArea EquivDiameter Extent Solidity Roundness \\\n", "0 0.549812 28715 190.141097 0.763923 0.988856 0.958027 \n", "1 0.411785 29172 191.272751 0.783968 0.984986 0.887034 \n", "2 0.562727 29690 193.410904 0.778113 0.989559 0.947849 \n", "3 0.498616 30724 195.467062 0.782681 0.976696 0.903936 \n", "4 0.333680 30417 195.896503 0.773098 0.990893 0.984877 \n", "... ... ... ... ... ... ... \n", "13606 0.765002 42508 231.515799 0.714574 0.990331 0.916603 \n", "13607 0.735702 42494 231.526798 0.799943 0.990752 0.922015 \n", "13608 0.734065 42569 231.631261 0.729932 0.989899 0.918424 \n", "13609 0.741055 42667 231.653247 0.705389 0.987813 0.907906 \n", "13610 0.786693 42600 231.686223 0.788962 0.989648 0.888380 \n", "\n", " Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4 \n", "0 0.913358 0.007332 0.003147 0.834222 0.998724 \n", "1 0.953861 0.006979 0.003564 0.909851 0.998430 \n", "2 0.908774 0.007244 0.003048 0.825871 0.999066 \n", "3 0.928329 0.007017 0.003215 0.861794 0.994199 \n", "4 0.970516 0.006697 0.003665 0.941900 0.999166 \n", "... ... ... ... ... ... 
\n", "13606 0.801865 0.006858 0.001749 0.642988 0.998385 \n", "13607 0.822252 0.006688 0.001886 0.676099 0.998219 \n", "13608 0.822730 0.006681 0.001888 0.676884 0.996767 \n", "13609 0.817457 0.006724 0.001852 0.668237 0.995222 \n", "13610 0.784997 0.007001 0.001640 0.616221 0.998180 \n", "\n", "[13611 rows x 16 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "!pip install ucimlrepo\n", "\n", "from ucimlrepo import fetch_ucirepo \n", " \n", "# Fetch dataset \n", "dry_bean_dataset = fetch_ucirepo(id=602) \n", " \n", "# Load data (as pandas dataframes) \n", "X = dry_bean_dataset.data.features \n", "y = dry_bean_dataset.data.targets \n", "\n", "X" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Class
0SEKER
1SEKER
2SEKER
3SEKER
4SEKER
......
13606DERMASON
13607DERMASON
13608DERMASON
13609DERMASON
13610DERMASON
\n", "

13611 rows × 1 columns

\n", "
" ], "text/plain": [ " Class\n", "0 SEKER\n", "1 SEKER\n", "2 SEKER\n", "3 SEKER\n", "4 SEKER\n", "... ...\n", "13606 DERMASON\n", "13607 DERMASON\n", "13608 DERMASON\n", "13609 DERMASON\n", "13610 DERMASON\n", "\n", "[13611 rows x 1 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Pre-processing" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([5, 5, 5, ..., 3, 3, 3])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "\n", "y = np.ravel(y)\n", "# Convert string labels to integer labels\n", "\n", "label_encoder = LabelEncoder()\n", "y_encoded = label_encoder.fit_transform(y)\n", "y_encoded" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mapping of Encoded Labels to Original Labels:\n", "BARBUNYA -> 0\n", "BOMBAY -> 1\n", "CALI -> 2\n", "DERMASON -> 3\n", "HOROZ -> 4\n", "SEKER -> 5\n", "SIRA -> 6\n" ] } ], "source": [ "# Print the mapping of encoded integer labels to original string labels\n", "\n", "print(\"Mapping of Encoded Labels to Original Labels:\")\n", "for encoded_label, original_label in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)):\n", " print(f\"{encoded_label} -> {original_label}\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "beans_names = list(label_encoder.classes_)\n", "beans_names" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data 
size: (10133, 16) (10133,)\n", "Calibration data size: (3378, 16) (3378,)\n", "Test data size: (100, 16) (100,)\n" ] } ], "source": [
"# Carve the data into three disjoint parts: first hold out 100 samples for\n",
"# testing, then divide the remainder 75/25 into training and calibration sets\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train_cal, X_test, y_train_cal, y_test = train_test_split(\n",
"    X, y_encoded, test_size=100, random_state=11\n",
")\n",
"X_train, X_cal, y_train, y_cal = train_test_split(\n",
"    X_train_cal, y_train_cal, test_size=0.25, random_state=11\n",
")\n",
"\n",
"print(\"Training data size:\", X_train.shape, y_train.shape)\n",
"print(\"Calibration data size:\", X_cal.shape, y_cal.shape)\n",
"print(\"Test data size:\", X_test.shape, y_test.shape)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Training" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.7575488454706927\n" ] } ], "source": [
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn import metrics\n",
"\n",
"# Fit a Gaussian Naive Bayes classifier on the training split only\n",
"model = GaussianNB()\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Accuracy of the hard class predictions on the calibration split\n",
"y_pred = model.predict(X_cal)\n",
"accuracy = metrics.accuracy_score(y_true=y_cal, y_pred=y_pred)\n",
"print('Accuracy:', accuracy)"
] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Probabilities for a sample data point:\n", " [1.62253849e-01 1.39224885e-30 8.37746148e-01 1.82113281e-75\n", " 2.45849531e-09 1.74009195e-53 7.48056991e-41]\n", "\n", "True Class of the sample data: 5\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Get predicted probabilities for each class\n", "predictions = model.predict_proba(X_cal) \n", "\n", "idx = 23\n", "# Extract the predicted probability for each class for a data point\n", "probabilities_for_data_point = predictions[idx]\n", "print('Probabilities for a sample data point:\\n',probabilities_for_data_point)\n", "\n", "# Get the true class of the selected data point\n", "true_class = y_encoded[idx]\n", "print('\\nTrue Class of the sample data:', true_class)\n", "\n", "plt.bar(x=range(len(probabilities_for_data_point)), height=probabilities_for_data_point)\n", "plt.xlabel('Class')\n", "plt.ylabel('Probability Estimate')\n", "plt.title('Probability Estimate for a sample data point')\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Calibration" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "alpha: 0.15\n", "Calibration set size: 3378\n", "Predict Probability shape: (3378, 7)\n", "Prob true class shape: (3378,)\n", "q-level 0.8505032563647128\n", "qhat 0.8735675244396051\n" ] } ], "source": [ "# Calculate adjusted quantile\n", "\n", "alpha = 0.15 # means (1-alpha) coverage\n", "print('alpha:', alpha)\n", "\n", "n = X_cal.shape[0]\n", "print('Calibration set size:', n)\n", "\n", "# Get predicted probabilities for each class\n", "y_cal_predictions = model.predict_proba(X_cal) \n", "print('Predict Probability shape:', predictions.shape)\n", "\n", "# Get the probabaility of the true class\n", "prob_true_class = y_cal_predictions[np.arange(len(y_cal)),y_cal] # The second index taken from y_cal is 1 only for the true class\n", "print('Prob true class shape:', prob_true_class.shape)\n", "\n", "y_cal_scores = 1-prob_true_class # Larger conformal scores encode worse aggrement between X and y\n", "\n", "q_level = np.ceil((n+1)*(1-alpha))/n # finite-size correction\n", 
"print(\"q-level\", q_level)\n", "\n", "qhat = np.quantile(y_cal_scores, q_level)\n", "print(\"qhat\", qhat)\n" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Flatten the array of conformal scores\n", "flat_scores = y_cal_scores.flatten()\n", "\n", "# Plot histogram of frequency of conformal scores\n", "plt.hist(flat_scores, bins=10, color='skyblue', edgecolor='black')\n", "\n", "# Add vertical dotted line denoting qhat\n", "plt.axvline(x=qhat, color='red', linestyle='--', label=f'qhat = {qhat:.3f}')\n", "plt.xlabel('Conformal Score')\n", "plt.ylabel('Frequency')\n", "plt.title('Histogram of Conformal Scores for the Calibration Set')\n", "plt.legend()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Predictions" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "X_test predicted scores:\n", " [5.31871940e-01 6.02122501e-31 4.68128053e-01 1.54048866e-68\n", " 6.96089668e-09 6.47527911e-47 9.28043054e-37]\n", "\n", "Prediction set:\n", " [ True False True False False False False]\n", "\n", "True class: 0\n" ] } ], "source": [ "id = 2\n", "y_test_predictions = model.predict_proba(X_test)\n", "print('X_test predicted scores:\\n', y_test_predictions[id])\n", "\n", "predictions_set = (1-y_test_predictions <= qhat)\n", "print('\\nPrediction set:\\n', predictions_set[id])\n", "print('\\nTrue class:', y_test[id])\n", "\n", "accepted_test_set = np.zeros(len(predictions_set[id]))\n", "for i in range(len(predictions_set[id])):\n", " if predictions_set[id][i]:\n", " accepted_test_set[i] = y_test_predictions[id][i]\n", "\n", "\n", "# Create a list to represent the heights of the bars of the true class\n", "true_bar_heights = [0] * len(predictions_set[id])\n", "true_bar_heights[y_test[id]] = 1\n" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "x=list(range(len(y_test_predictions[id])))\n", "\n", "bar_width = 0.35\n", "\n", "plt.figure(figsize=(10, 6))\n", "\n", "plt.bar(x, y_test_predictions[id], width=bar_width, color='lightblue', label='Predicted Probabilities')\n", "plt.bar(x, accepted_test_set, width=bar_width, color='lightblue', edgecolor='black', hatch='\\\\', label='Conformal Predicted Classes')\n", "plt.axhline(y=1-qhat, color='red', linestyle='--', label=f'1-qhat = {1-qhat:.3f}')\n", "\n", "plt.bar([i + bar_width for i in x], true_bar_heights, width=bar_width, color='red', label='True Class')\n", "\n", "xticks_positions = [i + bar_width / 2 for i in x]\n", "plt.xticks(xticks_positions, beans_names, rotation=90, fontsize=10)\n", "\n", "plt.xlabel('Class')\n", "plt.ylabel('Probability Estimates')\n", "plt.title('Probability estimates for a test data point')\n", "plt.legend()\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" } }, "nbformat": 4, "nbformat_minor": 2 }