{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Imbalanced Learning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Artículo completo en www.aprendemachinelearning.com" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Manejo de clases desbalanceadas con la librería Python ImbLearn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "No olvides instalar con:
pip install -U imbalanced-learn" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-05-14T23:03:49.648431Z", "start_time": "2019-05-14T23:03:42.916007Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/jbagnato/anaconda3/envs/python36/lib/python3.6/site-packages/sklearn/externals/six.py:31: DeprecationWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).\n", " \"(https://pypi.org/project/six/).\", DeprecationWarning)\n", "/Users/jbagnato/anaconda3/envs/python36/lib/python3.6/site-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", " warnings.warn(msg, category=DeprecationWarning)\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import classification_report\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.decomposition import PCA\n", "from sklearn.tree import DecisionTreeClassifier\n", "\n", "from pylab import rcParams\n", "\n", "from imblearn.under_sampling import NearMiss\n", "from imblearn.over_sampling import RandomOverSampler\n", "from imblearn.combine import SMOTETomek\n", "from imblearn.ensemble import BalancedBaggingClassifier\n", "\n", "from collections import Counter\n", "\n", "# Set up the graphic style. The xkcd.com color scheme is optional and is currently commented out.\n", 
"rcParams['figure.figsize'] = 14, 8.7 # width/height ~ 1.61, approximately the golden ratio\n", "LABELS = [\"Normal\",\"Fraud\"]\n", "#col_list = [\"cerulean\",\"scarlet\"]# https://xkcd.com/color/rgb/\n", "#sns.set(style='white', font_scale=1.75, palette=sns.xkcd_palette(col_list))\n", "\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset Credit Card Fraud Detection" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Descarga de Kaggle en https://www.kaggle.com/mlg-ulb/creditcardfraud/data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-05-14T23:03:55.243246Z", "start_time": "2019-05-14T23:03:49.651846Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " | Time | \n", "V1 | \n", "V2 | \n", "V3 | \n", "V4 | \n", "V5 | \n", "V6 | \n", "V7 | \n", "V8 | \n", "V9 | \n", "... | \n", "V21 | \n", "V22 | \n", "V23 | \n", "V24 | \n", "V25 | \n", "V26 | \n", "V27 | \n", "V28 | \n", "Amount | \n", "Class | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.0 | \n", "-1.359807 | \n", "-0.072781 | \n", "2.536347 | \n", "1.378155 | \n", "-0.338321 | \n", "0.462388 | \n", "0.239599 | \n", "0.098698 | \n", "0.363787 | \n", "... | \n", "-0.018307 | \n", "0.277838 | \n", "-0.110474 | \n", "0.066928 | \n", "0.128539 | \n", "-0.189115 | \n", "0.133558 | \n", "-0.021053 | \n", "149.62 | \n", "0 | \n", "
1 | \n", "0.0 | \n", "1.191857 | \n", "0.266151 | \n", "0.166480 | \n", "0.448154 | \n", "0.060018 | \n", "-0.082361 | \n", "-0.078803 | \n", "0.085102 | \n", "-0.255425 | \n", "... | \n", "-0.225775 | \n", "-0.638672 | \n", "0.101288 | \n", "-0.339846 | \n", "0.167170 | \n", "0.125895 | \n", "-0.008983 | \n", "0.014724 | \n", "2.69 | \n", "0 | \n", "
2 | \n", "1.0 | \n", "-1.358354 | \n", "-1.340163 | \n", "1.773209 | \n", "0.379780 | \n", "-0.503198 | \n", "1.800499 | \n", "0.791461 | \n", "0.247676 | \n", "-1.514654 | \n", "... | \n", "0.247998 | \n", "0.771679 | \n", "0.909412 | \n", "-0.689281 | \n", "-0.327642 | \n", "-0.139097 | \n", "-0.055353 | \n", "-0.059752 | \n", "378.66 | \n", "0 | \n", "
3 | \n", "1.0 | \n", "-0.966272 | \n", "-0.185226 | \n", "1.792993 | \n", "-0.863291 | \n", "-0.010309 | \n", "1.247203 | \n", "0.237609 | \n", "0.377436 | \n", "-1.387024 | \n", "... | \n", "-0.108300 | \n", "0.005274 | \n", "-0.190321 | \n", "-1.175575 | \n", "0.647376 | \n", "-0.221929 | \n", "0.062723 | \n", "0.061458 | \n", "123.50 | \n", "0 | \n", "
4 | \n", "2.0 | \n", "-1.158233 | \n", "0.877737 | \n", "1.548718 | \n", "0.403034 | \n", "-0.407193 | \n", "0.095921 | \n", "0.592941 | \n", "-0.270533 | \n", "0.817739 | \n", "... | \n", "-0.009431 | \n", "0.798278 | \n", "-0.137458 | \n", "0.141267 | \n", "-0.206010 | \n", "0.502292 | \n", "0.219422 | \n", "0.215153 | \n", "69.99 | \n", "0 | \n", "
5 rows × 31 columns
\n", "\n", " | algorithm | \n", "precision | \n", "recall | \n", "overall | \n", "
---|---|---|---|---|
1 | \n", "Penalizacion | \n", "1.0 | \n", "0.93 | \n", "0.965 | \n", "
2 | \n", "NearMiss Subsampling | \n", "1.0 | \n", "0.93 | \n", "0.965 | \n", "
3 | \n", "Random Oversampling | \n", "1.0 | \n", "0.89 | \n", "0.945 | \n", "
5 | \n", "Ensemble | \n", "1.0 | \n", "0.88 | \n", "0.940 | \n", "
4 | \n", "Smote Tomek | \n", "1.0 | \n", "0.85 | \n", "0.925 | \n", "
0 | \n", "Regresion Logística | \n", "1.0 | \n", "0.66 | \n", "0.830 | \n", "