{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 14 - Kaggle Competition\n", "# Fraud Detection\n", "\n", "\n", "## https://inclass.kaggle.com/c/easy-ml-class\n", "\n", "by [Alejandro Correa Bahnsen](albahnsen.com/)\n", "\n", "version 0.1, May 2016\n", "\n", "## Part of the class [Machine Learning for Security Informatics](https://github.com/albahnsen/ML_SecurityInformatics)\n", "\n", "\n", "This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```pip install tqdm```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fraud Detection" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "import zipfile\n", "with zipfile.ZipFile('../datasets/fraud_transactions_kaggle.csv.zip', 'r') as z:\n", " f = z.open('fraud_transactions_kaggle.csv')\n", " data = pd.read_csv(f, index_col=0)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datecard_numbertypemerchantamountfraud
ID
02011-01-01 08:00:0619422832865.160.0
12011-01-01 08:00:165629242588260.840.0
22011-01-01 08:01:284082156226010.050.0
32011-01-01 08:01:43859245192348.460.0
42011-01-01 08:01:4837862355491160.350.0
\n", "
" ], "text/plain": [ " date card_number type merchant amount fraud\n", "ID \n", "0 2011-01-01 08:00:06 1942 2 8328 65.16 0.0\n", "1 2011-01-01 08:00:16 5629 2 42588 260.84 0.0\n", "2 2011-01-01 08:01:28 408 2 15622 6010.05 0.0\n", "3 2011-01-01 08:01:43 859 2 45192 348.46 0.0\n", "4 2011-01-01 08:01:48 3786 2 35549 1160.35 0.0" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datecard_numbertypemerchantamountfraud
ID
1999952012-12-31 17:04:18406923582891.22NaN
1999962012-12-31 17:04:519246923390.95NaN
1999972012-12-31 17:05:3814811-10.65NaN
1999982012-12-31 17:05:55148114535390.04NaN
1999992012-12-31 17:25:02018322308.44NaN
\n", "
" ], "text/plain": [ " date card_number type merchant amount fraud\n", "ID \n", "199995 2012-12-31 17:04:18 4069 2 35828 91.22 NaN\n", "199996 2012-12-31 17:04:51 9 2 46923 390.95 NaN\n", "199997 2012-12-31 17:05:38 1481 1 -1 0.65 NaN\n", "199998 2012-12-31 17:05:55 1481 1 4535 390.04 NaN\n", "199999 2012-12-31 17:25:02 0 1 8322 308.44 NaN" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.tail()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ " 0.0 171048\n", "NaN 27909\n", " 1.0 1043\n", "Name: fraud, dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.fraud.value_counts(dropna=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Estimate aggregated features" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from datetime import datetime, timedelta\n", "from tqdm import tqdm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Split for each account and create the date as index" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 8087/8087 [00:20<00:00, 390.15it/s]\n" ] } ], "source": [ "card_numbers = data['card_number'].unique()\n", "data['trx_id'] = data.index\n", "data.index = pd.DatetimeIndex(data['date'])\n", "\n", "data_ = []\n", "for card_number in tqdm(card_numbers):\n", " data_.append(data.query('card_number == ' + str(card_number)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create Aggregated Features for one account\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "res_agg = pd.DataFrame(index=data['trx_id'].values, \n", " columns=['Trx_sum_7D', 'Trx_count_1D'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "trx = data_[0]\n", "\n", "for i in range(trx.shape[0]):\n", " date = trx.index[i]\n", " trx_id = int(trx.ix[i, 'trx_id'])\n", " # Sum 7 D\n", " agg_ = trx[date-pd.datetools.to_offset('7D').delta:date-timedelta(0,0,1)]\n", " res_agg.loc[trx_id, 'Trx_sum_7D'] = agg_['amount'].sum()\n", " # Count 1D\n", " agg_ = trx[date-pd.datetools.to_offset('1D').delta:date-timedelta(0,0,1)]\n", " res_agg.loc[trx_id, 'Trx_count_1D'] = agg_['amount'].shape[0]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Trx_sum_7D 1054.881429\n", "Trx_count_1D 0.640693\n", "dtype: float64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res_agg.mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "All accounts" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 8087/8087 [04:26<00:00, 30.33it/s] \n" ] } ], "source": [ "for trx in tqdm(data_):\n", " for i in range(trx.shape[0]):\n", " date = trx.index[i]\n", " trx_id = int(trx.ix[i, 'trx_id'])\n", " # Sum 7 D\n", " agg_ = trx[date-pd.datetools.to_offset('7D').delta:date-timedelta(0,0,1)]\n", " res_agg.loc[trx_id, 'Trx_sum_7D'] = agg_['amount'].sum()\n", " # Count 1D\n", " agg_ = trx[date-pd.datetools.to_offset('1D').delta:date-timedelta(0,0,1)]\n", " res_agg.loc[trx_id, 'Trx_count_1D'] = agg_['amount'].shape[0]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Trx_sum_7DTrx_count_1D
000
100
200
300
400
\n", "
" ], "text/plain": [ " Trx_sum_7D Trx_count_1D\n", "0 0 0\n", "1 0 0\n", "2 0 0\n", "3 0 0\n", "4 0 0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res_agg.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data.index = data.trx_id" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = data.join(res_agg)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datecard_numbertypemerchantamountfraudtrx_idTrx_sum_7DTrx_count_1D
trx_id
40822011-01-16 16:26:533558213505528.820.04082307.850
236772011-04-04 08:13:41116229417117.290.02367700
300742011-04-29 13:09:07015699721.290.03007414171.92
654262011-09-09 10:11:24442025784929.700.06542600
722722011-10-04 10:43:002114251092170.650.0722721310207
744562011-10-11 17:17:222148213412150.190.07445600
846602011-11-19 17:06:581521135294651.590.08466000
1171672012-04-01 12:33:3314711-1650.940.01171674381.211
1197372012-04-09 14:27:12272313861613.030.011973713614.20
1324672012-05-27 16:43:11485724537341.700.0132467634.1310
1348582012-06-03 17:05:21211411869226.060.01348581752027
1421332012-06-29 16:21:37758823599192.530.01421331151.214
1581542012-08-20 10:55:234420253353182.650.0158154121.770
1764182012-10-16 14:23:04159522598515397.58NaN17641800
1864332012-11-20 11:04:004923236010217.89NaN186433573.40
\n", "
" ], "text/plain": [ " date card_number type merchant amount fraud \\\n", "trx_id \n", "4082 2011-01-16 16:26:53 3558 2 13505 528.82 0.0 \n", "23677 2011-04-04 08:13:41 1162 2 9417 117.29 0.0 \n", "30074 2011-04-29 13:09:07 0 1 56997 21.29 0.0 \n", "65426 2011-09-09 10:11:24 4420 2 57849 29.70 0.0 \n", "72272 2011-10-04 10:43:00 2114 2 5109 2170.65 0.0 \n", "74456 2011-10-11 17:17:22 2148 2 1341 2150.19 0.0 \n", "84660 2011-11-19 17:06:58 1521 1 35294 651.59 0.0 \n", "117167 2012-04-01 12:33:33 1471 1 -1 650.94 0.0 \n", "119737 2012-04-09 14:27:12 2723 1 38616 13.03 0.0 \n", "132467 2012-05-27 16:43:11 4857 2 45373 41.70 0.0 \n", "134858 2012-06-03 17:05:21 2114 1 18692 26.06 0.0 \n", "142133 2012-06-29 16:21:37 7588 2 35991 92.53 0.0 \n", "158154 2012-08-20 10:55:23 4420 2 53353 182.65 0.0 \n", "176418 2012-10-16 14:23:04 1595 2 25985 15397.58 NaN \n", "186433 2012-11-20 11:04:00 4923 2 36010 217.89 NaN \n", "\n", " trx_id Trx_sum_7D Trx_count_1D \n", "trx_id \n", "4082 4082 307.85 0 \n", "23677 23677 0 0 \n", "30074 30074 14171.9 2 \n", "65426 65426 0 0 \n", "72272 72272 131020 7 \n", "74456 74456 0 0 \n", "84660 84660 0 0 \n", "117167 117167 4381.21 1 \n", "119737 119737 13614.2 0 \n", "132467 132467 634.13 10 \n", "134858 134858 175202 7 \n", "142133 142133 1151.21 4 \n", "158154 158154 121.77 0 \n", "176418 176418 0 0 \n", "186433 186433 573.4 0 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.sample(15, random_state=42).sort_index()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Split train and test" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X = data.loc[~data.fraud.isnull()]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y = X.fraud" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X = X.drop(['fraud', 'date', 'card_number'], axis=1)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X_kaggle = data.loc[data.fraud.isnull()]\n", "X_kaggle = X_kaggle.drop(['fraud', 'date', 'card_number'], axis=1)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
typemerchantamounttrx_idTrx_sum_7DTrx_count_1D
trx_id
172091213273208.5117209112016514
172092234472525.0517209271042.40
172093237909802.2417209312037415
172094235167130.3217209490638.19
1720952350739696.9617209500
\n", "
" ], "text/plain": [ " type merchant amount trx_id Trx_sum_7D Trx_count_1D\n", "trx_id \n", "172091 2 13273 208.51 172091 120165 14\n", "172092 2 34472 525.05 172092 71042.4 0\n", "172093 2 37909 802.24 172093 120374 15\n", "172094 2 35167 130.32 172094 90638.1 9\n", "172095 2 35073 9696.96 172095 0 0" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_kaggle.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Simple Random Forest" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight='balanced')" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.metrics import fbeta_score" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "KFold cross-validation" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.cross_validation import KFold" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [], "source": [ "kf = KFold(X.shape[0], n_folds=5)\n", "res = []\n", "for train, test in kf:\n", " X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]\n", " clf.fit(X_train, y_train)\n", " y_pred_proba = clf.predict_proba(X_test)[:, 1]\n", " y_pred = (y_pred_proba>0.05).astype(int)\n", " res.append(fbeta_score(y_test, y_pred, beta=2))" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "count 5.000000\n", "mean 0.078145\n", "std 0.032472\n", "min 0.054945\n", "25% 0.057692\n", "50% 0.062500\n", "75% 0.082713\n", "max 0.132877\n", "dtype: float64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.Series(res).describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Train with all" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Predict and send to Kaggle" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,\n", " oob_score=False, random_state=None, verbose=0,\n", " warm_start=False)" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf.fit(X, y)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y_pred = clf.predict_proba(X_kaggle)[:, 1]" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [], "source": [ "y_pred = (y_pred>0.05).astype(int)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y_pred = pd.Series(y_pred,name='fraud', index=X_kaggle.index)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "trx_id\n", "172091 0\n", "172092 1\n", "172093 1\n", "172094 0\n", "172095 1\n", "172096 1\n", "172097 1\n", "172098 0\n", "172099 1\n", "172100 0\n", "Name: fraud, dtype: int64" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred.head(10)" ] }, { "cell_type": "code", "execution_count": 108, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y_pred.to_csv('fraud_transactions_kaggle_1.csv', header=True, index_label='ID')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Main Issues" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Class imbalance\n", "* Feature creation\n", "* Model selection\n", "* Threshold selection" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" } }, "nbformat": 4, "nbformat_minor": 0 }