{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 14 - Kaggle Competition\n",
"# Fraud Detection\n",
"\n",
"\n",
"## https://inclass.kaggle.com/c/easy-ml-class\n",
"\n",
"by [Alejandro Correa Bahnsen](albahnsen.com/)\n",
"\n",
"version 0.1, May 2016\n",
"\n",
"## Part of the class [Machine Learning for Security Informatics](https://github.com/albahnsen/ML_SecurityInformatics)\n",
"\n",
"\n",
"This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```pip install tqdm```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fraud Detection"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import zipfile\n",
"with zipfile.ZipFile('../datasets/fraud_transactions_kaggle.csv.zip', 'r') as z:\n",
" f = z.open('fraud_transactions_kaggle.csv')\n",
" data = pd.read_csv(f, index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" card_number | \n",
" type | \n",
" merchant | \n",
" amount | \n",
" fraud | \n",
"
\n",
" \n",
" ID | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2011-01-01 08:00:06 | \n",
" 1942 | \n",
" 2 | \n",
" 8328 | \n",
" 65.16 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2011-01-01 08:00:16 | \n",
" 5629 | \n",
" 2 | \n",
" 42588 | \n",
" 260.84 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 2011-01-01 08:01:28 | \n",
" 408 | \n",
" 2 | \n",
" 15622 | \n",
" 6010.05 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2011-01-01 08:01:43 | \n",
" 859 | \n",
" 2 | \n",
" 45192 | \n",
" 348.46 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 2011-01-01 08:01:48 | \n",
" 3786 | \n",
" 2 | \n",
" 35549 | \n",
" 1160.35 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date card_number type merchant amount fraud\n",
"ID \n",
"0 2011-01-01 08:00:06 1942 2 8328 65.16 0.0\n",
"1 2011-01-01 08:00:16 5629 2 42588 260.84 0.0\n",
"2 2011-01-01 08:01:28 408 2 15622 6010.05 0.0\n",
"3 2011-01-01 08:01:43 859 2 45192 348.46 0.0\n",
"4 2011-01-01 08:01:48 3786 2 35549 1160.35 0.0"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" card_number | \n",
" type | \n",
" merchant | \n",
" amount | \n",
" fraud | \n",
"
\n",
" \n",
" ID | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 199995 | \n",
" 2012-12-31 17:04:18 | \n",
" 4069 | \n",
" 2 | \n",
" 35828 | \n",
" 91.22 | \n",
" NaN | \n",
"
\n",
" \n",
" 199996 | \n",
" 2012-12-31 17:04:51 | \n",
" 9 | \n",
" 2 | \n",
" 46923 | \n",
" 390.95 | \n",
" NaN | \n",
"
\n",
" \n",
" 199997 | \n",
" 2012-12-31 17:05:38 | \n",
" 1481 | \n",
" 1 | \n",
" -1 | \n",
" 0.65 | \n",
" NaN | \n",
"
\n",
" \n",
" 199998 | \n",
" 2012-12-31 17:05:55 | \n",
" 1481 | \n",
" 1 | \n",
" 4535 | \n",
" 390.04 | \n",
" NaN | \n",
"
\n",
" \n",
" 199999 | \n",
" 2012-12-31 17:25:02 | \n",
" 0 | \n",
" 1 | \n",
" 8322 | \n",
" 308.44 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date card_number type merchant amount fraud\n",
"ID \n",
"199995 2012-12-31 17:04:18 4069 2 35828 91.22 NaN\n",
"199996 2012-12-31 17:04:51 9 2 46923 390.95 NaN\n",
"199997 2012-12-31 17:05:38 1481 1 -1 0.65 NaN\n",
"199998 2012-12-31 17:05:55 1481 1 4535 390.04 NaN\n",
"199999 2012-12-31 17:25:02 0 1 8322 308.44 NaN"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.tail()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
" 0.0 171048\n",
"NaN 27909\n",
" 1.0 1043\n",
"Name: fraud, dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.fraud.value_counts(dropna=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Estimate aggregated features"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from datetime import datetime, timedelta\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Split for each account and create the date as index"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 8087/8087 [00:20<00:00, 390.15it/s]\n"
]
}
],
"source": [
"card_numbers = data['card_number'].unique()\n",
"data['trx_id'] = data.index\n",
"data.index = pd.DatetimeIndex(data['date'])\n",
"\n",
"data_ = []\n",
"for card_number in tqdm(card_numbers):\n",
" data_.append(data.query('card_number == ' + str(card_number)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create Aggregated Features for one account\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"res_agg = pd.DataFrame(index=data['trx_id'].values, \n",
" columns=['Trx_sum_7D', 'Trx_count_1D'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"trx = data_[0]\n",
"\n",
"for i in range(trx.shape[0]):\n",
" date = trx.index[i]\n",
" trx_id = int(trx.ix[i, 'trx_id'])\n",
" # Sum 7 D\n",
" agg_ = trx[date-pd.datetools.to_offset('7D').delta:date-timedelta(0,0,1)]\n",
" res_agg.loc[trx_id, 'Trx_sum_7D'] = agg_['amount'].sum()\n",
" # Count 1D\n",
" agg_ = trx[date-pd.datetools.to_offset('1D').delta:date-timedelta(0,0,1)]\n",
" res_agg.loc[trx_id, 'Trx_count_1D'] = agg_['amount'].shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Trx_sum_7D 1054.881429\n",
"Trx_count_1D 0.640693\n",
"dtype: float64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_agg.mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"All accounts"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 8087/8087 [04:26<00:00, 30.33it/s] \n"
]
}
],
"source": [
"for trx in tqdm(data_):\n",
" for i in range(trx.shape[0]):\n",
" date = trx.index[i]\n",
" trx_id = int(trx.ix[i, 'trx_id'])\n",
" # Sum 7 D\n",
" agg_ = trx[date-pd.datetools.to_offset('7D').delta:date-timedelta(0,0,1)]\n",
" res_agg.loc[trx_id, 'Trx_sum_7D'] = agg_['amount'].sum()\n",
" # Count 1D\n",
" agg_ = trx[date-pd.datetools.to_offset('1D').delta:date-timedelta(0,0,1)]\n",
" res_agg.loc[trx_id, 'Trx_count_1D'] = agg_['amount'].shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Trx_sum_7D | \n",
" Trx_count_1D | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Trx_sum_7D Trx_count_1D\n",
"0 0 0\n",
"1 0 0\n",
"2 0 0\n",
"3 0 0\n",
"4 0 0"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_agg.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data.index = data.trx_id"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data = data.join(res_agg)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" card_number | \n",
" type | \n",
" merchant | \n",
" amount | \n",
" fraud | \n",
" trx_id | \n",
" Trx_sum_7D | \n",
" Trx_count_1D | \n",
"
\n",
" \n",
" trx_id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 4082 | \n",
" 2011-01-16 16:26:53 | \n",
" 3558 | \n",
" 2 | \n",
" 13505 | \n",
" 528.82 | \n",
" 0.0 | \n",
" 4082 | \n",
" 307.85 | \n",
" 0 | \n",
"
\n",
" \n",
" 23677 | \n",
" 2011-04-04 08:13:41 | \n",
" 1162 | \n",
" 2 | \n",
" 9417 | \n",
" 117.29 | \n",
" 0.0 | \n",
" 23677 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 30074 | \n",
" 2011-04-29 13:09:07 | \n",
" 0 | \n",
" 1 | \n",
" 56997 | \n",
" 21.29 | \n",
" 0.0 | \n",
" 30074 | \n",
" 14171.9 | \n",
" 2 | \n",
"
\n",
" \n",
" 65426 | \n",
" 2011-09-09 10:11:24 | \n",
" 4420 | \n",
" 2 | \n",
" 57849 | \n",
" 29.70 | \n",
" 0.0 | \n",
" 65426 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 72272 | \n",
" 2011-10-04 10:43:00 | \n",
" 2114 | \n",
" 2 | \n",
" 5109 | \n",
" 2170.65 | \n",
" 0.0 | \n",
" 72272 | \n",
" 131020 | \n",
" 7 | \n",
"
\n",
" \n",
" 74456 | \n",
" 2011-10-11 17:17:22 | \n",
" 2148 | \n",
" 2 | \n",
" 1341 | \n",
" 2150.19 | \n",
" 0.0 | \n",
" 74456 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 84660 | \n",
" 2011-11-19 17:06:58 | \n",
" 1521 | \n",
" 1 | \n",
" 35294 | \n",
" 651.59 | \n",
" 0.0 | \n",
" 84660 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 117167 | \n",
" 2012-04-01 12:33:33 | \n",
" 1471 | \n",
" 1 | \n",
" -1 | \n",
" 650.94 | \n",
" 0.0 | \n",
" 117167 | \n",
" 4381.21 | \n",
" 1 | \n",
"
\n",
" \n",
" 119737 | \n",
" 2012-04-09 14:27:12 | \n",
" 2723 | \n",
" 1 | \n",
" 38616 | \n",
" 13.03 | \n",
" 0.0 | \n",
" 119737 | \n",
" 13614.2 | \n",
" 0 | \n",
"
\n",
" \n",
" 132467 | \n",
" 2012-05-27 16:43:11 | \n",
" 4857 | \n",
" 2 | \n",
" 45373 | \n",
" 41.70 | \n",
" 0.0 | \n",
" 132467 | \n",
" 634.13 | \n",
" 10 | \n",
"
\n",
" \n",
" 134858 | \n",
" 2012-06-03 17:05:21 | \n",
" 2114 | \n",
" 1 | \n",
" 18692 | \n",
" 26.06 | \n",
" 0.0 | \n",
" 134858 | \n",
" 175202 | \n",
" 7 | \n",
"
\n",
" \n",
" 142133 | \n",
" 2012-06-29 16:21:37 | \n",
" 7588 | \n",
" 2 | \n",
" 35991 | \n",
" 92.53 | \n",
" 0.0 | \n",
" 142133 | \n",
" 1151.21 | \n",
" 4 | \n",
"
\n",
" \n",
" 158154 | \n",
" 2012-08-20 10:55:23 | \n",
" 4420 | \n",
" 2 | \n",
" 53353 | \n",
" 182.65 | \n",
" 0.0 | \n",
" 158154 | \n",
" 121.77 | \n",
" 0 | \n",
"
\n",
" \n",
" 176418 | \n",
" 2012-10-16 14:23:04 | \n",
" 1595 | \n",
" 2 | \n",
" 25985 | \n",
" 15397.58 | \n",
" NaN | \n",
" 176418 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 186433 | \n",
" 2012-11-20 11:04:00 | \n",
" 4923 | \n",
" 2 | \n",
" 36010 | \n",
" 217.89 | \n",
" NaN | \n",
" 186433 | \n",
" 573.4 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date card_number type merchant amount fraud \\\n",
"trx_id \n",
"4082 2011-01-16 16:26:53 3558 2 13505 528.82 0.0 \n",
"23677 2011-04-04 08:13:41 1162 2 9417 117.29 0.0 \n",
"30074 2011-04-29 13:09:07 0 1 56997 21.29 0.0 \n",
"65426 2011-09-09 10:11:24 4420 2 57849 29.70 0.0 \n",
"72272 2011-10-04 10:43:00 2114 2 5109 2170.65 0.0 \n",
"74456 2011-10-11 17:17:22 2148 2 1341 2150.19 0.0 \n",
"84660 2011-11-19 17:06:58 1521 1 35294 651.59 0.0 \n",
"117167 2012-04-01 12:33:33 1471 1 -1 650.94 0.0 \n",
"119737 2012-04-09 14:27:12 2723 1 38616 13.03 0.0 \n",
"132467 2012-05-27 16:43:11 4857 2 45373 41.70 0.0 \n",
"134858 2012-06-03 17:05:21 2114 1 18692 26.06 0.0 \n",
"142133 2012-06-29 16:21:37 7588 2 35991 92.53 0.0 \n",
"158154 2012-08-20 10:55:23 4420 2 53353 182.65 0.0 \n",
"176418 2012-10-16 14:23:04 1595 2 25985 15397.58 NaN \n",
"186433 2012-11-20 11:04:00 4923 2 36010 217.89 NaN \n",
"\n",
" trx_id Trx_sum_7D Trx_count_1D \n",
"trx_id \n",
"4082 4082 307.85 0 \n",
"23677 23677 0 0 \n",
"30074 30074 14171.9 2 \n",
"65426 65426 0 0 \n",
"72272 72272 131020 7 \n",
"74456 74456 0 0 \n",
"84660 84660 0 0 \n",
"117167 117167 4381.21 1 \n",
"119737 119737 13614.2 0 \n",
"132467 132467 634.13 10 \n",
"134858 134858 175202 7 \n",
"142133 142133 1151.21 4 \n",
"158154 158154 121.77 0 \n",
"176418 176418 0 0 \n",
"186433 186433 573.4 0 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.sample(15, random_state=42).sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Split train and test"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X = data.loc[~data.fraud.isnull()]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y = X.fraud"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X = X.drop(['fraud', 'date', 'card_number'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_kaggle = data.loc[data.fraud.isnull()]\n",
"X_kaggle = X_kaggle.drop(['fraud', 'date', 'card_number'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" type | \n",
" merchant | \n",
" amount | \n",
" trx_id | \n",
" Trx_sum_7D | \n",
" Trx_count_1D | \n",
"
\n",
" \n",
" trx_id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 172091 | \n",
" 2 | \n",
" 13273 | \n",
" 208.51 | \n",
" 172091 | \n",
" 120165 | \n",
" 14 | \n",
"
\n",
" \n",
" 172092 | \n",
" 2 | \n",
" 34472 | \n",
" 525.05 | \n",
" 172092 | \n",
" 71042.4 | \n",
" 0 | \n",
"
\n",
" \n",
" 172093 | \n",
" 2 | \n",
" 37909 | \n",
" 802.24 | \n",
" 172093 | \n",
" 120374 | \n",
" 15 | \n",
"
\n",
" \n",
" 172094 | \n",
" 2 | \n",
" 35167 | \n",
" 130.32 | \n",
" 172094 | \n",
" 90638.1 | \n",
" 9 | \n",
"
\n",
" \n",
" 172095 | \n",
" 2 | \n",
" 35073 | \n",
" 9696.96 | \n",
" 172095 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" type merchant amount trx_id Trx_sum_7D Trx_count_1D\n",
"trx_id \n",
"172091 2 13273 208.51 172091 120165 14\n",
"172092 2 34472 525.05 172092 71042.4 0\n",
"172093 2 37909 802.24 172093 120374 15\n",
"172094 2 35167 130.32 172094 90638.1 9\n",
"172095 2 35073 9696.96 172095 0 0"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_kaggle.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Simple Random Forest"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight='balanced')"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.metrics import fbeta_score"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"KFold cross-validation"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.cross_validation import KFold"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"kf = KFold(X.shape[0], n_folds=5)\n",
"res = []\n",
"for train, test in kf:\n",
" X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]\n",
" clf.fit(X_train, y_train)\n",
" y_pred_proba = clf.predict_proba(X_test)[:, 1]\n",
" y_pred = (y_pred_proba>0.05).astype(int)\n",
" res.append(fbeta_score(y_test, y_pred, beta=2))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"count 5.000000\n",
"mean 0.078145\n",
"std 0.032472\n",
"min 0.054945\n",
"25% 0.057692\n",
"50% 0.062500\n",
"75% 0.082713\n",
"max 0.132877\n",
"dtype: float64"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.Series(res).describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Train with all"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Predict and send to Kaggle"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False)"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred = clf.predict_proba(X_kaggle)[:, 1]"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"y_pred = (y_pred>0.05).astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred = pd.Series(y_pred,name='fraud', index=X_kaggle.index)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"trx_id\n",
"172091 0\n",
"172092 1\n",
"172093 1\n",
"172094 0\n",
"172095 1\n",
"172096 1\n",
"172097 1\n",
"172098 0\n",
"172099 1\n",
"172100 0\n",
"Name: fraud, dtype: int64"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_pred.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y_pred.to_csv('fraud_transactions_kaggle_1.csv', header=True, index_label='ID')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Main Issues"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* Class imbalance\n",
"* Feature creation\n",
"* Model selection\n",
"* Threshold selection"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}