{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Full width\n", "# use the public IPython.display module; importing these names from\n", "# IPython.core.display is deprecated in newer IPython releases\n", "from IPython.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import math\n", "import os\n", "import subprocess\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "from IPython.display import display\n", "\n", "# Project-local helpers. The wildcard imports are kept because later cells call\n", "# gen_one_hot_feat, agg_common_data and feature_evaluate without qualification;\n", "# which of the two modules defines each helper is not visible from this notebook.\n", "from lib_modeling import *\n", "from lib_feature_engineering import *\n", "\n", "# some settings for displaying Pandas results\n", "pd.set_option('display.width', 2000)\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.precision', 4)\n", "# Do not truncate cell contents. Newer pandas expects None (no limit) and\n", "# rejects -1, while older releases only accept an int, so fall back to -1.\n", "try:\n", "    pd.set_option('display.max_colwidth', None)\n", "except ValueError:\n", "    pd.set_option('display.max_colwidth', -1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load data\n", "\n", "- Load train + tvt = train_filtered for features evaluation\n", "- Load train/test for applying mean encoding" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRTARGET
01000021
11000030
21000040
31000060
41000070
\n", "
" ], "text/plain": [ " SK_ID_CURR TARGET\n", "0 100002 1 \n", "1 100003 0 \n", "2 100004 0 \n", "3 100006 0 \n", "4 100007 0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load train/test data\n", "data_path = \"home-credit-default-risk/application_train.csv\"\n", "pdf_train = pd.read_csv(data_path)\n", "\n", "data_path = \"home-credit-default-risk/application_test.csv\"\n", "pdf_test = pd.read_csv(data_path)\n", "\n", "# filter by tvt code\n", "pdf_tvt_extend = pd.read_pickle(\"pdf_tvt_extend.pkl\", compression=\"bz2\")\n", "pdf_train_filtered = (pdf_tvt_extend.query(\"tvt_code == 'train'\")\n", " .merge(pdf_train[[\"SK_ID_CURR\"]], on=\"SK_ID_CURR\")\n", " .drop(columns=[\"tvt_code\"]))\n", "pdf_train_filtered.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(3840312, 23)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_PREVSK_ID_CURRMONTHS_BALANCEAMT_BALANCEAMT_CREDIT_LIMIT_ACTUALAMT_DRAWINGS_ATM_CURRENTAMT_DRAWINGS_CURRENTAMT_DRAWINGS_OTHER_CURRENTAMT_DRAWINGS_POS_CURRENTAMT_INST_MIN_REGULARITYAMT_PAYMENT_CURRENTAMT_PAYMENT_TOTAL_CURRENTAMT_RECEIVABLE_PRINCIPALAMT_RECIVABLEAMT_TOTAL_RECEIVABLECNT_DRAWINGS_ATM_CURRENTCNT_DRAWINGS_CURRENTCNT_DRAWINGS_OTHER_CURRENTCNT_DRAWINGS_POS_CURRENTCNT_INSTALMENT_MATURE_CUMNAME_CONTRACT_STATUSSK_DPDSK_DPD_DEF
02562384378907-656.9701350000.0877.50.0877.51700.3251800.01800.00.0000.0000.0000.010.01.035.0Active00
12582071363914-163975.555450002250.02250.00.00.02250.0002250.02250.060175.08064875.55564875.5551.010.00.069.0Active00
21740877371185-731815.2254500000.00.00.00.02250.0002250.02250.026926.42531460.08531460.0850.000.00.030.0Active00
31389973337855-4236572.1102250002250.02250.00.00.011795.76011925.011925.0224949.285233048.970233048.9701.010.00.010.0Active00
41891521126868-1453919.4554500000.011547.00.011547.022924.89027000.027000.0443044.395453919.455453919.4550.010.01.0101.0Active00
\n", "
" ], "text/plain": [ " SK_ID_PREV SK_ID_CURR MONTHS_BALANCE AMT_BALANCE AMT_CREDIT_LIMIT_ACTUAL AMT_DRAWINGS_ATM_CURRENT AMT_DRAWINGS_CURRENT AMT_DRAWINGS_OTHER_CURRENT AMT_DRAWINGS_POS_CURRENT AMT_INST_MIN_REGULARITY AMT_PAYMENT_CURRENT AMT_PAYMENT_TOTAL_CURRENT AMT_RECEIVABLE_PRINCIPAL AMT_RECIVABLE AMT_TOTAL_RECEIVABLE CNT_DRAWINGS_ATM_CURRENT CNT_DRAWINGS_CURRENT CNT_DRAWINGS_OTHER_CURRENT CNT_DRAWINGS_POS_CURRENT CNT_INSTALMENT_MATURE_CUM NAME_CONTRACT_STATUS SK_DPD SK_DPD_DEF\n", "0 2562384 378907 -6 56.970 135000 0.0 877.5 0.0 877.5 1700.325 1800.0 1800.0 0.000 0.000 0.000 0.0 1 0.0 1.0 35.0 Active 0 0 \n", "1 2582071 363914 -1 63975.555 45000 2250.0 2250.0 0.0 0.0 2250.000 2250.0 2250.0 60175.080 64875.555 64875.555 1.0 1 0.0 0.0 69.0 Active 0 0 \n", "2 1740877 371185 -7 31815.225 450000 0.0 0.0 0.0 0.0 2250.000 2250.0 2250.0 26926.425 31460.085 31460.085 0.0 0 0.0 0.0 30.0 Active 0 0 \n", "3 1389973 337855 -4 236572.110 225000 2250.0 2250.0 0.0 0.0 11795.760 11925.0 11925.0 224949.285 233048.970 233048.970 1.0 1 0.0 0.0 10.0 Active 0 0 \n", "4 1891521 126868 -1 453919.455 450000 0.0 11547.0 0.0 11547.0 22924.890 27000.0 27000.0 443044.395 453919.455 453919.455 0.0 1 0.0 1.0 101.0 Active 0 0 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load previous application\n", "data_path = \"home-credit-default-risk/credit_card_balance.csv\"\n", "pdf_data = pd.read_csv(data_path)\n", "print(pdf_data.shape)\n", "pdf_data.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# load meta data\n", "meta_path = \"../02_pandas/reports/report_credit_card_balance.csv\"\n", "pdf_meta = pd.read_csv(meta_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# DPD handling" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "pdf_data[\"is_DPD\"] = (pdf_data[\"SK_DPD\"] > 0).astype(int)\n", "pdf_data[\"is_DPD_DEF\"] = 
(pdf_data[\"SK_DPD_DEF\"] > 0).astype(int)\n", "\n", "# drop columns\n", "pdf_data.drop(columns=[\"SK_DPD\", \"SK_DPD_DEF\"], inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Categorical features" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['NAME_CONTRACT_STATUS']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get list categorical attributes\n", "ls_cate = pdf_meta.query(\"sub_type == 'object'\")[\"name\"].tolist()\n", "ls_cate" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# construct categorical mapping\n", "dict_onehot = {}\n", "for cate in ls_cate:\n", " ls_val = pdf_data[cate].value_counts().index.tolist()\n", " dict_onehot[cate] = ls_val\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### one hot" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(3840312, 8)\n", "CPU times: user 23 s, sys: 1.73 s, total: 24.8 s\n", "Wall time: 11.4 s\n" ] } ], "source": [ "%%time\n", "pdf_onehot = gen_one_hot_feat(pdf_data, dict_onehot, main_key=\"SK_ID_CURR\")\n", "print(pdf_onehot.shape)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "{'NAME_CONTRACT_STATUS_Active': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Approved': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Completed': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Demand': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Refused': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Sent_proposal': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Signed': ['max', 'sum', 'mean']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: 
(103558, 21)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
1NAME_CONTRACT_STATUS_Active_sum0.5549-0.05911.0
2NAME_CONTRACT_STATUS_Active_mean0.52100.02351.0
7NAME_CONTRACT_STATUS_Completed_sum0.5155-0.02161.0
8NAME_CONTRACT_STATUS_Completed_mean0.5155-0.02351.0
6NAME_CONTRACT_STATUS_Completed_max0.5154-0.02641.0
19NAME_CONTRACT_STATUS_Signed_sum0.5058-0.00661.0
18NAME_CONTRACT_STATUS_Signed_max0.5058-0.01501.0
20NAME_CONTRACT_STATUS_Signed_mean0.5056-0.00531.0
11NAME_CONTRACT_STATUS_Sent_proposal_mean0.5016-0.01271.0
10NAME_CONTRACT_STATUS_Sent_proposal_sum0.5016-0.01261.0
9NAME_CONTRACT_STATUS_Sent_proposal_max0.5016-0.01261.0
0NAME_CONTRACT_STATUS_Active_max0.50030.00681.0
13NAME_CONTRACT_STATUS_Demand_sum0.50010.00741.0
14NAME_CONTRACT_STATUS_Demand_mean0.50010.00651.0
12NAME_CONTRACT_STATUS_Demand_max0.50010.00451.0
5NAME_CONTRACT_STATUS_Approved_mean0.5000-0.00281.0
4NAME_CONTRACT_STATUS_Approved_sum0.5000-0.00281.0
3NAME_CONTRACT_STATUS_Approved_max0.5000-0.00281.0
15NAME_CONTRACT_STATUS_Refused_max0.50000.00101.0
16NAME_CONTRACT_STATUS_Refused_sum0.50000.00101.0
17NAME_CONTRACT_STATUS_Refused_mean0.50000.00051.0
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "1 NAME_CONTRACT_STATUS_Active_sum 0.5549 -0.0591 1.0 \n", "2 NAME_CONTRACT_STATUS_Active_mean 0.5210 0.0235 1.0 \n", "7 NAME_CONTRACT_STATUS_Completed_sum 0.5155 -0.0216 1.0 \n", "8 NAME_CONTRACT_STATUS_Completed_mean 0.5155 -0.0235 1.0 \n", "6 NAME_CONTRACT_STATUS_Completed_max 0.5154 -0.0264 1.0 \n", "19 NAME_CONTRACT_STATUS_Signed_sum 0.5058 -0.0066 1.0 \n", "18 NAME_CONTRACT_STATUS_Signed_max 0.5058 -0.0150 1.0 \n", "20 NAME_CONTRACT_STATUS_Signed_mean 0.5056 -0.0053 1.0 \n", "11 NAME_CONTRACT_STATUS_Sent_proposal_mean 0.5016 -0.0127 1.0 \n", "10 NAME_CONTRACT_STATUS_Sent_proposal_sum 0.5016 -0.0126 1.0 \n", "9 NAME_CONTRACT_STATUS_Sent_proposal_max 0.5016 -0.0126 1.0 \n", "0 NAME_CONTRACT_STATUS_Active_max 0.5003 0.0068 1.0 \n", "13 NAME_CONTRACT_STATUS_Demand_sum 0.5001 0.0074 1.0 \n", "14 NAME_CONTRACT_STATUS_Demand_mean 0.5001 0.0065 1.0 \n", "12 NAME_CONTRACT_STATUS_Demand_max 0.5001 0.0045 1.0 \n", "5 NAME_CONTRACT_STATUS_Approved_mean 0.5000 -0.0028 1.0 \n", "4 NAME_CONTRACT_STATUS_Approved_sum 0.5000 -0.0028 1.0 \n", "3 NAME_CONTRACT_STATUS_Approved_max 0.5000 -0.0028 1.0 \n", "15 NAME_CONTRACT_STATUS_Refused_max 0.5000 0.0010 1.0 \n", "16 NAME_CONTRACT_STATUS_Refused_sum 0.5000 0.0010 1.0 \n", "17 NAME_CONTRACT_STATUS_Refused_mean 0.5000 0.0005 1.0 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.29 s, sys: 147 ms, total: 3.44 s\n", "Wall time: 1.65 s\n" ] } ], "source": [ "%%time\n", "pdf_agg01 = agg_common_data(pdf_onehot, [\"max\", \"sum\", \"mean\"], main_key=\"SK_ID_CURR\")\n", "eval_agg01 = feature_evaluate(pdf_train_filtered, pdf_agg01)\n", "display(eval_agg01)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(10, 4)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eval_agg01.query(\"auc <= 0.501\").shape" ] }, 
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(103558, 11)\n" ] } ], "source": [ "# keep only the one-hot aggregates whose AUC is above the 0.501 cut-off\n", "sel_feat = eval_agg01.query(\"auc > 0.501\")[\"name\"].tolist()\n", "pdf_agg01 = pdf_agg01[sel_feat]\n", "print(pdf_agg01.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Numerical features" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['MONTHS_BALANCE',\n", " 'AMT_CREDIT_LIMIT_ACTUAL',\n", " 'CNT_DRAWINGS_CURRENT',\n", " 'is_DPD',\n", " 'is_DPD_DEF']" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get list numerical attributes\n", "# Select every integer-typed column instead of matching the literal \"int64\":\n", "# the is_DPD / is_DPD_DEF flags are created with .astype(int), whose width\n", "# follows the platform default, so an exact \"int64\" match can silently skip them.\n", "key_cols = [\"SK_ID_PREV\", \"SK_ID_CURR\"]\n", "ls_int = pdf_data.select_dtypes(include=[np.integer]).columns.tolist()\n", "# the ID columns are join keys, not features\n", "ls_num = [col for col in ls_int if col not in key_cols]\n", "ls_num" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_PREVSK_ID_CURRMONTHS_BALANCEAMT_CREDIT_LIMIT_ACTUALCNT_DRAWINGS_CURRENTis_DPDis_DPD_DEF
02562384378907-6135000100
12582071363914-145000100
21740877371185-7450000000
31389973337855-4225000100
41891521126868-1450000100
\n", "
" ], "text/plain": [ " SK_ID_PREV SK_ID_CURR MONTHS_BALANCE AMT_CREDIT_LIMIT_ACTUAL CNT_DRAWINGS_CURRENT is_DPD is_DPD_DEF\n", "0 2562384 378907 -6 135000 1 0 0 \n", "1 2582071 363914 -1 45000 1 0 0 \n", "2 1740877 371185 -7 450000 0 0 0 \n", "3 1389973 337855 -4 225000 1 0 0 \n", "4 1891521 126868 -1 450000 1 0 0 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_num = pdf_data[[\"SK_ID_PREV\", \"SK_ID_CURR\"] + ls_num].copy()\n", "pdf_num.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "pdf_num[\"MONTHS_BALANCE\"] = pdf_num[\"MONTHS_BALANCE\"] * -1" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'AMT_CREDIT_LIMIT_ACTUAL': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'CNT_DRAWINGS_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'MONTHS_BALANCE': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'is_DPD': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'is_DPD_DEF': ['max', 'min', 'sum', 'mean', 'std']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (103558, 25)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
24CNT_DRAWINGS_CURRENT_std0.62570.10870.9929
23CNT_DRAWINGS_CURRENT_mean0.62500.08431.0000
20CNT_DRAWINGS_CURRENT_max0.61530.10111.0000
22CNT_DRAWINGS_CURRENT_sum0.59690.05041.0000
8MONTHS_BALANCE_mean0.5627-0.06221.0000
5MONTHS_BALANCE_max0.5606-0.06131.0000
7MONTHS_BALANCE_sum0.5597-0.05891.0000
9MONTHS_BALANCE_std0.5582-0.06050.9929
17AMT_CREDIT_LIMIT_ACTUAL_sum0.5470-0.04261.0000
6MONTHS_BALANCE_min0.5259-0.03091.0000
16AMT_CREDIT_LIMIT_ACTUAL_min0.51830.00021.0000
19AMT_CREDIT_LIMIT_ACTUAL_std0.5180-0.01430.9929
21CNT_DRAWINGS_CURRENT_min0.51250.02901.0000
2is_DPD_DEF_sum0.5092-0.00831.0000
0is_DPD_DEF_max0.5087-0.01301.0000
3is_DPD_DEF_mean0.50730.00521.0000
4is_DPD_DEF_std0.5072-0.00210.9929
12is_DPD_sum0.5066-0.00841.0000
15AMT_CREDIT_LIMIT_ACTUAL_max0.5064-0.01131.0000
10is_DPD_max0.5051-0.00711.0000
13is_DPD_mean0.50320.00421.0000
14is_DPD_std0.50290.00270.9929
18AMT_CREDIT_LIMIT_ACTUAL_mean0.5003-0.00761.0000
1is_DPD_DEF_min0.5000NaN1.0000
11is_DPD_min0.5000NaN1.0000
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "24 CNT_DRAWINGS_CURRENT_std 0.6257 0.1087 0.9929 \n", "23 CNT_DRAWINGS_CURRENT_mean 0.6250 0.0843 1.0000 \n", "20 CNT_DRAWINGS_CURRENT_max 0.6153 0.1011 1.0000 \n", "22 CNT_DRAWINGS_CURRENT_sum 0.5969 0.0504 1.0000 \n", "8 MONTHS_BALANCE_mean 0.5627 -0.0622 1.0000 \n", "5 MONTHS_BALANCE_max 0.5606 -0.0613 1.0000 \n", "7 MONTHS_BALANCE_sum 0.5597 -0.0589 1.0000 \n", "9 MONTHS_BALANCE_std 0.5582 -0.0605 0.9929 \n", "17 AMT_CREDIT_LIMIT_ACTUAL_sum 0.5470 -0.0426 1.0000 \n", "6 MONTHS_BALANCE_min 0.5259 -0.0309 1.0000 \n", "16 AMT_CREDIT_LIMIT_ACTUAL_min 0.5183 0.0002 1.0000 \n", "19 AMT_CREDIT_LIMIT_ACTUAL_std 0.5180 -0.0143 0.9929 \n", "21 CNT_DRAWINGS_CURRENT_min 0.5125 0.0290 1.0000 \n", "2 is_DPD_DEF_sum 0.5092 -0.0083 1.0000 \n", "0 is_DPD_DEF_max 0.5087 -0.0130 1.0000 \n", "3 is_DPD_DEF_mean 0.5073 0.0052 1.0000 \n", "4 is_DPD_DEF_std 0.5072 -0.0021 0.9929 \n", "12 is_DPD_sum 0.5066 -0.0084 1.0000 \n", "15 AMT_CREDIT_LIMIT_ACTUAL_max 0.5064 -0.0113 1.0000 \n", "10 is_DPD_max 0.5051 -0.0071 1.0000 \n", "13 is_DPD_mean 0.5032 0.0042 1.0000 \n", "14 is_DPD_std 0.5029 0.0027 0.9929 \n", "18 AMT_CREDIT_LIMIT_ACTUAL_mean 0.5003 -0.0076 1.0000 \n", "1 is_DPD_DEF_min 0.5000 NaN 1.0000 \n", "11 is_DPD_min 0.5000 NaN 1.0000 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 5.12 s, sys: 213 ms, total: 5.33 s\n", "Wall time: 2.14 s\n" ] } ], "source": [ "%%time\n", "pdf_agg02 = agg_common_data(pdf_num[[\"SK_ID_CURR\"] + ls_num], [\"max\", \"min\", \"sum\", \"mean\", \"std\"], main_key=\"SK_ID_CURR\")\n", "eval_agg02 = feature_evaluate(pdf_train_filtered, pdf_agg02)\n", "display(eval_agg02)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Continuous features" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['AMT_BALANCE',\n", " 'AMT_DRAWINGS_ATM_CURRENT',\n", " 
'AMT_DRAWINGS_CURRENT',\n", " 'AMT_DRAWINGS_OTHER_CURRENT',\n", " 'AMT_DRAWINGS_POS_CURRENT',\n", " 'AMT_INST_MIN_REGULARITY',\n", " 'AMT_PAYMENT_CURRENT',\n", " 'AMT_PAYMENT_TOTAL_CURRENT',\n", " 'AMT_RECEIVABLE_PRINCIPAL',\n", " 'AMT_RECIVABLE',\n", " 'AMT_TOTAL_RECEIVABLE',\n", " 'CNT_DRAWINGS_ATM_CURRENT',\n", " 'CNT_DRAWINGS_OTHER_CURRENT',\n", " 'CNT_DRAWINGS_POS_CURRENT',\n", " 'CNT_INSTALMENT_MATURE_CUM']" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get list continuous attributes\n", "ls_con = pdf_meta.query(\"sub_type == 'float64'\")[\"name\"].tolist()\n", "ls_con" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(3840312, 17)\n" ] } ], "source": [ "pdf_con = pdf_data[[\"SK_ID_PREV\", \"SK_ID_CURR\"] + ls_con].copy()\n", "print(pdf_con.shape)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "{'AMT_BALANCE': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_DRAWINGS_ATM_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_DRAWINGS_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_DRAWINGS_OTHER_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_DRAWINGS_POS_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_INST_MIN_REGULARITY': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_PAYMENT_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_PAYMENT_TOTAL_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_RECEIVABLE_PRINCIPAL': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_RECIVABLE': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_TOTAL_RECEIVABLE': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'CNT_DRAWINGS_ATM_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'CNT_DRAWINGS_OTHER_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'CNT_DRAWINGS_POS_CURRENT': ['max', 'min', 'sum', 'mean', 
'std'],\n", " 'CNT_INSTALMENT_MATURE_CUM': ['max', 'min', 'sum', 'mean', 'std']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (103558, 75)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
3AMT_DRAWINGS_CURRENT_mean0.60910.06301.0000
63AMT_BALANCE_mean0.60770.08941.0000
53AMT_TOTAL_RECEIVABLE_mean0.60690.08871.0000
13AMT_RECIVABLE_mean0.60690.08871.0000
18AMT_RECEIVABLE_PRINCIPAL_mean0.60680.08821.0000
68CNT_DRAWINGS_ATM_CURRENT_mean0.60660.10920.7030
4AMT_DRAWINGS_CURRENT_std0.60190.07110.9929
69CNT_DRAWINGS_ATM_CURRENT_std0.59710.10730.6979
48AMT_INST_MIN_REGULARITY_mean0.59250.07521.0000
60AMT_BALANCE_max0.58430.07161.0000
50AMT_TOTAL_RECEIVABLE_max0.58300.07081.0000
10AMT_RECIVABLE_max0.58300.07081.0000
15AMT_RECEIVABLE_PRINCIPAL_max0.58250.06951.0000
58AMT_DRAWINGS_ATM_CURRENT_mean0.58140.06230.7030
49AMT_INST_MIN_REGULARITY_std0.58080.07000.9929
7CNT_DRAWINGS_POS_CURRENT_sum0.57900.03821.0000
45AMT_INST_MIN_REGULARITY_max0.57860.06611.0000
67CNT_DRAWINGS_ATM_CURRENT_sum0.57760.04991.0000
64AMT_BALANCE_std0.57540.06220.9929
54AMT_TOTAL_RECEIVABLE_std0.57470.06190.9929
14AMT_RECIVABLE_std0.57470.06190.9929
0AMT_DRAWINGS_CURRENT_max0.57450.05411.0000
19AMT_RECEIVABLE_PRINCIPAL_std0.57400.06080.9929
9CNT_DRAWINGS_POS_CURRENT_std0.57140.07380.6979
11AMT_RECIVABLE_min0.57140.06491.0000
51AMT_TOTAL_RECEIVABLE_min0.57140.06491.0000
23AMT_PAYMENT_TOTAL_CURRENT_mean0.57020.02501.0000
8CNT_DRAWINGS_POS_CURRENT_mean0.56870.05310.7030
5CNT_DRAWINGS_POS_CURRENT_max0.56620.06640.7030
57AMT_DRAWINGS_ATM_CURRENT_sum0.56620.03901.0000
61AMT_BALANCE_min0.56470.06561.0000
2AMT_DRAWINGS_CURRENT_sum0.56460.02561.0000
59AMT_DRAWINGS_ATM_CURRENT_std0.56270.05360.6979
42AMT_DRAWINGS_POS_CURRENT_sum0.5625-0.00241.0000
16AMT_RECEIVABLE_PRINCIPAL_min0.55810.06451.0000
65CNT_DRAWINGS_ATM_CURRENT_max0.55730.06300.7030
62AMT_BALANCE_sum0.55660.02051.0000
52AMT_TOTAL_RECEIVABLE_sum0.55600.02021.0000
12AMT_RECIVABLE_sum0.55600.02021.0000
17AMT_RECEIVABLE_PRINCIPAL_sum0.55590.02031.0000
24AMT_PAYMENT_TOTAL_CURRENT_std0.55520.03630.9929
43AMT_DRAWINGS_POS_CURRENT_mean0.5451-0.00270.7030
36AMT_PAYMENT_CURRENT_min0.54330.01670.7022
47AMT_INST_MIN_REGULARITY_sum0.54230.00411.0000
20AMT_PAYMENT_TOTAL_CURRENT_max0.54220.02931.0000
44AMT_DRAWINGS_POS_CURRENT_std0.53900.00240.6979
71CNT_INSTALMENT_MATURE_CUM_min0.5369-0.03021.0000
74CNT_INSTALMENT_MATURE_CUM_std0.5305-0.01120.9929
38AMT_PAYMENT_CURRENT_mean0.53010.00680.7022
35AMT_PAYMENT_CURRENT_max0.52900.00030.7022
40AMT_DRAWINGS_POS_CURRENT_max0.5287-0.00900.7030
55AMT_DRAWINGS_ATM_CURRENT_max0.52520.02410.7030
22AMT_PAYMENT_TOTAL_CURRENT_sum0.5248-0.00411.0000
37AMT_PAYMENT_CURRENT_sum0.5237-0.00411.0000
70CNT_INSTALMENT_MATURE_CUM_max0.5213-0.01661.0000
73CNT_INSTALMENT_MATURE_CUM_mean0.5142-0.02771.0000
1AMT_DRAWINGS_CURRENT_min0.51240.01581.0000
39AMT_PAYMENT_CURRENT_std0.51110.01330.6971
6CNT_DRAWINGS_POS_CURRENT_min0.50960.02430.7030
41AMT_DRAWINGS_POS_CURRENT_min0.5093-0.00210.7030
72CNT_INSTALMENT_MATURE_CUM_sum0.5070-0.04121.0000
66CNT_DRAWINGS_ATM_CURRENT_min0.50610.02690.7030
56AMT_DRAWINGS_ATM_CURRENT_min0.50610.01770.7030
21AMT_PAYMENT_TOTAL_CURRENT_min0.50590.00471.0000
25CNT_DRAWINGS_OTHER_CURRENT_max0.50350.00090.7030
32AMT_DRAWINGS_OTHER_CURRENT_sum0.50320.01071.0000
30AMT_DRAWINGS_OTHER_CURRENT_max0.50310.00480.7030
28CNT_DRAWINGS_OTHER_CURRENT_mean0.50300.01560.7030
27CNT_DRAWINGS_OTHER_CURRENT_sum0.50290.00151.0000
29CNT_DRAWINGS_OTHER_CURRENT_std0.50280.01250.6979
33AMT_DRAWINGS_OTHER_CURRENT_mean0.50280.01100.7030
34AMT_DRAWINGS_OTHER_CURRENT_std0.50270.00970.6979
46AMT_INST_MIN_REGULARITY_min0.50150.00281.0000
26CNT_DRAWINGS_OTHER_CURRENT_min0.5000-0.00260.7030
31AMT_DRAWINGS_OTHER_CURRENT_min0.5000-0.00240.7030
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "3 AMT_DRAWINGS_CURRENT_mean 0.6091 0.0630 1.0000 \n", "63 AMT_BALANCE_mean 0.6077 0.0894 1.0000 \n", "53 AMT_TOTAL_RECEIVABLE_mean 0.6069 0.0887 1.0000 \n", "13 AMT_RECIVABLE_mean 0.6069 0.0887 1.0000 \n", "18 AMT_RECEIVABLE_PRINCIPAL_mean 0.6068 0.0882 1.0000 \n", "68 CNT_DRAWINGS_ATM_CURRENT_mean 0.6066 0.1092 0.7030 \n", "4 AMT_DRAWINGS_CURRENT_std 0.6019 0.0711 0.9929 \n", "69 CNT_DRAWINGS_ATM_CURRENT_std 0.5971 0.1073 0.6979 \n", "48 AMT_INST_MIN_REGULARITY_mean 0.5925 0.0752 1.0000 \n", "60 AMT_BALANCE_max 0.5843 0.0716 1.0000 \n", "50 AMT_TOTAL_RECEIVABLE_max 0.5830 0.0708 1.0000 \n", "10 AMT_RECIVABLE_max 0.5830 0.0708 1.0000 \n", "15 AMT_RECEIVABLE_PRINCIPAL_max 0.5825 0.0695 1.0000 \n", "58 AMT_DRAWINGS_ATM_CURRENT_mean 0.5814 0.0623 0.7030 \n", "49 AMT_INST_MIN_REGULARITY_std 0.5808 0.0700 0.9929 \n", "7 CNT_DRAWINGS_POS_CURRENT_sum 0.5790 0.0382 1.0000 \n", "45 AMT_INST_MIN_REGULARITY_max 0.5786 0.0661 1.0000 \n", "67 CNT_DRAWINGS_ATM_CURRENT_sum 0.5776 0.0499 1.0000 \n", "64 AMT_BALANCE_std 0.5754 0.0622 0.9929 \n", "54 AMT_TOTAL_RECEIVABLE_std 0.5747 0.0619 0.9929 \n", "14 AMT_RECIVABLE_std 0.5747 0.0619 0.9929 \n", "0 AMT_DRAWINGS_CURRENT_max 0.5745 0.0541 1.0000 \n", "19 AMT_RECEIVABLE_PRINCIPAL_std 0.5740 0.0608 0.9929 \n", "9 CNT_DRAWINGS_POS_CURRENT_std 0.5714 0.0738 0.6979 \n", "11 AMT_RECIVABLE_min 0.5714 0.0649 1.0000 \n", "51 AMT_TOTAL_RECEIVABLE_min 0.5714 0.0649 1.0000 \n", "23 AMT_PAYMENT_TOTAL_CURRENT_mean 0.5702 0.0250 1.0000 \n", "8 CNT_DRAWINGS_POS_CURRENT_mean 0.5687 0.0531 0.7030 \n", "5 CNT_DRAWINGS_POS_CURRENT_max 0.5662 0.0664 0.7030 \n", "57 AMT_DRAWINGS_ATM_CURRENT_sum 0.5662 0.0390 1.0000 \n", "61 AMT_BALANCE_min 0.5647 0.0656 1.0000 \n", "2 AMT_DRAWINGS_CURRENT_sum 0.5646 0.0256 1.0000 \n", "59 AMT_DRAWINGS_ATM_CURRENT_std 0.5627 0.0536 0.6979 \n", "42 AMT_DRAWINGS_POS_CURRENT_sum 0.5625 -0.0024 1.0000 \n", "16 AMT_RECEIVABLE_PRINCIPAL_min 0.5581 0.0645 1.0000 \n", "65 
CNT_DRAWINGS_ATM_CURRENT_max 0.5573 0.0630 0.7030 \n", "62 AMT_BALANCE_sum 0.5566 0.0205 1.0000 \n", "52 AMT_TOTAL_RECEIVABLE_sum 0.5560 0.0202 1.0000 \n", "12 AMT_RECIVABLE_sum 0.5560 0.0202 1.0000 \n", "17 AMT_RECEIVABLE_PRINCIPAL_sum 0.5559 0.0203 1.0000 \n", "24 AMT_PAYMENT_TOTAL_CURRENT_std 0.5552 0.0363 0.9929 \n", "43 AMT_DRAWINGS_POS_CURRENT_mean 0.5451 -0.0027 0.7030 \n", "36 AMT_PAYMENT_CURRENT_min 0.5433 0.0167 0.7022 \n", "47 AMT_INST_MIN_REGULARITY_sum 0.5423 0.0041 1.0000 \n", "20 AMT_PAYMENT_TOTAL_CURRENT_max 0.5422 0.0293 1.0000 \n", "44 AMT_DRAWINGS_POS_CURRENT_std 0.5390 0.0024 0.6979 \n", "71 CNT_INSTALMENT_MATURE_CUM_min 0.5369 -0.0302 1.0000 \n", "74 CNT_INSTALMENT_MATURE_CUM_std 0.5305 -0.0112 0.9929 \n", "38 AMT_PAYMENT_CURRENT_mean 0.5301 0.0068 0.7022 \n", "35 AMT_PAYMENT_CURRENT_max 0.5290 0.0003 0.7022 \n", "40 AMT_DRAWINGS_POS_CURRENT_max 0.5287 -0.0090 0.7030 \n", "55 AMT_DRAWINGS_ATM_CURRENT_max 0.5252 0.0241 0.7030 \n", "22 AMT_PAYMENT_TOTAL_CURRENT_sum 0.5248 -0.0041 1.0000 \n", "37 AMT_PAYMENT_CURRENT_sum 0.5237 -0.0041 1.0000 \n", "70 CNT_INSTALMENT_MATURE_CUM_max 0.5213 -0.0166 1.0000 \n", "73 CNT_INSTALMENT_MATURE_CUM_mean 0.5142 -0.0277 1.0000 \n", "1 AMT_DRAWINGS_CURRENT_min 0.5124 0.0158 1.0000 \n", "39 AMT_PAYMENT_CURRENT_std 0.5111 0.0133 0.6971 \n", "6 CNT_DRAWINGS_POS_CURRENT_min 0.5096 0.0243 0.7030 \n", "41 AMT_DRAWINGS_POS_CURRENT_min 0.5093 -0.0021 0.7030 \n", "72 CNT_INSTALMENT_MATURE_CUM_sum 0.5070 -0.0412 1.0000 \n", "66 CNT_DRAWINGS_ATM_CURRENT_min 0.5061 0.0269 0.7030 \n", "56 AMT_DRAWINGS_ATM_CURRENT_min 0.5061 0.0177 0.7030 \n", "21 AMT_PAYMENT_TOTAL_CURRENT_min 0.5059 0.0047 1.0000 \n", "25 CNT_DRAWINGS_OTHER_CURRENT_max 0.5035 0.0009 0.7030 \n", "32 AMT_DRAWINGS_OTHER_CURRENT_sum 0.5032 0.0107 1.0000 \n", "30 AMT_DRAWINGS_OTHER_CURRENT_max 0.5031 0.0048 0.7030 \n", "28 CNT_DRAWINGS_OTHER_CURRENT_mean 0.5030 0.0156 0.7030 \n", "27 CNT_DRAWINGS_OTHER_CURRENT_sum 0.5029 0.0015 1.0000 \n", "29 
CNT_DRAWINGS_OTHER_CURRENT_std 0.5028 0.0125 0.6979 \n", "33 AMT_DRAWINGS_OTHER_CURRENT_mean 0.5028 0.0110 0.7030 \n", "34 AMT_DRAWINGS_OTHER_CURRENT_std 0.5027 0.0097 0.6979 \n", "46 AMT_INST_MIN_REGULARITY_min 0.5015 0.0028 1.0000 \n", "26 CNT_DRAWINGS_OTHER_CURRENT_min 0.5000 -0.0026 0.7030 \n", "31 AMT_DRAWINGS_OTHER_CURRENT_min 0.5000 -0.0024 0.7030 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 9.22 s, sys: 384 ms, total: 9.6 s\n", "Wall time: 5.76 s\n" ] } ], "source": [ "%%time\n", "pdf_agg03 = agg_common_data(pdf_con[[\"SK_ID_CURR\"] + ls_con], [\"max\", \"min\", \"sum\", \"mean\", \"std\"], main_key=\"SK_ID_CURR\")\n", "eval_agg03 = feature_evaluate(pdf_train_filtered, pdf_agg03)\n", "display(eval_agg03)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2, 4)" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Shape of the subset of evaluated features whose AUC is at most 0.501.\n", "eval_agg03.query(\"auc <= 0.501\").shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Save features" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(103558, 111)\n" ] } ], "source": [ "# Combine the three aggregate tables column-wise.\n", "# NOTE(review): assumes all three share the same index (presumably SK_ID_CURR) - confirm.\n", "pdf_feat = pdf_agg01.join(pdf_agg02).join(pdf_agg03)\n", "print(pdf_feat.shape)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Store features completed!\n", "CPU times: user 5.62 s, sys: 130 ms, total: 5.75 s\n", "Wall time: 4.22 s\n" ] } ], "source": [ "%%time\n", "# Make sure the output directory exists so the save also works on a fresh checkout\n", "# (explicit isdir check because this kernel is Python 2, which lacks exist_ok).\n", "feat_dir = \"features\"\n", "if not os.path.isdir(feat_dir):\n", "    os.makedirs(feat_dir)\n", "\n", "# Write the joined feature table as a bz2-compressed pickle.\n", "fname = os.path.join(feat_dir, \"{}.pkl.bz2\".format(\"credit_card_balance\"))\n", "pdf_feat.to_pickle(fname, compression=\"bz2\")\n", "print(\"Store features completed!\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { 
"codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15" } }, "nbformat": 4, "nbformat_minor": 2 }