{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Full width\n", "from IPython.core.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import math\n", "import os\n", "import subprocess\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "from IPython.display import display\n", "\n", "# helper modules; the star imports provide gen_one_hot_feat, agg_common_data and feature_evaluate used below\n", "from lib_modeling import *\n", "from lib_feature_engineering import *\n", "\n", "# some settings for displaying Pandas results\n", "pd.set_option('display.width', 2000)\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.precision', 4)\n", "pd.set_option('display.max_colwidth', -1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load data\n", "\n", "- Load the train set and the tvt split table; rows with `tvt_code == 'train'` form `pdf_train_filtered`, which is used for feature evaluation\n", "- Load the full train/test sets for applying mean encoding" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRTARGET
01000021
11000030
21000040
31000060
41000070
\n", "
" ], "text/plain": [ " SK_ID_CURR TARGET\n", "0 100002 1 \n", "1 100003 0 \n", "2 100004 0 \n", "3 100006 0 \n", "4 100007 0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load train/test data\n", "data_path = \"home-credit-default-risk/application_train.csv\"\n", "pdf_train = pd.read_csv(data_path)\n", "\n", "data_path = \"home-credit-default-risk/application_test.csv\"\n", "pdf_test = pd.read_csv(data_path)\n", "\n", "# filter by tvt code\n", "pdf_tvt_extend = pd.read_pickle(\"pdf_tvt_extend.pkl\", compression=\"bz2\")\n", "pdf_train_filtered = (pdf_tvt_extend.query(\"tvt_code == 'train'\")\n", " .merge(pdf_train[[\"SK_ID_CURR\"]], on=\"SK_ID_CURR\")\n", " .drop(columns=[\"tvt_code\"]))\n", "pdf_train_filtered.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(10001358, 8)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_PREVSK_ID_CURRMONTHS_BALANCECNT_INSTALMENTCNT_INSTALMENT_FUTURENAME_CONTRACT_STATUSSK_DPDSK_DPD_DEF
01803195182943-3148.045.0Active00
11715348367990-3336.035.0Active00
21784872397406-3212.09.0Active00
31903291269225-3548.042.0Active00
42341044334279-3536.035.0Active00
\n", "
" ], "text/plain": [ " SK_ID_PREV SK_ID_CURR MONTHS_BALANCE CNT_INSTALMENT CNT_INSTALMENT_FUTURE NAME_CONTRACT_STATUS SK_DPD SK_DPD_DEF\n", "0 1803195 182943 -31 48.0 45.0 Active 0 0 \n", "1 1715348 367990 -33 36.0 35.0 Active 0 0 \n", "2 1784872 397406 -32 12.0 9.0 Active 0 0 \n", "3 1903291 269225 -35 48.0 42.0 Active 0 0 \n", "4 2341044 334279 -35 36.0 35.0 Active 0 0 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load POS_CASH balance data\n", "data_path = \"home-credit-default-risk/POS_CASH_balance.csv\"\n", "pdf_data = pd.read_csv(data_path)\n", "print(pdf_data.shape)\n", "pdf_data.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# load meta data\n", "meta_path = \"../02_pandas/reports/report_POS_CASH_balance.csv\"\n", "pdf_meta = pd.read_csv(meta_path)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(10001358, 8)\n" ] } ], "source": [ "# negate MONTHS_BALANCE, then optionally keep a single window (12-23, 24-35 or 36+ months) by uncommenting one filter below\n", "# NOTE: not idempotent - re-running this cell negates MONTHS_BALANCE again\n", "pdf_data[\"MONTHS_BALANCE\"] = pdf_data[\"MONTHS_BALANCE\"] * -1\n", "# pdf_data = pdf_data[(pdf_data[\"MONTHS_BALANCE\"] >= 12) & (pdf_data[\"MONTHS_BALANCE\"] < 12 * 2)]\n", "# pdf_data = pdf_data[(pdf_data[\"MONTHS_BALANCE\"] >= 12 * 2) & (pdf_data[\"MONTHS_BALANCE\"] < 12 * 3)]\n", "# pdf_data = pdf_data[pdf_data[\"MONTHS_BALANCE\"] >= 12 * 3]\n", "\n", "print(pdf_data.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# DPD handling" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "pdf_data[\"is_DPD\"] = (pdf_data[\"SK_DPD\"] > 0).astype(int)\n", "pdf_data[\"is_DPD_DEF\"] = (pdf_data[\"SK_DPD_DEF\"] > 0).astype(int)\n", "\n", "# drop the raw DPD columns now that the flag columns exist (re-running this cell raises KeyError because they are gone)\n", "pdf_data.drop(columns=[\"SK_DPD\", \"SK_DPD_DEF\"], inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Categorical features" ] }, { "cell_type": 
"code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['NAME_CONTRACT_STATUS']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get list categorical attributes\n", "ls_cate = pdf_meta.query(\"sub_type == 'object'\")[\"name\"].tolist()\n", "ls_cate" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# construct categorical mapping\n", "dict_onehot = {}\n", "for cate in ls_cate:\n", " ls_val = pdf_data[cate].value_counts().index.tolist()\n", " dict_onehot[cate] = ls_val\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### one hot" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(10001358, 10)\n", "CPU times: user 50.6 s, sys: 4.19 s, total: 54.8 s\n", "Wall time: 38.2 s\n" ] } ], "source": [ "%%time\n", "pdf_onehot = gen_one_hot_feat(pdf_data, dict_onehot, main_key=\"SK_ID_CURR\")\n", "print(pdf_onehot.shape)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'NAME_CONTRACT_STATUS_Active': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Amortized_debt': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Approved': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Canceled': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Completed': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Demand': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Returned_to_the_store': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_Signed': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_STATUS_XNA': ['max', 'sum', 'mean']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (337252, 27)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
13NAME_CONTRACT_STATUS_Active_sum0.5467-0.03721.0
10NAME_CONTRACT_STATUS_Completed_sum0.5404-0.02071.0
9NAME_CONTRACT_STATUS_Completed_max0.5107-0.01871.0
14NAME_CONTRACT_STATUS_Active_mean0.5079-0.00771.0
25NAME_CONTRACT_STATUS_Signed_sum0.5044-0.00451.0
24NAME_CONTRACT_STATUS_Signed_max0.5042-0.00581.0
11NAME_CONTRACT_STATUS_Completed_mean0.50300.00091.0
26NAME_CONTRACT_STATUS_Signed_mean0.50230.00891.0
5NAME_CONTRACT_STATUS_Returned_to_the_store_mean0.50160.01211.0
4NAME_CONTRACT_STATUS_Returned_to_the_store_sum0.50160.00791.0
3NAME_CONTRACT_STATUS_Returned_to_the_store_max0.50160.00721.0
17NAME_CONTRACT_STATUS_Demand_mean0.50060.00691.0
15NAME_CONTRACT_STATUS_Demand_max0.50060.01001.0
16NAME_CONTRACT_STATUS_Demand_sum0.50060.00371.0
21NAME_CONTRACT_STATUS_Approved_max0.5006-0.00281.0
22NAME_CONTRACT_STATUS_Approved_sum0.5006-0.00221.0
23NAME_CONTRACT_STATUS_Approved_mean0.50060.00101.0
6NAME_CONTRACT_STATUS_Amortized_debt_max0.50010.00881.0
7NAME_CONTRACT_STATUS_Amortized_debt_sum0.50010.00491.0
8NAME_CONTRACT_STATUS_Amortized_debt_mean0.50010.00631.0
12NAME_CONTRACT_STATUS_Active_max0.5001-0.00191.0
1NAME_CONTRACT_STATUS_Canceled_sum0.50000.00071.0
0NAME_CONTRACT_STATUS_Canceled_max0.50000.00071.0
2NAME_CONTRACT_STATUS_Canceled_mean0.5000-0.00091.0
20NAME_CONTRACT_STATUS_XNA_mean0.5000-0.00071.0
19NAME_CONTRACT_STATUS_XNA_sum0.5000-0.00071.0
18NAME_CONTRACT_STATUS_XNA_max0.5000-0.00071.0
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "13 NAME_CONTRACT_STATUS_Active_sum 0.5467 -0.0372 1.0 \n", "10 NAME_CONTRACT_STATUS_Completed_sum 0.5404 -0.0207 1.0 \n", "9 NAME_CONTRACT_STATUS_Completed_max 0.5107 -0.0187 1.0 \n", "14 NAME_CONTRACT_STATUS_Active_mean 0.5079 -0.0077 1.0 \n", "25 NAME_CONTRACT_STATUS_Signed_sum 0.5044 -0.0045 1.0 \n", "24 NAME_CONTRACT_STATUS_Signed_max 0.5042 -0.0058 1.0 \n", "11 NAME_CONTRACT_STATUS_Completed_mean 0.5030 0.0009 1.0 \n", "26 NAME_CONTRACT_STATUS_Signed_mean 0.5023 0.0089 1.0 \n", "5 NAME_CONTRACT_STATUS_Returned_to_the_store_mean 0.5016 0.0121 1.0 \n", "4 NAME_CONTRACT_STATUS_Returned_to_the_store_sum 0.5016 0.0079 1.0 \n", "3 NAME_CONTRACT_STATUS_Returned_to_the_store_max 0.5016 0.0072 1.0 \n", "17 NAME_CONTRACT_STATUS_Demand_mean 0.5006 0.0069 1.0 \n", "15 NAME_CONTRACT_STATUS_Demand_max 0.5006 0.0100 1.0 \n", "16 NAME_CONTRACT_STATUS_Demand_sum 0.5006 0.0037 1.0 \n", "21 NAME_CONTRACT_STATUS_Approved_max 0.5006 -0.0028 1.0 \n", "22 NAME_CONTRACT_STATUS_Approved_sum 0.5006 -0.0022 1.0 \n", "23 NAME_CONTRACT_STATUS_Approved_mean 0.5006 0.0010 1.0 \n", "6 NAME_CONTRACT_STATUS_Amortized_debt_max 0.5001 0.0088 1.0 \n", "7 NAME_CONTRACT_STATUS_Amortized_debt_sum 0.5001 0.0049 1.0 \n", "8 NAME_CONTRACT_STATUS_Amortized_debt_mean 0.5001 0.0063 1.0 \n", "12 NAME_CONTRACT_STATUS_Active_max 0.5001 -0.0019 1.0 \n", "1 NAME_CONTRACT_STATUS_Canceled_sum 0.5000 0.0007 1.0 \n", "0 NAME_CONTRACT_STATUS_Canceled_max 0.5000 0.0007 1.0 \n", "2 NAME_CONTRACT_STATUS_Canceled_mean 0.5000 -0.0009 1.0 \n", "20 NAME_CONTRACT_STATUS_XNA_mean 0.5000 -0.0007 1.0 \n", "19 NAME_CONTRACT_STATUS_XNA_sum 0.5000 -0.0007 1.0 \n", "18 NAME_CONTRACT_STATUS_XNA_max 0.5000 -0.0007 1.0 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 22.3 s, sys: 1.04 s, total: 23.3 s\n", "Wall time: 7.63 s\n" ] } ], "source": [ "%%time\n", "pdf_agg01 = agg_common_data(pdf_onehot, 
[\"max\", \"sum\", \"mean\"], main_key=\"SK_ID_CURR\")\n", "eval_agg01 = feature_evaluate(pdf_train_filtered, pdf_agg01)\n", "display(eval_agg01)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(16, 4)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eval_agg01.query(\"auc <= 0.501\").shape" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(337252, 11)\n" ] } ], "source": [ "sel_feat = eval_agg01.query(\"auc > 0.501\")[\"name\"].tolist()\n", "pdf_agg01 = pdf_agg01[sel_feat]\n", "print(pdf_agg01.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Numerical features" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['MONTHS_BALANCE', 'is_DPD', 'is_DPD_DEF']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "series_type = pdf_data.dtypes\n", "ls_num = series_type[series_type == \"int64\"]\n", "ls_num = [cname for cname in ls_num.index if cname not in [\"SK_ID_PREV\", \"SK_ID_CURR\"]]\n", "ls_num" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'MONTHS_BALANCE': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'is_DPD': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'is_DPD_DEF': ['max', 'min', 'sum', 'mean', 'std']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (337252, 15)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
0MONTHS_BALANCE_max0.5594-0.05641.0000
4MONTHS_BALANCE_std0.5590-0.05570.9988
2MONTHS_BALANCE_sum0.5560-0.04161.0000
3MONTHS_BALANCE_mean0.5442-0.03531.0000
14is_DPD_DEF_std0.52460.04720.9988
13is_DPD_DEF_mean0.52460.04581.0000
9is_DPD_std0.52430.03820.9988
8is_DPD_mean0.52430.02851.0000
12is_DPD_DEF_sum0.52320.02431.0000
10is_DPD_DEF_max0.52280.03631.0000
5is_DPD_max0.52220.03111.0000
7is_DPD_sum0.52200.00951.0000
1MONTHS_BALANCE_min0.51180.00381.0000
6is_DPD_min0.50010.00411.0000
11is_DPD_DEF_min0.50000.00481.0000
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "0 MONTHS_BALANCE_max 0.5594 -0.0564 1.0000 \n", "4 MONTHS_BALANCE_std 0.5590 -0.0557 0.9988 \n", "2 MONTHS_BALANCE_sum 0.5560 -0.0416 1.0000 \n", "3 MONTHS_BALANCE_mean 0.5442 -0.0353 1.0000 \n", "14 is_DPD_DEF_std 0.5246 0.0472 0.9988 \n", "13 is_DPD_DEF_mean 0.5246 0.0458 1.0000 \n", "9 is_DPD_std 0.5243 0.0382 0.9988 \n", "8 is_DPD_mean 0.5243 0.0285 1.0000 \n", "12 is_DPD_DEF_sum 0.5232 0.0243 1.0000 \n", "10 is_DPD_DEF_max 0.5228 0.0363 1.0000 \n", "5 is_DPD_max 0.5222 0.0311 1.0000 \n", "7 is_DPD_sum 0.5220 0.0095 1.0000 \n", "1 MONTHS_BALANCE_min 0.5118 0.0038 1.0000 \n", "6 is_DPD_min 0.5001 0.0041 1.0000 \n", "11 is_DPD_DEF_min 0.5000 0.0048 1.0000 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 11.2 s, sys: 568 ms, total: 11.8 s\n", "Wall time: 4.94 s\n" ] } ], "source": [ "%%time\n", "pdf_agg02 = agg_common_data(pdf_data[[\"SK_ID_CURR\"] + ls_num], [\"max\", \"min\", \"sum\", \"mean\", \"std\"], main_key=\"SK_ID_CURR\")\n", "eval_agg02 = feature_evaluate(pdf_train_filtered, pdf_agg02)\n", "display(eval_agg02)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Continuous features" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get list continuous attributes\n", "ls_con = pdf_meta.query(\"sub_type == 'float64'\")[\"name\"].tolist()\n", "ls_con" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_PREVSK_ID_CURRCNT_INSTALMENTCNT_INSTALMENT_FUTURE
0180319518294348.045.0
1171534836799036.035.0
2178487239740612.09.0
3190329126922548.042.0
4234104433427936.035.0
\n", "
" ], "text/plain": [ " SK_ID_PREV SK_ID_CURR CNT_INSTALMENT CNT_INSTALMENT_FUTURE\n", "0 1803195 182943 48.0 45.0 \n", "1 1715348 367990 36.0 35.0 \n", "2 1784872 397406 12.0 9.0 \n", "3 1903291 269225 48.0 42.0 \n", "4 2341044 334279 36.0 35.0 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_con = pdf_data[[\"SK_ID_PREV\", \"SK_ID_CURR\"] + ls_con].copy()\n", "pdf_con.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "{'CNT_INSTALMENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'CNT_INSTALMENT_FUTURE': ['max', 'min', 'sum', 'mean', 'std']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (337252, 10)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
2CNT_INSTALMENT_sum0.5264-0.01601.0000
1CNT_INSTALMENT_min0.52280.01890.9999
8CNT_INSTALMENT_FUTURE_mean0.52080.02790.9999
7CNT_INSTALMENT_FUTURE_sum0.5200-0.00691.0000
3CNT_INSTALMENT_mean0.51580.01740.9999
4CNT_INSTALMENT_std0.51340.00450.9987
6CNT_INSTALMENT_FUTURE_min0.50980.01860.9999
9CNT_INSTALMENT_FUTURE_std0.50650.01580.9987
5CNT_INSTALMENT_FUTURE_max0.50640.01360.9999
0CNT_INSTALMENT_max0.50630.01350.9999
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "2 CNT_INSTALMENT_sum 0.5264 -0.0160 1.0000 \n", "1 CNT_INSTALMENT_min 0.5228 0.0189 0.9999 \n", "8 CNT_INSTALMENT_FUTURE_mean 0.5208 0.0279 0.9999 \n", "7 CNT_INSTALMENT_FUTURE_sum 0.5200 -0.0069 1.0000 \n", "3 CNT_INSTALMENT_mean 0.5158 0.0174 0.9999 \n", "4 CNT_INSTALMENT_std 0.5134 0.0045 0.9987 \n", "6 CNT_INSTALMENT_FUTURE_min 0.5098 0.0186 0.9999 \n", "9 CNT_INSTALMENT_FUTURE_std 0.5065 0.0158 0.9987 \n", "5 CNT_INSTALMENT_FUTURE_max 0.5064 0.0136 0.9999 \n", "0 CNT_INSTALMENT_max 0.5063 0.0135 0.9999 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 9.32 s, sys: 358 ms, total: 9.68 s\n", "Wall time: 3.36 s\n" ] } ], "source": [ "%%time\n", "pdf_agg03 = agg_common_data(pdf_con[[\"SK_ID_CURR\"] + ls_con], [\"max\", \"min\", \"sum\", \"mean\", \"std\"], main_key=\"SK_ID_CURR\")\n", "eval_agg03 = feature_evaluate(pdf_train_filtered, pdf_agg03)\n", "display(eval_agg03)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Save features" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(337252, 36)\n" ] } ], "source": [ "pdf_feat = pdf_agg01.join(pdf_agg02).join(pdf_agg03)\n", "print(pdf_feat.shape)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Store features completed!\n", "CPU times: user 5.28 s, sys: 92.4 ms, total: 5.37 s\n", "Wall time: 4.49 s\n" ] } ], "source": [ "%%time\n", "fname = \"pos_cash\"\n", "# fname = \"pos_cash_in1year\"\n", "# fname = \"pos_cash_in2year\"\n", "# fname = \"pos_cash_gt3year\"\n", "\n", "fname = os.path.join(\"features\", \"{}.pkl.bz2\".format(fname))\n", "pdf_feat.to_pickle(fname, compression=\"bz2\")\n", "print(\"Store features completed!\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": 
"python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15" } }, "nbformat": 4, "nbformat_minor": 2 }