{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Full width\n",
"from IPython.core.display import display, HTML\n",
"display(HTML(\"\"))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"import os\n",
"import subprocess\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from IPython.display import display\n",
"\n",
"# \n",
"from lib_modeling import *\n",
"from lib_feature_engineering import *\n",
"\n",
"from sklearn import metrics\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# some settings for displaying Pandas results\n",
"pd.set_option('display.width', 2000)\n",
"pd.set_option('display.max_rows', 500)\n",
"pd.set_option('display.max_columns', 500)\n",
"pd.set_option('display.precision', 4)\n",
"pd.set_option('display.max_colwidth', -1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SK_ID_CURR | \n",
" TARGET | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100002 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 100003 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 100004 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 100006 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 100007 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SK_ID_CURR TARGET\n",
"0 100002 1 \n",
"1 100003 0 \n",
"2 100004 0 \n",
"3 100006 0 \n",
"4 100007 0 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# load train/test data\n",
"data_path = \"home-credit-default-risk/application_train.csv\"\n",
"pdf_train = pd.read_csv(data_path)\n",
"\n",
"data_path = \"home-credit-default-risk/application_test.csv\"\n",
"pdf_test = pd.read_csv(data_path)\n",
"\n",
"# filter by tvt code\n",
"pdf_tvt_extend = pd.read_pickle(\"pdf_tvt_extend.pkl\", compression=\"bz2\")\n",
"pdf_train_filtered = (pdf_tvt_extend.query(\"tvt_code == 'train'\")\n",
" .merge(pdf_train[[\"SK_ID_CURR\"]], on=\"SK_ID_CURR\")\n",
" .drop(columns=[\"tvt_code\"]))\n",
"pdf_train_filtered.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# load bureau + balance\n",
"data_path = \"home-credit-default-risk/bureau.csv\"\n",
"pdf_bureau = pd.read_csv(data_path)\n",
"\n",
"data_path = \"home-credit-default-risk/bureau_balance.csv\"\n",
"pdf_bureau_balance = pd.read_csv(data_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Calculate intermediate data (idata)\n",
"\n",
"- status: binary, frequency\n",
"- count: number bureau, number bureau transaction"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAD8CAYAAACyyUlaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAFtFJREFUeJzt3X+MXfV55/H3Exwaxw0xlDDy2mhNFSstxUoCI3AXKZqNu8ZAFPNHWYHYYBArryISkV1LjVOthJofEpWWpgGlSFZwMa0bwpIgW4mJYxGuupUCAUMaB5wsU+LFE7s4xECYZBvW6bN/3O8kN8P13B9fZs41vF/S1dzznO8538cjz3zm/JgzkZlIklTjTU03IEk6+RkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqLWq6gYVy5pln5sqVK4fa9mc/+xlLlix5bRt6DdjXYOxrcKPam30Npqavffv2PZ+Z7+g5MDPfEK8LLrggh/XQQw8Nve18sq/B2NfgRrU3+xpMTV/AY9nH91hPc0mSqhkmkqRqhokkqZphIkmqZphIkqr1FSYRsTQi7ouI70fEgYj4w4g4IyL2RsTT5ePpZWxExG0RMRkR342I8zv2s7GMfzoiNnbUL4iI/WWb2yIiSn3gOSRJC6/fI5PPAV/PzN8D3g0cALYAD2bmKuDBsgxwKbCqvDYBd0A7GICbgYuAC4GbZ8KhjNnUsd36Uh9oDklSM3qGSUScBrwPuBMgM1/JzBeBDcD2Mmw7cEV5vwG4u9yi/DCwNCKWAZcAezPzWGa+AOwF1pd1p2Xmt8o9zXfP2tcgc0iSGtDPkcnvAj8G/joinoiIL0TEEmAsM48AlI9nlfHLgUMd20+V2lz1qS51hphDktSAfh6nsgg4H/hoZj4SEZ/j16ebuokutRyiPpe+tomITbRPgzE2Nkar1eqx2+6OHnuJ23fsHGrbWquXv/2E66anp4f+N80n+xrMqPYFo9ubfQ1mIfrqJ0ymgKnMfKQs30c7TJ6LiGWZeaScYjraMf7sju1XAIdLfWJWvVXqK7qMZ4g5fkNmbgW2AoyPj+fExMTsIX25fcdObt3fzGPMDl4zccJ1rVaLYf9N88m+BjOqfcHo9mZfg1mIvnqe5srMfwYORcS7Smkt8BSwC5i5I2sjMPOj+y7g2nLH1RrgpXKKag+wLiJOLxfe1wF7yrqXI2JNuYvr2ln7GmQOSVID+v1x+6PAjog4FXgGuJ52EN0bETcAzwJXlrG7gcuASeDnZSyZeSwiPgU8WsZ9MjOPlfcfBu4CFgMPlBfALYPMIUlqRl9hkpnfAca7rFrbZWwCN55gP9uAbV3qjwHndan/ZNA5JEkLz9+AlyRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVW9TPoIg4CLwM/BI4npnjEXEG8CVgJXAQ+I+Z+UJEBPA54DLg58B1mfl42c9G4L+X3X46M7eX+gXAXcBiYDdwU2bmMHO83qzc8rUTrtu8+jjXzbG+xsFbLp+X/Up6fRrkyOTfZ+Z7MnO8LG8BHszMVcCDZRngUmBVeW0C7gAowXAzcBFwIXBzRJxetrmjjJ3Zbv0wc0iSmlFzmmsDsL283w5c0VG/O9seBpZGxDLgEmBvZh7LzBeAvcD6su60zPxWZiZw96x9DTKHJKkB/YZJAt+IiH0RsanUxjLzCED5eFapLwcOdWw7VWpz1ae61IeZQ5LUgL6umQAXZ+bhiDgL2BsR359jbHSp5RD1ufS1TQm+TQBjY2O0Wq0eu+1ubHH7+sSomc++hv1cAUxPT1dtP1/sa3Cj2pt9DWYh+uorTDLzcPl4NCLup33N47mIWJaZR8oppqNl+BRwdsfmK4DDpT4xq94q9RVdxjPEHLP73gpsBRgfH8+JiYnZQ/py+46d3Lq/39xdOJtXH5+3vg5eMzH0tq1Wi2E/1/PJvgY3qr3Z12AWoq+ep7kiYklEvG3mPbAO+B6wC9hYhm0Edpb3u4Bro20N8FI5RbUHWBcRp5cL7+uAPWXdyxGxptylde2sfQ0yhySpAf38WDsG3N/+Ps8i4O8y8+sR8Shwb0TcADwLXFnG76Z9y+4k7dt2rwfIzGMR8Sng0TLuk5l5rLz/ML++NfiB8gK4ZZA5JEnN6BkmmfkM8O4u9Z8Aa7vUE7jxBPvaBmzrUn8MOO+1mEOStPD8DXhJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUre8wiYhTIuKJiPhqWT4nIh6JiKcj4ksRcWqp/1ZZnizrV3bs4xOl/oOIuKSjvr7UJiNiS0d94DkkSQtvkCOTm4ADHct/Dnw2M1cBLwA3lPoNwAuZ+U7gs2UcEXEucBXwB8B64K9KQJ0CfB64FDgXuLqMHXgOSVIz+gqTiFgBXA58oSwH8H7gvjJkO3BFeb+hLFPWry3jNwD3ZOYvMvOHwCRwYXlNZuYzmfkKcA+wYcg5JEkN6PfI5C+BPwH+tSz/DvBiZh4vy1PA8vJ+OXAIoKx/qYz/VX3WNieqDzOHJKkBi3oNiIgPAEczc19ETMyUuwzNHutOVO8WaHON7zX/r0TEJmATwNjYGK1Wq8tmvY0ths2rj/ceuMDms69hP1cA09PTVdvPF/sa3Kj2Zl+DWYi+eoYJcDHwwYi4DHgLcBrtI5WlEbGoHBmsAA6X8VPA2cBURCwC3g4c66jP6NymW/35Ieb4DZm5FdgKMD4+nhMTE338c1/t9h07uXV/P5+qhbV59fF56+vgNRNDb9tqtRj2cz2f7Gtwo9qbfQ1mIfrqeZorMz+RmSsycyXtC+jfzMxrgIeAPy7DNgI7y/tdZZmy/puZmaV+VbkT6xxgFfBt4FFgVblz69Qyx66yzaBzSJIaUPNj7ceBeyLi08ATwJ2lfifwNxExSfto4SqAzHwyIu4FngKOAzdm5i8BIuIjwB7gFGBbZj45zBySpGYMFCaZ2QJa5f0ztO/Emj3mX4ArT7D9Z4DPdKnvBnZ3qQ88hyRp4fkb8JKkaqN3VVkjYeWWrw297ebVx7muYvuDt1w+9LaSmuGRiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKq+WwujZya54LNpdczw3wmmDQ8j0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRV6xkmEfGWiPh2RPxjRDwZEX9W6udExCMR8XREfCkiTi313yrLk2X9yo59faLUfxARl3TU15faZERs6agPPIckaeH1c2TyC+D9mflu4D3A+ohYA/w58NnMXAW8ANxQxt8AvJCZ7wQ+W8YREecCVwF/AKwH/ioiTomIU4DPA5cC5wJXl7EMOockqRk9wyTbpsvim8srgfcD95X6duCK8n5DWaasXxsRUer3ZOYvMvOHwCRwYXlNZuYzmfkKcA+woWwz6BySpAb09fdMytHDPuCdtI8i/gl4MTOPlyFTwPLyfjlwCCAzj0fES8DvlPrDHbvt3ObQrPpFZZtB53h+Vt+bgE0AY2NjtFqtfv65rzK2uP23MEaNfQ2mV1/D/v+oNT093djcvYxqb/Y1mIXoq68wycxfAu+JiKXA/cDvdxtWPnY7Qsg56t2OjuYaP9ccv1nI3ApsBRgfH8+JiYkum/V2+46d3Lp/9P6O2ObVx+1rAL36OnjNxMI106HVajHs/835Nqq92ddgFqKvge7myswXgRawBlgaETNfmSuAw+X9FHA2QFn/duBYZ33WNieqPz/EHJKkBvRzN9c7yhEJEbEY+CPgAPAQ8Mdl2EZgZ3m/qyxT1n8zM7PUryp3Yp0DrAK+DTwKrCp3bp1K+yL9rrLNoHNIkhrQz7mIZcD2ct3kTcC9mfnViHgKuCciPg08AdxZxt8J/E1ETNI+WrgKIDOfjIh7gaeA48CN5fQZEfERYA9wCrAtM58s+/r4IHNIkprRM0wy87vAe7vUn6F9J9bs+r8AV55gX58BPtOlvhvY/VrMIUlaeP4GvCSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSp2uj9OTypISu3fK2Ree9av6SReaXXkkcmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSarWM0wi4uyIeCgiDkTEkxFxU6mfERF7I+Lp8vH0Uo+IuC0iJiPiuxFxfse+NpbxT0fExo76BRGxv2xzW0TEsHNIkhZePw96PA5szszHI+JtwL6I2AtcBzyYmbdExBZgC/Bx4FJgVXldBNwBXBQRZwA3A+NAlv3syswXyphNwMPAbmA98EDZZ99z1H4ypCbs/9FLXNfQQyYP3nJ5I/Pq9afnkUlmHsnMx8v7l4EDwHJgA7C9DNsOXFHebwDuzraHgaURsQy4BNibmcdKgOwF1pd1p2XmtzIzgbtn7WuQOSRJDRjomklErATeCzwCjGXmEWgHDnBWGbYcONSx2VSpzVWf6lJniDkkSQ3o+++ZRMRvA18GPpaZPy2XNboO7VLLIepzttPPNhGxifbpM8bGxmi1Wj12293YYti8+vhQ284n+xqMfb1ar6+J6enpob9u5pN9DWYh+uorTCLizbSDZEdmfqWUn4uIZZl5pJxiOlrqU8DZHZuvAA6X+sSseqvUV3QZP8wcvyEztwJbAcbHx3NiYmL2kL7cvmMnt+4fvb8jtnn1cfsagH292sFrJuZc32q1GPbrZj7Z12AWoq9+7uYK4E7gQGb+RceqXcDMHVkbgZ0d9WvLHVdrgJfKKao9wLqIOL3clbUO2FPWvRwRa8pc187a1yBzSJIa0M+PQxcDHwL2R8R3Su1PgVuAeyPiBuBZ4MqybjdwGTAJ/By4HiAzj0XEp4BHy7hPZuax8v7DwF3AYtp3cT1Q6gPNIUlqRs8wycx/oPs1CoC1XcYncOMJ9rUN2Nal/hhwXpf6TwadQ5K08PwNeElSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVK10XvqnaQFs7LHH+XavPr4vPzhLv8o1+uPRyaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGo+NVjSguv1tOJeap5m7BOL54dHJpKkaoaJJKlazzCJiG0RcTQivtdROyMi9kbE0+Xj6aUeEXFbRExGxHcj4vyObTaW8U9HxMaO+gURsb9sc1tExLBzSJKa0c+RyV3A+lm1LcCDmbkKeLAsA1wKrCqvTcAd0A4G4GbgIuBC4OaZcChjNnVst36YOSRJzekZJpn598CxWeUNwPbyfjtwRUf97mx7GFgaEcuAS4C9mXksM18A9gLry7rTMvNbmZnA3bP2NcgckqSGDHs311hmHgHIzCMRcVapLwcOdYybKrW56lNd6sPMcWR2kxGxifbRC2NjY7RarcH+lTONLG7fPTJq7Gsw9jW4Ue2tpq/bd+x8jbv5tbHFJ97/6uVvn7d5e5menh76+1+/Xutbg6NLLYeoDzPHq4uZW4GtAOPj4zkxMdFj193dvmMnt+4fvbuoN68+bl8DsK/BjWpvJ2NfB6+ZWNhmOrRaLYb9/tevYe/mem7m1FL5eLTUp4CzO8atAA73qK/oUh9mDklSQ4YNk13AzB1ZG4GdHfVryx1Xa4CXyqmqPcC6iDi9XHhfB+wp616OiDXlLq5rZ+1rkDkkSQ3peZwYEV8EJoAzI2KK9l1ZtwD3RsQNwLPAlWX4buAyYBL4OXA9QGYei4hPAY+WcZ/MzJmL+h+mfcfYYuCB8mLQOSRJzekZJpl59QlWre0yNoEbT7CfbcC2LvXHgPO61H8y6BySpGb4G/CSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqqN3pPSJOl1qPbv3te4a/2SeZ/DIxNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1U7aMImI9RHxg4iYjIgtTfcjSW9kJ2WYRMQpwOeBS4Fzgasj4txmu5KkN66TMkyAC4HJzHwmM18B7gE2NNyTJL1hnaxhshw41LE8VWqSpAZEZjbdw8Ai4krgksz8z2X5Q8CFmfnRWeM2AZvK4ruAHww55ZnA80NuO5/sazD2NbhR7c2+BlPT17/NzHf0GrRoyJ03bQo4u2N5BXB49qDM3ApsrZ0sIh7LzPHa/bzW7Gsw9jW4Ue3NvgazEH2drKe5HgVWRcQ5EXEqcBWwq+GeJOkN66Q8MsnM4xHxEWAPcAqwLTOfbLgtSXrDOinDBCAzdwO7F2i66lNl88S+BmNfgxvV3uxrMPPe10l5AV6SNFpO1msmkqQRYpj0MIqPbYmIbRFxNCK+13QvnSLi7Ih4KCIORMSTEXFT0z0BRMRbIuLbEfGPpa8/a7qnThFxSkQ8ERFfbbqXGRFxMCL2R8R3IuKxpvuZERFLI+K+iPh++X/2hyPQ07vK52nm9dOI+FjTfQFExH8t/+e/FxFfjIi3zNtcnuY6sfLYlv8N/AfatyM/ClydmU813Nf7gGng7sw8r8leOkXEMmBZZj4eEW8D9gFXjMDnK4AlmTkdEW8G/gG4KTMfbrKvGRHx34Bx4LTM/EDT/UA7TIDxzByp35mIiO3A/8rML5Q7Od+amS823deM8j3jR8BFmfl/Gu5lOe3/6+dm5v+NiHuB3Zl513zM55HJ3EbysS2Z+ffAsab7mC0zj2Tm4+X9y8ABRuDJBNk2XRbfXF4j8VNURKwALge+0HQvoy4iTgPeB9wJkJmvjFKQFGuBf2o6SDosAhZHxCLgrXT5fbzXimEyNx/bMqSIWAm8F3ik2U7ayqmk7wBHgb2ZORJ9AX8J/Anwr003MksC34iIfeVJEqPgd4EfA39dTgt+ISKWNN3ULFcBX2y6CYDM/BHwP4BngSPAS5n5jfmazzCZW3SpjcRPtKMsIn4b+DLwscz8adP9AGTmLzPzPbSflnBhRDR+ejAiPgAczcx9TffSxcWZeT7tJ3PfWE6tNm0RcD5wR2a+F/gZMBLXMQHKabcPAv+z6V4AIuJ02mdSzgH+DbAkIv7TfM1nmMytr8e26NfKNYkvAzsy8ytN9zNbOS3SAtY33ArAxcAHy/WJe4D3R8TfNttSW2YeLh+PAvfTPuXbtClgquOo8j7a4TIqLgUez8znmm6k+CPgh5n548z8f8BXgH83X5MZJnPzsS0DKBe67wQOZOZfNN3PjIh4R0QsLe8X0/4i+36zXUFmfiIzV2TmStr/t76ZmfP2k2O/ImJJuYGCchppHdD4nYOZ+c/AoYh4VymtBRq9uWOWqxmRU1zFs8CaiHhr+dpcS/s65rw4aX8DfiGM6mNbIuKLwARwZkRMATdn5p3NdgW0f9L+ELC/XJ8A+NPytIImLQO2lztt3gTcm5kjcxvuCBoD7m9//2ER8HeZ+fVmW/qVjwI7yg93zwDXN9wPABHxVtp3ff6XpnuZkZmPRMR9wOPAceAJ5vE34b01WJJUzdNckqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKq/X8MZzNNeD8JbgAAAABJRU5ErkJggg==\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"(pdf_bureau_balance[\"MONTHS_BALANCE\"] / -12.0).hist()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SK_ID_BUREAU | \n",
" MONTHS_BALANCE | \n",
" STATUS | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5715448 | \n",
" -0.0000 | \n",
" C | \n",
"
\n",
" \n",
" 1 | \n",
" 5715448 | \n",
" 0.0833 | \n",
" C | \n",
"
\n",
" \n",
" 2 | \n",
" 5715448 | \n",
" 0.1667 | \n",
" C | \n",
"
\n",
" \n",
" 3 | \n",
" 5715448 | \n",
" 0.2500 | \n",
" C | \n",
"
\n",
" \n",
" 4 | \n",
" 5715448 | \n",
" 0.3333 | \n",
" C | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SK_ID_BUREAU MONTHS_BALANCE STATUS\n",
"0 5715448 -0.0000 C \n",
"1 5715448 0.0833 C \n",
"2 5715448 0.1667 C \n",
"3 5715448 0.2500 C \n",
"4 5715448 0.3333 C "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# filter here for looking up balance within 1, 2, 3 years\n",
"pdf_bureau_balance[\"MONTHS_BALANCE\"] = pdf_bureau_balance[\"MONTHS_BALANCE\"] / -12\n",
"# pdf_bureau_balance = pdf_bureau_balance[(pdf_bureau_balance[\"MONTHS_BALANCE\"] >= 1) & (pdf_bureau_balance[\"MONTHS_BALANCE\"] < 1)]\n",
"# pdf_bureau_balance = pdf_bureau_balance[(pdf_bureau_balance[\"MONTHS_BALANCE\"] >= 2) & (pdf_bureau_balance[\"MONTHS_BALANCE\"] < 3)]\n",
"# pdf_bureau_balance = pdf_bureau_balance[pdf_bureau_balance[\"MONTHS_BALANCE\"] >= 3]\n",
"# pdf_bureau_balance = pdf_bureau_balance[pdf_bureau_balance[\"MONTHS_BALANCE\"] < 1]\n",
"pdf_bureau_balance.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"C 13646993\n",
"0 7499507 \n",
"X 5810482 \n",
"1 242347 \n",
"5 62406 \n",
"2 23419 \n",
"3 8924 \n",
"4 5847 \n",
"Name: STATUS, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pdf_bureau_balance[\"STATUS\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"C 13646993\n",
"0 7499507 \n",
"X 5810482 \n",
"DPD 280537 \n",
"Sold 62406 \n",
"Name: STATUS, dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# status in (1, 2, 3, 4) is DPD (date past due). \n",
"# 5 is sold. C is closed. X is unknown. 0 is no DPD\n",
"pdf_bureau_balance.loc[pdf_bureau_balance[\"STATUS\"].isin(['1', '2', '3', '4']), \"STATUS\"] = \"DPD\"\n",
"pdf_bureau_balance.loc[pdf_bureau_balance[\"STATUS\"] == '5', \"STATUS\"] = \"Sold\"\n",
"pdf_bureau_balance[\"STATUS\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SK_ID_BUREAU | \n",
" STATUS_C | \n",
" STATUS_0 | \n",
" STATUS_X | \n",
" STATUS_DPD | \n",
" STATUS_Sold | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5715448 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 5715448 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 5715448 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 5715448 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5715448 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SK_ID_BUREAU STATUS_C STATUS_0 STATUS_X STATUS_DPD STATUS_Sold\n",
"0 5715448 1 0 0 0 0 \n",
"1 5715448 1 0 0 0 0 \n",
"2 5715448 1 0 0 0 0 \n",
"3 5715448 1 0 0 0 0 \n",
"4 5715448 1 0 0 0 0 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1min 1s, sys: 5.32 s, total: 1min 6s\n",
"Wall time: 51.5 s\n"
]
}
],
"source": [
"%%time\n",
"dict_onehot = {\n",
" \"STATUS\": ['C', '0', 'X', 'DPD', 'Sold'],\n",
"}\n",
"pdf_onehot = gen_one_hot_feat(pdf_bureau_balance, dict_onehot, main_key=\"SK_ID_BUREAU\")\n",
"display(pdf_onehot.head())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'STATUS_0': ['max', 'sum', 'mean', 'std'],\n",
" 'STATUS_C': ['max', 'sum', 'mean', 'std'],\n",
" 'STATUS_DPD': ['max', 'sum', 'mean', 'std'],\n",
" 'STATUS_Sold': ['max', 'sum', 'mean', 'std'],\n",
" 'STATUS_X': ['max', 'sum', 'mean', 'std']}"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"After agg: (817395, 20)\n",
"CPU times: user 14.3 s, sys: 1.81 s, total: 16.2 s\n",
"Wall time: 6.52 s\n"
]
}
],
"source": [
"%%time\n",
"pdf_idata01 = agg_common_data(pdf_onehot, [\"max\", \"sum\", \"mean\", \"std\"], main_key=\"SK_ID_BUREAU\")\n",
"pdf_idata01.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" bureau_num_trans | \n",
"
\n",
" \n",
" SK_ID_BUREAU | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 5001709 | \n",
" 97 | \n",
"
\n",
" \n",
" 5001710 | \n",
" 83 | \n",
"
\n",
" \n",
" 5001711 | \n",
" 4 | \n",
"
\n",
" \n",
" 5001712 | \n",
" 19 | \n",
"
\n",
" \n",
" 5001713 | \n",
" 22 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bureau_num_trans\n",
"SK_ID_BUREAU \n",
"5001709 97 \n",
"5001710 83 \n",
"5001711 4 \n",
"5001712 19 \n",
"5001713 22 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# \n",
"pdf_idata02 = pdf_bureau_balance.groupby(\"SK_ID_BUREAU\").size().to_frame(\"bureau_num_trans\")\n",
"pdf_idata02.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SK_ID_BUREAU | \n",
" STATUS_Sold_max | \n",
" STATUS_Sold_sum | \n",
" STATUS_Sold_mean | \n",
" STATUS_Sold_std | \n",
" STATUS_X_max | \n",
" STATUS_X_sum | \n",
" STATUS_X_mean | \n",
" STATUS_X_std | \n",
" STATUS_C_max | \n",
" STATUS_C_sum | \n",
" STATUS_C_mean | \n",
" STATUS_C_std | \n",
" STATUS_DPD_max | \n",
" STATUS_DPD_sum | \n",
" STATUS_DPD_mean | \n",
" STATUS_DPD_std | \n",
" STATUS_0_max | \n",
" STATUS_0_sum | \n",
" STATUS_0_mean | \n",
" STATUS_0_std | \n",
" bureau_num_trans | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5001709 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1 | \n",
" 11 | \n",
" 0.1134 | \n",
" 0.3187 | \n",
" 1 | \n",
" 86 | \n",
" 0.8866 | \n",
" 0.3187 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 97 | \n",
"
\n",
" \n",
" 1 | \n",
" 5001710 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1 | \n",
" 30 | \n",
" 0.3614 | \n",
" 0.4833 | \n",
" 1 | \n",
" 48 | \n",
" 0.5783 | \n",
" 0.4968 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1 | \n",
" 5 | \n",
" 0.0602 | \n",
" 0.2394 | \n",
" 83 | \n",
"
\n",
" \n",
" 2 | \n",
" 5001711 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1 | \n",
" 1 | \n",
" 0.2500 | \n",
" 0.5000 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1 | \n",
" 3 | \n",
" 0.7500 | \n",
" 0.5000 | \n",
" 4 | \n",
"
\n",
" \n",
" 3 | \n",
" 5001712 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 1 | \n",
" 9 | \n",
" 0.4737 | \n",
" 0.5130 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1 | \n",
" 10 | \n",
" 0.5263 | \n",
" 0.5130 | \n",
" 19 | \n",
"
\n",
" \n",
" 4 | \n",
" 5001713 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1 | \n",
" 22 | \n",
" 1.0000 | \n",
" 0.0000 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 22 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SK_ID_BUREAU STATUS_Sold_max STATUS_Sold_sum STATUS_Sold_mean STATUS_Sold_std STATUS_X_max STATUS_X_sum STATUS_X_mean STATUS_X_std STATUS_C_max STATUS_C_sum STATUS_C_mean STATUS_C_std STATUS_DPD_max STATUS_DPD_sum STATUS_DPD_mean STATUS_DPD_std STATUS_0_max STATUS_0_sum STATUS_0_mean STATUS_0_std bureau_num_trans\n",
"0 5001709 0 0 0.0 0.0 1 11 0.1134 0.3187 1 86 0.8866 0.3187 0 0 0.0 0.0 0 0 0.0000 0.0000 97 \n",
"1 5001710 0 0 0.0 0.0 1 30 0.3614 0.4833 1 48 0.5783 0.4968 0 0 0.0 0.0 1 5 0.0602 0.2394 83 \n",
"2 5001711 0 0 0.0 0.0 1 1 0.2500 0.5000 0 0 0.0000 0.0000 0 0 0.0 0.0 1 3 0.7500 0.5000 4 \n",
"3 5001712 0 0 0.0 0.0 0 0 0.0000 0.0000 1 9 0.4737 0.5130 0 0 0.0 0.0 1 10 0.5263 0.5130 19 \n",
"4 5001713 0 0 0.0 0.0 1 22 1.0000 0.0000 0 0 0.0000 0.0000 0 0 0.0 0.0 0 0 0.0000 0.0000 22 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pdf_idata = pdf_idata01.join(pdf_idata02).reset_index()\n",
"pdf_idata.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Feature engineering"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(774354, 23)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SK_ID_CURR | \n",
" SK_ID_BUREAU | \n",
" STATUS_Sold_max | \n",
" STATUS_Sold_sum | \n",
" STATUS_Sold_mean | \n",
" STATUS_Sold_std | \n",
" STATUS_X_max | \n",
" STATUS_X_sum | \n",
" STATUS_X_mean | \n",
" STATUS_X_std | \n",
" STATUS_C_max | \n",
" STATUS_C_sum | \n",
" STATUS_C_mean | \n",
" STATUS_C_std | \n",
" STATUS_DPD_max | \n",
" STATUS_DPD_sum | \n",
" STATUS_DPD_mean | \n",
" STATUS_DPD_std | \n",
" STATUS_0_max | \n",
" STATUS_0_sum | \n",
" STATUS_0_mean | \n",
" STATUS_0_std | \n",
" bureau_num_trans | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100002 | \n",
" 6158904 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1 | \n",
" 1 | \n",
" 0.0455 | \n",
" 0.2132 | \n",
" 1 | \n",
" 2 | \n",
" 0.0909 | \n",
" 0.2942 | \n",
" 1 | \n",
" 1 | \n",
" 0.0455 | \n",
" 0.2132 | \n",
" 1 | \n",
" 18 | \n",
" 0.8182 | \n",
" 0.3948 | \n",
" 22 | \n",
"
\n",
" \n",
" 1 | \n",
" 100002 | \n",
" 6158905 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 1 | \n",
" 13 | \n",
" 0.8125 | \n",
" 0.4031 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 1 | \n",
" 3 | \n",
" 0.1875 | \n",
" 0.4031 | \n",
" 16 | \n",
"
\n",
" \n",
" 2 | \n",
" 100002 | \n",
" 6158906 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1 | \n",
" 3 | \n",
" 0.1875 | \n",
" 0.4031 | \n",
" 1 | \n",
" 2 | \n",
" 0.1250 | \n",
" 0.3416 | \n",
" 1 | \n",
" 6 | \n",
" 0.3750 | \n",
" 0.5000 | \n",
" 1 | \n",
" 5 | \n",
" 0.3125 | \n",
" 0.4787 | \n",
" 16 | \n",
"
\n",
" \n",
" 3 | \n",
" 100002 | \n",
" 6158907 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1 | \n",
" 3 | \n",
" 0.1875 | \n",
" 0.4031 | \n",
" 1 | \n",
" 2 | \n",
" 0.1250 | \n",
" 0.3416 | \n",
" 1 | \n",
" 6 | \n",
" 0.3750 | \n",
" 0.5000 | \n",
" 1 | \n",
" 5 | \n",
" 0.3125 | \n",
" 0.4787 | \n",
" 16 | \n",
"
\n",
" \n",
" 4 | \n",
" 100002 | \n",
" 6158908 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 1 | \n",
" 2 | \n",
" 0.5000 | \n",
" 0.5774 | \n",
" 1 | \n",
" 2 | \n",
" 0.5000 | \n",
" 0.5774 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SK_ID_CURR SK_ID_BUREAU STATUS_Sold_max STATUS_Sold_sum STATUS_Sold_mean STATUS_Sold_std STATUS_X_max STATUS_X_sum STATUS_X_mean STATUS_X_std STATUS_C_max STATUS_C_sum STATUS_C_mean STATUS_C_std STATUS_DPD_max STATUS_DPD_sum STATUS_DPD_mean STATUS_DPD_std STATUS_0_max STATUS_0_sum STATUS_0_mean STATUS_0_std bureau_num_trans\n",
"0 100002 6158904 0 0 0.0 0.0 1 1 0.0455 0.2132 1 2 0.0909 0.2942 1 1 0.0455 0.2132 1 18 0.8182 0.3948 22 \n",
"1 100002 6158905 0 0 0.0 0.0 0 0 0.0000 0.0000 1 13 0.8125 0.4031 0 0 0.0000 0.0000 1 3 0.1875 0.4031 16 \n",
"2 100002 6158906 0 0 0.0 0.0 1 3 0.1875 0.4031 1 2 0.1250 0.3416 1 6 0.3750 0.5000 1 5 0.3125 0.4787 16 \n",
"3 100002 6158907 0 0 0.0 0.0 1 3 0.1875 0.4031 1 2 0.1250 0.3416 1 6 0.3750 0.5000 1 5 0.3125 0.4787 16 \n",
"4 100002 6158908 0 0 0.0 0.0 0 0 0.0000 0.0000 0 0 0.0000 0.0000 1 2 0.5000 0.5774 1 2 0.5000 0.5774 4 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Join application to bureau balance\n",
"pdf_ids = pd.concat([pdf_train[[\"SK_ID_CURR\"]], pdf_test[[\"SK_ID_CURR\"]]])\n",
"pdf_data = (pdf_ids.merge(pdf_bureau[[\"SK_ID_CURR\", \"SK_ID_BUREAU\"]], on=\"SK_ID_CURR\")\n",
" .merge(pdf_idata, on=\"SK_ID_BUREAU\"))\n",
"print(pdf_data.shape)\n",
"pdf_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SK_ID_CURR | \n",
" STATUS_Sold_sum | \n",
" STATUS_X_sum | \n",
" STATUS_C_sum | \n",
" STATUS_DPD_sum | \n",
" STATUS_0_sum | \n",
" bureau_num_trans | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100002 | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 18 | \n",
" 22 | \n",
"
\n",
" \n",
" 1 | \n",
" 100002 | \n",
" 0 | \n",
" 0 | \n",
" 13 | \n",
" 0 | \n",
" 3 | \n",
" 16 | \n",
"
\n",
" \n",
" 2 | \n",
" 100002 | \n",
" 0 | \n",
" 3 | \n",
" 2 | \n",
" 6 | \n",
" 5 | \n",
" 16 | \n",
"
\n",
" \n",
" 3 | \n",
" 100002 | \n",
" 0 | \n",
" 3 | \n",
" 2 | \n",
" 6 | \n",
" 5 | \n",
" 16 | \n",
"
\n",
" \n",
" 4 | \n",
" 100002 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SK_ID_CURR STATUS_Sold_sum STATUS_X_sum STATUS_C_sum STATUS_DPD_sum STATUS_0_sum bureau_num_trans\n",
"0 100002 0 1 2 1 18 22 \n",
"1 100002 0 0 13 0 3 16 \n",
"2 100002 0 3 2 6 5 16 \n",
"3 100002 0 3 2 6 5 16 \n",
"4 100002 0 0 0 2 2 4 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# statistics on status_sum, bureau_num_trans\n",
"ls_cols = [cname for cname in pdf_data.columns if \"_sum\" in cname] + [\"bureau_num_trans\"]\n",
"pdf_stats = pdf_data[[\"SK_ID_CURR\"] + ls_cols]\n",
"pdf_stats.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'STATUS_0_sum': ['min', 'max', 'mean', 'std', 'median'],\n",
" 'STATUS_C_sum': ['min', 'max', 'mean', 'std', 'median'],\n",
" 'STATUS_DPD_sum': ['min', 'max', 'mean', 'std', 'median'],\n",
" 'STATUS_Sold_sum': ['min', 'max', 'mean', 'std', 'median'],\n",
" 'STATUS_X_sum': ['min', 'max', 'mean', 'std', 'median'],\n",
" 'bureau_num_trans': ['min', 'max', 'mean', 'std', 'median']}"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"After agg: (134542, 30)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATUS_C_sum_min | \n",
" STATUS_C_sum_max | \n",
" STATUS_C_sum_mean | \n",
" STATUS_C_sum_std | \n",
" STATUS_C_sum_median | \n",
" bureau_num_trans_min | \n",
" bureau_num_trans_max | \n",
" bureau_num_trans_mean | \n",
" bureau_num_trans_std | \n",
" bureau_num_trans_median | \n",
" STATUS_0_sum_min | \n",
" STATUS_0_sum_max | \n",
" STATUS_0_sum_mean | \n",
" STATUS_0_sum_std | \n",
" STATUS_0_sum_median | \n",
" STATUS_DPD_sum_min | \n",
" STATUS_DPD_sum_max | \n",
" STATUS_DPD_sum_mean | \n",
" STATUS_DPD_sum_std | \n",
" STATUS_DPD_sum_median | \n",
" STATUS_Sold_sum_min | \n",
" STATUS_Sold_sum_max | \n",
" STATUS_Sold_sum_mean | \n",
" STATUS_Sold_sum_std | \n",
" STATUS_Sold_sum_median | \n",
" STATUS_X_sum_min | \n",
" STATUS_X_sum_max | \n",
" STATUS_X_sum_mean | \n",
" STATUS_X_sum_std | \n",
" STATUS_X_sum_median | \n",
"
\n",
" \n",
" SK_ID_CURR | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 100001 | \n",
" 0 | \n",
" 44 | \n",
" 15.7143 | \n",
" 16.9972 | \n",
" 18.0 | \n",
" 2 | \n",
" 52 | \n",
" 24.5714 | \n",
" 16.0505 | \n",
" 29.0 | \n",
" 1 | \n",
" 12 | \n",
" 4.4286 | \n",
" 4.8941 | \n",
" 2.0 | \n",
" 0 | \n",
" 1 | \n",
" 0.1429 | \n",
" 0.3780 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 9 | \n",
" 4.2857 | \n",
" 3.8173 | \n",
" 6.0 | \n",
"
\n",
" \n",
" 100002 | \n",
" 0 | \n",
" 13 | \n",
" 2.8750 | \n",
" 4.1897 | \n",
" 2.0 | \n",
" 4 | \n",
" 22 | \n",
" 13.7500 | \n",
" 6.3640 | \n",
" 16.0 | \n",
" 2 | \n",
" 18 | \n",
" 5.6250 | \n",
" 5.1807 | \n",
" 5.0 | \n",
" 0 | \n",
" 6 | \n",
" 3.3750 | \n",
" 2.8754 | \n",
" 4.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 3 | \n",
" 1.8750 | \n",
" 1.3562 | \n",
" 2.5 | \n",
"
\n",
" \n",
" 100005 | \n",
" 0 | \n",
" 5 | \n",
" 1.6667 | \n",
" 2.8868 | \n",
" 0.0 | \n",
" 3 | \n",
" 13 | \n",
" 7.0000 | \n",
" 5.2915 | \n",
" 5.0 | \n",
" 2 | \n",
" 7 | \n",
" 4.6667 | \n",
" 2.5166 | \n",
" 5.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 1 | \n",
" 0.6667 | \n",
" 0.5774 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 100010 | \n",
" 26 | \n",
" 26 | \n",
" 26.0000 | \n",
" 0.0000 | \n",
" 26.0 | \n",
" 36 | \n",
" 36 | \n",
" 36.0000 | \n",
" 0.0000 | \n",
" 36.0 | \n",
" 10 | \n",
" 10 | \n",
" 10.0000 | \n",
" 0.0000 | \n",
" 10.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 100013 | \n",
" 0 | \n",
" 44 | \n",
" 25.7500 | \n",
" 20.7264 | \n",
" 29.5 | \n",
" 40 | \n",
" 69 | \n",
" 57.5000 | \n",
" 13.1276 | \n",
" 60.5 | \n",
" 0 | \n",
" 34 | \n",
" 19.7500 | \n",
" 14.5230 | \n",
" 22.5 | \n",
" 0 | \n",
" 3 | \n",
" 1.7500 | \n",
" 1.2583 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 40 | \n",
" 10.2500 | \n",
" 19.8389 | \n",
" 0.5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" STATUS_C_sum_min STATUS_C_sum_max STATUS_C_sum_mean STATUS_C_sum_std STATUS_C_sum_median bureau_num_trans_min bureau_num_trans_max bureau_num_trans_mean bureau_num_trans_std bureau_num_trans_median STATUS_0_sum_min STATUS_0_sum_max STATUS_0_sum_mean STATUS_0_sum_std STATUS_0_sum_median STATUS_DPD_sum_min STATUS_DPD_sum_max STATUS_DPD_sum_mean STATUS_DPD_sum_std STATUS_DPD_sum_median STATUS_Sold_sum_min STATUS_Sold_sum_max STATUS_Sold_sum_mean STATUS_Sold_sum_std STATUS_Sold_sum_median STATUS_X_sum_min STATUS_X_sum_max STATUS_X_sum_mean STATUS_X_sum_std STATUS_X_sum_median\n",
"SK_ID_CURR \n",
"100001 0 44 15.7143 16.9972 18.0 2 52 24.5714 16.0505 29.0 1 12 4.4286 4.8941 2.0 0 1 0.1429 0.3780 0.0 0 0 0.0 0.0 0.0 0 9 4.2857 3.8173 6.0 \n",
"100002 0 13 2.8750 4.1897 2.0 4 22 13.7500 6.3640 16.0 2 18 5.6250 5.1807 5.0 0 6 3.3750 2.8754 4.0 0 0 0.0 0.0 0.0 0 3 1.8750 1.3562 2.5 \n",
"100005 0 5 1.6667 2.8868 0.0 3 13 7.0000 5.2915 5.0 2 7 4.6667 2.5166 5.0 0 0 0.0000 0.0000 0.0 0 0 0.0 0.0 0.0 0 1 0.6667 0.5774 1.0 \n",
"100010 26 26 26.0000 0.0000 26.0 36 36 36.0000 0.0000 36.0 10 10 10.0000 0.0000 10.0 0 0 0.0000 0.0000 0.0 0 0 0.0 0.0 0.0 0 0 0.0000 0.0000 0.0 \n",
"100013 0 44 25.7500 20.7264 29.5 40 69 57.5000 13.1276 60.5 0 34 19.7500 14.5230 22.5 0 3 1.7500 1.2583 2.0 0 0 0.0 0.0 0.0 0 40 10.2500 19.8389 0.5 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pdf_agg01 = agg_common_data(pdf_stats, [\"min\", \"max\", \"mean\", \"std\", \"median\"])\n",
"pdf_agg01.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" auc | \n",
" corr | \n",
" coverage | \n",
"
\n",
" \n",
" \n",
" \n",
" 7 | \n",
" bureau_num_trans_mean | \n",
" 0.5928 | \n",
" -0.0785 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 9 | \n",
" bureau_num_trans_median | \n",
" 0.5899 | \n",
" -0.0744 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 6 | \n",
" bureau_num_trans_max | \n",
" 0.5728 | \n",
" -0.0668 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 2 | \n",
" STATUS_C_sum_mean | \n",
" 0.5717 | \n",
" -0.0618 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 5 | \n",
" bureau_num_trans_min | \n",
" 0.5708 | \n",
" -0.0459 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 4 | \n",
" STATUS_C_sum_median | \n",
" 0.5632 | \n",
" -0.0559 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 1 | \n",
" STATUS_C_sum_max | \n",
" 0.5625 | \n",
" -0.0591 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 3 | \n",
" STATUS_C_sum_std | \n",
" 0.5596 | \n",
" -0.0542 | \n",
" 0.8679 | \n",
"
\n",
" \n",
" 8 | \n",
" bureau_num_trans_std | \n",
" 0.5523 | \n",
" -0.0458 | \n",
" 0.8679 | \n",
"
\n",
" \n",
" 12 | \n",
" STATUS_0_sum_mean | \n",
" 0.5426 | \n",
" -0.0387 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 17 | \n",
" STATUS_DPD_sum_mean | \n",
" 0.5414 | \n",
" 0.0329 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 16 | \n",
" STATUS_DPD_sum_max | \n",
" 0.5401 | \n",
" 0.0250 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 11 | \n",
" STATUS_0_sum_max | \n",
" 0.5392 | \n",
" -0.0346 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 18 | \n",
" STATUS_DPD_sum_std | \n",
" 0.5380 | \n",
" 0.0233 | \n",
" 0.8679 | \n",
"
\n",
" \n",
" 14 | \n",
" STATUS_0_sum_median | \n",
" 0.5347 | \n",
" -0.0328 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 27 | \n",
" STATUS_X_sum_mean | \n",
" 0.5279 | \n",
" -0.0302 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 13 | \n",
" STATUS_0_sum_std | \n",
" 0.5266 | \n",
" -0.0298 | \n",
" 0.8679 | \n",
"
\n",
" \n",
" 26 | \n",
" STATUS_X_sum_max | \n",
" 0.5264 | \n",
" -0.0298 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 19 | \n",
" STATUS_DPD_sum_median | \n",
" 0.5256 | \n",
" 0.0326 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 28 | \n",
" STATUS_X_sum_std | \n",
" 0.5237 | \n",
" -0.0294 | \n",
" 0.8679 | \n",
"
\n",
" \n",
" 0 | \n",
" STATUS_C_sum_min | \n",
" 0.5226 | \n",
" -0.0229 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 29 | \n",
" STATUS_X_sum_median | \n",
" 0.5153 | \n",
" -0.0224 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 15 | \n",
" STATUS_DPD_sum_min | \n",
" 0.5144 | \n",
" 0.0255 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 10 | \n",
" STATUS_0_sum_min | \n",
" 0.5130 | \n",
" -0.0173 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 22 | \n",
" STATUS_Sold_sum_mean | \n",
" 0.5045 | \n",
" 0.0189 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 21 | \n",
" STATUS_Sold_sum_max | \n",
" 0.5045 | \n",
" 0.0123 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 25 | \n",
" STATUS_X_sum_min | \n",
" 0.5045 | \n",
" -0.0120 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 23 | \n",
" STATUS_Sold_sum_std | \n",
" 0.5038 | \n",
" 0.0084 | \n",
" 0.8679 | \n",
"
\n",
" \n",
" 24 | \n",
" STATUS_Sold_sum_median | \n",
" 0.5037 | \n",
" 0.0190 | \n",
" 1.0000 | \n",
"
\n",
" \n",
" 20 | \n",
" STATUS_Sold_sum_min | \n",
" 0.5021 | \n",
" 0.0173 | \n",
" 1.0000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name auc corr coverage\n",
"7 bureau_num_trans_mean 0.5928 -0.0785 1.0000 \n",
"9 bureau_num_trans_median 0.5899 -0.0744 1.0000 \n",
"6 bureau_num_trans_max 0.5728 -0.0668 1.0000 \n",
"2 STATUS_C_sum_mean 0.5717 -0.0618 1.0000 \n",
"5 bureau_num_trans_min 0.5708 -0.0459 1.0000 \n",
"4 STATUS_C_sum_median 0.5632 -0.0559 1.0000 \n",
"1 STATUS_C_sum_max 0.5625 -0.0591 1.0000 \n",
"3 STATUS_C_sum_std 0.5596 -0.0542 0.8679 \n",
"8 bureau_num_trans_std 0.5523 -0.0458 0.8679 \n",
"12 STATUS_0_sum_mean 0.5426 -0.0387 1.0000 \n",
"17 STATUS_DPD_sum_mean 0.5414 0.0329 1.0000 \n",
"16 STATUS_DPD_sum_max 0.5401 0.0250 1.0000 \n",
"11 STATUS_0_sum_max 0.5392 -0.0346 1.0000 \n",
"18 STATUS_DPD_sum_std 0.5380 0.0233 0.8679 \n",
"14 STATUS_0_sum_median 0.5347 -0.0328 1.0000 \n",
"27 STATUS_X_sum_mean 0.5279 -0.0302 1.0000 \n",
"13 STATUS_0_sum_std 0.5266 -0.0298 0.8679 \n",
"26 STATUS_X_sum_max 0.5264 -0.0298 1.0000 \n",
"19 STATUS_DPD_sum_median 0.5256 0.0326 1.0000 \n",
"28 STATUS_X_sum_std 0.5237 -0.0294 0.8679 \n",
"0 STATUS_C_sum_min 0.5226 -0.0229 1.0000 \n",
"29 STATUS_X_sum_median 0.5153 -0.0224 1.0000 \n",
"15 STATUS_DPD_sum_min 0.5144 0.0255 1.0000 \n",
"10 STATUS_0_sum_min 0.5130 -0.0173 1.0000 \n",
"22 STATUS_Sold_sum_mean 0.5045 0.0189 1.0000 \n",
"21 STATUS_Sold_sum_max 0.5045 0.0123 1.0000 \n",
"25 STATUS_X_sum_min 0.5045 -0.0120 1.0000 \n",
"23 STATUS_Sold_sum_std 0.5038 0.0084 0.8679 \n",
"24 STATUS_Sold_sum_median 0.5037 0.0190 1.0000 \n",
"20 STATUS_Sold_sum_min 0.5021 0.0173 1.0000 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"eval_agg01 = feature_evaluate(pdf_train_filtered, pdf_agg01.reset_index())\n",
"display(eval_agg01)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SK_ID_CURR | \n",
" STATUS_Sold_max | \n",
" STATUS_X_max | \n",
" STATUS_C_max | \n",
" STATUS_DPD_max | \n",
" STATUS_0_max | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100002 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 100002 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 100002 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 100002 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 100002 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SK_ID_CURR STATUS_Sold_max STATUS_X_max STATUS_C_max STATUS_DPD_max STATUS_0_max\n",
"0 100002 0 1 1 1 1 \n",
"1 100002 0 0 1 0 1 \n",
"2 100002 0 1 1 1 1 \n",
"3 100002 0 1 1 1 1 \n",
"4 100002 0 0 0 1 1 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# keep binary values\n",
"ls_cols = [cname for cname in pdf_data.columns if \"_max\" in cname]\n",
"pdf_binary = pdf_data[[\"SK_ID_CURR\"] + ls_cols]\n",
"pdf_binary.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'STATUS_0_max': ['max'],\n",
" 'STATUS_C_max': ['max'],\n",
" 'STATUS_DPD_max': ['max'],\n",
" 'STATUS_Sold_max': ['max'],\n",
" 'STATUS_X_max': ['max']}"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"After agg: (134542, 5)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATUS_0_max_max | \n",
" STATUS_Sold_max_max | \n",
" STATUS_X_max_max | \n",
" STATUS_DPD_max_max | \n",
" STATUS_C_max_max | \n",
"
\n",
" \n",
" SK_ID_CURR | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 100001 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 100002 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 100005 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 100010 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 100013 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" STATUS_0_max_max STATUS_Sold_max_max STATUS_X_max_max STATUS_DPD_max_max STATUS_C_max_max\n",
"SK_ID_CURR \n",
"100001 1 0 1 1 1 \n",
"100002 1 0 1 1 1 \n",
"100005 1 0 1 0 1 \n",
"100010 1 0 0 0 1 \n",
"100013 1 0 1 1 1 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pdf_agg02 = agg_common_data(pdf_binary, [\"max\"])\n",
"pdf_agg02.head()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" auc | \n",
" corr | \n",
" coverage | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" STATUS_DPD_max_max | \n",
" 0.5378 | \n",
" 0.0440 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 4 | \n",
" STATUS_C_max_max | \n",
" 0.5246 | \n",
" -0.0335 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 1 | \n",
" STATUS_Sold_max_max | \n",
" 0.5045 | \n",
" 0.0161 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2 | \n",
" STATUS_X_max_max | \n",
" 0.5020 | \n",
" 0.0027 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 0 | \n",
" STATUS_0_max_max | \n",
" 0.5001 | \n",
" 0.0003 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name auc corr coverage\n",
"3 STATUS_DPD_max_max 0.5378 0.0440 1.0 \n",
"4 STATUS_C_max_max 0.5246 -0.0335 1.0 \n",
"1 STATUS_Sold_max_max 0.5045 0.0161 1.0 \n",
"2 STATUS_X_max_max 0.5020 0.0027 1.0 \n",
"0 STATUS_0_max_max 0.5001 0.0003 1.0 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"eval_agg02 = feature_evaluate(pdf_train_filtered, pdf_agg02.reset_index())\n",
"display(eval_agg02)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# save features"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(134542, 35)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" STATUS_C_sum_min | \n",
" STATUS_C_sum_max | \n",
" STATUS_C_sum_mean | \n",
" STATUS_C_sum_std | \n",
" STATUS_C_sum_median | \n",
" bureau_num_trans_min | \n",
" bureau_num_trans_max | \n",
" bureau_num_trans_mean | \n",
" bureau_num_trans_std | \n",
" bureau_num_trans_median | \n",
" STATUS_0_sum_min | \n",
" STATUS_0_sum_max | \n",
" STATUS_0_sum_mean | \n",
" STATUS_0_sum_std | \n",
" STATUS_0_sum_median | \n",
" STATUS_DPD_sum_min | \n",
" STATUS_DPD_sum_max | \n",
" STATUS_DPD_sum_mean | \n",
" STATUS_DPD_sum_std | \n",
" STATUS_DPD_sum_median | \n",
" STATUS_Sold_sum_min | \n",
" STATUS_Sold_sum_max | \n",
" STATUS_Sold_sum_mean | \n",
" STATUS_Sold_sum_std | \n",
" STATUS_Sold_sum_median | \n",
" STATUS_X_sum_min | \n",
" STATUS_X_sum_max | \n",
" STATUS_X_sum_mean | \n",
" STATUS_X_sum_std | \n",
" STATUS_X_sum_median | \n",
" STATUS_0_max_max | \n",
" STATUS_Sold_max_max | \n",
" STATUS_X_max_max | \n",
" STATUS_DPD_max_max | \n",
" STATUS_C_max_max | \n",
"
\n",
" \n",
" SK_ID_CURR | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 100001 | \n",
" 0 | \n",
" 44 | \n",
" 15.7143 | \n",
" 16.9972 | \n",
" 18.0 | \n",
" 2 | \n",
" 52 | \n",
" 24.5714 | \n",
" 16.0505 | \n",
" 29.0 | \n",
" 1 | \n",
" 12 | \n",
" 4.4286 | \n",
" 4.8941 | \n",
" 2.0 | \n",
" 0 | \n",
" 1 | \n",
" 0.1429 | \n",
" 0.3780 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 9 | \n",
" 4.2857 | \n",
" 3.8173 | \n",
" 6.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 100002 | \n",
" 0 | \n",
" 13 | \n",
" 2.8750 | \n",
" 4.1897 | \n",
" 2.0 | \n",
" 4 | \n",
" 22 | \n",
" 13.7500 | \n",
" 6.3640 | \n",
" 16.0 | \n",
" 2 | \n",
" 18 | \n",
" 5.6250 | \n",
" 5.1807 | \n",
" 5.0 | \n",
" 0 | \n",
" 6 | \n",
" 3.3750 | \n",
" 2.8754 | \n",
" 4.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 3 | \n",
" 1.8750 | \n",
" 1.3562 | \n",
" 2.5 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 100005 | \n",
" 0 | \n",
" 5 | \n",
" 1.6667 | \n",
" 2.8868 | \n",
" 0.0 | \n",
" 3 | \n",
" 13 | \n",
" 7.0000 | \n",
" 5.2915 | \n",
" 5.0 | \n",
" 2 | \n",
" 7 | \n",
" 4.6667 | \n",
" 2.5166 | \n",
" 5.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 1 | \n",
" 0.6667 | \n",
" 0.5774 | \n",
" 1.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 100010 | \n",
" 26 | \n",
" 26 | \n",
" 26.0000 | \n",
" 0.0000 | \n",
" 26.0 | \n",
" 36 | \n",
" 36 | \n",
" 36.0000 | \n",
" 0.0000 | \n",
" 36.0 | \n",
" 10 | \n",
" 10 | \n",
" 10.0000 | \n",
" 0.0000 | \n",
" 10.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 100013 | \n",
" 0 | \n",
" 44 | \n",
" 25.7500 | \n",
" 20.7264 | \n",
" 29.5 | \n",
" 40 | \n",
" 69 | \n",
" 57.5000 | \n",
" 13.1276 | \n",
" 60.5 | \n",
" 0 | \n",
" 34 | \n",
" 19.7500 | \n",
" 14.5230 | \n",
" 22.5 | \n",
" 0 | \n",
" 3 | \n",
" 1.7500 | \n",
" 1.2583 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 40 | \n",
" 10.2500 | \n",
" 19.8389 | \n",
" 0.5 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" STATUS_C_sum_min STATUS_C_sum_max STATUS_C_sum_mean STATUS_C_sum_std STATUS_C_sum_median bureau_num_trans_min bureau_num_trans_max bureau_num_trans_mean bureau_num_trans_std bureau_num_trans_median STATUS_0_sum_min STATUS_0_sum_max STATUS_0_sum_mean STATUS_0_sum_std STATUS_0_sum_median STATUS_DPD_sum_min STATUS_DPD_sum_max STATUS_DPD_sum_mean STATUS_DPD_sum_std STATUS_DPD_sum_median STATUS_Sold_sum_min STATUS_Sold_sum_max STATUS_Sold_sum_mean STATUS_Sold_sum_std STATUS_Sold_sum_median STATUS_X_sum_min STATUS_X_sum_max STATUS_X_sum_mean STATUS_X_sum_std STATUS_X_sum_median STATUS_0_max_max STATUS_Sold_max_max STATUS_X_max_max STATUS_DPD_max_max STATUS_C_max_max\n",
"SK_ID_CURR \n",
"100001 0 44 15.7143 16.9972 18.0 2 52 24.5714 16.0505 29.0 1 12 4.4286 4.8941 2.0 0 1 0.1429 0.3780 0.0 0 0 0.0 0.0 0.0 0 9 4.2857 3.8173 6.0 1 0 1 1 1 \n",
"100002 0 13 2.8750 4.1897 2.0 4 22 13.7500 6.3640 16.0 2 18 5.6250 5.1807 5.0 0 6 3.3750 2.8754 4.0 0 0 0.0 0.0 0.0 0 3 1.8750 1.3562 2.5 1 0 1 1 1 \n",
"100005 0 5 1.6667 2.8868 0.0 3 13 7.0000 5.2915 5.0 2 7 4.6667 2.5166 5.0 0 0 0.0000 0.0000 0.0 0 0 0.0 0.0 0.0 0 1 0.6667 0.5774 1.0 1 0 1 0 1 \n",
"100010 26 26 26.0000 0.0000 26.0 36 36 36.0000 0.0000 36.0 10 10 10.0000 0.0000 10.0 0 0 0.0000 0.0000 0.0 0 0 0.0 0.0 0.0 0 0 0.0000 0.0000 0.0 1 0 0 0 1 \n",
"100013 0 44 25.7500 20.7264 29.5 40 69 57.5000 13.1276 60.5 0 34 19.7500 14.5230 22.5 0 3 1.7500 1.2583 2.0 0 0 0.0 0.0 0.0 0 40 10.2500 19.8389 0.5 1 0 1 1 1 "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pdf_feat = pdf_agg01.join(pdf_agg02)\n",
"print(pdf_feat.shape)\n",
"pdf_feat.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Store features completed!\n",
"CPU times: user 3.03 s, sys: 40.9 ms, total: 3.07 s\n",
"Wall time: 1.7 s\n"
]
}
],
"source": [
"%%time\n",
"fname = \"bureau_balance\"\n",
"# fname = \"bureau_balance_1year\"\n",
"# fname = \"bureau_balance_2year\"\n",
"# fname = \"bureau_balance_gt3year\"\n",
"# fname = \"bureau_balance_lt1year\"\n",
"\n",
"fname = os.path.join(\"features\", \"{}.pkl.bz2\".format(fname))\n",
"pdf_feat.to_pickle(fname, compression=\"bz2\")\n",
"print(\"Store features completed!\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}