{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Full width\n", "from IPython.core.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import math\n", "import os\n", "import subprocess\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "from IPython.display import display\n", "\n", "# \n", "from lib_modeling import *\n", "from lib_feature_engineering import *\n", "\n", "from sklearn import metrics\n", "from sklearn.model_selection import train_test_split\n", "\n", "# some settings for displaying Pandas results\n", "pd.set_option('display.width', 2000)\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.precision', 4)\n", "pd.set_option('display.max_colwidth', -1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRTARGET
01000021
11000030
21000040
31000060
41000070
\n", "
" ], "text/plain": [ " SK_ID_CURR TARGET\n", "0 100002 1 \n", "1 100003 0 \n", "2 100004 0 \n", "3 100006 0 \n", "4 100007 0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load train/test data\n", "data_path = \"home-credit-default-risk/application_train.csv\"\n", "pdf_train = pd.read_csv(data_path)\n", "\n", "data_path = \"home-credit-default-risk/application_test.csv\"\n", "pdf_test = pd.read_csv(data_path)\n", "\n", "# filter by tvt code\n", "pdf_tvt_extend = pd.read_pickle(\"pdf_tvt_extend.pkl\", compression=\"bz2\")\n", "pdf_train_filtered = (pdf_tvt_extend.query(\"tvt_code == 'train'\")\n", " .merge(pdf_train[[\"SK_ID_CURR\"]], on=\"SK_ID_CURR\")\n", " .drop(columns=[\"tvt_code\"]))\n", "pdf_train_filtered.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# load bureau + balance\n", "data_path = \"home-credit-default-risk/bureau.csv\"\n", "pdf_bureau = pd.read_csv(data_path)\n", "\n", "data_path = \"home-credit-default-risk/bureau_balance.csv\"\n", "pdf_bureau_balance = pd.read_csv(data_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Calculate intermediate data (idata)\n", "\n", "- status: binary, frequency\n", "- count: number bureau, number bureau transaction" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAD8CAYAAACyyUlaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAFtFJREFUeJzt3X+MXfV55/H3Exwaxw0xlDDy2mhNFSstxUoCI3AXKZqNu8ZAFPNHWYHYYBArryISkV1LjVOthJofEpWWpgGlSFZwMa0bwpIgW4mJYxGuupUCAUMaB5wsU+LFE7s4xECYZBvW6bN/3O8kN8P13B9fZs41vF/S1dzznO8538cjz3zm/JgzkZlIklTjTU03IEk6+RkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqLWq6gYVy5pln5sqVK4fa9mc/+xlLlix5bRt6DdjXYOxrcKPam30Npqavffv2PZ+Z7+g5MDPfEK8LLrggh/XQQw8Nve18sq/B2NfgRrU3+xpMTV/AY9nH91hPc0mSqhkmkqRqhokkqZphIkmqZphIkqr1FSYRsTQi7ouI70fEgYj4w4g4IyL2RsTT5ePpZWxExG0RMRkR342I8zv2s7GMfzoiNnbUL4iI/WWb2yIiSn3gOSRJC6/fI5PPAV/PzN8D3g0cALYAD2bmKuDBsgxwKbCqvDYBd0A7GICbgYuAC4GbZ8KhjNnUsd36Uh9oDklSM3qGSUScBrwPuBMgM1/JzBeBDcD2Mmw7cEV5vwG4u9yi/DCwNCKWAZcAezPzWGa+AOwF1pd1p2Xmt8o9zXfP2tcgc0iSGtDPkcnvAj8G/joinoiIL0TEEmAsM48AlI9nlfHLgUMd20+V2lz1qS51hphDktSAfh6nsgg4H/hoZj4SEZ/j16ebuokutRyiPpe+tomITbRPgzE2Nkar1eqx2+6OHnuJ23fsHGrbWquXv/2E66anp4f+N80n+xrMqPYFo9ubfQ1mIfrqJ0ymgKnMfKQs30c7TJ6LiGWZeaScYjraMf7sju1XAIdLfWJWvVXqK7qMZ4g5fkNmbgW2AoyPj+fExMTsIX25fcdObt3fzGPMDl4zccJ1rVaLYf9N88m+BjOqfcHo9mZfg1mIvnqe5srMfwYORcS7Smkt8BSwC5i5I2sjMPOj+y7g2nLH1RrgpXKKag+wLiJOLxfe1wF7yrqXI2JNuYvr2ln7GmQOSVID+v1x+6PAjog4FXgGuJ52EN0bETcAzwJXlrG7gcuASeDnZSyZeSwiPgU8WsZ9MjOPlfcfBu4CFgMPlBfALYPMIUlqRl9hkpnfAca7rFrbZWwCN55gP9uAbV3qjwHndan/ZNA5JEkLz9+AlyRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVW9TPoIg4CLwM/BI4npnjEXEG8CVgJXAQ+I+Z+UJEBPA54DLg58B1mfl42c9G4L+X3X46M7eX+gXAXcBiYDdwU2bmMHO83qzc8rUTrtu8+jjXzbG+xsFbLp+X/Up6fRrkyOTfZ+Z7MnO8LG8BHszMVcCDZRngUmBVeW0C7gAowXAzcBFwIXBzRJxetrmjjJ3Zbv0wc0iSmlFzmmsDsL283w5c0VG/O9seBpZGxDLgEmBvZh7LzBeAvcD6su60zPxWZiZw96x9DTKHJKkB/YZJAt+IiH0RsanUxjLzCED5eFapLwcOdWw7VWpz1ae61IeZQ5LUgL6umQAXZ+bhiDgL2BsR359jbHSp5RD1ufS1TQm+TQBjY2O0Wq0eu+1ubHH7+sSomc++hv1cAUxPT1dtP1/sa3Cj2pt9DWYh+uorTDLzcPl4NCLup33N47mIWJaZR8oppqNl+BRwdsfmK4DDpT4xq94q9RVdxjPEHLP73gpsBRgfH8+JiYnZQ/py+46d3Lq/39xdOJtXH5+3vg5eMzH0tq1Wi2E/1/PJvgY3qr3Z12AWoq+ep7kiYklEvG3mPbAO+B6wC9hYhm0Edpb3u4Bro20N8FI5RbUHWBcRp5cL7+uAPWXdyxGxptylde2sfQ0yhySpAf38WDsG3N/+Ps8i4O8y8+sR8Shwb0TcADwLXFnG76Z9y+4k7dt2rwfIzGMR8Sng0TLuk5l5rLz/ML++NfiB8gK4ZZA5JEnN6BkmmfkM8O4u9Z8Aa7vUE7jxBPvaBmzrUn8MOO+1mEOStPD8DXhJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUre8wiYhTIuKJiPhqWT4nIh6JiKcj4ksRcWqp/1ZZnizrV3bs4xOl/oOIuKSjvr7UJiNiS0d94DkkSQtvkCOTm4ADHct/Dnw2M1cBLwA3lPoNwAuZ+U7gs2UcEXEucBXwB8B64K9KQJ0CfB64FDgXuLqMHXgOSVIz+gqTiFgBXA58oSwH8H7gvjJkO3BFeb+hLFPWry3jNwD3ZOYvMvOHwCRwYXlNZuYzmfkKcA+wYcg5JEkN6PfI5C+BPwH+tSz/DvBiZh4vy1PA8vJ+OXAIoKx/qYz/VX3WNieqDzOHJKkBi3oNiIgPAEczc19ETMyUuwzNHutOVO8WaHON7zX/r0TEJmATwNjYGK1Wq8tmvY0ths2rj/ceuMDms69hP1cA09PTVdvPF/sa3Kj2Zl+DWYi+eoYJcDHwwYi4DHgLcBrtI5WlEbGoHBmsAA6X8VPA2cBURCwC3g4c66jP6NymW/35Ieb4DZm5FdgKMD4+nhMTE338c1/t9h07uXV/P5+qhbV59fF56+vgNRNDb9tqtRj2cz2f7Gtwo9qbfQ1mIfrqeZorMz+RmSsycyXtC+jfzMxrgIeAPy7DNgI7y/tdZZmy/puZmaV+VbkT6xxgFfBt4FFgVblz69Qyx66yzaBzSJIaUPNj7ceBeyLi08ATwJ2lfifwNxExSfto4SqAzHwyIu4FngKOAzdm5i8BIuIjwB7gFGBbZj45zBySpGYMFCaZ2QJa5f0ztO/Emj3mX4ArT7D9Z4DPdKnvBnZ3qQ88hyRp4fkb8JKkaqN3VVkjYeWWrw297ebVx7muYvuDt1w+9LaSmuGRiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKq+WwujZya54LNpdczw3wmmDQ8j0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRV6xkmEfGWiPh2RPxjRDwZEX9W6udExCMR8XREfCkiTi313yrLk2X9yo59faLUfxARl3TU15faZERs6agPPIckaeH1c2TyC+D9mflu4D3A+ohYA/w58NnMXAW8ANxQxt8AvJCZ7wQ+W8YREecCVwF/AKwH/ioiTomIU4DPA5cC5wJXl7EMOockqRk9wyTbpsvim8srgfcD95X6duCK8n5DWaasXxsRUer3ZOYvMvOHwCRwYXlNZuYzmfkKcA+woWwz6BySpAb09fdMytHDPuCdtI8i/gl4MTOPlyFTwPLyfjlwCCAzj0fES8DvlPrDHbvt3ObQrPpFZZtB53h+Vt+bgE0AY2NjtFqtfv65rzK2uP23MEaNfQ2mV1/D/v+oNT093djcvYxqb/Y1mIXoq68wycxfAu+JiKXA/cDvdxtWPnY7Qsg56t2OjuYaP9ccv1nI3ApsBRgfH8+JiYkum/V2+46d3Lp/9P6O2ObVx+1rAL36OnjNxMI106HVajHs/835Nqq92ddgFqKvge7myswXgRawBlgaETNfmSuAw+X9FHA2QFn/duBYZ33WNieqPz/EHJKkBvRzN9c7yhEJEbEY+CPgAPAQ8Mdl2EZgZ3m/qyxT1n8zM7PUryp3Yp0DrAK+DTwKrCp3bp1K+yL9rrLNoHNIkhrQz7mIZcD2ct3kTcC9mfnViHgKuCciPg08AdxZxt8J/E1ETNI+WrgKIDOfjIh7gaeA48CN5fQZEfERYA9wCrAtM58s+/r4IHNIkprRM0wy87vAe7vUn6F9J9bs+r8AV55gX58BPtOlvhvY/VrMIUlaeP4GvCSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSp2uj9OTypISu3fK2Ree9av6SReaXXkkcmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSarWM0wi4uyIeCgiDkTEkxFxU6mfERF7I+Lp8vH0Uo+IuC0iJiPiuxFxfse+NpbxT0fExo76BRGxv2xzW0TEsHNIkhZePw96PA5szszHI+JtwL6I2AtcBzyYmbdExBZgC/Bx4FJgVXldBNwBXBQRZwA3A+NAlv3syswXyphNwMPAbmA98EDZZ99z1H4ypCbs/9FLXNfQQyYP3nJ5I/Pq9afnkUlmHsnMx8v7l4EDwHJgA7C9DNsOXFHebwDuzraHgaURsQy4BNibmcdKgOwF1pd1p2XmtzIzgbtn7WuQOSRJDRjomklErATeCzwCjGXmEWgHDnBWGbYcONSx2VSpzVWf6lJniDkkSQ3o+++ZRMRvA18GPpaZPy2XNboO7VLLIepzttPPNhGxifbpM8bGxmi1Wj12293YYti8+vhQ284n+xqMfb1ar6+J6enpob9u5pN9DWYh+uorTCLizbSDZEdmfqWUn4uIZZl5pJxiOlrqU8DZHZuvAA6X+sSseqvUV3QZP8wcvyEztwJbAcbHx3NiYmL2kL7cvmMnt+4fvb8jtnn1cfsagH292sFrJuZc32q1GPbrZj7Z12AWoq9+7uYK4E7gQGb+RceqXcDMHVkbgZ0d9WvLHVdrgJfKKao9wLqIOL3clbUO2FPWvRwRa8pc187a1yBzSJIa0M+PQxcDHwL2R8R3Su1PgVuAeyPiBuBZ4MqybjdwGTAJ/By4HiAzj0XEp4BHy7hPZuax8v7DwF3AYtp3cT1Q6gPNIUlqRs8wycx/oPs1CoC1XcYncOMJ9rUN2Nal/hhwXpf6TwadQ5K08PwNeElSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVK10XvqnaQFs7LHH+XavPr4vPzhLv8o1+uPRyaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGo+NVjSguv1tOJeap5m7BOL54dHJpKkaoaJJKlazzCJiG0RcTQivtdROyMi9kbE0+Xj6aUeEXFbRExGxHcj4vyObTaW8U9HxMaO+gURsb9sc1tExLBzSJKa0c+RyV3A+lm1LcCDmbkKeLAsA1wKrCqvTcAd0A4G4GbgIuBC4OaZcChjNnVst36YOSRJzekZJpn598CxWeUNwPbyfjtwRUf97mx7GFgaEcuAS4C9mXksM18A9gLry7rTMvNbmZnA3bP2NcgckqSGDHs311hmHgHIzCMRcVapLwcOdYybKrW56lNd6sPMcWR2kxGxifbRC2NjY7RarcH+lTONLG7fPTJq7Gsw9jW4Ue2tpq/bd+x8jbv5tbHFJ97/6uVvn7d5e5menh76+1+/Xutbg6NLLYeoDzPHq4uZW4GtAOPj4zkxMdFj193dvmMnt+4fvbuoN68+bl8DsK/BjWpvJ2NfB6+ZWNhmOrRaLYb9/tevYe/mem7m1FL5eLTUp4CzO8atAA73qK/oUh9mDklSQ4YNk13AzB1ZG4GdHfVryx1Xa4CXyqmqPcC6iDi9XHhfB+wp616OiDXlLq5rZ+1rkDkkSQ3peZwYEV8EJoAzI2KK9l1ZtwD3RsQNwLPAlWX4buAyYBL4OXA9QGYei4hPAY+WcZ/MzJmL+h+mfcfYYuCB8mLQOSRJzekZJpl59QlWre0yNoEbT7CfbcC2LvXHgPO61H8y6BySpGb4G/CSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqqN3pPSJOl1qPbv3te4a/2SeZ/DIxNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1U7aMImI9RHxg4iYjIgtTfcjSW9kJ2WYRMQpwOeBS4Fzgasj4txmu5KkN66TMkyAC4HJzHwmM18B7gE2NNyTJL1hnaxhshw41LE8VWqSpAZEZjbdw8Ai4krgksz8z2X5Q8CFmfnRWeM2AZvK4ruAHww55ZnA80NuO5/sazD2NbhR7c2+BlPT17/NzHf0GrRoyJ03bQo4u2N5BXB49qDM3ApsrZ0sIh7LzPHa/bzW7Gsw9jW4Ue3NvgazEH2drKe5HgVWRcQ5EXEqcBWwq+GeJOkN66Q8MsnM4xHxEWAPcAqwLTOfbLgtSXrDOinDBCAzdwO7F2i66lNl88S+BmNfgxvV3uxrMPPe10l5AV6SNFpO1msmkqQRYpj0MIqPbYmIbRFxNCK+13QvnSLi7Ih4KCIORMSTEXFT0z0BRMRbIuLbEfGPpa8/a7qnThFxSkQ8ERFfbbqXGRFxMCL2R8R3IuKxpvuZERFLI+K+iPh++X/2hyPQ07vK52nm9dOI+FjTfQFExH8t/+e/FxFfjIi3zNtcnuY6sfLYlv8N/AfatyM/ClydmU813Nf7gGng7sw8r8leOkXEMmBZZj4eEW8D9gFXjMDnK4AlmTkdEW8G/gG4KTMfbrKvGRHx34Bx4LTM/EDT/UA7TIDxzByp35mIiO3A/8rML5Q7Od+amS823deM8j3jR8BFmfl/Gu5lOe3/6+dm5v+NiHuB3Zl513zM55HJ3EbysS2Z+ffAsab7mC0zj2Tm4+X9y8ABRuDJBNk2XRbfXF4j8VNURKwALge+0HQvoy4iTgPeB9wJkJmvjFKQFGuBf2o6SDosAhZHxCLgrXT5fbzXimEyNx/bMqSIWAm8F3ik2U7ayqmk7wBHgb2ZORJ9AX8J/Anwr003MksC34iIfeVJEqPgd4EfA39dTgt+ISKWNN3ULFcBX2y6CYDM/BHwP4BngSPAS5n5jfmazzCZW3SpjcRPtKMsIn4b+DLwscz8adP9AGTmLzPzPbSflnBhRDR+ejAiPgAczcx9TffSxcWZeT7tJ3PfWE6tNm0RcD5wR2a+F/gZMBLXMQHKabcPAv+z6V4AIuJ02mdSzgH+DbAkIv7TfM1nmMytr8e26NfKNYkvAzsy8ytN9zNbOS3SAtY33ArAxcAHy/WJe4D3R8TfNttSW2YeLh+PAvfTPuXbtClgquOo8j7a4TIqLgUez8znmm6k+CPgh5n548z8f8BXgH83X5MZJnPzsS0DKBe67wQOZOZfNN3PjIh4R0QsLe8X0/4i+36zXUFmfiIzV2TmStr/t76ZmfP2k2O/ImJJuYGCchppHdD4nYOZ+c/AoYh4VymtBRq9uWOWqxmRU1zFs8CaiHhr+dpcS/s65rw4aX8DfiGM6mNbIuKLwARwZkRMATdn5p3NdgW0f9L+ELC/XJ8A+NPytIImLQO2lztt3gTcm5kjcxvuCBoD7m9//2ER8HeZ+fVmW/qVjwI7yg93zwDXN9wPABHxVtp3ff6XpnuZkZmPRMR9wOPAceAJ5vE34b01WJJUzdNckqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKq/X8MZzNNeD8JbgAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "(pdf_bureau_balance[\"MONTHS_BALANCE\"] / -12.0).hist()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_BUREAUMONTHS_BALANCESTATUS
05715448-0.0000C
157154480.0833C
257154480.1667C
357154480.2500C
457154480.3333C
\n", "
" ], "text/plain": [ " SK_ID_BUREAU MONTHS_BALANCE STATUS\n", "0 5715448 -0.0000 C \n", "1 5715448 0.0833 C \n", "2 5715448 0.1667 C \n", "3 5715448 0.2500 C \n", "4 5715448 0.3333 C " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# filter here for looking up balance within 1, 2, 3 years\n", "pdf_bureau_balance[\"MONTHS_BALANCE\"] = pdf_bureau_balance[\"MONTHS_BALANCE\"] / -12\n", "# pdf_bureau_balance = pdf_bureau_balance[(pdf_bureau_balance[\"MONTHS_BALANCE\"] >= 1) & (pdf_bureau_balance[\"MONTHS_BALANCE\"] < 1)]\n", "# pdf_bureau_balance = pdf_bureau_balance[(pdf_bureau_balance[\"MONTHS_BALANCE\"] >= 2) & (pdf_bureau_balance[\"MONTHS_BALANCE\"] < 3)]\n", "# pdf_bureau_balance = pdf_bureau_balance[pdf_bureau_balance[\"MONTHS_BALANCE\"] >= 3]\n", "# pdf_bureau_balance = pdf_bureau_balance[pdf_bureau_balance[\"MONTHS_BALANCE\"] < 1]\n", "pdf_bureau_balance.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "C 13646993\n", "0 7499507 \n", "X 5810482 \n", "1 242347 \n", "5 62406 \n", "2 23419 \n", "3 8924 \n", "4 5847 \n", "Name: STATUS, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_bureau_balance[\"STATUS\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "C 13646993\n", "0 7499507 \n", "X 5810482 \n", "DPD 280537 \n", "Sold 62406 \n", "Name: STATUS, dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# status in (1, 2, 3, 4) is DPD (date past due). \n", "# 5 is sold. C is closed. X is unknown. 0 is no DPD\n", "pdf_bureau_balance.loc[pdf_bureau_balance[\"STATUS\"].isin(['1', '2', '3', '4']), \"STATUS\"] = \"DPD\"\n", "pdf_bureau_balance.loc[pdf_bureau_balance[\"STATUS\"] == '5', \"STATUS\"] = \"Sold\"\n", "pdf_bureau_balance[\"STATUS\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_BUREAUSTATUS_CSTATUS_0STATUS_XSTATUS_DPDSTATUS_Sold
0571544810000
1571544810000
2571544810000
3571544810000
4571544810000
\n", "
" ], "text/plain": [ " SK_ID_BUREAU STATUS_C STATUS_0 STATUS_X STATUS_DPD STATUS_Sold\n", "0 5715448 1 0 0 0 0 \n", "1 5715448 1 0 0 0 0 \n", "2 5715448 1 0 0 0 0 \n", "3 5715448 1 0 0 0 0 \n", "4 5715448 1 0 0 0 0 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1min 1s, sys: 5.32 s, total: 1min 6s\n", "Wall time: 51.5 s\n" ] } ], "source": [ "%%time\n", "dict_onehot = {\n", " \"STATUS\": ['C', '0', 'X', 'DPD', 'Sold'],\n", "}\n", "pdf_onehot = gen_one_hot_feat(pdf_bureau_balance, dict_onehot, main_key=\"SK_ID_BUREAU\")\n", "display(pdf_onehot.head())" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'STATUS_0': ['max', 'sum', 'mean', 'std'],\n", " 'STATUS_C': ['max', 'sum', 'mean', 'std'],\n", " 'STATUS_DPD': ['max', 'sum', 'mean', 'std'],\n", " 'STATUS_Sold': ['max', 'sum', 'mean', 'std'],\n", " 'STATUS_X': ['max', 'sum', 'mean', 'std']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (817395, 20)\n", "CPU times: user 14.3 s, sys: 1.81 s, total: 16.2 s\n", "Wall time: 6.52 s\n" ] } ], "source": [ "%%time\n", "pdf_idata01 = agg_common_data(pdf_onehot, [\"max\", \"sum\", \"mean\", \"std\"], main_key=\"SK_ID_BUREAU\")\n", "pdf_idata01.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bureau_num_trans
SK_ID_BUREAU
500170997
500171083
50017114
500171219
500171322
\n", "
" ], "text/plain": [ " bureau_num_trans\n", "SK_ID_BUREAU \n", "5001709 97 \n", "5001710 83 \n", "5001711 4 \n", "5001712 19 \n", "5001713 22 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# \n", "pdf_idata02 = pdf_bureau_balance.groupby(\"SK_ID_BUREAU\").size().to_frame(\"bureau_num_trans\")\n", "pdf_idata02.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_BUREAUSTATUS_Sold_maxSTATUS_Sold_sumSTATUS_Sold_meanSTATUS_Sold_stdSTATUS_X_maxSTATUS_X_sumSTATUS_X_meanSTATUS_X_stdSTATUS_C_maxSTATUS_C_sumSTATUS_C_meanSTATUS_C_stdSTATUS_DPD_maxSTATUS_DPD_sumSTATUS_DPD_meanSTATUS_DPD_stdSTATUS_0_maxSTATUS_0_sumSTATUS_0_meanSTATUS_0_stdbureau_num_trans
05001709000.00.01110.11340.31871860.88660.3187000.00.0000.00000.000097
15001710000.00.01300.36140.48331480.57830.4968000.00.0150.06020.239483
25001711000.00.0110.25000.5000000.00000.0000000.00.0130.75000.50004
35001712000.00.0000.00000.0000190.47370.5130000.00.01100.52630.513019
45001713000.00.01221.00000.0000000.00000.0000000.00.0000.00000.000022
\n", "
" ], "text/plain": [ " SK_ID_BUREAU STATUS_Sold_max STATUS_Sold_sum STATUS_Sold_mean STATUS_Sold_std STATUS_X_max STATUS_X_sum STATUS_X_mean STATUS_X_std STATUS_C_max STATUS_C_sum STATUS_C_mean STATUS_C_std STATUS_DPD_max STATUS_DPD_sum STATUS_DPD_mean STATUS_DPD_std STATUS_0_max STATUS_0_sum STATUS_0_mean STATUS_0_std bureau_num_trans\n", "0 5001709 0 0 0.0 0.0 1 11 0.1134 0.3187 1 86 0.8866 0.3187 0 0 0.0 0.0 0 0 0.0000 0.0000 97 \n", "1 5001710 0 0 0.0 0.0 1 30 0.3614 0.4833 1 48 0.5783 0.4968 0 0 0.0 0.0 1 5 0.0602 0.2394 83 \n", "2 5001711 0 0 0.0 0.0 1 1 0.2500 0.5000 0 0 0.0000 0.0000 0 0 0.0 0.0 1 3 0.7500 0.5000 4 \n", "3 5001712 0 0 0.0 0.0 0 0 0.0000 0.0000 1 9 0.4737 0.5130 0 0 0.0 0.0 1 10 0.5263 0.5130 19 \n", "4 5001713 0 0 0.0 0.0 1 22 1.0000 0.0000 0 0 0.0000 0.0000 0 0 0.0 0.0 0 0 0.0000 0.0000 22 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_idata = pdf_idata01.join(pdf_idata02).reset_index()\n", "pdf_idata.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Feature engineering" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(774354, 23)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRSK_ID_BUREAUSTATUS_Sold_maxSTATUS_Sold_sumSTATUS_Sold_meanSTATUS_Sold_stdSTATUS_X_maxSTATUS_X_sumSTATUS_X_meanSTATUS_X_stdSTATUS_C_maxSTATUS_C_sumSTATUS_C_meanSTATUS_C_stdSTATUS_DPD_maxSTATUS_DPD_sumSTATUS_DPD_meanSTATUS_DPD_stdSTATUS_0_maxSTATUS_0_sumSTATUS_0_meanSTATUS_0_stdbureau_num_trans
01000026158904000.00.0110.04550.2132120.09090.2942110.04550.21321180.81820.394822
11000026158905000.00.0000.00000.00001130.81250.4031000.00000.0000130.18750.403116
21000026158906000.00.0130.18750.4031120.12500.3416160.37500.5000150.31250.478716
31000026158907000.00.0130.18750.4031120.12500.3416160.37500.5000150.31250.478716
41000026158908000.00.0000.00000.0000000.00000.0000120.50000.5774120.50000.57744
\n", "
" ], "text/plain": [ " SK_ID_CURR SK_ID_BUREAU STATUS_Sold_max STATUS_Sold_sum STATUS_Sold_mean STATUS_Sold_std STATUS_X_max STATUS_X_sum STATUS_X_mean STATUS_X_std STATUS_C_max STATUS_C_sum STATUS_C_mean STATUS_C_std STATUS_DPD_max STATUS_DPD_sum STATUS_DPD_mean STATUS_DPD_std STATUS_0_max STATUS_0_sum STATUS_0_mean STATUS_0_std bureau_num_trans\n", "0 100002 6158904 0 0 0.0 0.0 1 1 0.0455 0.2132 1 2 0.0909 0.2942 1 1 0.0455 0.2132 1 18 0.8182 0.3948 22 \n", "1 100002 6158905 0 0 0.0 0.0 0 0 0.0000 0.0000 1 13 0.8125 0.4031 0 0 0.0000 0.0000 1 3 0.1875 0.4031 16 \n", "2 100002 6158906 0 0 0.0 0.0 1 3 0.1875 0.4031 1 2 0.1250 0.3416 1 6 0.3750 0.5000 1 5 0.3125 0.4787 16 \n", "3 100002 6158907 0 0 0.0 0.0 1 3 0.1875 0.4031 1 2 0.1250 0.3416 1 6 0.3750 0.5000 1 5 0.3125 0.4787 16 \n", "4 100002 6158908 0 0 0.0 0.0 0 0 0.0000 0.0000 0 0 0.0000 0.0000 1 2 0.5000 0.5774 1 2 0.5000 0.5774 4 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Join application to bureau balance\n", "pdf_ids = pd.concat([pdf_train[[\"SK_ID_CURR\"]], pdf_test[[\"SK_ID_CURR\"]]])\n", "pdf_data = (pdf_ids.merge(pdf_bureau[[\"SK_ID_CURR\", \"SK_ID_BUREAU\"]], on=\"SK_ID_CURR\")\n", " .merge(pdf_idata, on=\"SK_ID_BUREAU\"))\n", "print(pdf_data.shape)\n", "pdf_data.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRSTATUS_Sold_sumSTATUS_X_sumSTATUS_C_sumSTATUS_DPD_sumSTATUS_0_sumbureau_num_trans
010000201211822
110000200130316
21000020326516
31000020326516
4100002000224
\n", "
" ], "text/plain": [ " SK_ID_CURR STATUS_Sold_sum STATUS_X_sum STATUS_C_sum STATUS_DPD_sum STATUS_0_sum bureau_num_trans\n", "0 100002 0 1 2 1 18 22 \n", "1 100002 0 0 13 0 3 16 \n", "2 100002 0 3 2 6 5 16 \n", "3 100002 0 3 2 6 5 16 \n", "4 100002 0 0 0 2 2 4 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# statistics on status_sum, bureau_num_trans\n", "ls_cols = [cname for cname in pdf_data.columns if \"_sum\" in cname] + [\"bureau_num_trans\"]\n", "pdf_stats = pdf_data[[\"SK_ID_CURR\"] + ls_cols]\n", "pdf_stats.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'STATUS_0_sum': ['min', 'max', 'mean', 'std', 'median'],\n", " 'STATUS_C_sum': ['min', 'max', 'mean', 'std', 'median'],\n", " 'STATUS_DPD_sum': ['min', 'max', 'mean', 'std', 'median'],\n", " 'STATUS_Sold_sum': ['min', 'max', 'mean', 'std', 'median'],\n", " 'STATUS_X_sum': ['min', 'max', 'mean', 'std', 'median'],\n", " 'bureau_num_trans': ['min', 'max', 'mean', 'std', 'median']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (134542, 30)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATUS_C_sum_minSTATUS_C_sum_maxSTATUS_C_sum_meanSTATUS_C_sum_stdSTATUS_C_sum_medianbureau_num_trans_minbureau_num_trans_maxbureau_num_trans_meanbureau_num_trans_stdbureau_num_trans_medianSTATUS_0_sum_minSTATUS_0_sum_maxSTATUS_0_sum_meanSTATUS_0_sum_stdSTATUS_0_sum_medianSTATUS_DPD_sum_minSTATUS_DPD_sum_maxSTATUS_DPD_sum_meanSTATUS_DPD_sum_stdSTATUS_DPD_sum_medianSTATUS_Sold_sum_minSTATUS_Sold_sum_maxSTATUS_Sold_sum_meanSTATUS_Sold_sum_stdSTATUS_Sold_sum_medianSTATUS_X_sum_minSTATUS_X_sum_maxSTATUS_X_sum_meanSTATUS_X_sum_stdSTATUS_X_sum_median
SK_ID_CURR
10000104415.714316.997218.025224.571416.050529.01124.42864.89412.0010.14290.37800.0000.00.00.0094.28573.81736.0
1000020132.87504.18972.042213.75006.364016.02185.62505.18075.0063.37502.87544.0000.00.00.0031.87501.35622.5
100005051.66672.88680.03137.00005.29155.0274.66672.51665.0000.00000.00000.0000.00.00.0010.66670.57741.0
100010262626.00000.000026.0363636.00000.000036.0101010.00000.000010.0000.00000.00000.0000.00.00.0000.00000.00000.0
10001304425.750020.726429.5406957.500013.127660.503419.750014.523022.5031.75001.25832.0000.00.00.004010.250019.83890.5
\n", "
" ], "text/plain": [ " STATUS_C_sum_min STATUS_C_sum_max STATUS_C_sum_mean STATUS_C_sum_std STATUS_C_sum_median bureau_num_trans_min bureau_num_trans_max bureau_num_trans_mean bureau_num_trans_std bureau_num_trans_median STATUS_0_sum_min STATUS_0_sum_max STATUS_0_sum_mean STATUS_0_sum_std STATUS_0_sum_median STATUS_DPD_sum_min STATUS_DPD_sum_max STATUS_DPD_sum_mean STATUS_DPD_sum_std STATUS_DPD_sum_median STATUS_Sold_sum_min STATUS_Sold_sum_max STATUS_Sold_sum_mean STATUS_Sold_sum_std STATUS_Sold_sum_median STATUS_X_sum_min STATUS_X_sum_max STATUS_X_sum_mean STATUS_X_sum_std STATUS_X_sum_median\n", "SK_ID_CURR \n", "100001 0 44 15.7143 16.9972 18.0 2 52 24.5714 16.0505 29.0 1 12 4.4286 4.8941 2.0 0 1 0.1429 0.3780 0.0 0 0 0.0 0.0 0.0 0 9 4.2857 3.8173 6.0 \n", "100002 0 13 2.8750 4.1897 2.0 4 22 13.7500 6.3640 16.0 2 18 5.6250 5.1807 5.0 0 6 3.3750 2.8754 4.0 0 0 0.0 0.0 0.0 0 3 1.8750 1.3562 2.5 \n", "100005 0 5 1.6667 2.8868 0.0 3 13 7.0000 5.2915 5.0 2 7 4.6667 2.5166 5.0 0 0 0.0000 0.0000 0.0 0 0 0.0 0.0 0.0 0 1 0.6667 0.5774 1.0 \n", "100010 26 26 26.0000 0.0000 26.0 36 36 36.0000 0.0000 36.0 10 10 10.0000 0.0000 10.0 0 0 0.0000 0.0000 0.0 0 0 0.0 0.0 0.0 0 0 0.0000 0.0000 0.0 \n", "100013 0 44 25.7500 20.7264 29.5 40 69 57.5000 13.1276 60.5 0 34 19.7500 14.5230 22.5 0 3 1.7500 1.2583 2.0 0 0 0.0 0.0 0.0 0 40 10.2500 19.8389 0.5 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_agg01 = agg_common_data(pdf_stats, [\"min\", \"max\", \"mean\", \"std\", \"median\"])\n", "pdf_agg01.head()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
7bureau_num_trans_mean0.5928-0.07851.0000
9bureau_num_trans_median0.5899-0.07441.0000
6bureau_num_trans_max0.5728-0.06681.0000
2STATUS_C_sum_mean0.5717-0.06181.0000
5bureau_num_trans_min0.5708-0.04591.0000
4STATUS_C_sum_median0.5632-0.05591.0000
1STATUS_C_sum_max0.5625-0.05911.0000
3STATUS_C_sum_std0.5596-0.05420.8679
8bureau_num_trans_std0.5523-0.04580.8679
12STATUS_0_sum_mean0.5426-0.03871.0000
17STATUS_DPD_sum_mean0.54140.03291.0000
16STATUS_DPD_sum_max0.54010.02501.0000
11STATUS_0_sum_max0.5392-0.03461.0000
18STATUS_DPD_sum_std0.53800.02330.8679
14STATUS_0_sum_median0.5347-0.03281.0000
27STATUS_X_sum_mean0.5279-0.03021.0000
13STATUS_0_sum_std0.5266-0.02980.8679
26STATUS_X_sum_max0.5264-0.02981.0000
19STATUS_DPD_sum_median0.52560.03261.0000
28STATUS_X_sum_std0.5237-0.02940.8679
0STATUS_C_sum_min0.5226-0.02291.0000
29STATUS_X_sum_median0.5153-0.02241.0000
15STATUS_DPD_sum_min0.51440.02551.0000
10STATUS_0_sum_min0.5130-0.01731.0000
22STATUS_Sold_sum_mean0.50450.01891.0000
21STATUS_Sold_sum_max0.50450.01231.0000
25STATUS_X_sum_min0.5045-0.01201.0000
23STATUS_Sold_sum_std0.50380.00840.8679
24STATUS_Sold_sum_median0.50370.01901.0000
20STATUS_Sold_sum_min0.50210.01731.0000
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "7 bureau_num_trans_mean 0.5928 -0.0785 1.0000 \n", "9 bureau_num_trans_median 0.5899 -0.0744 1.0000 \n", "6 bureau_num_trans_max 0.5728 -0.0668 1.0000 \n", "2 STATUS_C_sum_mean 0.5717 -0.0618 1.0000 \n", "5 bureau_num_trans_min 0.5708 -0.0459 1.0000 \n", "4 STATUS_C_sum_median 0.5632 -0.0559 1.0000 \n", "1 STATUS_C_sum_max 0.5625 -0.0591 1.0000 \n", "3 STATUS_C_sum_std 0.5596 -0.0542 0.8679 \n", "8 bureau_num_trans_std 0.5523 -0.0458 0.8679 \n", "12 STATUS_0_sum_mean 0.5426 -0.0387 1.0000 \n", "17 STATUS_DPD_sum_mean 0.5414 0.0329 1.0000 \n", "16 STATUS_DPD_sum_max 0.5401 0.0250 1.0000 \n", "11 STATUS_0_sum_max 0.5392 -0.0346 1.0000 \n", "18 STATUS_DPD_sum_std 0.5380 0.0233 0.8679 \n", "14 STATUS_0_sum_median 0.5347 -0.0328 1.0000 \n", "27 STATUS_X_sum_mean 0.5279 -0.0302 1.0000 \n", "13 STATUS_0_sum_std 0.5266 -0.0298 0.8679 \n", "26 STATUS_X_sum_max 0.5264 -0.0298 1.0000 \n", "19 STATUS_DPD_sum_median 0.5256 0.0326 1.0000 \n", "28 STATUS_X_sum_std 0.5237 -0.0294 0.8679 \n", "0 STATUS_C_sum_min 0.5226 -0.0229 1.0000 \n", "29 STATUS_X_sum_median 0.5153 -0.0224 1.0000 \n", "15 STATUS_DPD_sum_min 0.5144 0.0255 1.0000 \n", "10 STATUS_0_sum_min 0.5130 -0.0173 1.0000 \n", "22 STATUS_Sold_sum_mean 0.5045 0.0189 1.0000 \n", "21 STATUS_Sold_sum_max 0.5045 0.0123 1.0000 \n", "25 STATUS_X_sum_min 0.5045 -0.0120 1.0000 \n", "23 STATUS_Sold_sum_std 0.5038 0.0084 0.8679 \n", "24 STATUS_Sold_sum_median 0.5037 0.0190 1.0000 \n", "20 STATUS_Sold_sum_min 0.5021 0.0173 1.0000 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "eval_agg01 = feature_evaluate(pdf_train_filtered, pdf_agg01.reset_index())\n", "display(eval_agg01)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRSTATUS_Sold_maxSTATUS_X_maxSTATUS_C_maxSTATUS_DPD_maxSTATUS_0_max
010000201111
110000200101
210000201111
310000201111
410000200011
\n", "
" ], "text/plain": [ " SK_ID_CURR STATUS_Sold_max STATUS_X_max STATUS_C_max STATUS_DPD_max STATUS_0_max\n", "0 100002 0 1 1 1 1 \n", "1 100002 0 0 1 0 1 \n", "2 100002 0 1 1 1 1 \n", "3 100002 0 1 1 1 1 \n", "4 100002 0 0 0 1 1 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# keep binary values\n", "ls_cols = [cname for cname in pdf_data.columns if \"_max\" in cname]\n", "pdf_binary = pdf_data[[\"SK_ID_CURR\"] + ls_cols]\n", "pdf_binary.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'STATUS_0_max': ['max'],\n", " 'STATUS_C_max': ['max'],\n", " 'STATUS_DPD_max': ['max'],\n", " 'STATUS_Sold_max': ['max'],\n", " 'STATUS_X_max': ['max']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (134542, 5)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATUS_0_max_maxSTATUS_Sold_max_maxSTATUS_X_max_maxSTATUS_DPD_max_maxSTATUS_C_max_max
SK_ID_CURR
10000110111
10000210111
10000510101
10001010001
10001310111
\n", "
" ], "text/plain": [ " STATUS_0_max_max STATUS_Sold_max_max STATUS_X_max_max STATUS_DPD_max_max STATUS_C_max_max\n", "SK_ID_CURR \n", "100001 1 0 1 1 1 \n", "100002 1 0 1 1 1 \n", "100005 1 0 1 0 1 \n", "100010 1 0 0 0 1 \n", "100013 1 0 1 1 1 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_agg02 = agg_common_data(pdf_binary, [\"max\"])\n", "pdf_agg02.head()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
3STATUS_DPD_max_max0.53780.04401.0
4STATUS_C_max_max0.5246-0.03351.0
1STATUS_Sold_max_max0.50450.01611.0
2STATUS_X_max_max0.50200.00271.0
0STATUS_0_max_max0.50010.00031.0
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "3 STATUS_DPD_max_max 0.5378 0.0440 1.0 \n", "4 STATUS_C_max_max 0.5246 -0.0335 1.0 \n", "1 STATUS_Sold_max_max 0.5045 0.0161 1.0 \n", "2 STATUS_X_max_max 0.5020 0.0027 1.0 \n", "0 STATUS_0_max_max 0.5001 0.0003 1.0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "eval_agg02 = feature_evaluate(pdf_train_filtered, pdf_agg02.reset_index())\n", "display(eval_agg02)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# save features" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(134542, 35)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATUS_C_sum_minSTATUS_C_sum_maxSTATUS_C_sum_meanSTATUS_C_sum_stdSTATUS_C_sum_medianbureau_num_trans_minbureau_num_trans_maxbureau_num_trans_meanbureau_num_trans_stdbureau_num_trans_medianSTATUS_0_sum_minSTATUS_0_sum_maxSTATUS_0_sum_meanSTATUS_0_sum_stdSTATUS_0_sum_medianSTATUS_DPD_sum_minSTATUS_DPD_sum_maxSTATUS_DPD_sum_meanSTATUS_DPD_sum_stdSTATUS_DPD_sum_medianSTATUS_Sold_sum_minSTATUS_Sold_sum_maxSTATUS_Sold_sum_meanSTATUS_Sold_sum_stdSTATUS_Sold_sum_medianSTATUS_X_sum_minSTATUS_X_sum_maxSTATUS_X_sum_meanSTATUS_X_sum_stdSTATUS_X_sum_medianSTATUS_0_max_maxSTATUS_Sold_max_maxSTATUS_X_max_maxSTATUS_DPD_max_maxSTATUS_C_max_max
SK_ID_CURR
10000104415.714316.997218.025224.571416.050529.01124.42864.89412.0010.14290.37800.0000.00.00.0094.28573.81736.010111
1000020132.87504.18972.042213.75006.364016.02185.62505.18075.0063.37502.87544.0000.00.00.0031.87501.35622.510111
100005051.66672.88680.03137.00005.29155.0274.66672.51665.0000.00000.00000.0000.00.00.0010.66670.57741.010101
100010262626.00000.000026.0363636.00000.000036.0101010.00000.000010.0000.00000.00000.0000.00.00.0000.00000.00000.010001
10001304425.750020.726429.5406957.500013.127660.503419.750014.523022.5031.75001.25832.0000.00.00.004010.250019.83890.510111
\n", "
" ], "text/plain": [ " STATUS_C_sum_min STATUS_C_sum_max STATUS_C_sum_mean STATUS_C_sum_std STATUS_C_sum_median bureau_num_trans_min bureau_num_trans_max bureau_num_trans_mean bureau_num_trans_std bureau_num_trans_median STATUS_0_sum_min STATUS_0_sum_max STATUS_0_sum_mean STATUS_0_sum_std STATUS_0_sum_median STATUS_DPD_sum_min STATUS_DPD_sum_max STATUS_DPD_sum_mean STATUS_DPD_sum_std STATUS_DPD_sum_median STATUS_Sold_sum_min STATUS_Sold_sum_max STATUS_Sold_sum_mean STATUS_Sold_sum_std STATUS_Sold_sum_median STATUS_X_sum_min STATUS_X_sum_max STATUS_X_sum_mean STATUS_X_sum_std STATUS_X_sum_median STATUS_0_max_max STATUS_Sold_max_max STATUS_X_max_max STATUS_DPD_max_max STATUS_C_max_max\n", "SK_ID_CURR \n", "100001 0 44 15.7143 16.9972 18.0 2 52 24.5714 16.0505 29.0 1 12 4.4286 4.8941 2.0 0 1 0.1429 0.3780 0.0 0 0 0.0 0.0 0.0 0 9 4.2857 3.8173 6.0 1 0 1 1 1 \n", "100002 0 13 2.8750 4.1897 2.0 4 22 13.7500 6.3640 16.0 2 18 5.6250 5.1807 5.0 0 6 3.3750 2.8754 4.0 0 0 0.0 0.0 0.0 0 3 1.8750 1.3562 2.5 1 0 1 1 1 \n", "100005 0 5 1.6667 2.8868 0.0 3 13 7.0000 5.2915 5.0 2 7 4.6667 2.5166 5.0 0 0 0.0000 0.0000 0.0 0 0 0.0 0.0 0.0 0 1 0.6667 0.5774 1.0 1 0 1 0 1 \n", "100010 26 26 26.0000 0.0000 26.0 36 36 36.0000 0.0000 36.0 10 10 10.0000 0.0000 10.0 0 0 0.0000 0.0000 0.0 0 0 0.0 0.0 0.0 0 0 0.0000 0.0000 0.0 1 0 0 0 1 \n", "100013 0 44 25.7500 20.7264 29.5 40 69 57.5000 13.1276 60.5 0 34 19.7500 14.5230 22.5 0 3 1.7500 1.2583 2.0 0 0 0.0 0.0 0.0 0 40 10.2500 19.8389 0.5 1 0 1 1 1 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_feat = pdf_agg01.join(pdf_agg02)\n", "print(pdf_feat.shape)\n", "pdf_feat.head()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Store features completed!\n", "CPU times: user 3.03 s, sys: 40.9 ms, total: 3.07 s\n", "Wall time: 1.7 s\n" ] } ], "source": [ "%%time\n", "fname = \"bureau_balance\"\n", "# fname = \"bureau_balance_1year\"\n", "# fname = \"bureau_balance_2year\"\n", "# fname = \"bureau_balance_gt3year\"\n", "# fname = \"bureau_balance_lt1year\"\n", "\n", "fname = os.path.join(\"features\", \"{}.pkl.bz2\".format(fname))\n", "pdf_feat.to_pickle(fname, compression=\"bz2\")\n", "print(\"Store features completed!\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15" } }, "nbformat": 4, "nbformat_minor": 2 }