{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Full width\n", "from IPython.core.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import math\n", "import os\n", "import subprocess\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "from IPython.display import display\n", "\n", "# \n", "from lib_modeling import *\n", "from lib_feature_engineering import *\n", "\n", "# some settings for displaying Pandas results\n", "pd.set_option('display.width', 2000)\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.precision', 4)\n", "pd.set_option('display.max_colwidth', -1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load data\n", "\n", "- Load train + tvt = train_filtered for features evaluation\n", "- Load train/test for applying mean encoding" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRTARGET
01000021
11000030
21000040
31000060
41000070
\n", "
" ], "text/plain": [ " SK_ID_CURR TARGET\n", "0 100002 1 \n", "1 100003 0 \n", "2 100004 0 \n", "3 100006 0 \n", "4 100007 0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load train/test data\n", "data_path = \"home-credit-default-risk/application_train.csv\"\n", "pdf_train = pd.read_csv(data_path)\n", "\n", "data_path = \"home-credit-default-risk/application_test.csv\"\n", "pdf_test = pd.read_csv(data_path)\n", "\n", "# filter by tvt code\n", "pdf_tvt_extend = pd.read_pickle(\"pdf_tvt_extend.pkl\", compression=\"bz2\")\n", "pdf_train_filtered = (pdf_tvt_extend.query(\"tvt_code == 'train'\")\n", " .merge(pdf_train[[\"SK_ID_CURR\"]], on=\"SK_ID_CURR\")\n", " .drop(columns=[\"tvt_code\"]))\n", "pdf_train_filtered.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(13605401, 8)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_PREVSK_ID_CURRNUM_INSTALMENT_VERSIONNUM_INSTALMENT_NUMBERDAYS_INSTALMENTDAYS_ENTRY_PAYMENTAMT_INSTALMENTAMT_PAYMENT
010541861616741.06-1180.0-1187.06948.3606948.360
113308311516390.034-2156.0-2156.01716.5251716.525
220852311930532.01-63.0-63.025425.00025425.000
324525271996971.03-2418.0-2426.024350.13024350.130
427147241677561.02-1383.0-1366.02165.0402160.585
\n", "
" ], "text/plain": [ " SK_ID_PREV SK_ID_CURR NUM_INSTALMENT_VERSION NUM_INSTALMENT_NUMBER DAYS_INSTALMENT DAYS_ENTRY_PAYMENT AMT_INSTALMENT AMT_PAYMENT\n", "0 1054186 161674 1.0 6 -1180.0 -1187.0 6948.360 6948.360 \n", "1 1330831 151639 0.0 34 -2156.0 -2156.0 1716.525 1716.525 \n", "2 2085231 193053 2.0 1 -63.0 -63.0 25425.000 25425.000 \n", "3 2452527 199697 1.0 3 -2418.0 -2426.0 24350.130 24350.130 \n", "4 2714724 167756 1.0 2 -1383.0 -1366.0 2165.040 2160.585 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load previous application\n", "data_path = \"home-credit-default-risk/installments_payments.csv\"\n", "pdf_data = pd.read_csv(data_path)\n", "print(pdf_data.shape)\n", "pdf_data.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# load meta data\n", "meta_path = \"../02_pandas/reports/report_installments_payments.csv\"\n", "pdf_meta = pd.read_csv(meta_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Preprocessing" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAD8CAYAAACyyUlaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAFRRJREFUeJzt3X+s3XV9x/Hne61oLUOq6E1Hm5VljRnaDOEGupGZqzgoYCwmskCcVGWpMWB0azKryYITTVgydIEoWScdxVU6hpo2Wlcb5MaZCPJDZsHquMMOLu2o2orUn7vuvT/O57rj9fSee86np99T7vORnJxz3ufz+X7evVzu635/nHMjM5EkqcZvNN2AJOnEZ5hIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSaq2sOkGjpfTTjstV6xY0dfcH/3oRyxevPjYNnQM2Fdv7Ks3w9oXDG9vz8W+Hnzwwe9l5ku7DszMeXE755xzsl/33HNP33MHyb56Y1+9Gda+Moe3t+diX8ADOYefsR7mkiRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFWbNx+nUmPPU8/w1o2fb2TtfTdc2si6ktQL90wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklSta5hExPKIuCci9kbEoxHx7lL/QEQ8FREPl9slbXPeFxETEfHtiLiorb6m1CYiYmNb/YyIuC8iHouIf46Ik0r9+eX5RHl9Rbc1JEnH31z2TKaADZn5e8Bq4JqIOLO89tHMPKvcdgKU164AXgGsAT4eEQsiYgHwMeBi4Ezgyrbt/E3Z1krgMHB1qV8NHM7M3wU+WsYddY2+vwqSpCpdwyQzD2TmQ+Xxs8Be4PRZpqwFtmXmzzLzO8AEcG65TWTm45n5c2AbsDYiAngtcFeZvwW4rG1bW8rju4ALyvijrSFJakBPf7a3HGZ6FXAfcD5wbURcBTxAa+/lMK2gubdt2iT/Hz5PzqifB7wE+EFmTnUYf/r0nMyciohnyvjZ1nhOWTHLnwvesGpqYH9O2D8XLKkXcw6TiDgZ+DTwnsz8YUTcAlwPZLm/EXg7EB2mJ533gnKW8czy2mxz2nteD6wHGBkZYXx8vMO07kYWtX5wD5tB9tXv1wrgyJEjVfMHxb56M6x9wfD2Np/7mlOYRMTzaAXJ1sz8DEBmPt32+j8AnytPJ4HlbdOXAfvL40717wGnRsTCsnfSPn56W5MRsRB4EXCoyxq/lJmbgE0Ao6OjOTY2Npd/7q+5eet2btzT007ccbFh1dTA+tr35rG+546Pj9Pv13qQ7Ks3w9oXDG9v87mvuVzNFcCtwN7M/EhbfWnbsDcCj5THO4ArypVYZwArga8B9wMry5VbJ9E6gb4jMxO4B3hTmb8O2N62rXXl8ZuAL5XxR1tDktSAufxaez7wFmBPRDxcau+ndTXWWbQOL+0D3gGQmY9GxJ3AN2ldCXZNZv4CICKuBXYBC4DNmflo2d57gW0R8SHg67TCi3L/yYiYoLVHckW3NSRJx1/XMMnMr9D5HMXOWeZ8GPhwh/rOTvMy83E6XI2VmT8FLu9lDUnS8ec74CVJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdW6hklELI+IeyJib0Q8GhHvLvUXR8TuiHis3C8p9YiImyJiIiK+ERFnt21rXRn/WESsa6ufExF7ypybIiL6XUOSdPzNZc9kCtiQmb8HrAauiYgzgY3A3Zm5Eri7PAe4GFhZbuuBW6AVDMB1wHnAucB10+FQxqxvm7em1HtaQ5LUjK5hkpkHMvOh8vhZYC9wOrAW2FKGbQEuK4/XArdny73AqRGxFLgI2J2ZhzLzMLAbWFNeOyUzv5qZCdw+Y1u9rCFJasDCXgZHxArgVcB9wEhmHoBW4ETEy8qw04En26ZNltps9ckOdfpY48CMftfT2nNhZGSE8fHxXv65vzSyCDasmupr7iANsq9+v1YAR44cqZo/KPbVm2HtC4a3t/nc15zDJCJOBj4NvCczf1hOa3Qc2qGWfdRnbWcuczJzE7AJYHR0NMfGxrpstrObt27nxj095e5xsWHV1MD62vfmsb7njo+P0+/XepDsqzfD2hcMb2/zua85Xc0VEc+jFSRbM/Mzpfz09KGlcn+w1CeB5W3TlwH7u9SXdaj3s4YkqQFzuZorgFuBvZn5kbaXdgDTV2StA7a31a8qV1ytBp4ph6p2ARdGxJJy4v1CYFd57dmIWF3WumrGtnpZQ5LUgLkcIzkfeAuwJyIeLrX3AzcAd0bE1cATwOXltZ3AJcAE8GPgbQCZeSgirgfuL+M+mJmHyuN3ArcBi4AvlBu9riFJakbXMMnMr9D5HAXABR3GJ3DNUba1Gdjcof4A8MoO9e/3uoYk6fjzHfCSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqdrwfUaIhsKKjZ/ve+6GVVO8tWL+vhsu7XuupGa4ZyJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmq1jVMImJzRByMiEfaah+IiKci4uFyu6TttfdFxEREfDsiLmqrrym1iYjY2FY/IyLui4jHIuKfI+KkUn9+eT5RXl/RbQ1JUjPmsmdyG7CmQ/2jmXlWue0EiIgzgSuAV5Q5H4+IBRGxAPgYcDFwJnBlGQvwN2VbK4HDwNWlfjVwODN/F/hoGXfUNXr7Z0uSjqWuYZKZXwYOzXF7a4FtmfmzzPwOMAGcW24Tmfl4Zv4c2AasjYgAXgvcVeZvAS5r29aW8vgu4IIy/mhrSJIaUnPO5NqI+EY5DLak1E4HnmwbM1lqR6u/BPhBZk7NqP/Ktsrrz5TxR9uWJKkhC/ucdwtwPZDl/kbg7UB0GJt0Dq2cZTyzvDbbnF8REeuB9QAjIyOMj493GtbVyCLYsGqq+8Dj7LnaV7//nbo5cuTIwLZdw756N6y9zee++gqTzHx6+nFE/APwufJ0EljeNnQZsL887lT/HnBqRCwsex/t46e3NRkRC4EX0TrcNtsaM/vcBGwCGB0dzbGxsZ7+ndNu3rqdG/f0m7uDs2HV1HOyr31vHjt2zbQZHx+n3++BQbKv3g1rb/O5r74Oc0XE0ranbwSmr/TaAVxRrsQ6A1gJfA24H1hZrtw6idYJ9B2ZmcA9wJvK/HXA9rZtrSuP3wR8qYw/2hqSpIZ0/fUxIu4AxoDTImISuA4Yi4izaB1e2ge8AyAzH42IO4FvAlPANZn5i7Kda4FdwAJgc2Y+WpZ4L7AtIj4EfB24tdRvBT4ZERO09kiu6LaGJKkZXcMkM6/sUL61Q216/IeBD3eo7wR2dqg/ToersTLzp8DlvawhSWqG74CXJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSteH7LA7Neys2fn4g292waoq3zrLtfTdcOpB1pfnAPRNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1/9KiVAzqLzx2s2HVFGONrCwdO+6ZSJKqGSaSpGpdwyQiNkfEwYh4pK324ojYHRGPlfslpR4RcVNETETENyLi7LY568r4xyJiXVv9nIjYU+bcFBHR7xqSpGbMZc/kNmDNjNpG4O7MXAncXZ4DXAysLLf1wC3QCgbgOuA84FzguulwKGPWt81b088akqTmdA2TzPwycGhGeS2wpTzeAlzWVr89W+4FTo2IpcBFwO7MPJSZh4HdwJry2imZ+dXMTOD2GdvqZQ1JUkP6PWcykpkHAMr9y0r9dODJtnGTpTZbfbJDvZ81JEkNOdaXBkeHWvZR72eNXx8YsZ7WoTBGRkYYHx/vsunORha1Lt8cNvbVm2Huq9/vzUE6cuTIUPYFw9vbfO6r3zB5OiKWZuaBcojpYKlPAsvbxi0D9pf62Iz6eKkv6zC+nzV+TWZuAjYBjI6O5tjYWKdhXd28dTs37hm+t+RsWDVlXz0Y5r7+pM/vzUEaHx+n3/9nBm1Ye5vPffV7mGsHMH1F1jpge1v9qnLF1WrgmXKIahdwYUQsKSfeLwR2ldeejYjV5Squq2Zsq5c1JEkN6fprWkTcQWuv4rSImKR1VdYNwJ0RcTXwBHB5Gb4TuASYAH4MvA0gMw9FxPXA/WXcBzNz+qT+O2ldMbYI+EK50esa0omsqXff77vh0kbW1XNP1zDJzCuP8tIFHcYmcM1RtrMZ2Nyh/gDwyg717/e6hiSpGb4DXpJUzTCRJFUbvktbJB03s52r2bBqirc2dC6nm5rePE80GO6ZSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkan6ciqR5ZZAf9z/bx7w81z/GxT0TSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVqwqTiNgXEXsi4uGIeKDUXhwRuyPisXK/pNQjIm6KiImI+EZEnN22nXVl/GMRsa6tfk7Z/kSZG7OtIUlqxrHYM3lNZp6VmaPl+Ubg7sxcCdxdngNcDKwst/XALdAKBuA64DzgXOC6tnC4pYydnremyxqSpAYM4jDXWmBLebwFuKytfnu23AucGhFLgYuA3Zl5KDMPA7uBNeW1UzLzq5mZwO0zttVpDUlSA6L1c7rPyRHfAQ4DCfx9Zm6KiB9k5qltYw5n5pKI+BxwQ2Z+pdTvBt4LjAEvyMwPlfpfAT8Bxsv415X6HwHvzczXH22NDv2tp7Vnw8jIyDnbtm3r69958NAzPP2TvqYO1Mgi7KsH9tWbYe0Lhre3Ye3rjBct4OSTT+5r7mte85oH2448HVXt34A/PzP3R8TLgN0R8a1ZxkaHWvZRn7PM3ARsAhgdHc2xsbFepv/SzVu3c+Oe2i/Vsbdh1ZR99cC+ejOsfcHw9jasfd22ZjH9/vybq6rDXJm5v9wfBD5L65zH0+UQFeX+YBk+CSxvm74M2N+lvqxDnVnWkCQ1oO8wiYjFEfGb04+BC4FHgB3A9BVZ64Dt5fEO4KpyVddq4JnMPADsAi6MiCXlxPuFwK7y2rMRsbpcxXXVjG11WkOS1ICa/bER4LPlat2FwKcy818j4n7gzoi4GngCuLyM3wlcAkwAPwbeBpCZhyLieuD+Mu6DmXmoPH4ncBuwCPhCuQHccJQ1JEkN6DtMMvNx4Pc71L8PXNChnsA1R9nWZmBzh/oDwCvnuoYkqRm+A16SVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lStRM6TCJiTUR8OyImImJj0/1I0nx1woZJRCwAPgZcDJwJXBkRZzbblSTNTydsmADnAhOZ+Xhm/hzYBqxtuCdJmpdO5DA5HXiy7flkqUmSjrPIzKZ76EtEXA5clJl/Vp6/BTg3M9/VNmY9sL48fTnw7T6XOw34XkW7g2JfvbGv3gxrXzC8vT0X+/rtzHxpt0EL+9z4MJgElrc9Xwbsbx+QmZuATbULRcQDmTlau51jzb56Y1+9Gda+YHh7m899nciHue4HVkbEGRFxEnAFsKPhniRpXjph90wycyoirgV2AQuAzZn5aMNtSdK8dMKGCUBm7gR2Hoelqg+VDYh99ca+ejOsfcHw9jZv+zphT8BLkobHiXzORJI0JAyTLobxI1siYnNEHIyIR5rupV1ELI+IeyJib0Q8GhHvbrongIh4QUR8LSL+vfT110331C4iFkTE1yPic033Mi0i9kXEnoh4OCIeaLqfaRFxakTcFRHfKt9nfzAEPb28fJ2mbz+MiPc03RdARPx5+Z5/JCLuiIgXDGwtD3MdXfnIlv8A/pjWpcj3A1dm5jcb7uvVwBHg9sx8ZZO9tIuIpcDSzHwoIn4TeBC4bAi+XgEszswjEfE84CvAuzPz3ib7mhYRfwGMAqdk5uub7gdaYQKMZuZQvWciIrYA/5aZnyhXcb4wM3/QdF/Tys+Mp4DzMvO/Gu7ldFrf62dm5k8i4k5gZ2beNoj13DOZ3VB+ZEtmfhk41HQfM2Xmgcx8qDx+FtjLEHwqQbYcKU+fV25D8VtURCwDLgU+0XQvwy4iTgFeDdwKkJk/H6YgKS4A/rPpIGmzEFgUEQuBFzLjvXjHkmEyOz+ypU8RsQJ4FXBfs520lENJDwMHgd2ZORR9AX8H/CXwv003MkMCX4yIB8snSQyD3wG+C/xjOSz4iYhY3HRTM1wB3NF0EwCZ+RTwt8ATwAHgmcz84qDWM0xmFx1qQ/Eb7TCLiJOBTwPvycwfNt0PQGb+IjPPovVJCedGROOHByPi9cDBzHyw6V46OD8zz6b1qdzXlEOrTVsInA3ckpmvAn4EDMV5TIBy2O0NwL803QtARCyhdSTlDOC3gMUR8aeDWs8wmV3Xj2zRryrnJD4NbM3MzzTdz0zlsMg4sKbhVgDOB95Qzk9sA14bEf/UbEstmbm/3B8EPkvrkG/TJoHJtr3Ku2iFy7C4GHgoM59uupHidcB3MvO7mfk/wGeAPxzUYobJ7PzIlh6UE923Ansz8yNN9zMtIl4aEaeWx4to/U/2rWa7gsx8X2Yuy8wVtL63vpSZA/vNca4iYnG5gIJyGOlCoPErBzPzv4EnI+LlpXQB0OjFHTNcyZAc4iqeAFZHxAvL/5sX0DqPORAn9DvgB21YP7IlIu4AxoDTImISuC4zb222K6D1m/ZbgD3l/ATA+8snFTRpKbClXGnzG8CdmTk0l+EOoRHgs62fPywEPpWZ/9psS7/0LmBr+eXuceBtDfcDQES8kNZVn+9oupdpmXlfRNwFPARMAV9ngO+E99JgSVI1D3NJkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSar2f/rOwhz0kS/SAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "(pdf_data[\"DAYS_INSTALMENT\"] / -365).hist()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# convert to years\n", "pdf_data[\"DAYS_INSTALMENT_TO_YEARS\"] = pdf_data[\"DAYS_INSTALMENT\"] / -365\n", "pdf_data.drop(columns=[\"DAYS_INSTALMENT\"], inplace=True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 1.3605e+07\n", "mean 1.8364e+02\n", "std 1.9125e+04\n", "min -2.4247e+06\n", "25% 0.0000e+00\n", "50% 0.0000e+00\n", "75% 0.0000e+00\n", "max 2.6309e+06\n", "Name: diff_amount, dtype: float64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# calculate different amount\n", "pdf_data[\"AMT_PAYMENT\"].fillna(0, inplace=True)\n", "pdf_data[\"diff_amount\"] = pdf_data[\"AMT_PAYMENT\"] - pdf_data[\"AMT_INSTALMENT\"]\n", "pdf_data[\"diff_amount\"].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## filter data here for looking up balance within years" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# pdf_data = pdf_data[(pdf_data[\"DAYS_INSTALMENT_TO_YEARS\"] >= 1) & (pdf_data[\"DAYS_INSTALMENT_TO_YEARS\"] < 2)]\n", "# pdf_data = pdf_data[(pdf_data[\"DAYS_INSTALMENT_TO_YEARS\"] >= 2) & (pdf_data[\"DAYS_INSTALMENT_TO_YEARS\"] < 3)]\n", "# pdf_data = pdf_data[pdf_data[\"DAYS_INSTALMENT_TO_YEARS\"] >= 3]\n", "# print(pdf_data.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Numerical features" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'NUM_INSTALMENT_NUMBER': ['max', 'min', 'sum', 'mean', 'std']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (339587, 5)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
2NUM_INSTALMENT_NUMBER_sum0.5124-0.01771.0000
3NUM_INSTALMENT_NUMBER_mean0.5071-0.00991.0000
4NUM_INSTALMENT_NUMBER_std0.50490.00310.9975
0NUM_INSTALMENT_NUMBER_max0.50180.00441.0000
1NUM_INSTALMENT_NUMBER_min0.5001-0.00101.0000
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "2 NUM_INSTALMENT_NUMBER_sum 0.5124 -0.0177 1.0000 \n", "3 NUM_INSTALMENT_NUMBER_mean 0.5071 -0.0099 1.0000 \n", "4 NUM_INSTALMENT_NUMBER_std 0.5049 0.0031 0.9975 \n", "0 NUM_INSTALMENT_NUMBER_max 0.5018 0.0044 1.0000 \n", "1 NUM_INSTALMENT_NUMBER_min 0.5001 -0.0010 1.0000 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 7.51 s, sys: 1.2 s, total: 8.71 s\n", "Wall time: 2.66 s\n" ] } ], "source": [ "%%time\n", "pdf_agg02 = agg_common_data(pdf_data[[\"SK_ID_CURR\", \"NUM_INSTALMENT_NUMBER\"]], [\"max\", \"min\", \"sum\", \"mean\", \"std\"], main_key=\"SK_ID_CURR\")\n", "eval_agg02 = feature_evaluate(pdf_train_filtered, pdf_agg02)\n", "display(eval_agg02)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Continuous features" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['NUM_INSTALMENT_VERSION',\n", " 'DAYS_ENTRY_PAYMENT',\n", " 'AMT_INSTALMENT',\n", " 'AMT_PAYMENT',\n", " 'DAYS_INSTALMENT_TO_YEARS',\n", " 'diff_amount']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get list continuous attributes\n", "# ls_con = pdf_meta.query(\"sub_type == 'float64'\")[\"name\"].tolist()\n", "series_type = pdf_data.dtypes\n", "ls_con = series_type[series_type == \"float64\"].index.tolist()\n", "ls_con" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_PREVSK_ID_CURRNUM_INSTALMENT_VERSIONDAYS_ENTRY_PAYMENTAMT_INSTALMENTAMT_PAYMENTDAYS_INSTALMENT_TO_YEARSdiff_amount
010541861616741.0-1187.06948.3606948.3603.23290.000
113308311516390.0-2156.01716.5251716.5255.90680.000
220852311930532.0-63.025425.00025425.0000.17260.000
324525271996971.0-2426.024350.13024350.1306.62470.000
427147241677561.0-1366.02165.0402160.5853.7890-4.455
\n", "
" ], "text/plain": [ " SK_ID_PREV SK_ID_CURR NUM_INSTALMENT_VERSION DAYS_ENTRY_PAYMENT AMT_INSTALMENT AMT_PAYMENT DAYS_INSTALMENT_TO_YEARS diff_amount\n", "0 1054186 161674 1.0 -1187.0 6948.360 6948.360 3.2329 0.000 \n", "1 1330831 151639 0.0 -2156.0 1716.525 1716.525 5.9068 0.000 \n", "2 2085231 193053 2.0 -63.0 25425.000 25425.000 0.1726 0.000 \n", "3 2452527 199697 1.0 -2426.0 24350.130 24350.130 6.6247 0.000 \n", "4 2714724 167756 1.0 -1366.0 2165.040 2160.585 3.7890 -4.455 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_con = pdf_data[[\"SK_ID_PREV\", \"SK_ID_CURR\"] + ls_con].copy()\n", "pdf_con.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "{'AMT_INSTALMENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_PAYMENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'DAYS_ENTRY_PAYMENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'DAYS_INSTALMENT_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'NUM_INSTALMENT_VERSION': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'diff_amount': ['max', 'min', 'sum', 'mean', 'std']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (339587, 30)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
1DAYS_ENTRY_PAYMENT_min0.56310.05971.0000
15DAYS_INSTALMENT_TO_YEARS_max0.5628-0.05951.0000
19DAYS_INSTALMENT_TO_YEARS_std0.5619-0.05900.9975
4DAYS_ENTRY_PAYMENT_std0.5610-0.05850.9974
3DAYS_ENTRY_PAYMENT_mean0.55600.04471.0000
6AMT_PAYMENT_min0.5554-0.02441.0000
18DAYS_INSTALMENT_TO_YEARS_mean0.5553-0.04421.0000
13diff_amount_mean0.5552-0.03061.0000
2DAYS_ENTRY_PAYMENT_sum0.54960.03521.0000
17DAYS_INSTALMENT_TO_YEARS_sum0.5492-0.03501.0000
12diff_amount_sum0.5490-0.02911.0000
7AMT_PAYMENT_sum0.5448-0.02601.0000
21AMT_INSTALMENT_min0.5447-0.01951.0000
8AMT_PAYMENT_mean0.5431-0.02341.0000
27NUM_INSTALMENT_VERSION_sum0.5411-0.03101.0000
11diff_amount_min0.5397-0.02021.0000
22AMT_INSTALMENT_sum0.5391-0.02121.0000
23AMT_INSTALMENT_mean0.5353-0.01841.0000
28NUM_INSTALMENT_VERSION_mean0.5329-0.02811.0000
26NUM_INSTALMENT_VERSION_min0.5259-0.03251.0000
14diff_amount_std0.5224-0.01210.9975
5AMT_PAYMENT_max0.52190.00051.0000
20AMT_INSTALMENT_max0.52120.00141.0000
10diff_amount_max0.5150-0.01611.0000
25NUM_INSTALMENT_VERSION_max0.5142-0.01961.0000
24AMT_INSTALMENT_std0.51170.00080.9975
0DAYS_ENTRY_PAYMENT_max0.5109-0.00161.0000
29NUM_INSTALMENT_VERSION_std0.5078-0.01070.9975
9AMT_PAYMENT_std0.5068-0.00130.9975
16DAYS_INSTALMENT_TO_YEARS_min0.50310.00241.0000
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "1 DAYS_ENTRY_PAYMENT_min 0.5631 0.0597 1.0000 \n", "15 DAYS_INSTALMENT_TO_YEARS_max 0.5628 -0.0595 1.0000 \n", "19 DAYS_INSTALMENT_TO_YEARS_std 0.5619 -0.0590 0.9975 \n", "4 DAYS_ENTRY_PAYMENT_std 0.5610 -0.0585 0.9974 \n", "3 DAYS_ENTRY_PAYMENT_mean 0.5560 0.0447 1.0000 \n", "6 AMT_PAYMENT_min 0.5554 -0.0244 1.0000 \n", "18 DAYS_INSTALMENT_TO_YEARS_mean 0.5553 -0.0442 1.0000 \n", "13 diff_amount_mean 0.5552 -0.0306 1.0000 \n", "2 DAYS_ENTRY_PAYMENT_sum 0.5496 0.0352 1.0000 \n", "17 DAYS_INSTALMENT_TO_YEARS_sum 0.5492 -0.0350 1.0000 \n", "12 diff_amount_sum 0.5490 -0.0291 1.0000 \n", "7 AMT_PAYMENT_sum 0.5448 -0.0260 1.0000 \n", "21 AMT_INSTALMENT_min 0.5447 -0.0195 1.0000 \n", "8 AMT_PAYMENT_mean 0.5431 -0.0234 1.0000 \n", "27 NUM_INSTALMENT_VERSION_sum 0.5411 -0.0310 1.0000 \n", "11 diff_amount_min 0.5397 -0.0202 1.0000 \n", "22 AMT_INSTALMENT_sum 0.5391 -0.0212 1.0000 \n", "23 AMT_INSTALMENT_mean 0.5353 -0.0184 1.0000 \n", "28 NUM_INSTALMENT_VERSION_mean 0.5329 -0.0281 1.0000 \n", "26 NUM_INSTALMENT_VERSION_min 0.5259 -0.0325 1.0000 \n", "14 diff_amount_std 0.5224 -0.0121 0.9975 \n", "5 AMT_PAYMENT_max 0.5219 0.0005 1.0000 \n", "20 AMT_INSTALMENT_max 0.5212 0.0014 1.0000 \n", "10 diff_amount_max 0.5150 -0.0161 1.0000 \n", "25 NUM_INSTALMENT_VERSION_max 0.5142 -0.0196 1.0000 \n", "24 AMT_INSTALMENT_std 0.5117 0.0008 0.9975 \n", "0 DAYS_ENTRY_PAYMENT_max 0.5109 -0.0016 1.0000 \n", "29 NUM_INSTALMENT_VERSION_std 0.5078 -0.0107 0.9975 \n", "9 AMT_PAYMENT_std 0.5068 -0.0013 0.9975 \n", "16 DAYS_INSTALMENT_TO_YEARS_min 0.5031 0.0024 1.0000 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 20.7 s, sys: 864 ms, total: 21.6 s\n", "Wall time: 7.93 s\n" ] } ], "source": [ "%%time\n", "pdf_agg03 = agg_common_data(pdf_con[[\"SK_ID_CURR\"] + ls_con], [\"max\", \"min\", \"sum\", \"mean\", \"std\"], main_key=\"SK_ID_CURR\")\n", "eval_agg03 = feature_evaluate(pdf_train_filtered, pdf_agg03)\n", "display(eval_agg03)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Save features" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(339587, 35)\n" ] } ], "source": [ "pdf_feat = pdf_agg02.join(pdf_agg03)\n", "print(pdf_feat.shape)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Store features completed!\n", "CPU times: user 8.74 s, sys: 160 ms, total: 8.9 s\n", "Wall time: 7.55 s\n" ] } ], "source": [ "%%time\n", "fname = \"installments_payments\"\n", "# fname = \"installments_payments_in1year\"\n", "# fname = \"installments_payments_in2year\"\n", "# fname = \"installments_payments_gt3year\"\n", "\n", "fname = os.path.join(\"features\", \"{}.pkl.bz2\".format(fname))\n", "pdf_feat.to_pickle(fname, compression=\"bz2\")\n", "print(\"Store features completed!\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15" } }, "nbformat": 4, "nbformat_minor": 2 }