{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import subprocess\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from IPython.display import display\n",
    "# from lib_feature_engineering import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['baseline_extend.pkl.bz2',\n",
       " 'baseline.pkl.bz2',\n",
       " 'bureau_balance_1year.pkl.bz2',\n",
       " 'bureau_balance_2year.pkl.bz2',\n",
       " 'bureau_balance_gt3year.pkl.bz2',\n",
       " 'bureau_balance_lt1year.pkl.bz2',\n",
       " 'bureau_balance.pkl.bz2',\n",
       " 'bureau.pkl.bz2',\n",
       " 'credit_card_balance.pkl.bz2',\n",
       " 'installments_payments_gt3year.pkl.bz2',\n",
       " 'installments_payments_in1year.pkl.bz2',\n",
       " 'installments_payments_in2year.pkl.bz2',\n",
       " 'installments_payments.pkl.bz2',\n",
       " 'mean_encoding_feat_cat.pkl.bz2',\n",
       " 'pdf_features_label.csv.bz2',\n",
       " 'pos_cash_gt3year.pkl.bz2',\n",
       " 'pos_cash_in1year.pkl.bz2',\n",
       " 'pos_cash_in2year.pkl.bz2',\n",
       " 'pos_cash.pkl.bz2',\n",
       " 'prev_app.pkl.bz2']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the feature files available in the 'features' folder.\n",
    "# os.listdir replaces the former `ls` subprocess call: it needs no external\n",
    "# program and returns plain str names. Dotfiles are skipped, as `ls` does,\n",
    "# and the names are sorted so the listing is deterministic.\n",
    "sorted(name for name in os.listdir('features') if not name.startswith('.'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature files selected for joining.\n",
    "# NOTE: 'mean_encoding_feat_cat.pkl.bz2' and 'pdf_features_label.csv.bz2' appear in\n",
    "# the folder listing above but are not part of this list.\n",
    "ls_feat_file = [\n",
    "    'baseline.pkl.bz2',\n",
    "    'baseline_extend.pkl.bz2',\n",
    "    'bureau_balance_1year.pkl.bz2',\n",
    "    'bureau_balance_2year.pkl.bz2',\n",
    "    'bureau_balance_gt3year.pkl.bz2',\n",
    "    'bureau_balance_lt1year.pkl.bz2',\n",
    "    'bureau_balance.pkl.bz2',\n",
    "    'bureau.pkl.bz2',\n",
    "    'credit_card_balance.pkl.bz2',\n",
    "    'installments_payments_gt3year.pkl.bz2',\n",
    "    'installments_payments_in1year.pkl.bz2',\n",
    "    'installments_payments_in2year.pkl.bz2',\n",
    "    'installments_payments.pkl.bz2',\n",
    "    'pos_cash_gt3year.pkl.bz2',\n",
    "    'pos_cash_in1year.pkl.bz2',\n",
    "    'pos_cash_in2year.pkl.bz2',\n",
    "    'pos_cash.pkl.bz2',\n",
    "    'prev_app.pkl.bz2'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('baseline_extend.pkl.bz2', (356255, 77))\n",
      "('bureau_balance_1year.pkl.bz2', (123107, 35))\n",
      "('bureau_balance_2year.pkl.bz2', (110354, 35))\n",
      "('bureau_balance_gt3year.pkl.bz2', (99247, 35))\n",
      "('bureau_balance_lt1year.pkl.bz2', (132250, 35))\n",
      "('bureau_balance.pkl.bz2', (134542, 35))\n",
      "('bureau.pkl.bz2', (305811, 87))\n",
      "('credit_card_balance.pkl.bz2', (103558, 111))\n",
      "('installments_payments_gt3year.pkl.bz2', (209639, 35))\n",
      "('installments_payments_in1year.pkl.bz2', (238405, 35))\n",
      "('installments_payments_in2year.pkl.bz2', (171713, 35))\n",
      "('installments_payments.pkl.bz2', (339587, 35))\n",
      "('pos_cash_gt3year.pkl.bz2', (212897, 31))\n",
      "('pos_cash_in1year.pkl.bz2', (239502, 31))\n",
      "('pos_cash_in2year.pkl.bz2', (171658, 33))\n",
      "('pos_cash.pkl.bz2', (337252, 36))\n",
      "('prev_app.pkl.bz2', (338857, 244))\n",
      "('rows, columns', (356255, 1042))\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "
\n", " | SK_ID_CURR | \n", "NAME_INCOME_TYPE_Working | \n", "is_REGION_RATING_CLIENT_W_CITY | \n", "is_REGION_RATING_CLIENT | \n", "is_CODE_GENDER | \n", "NAME_EDUCATION_TYPE_Higher_education | \n", "NAME_EDUCATION_TYPE_Secondary___secondary_special | \n", "is_REG_CITY_NOT_WORK_CITY | \n", "is_FLAG_DOCUMENT_3 | \n", "HOUSETYPE_MODE_block_of_flats | \n", "... | \n", "prev_app_DAYS_FIRST_DUE_TO_YEARS_min | \n", "prev_app_AMT_DOWN_PAYMENT_min | \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_mean | \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_min | \n", "prev_app_DAYS_LAST_DUE_TO_YEARS_min | \n", "prev_app_DAYS_TERMINATION_TO_YEARS_min | \n", "prev_app_RATE_DOWN_PAYMENT_min | \n", "prev_app_AMT_CREDIT_std | \n", "prev_app_RATE_INTEREST_PRIMARY_mean | \n", "prev_app_SK_ID_PREV | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "100002 | \n", "1 | \n", "2 | \n", "2 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "... | \n", "1.547945 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.068493 | \n", "0.046575 | \n", "0.000000 | \n", "NaN | \n", "0.189122 | \n", "1.0 | \n", "
1 | \n", "100003 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "... | \n", "1.961644 | \n", "0.0 | \n", "0.666667 | \n", "0.0 | \n", "1.468493 | \n", "1.443836 | \n", "0.000000 | \n", "497949.861808 | \n", "0.189122 | \n", "3.0 | \n", "
2 | \n", "100004 | \n", "1 | \n", "2 | \n", "2 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "2.147945 | \n", "4860.0 | \n", "0.000000 | \n", "0.0 | \n", "1.983562 | \n", "1.956164 | \n", "0.212008 | \n", "NaN | \n", "0.189122 | \n", "1.0 | \n", "
3 | \n", "100006 | \n", "1 | \n", "2 | \n", "2 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "... | \n", "0.413699 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.413699 | \n", "0.391781 | \n", "0.051605 | \n", "333337.354853 | \n", "0.189122 | \n", "9.0 | \n", "
4 | \n", "100007 | \n", "1 | \n", "2 | \n", "2 | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "... | \n", "0.942466 | \n", "0.0 | \n", "0.600000 | \n", "0.0 | \n", "0.969863 | \n", "0.950685 | \n", "0.051605 | \n", "118032.409509 | \n", "0.189122 | \n", "6.0 | \n", "
5 rows × 1042 columns
\n", "\n", " | SK_ID_CURR | \n", "TARGET | \n", "tvt_code | \n", "
---|---|---|---|
0 | \n", "100002 | \n", "1 | \n", "train | \n", "
1 | \n", "100003 | \n", "0 | \n", "train | \n", "
2 | \n", "100004 | \n", "0 | \n", "train | \n", "
3 | \n", "100006 | \n", "0 | \n", "train | \n", "
4 | \n", "100007 | \n", "0 | \n", "train | \n", "
\n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "
---|---|---|---|---|---|
SK_ID_CURR | \n", "100002 | \n", "100003 | \n", "100004 | \n", "100006 | \n", "100007 | \n", "
TARGET | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
tvt_code | \n", "train | \n", "train | \n", "train | \n", "train | \n", "train | \n", "
NAME_INCOME_TYPE_Working | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "1 | \n", "
is_REGION_RATING_CLIENT_W_CITY | \n", "2 | \n", "1 | \n", "2 | \n", "2 | \n", "2 | \n", "
is_REGION_RATING_CLIENT | \n", "2 | \n", "1 | \n", "2 | \n", "2 | \n", "2 | \n", "
is_CODE_GENDER | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "
NAME_EDUCATION_TYPE_Higher_education | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
NAME_EDUCATION_TYPE_Secondary___secondary_special | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "1 | \n", "
is_REG_CITY_NOT_WORK_CITY | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
is_FLAG_DOCUMENT_3 | \n", "1 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "
HOUSETYPE_MODE_block_of_flats | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
NAME_INCOME_TYPE_Pensioner | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
ORGANIZATION_TYPE_XNA | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
is_FLAG_EMP_PHONE | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
OCCUPATION_TYPE_Laborers | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "
WALLSMATERIAL_MODE_Panel | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
is_LIVE_CITY_NOT_WORK_CITY | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
NAME_FAMILY_STATUS_Married | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
is_FLAG_WORK_PHONE | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "
is_FLAG_PHONE | \n", "1 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "
is_FLAG_OWN_CAR | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "
ORGANIZATION_TYPE_Self_employed | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
ORGANIZATION_TYPE_Business_Entity_Type_3 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "
NAME_FAMILY_STATUS_Single___not_married | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "
FONDKAPREMONT_MODE_reg_oper_account | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
is_NAME_CONTRACT_TYPE | \n", "1 | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "
NAME_HOUSING_TYPE_House___apartment | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
is_FLAG_DOCUMENT_6 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
OCCUPATION_TYPE_Drivers | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
prev_app_DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_std | \n", "NaN | \n", "2.34238 | \n", "NaN | \n", "2.15334 | \n", "2.55569 | \n", "
prev_app_AMT_APPLICATION_max | \n", "179055 | \n", "900000 | \n", "24282 | \n", "688500 | \n", "247500 | \n", "
prev_app_AMT_GOODS_PRICE_max | \n", "179055 | \n", "900000 | \n", "24282 | \n", "688500 | \n", "247500 | \n", "
prev_app_AMT_ANNUITY_std | \n", "NaN | \n", "46332.6 | \n", "NaN | \n", "15995.2 | \n", "8063.59 | \n", "
prev_app_RATE_DOWN_PAYMENT_sum | \n", "0 | \n", "0.151666 | \n", "0.212008 | \n", "0.68806 | \n", "0.525453 | \n", "
prev_app_NFLAG_INSURED_ON_APPROVAL_sum | \n", "0 | \n", "2 | \n", "0 | \n", "0 | \n", "3 | \n", "
prev_app_DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_min | \n", "-0.342466 | \n", "1.05753 | \n", "1.90137 | \n", "-3.44932 | \n", "-0.947945 | \n", "
prev_app_RATE_INTEREST_PRIVILEGED_mean | \n", "0.835095 | \n", "0.835095 | \n", "0.835095 | \n", "0.835095 | \n", "0.835095 | \n", "
prev_app_AMT_APPLICATION_sum | \n", "179055 | \n", "1.30631e+06 | \n", "24282 | \n", "2.44983e+06 | \n", "903182 | \n", "
prev_app_AMT_GOODS_PRICE_sum | \n", "179055 | \n", "1.30631e+06 | \n", "24282 | \n", "2.44983e+06 | \n", "903182 | \n", "
prev_app_AMT_CREDIT_max | \n", "179055 | \n", "1.03588e+06 | \n", "20106 | \n", "906615 | \n", "284400 | \n", "
prev_app_NFLAG_INSURED_ON_APPROVAL_max | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
prev_app_RATE_INTEREST_PRIMARY_sum | \n", "0.189122 | \n", "0.567367 | \n", "0.189122 | \n", "1.7021 | \n", "1.13473 | \n", "
prev_app_RATE_INTEREST_PRIVILEGED_sum | \n", "0.835095 | \n", "2.50529 | \n", "0.835095 | \n", "7.51586 | \n", "5.01057 | \n", "
prev_app_NFLAG_INSURED_ON_APPROVAL_std | \n", "NaN | \n", "0.57735 | \n", "NaN | \n", "0 | \n", "0.547723 | \n", "
prev_app_CNT_PAYMENT_mean | \n", "24 | \n", "10 | \n", "4 | \n", "15.3333 | \n", "20.6667 | \n", "
prev_app_CNT_PAYMENT_sum | \n", "24 | \n", "30 | \n", "4 | \n", "138 | \n", "124 | \n", "
prev_app_AMT_APPLICATION_std | \n", "NaN | \n", "424162 | \n", "NaN | \n", "286175 | \n", "100586 | \n", "
prev_app_AMT_GOODS_PRICE_std | \n", "NaN | \n", "424162 | \n", "NaN | \n", "286175 | \n", "100586 | \n", "
prev_app_AMT_CREDIT_sum | \n", "179055 | \n", "1.45257e+06 | \n", "20106 | \n", "2.62526e+06 | \n", "999832 | \n", "
prev_app_DAYS_FIRST_DUE_TO_YEARS_min | \n", "1.54795 | \n", "1.96164 | \n", "2.14795 | \n", "0.413699 | \n", "0.942466 | \n", "
prev_app_AMT_DOWN_PAYMENT_min | \n", "0 | \n", "0 | \n", "4860 | \n", "0 | \n", "0 | \n", "
prev_app_NFLAG_INSURED_ON_APPROVAL_mean | \n", "0 | \n", "0.666667 | \n", "0 | \n", "0 | \n", "0.6 | \n", "
prev_app_NFLAG_INSURED_ON_APPROVAL_min | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
prev_app_DAYS_LAST_DUE_TO_YEARS_min | \n", "0.0684932 | \n", "1.46849 | \n", "1.98356 | \n", "0.413699 | \n", "0.969863 | \n", "
prev_app_DAYS_TERMINATION_TO_YEARS_min | \n", "0.0465753 | \n", "1.44384 | \n", "1.95616 | \n", "0.391781 | \n", "0.950685 | \n", "
prev_app_RATE_DOWN_PAYMENT_min | \n", "0 | \n", "0 | \n", "0.212008 | \n", "0.0516051 | \n", "0.0516051 | \n", "
prev_app_AMT_CREDIT_std | \n", "NaN | \n", "497950 | \n", "NaN | \n", "333337 | \n", "118032 | \n", "
prev_app_RATE_INTEREST_PRIMARY_mean | \n", "0.189122 | \n", "0.189122 | \n", "0.189122 | \n", "0.189122 | \n", "0.189122 | \n", "
prev_app_SK_ID_PREV | \n", "1 | \n", "3 | \n", "1 | \n", "9 | \n", "6 | \n", "
1044 rows × 5 columns
\n", "