{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os, subprocess, pickle\n", "import pandas as pd\n", "import numpy as np\n", "from IPython.display import display\n", "# from lib_feature_engineering import *" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Combine features" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['baseline_extend.pkl.bz2',\n", " 'baseline.pkl.bz2',\n", " 'bureau_balance_1year.pkl.bz2',\n", " 'bureau_balance_2year.pkl.bz2',\n", " 'bureau_balance_gt3year.pkl.bz2',\n", " 'bureau_balance_lt1year.pkl.bz2',\n", " 'bureau_balance.pkl.bz2',\n", " 'bureau.pkl.bz2',\n", " 'credit_card_balance.pkl.bz2',\n", " 'installments_payments_gt3year.pkl.bz2',\n", " 'installments_payments_in1year.pkl.bz2',\n", " 'installments_payments_in2year.pkl.bz2',\n", " 'installments_payments.pkl.bz2',\n", " 'mean_encoding_feat_cat.pkl.bz2',\n", " 'pdf_features_label.csv.bz2',\n", " 'pos_cash_gt3year.pkl.bz2',\n", " 'pos_cash_in1year.pkl.bz2',\n", " 'pos_cash_in2year.pkl.bz2',\n", " 'pos_cash.pkl.bz2',\n", " 'prev_app.pkl.bz2']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# List what is currently stored in the features/ folder (shells out to `ls`).\n",
"subprocess.check_output([\"ls\", \"features\"]).splitlines()"
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
"# Feature files to join, in join order.\n",
"# The first entry is the base table: the combine cell loads it first and\n",
"# left-joins every later entry onto it.\n",
"# Two entries of the folder listing above are not part of this list:\n",
"#   mean_encoding_feat_cat.pkl.bz2 and pdf_features_label.csv.bz2\n",
"ls_feat_file = [\n",
"    # baseline*\n",
"    'baseline.pkl.bz2',\n",
"    'baseline_extend.pkl.bz2',\n",
"\n",
"    # bureau_balance* / bureau\n",
"    'bureau_balance_1year.pkl.bz2',\n",
"    'bureau_balance_2year.pkl.bz2',\n",
"    'bureau_balance_gt3year.pkl.bz2',\n",
"    'bureau_balance_lt1year.pkl.bz2',\n",
"    'bureau_balance.pkl.bz2',\n",
"    'bureau.pkl.bz2',\n",
"\n",
"    # credit_card_balance\n",
"    'credit_card_balance.pkl.bz2',\n",
"\n",
"    # installments_payments*\n",
"    'installments_payments_gt3year.pkl.bz2',\n",
"    'installments_payments_in1year.pkl.bz2',\n",
"    'installments_payments_in2year.pkl.bz2',\n",
"    'installments_payments.pkl.bz2',\n",
"\n",
"    # pos_cash*\n",
"    'pos_cash_gt3year.pkl.bz2',\n",
"    'pos_cash_in1year.pkl.bz2',\n",
"    'pos_cash_in2year.pkl.bz2',\n",
"    'pos_cash.pkl.bz2',\n",
"\n",
"    # prev_app\n",
"    'prev_app.pkl.bz2',\n",
"]"
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('baseline_extend.pkl.bz2', (356255, 77))\n", "('bureau_balance_1year.pkl.bz2', (123107, 35))\n", "('bureau_balance_2year.pkl.bz2', (110354, 35))\n", "('bureau_balance_gt3year.pkl.bz2', (99247, 35))\n", "('bureau_balance_lt1year.pkl.bz2', (132250, 35))\n", "('bureau_balance.pkl.bz2', (134542, 35))\n", "('bureau.pkl.bz2', (305811, 87))\n", "('credit_card_balance.pkl.bz2', (103558, 111))\n", "('installments_payments_gt3year.pkl.bz2', (209639, 35))\n", "('installments_payments_in1year.pkl.bz2', (238405, 35))\n", "('installments_payments_in2year.pkl.bz2', (171713, 35))\n", "('installments_payments.pkl.bz2', (339587, 35))\n", "('pos_cash_gt3year.pkl.bz2', (212897, 31))\n", "('pos_cash_in1year.pkl.bz2', (239502, 31))\n", "('pos_cash_in2year.pkl.bz2', (171658, 33))\n", "('pos_cash.pkl.bz2', (337252, 36))\n", "('prev_app.pkl.bz2', (338857, 244))\n", "('rows, columns', (356255, 1042))\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRNAME_INCOME_TYPE_Workingis_REGION_RATING_CLIENT_W_CITYis_REGION_RATING_CLIENTis_CODE_GENDERNAME_EDUCATION_TYPE_Higher_educationNAME_EDUCATION_TYPE_Secondary___secondary_specialis_REG_CITY_NOT_WORK_CITYis_FLAG_DOCUMENT_3HOUSETYPE_MODE_block_of_flats...prev_app_DAYS_FIRST_DUE_TO_YEARS_minprev_app_AMT_DOWN_PAYMENT_minprev_app_NFLAG_INSURED_ON_APPROVAL_meanprev_app_NFLAG_INSURED_ON_APPROVAL_minprev_app_DAYS_LAST_DUE_TO_YEARS_minprev_app_DAYS_TERMINATION_TO_YEARS_minprev_app_RATE_DOWN_PAYMENT_minprev_app_AMT_CREDIT_stdprev_app_RATE_INTEREST_PRIMARY_meanprev_app_SK_ID_PREV
0100002122101011...1.5479450.00.0000000.00.0684930.0465750.000000NaN0.1891221.0
1100003011010011...1.9616440.00.6666670.01.4684931.4438360.000000497949.8618080.1891223.0
2100004122101000...2.1479454860.00.0000000.01.9835621.9561640.212008NaN0.1891221.0
3100006122001010...0.4136990.00.0000000.00.4136990.3917810.051605333337.3548530.1891229.0
4100007122101100...0.9424660.00.6000000.00.9698630.9506850.051605118032.4095090.1891226.0
\n", "

5 rows × 1042 columns

\n", "
" ], "text/plain": [ " SK_ID_CURR NAME_INCOME_TYPE_Working is_REGION_RATING_CLIENT_W_CITY \\\n", "0 100002 1 2 \n", "1 100003 0 1 \n", "2 100004 1 2 \n", "3 100006 1 2 \n", "4 100007 1 2 \n", "\n", " is_REGION_RATING_CLIENT is_CODE_GENDER \\\n", "0 2 1 \n", "1 1 0 \n", "2 2 1 \n", "3 2 0 \n", "4 2 1 \n", "\n", " NAME_EDUCATION_TYPE_Higher_education \\\n", "0 0 \n", "1 1 \n", "2 0 \n", "3 0 \n", "4 0 \n", "\n", " NAME_EDUCATION_TYPE_Secondary___secondary_special \\\n", "0 1 \n", "1 0 \n", "2 1 \n", "3 1 \n", "4 1 \n", "\n", " is_REG_CITY_NOT_WORK_CITY is_FLAG_DOCUMENT_3 \\\n", "0 0 1 \n", "1 0 1 \n", "2 0 0 \n", "3 0 1 \n", "4 1 0 \n", "\n", " HOUSETYPE_MODE_block_of_flats ... \\\n", "0 1 ... \n", "1 1 ... \n", "2 0 ... \n", "3 0 ... \n", "4 0 ... \n", "\n", " prev_app_DAYS_FIRST_DUE_TO_YEARS_min prev_app_AMT_DOWN_PAYMENT_min \\\n", "0 1.547945 0.0 \n", "1 1.961644 0.0 \n", "2 2.147945 4860.0 \n", "3 0.413699 0.0 \n", "4 0.942466 0.0 \n", "\n", " prev_app_NFLAG_INSURED_ON_APPROVAL_mean \\\n", "0 0.000000 \n", "1 0.666667 \n", "2 0.000000 \n", "3 0.000000 \n", "4 0.600000 \n", "\n", " prev_app_NFLAG_INSURED_ON_APPROVAL_min \\\n", "0 0.0 \n", "1 0.0 \n", "2 0.0 \n", "3 0.0 \n", "4 0.0 \n", "\n", " prev_app_DAYS_LAST_DUE_TO_YEARS_min \\\n", "0 0.068493 \n", "1 1.468493 \n", "2 1.983562 \n", "3 0.413699 \n", "4 0.969863 \n", "\n", " prev_app_DAYS_TERMINATION_TO_YEARS_min prev_app_RATE_DOWN_PAYMENT_min \\\n", "0 0.046575 0.000000 \n", "1 1.443836 0.000000 \n", "2 1.956164 0.212008 \n", "3 0.391781 0.051605 \n", "4 0.950685 0.051605 \n", "\n", " prev_app_AMT_CREDIT_std prev_app_RATE_INTEREST_PRIMARY_mean \\\n", "0 NaN 0.189122 \n", "1 497949.861808 0.189122 \n", "2 NaN 0.189122 \n", "3 333337.354853 0.189122 \n", "4 118032.409509 0.189122 \n", "\n", " prev_app_SK_ID_PREV \n", "0 1.0 \n", "1 3.0 \n", "2 1.0 \n", "3 9.0 \n", "4 6.0 \n", "\n", "[5 rows x 1042 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": 
[ "CPU times: user 2min 2s, sys: 8.96 s, total: 2min 11s\n", "Wall time: 54.4 s\n" ] } ], "source": [
"%%time\n",
"feat_dir = \"features\"\n",
"key_col = \"SK_ID_CURR\"\n",
"\n",
"# The first file in ls_feat_file is the base table: its columns keep their\n",
"# original names and every other feature set is left-joined onto it.\n",
"feat_path = os.path.join(feat_dir, ls_feat_file[0])\n",
"pdf_combined = pd.read_pickle(feat_path, compression=\"bz2\")\n",
"n_base_rows = len(pdf_combined)\n",
"\n",
"# Join the remaining feature sets one by one.\n",
"for fname in ls_feat_file[1:]:\n",
"    feat_path = os.path.join(feat_dir, fname)\n",
"    pdf_feat = pd.read_pickle(feat_path, compression=\"bz2\")\n",
"    print(fname, pdf_feat.shape)\n",
"\n",
"    # Prefix each non-key column with the file stem (text before the first '.')\n",
"    # to keep column names coming from different files apart.\n",
"    tbl_prefix = fname.split(\".\")[0]\n",
"    rename_col = {cname: \"{}_{}\".format(tbl_prefix, cname)\n",
"                  for cname in pdf_feat.columns if cname != key_col}\n",
"    pdf_feat = pdf_feat.rename(columns=rename_col)\n",
"\n",
"    pdf_combined = pdf_combined.merge(pdf_feat, on=key_col, how=\"left\")\n",
"\n",
"    # A left join only gains rows when the right-hand table repeats a key, which\n",
"    # would silently duplicate rows of the base table. Stop here if that happens.\n",
"    assert len(pdf_combined) == n_base_rows, (\n",
"        \"{} repeats SK_ID_CURR values: the join changed the row count from {} to {}\"\n",
"        .format(fname, n_base_rows, len(pdf_combined)))\n",
"\n",
"print(\"rows, columns\", pdf_combined.shape)\n",
"ls_features = [feat for feat in pdf_combined.columns if feat != key_col]\n",
"display(pdf_combined.head())"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [
"%%time\n",
"# Optional step, switched off by the `if False:` guard: keep only the features\n",
"# whose \"auc\" in the feature_evaluate() result is above a threshold.\n",
"# NOTE(review): feature_evaluate is not defined in this notebook; it presumably\n",
"# comes from lib_feature_engineering, whose import is commented out in the\n",
"# first cell -- confirm and restore that import before enabling this cell.\n",
"if False:\n",
"    def filter_feat_low_auc(pdf_label, pdf_input, threshold=0.501):\n",
"        \"\"\"Return the names of the features whose evaluated auc exceeds `threshold`.\n",
"\n",
"        pdf_label and pdf_input are passed straight to feature_evaluate(); its\n",
"        result is queried on an `auc` column and read from a `name` column.\n",
"        \"\"\"\n",
"        pdf_eval = feature_evaluate(pdf_label, pdf_input)\n",
"        ls_filtered_feat = pdf_eval.query(\"auc > {}\".format(threshold))[\"name\"].tolist()\n",
"        return ls_filtered_feat\n",
"\n",
"\n",
"    # load train data\n",
"    data_path = \"home-credit-default-risk/application_train.csv\"\n",
"    pdf_train = pd.read_csv(data_path)\n",
"\n",
"    # Keep the rows whose tvt_code is 'train' and whose SK_ID_CURR also appears\n",
"    # in application_train.csv (inner join), then drop the tvt_code column.\n",
"    pdf_tvt_extend = pd.read_pickle(\"pdf_tvt_extend.pkl\", compression=\"bz2\")\n",
"    pdf_train_filtered = (pdf_tvt_extend.query(\"tvt_code == 'train'\")\n",
"                          .merge(pdf_train[[\"SK_ID_CURR\"]], on=\"SK_ID_CURR\")\n",
"                          .drop(columns=[\"tvt_code\"]))\n",
"\n",
"    # Overwrites pdf_combined with the key column plus the surviving features.\n",
"    ls_filtered_feat = filter_feat_low_auc(pdf_train_filtered, pdf_combined, threshold=0.501)\n",
"    pdf_combined = pdf_combined[[\"SK_ID_CURR\"] + ls_filtered_feat]\n",
"    print(\"After filtered: {}\".format(pdf_combined.shape))"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# join with label" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(356255, 3)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRTARGETtvt_code
01000021train
11000030train
21000040train
31000060train
41000070train
\n", "
" ], "text/plain": [ " SK_ID_CURR TARGET tvt_code\n", "0 100002 1 train\n", "1 100003 0 train\n", "2 100004 0 train\n", "3 100006 0 train\n", "4 100007 0 train" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# Label table; the preview below shows its columns: SK_ID_CURR, TARGET, tvt_code.\n",
"# compression is passed explicitly because the \".pkl\" file name carries no\n",
"# compression suffix that pandas could infer it from.\n",
"tvt_path = \"pdf_tvt_extend.pkl\"\n",
"pdf_tvt = pd.read_pickle(tvt_path, compression=\"bz2\")\n",
"print(pdf_tvt.shape)\n",
"display(pdf_tvt.head())"
] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "train 216948\n", "kaggle_test 48744\n", "test 46127\n", "val 44436\n", "Name: tvt_code, dtype: int64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Row count per tvt_code value.\n",
"pdf_tvt[\"tvt_code\"].value_counts()"
] }, { "cell_type": "code", "execution_count": 14, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(356255, 1044)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
SK_ID_CURR100002100003100004100006100007
TARGET10000
tvt_codetraintraintraintraintrain
NAME_INCOME_TYPE_Working10111
is_REGION_RATING_CLIENT_W_CITY21222
is_REGION_RATING_CLIENT21222
is_CODE_GENDER10101
NAME_EDUCATION_TYPE_Higher_education01000
NAME_EDUCATION_TYPE_Secondary___secondary_special10111
is_REG_CITY_NOT_WORK_CITY00001
is_FLAG_DOCUMENT_311010
HOUSETYPE_MODE_block_of_flats11000
NAME_INCOME_TYPE_Pensioner00000
ORGANIZATION_TYPE_XNA00000
is_FLAG_EMP_PHONE11111
OCCUPATION_TYPE_Laborers10110
WALLSMATERIAL_MODE_Panel00000
is_LIVE_CITY_NOT_WORK_CITY00001
NAME_FAMILY_STATUS_Married01000
is_FLAG_WORK_PHONE00100
is_FLAG_PHONE11100
is_FLAG_OWN_CAR00100
ORGANIZATION_TYPE_Self_employed00000
ORGANIZATION_TYPE_Business_Entity_Type_310010
NAME_FAMILY_STATUS_Single___not_married10101
FONDKAPREMONT_MODE_reg_oper_account11000
is_NAME_CONTRACT_TYPE11011
NAME_HOUSING_TYPE_House___apartment11111
is_FLAG_DOCUMENT_600000
OCCUPATION_TYPE_Drivers00000
..................
prev_app_DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_stdNaN2.34238NaN2.153342.55569
prev_app_AMT_APPLICATION_max17905590000024282688500247500
prev_app_AMT_GOODS_PRICE_max17905590000024282688500247500
prev_app_AMT_ANNUITY_stdNaN46332.6NaN15995.28063.59
prev_app_RATE_DOWN_PAYMENT_sum00.1516660.2120080.688060.525453
prev_app_NFLAG_INSURED_ON_APPROVAL_sum02003
prev_app_DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_min-0.3424661.057531.90137-3.44932-0.947945
prev_app_RATE_INTEREST_PRIVILEGED_mean0.8350950.8350950.8350950.8350950.835095
prev_app_AMT_APPLICATION_sum1790551.30631e+06242822.44983e+06903182
prev_app_AMT_GOODS_PRICE_sum1790551.30631e+06242822.44983e+06903182
prev_app_AMT_CREDIT_max1790551.03588e+0620106906615284400
prev_app_NFLAG_INSURED_ON_APPROVAL_max01001
prev_app_RATE_INTEREST_PRIMARY_sum0.1891220.5673670.1891221.70211.13473
prev_app_RATE_INTEREST_PRIVILEGED_sum0.8350952.505290.8350957.515865.01057
prev_app_NFLAG_INSURED_ON_APPROVAL_stdNaN0.57735NaN00.547723
prev_app_CNT_PAYMENT_mean2410415.333320.6667
prev_app_CNT_PAYMENT_sum24304138124
prev_app_AMT_APPLICATION_stdNaN424162NaN286175100586
prev_app_AMT_GOODS_PRICE_stdNaN424162NaN286175100586
prev_app_AMT_CREDIT_sum1790551.45257e+06201062.62526e+06999832
prev_app_DAYS_FIRST_DUE_TO_YEARS_min1.547951.961642.147950.4136990.942466
prev_app_AMT_DOWN_PAYMENT_min00486000
prev_app_NFLAG_INSURED_ON_APPROVAL_mean00.666667000.6
prev_app_NFLAG_INSURED_ON_APPROVAL_min00000
prev_app_DAYS_LAST_DUE_TO_YEARS_min0.06849321.468491.983560.4136990.969863
prev_app_DAYS_TERMINATION_TO_YEARS_min0.04657531.443841.956160.3917810.950685
prev_app_RATE_DOWN_PAYMENT_min000.2120080.05160510.0516051
prev_app_AMT_CREDIT_stdNaN497950NaN333337118032
prev_app_RATE_INTEREST_PRIMARY_mean0.1891220.1891220.1891220.1891220.189122
prev_app_SK_ID_PREV13196
\n", "

1044 rows × 5 columns

\n", "
" ], "text/plain": [ " 0 1 \\\n", "SK_ID_CURR 100002 100003 \n", "TARGET 1 0 \n", "tvt_code train train \n", "NAME_INCOME_TYPE_Working 1 0 \n", "is_REGION_RATING_CLIENT_W_CITY 2 1 \n", "is_REGION_RATING_CLIENT 2 1 \n", "is_CODE_GENDER 1 0 \n", "NAME_EDUCATION_TYPE_Higher_education 0 1 \n", "NAME_EDUCATION_TYPE_Secondary___secondary_special 1 0 \n", "is_REG_CITY_NOT_WORK_CITY 0 0 \n", "is_FLAG_DOCUMENT_3 1 1 \n", "HOUSETYPE_MODE_block_of_flats 1 1 \n", "NAME_INCOME_TYPE_Pensioner 0 0 \n", "ORGANIZATION_TYPE_XNA 0 0 \n", "is_FLAG_EMP_PHONE 1 1 \n", "OCCUPATION_TYPE_Laborers 1 0 \n", "WALLSMATERIAL_MODE_Panel 0 0 \n", "is_LIVE_CITY_NOT_WORK_CITY 0 0 \n", "NAME_FAMILY_STATUS_Married 0 1 \n", "is_FLAG_WORK_PHONE 0 0 \n", "is_FLAG_PHONE 1 1 \n", "is_FLAG_OWN_CAR 0 0 \n", "ORGANIZATION_TYPE_Self_employed 0 0 \n", "ORGANIZATION_TYPE_Business_Entity_Type_3 1 0 \n", "NAME_FAMILY_STATUS_Single___not_married 1 0 \n", "FONDKAPREMONT_MODE_reg_oper_account 1 1 \n", "is_NAME_CONTRACT_TYPE 1 1 \n", "NAME_HOUSING_TYPE_House___apartment 1 1 \n", "is_FLAG_DOCUMENT_6 0 0 \n", "OCCUPATION_TYPE_Drivers 0 0 \n", "... ... ... 
\n", "prev_app_DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_std NaN 2.34238 \n", "prev_app_AMT_APPLICATION_max 179055 900000 \n", "prev_app_AMT_GOODS_PRICE_max 179055 900000 \n", "prev_app_AMT_ANNUITY_std NaN 46332.6 \n", "prev_app_RATE_DOWN_PAYMENT_sum 0 0.151666 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_sum 0 2 \n", "prev_app_DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_min -0.342466 1.05753 \n", "prev_app_RATE_INTEREST_PRIVILEGED_mean 0.835095 0.835095 \n", "prev_app_AMT_APPLICATION_sum 179055 1.30631e+06 \n", "prev_app_AMT_GOODS_PRICE_sum 179055 1.30631e+06 \n", "prev_app_AMT_CREDIT_max 179055 1.03588e+06 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_max 0 1 \n", "prev_app_RATE_INTEREST_PRIMARY_sum 0.189122 0.567367 \n", "prev_app_RATE_INTEREST_PRIVILEGED_sum 0.835095 2.50529 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_std NaN 0.57735 \n", "prev_app_CNT_PAYMENT_mean 24 10 \n", "prev_app_CNT_PAYMENT_sum 24 30 \n", "prev_app_AMT_APPLICATION_std NaN 424162 \n", "prev_app_AMT_GOODS_PRICE_std NaN 424162 \n", "prev_app_AMT_CREDIT_sum 179055 1.45257e+06 \n", "prev_app_DAYS_FIRST_DUE_TO_YEARS_min 1.54795 1.96164 \n", "prev_app_AMT_DOWN_PAYMENT_min 0 0 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_mean 0 0.666667 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_min 0 0 \n", "prev_app_DAYS_LAST_DUE_TO_YEARS_min 0.0684932 1.46849 \n", "prev_app_DAYS_TERMINATION_TO_YEARS_min 0.0465753 1.44384 \n", "prev_app_RATE_DOWN_PAYMENT_min 0 0 \n", "prev_app_AMT_CREDIT_std NaN 497950 \n", "prev_app_RATE_INTEREST_PRIMARY_mean 0.189122 0.189122 \n", "prev_app_SK_ID_PREV 1 3 \n", "\n", " 2 3 \\\n", "SK_ID_CURR 100004 100006 \n", "TARGET 0 0 \n", "tvt_code train train \n", "NAME_INCOME_TYPE_Working 1 1 \n", "is_REGION_RATING_CLIENT_W_CITY 2 2 \n", "is_REGION_RATING_CLIENT 2 2 \n", "is_CODE_GENDER 1 0 \n", "NAME_EDUCATION_TYPE_Higher_education 0 0 \n", "NAME_EDUCATION_TYPE_Secondary___secondary_special 1 1 \n", "is_REG_CITY_NOT_WORK_CITY 0 0 \n", "is_FLAG_DOCUMENT_3 0 1 \n", "HOUSETYPE_MODE_block_of_flats 0 0 \n", 
"NAME_INCOME_TYPE_Pensioner 0 0 \n", "ORGANIZATION_TYPE_XNA 0 0 \n", "is_FLAG_EMP_PHONE 1 1 \n", "OCCUPATION_TYPE_Laborers 1 1 \n", "WALLSMATERIAL_MODE_Panel 0 0 \n", "is_LIVE_CITY_NOT_WORK_CITY 0 0 \n", "NAME_FAMILY_STATUS_Married 0 0 \n", "is_FLAG_WORK_PHONE 1 0 \n", "is_FLAG_PHONE 1 0 \n", "is_FLAG_OWN_CAR 1 0 \n", "ORGANIZATION_TYPE_Self_employed 0 0 \n", "ORGANIZATION_TYPE_Business_Entity_Type_3 0 1 \n", "NAME_FAMILY_STATUS_Single___not_married 1 0 \n", "FONDKAPREMONT_MODE_reg_oper_account 0 0 \n", "is_NAME_CONTRACT_TYPE 0 1 \n", "NAME_HOUSING_TYPE_House___apartment 1 1 \n", "is_FLAG_DOCUMENT_6 0 0 \n", "OCCUPATION_TYPE_Drivers 0 0 \n", "... ... ... \n", "prev_app_DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_std NaN 2.15334 \n", "prev_app_AMT_APPLICATION_max 24282 688500 \n", "prev_app_AMT_GOODS_PRICE_max 24282 688500 \n", "prev_app_AMT_ANNUITY_std NaN 15995.2 \n", "prev_app_RATE_DOWN_PAYMENT_sum 0.212008 0.68806 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_sum 0 0 \n", "prev_app_DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_min 1.90137 -3.44932 \n", "prev_app_RATE_INTEREST_PRIVILEGED_mean 0.835095 0.835095 \n", "prev_app_AMT_APPLICATION_sum 24282 2.44983e+06 \n", "prev_app_AMT_GOODS_PRICE_sum 24282 2.44983e+06 \n", "prev_app_AMT_CREDIT_max 20106 906615 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_max 0 0 \n", "prev_app_RATE_INTEREST_PRIMARY_sum 0.189122 1.7021 \n", "prev_app_RATE_INTEREST_PRIVILEGED_sum 0.835095 7.51586 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_std NaN 0 \n", "prev_app_CNT_PAYMENT_mean 4 15.3333 \n", "prev_app_CNT_PAYMENT_sum 4 138 \n", "prev_app_AMT_APPLICATION_std NaN 286175 \n", "prev_app_AMT_GOODS_PRICE_std NaN 286175 \n", "prev_app_AMT_CREDIT_sum 20106 2.62526e+06 \n", "prev_app_DAYS_FIRST_DUE_TO_YEARS_min 2.14795 0.413699 \n", "prev_app_AMT_DOWN_PAYMENT_min 4860 0 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_mean 0 0 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_min 0 0 \n", "prev_app_DAYS_LAST_DUE_TO_YEARS_min 1.98356 0.413699 \n", "prev_app_DAYS_TERMINATION_TO_YEARS_min 
1.95616 0.391781 \n", "prev_app_RATE_DOWN_PAYMENT_min 0.212008 0.0516051 \n", "prev_app_AMT_CREDIT_std NaN 333337 \n", "prev_app_RATE_INTEREST_PRIMARY_mean 0.189122 0.189122 \n", "prev_app_SK_ID_PREV 1 9 \n", "\n", " 4 \n", "SK_ID_CURR 100007 \n", "TARGET 0 \n", "tvt_code train \n", "NAME_INCOME_TYPE_Working 1 \n", "is_REGION_RATING_CLIENT_W_CITY 2 \n", "is_REGION_RATING_CLIENT 2 \n", "is_CODE_GENDER 1 \n", "NAME_EDUCATION_TYPE_Higher_education 0 \n", "NAME_EDUCATION_TYPE_Secondary___secondary_special 1 \n", "is_REG_CITY_NOT_WORK_CITY 1 \n", "is_FLAG_DOCUMENT_3 0 \n", "HOUSETYPE_MODE_block_of_flats 0 \n", "NAME_INCOME_TYPE_Pensioner 0 \n", "ORGANIZATION_TYPE_XNA 0 \n", "is_FLAG_EMP_PHONE 1 \n", "OCCUPATION_TYPE_Laborers 0 \n", "WALLSMATERIAL_MODE_Panel 0 \n", "is_LIVE_CITY_NOT_WORK_CITY 1 \n", "NAME_FAMILY_STATUS_Married 0 \n", "is_FLAG_WORK_PHONE 0 \n", "is_FLAG_PHONE 0 \n", "is_FLAG_OWN_CAR 0 \n", "ORGANIZATION_TYPE_Self_employed 0 \n", "ORGANIZATION_TYPE_Business_Entity_Type_3 0 \n", "NAME_FAMILY_STATUS_Single___not_married 1 \n", "FONDKAPREMONT_MODE_reg_oper_account 0 \n", "is_NAME_CONTRACT_TYPE 1 \n", "NAME_HOUSING_TYPE_House___apartment 1 \n", "is_FLAG_DOCUMENT_6 0 \n", "OCCUPATION_TYPE_Drivers 0 \n", "... ... 
\n", "prev_app_DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_std 2.55569 \n", "prev_app_AMT_APPLICATION_max 247500 \n", "prev_app_AMT_GOODS_PRICE_max 247500 \n", "prev_app_AMT_ANNUITY_std 8063.59 \n", "prev_app_RATE_DOWN_PAYMENT_sum 0.525453 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_sum 3 \n", "prev_app_DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_min -0.947945 \n", "prev_app_RATE_INTEREST_PRIVILEGED_mean 0.835095 \n", "prev_app_AMT_APPLICATION_sum 903182 \n", "prev_app_AMT_GOODS_PRICE_sum 903182 \n", "prev_app_AMT_CREDIT_max 284400 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_max 1 \n", "prev_app_RATE_INTEREST_PRIMARY_sum 1.13473 \n", "prev_app_RATE_INTEREST_PRIVILEGED_sum 5.01057 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_std 0.547723 \n", "prev_app_CNT_PAYMENT_mean 20.6667 \n", "prev_app_CNT_PAYMENT_sum 124 \n", "prev_app_AMT_APPLICATION_std 100586 \n", "prev_app_AMT_GOODS_PRICE_std 100586 \n", "prev_app_AMT_CREDIT_sum 999832 \n", "prev_app_DAYS_FIRST_DUE_TO_YEARS_min 0.942466 \n", "prev_app_AMT_DOWN_PAYMENT_min 0 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_mean 0.6 \n", "prev_app_NFLAG_INSURED_ON_APPROVAL_min 0 \n", "prev_app_DAYS_LAST_DUE_TO_YEARS_min 0.969863 \n", "prev_app_DAYS_TERMINATION_TO_YEARS_min 0.950685 \n", "prev_app_RATE_DOWN_PAYMENT_min 0.0516051 \n", "prev_app_AMT_CREDIT_std 118032 \n", "prev_app_RATE_INTEREST_PRIMARY_mean 0.189122 \n", "prev_app_SK_ID_PREV 6 \n", "\n", "[1044 rows x 5 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# Attach the combined features to the label table. pdf_tvt is the left side of\n",
"# a left join, so every row of pdf_tvt is kept; an SK_ID_CURR that is missing\n",
"# from pdf_combined ends up with NaN in all feature columns.\n",
"pdf_features_label = pdf_tvt.merge(pdf_combined, on=\"SK_ID_CURR\", how=\"left\")\n",
"\n",
"# A repeated SK_ID_CURR in pdf_combined would multiply label rows without any\n",
"# error from merge(); fail loudly instead of saving a table with duplicates.\n",
"assert len(pdf_features_label) == len(pdf_tvt), (\n",
"    \"label join changed the row count from {} to {}\"\n",
"    .format(len(pdf_tvt), len(pdf_features_label)))\n",
"\n",
"print(pdf_features_label.shape)\n",
"# Transposed preview: one line per column, one column per previewed row.\n",
"display(pdf_features_label.head().T)"
] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 4min 57s, sys: 6.65 s, total: 5min 4s\n", "Wall time: 5min 6s\n" ] } ], "source": [
"%%time\n",
"# Save the labelled feature table as a bz2-compressed CSV in the features/ folder.\n",
"# to_csv runs with its default index=True, so the row index is written as an\n",
"# unnamed first column; code that reads this file has to account for that.\n",
"# Disabled alternative, kept for reference -- the same table as a bz2 pickle:\n",
"# pdf_features_label.to_pickle(os.path.join(\"features\", \"pdf_features_label.pkl.bz2\"), compression=\"bz2\")\n",
"pdf_features_label.to_csv(os.path.join(\"features\", \"pdf_features_label.csv.bz2\"), compression=\"bz2\")"
] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15" } }, "nbformat": 4, "nbformat_minor": 2 }