{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os, math, subprocess\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from sklearn.model_selection import train_test_split\n", "from IPython.display import display\n", "\n", "# some settings for displaying Pandas results\n", "pd.set_option('display.width', 2000)\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.precision', 4)\n", "pd.set_option('display.max_colwidth', -1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load train and test data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('(rows, columns)', (307511, 122))\n", "First 5 rows\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRTARGETNAME_CONTRACT_TYPECODE_GENDERFLAG_OWN_CARFLAG_OWN_REALTYCNT_CHILDRENAMT_INCOME_TOTALAMT_CREDITAMT_ANNUITYAMT_GOODS_PRICENAME_TYPE_SUITENAME_INCOME_TYPENAME_EDUCATION_TYPENAME_FAMILY_STATUSNAME_HOUSING_TYPEREGION_POPULATION_RELATIVEDAYS_BIRTHDAYS_EMPLOYEDDAYS_REGISTRATIONDAYS_ID_PUBLISHOWN_CAR_AGEFLAG_MOBILFLAG_EMP_PHONEFLAG_WORK_PHONEFLAG_CONT_MOBILEFLAG_PHONEFLAG_EMAILOCCUPATION_TYPECNT_FAM_MEMBERSREGION_RATING_CLIENTREGION_RATING_CLIENT_W_CITYWEEKDAY_APPR_PROCESS_STARTHOUR_APPR_PROCESS_STARTREG_REGION_NOT_LIVE_REGIONREG_REGION_NOT_WORK_REGIONLIVE_REGION_NOT_WORK_REGIONREG_CITY_NOT_LIVE_CITYREG_CITY_NOT_WORK_CITYLIVE_CITY_NOT_WORK_CITYORGANIZATION_TYPEEXT_SOURCE_1EXT_SOURCE_2EXT_SOURCE_3APARTMENTS_AVGBASEMENTAREA_AVGYEARS_BEGINEXPLUATATION_AVGYEARS_BUILD_AVGCOMMONAREA_AVGELEVATORS_AVGENTRANCES_AVGFLOORSMAX_AVGFLOORSMIN_AVGLANDAREA_AVGLIVINGAPARTMENTS_AVGLIVINGAREA_AVGNONLIVINGAPARTMENTS_AVGNONLIVINGAREA_AVGAPARTMENTS_MODEBASEMENTAREA_MODEYEARS_BEGINEXPLUATATION_MODEYEARS_BUILD_MODECOMMONAREA_MODEELEVATORS_MODEENTRANCES_MODEFLOORSMAX_MODEFLOORSMIN_MODELANDAREA_MODELIVINGAPARTMENTS_MODELIVINGAREA_MODENONLIVINGAPARTMENTS_MODENONLIVINGAREA_MODEAPARTMENTS_MEDIBASEMENTAREA_MEDIYEARS_BEGINEXPLUATATION_MEDIYEARS_BUILD_MEDICOMMONAREA_MEDIELEVATORS_MEDIENTRANCES_MEDIFLOORSMAX_MEDIFLOORSMIN_MEDILANDAREA_MEDILIVINGAPARTMENTS_MEDILIVINGAREA_MEDINONLIVINGAPARTMENTS_MEDINONLIVINGAREA_MEDIFONDKAPREMONT_MODEHOUSETYPE_MODETOTALAREA_MODEWALLSMATERIAL_MODEEMERGENCYSTATE_MODEOBS_30_CNT_SOCIAL_CIRCLEDEF_30_CNT_SOCIAL_CIRCLEOBS_60_CNT_SOCIAL_CIRCLEDEF_60_CNT_SOCIAL_CIRCLEDAYS_LAST_PHONE_CHANGEFLAG_DOCUMENT_2FLAG_DOCUMENT_3FLAG_DOCUMENT_4FLAG_DOCUMENT_5FLAG_DOCUMENT_6FLAG_DOCUMENT_7FLAG_DOCUMENT_8FLAG_DOCUMENT_9FLAG_DOCUMENT_10FLAG_DOCUMENT_11FLAG_DOCUMENT_12FLAG_DOCUMENT_13FLAG_DOCUMENT_14FLAG_DOCUMENT_15FLAG_DOCUMENT_16FLAG_DOCUMENT_17FLAG_DOCUMENT_18FLAG_DOCUMENT_19FLAG_DOCUMENT_20FLAG_DOCUMENT_21AMT_REQ_CREDIT_BUREAU_HOURAMT_REQ_CREDIT_BUREAU_DAYAMT_REQ_CREDIT_BUREAU_WEEKAMT_REQ_CREDIT_BUREAU_MONAMT_REQ_CREDIT_BUREAU_QRTAMT_REQ_CREDIT_BUREAU_YEAR
01000021Cash loansMNY0202500.0406597.524700.5351000.0UnaccompaniedWorkingSecondary / secondary specialSingle / not marriedHouse / apartment0.0188-9461-637-3648.0-2120NaN110110Laborers1.022WEDNESDAY10000000Business Entity Type 30.08300.26290.13940.02470.03690.97220.61920.01430.000.06900.08330.12500.03690.02020.01900.00000.00000.02520.03830.97220.63410.01440.00000.06900.08330.12500.03770.0220.01980.00.00.02500.03690.97220.62430.01440.000.06900.08330.12500.03750.02050.01930.00000.00reg oper accountblock of flats0.0149Stone, brickNo2.02.02.02.0-1134.0010000000000000000000.00.00.00.00.01.0
11000030Cash loansFNN0270000.01293502.535698.51129500.0FamilyState servantHigher educationMarriedHouse / apartment0.0035-16765-1188-1186.0-291NaN110110Core staff2.011MONDAY11000000School0.31130.6222NaN0.09590.05290.98510.79600.06050.080.03450.29170.33330.01300.07730.05490.00390.00980.09240.05380.98510.80400.04970.08060.03450.29170.33330.01280.0790.05540.00.00.09680.05290.98510.79870.06080.080.03450.29170.33330.01320.07870.05580.00390.01reg oper accountblock of flats0.0714BlockNo1.00.01.00.0-828.0010000000000000000000.00.00.00.00.00.0
21000040Revolving loansMYY067500.0135000.06750.0135000.0UnaccompaniedWorkingSecondary / secondary specialSingle / not marriedHouse / apartment0.0100-19046-225-4260.0-253126.0111110Laborers1.022MONDAY9000000GovernmentNaN0.55590.7296NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.00.00.00.0-815.0000000000000000000000.00.00.00.00.00.0
31000060Cash loansFNY0135000.0312682.529686.5297000.0UnaccompaniedWorkingSecondary / secondary specialCivil marriageHouse / apartment0.0080-19005-3039-9833.0-2437NaN110100Laborers2.022WEDNESDAY17000000Business Entity Type 3NaN0.6504NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2.00.02.00.0-617.001000000000000000000NaNNaNNaNNaNNaNNaN
41000070Cash loansMNY0121500.0513000.021865.5513000.0UnaccompaniedWorkingSecondary / secondary specialSingle / not marriedHouse / apartment0.0287-19932-3038-4311.0-3458NaN110100Core staff1.022THURSDAY11000011ReligionNaN0.3227NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.00.00.00.0-1106.0000000100000000000000.00.00.00.00.00.0
\n", "
" ], "text/plain": [ " SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 \\\n", "0 100002 1 Cash loans M N Y 0 202500.0 406597.5 24700.5 351000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.0188 -9461 -637 -3648.0 -2120 NaN 1 1 0 1 1 0 Laborers 1.0 2 2 WEDNESDAY 10 0 0 0 0 0 0 Business Entity Type 3 0.0830 0.2629 0.1394 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 0.0383 0.9722 0.6341 0.0144 0.0000 0.0690 0.0833 0.1250 0.0377 0.022 0.0198 0.0 0.0 0.0250 0.0369 0.9722 0.6243 0.0144 0.00 0.0690 0.0833 0.1250 0.0375 0.0205 0.0193 0.0000 0.00 reg oper account block of flats 0.0149 Stone, brick No 2.0 2.0 2.0 2.0 -1134.0 0 1 0 0 0 0 0 0 0 0 \n", "1 100003 0 Cash loans F N N 0 270000.0 1293502.5 35698.5 1129500.0 Family State servant Higher education Married House / apartment 0.0035 -16765 -1188 -1186.0 -291 NaN 1 1 0 1 1 0 Core staff 2.0 1 1 MONDAY 11 0 0 0 0 0 0 School 0.3113 0.6222 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 0.0538 0.9851 0.8040 0.0497 0.0806 0.0345 0.2917 0.3333 0.0128 0.079 0.0554 0.0 0.0 0.0968 0.0529 0.9851 0.7987 0.0608 0.08 0.0345 0.2917 0.3333 0.0132 0.0787 0.0558 0.0039 0.01 reg oper account block of flats 0.0714 Block No 1.0 0.0 1.0 0.0 -828.0 0 1 0 0 0 0 0 0 0 0 \n", "2 100004 0 Revolving loans M Y Y 0 67500.0 135000.0 6750.0 135000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.0100 -19046 -225 -4260.0 -2531 26.0 1 1 1 1 1 0 Laborers 1.0 2 2 MONDAY 9 0 0 0 0 0 0 Government NaN 0.5559 0.7296 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -815.0 0 0 0 0 0 0 0 0 0 0 \n", "3 100006 0 Cash loans F N Y 0 135000.0 312682.5 29686.5 297000.0 Unaccompanied Working Secondary / secondary special Civil marriage House / apartment 0.0080 -19005 -3039 -9833.0 -2437 NaN 1 1 0 1 0 0 Laborers 2.0 2 2 WEDNESDAY 17 0 0 0 0 0 0 Business Entity Type 3 NaN 0.6504 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 0.0 2.0 0.0 -617.0 0 1 0 0 0 0 0 0 0 0 \n", "4 100007 0 Cash loans M N Y 0 121500.0 513000.0 21865.5 513000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.0287 -19932 -3038 -4311.0 -3458 NaN 1 1 0 1 0 0 Core staff 1.0 2 2 THURSDAY 11 0 0 0 0 1 1 Religion NaN 0.3227 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -1106.0 0 0 0 0 0 0 1 0 0 0 \n", "\n", " FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR \n", "0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0 \n", "1 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "3 0 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN \n", "4 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "('(rows, columns)', (48744, 121))\n", "First 5 rows\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRNAME_CONTRACT_TYPECODE_GENDERFLAG_OWN_CARFLAG_OWN_REALTYCNT_CHILDRENAMT_INCOME_TOTALAMT_CREDITAMT_ANNUITYAMT_GOODS_PRICENAME_TYPE_SUITENAME_INCOME_TYPENAME_EDUCATION_TYPENAME_FAMILY_STATUSNAME_HOUSING_TYPEREGION_POPULATION_RELATIVEDAYS_BIRTHDAYS_EMPLOYEDDAYS_REGISTRATIONDAYS_ID_PUBLISHOWN_CAR_AGEFLAG_MOBILFLAG_EMP_PHONEFLAG_WORK_PHONEFLAG_CONT_MOBILEFLAG_PHONEFLAG_EMAILOCCUPATION_TYPECNT_FAM_MEMBERSREGION_RATING_CLIENTREGION_RATING_CLIENT_W_CITYWEEKDAY_APPR_PROCESS_STARTHOUR_APPR_PROCESS_STARTREG_REGION_NOT_LIVE_REGIONREG_REGION_NOT_WORK_REGIONLIVE_REGION_NOT_WORK_REGIONREG_CITY_NOT_LIVE_CITYREG_CITY_NOT_WORK_CITYLIVE_CITY_NOT_WORK_CITYORGANIZATION_TYPEEXT_SOURCE_1EXT_SOURCE_2EXT_SOURCE_3APARTMENTS_AVGBASEMENTAREA_AVGYEARS_BEGINEXPLUATATION_AVGYEARS_BUILD_AVGCOMMONAREA_AVGELEVATORS_AVGENTRANCES_AVGFLOORSMAX_AVGFLOORSMIN_AVGLANDAREA_AVGLIVINGAPARTMENTS_AVGLIVINGAREA_AVGNONLIVINGAPARTMENTS_AVGNONLIVINGAREA_AVGAPARTMENTS_MODEBASEMENTAREA_MODEYEARS_BEGINEXPLUATATION_MODEYEARS_BUILD_MODECOMMONAREA_MODEELEVATORS_MODEENTRANCES_MODEFLOORSMAX_MODEFLOORSMIN_MODELANDAREA_MODELIVINGAPARTMENTS_MODELIVINGAREA_MODENONLIVINGAPARTMENTS_MODENONLIVINGAREA_MODEAPARTMENTS_MEDIBASEMENTAREA_MEDIYEARS_BEGINEXPLUATATION_MEDIYEARS_BUILD_MEDICOMMONAREA_MEDIELEVATORS_MEDIENTRANCES_MEDIFLOORSMAX_MEDIFLOORSMIN_MEDILANDAREA_MEDILIVINGAPARTMENTS_MEDILIVINGAREA_MEDINONLIVINGAPARTMENTS_MEDINONLIVINGAREA_MEDIFONDKAPREMONT_MODEHOUSETYPE_MODETOTALAREA_MODEWALLSMATERIAL_MODEEMERGENCYSTATE_MODEOBS_30_CNT_SOCIAL_CIRCLEDEF_30_CNT_SOCIAL_CIRCLEOBS_60_CNT_SOCIAL_CIRCLEDEF_60_CNT_SOCIAL_CIRCLEDAYS_LAST_PHONE_CHANGEFLAG_DOCUMENT_2FLAG_DOCUMENT_3FLAG_DOCUMENT_4FLAG_DOCUMENT_5FLAG_DOCUMENT_6FLAG_DOCUMENT_7FLAG_DOCUMENT_8FLAG_DOCUMENT_9FLAG_DOCUMENT_10FLAG_DOCUMENT_11FLAG_DOCUMENT_12FLAG_DOCUMENT_13FLAG_DOCUMENT_14FLAG_DOCUMENT_15FLAG_DOCUMENT_16FLAG_DOCUMENT_17FLAG_DOCUMENT_18FLAG_DOCUMENT_19FLAG_DOCUMENT_20FLAG_DOCUMENT_21AMT_REQ_CREDIT_BUREAU_HOURAMT_REQ_CREDIT_BUREAU_DAYAMT_REQ_CREDIT_BUREAU_WEEKAMT_REQ_CREDIT_BUREAU_MONAMT_REQ_CREDIT_BUREAU_QRTAMT_REQ_CREDIT_BUREAU_YEAR
0100001Cash loansFNY0135000.0568800.020560.5450000.0UnaccompaniedWorkingHigher educationMarriedHouse / apartment0.0188-19241-2329-5170.0-812NaN110101NaN2.022TUESDAY18000000Kindergarten0.75260.78970.15950.06600.05900.9732NaNNaNNaN0.13790.125NaNNaNNaN0.0505NaNNaN0.06720.06120.9732NaNNaNNaN0.13790.125NaNNaNNaN0.0526NaNNaN0.06660.05900.9732NaNNaNNaN0.13790.125NaNNaNNaN0.0514NaNNaNNaNblock of flats0.0392Stone, brickNo0.00.00.00.0-1740.0010000000000000000000.00.00.00.00.00.0
1100005Cash loansMNY099000.0222768.017370.0180000.0UnaccompaniedWorkingSecondary / secondary specialMarriedHouse / apartment0.0358-18064-4469-9118.0-1623NaN110100Low-skill Laborers2.022FRIDAY9000000Self-employed0.56500.29170.4330NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.00.00.00.00.0010000000000000000000.00.00.00.00.03.0
2100013Cash loansMYY0202500.0663264.069777.0630000.0NaNWorkingHigher educationMarriedHouse / apartment0.0191-20038-4458-2175.0-35035.0110100Drivers2.022MONDAY14000000Transport: type 3NaN0.69980.6110NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.00.00.00.0-856.0000000100000000000000.00.00.00.01.04.0
3100028Cash loansFNY2315000.01575000.049018.51575000.0UnaccompaniedWorkingSecondary / secondary specialMarriedHouse / apartment0.0264-13976-1866-2000.0-4208NaN110110Sales staff4.022WEDNESDAY11000000Business Entity Type 30.52570.50970.61270.30520.19740.99700.95920.11650.320.27590.3750.04170.20420.24040.36730.03860.080.31090.20490.99700.96080.11760.32220.27590.3750.04170.20890.26260.38270.03890.08470.30810.19740.99700.95970.11730.320.27590.3750.04170.20780.24460.37390.03880.0817reg oper accountblock of flats0.3700PanelNo0.00.00.00.0-1805.0010000000000000000000.00.00.00.00.03.0
4100038Cash loansMYN1180000.0625500.032067.0625500.0UnaccompaniedWorkingSecondary / secondary specialMarriedHouse / apartment0.0100-13040-2191-4000.0-426216.0111100NaN3.022FRIDAY5000011Business Entity Type 30.20210.4257NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.00.00.00.0-821.001000000000000000000NaNNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " SK_ID_CURR NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 \\\n", "0 100001 Cash loans F N Y 0 135000.0 568800.0 20560.5 450000.0 Unaccompanied Working Higher education Married House / apartment 0.0188 -19241 -2329 -5170.0 -812 NaN 1 1 0 1 0 1 NaN 2.0 2 2 TUESDAY 18 0 0 0 0 0 0 Kindergarten 0.7526 0.7897 0.1595 0.0660 0.0590 0.9732 NaN NaN NaN 0.1379 0.125 NaN NaN NaN 0.0505 NaN NaN 0.0672 0.0612 0.9732 NaN NaN NaN 0.1379 0.125 NaN NaN NaN 0.0526 NaN NaN 0.0666 0.0590 0.9732 NaN NaN NaN 0.1379 0.125 NaN NaN NaN 0.0514 NaN NaN NaN block of flats 0.0392 Stone, brick No 0.0 0.0 0.0 0.0 -1740.0 0 1 0 0 0 0 0 0 0 0 0 \n", "1 100005 Cash loans M N Y 0 99000.0 222768.0 17370.0 180000.0 Unaccompanied Working Secondary / secondary special Married House / apartment 0.0358 -18064 -4469 -9118.0 -1623 NaN 1 1 0 1 0 0 Low-skill Laborers 2.0 2 2 FRIDAY 9 0 0 0 0 0 0 Self-employed 0.5650 0.2917 0.4330 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 0.0 0 1 0 0 0 0 0 0 0 0 0 \n", "2 100013 Cash loans M Y Y 0 202500.0 663264.0 69777.0 630000.0 NaN Working Higher education Married House / apartment 0.0191 -20038 -4458 -2175.0 -3503 5.0 1 1 0 1 0 0 Drivers 2.0 2 2 MONDAY 14 0 0 0 0 0 0 Transport: type 3 NaN 0.6998 0.6110 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -856.0 0 0 0 0 0 0 1 0 0 0 0 \n", "3 100028 Cash loans F N Y 2 315000.0 1575000.0 49018.5 1575000.0 Unaccompanied Working Secondary / secondary special Married House / apartment 0.0264 -13976 -1866 -2000.0 -4208 NaN 1 1 0 1 1 0 Sales staff 4.0 2 2 WEDNESDAY 11 0 0 0 0 0 0 Business Entity Type 3 0.5257 0.5097 0.6127 0.3052 0.1974 0.9970 0.9592 0.1165 0.32 0.2759 0.375 0.0417 0.2042 0.2404 0.3673 0.0386 0.08 0.3109 0.2049 0.9970 0.9608 0.1176 0.3222 0.2759 0.375 0.0417 0.2089 0.2626 0.3827 0.0389 0.0847 0.3081 0.1974 0.9970 0.9597 0.1173 0.32 0.2759 0.375 0.0417 0.2078 0.2446 0.3739 0.0388 0.0817 reg oper account block of flats 0.3700 Panel No 0.0 0.0 0.0 0.0 -1805.0 0 1 0 0 0 0 0 0 0 0 0 \n", "4 100038 Cash loans M Y N 1 180000.0 625500.0 32067.0 625500.0 Unaccompanied Working Secondary / secondary special Married House / apartment 0.0100 -13040 -2191 -4000.0 -4262 16.0 1 1 1 1 0 0 NaN 3.0 2 2 FRIDAY 5 0 0 0 0 1 1 Business Entity Type 3 0.2021 0.4257 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -821.0 0 1 0 0 0 0 0 0 0 0 0 \n", "\n", " FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR \n", "0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 3.0 \n", "2 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 1.0 4.0 \n", "3 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 3.0 \n", "4 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# train\n", "train_path = \"home-credit-default-risk/application_train.csv\"\n", "pdf_train = pd.read_csv(train_path)\n", "print(\"(rows, columns)\", pdf_train.shape)\n", "print(\"First 5 rows\")\n", "display(pdf_train.head(5))\n", "\n", "# test\n", "test_path = \"home-credit-default-risk/application_test.csv\"\n", "pdf_test = pd.read_csv(test_path)\n", "print(\"(rows, columns)\", pdf_test.shape)\n", "print(\"First 5 rows\")\n", "display(pdf_test.head(5))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of overlap ids: 0\n" ] } ], "source": [ "# check overlap\n", "train_ids = pdf_train[\"SK_ID_CURR\"]\n", "test_ids = pdf_test[\"SK_ID_CURR\"]\n", "n_overlap = len(set(train_ids) & set(test_ids))\n", "print(\"Number of overlap ids: {}\".format(n_overlap))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train data no duplicate\n", "Test data no duplicate\n" ] } ], "source": [ "# check duplicate\n", "if len(train_ids.drop_duplicates()) == pdf_train.shape[0]:\n", " print(\"Train data no duplicate\")\n", "if len(test_ids.drop_duplicates()) == pdf_test.shape[0]:\n", " print(\"Test data no duplicate\")\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Train validation test (tvt) split" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRTARGETtvt_code
01000021train
11000030train
21000040train
31000060train
41000070train
\n", "
" ], "text/plain": [ " SK_ID_CURR TARGET tvt_code\n", "0 100002 1 train \n", "1 100003 0 train \n", "2 100004 0 train \n", "3 100006 0 train \n", "4 100007 0 train " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get ids and target label\n", "pdf_tvt = pdf_train[[\"SK_ID_CURR\", \"TARGET\"]].copy()\n", "pdf_tvt[\"tvt_code\"] = \"kaggle_test\"\n", "\n", "# target\n", "y = pdf_tvt[\"TARGET\"]\n", "\n", "# index\n", "X = pdf_tvt[\"SK_ID_CURR\"]\n", "\n", "# train, val, test set will be 70%, 15%, 15% of the dataset respectively.\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)\n", "X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.17, random_state=1)\n", "\n", "# check validity\n", "assert X.shape[0] == X_train.shape[0] + X_val.shape[0] + X_test.shape[0], \"TVT split not correct\"\n", "\n", "# assign code\n", "pdf_tvt.loc[X_train.index, \"tvt_code\"] = \"train\"\n", "pdf_tvt.loc[X_val.index, \"tvt_code\"] = \"val\"\n", "pdf_tvt.loc[X_test.index, \"tvt_code\"] = \"test\"\n", "\n", "pdf_tvt.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRTARGETtvt_code
01000021train
11000030train
21000040train
31000060train
41000070train
\n", "
" ], "text/plain": [ " SK_ID_CURR TARGET tvt_code\n", "0 100002 1 train \n", "1 100003 0 train \n", "2 100004 0 train \n", "3 100006 0 train \n", "4 100007 0 train " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# extend for kaggle test\n", "pdf_test[\"TARGET\"] = -1\n", "pdf_test[\"tvt_code\"] = \"kaggle_test\"\n", "\n", "pdf_tvt_extend = pd.concat([pdf_tvt, pdf_test[[\"SK_ID_CURR\", \"TARGET\", \"tvt_code\"]]])\n", "pdf_tvt_extend = pdf_tvt_extend.reset_index(drop=True)\n", "pdf_tvt_extend.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# check validity\n", "assert pdf_tvt_extend.shape[0] == pdf_tvt_extend[\"SK_ID_CURR\"].nunique(), \"Extend TVT overlaped\"" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cntpercentage
train21694870.5497
kaggle_test4874415.8511
test4612715.0001
val4443614.4502
\n", "
" ], "text/plain": [ " cnt percentage\n", "train 216948 70.5497 \n", "kaggle_test 48744 15.8511 \n", "test 46127 15.0001 \n", "val 44436 14.4502 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check tvt percentage\n", "pdf_tvt_check = pdf_tvt_extend[\"tvt_code\"].value_counts().to_frame(\"cnt\")\n", "pdf_tvt_check[\"percentage\"] = pdf_tvt_check[\"cnt\"] * 100.0 / pdf_tvt.shape[0]\n", "pdf_tvt_check" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# save tvt split\n", "pdf_tvt_extend.to_pickle(\"pdf_tvt_extend.pkl\", compression=\"bz2\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15" } }, "nbformat": 4, "nbformat_minor": 2 }