{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Full width\n", "from IPython.core.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import math\n", "import os\n", "import subprocess\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "from IPython.display import display\n", "\n", "# \n", "from lib_modeling import *\n", "from lib_feature_engineering import *\n", "\n", "from sklearn.preprocessing import Imputer\n", "\n", "# some settings for displaying Pandas results\n", "pd.set_option('display.width', 2000)\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.precision', 4)\n", "pd.set_option('display.max_colwidth', -1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load data\n", "\n", "- Load train + tvt = train_filtered for features evaluation\n", "- Load train/test for applying mean encoding" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRTARGET
01000021
11000030
21000040
31000060
41000070
\n", "
" ], "text/plain": [ " SK_ID_CURR TARGET\n", "0 100002 1 \n", "1 100003 0 \n", "2 100004 0 \n", "3 100006 0 \n", "4 100007 0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load train/test data\n", "data_path = \"home-credit-default-risk/application_train.csv\"\n", "pdf_train = pd.read_csv(data_path)\n", "\n", "data_path = \"home-credit-default-risk/application_test.csv\"\n", "pdf_test = pd.read_csv(data_path)\n", "\n", "# filter by tvt code\n", "pdf_tvt_extend = pd.read_pickle(\"pdf_tvt_extend.pkl\", compression=\"bz2\")\n", "pdf_train_filtered = (pdf_tvt_extend.query(\"tvt_code == 'train'\")\n", " .merge(pdf_train[[\"SK_ID_CURR\"]], on=\"SK_ID_CURR\")\n", " .drop(columns=[\"tvt_code\"]))\n", "pdf_train_filtered.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1670214, 37)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_PREVSK_ID_CURRNAME_CONTRACT_TYPEAMT_ANNUITYAMT_APPLICATIONAMT_CREDITAMT_DOWN_PAYMENTAMT_GOODS_PRICEWEEKDAY_APPR_PROCESS_STARTHOUR_APPR_PROCESS_STARTFLAG_LAST_APPL_PER_CONTRACTNFLAG_LAST_APPL_IN_DAYRATE_DOWN_PAYMENTRATE_INTEREST_PRIMARYRATE_INTEREST_PRIVILEGEDNAME_CASH_LOAN_PURPOSENAME_CONTRACT_STATUSDAYS_DECISIONNAME_PAYMENT_TYPECODE_REJECT_REASONNAME_TYPE_SUITENAME_CLIENT_TYPENAME_GOODS_CATEGORYNAME_PORTFOLIONAME_PRODUCT_TYPECHANNEL_TYPESELLERPLACE_AREANAME_SELLER_INDUSTRYCNT_PAYMENTNAME_YIELD_GROUPPRODUCT_COMBINATIONDAYS_FIRST_DRAWINGDAYS_FIRST_DUEDAYS_LAST_DUE_1ST_VERSIONDAYS_LAST_DUEDAYS_TERMINATIONNFLAG_INSURED_ON_APPROVAL
02030495271877Consumer loans1730.43017145.017145.00.017145.0SATURDAY15Y10.00.18280.8673XAPApproved-73Cash through the bankXAPNaNRepeaterMobilePOSXNACountry-wide35Connectivity12.0middlePOS mobile with interest365243.0-42.0300.0-42.0-37.00.0
12802425108129Cash loans25188.615607500.0679671.0NaN607500.0THURSDAY11Y1NaNNaNNaNXNAApproved-164XNAXAPUnaccompaniedRepeaterXNACashx-sellContact center-1XNA36.0low_actionCash X-Sell: low365243.0-134.0916.0365243.0365243.01.0
22523466122040Cash loans15060.735112500.0136444.5NaN112500.0TUESDAY11Y1NaNNaNNaNXNAApproved-301Cash through the bankXAPSpouse, partnerRepeaterXNACashx-sellCredit and cash offices-1XNA12.0highCash X-Sell: high365243.0-271.059.0365243.0365243.01.0
32819243176158Cash loans47041.335450000.0470790.0NaN450000.0MONDAY7Y1NaNNaNNaNXNAApproved-512Cash through the bankXAPNaNRepeaterXNACashx-sellCredit and cash offices-1XNA12.0middleCash X-Sell: middle365243.0-482.0-152.0-182.0-177.01.0
41784265202054Cash loans31924.395337500.0404055.0NaN337500.0THURSDAY9Y1NaNNaNNaNRepairsRefused-781Cash through the bankHCNaNRepeaterXNACashwalk-inCredit and cash offices-1XNA24.0highCash Street: highNaNNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " SK_ID_PREV SK_ID_CURR NAME_CONTRACT_TYPE AMT_ANNUITY AMT_APPLICATION AMT_CREDIT AMT_DOWN_PAYMENT AMT_GOODS_PRICE WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START FLAG_LAST_APPL_PER_CONTRACT NFLAG_LAST_APPL_IN_DAY RATE_DOWN_PAYMENT RATE_INTEREST_PRIMARY RATE_INTEREST_PRIVILEGED NAME_CASH_LOAN_PURPOSE NAME_CONTRACT_STATUS DAYS_DECISION NAME_PAYMENT_TYPE CODE_REJECT_REASON NAME_TYPE_SUITE NAME_CLIENT_TYPE NAME_GOODS_CATEGORY NAME_PORTFOLIO NAME_PRODUCT_TYPE CHANNEL_TYPE SELLERPLACE_AREA NAME_SELLER_INDUSTRY CNT_PAYMENT NAME_YIELD_GROUP PRODUCT_COMBINATION DAYS_FIRST_DRAWING DAYS_FIRST_DUE DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE DAYS_TERMINATION NFLAG_INSURED_ON_APPROVAL\n", "0 2030495 271877 Consumer loans 1730.430 17145.0 17145.0 0.0 17145.0 SATURDAY 15 Y 1 0.0 0.1828 0.8673 XAP Approved -73 Cash through the bank XAP NaN Repeater Mobile POS XNA Country-wide 35 Connectivity 12.0 middle POS mobile with interest 365243.0 -42.0 300.0 -42.0 -37.0 0.0 \n", "1 2802425 108129 Cash loans 25188.615 607500.0 679671.0 NaN 607500.0 THURSDAY 11 Y 1 NaN NaN NaN XNA Approved -164 XNA XAP Unaccompanied Repeater XNA Cash x-sell Contact center -1 XNA 36.0 low_action Cash X-Sell: low 365243.0 -134.0 916.0 365243.0 365243.0 1.0 \n", "2 2523466 122040 Cash loans 15060.735 112500.0 136444.5 NaN 112500.0 TUESDAY 11 Y 1 NaN NaN NaN XNA Approved -301 Cash through the bank XAP Spouse, partner Repeater XNA Cash x-sell Credit and cash offices -1 XNA 12.0 high Cash X-Sell: high 365243.0 -271.0 59.0 365243.0 365243.0 1.0 \n", "3 2819243 176158 Cash loans 47041.335 450000.0 470790.0 NaN 450000.0 MONDAY 7 Y 1 NaN NaN NaN XNA Approved -512 Cash through the bank XAP NaN Repeater XNA Cash x-sell Credit and cash offices -1 XNA 12.0 middle Cash X-Sell: middle 365243.0 -482.0 -152.0 -182.0 -177.0 1.0 \n", "4 1784265 202054 Cash loans 31924.395 337500.0 404055.0 NaN 337500.0 THURSDAY 9 Y 1 NaN NaN NaN Repairs Refused -781 Cash through the bank HC NaN Repeater XNA Cash walk-in Credit and cash offices -1 XNA 24.0 high Cash Street: high NaN NaN NaN NaN NaN NaN " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load previous application\n", "data_path = \"home-credit-default-risk/previous_application.csv\"\n", "pdf_prev_app = pd.read_csv(data_path)\n", "print(pdf_prev_app.shape)\n", "pdf_prev_app.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# load meta data\n", "meta_path = \"../02_pandas/reports/report_previous_application.csv\"\n", "pdf_meta = pd.read_csv(meta_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Preprocessing data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## NAME_CONTRACT_TYPE" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Cash loans 747553\n", "Consumer loans 729151\n", "Revolving loans 193164\n", "XNA 346 \n", "Name: NAME_CONTRACT_TYPE, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app[\"NAME_CONTRACT_TYPE\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Cash loans 747899\n", "Consumer loans 729151\n", "Revolving loans 193164\n", "Name: NAME_CONTRACT_TYPE, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# số lượng XNA khá nhỏ, có 346 records. Replace thành Cash loans\n", "pdf_prev_app.loc[pdf_prev_app[\"NAME_CONTRACT_TYPE\"] == \"XNA\", \"NAME_CONTRACT_TYPE\"] = \"Cash loans\"\n", "pdf_prev_app[\"NAME_CONTRACT_TYPE\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## AMT_ANNUITY" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.22286665062081865" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app[\"AMT_ANNUITY\"].isna().mean()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Canceled 305805\n", "Refused 40898 \n", "Unused offer 25524 \n", "Approved 8 \n", "Name: NAME_CONTRACT_STATUS, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app[pdf_prev_app[\"AMT_ANNUITY\"].isna()][\"NAME_CONTRACT_STATUS\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# AMT_ANNUITY bị null khi tình trạng của HĐ là hủy. \n", "# chỉ một số ít AMT_ANNUITY là null nhưng tình trạng của HĐ là approved.\n", "# Fill giá trị này là 0\n", "pdf_prev_app.loc[pdf_prev_app[\"AMT_ANNUITY\"].isna(), \"AMT_ANNUITY\"] = 0\n", "pdf_prev_app[\"AMT_ANNUITY\"].isna().mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## AMT_DOWN_PAYMENT\n", "\n", "KH phải trả số tiền ban đầu là bao nhiêu." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.536364801157217" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app[\"AMT_DOWN_PAYMENT\"].isna().mean()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mặc định KH không trả giá trị ban đầu mà vay toàn bộ. Fill 0 cho các giá trị null này\n", "pdf_prev_app.loc[pdf_prev_app[\"AMT_DOWN_PAYMENT\"].isna(), \"AMT_DOWN_PAYMENT\"] = 0\n", "pdf_prev_app[\"AMT_DOWN_PAYMENT\"].isna().mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## AMT_GOODS_PRICE\n", "\n", "Số tiền của sản phẩm mà khách hàng muốn vay trả góp" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.23081772754868538" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app[\"AMT_GOODS_PRICE\"].isna().mean()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8664747156401178" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Có rất nhiều giá trị 0 tại cột AMT_CREDIT khi AMT_GOODS_PRICE là Null\n", "series_check = pdf_prev_app[pdf_prev_app[\"AMT_GOODS_PRICE\"].isna()][\"AMT_CREDIT\"]\n", "series_check[series_check == 0.0].count() * 1.0 / series_check.shape[0]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Approved 334039\n", "Name: NAME_CONTRACT_STATUS, dtype: int64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# giải ngân là 0, tình trạng của HĐ là approved\n", "pdf_prev_app.loc[series_check[series_check == 0.0]][\"NAME_CONTRACT_STATUS\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "XAP 334039\n", "Name: NAME_CASH_LOAN_PURPOSE, dtype: int64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# giải ngân là 0, mục đích vay là XAP\n", "pdf_prev_app.loc[series_check[series_check == 0.0]][\"NAME_CASH_LOAN_PURPOSE\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fill 0 cho các giá trị AMT_GOODS_PRICE là 0\n", "pdf_prev_app.loc[pdf_prev_app[\"AMT_GOODS_PRICE\"].isna(), \"AMT_GOODS_PRICE\"] = 0\n", "pdf_prev_app[\"AMT_GOODS_PRICE\"].isna().mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## RATE_DOWN_PAYMENT, RATE_INTEREST_PRIMARY, RATE_INTEREST_PRIVILEGED\n", "\n", "Lãi suất của khoản vay" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.536364801157217" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app[\"RATE_DOWN_PAYMENT\"].isna().mean()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9964369835242669" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app[\"RATE_INTEREST_PRIMARY\"].isna().mean()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9964369835242669" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app[\"RATE_INTEREST_PRIVILEGED\"].isna().mean()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Fill median với giả định ngân hàng đa phần cho vay với tỉ lệ này nên hiếm khi điền thông tin cho trường này\n", "imputer = Imputer(strategy = 'median')\n", "pdf_prev_app['RATE_DOWN_PAYMENT'] = imputer.fit_transform(pdf_prev_app[['RATE_DOWN_PAYMENT']])\n", "pdf_prev_app['RATE_INTEREST_PRIMARY'] = imputer.fit_transform(pdf_prev_app[['RATE_INTEREST_PRIMARY']])\n", "pdf_prev_app['RATE_INTEREST_PRIVILEGED'] = imputer.fit_transform(pdf_prev_app[['RATE_INTEREST_PRIVILEGED']])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CNT_PAYMENT\n", "\n", "Kì hạn của khoản vay" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.22286365699245725" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app[\"CNT_PAYMENT\"].isna().mean()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9003707385218817" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Có rất nhiều giá trị 0 tại cột AMT_CREDIT khi CNT_PAYMENT là Null\n", "series_check = pdf_prev_app[pdf_prev_app[\"CNT_PAYMENT\"].isna()][\"AMT_CREDIT\"]\n", "series_check[series_check == 0.0].count() * 1.0 / series_check.shape[0]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Canceled 305566\n", "Refused 29087 \n", "Unused offer 492 \n", "Name: NAME_CONTRACT_STATUS, dtype: int64" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app.loc[series_check[series_check == 0.0].index][\"NAME_CONTRACT_STATUS\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Unused offer 25032\n", "Refused 11810\n", "Canceled 239 \n", "Approved 4 \n", "Name: NAME_CONTRACT_STATUS, dtype: int64" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app.loc[series_check[series_check > 0.0].index][\"NAME_CONTRACT_STATUS\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['DAYS_DECISION',\n", " 'DAYS_FIRST_DRAWING',\n", " 'DAYS_FIRST_DUE',\n", " 'DAYS_LAST_DUE_1ST_VERSION',\n", " 'DAYS_LAST_DUE',\n", " 'DAYS_TERMINATION']" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ls_days = [cname for cname in pdf_prev_app.columns if \"DAYS\" in cname]\n", "ls_days" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DAYS_DECISIONDAYS_FIRST_DRAWINGDAYS_FIRST_DUEDAYS_LAST_DUE_1ST_VERSIONDAYS_LAST_DUEDAYS_TERMINATION
count372230.00000.00.00.00.00.0
mean-316.8921NaNNaNNaNNaNNaN
std325.8686NaNNaNNaNNaNNaN
min-2580.0000NaNNaNNaNNaNNaN
25%-363.0000NaNNaNNaNNaNNaN
50%-250.0000NaNNaNNaNNaNNaN
75%-153.0000NaNNaNNaNNaNNaN
max-2.0000NaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " DAYS_DECISION DAYS_FIRST_DRAWING DAYS_FIRST_DUE DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE DAYS_TERMINATION\n", "count 372230.0000 0.0 0.0 0.0 0.0 0.0 \n", "mean -316.8921 NaN NaN NaN NaN NaN \n", "std 325.8686 NaN NaN NaN NaN NaN \n", "min -2580.0000 NaN NaN NaN NaN NaN \n", "25% -363.0000 NaN NaN NaN NaN NaN \n", "50% -250.0000 NaN NaN NaN NaN NaN \n", "75% -153.0000 NaN NaN NaN NaN NaN \n", "max -2.0000 NaN NaN NaN NaN NaN " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Những rows có CNT_PAYMENT là null thì các trường DAYS* đều là null. \n", "# Có thể KH đã trả trước.\n", "pdf_prev_app.loc[series_check.index][ls_days].describe()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fill 0 cho các trường hợp này\n", "pdf_prev_app.loc[pdf_prev_app[\"CNT_PAYMENT\"].isna(), \"CNT_PAYMENT\"] = 0\n", "pdf_prev_app[\"CNT_PAYMENT\"].isna().mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DAYS_FIRST_DRAWING: \n", "\n", "Ngày giải ngân đầu tiên" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "\n", "# Đây là trường hợp \n", "# Đặt các món vay này có PRE_PAID_ALL = 1\n", "# Fill giá trị PRE_PAID_ALL các món trên là 1\n" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.40298129461254667" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app[\"DAYS_FIRST_DRAWING\"].isna().mean()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Canceled 316319\n", "Refused 290678\n", "Approved 39632 \n", "Unused offer 26436 \n", "Name: NAME_CONTRACT_STATUS, dtype: int64" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# DAYS_FIRST_DRAWING bị null mà vẫn được approved?\n", "# có thể KH trả toàn bộ tiền, thay đổi ý kiến trước khi nhận được tiền vay\n", "pdf_prev_app[pdf_prev_app[\"DAYS_FIRST_DRAWING\"].isna()][\"NAME_CONTRACT_STATUS\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 997149\n", "1 673065\n", "Name: PREPAID_USER, dtype: int64" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# tạo feature xác định có phải là prepaid user hay không\n", "pdf_prev_app[\"PREPAID_USER\"] = 0\n", "pdf_prev_app.loc[pdf_prev_app[\"DAYS_FIRST_DRAWING\"].isna(), \"PREPAID_USER\"] = 1\n", "pdf_prev_app[\"PREPAID_USER\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Handling DAYS_* == 365243\n", "\n", "Các giá trị 365243 thể hiện giá trị vô hạn, tương đương null value. Do đó, fill toàn bộ giá trị này thành null và đánh dấu điểm dị biệt này" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DAYS_DECISION: 0.0\n", "DAYS_FIRST_DRAWING: 0.559475612107\n", "DAYS_FIRST_DUE: 0.0243352049498\n", "DAYS_LAST_DUE_1ST_VERSION: 0.0561987865028\n", "DAYS_LAST_DUE: 0.126463435224\n", "DAYS_TERMINATION: 0.135259912802\n" ] } ], "source": [ "def handling_days(pdf_input, cname):\n", " # Create an anomalous flag column\n", " pdf_input[\"{}_ANOM\".format(cname)] = pdf_input[cname] == 365243\n", "\n", " # Replace the anomalous values with nan\n", " pdf_input[cname] = pdf_input[cname].replace({365243: np.nan})\n", "\n", " # Calculate years\n", " pdf_input[\"{}_TO_YEARS\".format(cname)] = pdf_input[cname] / -365\n", " pdf_input.drop(columns=[cname], inplace=True)\n", "\n", " return pdf_input\n", "\n", "\n", "for cname in ls_days:\n", " print(\"{}: {}\".format(cname, (pdf_prev_app[cname] == 365243).mean()))\n", " pdf_prev_app = handling_days(pdf_prev_app, cname)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Ordinal encoding\n", "\n", "NAME_CASH_LOAN_PURPOSE, NAME_CONTRACT_STATUS, NAME_PAYMENT_TYPE, CODE_REJECT_REASON, NAME_CLIENT_TYPE, NAME_GOODS_CATEGORY, NAME_PORTFOLIO, CHANNEL_TYPE, NAME_SELLER_INDUSTRY, NAME_YIELD_GROUP" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "def ordinal_encoding(pdf_input, cname):\n", " # ranking by count frequency\n", " series_ranking = pdf_prev_app[cname].value_counts()\n", " \n", " # generate mapping to ordinal\n", " cate_map = dict(zip(series_ranking.index, range(series_ranking.shape[0])))\n", " \n", " # transform values\n", " pdf_prev_app[\"{}_ordinal\".format(cname)] = pdf_prev_app[cname].apply(lambda x: cate_map[x])\n", " pdf_input[\"{}_ordinal\".format(cname)] = pdf_input[cname].apply(lambda x: cate_map[x])\n", " \n", " return pdf_input" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Encoding NAME_CASH_LOAN_PURPOSE...\n", "Encoding NAME_CONTRACT_STATUS...\n", "Encoding NAME_PAYMENT_TYPE...\n", "Encoding CODE_REJECT_REASON...\n", "Encoding NAME_CLIENT_TYPE...\n", "Encoding NAME_GOODS_CATEGORY...\n", "Encoding NAME_PORTFOLIO...\n", "Encoding CHANNEL_TYPE...\n", "Encoding NAME_SELLER_INDUSTRY...\n", "Encoding NAME_YIELD_GROUP...\n" ] } ], "source": [ "ls_to_ordinal = [\n", " \"NAME_CASH_LOAN_PURPOSE\",\n", " \"NAME_CONTRACT_STATUS\",\n", " \"NAME_PAYMENT_TYPE\",\n", " \"CODE_REJECT_REASON\",\n", " \"NAME_CLIENT_TYPE\",\n", " \"NAME_GOODS_CATEGORY\",\n", " \"NAME_PORTFOLIO\",\n", " \"CHANNEL_TYPE\",\n", " \"NAME_SELLER_INDUSTRY\",\n", " \"NAME_YIELD_GROUP\" \n", "]\n", "for cname in ls_to_ordinal:\n", " print(\"Encoding {}...\".format(cname))\n", " pdf_prev_app = ordinal_encoding(pdf_prev_app, cname)\n", " \n", "# drop columns\n", "pdf_prev_app.drop(columns=ls_to_ordinal, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## NAME_TYPE_SUITE\n", "\n", "Ai đi cùng KH tới vay" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.4911975351661524" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_prev_app[\"NAME_TYPE_SUITE\"].isna().mean()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fill na là Unaccompanied\n", "pdf_prev_app.loc[pdf_prev_app[\"NAME_TYPE_SUITE\"].isna(), \"NAME_TYPE_SUITE\"] = \"Unaccompanied\"\n", "pdf_prev_app[\"NAME_TYPE_SUITE\"].isna().mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Categorical features" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['NAME_CONTRACT_TYPE',\n", " 'WEEKDAY_APPR_PROCESS_START',\n", " 'FLAG_LAST_APPL_PER_CONTRACT',\n", " 'NAME_TYPE_SUITE',\n", " 'NAME_PRODUCT_TYPE',\n", " 'PRODUCT_COMBINATION']" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get list categorical attributes\n", "# ls_cate = pdf_meta.query(\"sub_type == 'object'\")[\"name\"].tolist()\n", "series_type = pdf_prev_app.dtypes\n", "ls_cate = series_type[series_type == \"object\"].index.tolist()\n", "ls_cate" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# construct categorical mapping\n", "dict_onehot = {}\n", "for cate in ls_cate:\n", " ls_val = pdf_prev_app[cate].value_counts().index.tolist()\n", " dict_onehot[cate] = ls_val\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### one hot" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1670214, 40)\n", "CPU times: user 1min 19s, sys: 2.64 s, total: 1min 21s\n", "Wall time: 28.9 s\n" ] } ], "source": [ "%%time\n", "pdf_onehot = gen_one_hot_feat(pdf_prev_app, dict_onehot, main_key=\"SK_ID_CURR\")\n", "print(pdf_onehot.shape)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'FLAG_LAST_APPL_PER_CONTRACT_N': ['max', 'sum', 'mean'],\n", " 'FLAG_LAST_APPL_PER_CONTRACT_Y': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_TYPE_Cash_loans': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_TYPE_Consumer_loans': ['max', 'sum', 'mean'],\n", " 'NAME_CONTRACT_TYPE_Revolving_loans': ['max', 'sum', 'mean'],\n", " 'NAME_PRODUCT_TYPE_XNA': ['max', 'sum', 'mean'],\n", " 'NAME_PRODUCT_TYPE_walk_in': ['max', 'sum', 'mean'],\n", " 'NAME_PRODUCT_TYPE_x_sell': ['max', 'sum', 'mean'],\n", " 'NAME_TYPE_SUITE_Children': ['max', 'sum', 'mean'],\n", " 'NAME_TYPE_SUITE_Family': ['max', 'sum', 'mean'],\n", " 'NAME_TYPE_SUITE_Group_of_people': ['max', 'sum', 'mean'],\n", " 'NAME_TYPE_SUITE_Other_A': ['max', 'sum', 'mean'],\n", " 'NAME_TYPE_SUITE_Other_B': ['max', 'sum', 'mean'],\n", " 'NAME_TYPE_SUITE_Spouse,_partner': ['max', 'sum', 'mean'],\n", " 'NAME_TYPE_SUITE_Unaccompanied': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_Card_Street': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_Card_X_Sell': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_Cash': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_Cash_Street__high': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_Cash_Street__low': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_Cash_Street__middle': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_Cash_X_Sell__high': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_Cash_X_Sell__low': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_Cash_X_Sell__middle': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_POS_household_with_interest': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_POS_household_without_interest': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_POS_industry_with_interest': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_POS_industry_without_interest': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_POS_mobile_with_interest': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_POS_mobile_without_interest': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_POS_other_with_interest': ['max', 'sum', 'mean'],\n", " 'PRODUCT_COMBINATION_POS_others_without_interest': ['max', 'sum', 'mean'],\n", " 'WEEKDAY_APPR_PROCESS_START_FRIDAY': ['max', 'sum', 'mean'],\n", " 'WEEKDAY_APPR_PROCESS_START_MONDAY': ['max', 'sum', 'mean'],\n", " 'WEEKDAY_APPR_PROCESS_START_SATURDAY': ['max', 'sum', 'mean'],\n", " 'WEEKDAY_APPR_PROCESS_START_SUNDAY': ['max', 'sum', 'mean'],\n", " 'WEEKDAY_APPR_PROCESS_START_THURSDAY': ['max', 'sum', 'mean'],\n", " 'WEEKDAY_APPR_PROCESS_START_TUESDAY': ['max', 'sum', 'mean'],\n", " 'WEEKDAY_APPR_PROCESS_START_WEDNESDAY': ['max', 'sum', 'mean']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (338857, 117)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
58NAME_PRODUCT_TYPE_walk_in_sum0.55020.06421.0
59NAME_PRODUCT_TYPE_walk_in_mean0.54990.05881.0
57NAME_PRODUCT_TYPE_walk_in_max0.54680.05951.0
115NAME_CONTRACT_TYPE_Revolving_loans_sum0.53780.04711.0
116NAME_CONTRACT_TYPE_Revolving_loans_mean0.53580.03941.0
100PRODUCT_COMBINATION_Card_Street_sum0.53430.05011.0
101PRODUCT_COMBINATION_Card_Street_mean0.53330.04161.0
98NAME_CONTRACT_TYPE_Consumer_loans_mean0.5328-0.03191.0
114NAME_CONTRACT_TYPE_Revolving_loans_max0.53270.03751.0
99PRODUCT_COMBINATION_Card_Street_max0.53170.04171.0
14PRODUCT_COMBINATION_Cash_X_Sell__low_mean0.5307-0.04001.0
12PRODUCT_COMBINATION_Cash_X_Sell__low_max0.5288-0.03791.0
13PRODUCT_COMBINATION_Cash_X_Sell__low_sum0.5287-0.02491.0
71PRODUCT_COMBINATION_POS_industry_with_interest_mean0.5282-0.03531.0
97NAME_CONTRACT_TYPE_Consumer_loans_sum0.5272-0.01971.0
8NAME_TYPE_SUITE_Unaccompanied_mean0.52700.02381.0
70PRODUCT_COMBINATION_POS_industry_with_interest_sum0.5269-0.03121.0
69PRODUCT_COMBINATION_POS_industry_with_interest_max0.5266-0.03521.0
56NAME_TYPE_SUITE_Family_mean0.5241-0.02161.0
43PRODUCT_COMBINATION_Cash_X_Sell__high_sum0.52290.03841.0
44PRODUCT_COMBINATION_Cash_X_Sell__high_mean0.52250.03201.0
42PRODUCT_COMBINATION_Cash_X_Sell__high_max0.52240.03761.0
29NAME_PRODUCT_TYPE_XNA_mean0.5219-0.02081.0
34PRODUCT_COMBINATION_Cash_Street__high_sum0.52150.03221.0
54NAME_TYPE_SUITE_Family_max0.5215-0.02421.0
35PRODUCT_COMBINATION_Cash_Street__high_mean0.52150.03171.0
55NAME_TYPE_SUITE_Family_sum0.5214-0.01641.0
33PRODUCT_COMBINATION_Cash_Street__high_max0.52130.03681.0
7NAME_TYPE_SUITE_Unaccompanied_sum0.51980.02731.0
92PRODUCT_COMBINATION_POS_household_without_interest_mean0.5197-0.02391.0
91PRODUCT_COMBINATION_POS_household_without_interest_sum0.5191-0.02321.0
90PRODUCT_COMBINATION_POS_household_without_interest_max0.5190-0.02651.0
25PRODUCT_COMBINATION_Cash_sum0.51830.02361.0
112NAME_CONTRACT_TYPE_Cash_loans_sum0.51720.02171.0
110PRODUCT_COMBINATION_POS_mobile_with_interest_mean0.51670.02121.0
26PRODUCT_COMBINATION_Cash_mean0.51660.01801.0
62NAME_PRODUCT_TYPE_x_sell_mean0.5160-0.01971.0
24PRODUCT_COMBINATION_Cash_max0.51520.01751.0
108PRODUCT_COMBINATION_POS_mobile_with_interest_max0.51450.01621.0
109PRODUCT_COMBINATION_POS_mobile_with_interest_sum0.51440.01321.0
113NAME_CONTRACT_TYPE_Cash_loans_mean0.51410.01401.0
4PRODUCT_COMBINATION_Cash_Street__middle_sum0.51220.02661.0
3PRODUCT_COMBINATION_Cash_Street__middle_max0.51200.02581.0
5PRODUCT_COMBINATION_Cash_Street__middle_mean0.51190.01631.0
77PRODUCT_COMBINATION_Cash_X_Sell__middle_mean0.5115-0.01641.0
20PRODUCT_COMBINATION_POS_household_with_interest_mean0.5111-0.00821.0
19PRODUCT_COMBINATION_POS_household_with_interest_sum0.5105-0.00811.0
75PRODUCT_COMBINATION_Cash_X_Sell__middle_max0.5100-0.01281.0
76PRODUCT_COMBINATION_Cash_X_Sell__middle_sum0.5100-0.00771.0
64PRODUCT_COMBINATION_Card_X_Sell_sum0.51000.01781.0
79FLAG_LAST_APPL_PER_CONTRACT_Y_sum0.51000.01931.0
53NAME_TYPE_SUITE_Children_mean0.5099-0.01751.0
51NAME_TYPE_SUITE_Children_max0.5097-0.02031.0
52NAME_TYPE_SUITE_Children_sum0.5097-0.01751.0
111NAME_CONTRACT_TYPE_Cash_loans_max0.50960.01071.0
63PRODUCT_COMBINATION_Card_X_Sell_max0.50950.01311.0
65PRODUCT_COMBINATION_Card_X_Sell_mean0.50900.00941.0
11WEEKDAY_APPR_PROCESS_START_SATURDAY_mean0.5090-0.00981.0
46WEEKDAY_APPR_PROCESS_START_MONDAY_sum0.50900.01381.0
18PRODUCT_COMBINATION_POS_household_with_interest_max0.5085-0.00941.0
60NAME_PRODUCT_TYPE_x_sell_max0.5084-0.00931.0
83PRODUCT_COMBINATION_POS_industry_without_interest_mean0.5080-0.02161.0
81PRODUCT_COMBINATION_POS_industry_without_interest_max0.5080-0.02461.0
82PRODUCT_COMBINATION_POS_industry_without_interest_sum0.5080-0.02271.0
61NAME_PRODUCT_TYPE_x_sell_sum0.5078-0.00341.0
95WEEKDAY_APPR_PROCESS_START_SUNDAY_mean0.5078-0.00781.0
96NAME_CONTRACT_TYPE_Consumer_loans_max0.5077-0.01611.0
9WEEKDAY_APPR_PROCESS_START_SATURDAY_max0.5068-0.00761.0
73WEEKDAY_APPR_PROCESS_START_WEDNESDAY_sum0.50680.01141.0
1WEEKDAY_APPR_PROCESS_START_FRIDAY_sum0.50660.01241.0
93WEEKDAY_APPR_PROCESS_START_SUNDAY_max0.5066-0.00781.0
45WEEKDAY_APPR_PROCESS_START_MONDAY_max0.50640.00721.0
94WEEKDAY_APPR_PROCESS_START_SUNDAY_sum0.5059-0.00241.0
37WEEKDAY_APPR_PROCESS_START_THURSDAY_sum0.50580.01441.0
47WEEKDAY_APPR_PROCESS_START_MONDAY_mean0.50580.00531.0
16PRODUCT_COMBINATION_Cash_Street__low_sum0.50500.01261.0
15PRODUCT_COMBINATION_Cash_Street__low_max0.50490.01161.0
17PRODUCT_COMBINATION_Cash_Street__low_mean0.50480.00231.0
88WEEKDAY_APPR_PROCESS_START_TUESDAY_sum0.50460.01301.0
74WEEKDAY_APPR_PROCESS_START_WEDNESDAY_mean0.50420.00551.0
72WEEKDAY_APPR_PROCESS_START_WEDNESDAY_max0.50410.00461.0
10WEEKDAY_APPR_PROCESS_START_SATURDAY_sum0.50390.00321.0
6NAME_TYPE_SUITE_Unaccompanied_max0.50380.00791.0
0WEEKDAY_APPR_PROCESS_START_FRIDAY_max0.50340.00371.0
2WEEKDAY_APPR_PROCESS_START_FRIDAY_mean0.50330.00531.0
86NAME_TYPE_SUITE_Spouse,_partner_mean0.5021-0.00411.0
36WEEKDAY_APPR_PROCESS_START_THURSDAY_max0.50200.00231.0
27NAME_PRODUCT_TYPE_XNA_max0.5020-0.00591.0
28NAME_PRODUCT_TYPE_XNA_sum0.50190.00711.0
84NAME_TYPE_SUITE_Spouse,_partner_max0.5018-0.00271.0
32PRODUCT_COMBINATION_POS_mobile_without_interest_mean0.5017-0.00221.0
30PRODUCT_COMBINATION_POS_mobile_without_interest_max0.5017-0.00391.0
31PRODUCT_COMBINATION_POS_mobile_without_interest_sum0.5017-0.00241.0
50PRODUCT_COMBINATION_POS_other_with_interest_mean0.5016-0.00491.0
48PRODUCT_COMBINATION_POS_other_with_interest_max0.5015-0.00361.0
49PRODUCT_COMBINATION_POS_other_with_interest_sum0.5015-0.00291.0
85NAME_TYPE_SUITE_Spouse,_partner_sum0.50150.00271.0
67FLAG_LAST_APPL_PER_CONTRACT_N_sum0.50150.01031.0
80FLAG_LAST_APPL_PER_CONTRACT_Y_mean0.5015-0.00681.0
68FLAG_LAST_APPL_PER_CONTRACT_N_mean0.50150.00681.0
66FLAG_LAST_APPL_PER_CONTRACT_N_max0.50150.00621.0
38WEEKDAY_APPR_PROCESS_START_THURSDAY_mean0.50140.00101.0
103NAME_TYPE_SUITE_Other_A_sum0.50100.00351.0
104NAME_TYPE_SUITE_Other_A_mean0.50100.00551.0
102NAME_TYPE_SUITE_Other_A_max0.50100.00381.0
87WEEKDAY_APPR_PROCESS_START_TUESDAY_max0.50100.00111.0
106NAME_TYPE_SUITE_Other_B_sum0.50070.00291.0
107NAME_TYPE_SUITE_Other_B_mean0.50070.00251.0
105NAME_TYPE_SUITE_Other_B_max0.50070.00181.0
40PRODUCT_COMBINATION_POS_others_without_interest_sum0.5004-0.00331.0
39PRODUCT_COMBINATION_POS_others_without_interest_max0.5004-0.00301.0
41PRODUCT_COMBINATION_POS_others_without_interest_mean0.5004-0.00191.0
23NAME_TYPE_SUITE_Group_of_people_mean0.50030.00341.0
22NAME_TYPE_SUITE_Group_of_people_sum0.50030.00231.0
21NAME_TYPE_SUITE_Group_of_people_max0.50030.00211.0
89WEEKDAY_APPR_PROCESS_START_TUESDAY_mean0.50000.00051.0
78FLAG_LAST_APPL_PER_CONTRACT_Y_max0.5000NaN1.0
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "58 NAME_PRODUCT_TYPE_walk_in_sum 0.5502 0.0642 1.0 \n", "59 NAME_PRODUCT_TYPE_walk_in_mean 0.5499 0.0588 1.0 \n", "57 NAME_PRODUCT_TYPE_walk_in_max 0.5468 0.0595 1.0 \n", "115 NAME_CONTRACT_TYPE_Revolving_loans_sum 0.5378 0.0471 1.0 \n", "116 NAME_CONTRACT_TYPE_Revolving_loans_mean 0.5358 0.0394 1.0 \n", "100 PRODUCT_COMBINATION_Card_Street_sum 0.5343 0.0501 1.0 \n", "101 PRODUCT_COMBINATION_Card_Street_mean 0.5333 0.0416 1.0 \n", "98 NAME_CONTRACT_TYPE_Consumer_loans_mean 0.5328 -0.0319 1.0 \n", "114 NAME_CONTRACT_TYPE_Revolving_loans_max 0.5327 0.0375 1.0 \n", "99 PRODUCT_COMBINATION_Card_Street_max 0.5317 0.0417 1.0 \n", "14 PRODUCT_COMBINATION_Cash_X_Sell__low_mean 0.5307 -0.0400 1.0 \n", "12 PRODUCT_COMBINATION_Cash_X_Sell__low_max 0.5288 -0.0379 1.0 \n", "13 PRODUCT_COMBINATION_Cash_X_Sell__low_sum 0.5287 -0.0249 1.0 \n", "71 PRODUCT_COMBINATION_POS_industry_with_interest_mean 0.5282 -0.0353 1.0 \n", "97 NAME_CONTRACT_TYPE_Consumer_loans_sum 0.5272 -0.0197 1.0 \n", "8 NAME_TYPE_SUITE_Unaccompanied_mean 0.5270 0.0238 1.0 \n", "70 PRODUCT_COMBINATION_POS_industry_with_interest_sum 0.5269 -0.0312 1.0 \n", "69 PRODUCT_COMBINATION_POS_industry_with_interest_max 0.5266 -0.0352 1.0 \n", "56 NAME_TYPE_SUITE_Family_mean 0.5241 -0.0216 1.0 \n", "43 PRODUCT_COMBINATION_Cash_X_Sell__high_sum 0.5229 0.0384 1.0 \n", "44 PRODUCT_COMBINATION_Cash_X_Sell__high_mean 0.5225 0.0320 1.0 \n", "42 PRODUCT_COMBINATION_Cash_X_Sell__high_max 0.5224 0.0376 1.0 \n", "29 NAME_PRODUCT_TYPE_XNA_mean 0.5219 -0.0208 1.0 \n", "34 PRODUCT_COMBINATION_Cash_Street__high_sum 0.5215 0.0322 1.0 \n", "54 NAME_TYPE_SUITE_Family_max 0.5215 -0.0242 1.0 \n", "35 PRODUCT_COMBINATION_Cash_Street__high_mean 0.5215 0.0317 1.0 \n", "55 NAME_TYPE_SUITE_Family_sum 0.5214 -0.0164 1.0 \n", "33 PRODUCT_COMBINATION_Cash_Street__high_max 0.5213 0.0368 1.0 \n", "7 NAME_TYPE_SUITE_Unaccompanied_sum 0.5198 0.0273 1.0 \n", "92 PRODUCT_COMBINATION_POS_household_without_interest_mean 0.5197 -0.0239 1.0 \n", "91 PRODUCT_COMBINATION_POS_household_without_interest_sum 0.5191 -0.0232 1.0 \n", "90 PRODUCT_COMBINATION_POS_household_without_interest_max 0.5190 -0.0265 1.0 \n", "25 PRODUCT_COMBINATION_Cash_sum 0.5183 0.0236 1.0 \n", "112 NAME_CONTRACT_TYPE_Cash_loans_sum 0.5172 0.0217 1.0 \n", "110 PRODUCT_COMBINATION_POS_mobile_with_interest_mean 0.5167 0.0212 1.0 \n", "26 PRODUCT_COMBINATION_Cash_mean 0.5166 0.0180 1.0 \n", "62 NAME_PRODUCT_TYPE_x_sell_mean 0.5160 -0.0197 1.0 \n", "24 PRODUCT_COMBINATION_Cash_max 0.5152 0.0175 1.0 \n", "108 PRODUCT_COMBINATION_POS_mobile_with_interest_max 0.5145 0.0162 1.0 \n", "109 PRODUCT_COMBINATION_POS_mobile_with_interest_sum 0.5144 0.0132 1.0 \n", "113 NAME_CONTRACT_TYPE_Cash_loans_mean 0.5141 0.0140 1.0 \n", "4 PRODUCT_COMBINATION_Cash_Street__middle_sum 0.5122 0.0266 1.0 \n", "3 PRODUCT_COMBINATION_Cash_Street__middle_max 0.5120 0.0258 1.0 \n", "5 PRODUCT_COMBINATION_Cash_Street__middle_mean 0.5119 0.0163 1.0 \n", "77 PRODUCT_COMBINATION_Cash_X_Sell__middle_mean 0.5115 -0.0164 1.0 \n", "20 PRODUCT_COMBINATION_POS_household_with_interest_mean 0.5111 -0.0082 1.0 \n", "19 PRODUCT_COMBINATION_POS_household_with_interest_sum 0.5105 -0.0081 1.0 \n", "75 PRODUCT_COMBINATION_Cash_X_Sell__middle_max 0.5100 -0.0128 1.0 \n", "76 PRODUCT_COMBINATION_Cash_X_Sell__middle_sum 0.5100 -0.0077 1.0 \n", "64 PRODUCT_COMBINATION_Card_X_Sell_sum 0.5100 0.0178 1.0 \n", "79 FLAG_LAST_APPL_PER_CONTRACT_Y_sum 0.5100 0.0193 1.0 \n", "53 NAME_TYPE_SUITE_Children_mean 0.5099 -0.0175 1.0 \n", "51 NAME_TYPE_SUITE_Children_max 0.5097 -0.0203 1.0 \n", "52 NAME_TYPE_SUITE_Children_sum 0.5097 -0.0175 1.0 \n", "111 NAME_CONTRACT_TYPE_Cash_loans_max 0.5096 0.0107 1.0 \n", "63 PRODUCT_COMBINATION_Card_X_Sell_max 0.5095 0.0131 1.0 \n", "65 PRODUCT_COMBINATION_Card_X_Sell_mean 0.5090 0.0094 1.0 \n", "11 WEEKDAY_APPR_PROCESS_START_SATURDAY_mean 0.5090 -0.0098 1.0 \n", "46 WEEKDAY_APPR_PROCESS_START_MONDAY_sum 0.5090 0.0138 1.0 \n", "18 PRODUCT_COMBINATION_POS_household_with_interest_max 0.5085 -0.0094 1.0 \n", "60 NAME_PRODUCT_TYPE_x_sell_max 0.5084 -0.0093 1.0 \n", "83 PRODUCT_COMBINATION_POS_industry_without_interest_mean 0.5080 -0.0216 1.0 \n", "81 PRODUCT_COMBINATION_POS_industry_without_interest_max 0.5080 -0.0246 1.0 \n", "82 PRODUCT_COMBINATION_POS_industry_without_interest_sum 0.5080 -0.0227 1.0 \n", "61 NAME_PRODUCT_TYPE_x_sell_sum 0.5078 -0.0034 1.0 \n", "95 WEEKDAY_APPR_PROCESS_START_SUNDAY_mean 0.5078 -0.0078 1.0 \n", "96 NAME_CONTRACT_TYPE_Consumer_loans_max 0.5077 -0.0161 1.0 \n", "9 WEEKDAY_APPR_PROCESS_START_SATURDAY_max 0.5068 -0.0076 1.0 \n", "73 WEEKDAY_APPR_PROCESS_START_WEDNESDAY_sum 0.5068 0.0114 1.0 \n", "1 WEEKDAY_APPR_PROCESS_START_FRIDAY_sum 0.5066 0.0124 1.0 \n", "93 WEEKDAY_APPR_PROCESS_START_SUNDAY_max 0.5066 -0.0078 1.0 \n", "45 WEEKDAY_APPR_PROCESS_START_MONDAY_max 0.5064 0.0072 1.0 \n", "94 WEEKDAY_APPR_PROCESS_START_SUNDAY_sum 0.5059 -0.0024 1.0 \n", "37 WEEKDAY_APPR_PROCESS_START_THURSDAY_sum 0.5058 0.0144 1.0 \n", "47 WEEKDAY_APPR_PROCESS_START_MONDAY_mean 0.5058 0.0053 1.0 \n", "16 PRODUCT_COMBINATION_Cash_Street__low_sum 0.5050 0.0126 1.0 \n", "15 PRODUCT_COMBINATION_Cash_Street__low_max 0.5049 0.0116 1.0 \n", "17 PRODUCT_COMBINATION_Cash_Street__low_mean 0.5048 0.0023 1.0 \n", "88 WEEKDAY_APPR_PROCESS_START_TUESDAY_sum 0.5046 0.0130 1.0 \n", "74 WEEKDAY_APPR_PROCESS_START_WEDNESDAY_mean 0.5042 0.0055 1.0 \n", "72 WEEKDAY_APPR_PROCESS_START_WEDNESDAY_max 0.5041 0.0046 1.0 \n", "10 WEEKDAY_APPR_PROCESS_START_SATURDAY_sum 0.5039 0.0032 1.0 \n", "6 NAME_TYPE_SUITE_Unaccompanied_max 0.5038 0.0079 1.0 \n", "0 WEEKDAY_APPR_PROCESS_START_FRIDAY_max 0.5034 0.0037 1.0 \n", "2 WEEKDAY_APPR_PROCESS_START_FRIDAY_mean 0.5033 0.0053 1.0 \n", "86 NAME_TYPE_SUITE_Spouse,_partner_mean 0.5021 -0.0041 1.0 \n", "36 WEEKDAY_APPR_PROCESS_START_THURSDAY_max 0.5020 0.0023 1.0 \n", "27 NAME_PRODUCT_TYPE_XNA_max 0.5020 -0.0059 1.0 \n", "28 NAME_PRODUCT_TYPE_XNA_sum 0.5019 0.0071 1.0 \n", "84 NAME_TYPE_SUITE_Spouse,_partner_max 0.5018 -0.0027 1.0 \n", "32 PRODUCT_COMBINATION_POS_mobile_without_interest_mean 0.5017 -0.0022 1.0 \n", "30 PRODUCT_COMBINATION_POS_mobile_without_interest_max 0.5017 -0.0039 1.0 \n", "31 PRODUCT_COMBINATION_POS_mobile_without_interest_sum 0.5017 -0.0024 1.0 \n", "50 PRODUCT_COMBINATION_POS_other_with_interest_mean 0.5016 -0.0049 1.0 \n", "48 PRODUCT_COMBINATION_POS_other_with_interest_max 0.5015 -0.0036 1.0 \n", "49 PRODUCT_COMBINATION_POS_other_with_interest_sum 0.5015 -0.0029 1.0 \n", "85 NAME_TYPE_SUITE_Spouse,_partner_sum 0.5015 0.0027 1.0 \n", "67 FLAG_LAST_APPL_PER_CONTRACT_N_sum 0.5015 0.0103 1.0 \n", "80 FLAG_LAST_APPL_PER_CONTRACT_Y_mean 0.5015 -0.0068 1.0 \n", "68 FLAG_LAST_APPL_PER_CONTRACT_N_mean 0.5015 0.0068 1.0 \n", "66 FLAG_LAST_APPL_PER_CONTRACT_N_max 0.5015 0.0062 1.0 \n", "38 WEEKDAY_APPR_PROCESS_START_THURSDAY_mean 0.5014 0.0010 1.0 \n", "103 NAME_TYPE_SUITE_Other_A_sum 0.5010 0.0035 1.0 \n", "104 NAME_TYPE_SUITE_Other_A_mean 0.5010 0.0055 1.0 \n", "102 NAME_TYPE_SUITE_Other_A_max 0.5010 0.0038 1.0 \n", "87 WEEKDAY_APPR_PROCESS_START_TUESDAY_max 0.5010 0.0011 1.0 \n", "106 NAME_TYPE_SUITE_Other_B_sum 0.5007 0.0029 1.0 \n", "107 NAME_TYPE_SUITE_Other_B_mean 0.5007 0.0025 1.0 \n", "105 NAME_TYPE_SUITE_Other_B_max 0.5007 0.0018 1.0 \n", "40 PRODUCT_COMBINATION_POS_others_without_interest_sum 0.5004 -0.0033 1.0 \n", "39 PRODUCT_COMBINATION_POS_others_without_interest_max 0.5004 -0.0030 1.0 \n", "41 PRODUCT_COMBINATION_POS_others_without_interest_mean 0.5004 -0.0019 1.0 \n", "23 NAME_TYPE_SUITE_Group_of_people_mean 0.5003 0.0034 1.0 \n", "22 NAME_TYPE_SUITE_Group_of_people_sum 0.5003 0.0023 1.0 \n", "21 NAME_TYPE_SUITE_Group_of_people_max 0.5003 0.0021 1.0 \n", "89 WEEKDAY_APPR_PROCESS_START_TUESDAY_mean 0.5000 0.0005 1.0 \n", "78 FLAG_LAST_APPL_PER_CONTRACT_Y_max 0.5000 NaN 1.0 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 54.8 s, sys: 1.93 s, total: 56.7 s\n", "Wall time: 14.5 s\n" ] } ], "source": [ "%%time\n", "pdf_agg01 = agg_common_data(pdf_onehot, [\"max\", \"sum\", \"mean\"], main_key=\"SK_ID_CURR\")\n", "eval_agg01 = feature_evaluate(pdf_train_filtered, pdf_agg01)\n", "display(eval_agg01)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "(12, 4)" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eval_agg01.query(\"auc <= 0.501\").shape" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(338857, 105)\n" ] } ], "source": [ "sel_feat = eval_agg01.query(\"auc > 0.501\")[\"name\"].tolist()\n", "pdf_agg01 = pdf_agg01[sel_feat]\n", "print(pdf_agg01.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Numerical features" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['HOUR_APPR_PROCESS_START',\n", " 'NFLAG_LAST_APPL_IN_DAY',\n", " 'SELLERPLACE_AREA',\n", " 'PREPAID_USER',\n", " 'NAME_CASH_LOAN_PURPOSE_ordinal',\n", " 'NAME_CONTRACT_STATUS_ordinal',\n", " 'NAME_PAYMENT_TYPE_ordinal',\n", " 'CODE_REJECT_REASON_ordinal',\n", " 'NAME_CLIENT_TYPE_ordinal',\n", " 'NAME_GOODS_CATEGORY_ordinal',\n", " 'NAME_PORTFOLIO_ordinal',\n", " 'CHANNEL_TYPE_ordinal',\n", " 'NAME_SELLER_INDUSTRY_ordinal',\n", " 'NAME_YIELD_GROUP_ordinal']" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get list numerical attributes\n", "# ls_num = pdf_meta.query(\"sub_type == 'int64'\")[\"name\"].tolist()\n", "series_type = pdf_prev_app.dtypes\n", "ls_num = series_type[series_type == \"int64\"].index.tolist()\n", "ls_num = [col for col in ls_num if col not in [\"SK_ID_PREV\", \"SK_ID_CURR\"]]\n", "ls_num" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_PREVSK_ID_CURRHOUR_APPR_PROCESS_STARTNFLAG_LAST_APPL_IN_DAYSELLERPLACE_AREAPREPAID_USERNAME_CASH_LOAN_PURPOSE_ordinalNAME_CONTRACT_STATUS_ordinalNAME_PAYMENT_TYPE_ordinalCODE_REJECT_REASON_ordinalNAME_CLIENT_TYPE_ordinalNAME_GOODS_CATEGORY_ordinalNAME_PORTFOLIO_ordinalCHANNEL_TYPE_ordinalNAME_SELLER_INDUSTRY_ordinalNAME_YIELD_GROUP_ordinal
020304952718771513500000010121
12802425108129111-101010001404
22523466122040111-101000001002
3281924317615871-101000001001
4178426520205491-112201001002
\n", "
" ], "text/plain": [ " SK_ID_PREV SK_ID_CURR HOUR_APPR_PROCESS_START NFLAG_LAST_APPL_IN_DAY SELLERPLACE_AREA PREPAID_USER NAME_CASH_LOAN_PURPOSE_ordinal NAME_CONTRACT_STATUS_ordinal NAME_PAYMENT_TYPE_ordinal CODE_REJECT_REASON_ordinal NAME_CLIENT_TYPE_ordinal NAME_GOODS_CATEGORY_ordinal NAME_PORTFOLIO_ordinal CHANNEL_TYPE_ordinal NAME_SELLER_INDUSTRY_ordinal NAME_YIELD_GROUP_ordinal\n", "0 2030495 271877 15 1 35 0 0 0 0 0 0 1 0 1 2 1 \n", "1 2802425 108129 11 1 -1 0 1 0 1 0 0 0 1 4 0 4 \n", "2 2523466 122040 11 1 -1 0 1 0 0 0 0 0 1 0 0 2 \n", "3 2819243 176158 7 1 -1 0 1 0 0 0 0 0 1 0 0 1 \n", "4 1784265 202054 9 1 -1 1 2 2 0 1 0 0 1 0 0 2 " ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_num = pdf_prev_app[[\"SK_ID_PREV\", \"SK_ID_CURR\"] + ls_num].copy()\n", "pdf_num.head()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'CHANNEL_TYPE_ordinal': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'CODE_REJECT_REASON_ordinal': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'HOUR_APPR_PROCESS_START': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'NAME_CASH_LOAN_PURPOSE_ordinal': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'NAME_CLIENT_TYPE_ordinal': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'NAME_CONTRACT_STATUS_ordinal': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'NAME_GOODS_CATEGORY_ordinal': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'NAME_PAYMENT_TYPE_ordinal': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'NAME_PORTFOLIO_ordinal': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'NAME_SELLER_INDUSTRY_ordinal': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'NAME_YIELD_GROUP_ordinal': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'NFLAG_LAST_APPL_IN_DAY': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'PREPAID_USER': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'SELLERPLACE_AREA': ['max', 'min', 'sum', 'mean', 'std']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (338857, 70)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
19NAME_CONTRACT_STATUS_ordinal_std0.56575.8804e-020.8192
59CODE_REJECT_REASON_ordinal_std0.56555.8077e-020.8192
18NAME_CONTRACT_STATUS_ordinal_mean0.56237.0461e-021.0000
3PREPAID_USER_mean0.55746.0266e-021.0000
58CODE_REJECT_REASON_ordinal_mean0.55616.0062e-021.0000
57CODE_REJECT_REASON_ordinal_sum0.55486.0961e-021.0000
68NAME_YIELD_GROUP_ordinal_mean0.5519-4.8816e-021.0000
17NAME_CONTRACT_STATUS_ordinal_sum0.55165.7904e-021.0000
55CODE_REJECT_REASON_ordinal_max0.55074.9633e-021.0000
13NAME_GOODS_CATEGORY_ordinal_mean0.5494-3.6786e-021.0000
65NAME_YIELD_GROUP_ordinal_max0.5452-4.1166e-021.0000
2PREPAID_USER_sum0.54504.9427e-021.0000
15NAME_CONTRACT_STATUS_ordinal_max0.54354.3386e-021.0000
28NAME_PORTFOLIO_ordinal_mean0.54294.2349e-021.0000
10NAME_GOODS_CATEGORY_ordinal_max0.5426-3.2552e-021.0000
12NAME_GOODS_CATEGORY_ordinal_sum0.5426-2.9256e-021.0000
4PREPAID_USER_std0.54054.4479e-020.8192
38SELLERPLACE_AREA_mean0.5382-2.6579e-031.0000
33HOUR_APPR_PROCESS_START_mean0.5377-3.6333e-021.0000
54NAME_CASH_LOAN_PURPOSE_ordinal_std0.53683.7460e-020.8192
39SELLERPLACE_AREA_std0.5366-4.3819e-030.8192
14NAME_GOODS_CATEGORY_ordinal_std0.5365-2.8191e-020.8192
35SELLERPLACE_AREA_max0.5351-3.1607e-031.0000
23NAME_PAYMENT_TYPE_ordinal_mean0.53513.2790e-021.0000
27NAME_PORTFOLIO_ordinal_sum0.53404.0230e-021.0000
22NAME_PAYMENT_TYPE_ordinal_sum0.53393.8618e-021.0000
31HOUR_APPR_PROCESS_START_min0.5333-3.1684e-021.0000
30HOUR_APPR_PROCESS_START_max0.5333-3.1928e-021.0000
37SELLERPLACE_AREA_sum0.5325-3.9560e-031.0000
25NAME_PORTFOLIO_ordinal_max0.53112.9373e-021.0000
29NAME_PORTFOLIO_ordinal_std0.53052.9315e-020.8192
0PREPAID_USER_max0.52893.2307e-021.0000
43NAME_SELLER_INDUSTRY_ordinal_mean0.5277-2.6485e-021.0000
50NAME_CASH_LOAN_PURPOSE_ordinal_max0.52603.5837e-021.0000
67NAME_YIELD_GROUP_ordinal_sum0.5255-1.3066e-021.0000
69NAME_YIELD_GROUP_ordinal_std0.5242-1.9679e-020.8192
40NAME_SELLER_INDUSTRY_ordinal_max0.5241-2.4982e-021.0000
52NAME_CASH_LOAN_PURPOSE_ordinal_sum0.52373.4514e-021.0000
53NAME_CASH_LOAN_PURPOSE_ordinal_mean0.52332.7435e-021.0000
42NAME_SELLER_INDUSTRY_ordinal_sum0.5229-1.4972e-021.0000
9NAME_CLIENT_TYPE_ordinal_std0.5216-1.7422e-020.8192
66NAME_YIELD_GROUP_ordinal_min0.5211-2.1298e-021.0000
44NAME_SELLER_INDUSTRY_ordinal_std0.5209-2.1940e-020.8192
24NAME_PAYMENT_TYPE_ordinal_std0.51982.1540e-020.8192
20NAME_PAYMENT_TYPE_ordinal_max0.51761.8720e-021.0000
11NAME_GOODS_CATEGORY_ordinal_min0.5163-1.9570e-021.0000
5NAME_CLIENT_TYPE_ordinal_max0.5133-1.1119e-021.0000
49CHANNEL_TYPE_ordinal_std0.51101.3215e-020.8192
62NFLAG_LAST_APPL_IN_DAY_sum0.51011.9556e-021.0000
34HOUR_APPR_PROCESS_START_std0.5100-8.9635e-030.8192
41NAME_SELLER_INDUSTRY_ordinal_min0.5098-1.2479e-021.0000
36SELLERPLACE_AREA_min0.5092-1.3533e-031.0000
45CHANNEL_TYPE_ordinal_max0.50831.3879e-021.0000
26NAME_PORTFOLIO_ordinal_min0.50781.4560e-021.0000
21NAME_PAYMENT_TYPE_ordinal_min0.50429.2246e-031.0000
8NAME_CLIENT_TYPE_ordinal_mean0.5042-7.3017e-031.0000
46CHANNEL_TYPE_ordinal_min0.5038-2.6060e-041.0000
6NAME_CLIENT_TYPE_ordinal_min0.5029-1.4025e-041.0000
48CHANNEL_TYPE_ordinal_mean0.50275.8389e-031.0000
47CHANNEL_TYPE_ordinal_sum0.50252.2143e-021.0000
7NAME_CLIENT_TYPE_ordinal_sum0.50163.7297e-031.0000
64NFLAG_LAST_APPL_IN_DAY_std0.50144.5610e-030.8192
61NFLAG_LAST_APPL_IN_DAY_min0.5011-5.0978e-031.0000
63NFLAG_LAST_APPL_IN_DAY_mean0.5011-3.5944e-031.0000
51NAME_CASH_LOAN_PURPOSE_ordinal_min0.50065.1359e-031.0000
32HOUR_APPR_PROCESS_START_sum0.50021.0930e-021.0000
56CODE_REJECT_REASON_ordinal_min0.50011.2725e-031.0000
16NAME_CONTRACT_STATUS_ordinal_min0.50005.2200e-041.0000
60NFLAG_LAST_APPL_IN_DAY_max0.5000-1.0546e-031.0000
1PREPAID_USER_min0.5000-8.9535e-051.0000
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "19 NAME_CONTRACT_STATUS_ordinal_std 0.5657 5.8804e-02 0.8192 \n", "59 CODE_REJECT_REASON_ordinal_std 0.5655 5.8077e-02 0.8192 \n", "18 NAME_CONTRACT_STATUS_ordinal_mean 0.5623 7.0461e-02 1.0000 \n", "3 PREPAID_USER_mean 0.5574 6.0266e-02 1.0000 \n", "58 CODE_REJECT_REASON_ordinal_mean 0.5561 6.0062e-02 1.0000 \n", "57 CODE_REJECT_REASON_ordinal_sum 0.5548 6.0961e-02 1.0000 \n", "68 NAME_YIELD_GROUP_ordinal_mean 0.5519 -4.8816e-02 1.0000 \n", "17 NAME_CONTRACT_STATUS_ordinal_sum 0.5516 5.7904e-02 1.0000 \n", "55 CODE_REJECT_REASON_ordinal_max 0.5507 4.9633e-02 1.0000 \n", "13 NAME_GOODS_CATEGORY_ordinal_mean 0.5494 -3.6786e-02 1.0000 \n", "65 NAME_YIELD_GROUP_ordinal_max 0.5452 -4.1166e-02 1.0000 \n", "2 PREPAID_USER_sum 0.5450 4.9427e-02 1.0000 \n", "15 NAME_CONTRACT_STATUS_ordinal_max 0.5435 4.3386e-02 1.0000 \n", "28 NAME_PORTFOLIO_ordinal_mean 0.5429 4.2349e-02 1.0000 \n", "10 NAME_GOODS_CATEGORY_ordinal_max 0.5426 -3.2552e-02 1.0000 \n", "12 NAME_GOODS_CATEGORY_ordinal_sum 0.5426 -2.9256e-02 1.0000 \n", "4 PREPAID_USER_std 0.5405 4.4479e-02 0.8192 \n", "38 SELLERPLACE_AREA_mean 0.5382 -2.6579e-03 1.0000 \n", "33 HOUR_APPR_PROCESS_START_mean 0.5377 -3.6333e-02 1.0000 \n", "54 NAME_CASH_LOAN_PURPOSE_ordinal_std 0.5368 3.7460e-02 0.8192 \n", "39 SELLERPLACE_AREA_std 0.5366 -4.3819e-03 0.8192 \n", "14 NAME_GOODS_CATEGORY_ordinal_std 0.5365 -2.8191e-02 0.8192 \n", "35 SELLERPLACE_AREA_max 0.5351 -3.1607e-03 1.0000 \n", "23 NAME_PAYMENT_TYPE_ordinal_mean 0.5351 3.2790e-02 1.0000 \n", "27 NAME_PORTFOLIO_ordinal_sum 0.5340 4.0230e-02 1.0000 \n", "22 NAME_PAYMENT_TYPE_ordinal_sum 0.5339 3.8618e-02 1.0000 \n", "31 HOUR_APPR_PROCESS_START_min 0.5333 -3.1684e-02 1.0000 \n", "30 HOUR_APPR_PROCESS_START_max 0.5333 -3.1928e-02 1.0000 \n", "37 SELLERPLACE_AREA_sum 0.5325 -3.9560e-03 1.0000 \n", "25 NAME_PORTFOLIO_ordinal_max 0.5311 2.9373e-02 1.0000 \n", "29 NAME_PORTFOLIO_ordinal_std 0.5305 2.9315e-02 0.8192 \n", "0 PREPAID_USER_max 0.5289 3.2307e-02 1.0000 \n", "43 NAME_SELLER_INDUSTRY_ordinal_mean 0.5277 -2.6485e-02 1.0000 \n", "50 NAME_CASH_LOAN_PURPOSE_ordinal_max 0.5260 3.5837e-02 1.0000 \n", "67 NAME_YIELD_GROUP_ordinal_sum 0.5255 -1.3066e-02 1.0000 \n", "69 NAME_YIELD_GROUP_ordinal_std 0.5242 -1.9679e-02 0.8192 \n", "40 NAME_SELLER_INDUSTRY_ordinal_max 0.5241 -2.4982e-02 1.0000 \n", "52 NAME_CASH_LOAN_PURPOSE_ordinal_sum 0.5237 3.4514e-02 1.0000 \n", "53 NAME_CASH_LOAN_PURPOSE_ordinal_mean 0.5233 2.7435e-02 1.0000 \n", "42 NAME_SELLER_INDUSTRY_ordinal_sum 0.5229 -1.4972e-02 1.0000 \n", "9 NAME_CLIENT_TYPE_ordinal_std 0.5216 -1.7422e-02 0.8192 \n", "66 NAME_YIELD_GROUP_ordinal_min 0.5211 -2.1298e-02 1.0000 \n", "44 NAME_SELLER_INDUSTRY_ordinal_std 0.5209 -2.1940e-02 0.8192 \n", "24 NAME_PAYMENT_TYPE_ordinal_std 0.5198 2.1540e-02 0.8192 \n", "20 NAME_PAYMENT_TYPE_ordinal_max 0.5176 1.8720e-02 1.0000 \n", "11 NAME_GOODS_CATEGORY_ordinal_min 0.5163 -1.9570e-02 1.0000 \n", "5 NAME_CLIENT_TYPE_ordinal_max 0.5133 -1.1119e-02 1.0000 \n", "49 CHANNEL_TYPE_ordinal_std 0.5110 1.3215e-02 0.8192 \n", "62 NFLAG_LAST_APPL_IN_DAY_sum 0.5101 1.9556e-02 1.0000 \n", "34 HOUR_APPR_PROCESS_START_std 0.5100 -8.9635e-03 0.8192 \n", "41 NAME_SELLER_INDUSTRY_ordinal_min 0.5098 -1.2479e-02 1.0000 \n", "36 SELLERPLACE_AREA_min 0.5092 -1.3533e-03 1.0000 \n", "45 CHANNEL_TYPE_ordinal_max 0.5083 1.3879e-02 1.0000 \n", "26 NAME_PORTFOLIO_ordinal_min 0.5078 1.4560e-02 1.0000 \n", "21 NAME_PAYMENT_TYPE_ordinal_min 0.5042 9.2246e-03 1.0000 \n", "8 NAME_CLIENT_TYPE_ordinal_mean 0.5042 -7.3017e-03 1.0000 \n", "46 CHANNEL_TYPE_ordinal_min 0.5038 -2.6060e-04 1.0000 \n", "6 NAME_CLIENT_TYPE_ordinal_min 0.5029 -1.4025e-04 1.0000 \n", "48 CHANNEL_TYPE_ordinal_mean 0.5027 5.8389e-03 1.0000 \n", "47 CHANNEL_TYPE_ordinal_sum 0.5025 2.2143e-02 1.0000 \n", "7 NAME_CLIENT_TYPE_ordinal_sum 0.5016 3.7297e-03 1.0000 \n", "64 NFLAG_LAST_APPL_IN_DAY_std 0.5014 4.5610e-03 0.8192 \n", "61 NFLAG_LAST_APPL_IN_DAY_min 0.5011 -5.0978e-03 1.0000 \n", "63 NFLAG_LAST_APPL_IN_DAY_mean 0.5011 -3.5944e-03 1.0000 \n", "51 NAME_CASH_LOAN_PURPOSE_ordinal_min 0.5006 5.1359e-03 1.0000 \n", "32 HOUR_APPR_PROCESS_START_sum 0.5002 1.0930e-02 1.0000 \n", "56 CODE_REJECT_REASON_ordinal_min 0.5001 1.2725e-03 1.0000 \n", "16 NAME_CONTRACT_STATUS_ordinal_min 0.5000 5.2200e-04 1.0000 \n", "60 NFLAG_LAST_APPL_IN_DAY_max 0.5000 -1.0546e-03 1.0000 \n", "1 PREPAID_USER_min 0.5000 -8.9535e-05 1.0000 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 27.3 s, sys: 1.13 s, total: 28.5 s\n", "Wall time: 8.55 s\n" ] } ], "source": [ "%%time\n", "pdf_agg02 = agg_common_data(pdf_num[[\"SK_ID_CURR\"] + ls_num], [\"max\", \"min\", \"sum\", \"mean\", \"std\"], main_key=\"SK_ID_CURR\")\n", "eval_agg02 = feature_evaluate(pdf_train_filtered, pdf_agg02)\n", "display(eval_agg02)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6, 4)" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eval_agg02.query(\"auc <= 0.501\").shape" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(338857, 64)\n" ] } ], "source": [ "sel_feat = eval_agg02.query(\"auc > 0.501\")[\"name\"].tolist()\n", "pdf_agg02 = pdf_agg02[sel_feat]\n", "print(pdf_agg02.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Continuous features" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['AMT_ANNUITY',\n", " 'AMT_APPLICATION',\n", " 'AMT_CREDIT',\n", " 'AMT_DOWN_PAYMENT',\n", " 'AMT_GOODS_PRICE',\n", " 'RATE_DOWN_PAYMENT',\n", " 'RATE_INTEREST_PRIMARY',\n", " 'RATE_INTEREST_PRIVILEGED',\n", " 'CNT_PAYMENT',\n", " 'NFLAG_INSURED_ON_APPROVAL',\n", " 'DAYS_DECISION_TO_YEARS',\n", " 'DAYS_FIRST_DRAWING_TO_YEARS',\n", " 'DAYS_FIRST_DUE_TO_YEARS',\n", " 'DAYS_LAST_DUE_1ST_VERSION_TO_YEARS',\n", " 'DAYS_LAST_DUE_TO_YEARS',\n", " 'DAYS_TERMINATION_TO_YEARS']" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get list continuous attributes\n", "# ls_con = pdf_meta.query(\"sub_type == 'float64'\")[\"name\"].tolist()\n", "series_type = pdf_prev_app.dtypes\n", "ls_con = series_type[series_type == \"float64\"].index.tolist()\n", "ls_con" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_PREVSK_ID_CURRAMT_ANNUITYAMT_APPLICATIONAMT_CREDITAMT_DOWN_PAYMENTAMT_GOODS_PRICERATE_DOWN_PAYMENTRATE_INTEREST_PRIMARYRATE_INTEREST_PRIVILEGEDCNT_PAYMENTNFLAG_INSURED_ON_APPROVALDAYS_DECISION_TO_YEARSDAYS_FIRST_DRAWING_TO_YEARSDAYS_FIRST_DUE_TO_YEARSDAYS_LAST_DUE_1ST_VERSION_TO_YEARSDAYS_LAST_DUE_TO_YEARSDAYS_TERMINATION_TO_YEARS
020304952718771730.43017145.017145.00.017145.00.00000.18280.867312.00.00.2000NaN0.1151-0.82190.11510.1014
1280242510812925188.615607500.0679671.00.0607500.00.05160.18910.835136.01.00.4493NaN0.3671-2.5096NaNNaN
2252346612204015060.735112500.0136444.50.0112500.00.05160.18910.835112.01.00.8247NaN0.7425-0.1616NaNNaN
3281924317615847041.335450000.0470790.00.0450000.00.05160.18910.835112.01.01.4027NaN1.32050.41640.49860.4849
4178426520205431924.395337500.0404055.00.0337500.00.05160.18910.835124.0NaN2.1397NaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " SK_ID_PREV SK_ID_CURR AMT_ANNUITY AMT_APPLICATION AMT_CREDIT AMT_DOWN_PAYMENT AMT_GOODS_PRICE RATE_DOWN_PAYMENT RATE_INTEREST_PRIMARY RATE_INTEREST_PRIVILEGED CNT_PAYMENT NFLAG_INSURED_ON_APPROVAL DAYS_DECISION_TO_YEARS DAYS_FIRST_DRAWING_TO_YEARS DAYS_FIRST_DUE_TO_YEARS DAYS_LAST_DUE_1ST_VERSION_TO_YEARS DAYS_LAST_DUE_TO_YEARS DAYS_TERMINATION_TO_YEARS\n", "0 2030495 271877 1730.430 17145.0 17145.0 0.0 17145.0 0.0000 0.1828 0.8673 12.0 0.0 0.2000 NaN 0.1151 -0.8219 0.1151 0.1014 \n", "1 2802425 108129 25188.615 607500.0 679671.0 0.0 607500.0 0.0516 0.1891 0.8351 36.0 1.0 0.4493 NaN 0.3671 -2.5096 NaN NaN \n", "2 2523466 122040 15060.735 112500.0 136444.5 0.0 112500.0 0.0516 0.1891 0.8351 12.0 1.0 0.8247 NaN 0.7425 -0.1616 NaN NaN \n", "3 2819243 176158 47041.335 450000.0 470790.0 0.0 450000.0 0.0516 0.1891 0.8351 12.0 1.0 1.4027 NaN 1.3205 0.4164 0.4986 0.4849 \n", "4 1784265 202054 31924.395 337500.0 404055.0 0.0 337500.0 0.0516 0.1891 0.8351 24.0 NaN 2.1397 NaN NaN NaN NaN NaN " ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf_con = pdf_prev_app[[\"SK_ID_PREV\", \"SK_ID_CURR\"] + ls_con].copy()\n", "pdf_con.head()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'AMT_ANNUITY': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_APPLICATION': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_CREDIT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_DOWN_PAYMENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'AMT_GOODS_PRICE': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'CNT_PAYMENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'DAYS_DECISION_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'DAYS_FIRST_DRAWING_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'DAYS_FIRST_DUE_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'DAYS_LAST_DUE_1ST_VERSION_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'DAYS_LAST_DUE_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'DAYS_TERMINATION_TO_YEARS': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'NFLAG_INSURED_ON_APPROVAL': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'RATE_DOWN_PAYMENT': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'RATE_INTEREST_PRIMARY': ['max', 'min', 'sum', 'mean', 'std'],\n", " 'RATE_INTEREST_PRIVILEGED': ['max', 'min', 'sum', 'mean', 'std']}" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After agg: (338857, 80)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameauccorrcoverage
64DAYS_FIRST_DRAWING_TO_YEARS_std0.6163-0.09450.0005
63DAYS_FIRST_DRAWING_TO_YEARS_mean0.5944-0.09630.1830
60DAYS_FIRST_DRAWING_TO_YEARS_max0.5943-0.09610.1830
61DAYS_FIRST_DRAWING_TO_YEARS_min0.5942-0.09620.1830
44DAYS_DECISION_TO_YEARS_std0.5675-0.06050.8192
57DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_sum0.5635-0.05161.0000
4AMT_DOWN_PAYMENT_std0.5608-0.02450.8192
19DAYS_FIRST_DUE_TO_YEARS_std0.5589-0.04970.7013
43DAYS_DECISION_TO_YEARS_mean0.5582-0.04721.0000
15DAYS_FIRST_DUE_TO_YEARS_max0.5576-0.05460.9937
40DAYS_DECISION_TO_YEARS_max0.5574-0.05451.0000
55DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_max0.5561-0.05340.9863
32DAYS_LAST_DUE_TO_YEARS_sum0.5559-0.04411.0000
7DAYS_TERMINATION_TO_YEARS_sum0.5549-0.04341.0000
2AMT_DOWN_PAYMENT_sum0.5542-0.02751.0000
17DAYS_FIRST_DUE_TO_YEARS_sum0.5540-0.04261.0000
0AMT_DOWN_PAYMENT_max0.5537-0.02581.0000
3AMT_DOWN_PAYMENT_mean0.5534-0.02981.0000
30DAYS_LAST_DUE_TO_YEARS_max0.5527-0.04960.9244
79RATE_DOWN_PAYMENT_std0.5526-0.04320.8192
5DAYS_TERMINATION_TO_YEARS_max0.5521-0.04880.9172
58DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_mean0.5509-0.04210.9863
18DAYS_FIRST_DUE_TO_YEARS_mean0.5494-0.04090.9937
73AMT_ANNUITY_mean0.5471-0.04211.0000
34DAYS_LAST_DUE_TO_YEARS_std0.5448-0.03850.5814
39CNT_PAYMENT_std0.54480.04220.8192
75RATE_DOWN_PAYMENT_max0.5442-0.03831.0000
9DAYS_TERMINATION_TO_YEARS_std0.5419-0.03590.5705
33DAYS_LAST_DUE_TO_YEARS_mean0.5419-0.03410.9244
8DAYS_TERMINATION_TO_YEARS_mean0.5414-0.03360.9172
78RATE_DOWN_PAYMENT_mean0.5358-0.03261.0000
71AMT_ANNUITY_min0.5297-0.03021.0000
42DAYS_DECISION_TO_YEARS_sum0.5296-0.01911.0000
28AMT_APPLICATION_mean0.5288-0.02311.0000
48AMT_GOODS_PRICE_mean0.5287-0.02301.0000
62DAYS_FIRST_DRAWING_TO_YEARS_sum0.52820.00011.0000
70AMT_ANNUITY_max0.5278-0.02911.0000
21AMT_CREDIT_min0.5275-0.02041.0000
41DAYS_DECISION_TO_YEARS_min0.5246-0.01721.0000
35CNT_PAYMENT_max0.52380.02901.0000
46AMT_GOODS_PRICE_min0.5221-0.02131.0000
26AMT_APPLICATION_min0.5221-0.02131.0000
23AMT_CREDIT_mean0.5212-0.01731.0000
72AMT_ANNUITY_sum0.5205-0.00751.0000
36CNT_PAYMENT_min0.5175-0.01191.0000
59DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_std0.5172-0.01110.6718
25AMT_APPLICATION_max0.5171-0.01381.0000
45AMT_GOODS_PRICE_max0.5171-0.01381.0000
74AMT_ANNUITY_std0.5162-0.02520.8192
77RATE_DOWN_PAYMENT_sum0.5157-0.01071.0000
67NFLAG_INSURED_ON_APPROVAL_sum0.5147-0.01731.0000
56DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_min0.5140-0.01170.9863
53RATE_INTEREST_PRIVILEGED_mean0.51290.00111.0000
27AMT_APPLICATION_sum0.51190.00321.0000
47AMT_GOODS_PRICE_sum0.51190.00331.0000
20AMT_CREDIT_max0.5110-0.00941.0000
65NFLAG_INSURED_ON_APPROVAL_max0.5103-0.01140.9956
12RATE_INTEREST_PRIMARY_sum0.51030.01971.0000
52RATE_INTEREST_PRIVILEGED_sum0.51020.01981.0000
69NFLAG_INSURED_ON_APPROVAL_std0.50980.00260.7213
38CNT_PAYMENT_mean0.50910.01211.0000
37CNT_PAYMENT_sum0.50820.02671.0000
29AMT_APPLICATION_std0.5077-0.01490.8192
49AMT_GOODS_PRICE_std0.5077-0.01490.8192
22AMT_CREDIT_sum0.50710.00691.0000
16DAYS_FIRST_DUE_TO_YEARS_min0.5064-0.00500.9937
1AMT_DOWN_PAYMENT_min0.5058-0.01621.0000
68NFLAG_INSURED_ON_APPROVAL_mean0.5057-0.00210.9956
66NFLAG_INSURED_ON_APPROVAL_min0.50530.01120.9956
31DAYS_LAST_DUE_TO_YEARS_min0.5038-0.00140.9244
6DAYS_TERMINATION_TO_YEARS_min0.5032-0.00200.9172
76RATE_DOWN_PAYMENT_min0.5032-0.01031.0000
24AMT_CREDIT_std0.5017-0.01010.8192
13RATE_INTEREST_PRIMARY_mean0.50110.00081.0000
11RATE_INTEREST_PRIMARY_min0.50040.00161.0000
51RATE_INTEREST_PRIVILEGED_min0.50030.00221.0000
10RATE_INTEREST_PRIMARY_max0.5003-0.00051.0000
54RATE_INTEREST_PRIVILEGED_std0.5002-0.00170.8192
14RATE_INTEREST_PRIMARY_std0.5001-0.00100.8192
50RATE_INTEREST_PRIVILEGED_max0.50010.00041.0000
\n", "
" ], "text/plain": [ " name auc corr coverage\n", "64 DAYS_FIRST_DRAWING_TO_YEARS_std 0.6163 -0.0945 0.0005 \n", "63 DAYS_FIRST_DRAWING_TO_YEARS_mean 0.5944 -0.0963 0.1830 \n", "60 DAYS_FIRST_DRAWING_TO_YEARS_max 0.5943 -0.0961 0.1830 \n", "61 DAYS_FIRST_DRAWING_TO_YEARS_min 0.5942 -0.0962 0.1830 \n", "44 DAYS_DECISION_TO_YEARS_std 0.5675 -0.0605 0.8192 \n", "57 DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_sum 0.5635 -0.0516 1.0000 \n", "4 AMT_DOWN_PAYMENT_std 0.5608 -0.0245 0.8192 \n", "19 DAYS_FIRST_DUE_TO_YEARS_std 0.5589 -0.0497 0.7013 \n", "43 DAYS_DECISION_TO_YEARS_mean 0.5582 -0.0472 1.0000 \n", "15 DAYS_FIRST_DUE_TO_YEARS_max 0.5576 -0.0546 0.9937 \n", "40 DAYS_DECISION_TO_YEARS_max 0.5574 -0.0545 1.0000 \n", "55 DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_max 0.5561 -0.0534 0.9863 \n", "32 DAYS_LAST_DUE_TO_YEARS_sum 0.5559 -0.0441 1.0000 \n", "7 DAYS_TERMINATION_TO_YEARS_sum 0.5549 -0.0434 1.0000 \n", "2 AMT_DOWN_PAYMENT_sum 0.5542 -0.0275 1.0000 \n", "17 DAYS_FIRST_DUE_TO_YEARS_sum 0.5540 -0.0426 1.0000 \n", "0 AMT_DOWN_PAYMENT_max 0.5537 -0.0258 1.0000 \n", "3 AMT_DOWN_PAYMENT_mean 0.5534 -0.0298 1.0000 \n", "30 DAYS_LAST_DUE_TO_YEARS_max 0.5527 -0.0496 0.9244 \n", "79 RATE_DOWN_PAYMENT_std 0.5526 -0.0432 0.8192 \n", "5 DAYS_TERMINATION_TO_YEARS_max 0.5521 -0.0488 0.9172 \n", "58 DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_mean 0.5509 -0.0421 0.9863 \n", "18 DAYS_FIRST_DUE_TO_YEARS_mean 0.5494 -0.0409 0.9937 \n", "73 AMT_ANNUITY_mean 0.5471 -0.0421 1.0000 \n", "34 DAYS_LAST_DUE_TO_YEARS_std 0.5448 -0.0385 0.5814 \n", "39 CNT_PAYMENT_std 0.5448 0.0422 0.8192 \n", "75 RATE_DOWN_PAYMENT_max 0.5442 -0.0383 1.0000 \n", "9 DAYS_TERMINATION_TO_YEARS_std 0.5419 -0.0359 0.5705 \n", "33 DAYS_LAST_DUE_TO_YEARS_mean 0.5419 -0.0341 0.9244 \n", "8 DAYS_TERMINATION_TO_YEARS_mean 0.5414 -0.0336 0.9172 \n", "78 RATE_DOWN_PAYMENT_mean 0.5358 -0.0326 1.0000 \n", "71 AMT_ANNUITY_min 0.5297 -0.0302 1.0000 \n", "42 DAYS_DECISION_TO_YEARS_sum 0.5296 -0.0191 1.0000 \n", "28 AMT_APPLICATION_mean 0.5288 -0.0231 1.0000 \n", "48 AMT_GOODS_PRICE_mean 0.5287 -0.0230 1.0000 \n", "62 DAYS_FIRST_DRAWING_TO_YEARS_sum 0.5282 0.0001 1.0000 \n", "70 AMT_ANNUITY_max 0.5278 -0.0291 1.0000 \n", "21 AMT_CREDIT_min 0.5275 -0.0204 1.0000 \n", "41 DAYS_DECISION_TO_YEARS_min 0.5246 -0.0172 1.0000 \n", "35 CNT_PAYMENT_max 0.5238 0.0290 1.0000 \n", "46 AMT_GOODS_PRICE_min 0.5221 -0.0213 1.0000 \n", "26 AMT_APPLICATION_min 0.5221 -0.0213 1.0000 \n", "23 AMT_CREDIT_mean 0.5212 -0.0173 1.0000 \n", "72 AMT_ANNUITY_sum 0.5205 -0.0075 1.0000 \n", "36 CNT_PAYMENT_min 0.5175 -0.0119 1.0000 \n", "59 DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_std 0.5172 -0.0111 0.6718 \n", "25 AMT_APPLICATION_max 0.5171 -0.0138 1.0000 \n", "45 AMT_GOODS_PRICE_max 0.5171 -0.0138 1.0000 \n", "74 AMT_ANNUITY_std 0.5162 -0.0252 0.8192 \n", "77 RATE_DOWN_PAYMENT_sum 0.5157 -0.0107 1.0000 \n", "67 NFLAG_INSURED_ON_APPROVAL_sum 0.5147 -0.0173 1.0000 \n", "56 DAYS_LAST_DUE_1ST_VERSION_TO_YEARS_min 0.5140 -0.0117 0.9863 \n", "53 RATE_INTEREST_PRIVILEGED_mean 0.5129 0.0011 1.0000 \n", "27 AMT_APPLICATION_sum 0.5119 0.0032 1.0000 \n", "47 AMT_GOODS_PRICE_sum 0.5119 0.0033 1.0000 \n", "20 AMT_CREDIT_max 0.5110 -0.0094 1.0000 \n", "65 NFLAG_INSURED_ON_APPROVAL_max 0.5103 -0.0114 0.9956 \n", "12 RATE_INTEREST_PRIMARY_sum 0.5103 0.0197 1.0000 \n", "52 RATE_INTEREST_PRIVILEGED_sum 0.5102 0.0198 1.0000 \n", "69 NFLAG_INSURED_ON_APPROVAL_std 0.5098 0.0026 0.7213 \n", "38 CNT_PAYMENT_mean 0.5091 0.0121 1.0000 \n", "37 CNT_PAYMENT_sum 0.5082 0.0267 1.0000 \n", "29 AMT_APPLICATION_std 0.5077 -0.0149 0.8192 \n", "49 AMT_GOODS_PRICE_std 0.5077 -0.0149 0.8192 \n", "22 AMT_CREDIT_sum 0.5071 0.0069 1.0000 \n", "16 DAYS_FIRST_DUE_TO_YEARS_min 0.5064 -0.0050 0.9937 \n", "1 AMT_DOWN_PAYMENT_min 0.5058 -0.0162 1.0000 \n", "68 NFLAG_INSURED_ON_APPROVAL_mean 0.5057 -0.0021 0.9956 \n", "66 NFLAG_INSURED_ON_APPROVAL_min 0.5053 0.0112 0.9956 \n", "31 DAYS_LAST_DUE_TO_YEARS_min 0.5038 -0.0014 0.9244 \n", "6 DAYS_TERMINATION_TO_YEARS_min 0.5032 -0.0020 0.9172 \n", "76 RATE_DOWN_PAYMENT_min 0.5032 -0.0103 1.0000 \n", "24 AMT_CREDIT_std 0.5017 -0.0101 0.8192 \n", "13 RATE_INTEREST_PRIMARY_mean 0.5011 0.0008 1.0000 \n", "11 RATE_INTEREST_PRIMARY_min 0.5004 0.0016 1.0000 \n", "51 RATE_INTEREST_PRIVILEGED_min 0.5003 0.0022 1.0000 \n", "10 RATE_INTEREST_PRIMARY_max 0.5003 -0.0005 1.0000 \n", "54 RATE_INTEREST_PRIVILEGED_std 0.5002 -0.0017 0.8192 \n", "14 RATE_INTEREST_PRIMARY_std 0.5001 -0.0010 0.8192 \n", "50 RATE_INTEREST_PRIVILEGED_max 0.5001 0.0004 1.0000 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 33.3 s, sys: 1.39 s, total: 34.7 s\n", "Wall time: 10.7 s\n" ] } ], "source": [ "%%time\n", "pdf_agg03 = agg_common_data(pdf_con[[\"SK_ID_CURR\"] + ls_con], [\"max\", \"min\", \"sum\", \"mean\", \"std\"], main_key=\"SK_ID_CURR\")\n", "eval_agg03 = feature_evaluate(pdf_train_filtered, pdf_agg03)\n", "display(eval_agg03)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6, 4)" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eval_agg03.query(\"auc <= 0.501\").shape" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(338857, 74)\n" ] } ], "source": [ "sel_feat = eval_agg03.query(\"auc > 0.501\")[\"name\"].tolist()\n", "pdf_agg03 = pdf_agg03[sel_feat]\n", "print(pdf_agg03.shape)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_PREV
count338857.0000
mean4.9290
std4.2207
min1.0000
25%2.0000
50%4.0000
75%7.0000
max77.0000
\n", "
" ], "text/plain": [ " SK_ID_PREV\n", "count 338857.0000\n", "mean 4.9290 \n", "std 4.2207 \n", "min 1.0000 \n", "25% 2.0000 \n", "50% 4.0000 \n", "75% 7.0000 \n", "max 77.0000 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Có nhiều KH có 1-nhiều khoản vay trước đó\n", "pdf_agg04 = pdf_prev_app.groupby(\"SK_ID_CURR\").agg({\"SK_ID_PREV\": \"count\"})\n", "display(pdf_agg04.describe())\n", "pdf_agg04.hist()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Save features" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(338857, 244)\n" ] } ], "source": [ "pdf_feat = pdf_agg01.join(pdf_agg02).join(pdf_agg03).join(pdf_agg04)\n", "print(pdf_feat.shape)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Store features completed!\n", "CPU times: user 34.3 s, sys: 749 ms, total: 35 s\n", "Wall time: 34.4 s\n" ] } ], "source": [ "%%time\n", "fname = \"prev_app\"\n", "fname = os.path.join(\"features\", \"{}.pkl.bz2\".format(fname))\n", "pdf_feat.to_pickle(fname, compression=\"bz2\")\n", "print(\"Store features completed!\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15" } }, "nbformat": 4, "nbformat_minor": 2 }