# %% [markdown]
# # Imports

# %%
from fastai.text import *
from fastai.imports import *
from fastai.structured import *
from fastai.column_data import *
from torch.nn import functional as F
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# %%
import gc
import pdb

# %% [markdown]
# # Data processing

# %% [markdown]
# ## Func

# %%
def scale_vars(df, mapper, scale_col_exc=None):
    """Standard-scale the numeric columns of `df` in place and return the mapper.

    Parameters
    ----------
    df : DataFrame whose numeric columns are overwritten with scaled values.
    mapper : an already fitted `DataFrameMapper`, or None to fit a new one on `df`.
    scale_col_exc : column names that must NOT be scaled (e.g. id / time-index
        columns). None is treated as "exclude nothing".

    Returns
    -------
    The mapper that was used, so the same scaling can be re-applied to other frames.
    """
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    # `n not in None` raises TypeError, so normalise the "no exclusions" case first.
    excluded = [] if scale_col_exc is None else scale_col_exc
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns
                 if is_numeric_dtype(df[n]) and n not in excluded]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper

# %%
def proc_df2(df, y_fld=None, skip_flds=None, do_scale=True, scale_col_exc=None, na_dict=None,
             preproc_fn=None, max_n_cat=10, subset=None, mapper=None):
    """Variant of fastai's `proc_df` that can leave selected columns unscaled.

    Works on a copy of `df`: optionally samples it, runs `preproc_fn`, splits off
    the target, drops `skip_flds`, fills missing values, scales numeric columns
    (except `scale_col_exc`), numericalizes categoricals and one-hot encodes the
    remainder with `pd.get_dummies(dummy_na=True)`.

    Parameters
    ----------
    df : input DataFrame (not modified).
    y_fld : name of the target column, or None when there is no target.
    skip_flds : columns to drop before processing. They are dropped whether or
        not `y_fld` is given.
    do_scale : when True, scale numeric columns through `scale_vars`.
    scale_col_exc : columns excluded from scaling; None means no exclusions.
    na_dict : passed to / updated by `fix_missing`; a new dict is used when None.
    preproc_fn : optional callable applied to the copied frame (its return value
        is ignored).
    max_n_cat : forwarded to `numericalize`.
    subset : when truthy, forwarded to `get_sample(df, subset)`.
    mapper : previously fitted mapper to reuse for scaling, or None to fit one.

    Returns
    -------
    list
        `[processed_df, y, na_dict]` when `y_fld` is given, otherwise
        `[processed_df, na_dict]`; the mapper is appended as a last element
        when `do_scale` is True.
    """
    skip_flds = list(skip_flds) if skip_flds else []
    if subset: df = get_sample(df, subset)
    df = df.copy()
    if preproc_fn: preproc_fn(df)

    has_target = y_fld is not None
    if has_target:
        y = df[y_fld].values
        skip_flds = skip_flds + [y_fld]
    # Drop outside the target branch so `skip_flds` is honoured without a target too.
    if skip_flds:
        df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    for n, c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if do_scale: mapper = scale_vars(df, mapper, scale_col_exc)
    for n, c in df.items(): numericalize(df, c, n, max_n_cat)

    dummies = pd.get_dummies(df, dummy_na=True)
    res = [dummies, y, na_dict] if has_target else [dummies, na_dict]
    if do_scale: res = res + [mapper]
    return res

# %% [markdown]
# ## app train

# %%
df_train = pd.read_csv('data/application_train.csv')

# %%
# Applicant ids that have a label; used to filter the auxiliary tables.
sk_id = set(df_train.SK_ID_CURR.values)

# %%
# Lookup: applicant id -> TARGET label.
sk2tg = dict(zip(df_train['SK_ID_CURR'], df_train['TARGET']))

# %% [markdown]
# ## cc train

# %%
cc_train = pd.read_csv('data/credit_card_balance.csv')

# %%
# Keep only credit-card rows that belong to labelled applicants.
cc_train = cc_train[cc_train['SK_ID_CURR'].isin(sk_id)]

# %%
# Non-inplace drop: avoids mutating a filtered slice and yields a fresh frame.
cc_train = cc_train.drop(['SK_ID_PREV'], axis=1)
# %%
train_cats(cc_train)

# %%
# Sorted so the id order (and therefore the split below) is deterministic,
# instead of depending on set iteration order.
cc_sk_id = sorted(set(cc_train['SK_ID_CURR'].values))

# %%
# Labels aligned position-by-position with `cc_sk_id`.
cc_y = [sk2tg[i] for i in cc_sk_id]

# %%
SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the same split
x_train, x_valid, y_train, y_valid = train_test_split(
    cc_sk_id, cc_y, test_size=0.2, stratify=cc_y, random_state=SPLIT_SEED)

# %%
# Keep the fill values and the fitted scaler instead of discarding them into `_`
# (which is also IPython's "last output" variable).
# NOTE(review): the scaler is fitted on every row here, including rows of the
# ids in `x_valid` -- confirm this leakage is acceptable.
cc_train_proc, cc_na_dict, cc_mapper = proc_df2(
    cc_train, do_scale=True, scale_col_exc=['SK_ID_CURR', 'MONTHS_BALANCE'])

# %%
def df_to_maxtrix(df, cols):
    """Return the `cols` columns of `df` as a float32 NumPy array."""
    return df[cols].values.astype(np.float32)

# %%
# Order rows by applicant and month so each group below is in MONTHS_BALANCE order.
cc_train_proc = cc_train_proc.sort_values(by=['SK_ID_CURR', 'MONTHS_BALANCE'])

# %%
cc_train = cc_train.sort_values(by=['SK_ID_CURR', 'MONTHS_BALANCE'])

# %%
# Feature columns: everything except the id and the month index.
cols = [c for c in cc_train_proc.columns if c not in ['SK_ID_CURR', 'MONTHS_BALANCE']]
# One float32 matrix per applicant, with that applicant's rows as matrix rows.
cc_train_group = cc_train_proc.groupby(['SK_ID_CURR']).apply(lambda x: df_to_maxtrix(x, cols))
| \n", " | SK_ID_CURR | \n", "MONTHS_BALANCE | \n", "AMT_BALANCE | \n", "AMT_CREDIT_LIMIT_ACTUAL | \n", "AMT_DRAWINGS_ATM_CURRENT | \n", "AMT_DRAWINGS_CURRENT | \n", "AMT_DRAWINGS_OTHER_CURRENT | \n", "AMT_DRAWINGS_POS_CURRENT | \n", "AMT_INST_MIN_REGULARITY | \n", "AMT_PAYMENT_CURRENT | \n", "... | \n", "AMT_RECIVABLE | \n", "AMT_TOTAL_RECEIVABLE | \n", "CNT_DRAWINGS_ATM_CURRENT | \n", "CNT_DRAWINGS_CURRENT | \n", "CNT_DRAWINGS_OTHER_CURRENT | \n", "CNT_DRAWINGS_POS_CURRENT | \n", "CNT_INSTALMENT_MATURE_CUM | \n", "NAME_CONTRACT_STATUS | \n", "SK_DPD | \n", "SK_DPD_DEF | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1636141 | \n", "100006 | \n", "-6 | \n", "0.0 | \n", "270000 | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "... | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "0 | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "Active | \n", "0 | \n", "0 | \n", "
| 655566 | \n", "100006 | \n", "-5 | \n", "0.0 | \n", "270000 | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "... | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "0 | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "Active | \n", "0 | \n", "0 | \n", "
| 1399895 | \n", "100006 | \n", "-4 | \n", "0.0 | \n", "270000 | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "... | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "0 | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "Active | \n", "0 | \n", "0 | \n", "
| 1347528 | \n", "100006 | \n", "-3 | \n", "0.0 | \n", "270000 | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "... | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "0 | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "Active | \n", "0 | \n", "0 | \n", "
| 520387 | \n", "100006 | \n", "-2 | \n", "0.0 | \n", "270000 | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "... | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "0 | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "Active | \n", "0 | \n", "0 | \n", "
| 584804 | \n", "100006 | \n", "-1 | \n", "0.0 | \n", "270000 | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "NaN | \n", "... | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "0 | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "Active | \n", "0 | \n", "0 | \n", "
6 rows × 22 columns
\n", "