{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Neural averaging ensembles.ipynb", "provenance": [], "collapsed_sections": [], "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "ScpcrL6quKNs", "colab_type": "text" }, "source": [ "# Neural averaging ensembles on benchml data" ] }, { "cell_type": "markdown", "metadata": { "id": "tLg7ElixubDE", "colab_type": "text" }, "source": [ "Dr. Michael Allgöwer, b.telligent, michael.allgoewer@btelligent.com" ] }, { "cell_type": "code", "metadata": { "id": "OPyMiAzoTt5g", "colab_type": "code", "outputId": "2a7c148b-391a-46c8-e626-0faa715f6870", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "try:\n", " %tensorflow_version 2.x\n", "except Exception:\n", " pass" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "TensorFlow 2.x selected.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "y9IXsNjrTX1O", "colab_type": "code", "colab": {} }, "source": [ "import pandas as pd\n", "import numpy as np\n", "import tensorflow as tf\n", "import os\n", "from pathlib import Path\n", "from collections import OrderedDict" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "22Y9RjpwBhn8", "colab_type": "code", "outputId": "c945b494-3a49-43da-f840-3dee26da5e22", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "print(tf.__version__, tf.keras.__version__)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "2.0.0-rc2 2.2.4-tf\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "rVEwxoNfeL7F", "colab_type": "code", "colab": {} }, "source": [ "# Importing data, keeping it all together in a class being able to return either pandas dataframe or tf dataset\n", "# tf datasets are the TensorFlow 2.0-native way of handling data.\n", "class Flights():\n", " '''Flight delay classification data from Szilard Pafka's benchml; derived from the well-known fligts dataset'''\n", "\n", " def __init__(self):\n", " \n", " # you may want to change these paths, depending on where you put the files\n", " train_path = 'https://raw.githubusercontent.com/Allgoerithm/neuralaveraging/master/data/train-0.01m.csv' \n", " test_path = 'https://raw.githubusercontent.com/Allgoerithm/neuralaveraging/master/data/test.csv'\n", " paths = {'train': train_path, 'test': test_path}\n", " slices = list(paths.keys())\n", "\n", " random_seed = 4711\n", " self.data = {} # neural-network version of the data, with integer indices for categorial data\n", " self.data_1h = {} # onehot-encoded version of the data, needed for gradient boosted trees\n", "\n", " for (data_slice, input_path) in paths.items():\n", " self.data[data_slice] = pd.read_csv(input_path, delimiter=',', quotechar='\"', na_values=' ')\n", " self.data[data_slice]['slice'] = data_slice # add new column with the slice the data belongs to\n", " data_complete = self.data['train'].append(self.data['test'])\n", " data_complete.rename(index=str, columns={'dep_delayed_15min': 'target'}, inplace=True) \n", "\n", " # change binary target variable from Y/N to 0/1 (new datatype: int)\n", " all_replacements = {'target': {'Y': 1, 'N': 0}}\n", " data_complete.replace(all_replacements, inplace=True)\n", " data_complete_1h = data_complete\n", " \n", " # indexing all categorial columns (transform into successive integers)\n", " self.categorial_columns = [list(data_complete.columns)[i]\n", " for i in range(len(data_complete.columns))\n", " if list(data_complete.dtypes)[i] == np.dtype('object')]\n", " self.categorial_columns.remove('slice')\n", " self.index_lengths = OrderedDict()\n", " for column in self.categorial_columns:\n", " data_complete['catindex_' + column] = -1 + data_complete[column]\\\n", " .rank(method='dense', numeric_only=False)\n", " data_complete = data_complete.drop(columns=[column])\n", " self.index_lengths['catindex_' + column] = 1 + data_complete['catindex_' + column].max()\n", "\n", " # onehot-encoding for onehot-version of data \n", " categorial_column_prefixes = ['onehot_' + name for name in self.categorial_columns]\n", " data_complete_1h = pd.get_dummies(data_complete_1h, columns=self.categorial_columns,\n", " prefix=categorial_column_prefixes, drop_first=True) \n", "\n", " for (data_slice, input_path) in paths.items():\n", " self.data[data_slice] = data_complete[data_complete['slice'] == data_slice].drop(columns=['slice'])\n", " self.data_1h[data_slice] = data_complete_1h[data_complete_1h['slice'] == data_slice].drop(columns=['slice'])\n", "\n", " # standardize all columns except categorial columns and target variable \n", " self.categorial_columns = ['catindex_' + col for col in self.categorial_columns]\n", " columns_to_standardize = [col for col in self.data['train'].columns \n", " if col not in self.categorial_columns + ['target']]\n", " for feature_name in columns_to_standardize:\n", " # mean and variance equal of noncategorial columns are equal for data and data_1h\n", " mean = self.data['train'][feature_name].mean() \n", " std = self.data['train'][feature_name].std()\n", " if std > 0: # keep only colums with at least some variance\n", " for data_slice in slices:\n", " self.data[data_slice][feature_name] = (self.data[data_slice][feature_name] - mean) / std\n", " self.data_1h[data_slice][feature_name] = (self.data_1h[data_slice][feature_name] - mean) / std\n", " else: # drop constant columns\n", " for data_slice in slices:\n", " self.data[data_slice] = self.data[data_slice].drop(feature_name, axis=1)\n", " self.data_1h[data_slice] = self.data_1h[data_slice].drop(feature_name, axis=1)\n", "\n", " def get_dataframe(self, data_slice: str, categorials_as_onehot: bool = False):\n", " assert data_slice in ('train', 'test', 'valid')\n", " result = self.data_1h[data_slice] if categorials_as_onehot else self.data[data_slice]\n", " return result\n", "\n", " def get_dataset(self, data_slice: str):\n", " assert data_slice in ('train', 'test', 'valid')\n", " target = self.data[data_slice]['target']\n", " predictor_cols = [c for c in self.data[data_slice].columns if c != 'target']\n", " predictors = self.data[data_slice][predictor_cols]\n", " dataset = tf.data.Dataset.from_tensor_slices((predictors.values, target.values))\n", " return dataset\n", " \n", " def get_index_lengths(self):\n", " return self.index_lengths\n", "\n", " def no_of_predictors(self):\n", " return len(self.data['train'].columns) - 1\n" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ONF3C6ZmeU51", "colab_type": "code", "colab": {} }, "source": [ "# instantiate our new class and get test and training data\n", "flights = Flights()\n", "data_train_df = flights.get_dataframe(data_slice='train')\n", "data_test_df = flights.get_dataframe(data_slice='test')\n", "\n", "# now for XGboost, with one-hot encoded categorial variables\n", "data_train_1h_df = flights.get_dataframe(data_slice='train', categorials_as_onehot=True)\n", "data_test_1h_df = flights.get_dataframe(data_slice='test', categorials_as_onehot=True)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "fsvc0eQvxaL4", "colab_type": "code", "colab": {} }, "source": [ "# fit gradient boosting model to the data as a baseline, to check we can reproduce Szilard Pafkas's findings\n", "\n", "import xgboost as xgb\n", "import numpy as np\n", "import sklearn.metrics\n", "\n", "d_train = xgb.DMatrix(data_train_1h_df.drop(columns=['target']), label=data_train_1h_df['target'])\n", "d_test = xgb.DMatrix(data_test_1h_df.drop(columns=['target']), label=data_test_1h_df['target'])\n", "param = {'objective':'binary:logistic', 'max_depth': 16, 'eta': 0.01, 'subsample': 0.5, 'min_obs_node': 1}\n", "\n", "gb_model = xgb.train(params=param, dtrain=d_train, num_boost_round=1000) \n", "\n", "gb_pred_test = gb_model.predict(d_test)\n", "gb_auc = sklearn.metrics.roc_auc_score(data_test_df['target'], gb_pred_test)\n", "gb_mae = sklearn.metrics.mean_absolute_error(data_test_df['target'], gb_pred_test)\n", "print(f'AUC:{gb_auc}, MAE:{gb_mae}')" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "8gFarMekS764", "colab_type": "code", "colab": {} }, "source": [ "import time\n", "from pathlib import Path\n", "root_logdir = Path('logs')\n", "\n", "def get_log_dir() -> Path:\n", " run_id = Path(time.strftime('run_%Y_%m_%d-%H_%M_%S'))\n", " return root_logdir / run_id" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "1WmPqsiZZwnb", "colab_type": "text" }, "source": [ "##Building a neural averaging ensemble" ] }, { "cell_type": "code", "metadata": { "id": "KDwmiLd2ZaYJ", "colab_type": "code", "colab": {} }, "source": [ "import sklearn.metrics # to compute AUC\n", "import datetime\n", "\n", "from functools import partial # higher-order-function for currying\n", "\n", "# We use the functional API here which is almost as simple to use as a sequential model,\n", "# and versatile enough for our needs. To keep things tidy, we place the model definition inside a function.\n", "# If things get more complicated (especially dynamic nets), the subclassing API is needed.\n", "\n", "def averaging_ensemble(inputs_numeric: int, inputs_for_embedding: int, embedding_input_dims: list, \n", " embedding_output_dims: list, width: int, weak_learners: int, activation_name: \n", " str = 'tanh', share_embedding_layer: bool = False, sigmoid_layer: bool = True, \n", " averaging_layer: bool = True):\n", " r'''Return a generic dense network model\n", "\n", " inputs_numeric: number of numeric columns (features) in the input data set; these are expected to be the first \n", " columns\n", " inputs_for_embedding: integer columns of input set to be transformed by embeddings\n", " embedding_input_dims: input dimension (size of the vocabulary) for each column to be transformed by an embedding;\n", " this is supposed to be a list of length inputs_for_embedding\n", " embedding_output_dims: output dimensions for each column to be transformed by an embedding;\n", " this is supposed to be a list of length inputs_for_embedding\n", " width: number of neurons in the hidden layer of each weak learner\n", " weak_learners: number of weak learners in the ensemble\n", " activation_name: string choosing the activation function for the hidden layers,\n", " 'tanh' for tanh activation,\n", " 'relu' for ReLU activation,\n", " 'selu' for SELU activation\n", " sigmoid_layer: switches sigmoid layer on and off as last layer for each weak learner. The layer is usually needed, \n", " it is only switched off for hidden layer size checking \n", " averaging_layer: switches last averaging layer on and off\n", " '''\n", " assert width >= 1, 'width is required to be at least 1'\n", " assert weak_learners >= 1, 'weak_learners is required to be at least 1'\n", " assert activation_name.lower() in ['tanh', 'relu', 'selu'], \\\n", " f'Unknown value \"{activation_name}\" for activation_fct. Options are \"tanh\", \"relu\" and \"selu\".'\n", " assert len(embedding_input_dims) == inputs_for_embedding, \\\n", " 'length of list embedding_input_dims is supposed to be equal to inputs_for_embedding'\n", " assert len(embedding_output_dims) == inputs_for_embedding, \\\n", " 'length of list embedding_output_dims is supposed to be equal to inputs_for_embedding'\n", "\n", " if activation_name.lower() == 'tanh':\n", " activation = tf.keras.activations.tanh\n", " kernel_initializer = tf.initializers.GlorotUniform()\n", " elif activation_name.lower() == 'relu':\n", " activation = tf.keras.activations.relu\n", " kernel_initializer = tf.initializers.GlorotUniform() \n", " else:\n", " activation = tf.keras.activations.selu\n", " kernel_initializer = tf.initializers.VarianceScaling(scale=1.0, mode='fan_in')\n", "\n", " input_layer = tf.keras.Input(shape=(inputs_numeric + inputs_for_embedding,))\n", " split_input_layer = tf.split(input_layer, [inputs_numeric] + [1]*inputs_for_embedding, axis=1)\n", "\n", " hidden = []\n", " name_hidden = 'hidden' if weak_learners==1 else None\n", " # add hidden layer as a list of weak learners\n", " for i in range(weak_learners):\n", " if i == 0 or not(share_embedding_layer):\n", " embedded_input_components = [split_input_layer[0]] # use numerical inputs without transformation\n", " # embedd the other components\n", " for j in range(inputs_for_embedding):\n", " prefix = '' if share_embedding_layer else f'wl_{i}_'\n", " embedding_layer_name = prefix + f'emb_{j}_in{embedding_input_dims[j]}_out{embedding_output_dims[j]}'\n", " embedded_input = tf.keras.layers.Embedding(input_dim=embedding_input_dims[j], \n", " output_dim=embedding_output_dims[j], input_length=1, \n", " name=embedding_layer_name)(split_input_layer[1 + j])\n", " embedded_input_components.append(tf.keras.layers.Flatten()(embedded_input))\n", " embedded_input = tf.keras.layers.Concatenate(axis=1)(embedded_input_components)\n", "\n", " # create flat dense layer and sigmoid layer for classification \n", " weak_learner = tf.keras.layers.Dense(units=width, activation=activation, kernel_initializer=kernel_initializer,\n", " name=name_hidden)(embedded_input)\n", " weak_learner = tf.keras.layers.Dense(units=1, activation=tf.keras.activations.sigmoid)(weak_learner)\n", " hidden.append(weak_learner)\n", "\n", " if weak_learners > 1 and averaging_layer: \n", " output_layer = tf.keras.layers.Average()(hidden) # add an averaging layer at the end\n", " elif weak_learners > 1: # if we have multiple outputs and no averaging layer, we return them all\n", " output_layer = hidden\n", " else:\n", " output_layer = weak_learner # if there's only one weak learner, we use it as output directly\n", " \n", " return (input_layer, output_layer) " ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "OWjox3QLMqjQ", "colab_type": "text" }, "source": [ "### first, determine layer size" ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "esCLU_9_N6e6", "colab": {} }, "source": [ "#this function is needed as a helper below\n", "def compute_correlation_histogram(mat: np.array):\n", " '''Computes the correlations of the columns of mat and returns a histogram (counts for each binned correlation value)\n", " '''\n", " corrmatrix_raw = pd.DataFrame(data=mat, \n", " columns=[f'n_{i:02}' for i in range(mat.shape[1])])\\\n", " .corr(method=\"spearman\").abs() # we discard the sign of the correlations\n", " corrmatrix = corrmatrix_raw.stack().reset_index()\n", " corrmatrix.rename(index=str, columns={\"level_0\": \"variable_1\", \"level_1\": \"variable_2\", 0: \"correlation\"},\n", " inplace=True) # set meaningful variable names\n", " correlations = corrmatrix[corrmatrix['variable_1'] > corrmatrix['variable_2']] # keep only upper triangular entries\n", " correlations = correlations.reset_index()\n", " \n", " return correlations.iloc[correlations['correlation'].idxmax(axis='rows')] # return row with maximum correlation\n" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "kxesDbL-fTI7", "colab_type": "code", "colab": {} }, "source": [ "output_dimensions = OrderedDict([('catindex_Month', 2), ('catindex_DayofMonth', 2), ('catindex_DayOfWeek', 2),\n", " ('catindex_UniqueCarrier', 5), ('catindex_Origin', 5), ('catindex_Dest', 5)])\n", "input_dims = OrderedDict([('catindex_Month', 13),\n", " ('catindex_DayofMonth', 32),\n", " ('catindex_DayOfWeek', 8),\n", " ('catindex_UniqueCarrier', 23),\n", " ('catindex_Origin', 305),\n", " ('catindex_Dest', 305)])" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "HP8D7MA5TqKC", "colab_type": "code", "outputId": "4eec2bed-61b8-4b4d-9bbd-d9480a9da97e", "colab": { "base_uri": "https://localhost:8080/", "height": 824 } }, "source": [ "# Check the size of the hidden layer: Train a model with a single weak learner\n", "import datetime\n", "tf.random.set_seed(3141592653) # set a fixed (arbitrary) seed for TensorFlow's random numbers, global level\n", "np.random.seed(seed=3141592653) # ...and do the same for numpy's random numbers\n", "\n", "for x in range(10):\n", " now=datetime.datetime.now()\n", "\n", " log_dir = get_log_dir()\n", " model_name = 'benchml100k_Layersize_check'\n", " activation_name ='tanh'\n", " weak_learners = 1 # when we check the size, we do not use averaging\n", "\n", " # We go for a low batch size (slow, but less prone to overfitting).\n", " # The learning rate has been chosen by some quick trials (going down from 1 by dividing by 10 in each step until\n", " # learning is sufficiently stable).\n", " # We combine that with a low number of epochs as we only need a rough estimation to gauge the correlations.\n", " learning_rate = 0.1\n", " batch_size = 10\n", " epochs = 20\n", " widths = []\n", " max_correlations = []\n", " validation_data = (data_test_df[[c for c in data_test_df.columns if c != 'target']].values, \n", " data_test_df['target'].values)\n", "\n", " for width in (10, 20, 30, 40, 50, 60, 80, 100, 120, 140):\n", " (inputs, outputs) = averaging_ensemble(inputs_numeric=flights.no_of_predictors() - len(flights.get_index_lengths()), \n", " inputs_for_embedding=len(flights.get_index_lengths()), \n", " embedding_input_dims=list(input_dims.values()), # list(flights.get_index_lengths().values()), \n", " embedding_output_dims=list(output_dimensions.values()),\n", " width=width, weak_learners=weak_learners, activation_name=activation_name)\n", " model = tf.keras.Model(inputs=inputs, outputs=outputs)\n", " model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss='binary_crossentropy', \n", " metrics=['mae', 'AUC'])\n", " model.fit(data_train_df[[c for c in data_train_df.columns if c != 'target']].values, data_train_df['target'].values, \n", " epochs=epochs, batch_size=batch_size, verbose=0)\n", " \n", " # shave the model, i.e., delete the last layer\n", " layer_name = 'hidden'\n", " shaved_model = tf.keras.Model(inputs=model.input, outputs=model.get_layer(layer_name).output)\n", " hidden_layer_output = shaved_model.predict(validation_data)\n", " max_correlation = compute_correlation_histogram(hidden_layer_output)\n", " widths.append(width)\n", " max_correlations.append(max_correlation)\n", " print(f\"{widths[-1]}: {max_correlation['correlation']}\")\n", " if max_correlation['correlation'] >= 0.98:\n", " break" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "10: 0.8881912545780639\n", "20: 0.9451014211983376\n", "30: 0.9685687572680682\n", "40: 0.947551540745585\n", "50: 0.9645273599570406\n", "60: 0.9763770528780563\n", "80: 0.9572017647929024\n", "100: 0.9846774011987006\n", "10: 0.9419475646449638\n", "20: 0.958465299219847\n", "30: 0.9711734558646341\n", "40: 0.9801705036474484\n", "10: 0.7366949374359877\n", "20: 0.8926622584310067\n", "30: 0.9508609708677088\n", "40: 0.9760320661499896\n", "50: 0.9605671215074669\n", "60: 0.9374681472720487\n", "80: 0.9806870175219736\n", "10: 0.8780105916016454\n", "20: 0.9002817446860963\n", "30: 0.9599076665318469\n", "40: 0.9701751239117474\n", "50: 0.9358039686850287\n", "60: 0.93732918693722\n", "80: 0.9835738119950854\n", "10: 0.7920294754857148\n" ], "name": "stdout" }, { "output_type": "error", "ename": "KeyboardInterrupt", "evalue": "ignored", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 33\u001b[0m metrics=['mae', 'AUC'])\n\u001b[1;32m 34\u001b[0m model.fit(data_train_df[[c for c in data_train_df.columns if c != 'target']].values, data_train_df['target'].values, \n\u001b[0;32m---> 35\u001b[0;31m epochs=epochs, batch_size=batch_size, verbose=0)\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0;31m# shave the model, i.e., delete the last layer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/tensorflow-2.0.0-rc2/python3.6/tensorflow_core/python/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0mmax_queue_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmax_queue_size\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 727\u001b[0m \u001b[0mworkers\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mworkers\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 728\u001b[0;31m use_multiprocessing=use_multiprocessing)\n\u001b[0m\u001b[1;32m 729\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 730\u001b[0m def evaluate(self,\n", "\u001b[0;32m/tensorflow-2.0.0-rc2/python3.6/tensorflow_core/python/keras/engine/training_v2.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mModeKeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTRAIN\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[0mtraining_context\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtraining_context\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 324\u001b[0;31m total_epochs=epochs)\n\u001b[0m\u001b[1;32m 325\u001b[0m \u001b[0mcbks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake_logs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepoch_logs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraining_result\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mModeKeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTRAIN\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 326\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/tensorflow-2.0.0-rc2/python3.6/tensorflow_core/python/keras/engine/training_v2.py\u001b[0m in \u001b[0;36mrun_one_epoch\u001b[0;34m(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)\u001b[0m\n\u001b[1;32m 121\u001b[0m step=step, mode=mode, size=current_batch_size) as batch_logs:\n\u001b[1;32m 122\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 123\u001b[0;31m \u001b[0mbatch_outs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mexecution_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 124\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mStopIteration\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOutOfRangeError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;31m# TODO(kaftan): File bug about tf function and errors.OutOfRangeError?\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/tensorflow-2.0.0-rc2/python3.6/tensorflow_core/python/keras/engine/training_v2_utils.py\u001b[0m in \u001b[0;36mexecution_function\u001b[0;34m(input_fn)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;31m# `numpy` translates Tensors to values in Eager mode.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m return nest.map_structure(_non_none_constant_value,\n\u001b[0;32m---> 86\u001b[0;31m distributed_function(input_fn))\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mexecution_function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/tensorflow-2.0.0-rc2/python3.6/tensorflow_core/python/eager/def_function.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwds)\u001b[0m\n\u001b[1;32m 455\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 456\u001b[0m \u001b[0mtracing_count\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_tracing_count\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 457\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 458\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtracing_count\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_tracing_count\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 459\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_counter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcalled_without_tracing\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/tensorflow-2.0.0-rc2/python3.6/tensorflow_core/python/eager/def_function.py\u001b[0m in \u001b[0;36m_call\u001b[0;34m(self, *args, **kwds)\u001b[0m\n\u001b[1;32m 485\u001b[0m \u001b[0;31m# In this case we have created variables on the first call, so we run the\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 486\u001b[0m \u001b[0;31m# defunned version which is guaranteed to never create variables.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 487\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stateless_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# pylint: disable=not-callable\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 488\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stateful_fn\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 489\u001b[0m \u001b[0;31m# Release the lock early so that multiple threads can perform the call\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/tensorflow-2.0.0-rc2/python3.6/tensorflow_core/python/eager/function.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1821\u001b[0m \u001b[0;34m\"\"\"Calls a graph function specialized to the inputs.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1822\u001b[0m \u001b[0mgraph_function\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_define_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1823\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mgraph_function\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_filtered_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# pylint: disable=protected-access\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1824\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1825\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/tensorflow-2.0.0-rc2/python3.6/tensorflow_core/python/eager/function.py\u001b[0m in \u001b[0;36m_filtered_call\u001b[0;34m(self, args, kwargs)\u001b[0m\n\u001b[1;32m 1139\u001b[0m if isinstance(t, (ops.Tensor,\n\u001b[1;32m 1140\u001b[0m resource_variable_ops.BaseResourceVariable))),\n\u001b[0;32m-> 1141\u001b[0;31m self.captured_inputs)\n\u001b[0m\u001b[1;32m 1142\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1143\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_call_flat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcaptured_inputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcancellation_manager\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/tensorflow-2.0.0-rc2/python3.6/tensorflow_core/python/eager/function.py\u001b[0m in \u001b[0;36m_call_flat\u001b[0;34m(self, args, captured_inputs, cancellation_manager)\u001b[0m\n\u001b[1;32m 1222\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mexecuting_eagerly\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1223\u001b[0m flat_outputs = forward_function.call(\n\u001b[0;32m-> 1224\u001b[0;31m ctx, args, cancellation_manager=cancellation_manager)\n\u001b[0m\u001b[1;32m 1225\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1226\u001b[0m \u001b[0mgradient_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_delayed_rewrite_functions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mregister\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/tensorflow-2.0.0-rc2/python3.6/tensorflow_core/python/eager/function.py\u001b[0m in \u001b[0;36mcall\u001b[0;34m(self, ctx, args, cancellation_manager)\u001b[0m\n\u001b[1;32m 509\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0mattrs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"executor_type\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexecutor_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"config_proto\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 511\u001b[0;31m ctx=ctx)\n\u001b[0m\u001b[1;32m 512\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 513\u001b[0m outputs = execute.execute_with_cancellation(\n", "\u001b[0;32m/tensorflow-2.0.0-rc2/python3.6/tensorflow_core/python/eager/execute.py\u001b[0m in \u001b[0;36mquick_execute\u001b[0;34m(op_name, num_outputs, inputs, attrs, ctx, name)\u001b[0m\n\u001b[1;32m 59\u001b[0m tensors = pywrap_tensorflow.TFE_Py_Execute(ctx._handle, device_name,\n\u001b[1;32m 60\u001b[0m \u001b[0mop_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattrs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 61\u001b[0;31m num_outputs)\n\u001b[0m\u001b[1;32m 62\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_NotOkStatusException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ] }, { "cell_type": "code", "metadata": { "id": "R2rkANYlRnIy", "colab_type": "code", "outputId": "17d533e2-5c39-4175-fd9d-67ab1b142152", "colab": { "base_uri": "https://localhost:8080/", "height": 218 } }, "source": [ "# Now train the model, with the weak learner size we just determined: From the output of the last cell we estimate\n", "# that width 80 will probably suffice.\n", "import datetime\n", "now=datetime.datetime.now()\n", "\n", "tf.random.set_seed(3141592653) # set a fixed (arbitrary) seed for TensorFlow's random numbers, global level\n", "np.random.seed(seed=3141592653) # ...and do the same for numpy's random numbers\n", "\n", "model_name = 'benchml10k'\n", "activation_name ='tanh'\n", "weak_learners = 100 # 100 is good for a final model\n", "\n", "# We go for a low batch size (slow, but less prone to overfitting).\n", "# The learning rate has been chosen by some quick trials (going down from 1 by dividing by 10 in each step until\n", "# learning is sufficiently stable).\n", "# We combine that with a low number of epochs as we only need a rough estimation to gauge the correlations.\n", "learning_rate = 1\n", "batch_size = 10\n", "epochs = 5\n", "width = 80\n", "\n", "(inputs, outputs) = averaging_ensemble(inputs_numeric=flights.no_of_predictors() - len(flights.get_index_lengths()), \n", " inputs_for_embedding=len(flights.get_index_lengths()), \n", " embedding_input_dims=list(input_dims.values()), # list(flights.get_index_lengths().values()), \n", " embedding_output_dims=list(output_dimensions.values()),\n", " width=width, weak_learners=weak_learners, activation_name=activation_name,\n", " share_embedding_layer=True)\n", "model = tf.keras.Model(inputs=inputs, outputs=outputs)\n", "validation_data = (data_test_df.drop(columns=['target']).values, \n", " data_test_df['target'].values)\n", "model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss='binary_crossentropy', \n", " metrics=['mae', 'AUC'])\n", "model.fit(data_train_df[[c for c in data_train_df.columns if c != 'target']].values, data_train_df['target'].values, \n", " epochs=epochs, batch_size=batch_size, validation_data=validation_data) " ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Train on 10000 samples, validate on 100000 samples\n", "Epoch 1/5\n", "10000/10000 [==============================] - 173s 17ms/sample - loss: 0.4777 - mae: 0.3229 - AUC: 0.6430 - val_loss: 0.4889 - val_mae: 0.3036 - val_AUC: 0.6842\n", "Epoch 2/5\n", "10000/10000 [==============================] - 168s 17ms/sample - loss: 0.4518 - mae: 0.2883 - AUC: 0.6952 - val_loss: 0.4874 - val_mae: 0.2984 - val_AUC: 0.6884\n", "Epoch 3/5\n", "10000/10000 [==============================] - 167s 17ms/sample - loss: 0.4470 - mae: 0.2835 - AUC: 0.7048 - val_loss: 0.4836 - val_mae: 0.3005 - val_AUC: 0.6931\n", "Epoch 4/5\n", "10000/10000 [==============================] - 166s 17ms/sample - loss: 0.4436 - mae: 0.2826 - AUC: 0.7113 - val_loss: 0.4829 - val_mae: 0.2977 - val_AUC: 0.6959\n", "Epoch 5/5\n", "10000/10000 [==============================] - 157s 16ms/sample - loss: 0.4405 - mae: 0.2788 - AUC: 0.7176 - val_loss: 0.4836 - val_mae: 0.3009 - val_AUC: 0.6927\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 18 } ] }, { "cell_type": "code", "metadata": { "id": "IjTU37l8pNVu", "colab_type": "code", "colab": {} }, "source": [ "" ], "execution_count": 0, "outputs": [] } ] }