{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#%%\n",
    "\"\"\"File 04validation.py\n",
    "\n",
    ":author: Michel Bierlaire, EPFL\n",
    ":date: Thu Jun 4 17:55:27 2020\n",
    "\n",
    " Example of the out-of-sample validation of a logit model.\n",
    " Three alternatives: Train, Car and Swissmetro\n",
    " SP data\n",
    "\"\"\"\n",
    "\n",
    "import pandas as pd\n",
    "import biogeme.database as db\n",
    "import biogeme.biogeme as bio\n",
    "import biogeme.models as models\n",
    "from biogeme.expressions import Beta\n",
    "\n",
    "# Read the data (tab-separated file).\n",
    "# The separator is passed by keyword: pandas deprecated positional\n",
    "# arguments after the file path in 1.3 and rejects them since 2.0.\n",
    "df = pd.read_csv('swissmetro.dat', sep='\\t')\n",
    "database = db.Database('swissmetro', df)\n",
    "\n",
    "# The following statement allows you to use the names of the variables\n",
    "# as Python variables.\n",
    "globals().update(database.variables)\n",
    "\n",
    "# Removing some observations: an observation is excluded when its\n",
    "# PURPOSE is neither 1 nor 3, or when CHOICE is 0. The product acts\n",
    "# as a logical AND and the sum as a logical OR.\n",
    "exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0\n",
    "database.remove(exclude)\n",
    "\n",
    "# Parameters to be estimated.\n",
    "# Arguments of Beta: name, initial value, lower bound, upper bound,\n",
    "# status. ASC_SM has status 1, that is, it is kept fixed at its\n",
    "# initial value and is not estimated (see the Biogeme documentation).\n",
    "ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)\n",
    "ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 0)\n",
    "ASC_SM = Beta('ASC_SM', 0, None, None, 1)\n",
    "B_TIME = Beta('B_TIME', 0, None, None, 0)\n",
    "B_COST = Beta('B_COST', 0, None, None, 0)\n",
    "\n",
    "\n",
    "# Definition of new variables.\n",
    "# The costs of Swissmetro and train are set to zero when GA != 0.\n",
    "SM_COST = SM_CO * (GA == 0)\n",
    "TRAIN_COST = TRAIN_CO * (GA == 0)\n",
    "# Car and train are made unavailable when SP == 0.\n",
    "CAR_AV_SP = CAR_AV * (SP != 0)\n",
    "TRAIN_AV_SP = TRAIN_AV * (SP != 0)\n",
    "# Times and costs are divided by 100.\n",
    "TRAIN_TT_SCALED = TRAIN_TT / 100.0\n",
    "TRAIN_COST_SCALED = TRAIN_COST / 100\n",
    "SM_TT_SCALED = SM_TT / 100.0\n",
    "SM_COST_SCALED = SM_COST / 100\n",
    "CAR_TT_SCALED = CAR_TT / 100\n",
    "CAR_CO_SCALED = CAR_CO / 100\n",
    "\n",
    "# Definition of the utility functions\n",
    "V1 = (\n",
    "    ASC_TRAIN\n",
    "    + B_TIME * TRAIN_TT_SCALED\n",
    "    + B_COST * TRAIN_COST_SCALED\n",
    ")\n",
    "V2 = (\n",
    "    ASC_SM\n",
    "    + B_TIME * SM_TT_SCALED\n",
    "    + B_COST * SM_COST_SCALED\n",
    ")\n",
    "V3 = (\n",
    "    ASC_CAR\n",
    "    + B_TIME * CAR_TT_SCALED\n",
    "    + B_COST * CAR_CO_SCALED\n",
    ")\n",
    "\n",
    "# Associate utility functions with the numbering of alternatives\n",
    "V = {1: V1,\n",
    "     2: V2,\n",
    "     3: V3}\n",
    "\n",
    "# Associate the availability conditions with the alternatives\n",
    "av = {1: TRAIN_AV_SP,\n",
    "      2: SM_AV,\n",
    "      3: CAR_AV_SP}\n",
    "\n",
    "# Definition of the model. This is the contribution of each\n",
    "# observation to the log likelihood function.\n",
    "logprob = models.loglogit(V, av, CHOICE)\n",
    "\n",
    "# Create the Biogeme object\n",
    "biogeme = bio.BIOGEME(database, logprob)\n",
    "biogeme.modelName = '04validation'\n",
    "\n",
    "# Estimate the parameters\n",
    "results = biogeme.estimate()\n",
    "\n",
    "# The validation consists in organizing the data into several slices\n",
    "# of about the same size, randomly defined. Each slice is considered\n",
    "# as a validation dataset. The model is then re-estimated using all\n",
    "# the data except the slice, and the estimated model is applied on the\n",
    "# validation set (i.e. the slice). The value of the log likelihood for\n",
    "# each observation in the validation set is reported in a\n",
    "# dataframe. As this is done for each slice, the output is a list of\n",
    "# dataframes, each corresponding to one of these exercises.\n",
    "\n",
    "validation_results = biogeme.validate(results)\n",
    "\n",
    "# One dataframe per slice: report the number of validation\n",
    "# observations and the sum of their log likelihood.\n",
    "for fold in validation_results:\n",
    "    print(f'Log likelihood for {fold.shape[0]} validation data: {fold[\"Loglikelihood\"].sum()}')\n"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}