{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Booking.com trip recommendation part 1 - baseline model\n", "> Booking.com challenge on trip recommendation part 1\n", "\n", "- toc: true\n", "- badges: true\n", "- comments: true\n", "- categories: [travel]\n", "- image: " ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "YrHhkJNbghNP" }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "LR03pKu4hTyH" }, "outputs": [], "source": [ "!wget https://github.com/sparsh-ai/reco-data/raw/master/BookingChallenge.zip\n", "!unzip BookingChallenge.zip" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 221 }, "id": "L-wi74ZtgvVH", "outputId": "f6fcd4c5-ada2-4b88-9dba-5f449d7c9226" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1166835, 9)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idcheckincheckoutcity_iddevice_classaffiliate_idbooker_countryhotel_countryutrip_id
010000272016-08-132016-08-148183desktop7168ElboniaGondal1000027_1
110000272016-08-142016-08-1615626desktop7168ElboniaGondal1000027_1
210000272016-08-162016-08-1860902desktop7168ElboniaGondal1000027_1
310000272016-08-182016-08-2130628desktop253ElboniaGondal1000027_1
410000332016-04-092016-04-1138677mobile359GondalCobra Island1000033_1
\n", "
" ], "text/plain": [ " user_id checkin checkout ... booker_country hotel_country utrip_id\n", "0 1000027 2016-08-13 2016-08-14 ... Elbonia Gondal 1000027_1\n", "1 1000027 2016-08-14 2016-08-16 ... Elbonia Gondal 1000027_1\n", "2 1000027 2016-08-16 2016-08-18 ... Elbonia Gondal 1000027_1\n", "3 1000027 2016-08-18 2016-08-21 ... Elbonia Gondal 1000027_1\n", "4 1000033 2016-04-09 2016-04-11 ... Gondal Cobra Island 1000033_1\n", "\n", "[5 rows x 9 columns]" ] }, "execution_count": 5, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "train_set = pd.read_csv('train_set.csv').sort_values(by=['utrip_id','checkin'])\n", "\n", "print(train_set.shape)\n", "train_set.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 221 }, "id": "z1m-r3qLhH0x", "outputId": "91c73e93-2946-4448-c8f9-b7c7841be235" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(378667, 9)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idcheckincheckoutdevice_classaffiliate_idbooker_countryutrip_idcity_idhotel_country
010000662016-07-212016-07-23desktop9924Gondal1000066_256430Urkesh
110000662016-07-232016-07-25desktop9924Gondal1000066_241971Urkesh
210000662016-07-252016-07-28desktop9924Gondal1000066_25797Urkesh
310000662016-07-282016-07-31mobile2436Gondal1000066_20NaN
410002702016-02-082016-02-09mobile9452The Devilfire Empire1000270_150075The Devilfire Empire
\n", "
" ], "text/plain": [ " user_id checkin checkout ... utrip_id city_id hotel_country\n", "0 1000066 2016-07-21 2016-07-23 ... 1000066_2 56430 Urkesh\n", "1 1000066 2016-07-23 2016-07-25 ... 1000066_2 41971 Urkesh\n", "2 1000066 2016-07-25 2016-07-28 ... 1000066_2 5797 Urkesh\n", "3 1000066 2016-07-28 2016-07-31 ... 1000066_2 0 NaN\n", "4 1000270 2016-02-08 2016-02-09 ... 1000270_1 50075 The Devilfire Empire\n", "\n", "[5 rows x 9 columns]" ] }, "execution_count": 6, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "test_set = pd.read_csv('test_set.csv').sort_values(by=['utrip_id','checkin'])\n", "\n", "print(test_set.shape)\n", "test_set.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9QKI_nxuhtVP", "outputId": "7eaf4633-813b-41c7-fd36-46f31b344110" }, "outputs": [ { "data": { "text/plain": [ "Int64Index([47499, 23921, 36063, 17013], dtype='int64')" ] }, "execution_count": 7, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "# what are the top 4 most visited cities?\n", "topcities = train_set.city_id.value_counts().index[:4]\n", "topcities" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xz0---cgiG7X", "outputId": "6672ddf9-f1f6-42ad-fff1-a701056ecc9b" }, "outputs": [ { "data": { "text/plain": [ "70662" ] }, "execution_count": 9, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "# how many trips are there in the test set?\n", "test_trips = (test_set[['utrip_id']].drop_duplicates()).reset_index().drop('index', axis=1)\n", "len(test_trips)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "ptKQJIE-iMnL", "outputId": "3635433b-a717-453d-cbad-6eef6d8892ca" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
city_id_1city_id_2city_id_3city_id_4
047499239213606317013
147499239213606317013
247499239213606317013
347499239213606317013
447499239213606317013
\n", "
" ], "text/plain": [ " city_id_1 city_id_2 city_id_3 city_id_4\n", "0 47499 23921 36063 17013\n", "1 47499 23921 36063 17013\n", "2 47499 23921 36063 17013\n", "3 47499 23921 36063 17013\n", "4 47499 23921 36063 17013" ] }, "execution_count": 10, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "# baseline - a simple logical rule - recommend top 4 most visited cities to everyone\n", "cities_prediction = pd.DataFrame([topcities]*test_trips.shape[0],\n", " columns= ['city_id_1','city_id_2','city_id_3','city_id_4'])\n", "cities_prediction[:5]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 221 }, "id": "I4kLpNQVirxr", "outputId": "35a862c1-8126-46d2-d043-ebf2ac43d84b" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(70662, 5)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
utrip_idcity_id_1city_id_2city_id_3city_id_4
01000066_247499239213606317013
11000270_147499239213606317013
21000441_147499239213606317013
3100048_147499239213606317013
41000543_147499239213606317013
\n", "
" ], "text/plain": [ " utrip_id city_id_1 city_id_2 city_id_3 city_id_4\n", "0 1000066_2 47499 23921 36063 17013\n", "1 1000270_1 47499 23921 36063 17013\n", "2 1000441_1 47499 23921 36063 17013\n", "3 100048_1 47499 23921 36063 17013\n", "4 1000543_1 47499 23921 36063 17013" ] }, "execution_count": 11, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "predictions = pd.concat([test_trips, cities_prediction], axis=1)\n", "\n", "print(predictions.shape)\n", "predictions.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 252 }, "id": "OeOWrUdujVer", "outputId": "131db80f-2718-4c65-e01a-fdd7cb040001" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(70662, 2)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
city_idhotel_country
utrip_id
1038944_154085Sokovia
1068715_129319Cobra Island
1075528_155763Bozatta
1110462_411930Alvonia
1132565_158659Axphain
\n", "
" ], "text/plain": [ " city_id hotel_country\n", "utrip_id \n", "1038944_1 54085 Sokovia\n", "1068715_1 29319 Cobra Island\n", "1075528_1 55763 Bozatta\n", "1110462_4 11930 Alvonia\n", "1132565_1 58659 Axphain" ] }, "execution_count": 12, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "ground_truth = pd.read_csv('ground_truth.csv', index_col=[0])\n", "\n", "print(ground_truth.shape)\n", "ground_truth.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "4YhzkrLnjgeo" }, "outputs": [], "source": [ "def evaluate_accuracy_at_4(predictions, ground_truth):\n", " '''returns the fraction of predicted trips whose true city_id is among the four recommended cities (accuracy@4)'''\n", " data = predictions.join(ground_truth, on='utrip_id')\n", "\n", " hits = ((data['city_id']==data['city_id_1'])|(data['city_id']==data['city_id_2'])|\n", " (data['city_id']==data['city_id_3'])|(data['city_id']==data['city_id_4']))*1\n", " return hits.mean()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8t0sGGLNjr6J", "outputId": "10bce096-8673-4343-adc0-05c4c27f39b1" }, "outputs": [ { "data": { "text/plain": [ "0.05271574537941185" ] }, "execution_count": 14, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "evaluate_accuracy_at_4(predictions, ground_truth)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8Oorct55jydh" }, "outputs": [], "source": [] } ], "metadata": { "colab": { "authorship_tag": "ABX9TyM5QFU4gHbNtmYio7rU1FnM", "collapsed_sections": [], "name": "2021-06-12-booking-dot-com-trip-recommendations-01-baseline.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version":
"3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }