{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Booking.com trip recommendation part 1 - baseline model\n",
"> Booking.com challenge on trip recommendation part 1\n",
"\n",
"- toc: true\n",
"- badges: true\n",
"- comments: true\n",
"- categories: [travel]\n",
"- image: "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "YrHhkJNbghNP"
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LR03pKu4hTyH"
},
"outputs": [],
"source": [
"!wget https://github.com/sparsh-ai/reco-data/raw/master/BookingChallenge.zip\n",
"!unzip BookingChallenge.zip"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 221
},
"id": "L-wi74ZtgvVH",
"outputId": "f6fcd4c5-ada2-4b88-9dba-5f449d7c9226"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1166835, 9)\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" checkin | \n",
" checkout | \n",
" city_id | \n",
" device_class | \n",
" affiliate_id | \n",
" booker_country | \n",
" hotel_country | \n",
" utrip_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1000027 | \n",
" 2016-08-13 | \n",
" 2016-08-14 | \n",
" 8183 | \n",
" desktop | \n",
" 7168 | \n",
" Elbonia | \n",
" Gondal | \n",
" 1000027_1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1000027 | \n",
" 2016-08-14 | \n",
" 2016-08-16 | \n",
" 15626 | \n",
" desktop | \n",
" 7168 | \n",
" Elbonia | \n",
" Gondal | \n",
" 1000027_1 | \n",
"
\n",
" \n",
" 2 | \n",
" 1000027 | \n",
" 2016-08-16 | \n",
" 2016-08-18 | \n",
" 60902 | \n",
" desktop | \n",
" 7168 | \n",
" Elbonia | \n",
" Gondal | \n",
" 1000027_1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1000027 | \n",
" 2016-08-18 | \n",
" 2016-08-21 | \n",
" 30628 | \n",
" desktop | \n",
" 253 | \n",
" Elbonia | \n",
" Gondal | \n",
" 1000027_1 | \n",
"
\n",
" \n",
" 4 | \n",
" 1000033 | \n",
" 2016-04-09 | \n",
" 2016-04-11 | \n",
" 38677 | \n",
" mobile | \n",
" 359 | \n",
" Gondal | \n",
" Cobra Island | \n",
" 1000033_1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id checkin checkout ... booker_country hotel_country utrip_id\n",
"0 1000027 2016-08-13 2016-08-14 ... Elbonia Gondal 1000027_1\n",
"1 1000027 2016-08-14 2016-08-16 ... Elbonia Gondal 1000027_1\n",
"2 1000027 2016-08-16 2016-08-18 ... Elbonia Gondal 1000027_1\n",
"3 1000027 2016-08-18 2016-08-21 ... Elbonia Gondal 1000027_1\n",
"4 1000033 2016-04-09 2016-04-11 ... Gondal Cobra Island 1000033_1\n",
"\n",
"[5 rows x 9 columns]"
]
},
"execution_count": 5,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"train_set = pd.read_csv('train_set.csv').sort_values(by=['utrip_id','checkin'])\n",
"\n",
"print(train_set.shape)\n",
"train_set.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 221
},
"id": "z1m-r3qLhH0x",
"outputId": "91c73e93-2946-4448-c8f9-b7c7841be235"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(378667, 9)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" checkin | \n",
" checkout | \n",
" device_class | \n",
" affiliate_id | \n",
" booker_country | \n",
" utrip_id | \n",
" city_id | \n",
" hotel_country | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1000066 | \n",
" 2016-07-21 | \n",
" 2016-07-23 | \n",
" desktop | \n",
" 9924 | \n",
" Gondal | \n",
" 1000066_2 | \n",
" 56430 | \n",
" Urkesh | \n",
"
\n",
" \n",
" 1 | \n",
" 1000066 | \n",
" 2016-07-23 | \n",
" 2016-07-25 | \n",
" desktop | \n",
" 9924 | \n",
" Gondal | \n",
" 1000066_2 | \n",
" 41971 | \n",
" Urkesh | \n",
"
\n",
" \n",
" 2 | \n",
" 1000066 | \n",
" 2016-07-25 | \n",
" 2016-07-28 | \n",
" desktop | \n",
" 9924 | \n",
" Gondal | \n",
" 1000066_2 | \n",
" 5797 | \n",
" Urkesh | \n",
"
\n",
" \n",
" 3 | \n",
" 1000066 | \n",
" 2016-07-28 | \n",
" 2016-07-31 | \n",
" mobile | \n",
" 2436 | \n",
" Gondal | \n",
" 1000066_2 | \n",
" 0 | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 1000270 | \n",
" 2016-02-08 | \n",
" 2016-02-09 | \n",
" mobile | \n",
" 9452 | \n",
" The Devilfire Empire | \n",
" 1000270_1 | \n",
" 50075 | \n",
" The Devilfire Empire | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id checkin checkout ... utrip_id city_id hotel_country\n",
"0 1000066 2016-07-21 2016-07-23 ... 1000066_2 56430 Urkesh\n",
"1 1000066 2016-07-23 2016-07-25 ... 1000066_2 41971 Urkesh\n",
"2 1000066 2016-07-25 2016-07-28 ... 1000066_2 5797 Urkesh\n",
"3 1000066 2016-07-28 2016-07-31 ... 1000066_2 0 NaN\n",
"4 1000270 2016-02-08 2016-02-09 ... 1000270_1 50075 The Devilfire Empire\n",
"\n",
"[5 rows x 9 columns]"
]
},
"execution_count": 6,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"test_set = pd.read_csv('test_set.csv').sort_values(by=['utrip_id','checkin'])\n",
"\n",
"print(test_set.shape)\n",
"test_set.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9QKI_nxuhtVP",
"outputId": "7eaf4633-813b-41c7-fd36-46f31b344110"
},
"outputs": [
{
"data": {
"text/plain": [
"Int64Index([47499, 23921, 36063, 17013], dtype='int64')"
]
},
"execution_count": 7,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"# what are the top 4 most visited cities?\n",
"topcities = train_set.city_id.value_counts().index[:4]\n",
"topcities"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xz0---cgiG7X",
"outputId": "6672ddf9-f1f6-42ad-fff1-a701056ecc9b"
},
"outputs": [
{
"data": {
"text/plain": [
"70662"
]
},
"execution_count": 9,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"# how many trips are there in the test set?\n",
"test_trips = (test_set[['utrip_id']].drop_duplicates()).reset_index().drop('index', axis=1)\n",
"len(test_trips)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"id": "ptKQJIE-iMnL",
"outputId": "3635433b-a717-453d-cbad-6eef6d8892ca"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" city_id_1 | \n",
" city_id_2 | \n",
" city_id_3 | \n",
" city_id_4 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 47499 | \n",
" 23921 | \n",
" 36063 | \n",
" 17013 | \n",
"
\n",
" \n",
" 1 | \n",
" 47499 | \n",
" 23921 | \n",
" 36063 | \n",
" 17013 | \n",
"
\n",
" \n",
" 2 | \n",
" 47499 | \n",
" 23921 | \n",
" 36063 | \n",
" 17013 | \n",
"
\n",
" \n",
" 3 | \n",
" 47499 | \n",
" 23921 | \n",
" 36063 | \n",
" 17013 | \n",
"
\n",
" \n",
" 4 | \n",
" 47499 | \n",
" 23921 | \n",
" 36063 | \n",
" 17013 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" city_id_1 city_id_2 city_id_3 city_id_4\n",
"0 47499 23921 36063 17013\n",
"1 47499 23921 36063 17013\n",
"2 47499 23921 36063 17013\n",
"3 47499 23921 36063 17013\n",
"4 47499 23921 36063 17013"
]
},
"execution_count": 10,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"# baseline - a simple logical rule - recommend top 4 most visitied cities to everyone\n",
"cities_prediction = pd.DataFrame([topcities]*test_trips.shape[0],\n",
" columns= ['city_id_1','city_id_2','city_id_3','city_id_4'])\n",
"cities_prediction[:5]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 221
},
"id": "I4kLpNQVirxr",
"outputId": "35a862c1-8126-46d2-d043-ebf2ac43d84b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(70662, 5)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" utrip_id | \n",
" city_id_1 | \n",
" city_id_2 | \n",
" city_id_3 | \n",
" city_id_4 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1000066_2 | \n",
" 47499 | \n",
" 23921 | \n",
" 36063 | \n",
" 17013 | \n",
"
\n",
" \n",
" 1 | \n",
" 1000270_1 | \n",
" 47499 | \n",
" 23921 | \n",
" 36063 | \n",
" 17013 | \n",
"
\n",
" \n",
" 2 | \n",
" 1000441_1 | \n",
" 47499 | \n",
" 23921 | \n",
" 36063 | \n",
" 17013 | \n",
"
\n",
" \n",
" 3 | \n",
" 100048_1 | \n",
" 47499 | \n",
" 23921 | \n",
" 36063 | \n",
" 17013 | \n",
"
\n",
" \n",
" 4 | \n",
" 1000543_1 | \n",
" 47499 | \n",
" 23921 | \n",
" 36063 | \n",
" 17013 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" utrip_id city_id_1 city_id_2 city_id_3 city_id_4\n",
"0 1000066_2 47499 23921 36063 17013\n",
"1 1000270_1 47499 23921 36063 17013\n",
"2 1000441_1 47499 23921 36063 17013\n",
"3 100048_1 47499 23921 36063 17013\n",
"4 1000543_1 47499 23921 36063 17013"
]
},
"execution_count": 11,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"predictions = pd.concat([test_trips, cities_prediction], axis=1)\n",
"\n",
"print(predictions.shape)\n",
"predictions.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 252
},
"id": "OeOWrUdujVer",
"outputId": "131db80f-2718-4c65-e01a-fdd7cb040001"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(70662, 2)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" city_id | \n",
" hotel_country | \n",
"
\n",
" \n",
" utrip_id | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1038944_1 | \n",
" 54085 | \n",
" Sokovia | \n",
"
\n",
" \n",
" 1068715_1 | \n",
" 29319 | \n",
" Cobra Island | \n",
"
\n",
" \n",
" 1075528_1 | \n",
" 55763 | \n",
" Bozatta | \n",
"
\n",
" \n",
" 1110462_4 | \n",
" 11930 | \n",
" Alvonia | \n",
"
\n",
" \n",
" 1132565_1 | \n",
" 58659 | \n",
" Axphain | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" city_id hotel_country\n",
"utrip_id \n",
"1038944_1 54085 Sokovia\n",
"1068715_1 29319 Cobra Island\n",
"1075528_1 55763 Bozatta\n",
"1110462_4 11930 Alvonia\n",
"1132565_1 58659 Axphain"
]
},
"execution_count": 12,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"ground_truth = pd.read_csv('ground_truth.csv', index_col=[0])\n",
"\n",
"print(ground_truth.shape)\n",
"ground_truth.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"id": "4YhzkrLnjgeo"
},
"outputs": [],
"source": [
"def evaluate_accuracy_at_4(predictions, ground_truth):\n",
" '''checks if the true city is within the four recommended cities'''\n",
" data = predictions.join(ground_truth, on='utrip_id')\n",
"\n",
" hits = ((data['city_id']==data['city_id_1'])|(data['city_id']==data['city_id_2'])|\n",
" (data['city_id']==data['city_id_3'])|(data['city_id']==data['city_id_4']))*1\n",
" return hits.mean()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8t0sGGLNjr6J",
"outputId": "10bce096-8673-4343-adc0-05c4c27f39b1"
},
"outputs": [
{
"data": {
"text/plain": [
"0.05271574537941185"
]
},
"execution_count": 14,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"evaluate_accuracy_at_4(predictions, ground_truth)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "8Oorct55jydh"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"authorship_tag": "ABX9TyM5QFU4gHbNtmYio7rU1FnM",
"collapsed_sections": [],
"name": "2021-06-12-booking-dot-com-trip-recommendations-01-baseline.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}