# Booking.com trip recommendation part 1 - baseline model
> A popularity baseline for the Booking.com trip recommendation challenge: recommend the four most-visited cities to every trip and measure accuracy@4.

- toc: true
- badges: true
- comments: true
- categories: [travel]
- image: 

In [1]:
import pandas as pd

In [None]:
!wget https://github.com/sparsh-ai/reco-data/raw/master/BookingChallenge.zip
!unzip BookingChallenge.zip

In [5]:
# Load the training bookings and order them by trip id, then by check-in,
# so the rows belonging to one trip sit together.
train_set = (
    pd.read_csv('train_set.csv')
    .sort_values(['utrip_id', 'checkin'])
)

print(train_set.shape)
train_set.head()

(1166835, 9)


Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id
0,1000027,2016-08-13,2016-08-14,8183,desktop,7168,Elbonia,Gondal,1000027_1
1,1000027,2016-08-14,2016-08-16,15626,desktop,7168,Elbonia,Gondal,1000027_1
2,1000027,2016-08-16,2016-08-18,60902,desktop,7168,Elbonia,Gondal,1000027_1
3,1000027,2016-08-18,2016-08-21,30628,desktop,253,Elbonia,Gondal,1000027_1
4,1000033,2016-04-09,2016-04-11,38677,mobile,359,Gondal,Cobra Island,1000033_1


In [6]:
# Load the test bookings with the same ordering as the training data:
# by trip id first, then by check-in.
test_set = (
    pd.read_csv('test_set.csv')
    .sort_values(['utrip_id', 'checkin'])
)

print(test_set.shape)
test_set.head()

(378667, 9)


Unnamed: 0,user_id,checkin,checkout,device_class,affiliate_id,booker_country,utrip_id,city_id,hotel_country
0,1000066,2016-07-21,2016-07-23,desktop,9924,Gondal,1000066_2,56430,Urkesh
1,1000066,2016-07-23,2016-07-25,desktop,9924,Gondal,1000066_2,41971,Urkesh
2,1000066,2016-07-25,2016-07-28,desktop,9924,Gondal,1000066_2,5797,Urkesh
3,1000066,2016-07-28,2016-07-31,mobile,2436,Gondal,1000066_2,0,
4,1000270,2016-02-08,2016-02-09,mobile,9452,The Devilfire Empire,1000270_1,50075,The Devilfire Empire


In [7]:
# Which four cities appear most often in the training bookings?
# value_counts() sorts by descending frequency, so the first four entries
# of its index are the most visited city ids.
topcities = train_set['city_id'].value_counts().head(4).index
topcities

Int64Index([47499, 23921, 36063, 17013], dtype='int64')

In [9]:
# How many distinct trips does the test set contain?
# Keep one row per utrip_id and renumber the rows 0..n-1 so the frame can be
# aligned positionally with other frames later on.
test_trips = test_set[['utrip_id']].drop_duplicates().reset_index(drop=True)
len(test_trips)

70662

In [10]:
# Baseline: a simple logical rule - recommend the top 4 most visited cities
# to everyone. Build one identical row of recommendations per test trip.
cities_prediction = pd.DataFrame(
    [topcities for _ in range(len(test_trips))],
    columns=['city_id_1', 'city_id_2', 'city_id_3', 'city_id_4'],
)
cities_prediction.head()

Unnamed: 0,city_id_1,city_id_2,city_id_3,city_id_4
0,47499,23921,36063,17013
1,47499,23921,36063,17013
2,47499,23921,36063,17013
3,47499,23921,36063,17013
4,47499,23921,36063,17013


In [11]:
# Attach the recommended cities to the trip ids side by side; the two frames
# are placed next to each other column-wise and aligned on their row index.
predictions = pd.concat((test_trips, cities_prediction), axis='columns')

print(predictions.shape)
predictions.head()

(70662, 5)


Unnamed: 0,utrip_id,city_id_1,city_id_2,city_id_3,city_id_4
0,1000066_2,47499,23921,36063,17013
1,1000270_1,47499,23921,36063,17013
2,1000441_1,47499,23921,36063,17013
3,100048_1,47499,23921,36063,17013
4,1000543_1,47499,23921,36063,17013


In [12]:
# Load the held-out answers. The first CSV column becomes the index, which is
# what the evaluation function joins the predictions against.
ground_truth = pd.read_csv(
    'ground_truth.csv',
    index_col=[0],
)

print(ground_truth.shape)
ground_truth.head()

(70662, 2)


Unnamed: 0_level_0,city_id,hotel_country
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1038944_1,54085,Sokovia
1068715_1,29319,Cobra Island
1075528_1,55763,Bozatta
1110462_4,11930,Alvonia
1132565_1,58659,Axphain


In [13]:
def evaluate_accuracy_at_4(predictions, ground_truth):
    '''Return the share of trips whose true city is among the four recommendations.

    Parameters
    ----------
    predictions : DataFrame with a 'utrip_id' column and the recommendation
        columns 'city_id_1' .. 'city_id_4'.
    ground_truth : DataFrame indexed by trip id with a 'city_id' column holding
        the true city of each trip.

    Returns
    -------
    The mean of the per-trip 0/1 hit indicator. Trips that have no matching
    row in ``ground_truth`` get a missing ``city_id`` from the join and
    therefore count as misses.
    '''
    merged = predictions.join(ground_truth, on='utrip_id')
    true_city = merged['city_id']

    candidate_columns = ['city_id_1', 'city_id_2', 'city_id_3', 'city_id_4']
    # Start from the first recommendation and OR in the remaining ones.
    matched = true_city == merged[candidate_columns[0]]
    for column in candidate_columns[1:]:
        matched = matched | (true_city == merged[column])

    # Multiply by 1 to turn the boolean mask into 0/1 before averaging.
    return (matched * 1).mean()

In [14]:
# Accuracy@4 of the baseline: the fraction of test trips whose true city is
# one of the four recommended cities.
evaluate_accuracy_at_4(predictions=predictions, ground_truth=ground_truth)

0.05271574537941185