# Train a Model to Detect Sentiment from Trip Reports

In this example, we have trip reports from customer engagements stored in Ceph. In order to detect the sentiment of future trips, we use the historical data to train our models. Over time, the accuracy of the models will improve as more data is stored in Ceph.

The models are also stored back in Ceph for use by other execution environments.

### Install Machine Learning libraries

In [None]:
!pip install sklearn
!pip install tensorflow
!pip install keras
!pip install pandas
!pip install boto3
!pip install matplotlib
!pip install seaborn

import pyspark

import re

import pandas as pd
import matplotlib.pyplot as plt

### Access the data using Spark

In [None]:
# Set the Spark configuration.
# This points to a local Spark instance running in stand-alone mode on the notebook.
conf = pyspark.SparkConf().setAppName('Sentiment Analysis').setMaster('local[*]')
sc = pyspark.SparkContext.getOrCreate(conf)

# Ceph Object Storage (S3-compatible) connection settings.
# These three variables are the single source of truth: they configure the
# Spark S3A connector below and are used again by the boto3 upload at the end
# of the notebook, so the endpoint must be a complete URL.
# NOTE(review): the credentials are hard-coded in the notebook; replace them
# with the values for your own Ceph user and avoid committing real secrets.
accessKey = 'S3user1'
secretKey = 'S3user1key'
endpointUrl = 'http://10.0.1.111'

# Set the S3 configurations to access Ceph Object Storage
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", accessKey)
hadoop_conf.set("fs.s3a.secret.key", secretKey)
hadoop_conf.set("fs.s3a.endpoint", endpointUrl)

# Get the SQL context
sqlContext = pyspark.SQLContext(sc)

# Read the tab-separated trip reports (first line is the header) from the SENTIMENT bucket.
feedbackFile = sqlContext.read.option("sep", "\t").csv("s3a://SENTIMENT/data/trip_report.tsv", header=True)

#### IMPORTANT: If you run the above step with incorrect Ceph parameters, you must restart the kernel before corrected parameters take effect.
This can be done by going to Kernel in the menu and selecting 'Restart'.

### Convert the data to a Pandas data frame

In [None]:
# Collect the Spark DataFrame that was read from Ceph into a local pandas DataFrame.
df = feedbackFile.toPandas()
# The data now lives in pandas, so the Spark context is stopped only after the collect.
sc.stop()

# Display the first rows of the data frame as the cell output.
df.head()

### Visualize the data

#### Types of trip outcomes by field representative

In [None]:
import numpy as np
# Seed NumPy's global random generator with a fixed value derived from the string "categorical".
np.random.seed(sum(map(ord, "categorical")))

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)

# Numeric code for each trip outcome label.
outcome_dict = {'Successful':0,'Partial Success':1,'Unsuccessful':2 }

# Take an explicit copy: adding a column to a bare column-selection of `df`
# triggers pandas' SettingWithCopyWarning and may not write where expected.
df_vis = df[['Your Name', 'Outcome']].copy()
df_vis['outcome_numeric'] = df_vis['Outcome'].apply(lambda a:outcome_dict[a])

# Count the reports per (field representative, outcome) pair.
outcome_cross_table = pd.crosstab(index=df_vis["Your Name"],
                                  columns=df_vis["Outcome"])

# One stacked bar per representative, split by outcome.
outcome_cross_table.plot(kind="bar",
                         figsize=(16,12),
                         stacked=True,fontsize=12)
plt.show()

#### Types of outcomes by event type

In [None]:
# Plot settings for the stacked bar chart.
bar_options = dict(kind="bar", stacked=True, figsize=(16, 12), fontsize=12)

# Count the reports per (primary audience, outcome) pair.
event_type_cross_table = pd.crosstab(df["Primary Audience Engaged"], df["Outcome"])

# One stacked bar per audience, split by outcome.
event_type_cross_table.plot(**bar_options)
plt.show()

### Now convert "Highlights" data to prepare for training the model

In [None]:
# Cast every "Highlights" value to str so the later text steps (lower-casing,
# tokenizing) can treat the whole column as text.
df['Highlights'] = df['Highlights'].astype(str)

# Preview the first 20 highlight/outcome pairs as the cell output.
df[['Highlights','Outcome']].head(20)

In [None]:
df_outcome = df[['Highlights','Outcome']]

# Widen pandas' display limits so long highlight texts are shown.
# ('display.height' is intentionally not set: that option was removed from
# pandas in favour of 'display.max_rows' and setting it raises an error.)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# One row per outcome: all highlight texts of that outcome joined by spaces.
grouped_highlights = pd.DataFrame(df_outcome.groupby('Outcome')['Highlights'].apply(' '.join))

# Turn the group label (currently the index) into a regular 'Outcome' column.
# Index.get_values() was removed in pandas 1.0; tolist() yields the same labels.
grouped_highlights['Outcome'] = grouped_highlights.index.tolist()
grouped_highlights.reset_index(drop=True, inplace=True)

grouped_highlights['Highlights'] = grouped_highlights['Highlights'].astype(str)

# Lower-case the highlight text in the main data frame
# (grouped_highlights above was built from the text before lower-casing).
df['Highlights'] = df['Highlights'].str.lower()

# Split the reports by outcome label.
df_success = df[df['Outcome'] == 'Successful']
df_unsuccess = df[df['Outcome'] == 'Unsuccessful']
df_part_success = df[df['Outcome'] == 'Partial Success']

#### Import additional Machine Learning libraries

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

#### Separating train and test data. Taking successful and unsuccessful separately

In [None]:
def _split_holdout(frame, hold_out):
    """Split *frame* into (head, tail); tail is its last int(hold_out * len(frame)) rows.

    The cut position is computed explicitly because slicing with ``[:-k]`` /
    ``[-k:]`` inverts the split when ``k`` is 0 (empty head, everything in tail).
    """
    n_test = int(hold_out * len(frame))
    cut = len(frame) - n_test
    return frame.iloc[:cut], frame.iloc[cut:]


# Partial successes are treated as failures: both groups get the label 'Unsuccessful'.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_failure = pd.concat([df_part_success, df_unsuccess], ignore_index=True)

df_failure['Outcome'] = 'Unsuccessful'

# Fraction of each class that is held out for testing.
test_hold_out = 0.1

#### Success

success_train, success_test = _split_holdout(df_success, test_hold_out)

#### Failure

failure_train, failure_test = _split_holdout(df_failure, test_hold_out)

train = pd.concat([success_train, failure_train])
test = pd.concat([success_test, failure_test])

# Shuffle only the training rows, then tag every row with the split it belongs to.
train = train.sample(frac = 1)
train['type'] = "Train"
test['type'] = "Test"

# Append the test rows after the training rows so that, once the index is
# reset, the test rows occupy one contiguous range at the end of the frame.
train = pd.concat([train, test])

train.reset_index(drop=True,inplace=True)

# One-hot encode the outcome labels (one column per distinct label).
Y = pd.get_dummies(train['Outcome']).values

# Row positions of the test rows inside the combined frame.
test_index_list = list(train[train['type'] == 'Test'].index)

test_index_list

### Use the HIGHLIGHTS field for sentiment analysis

__max_features__ = Vocabulary size; it is a hyperparameter. <br>
The Tokenizer creates vectors from text. It works like a dictionary that assigns an id to each word in the total vocabulary, and it returns a list of integers in which every integer acts as an index. <br>


In [None]:
# Vocabulary size passed to the tokenizer and reused as the embedding input dimension.
max_fatures = 10000

# `num_words` is the current name of the argument; `nb_words` is its deprecated Keras 1 alias.
tokenizer = Tokenizer(num_words=max_fatures, split=' ')

# NOTE(review): at this point `train` also contains the held-out test rows
# (they were appended after the training rows), so the tokenizer is fitted on
# train and test text together -- confirm this is intended.
tokenizer.fit_on_texts(train['Highlights'].values)

# Convert every highlight text into a sequence of word ids, then pad the
# sequences into one 2-D array.
X_highlights = tokenizer.texts_to_sequences(train['Highlights'].values)
X_highlights = pad_sequences(X_highlights)


#### Creating the network layer by layer
The network starts with a word-embedding layer, feeds its output into an LSTM-based RNN, and ends with a dense layer using softmax activation, because the outcome is categorical.

In [None]:
# Imported here because the notebook's keras.layers import only brings in
# Dense, Embedding and LSTM.
from keras.layers import SpatialDropout1D

embed_dim = 128   # size of each word-embedding vector
lstm_out = 196    # number of LSTM units

model = Sequential()
model.add(Embedding(max_fatures, embed_dim, input_length=X_highlights.shape[1]))
# Keras 2 removed the `dropout` argument of Embedding; a SpatialDropout1D layer
# directly after the embedding is the documented replacement. It is only
# regularisation: the learning layers are still Embedding -> LSTM -> Dense.
model.add(SpatialDropout1D(0.05))
# Keras 2 names: `dropout` (formerly dropout_W) and `recurrent_dropout` (formerly dropout_U).
model.add(LSTM(lstm_out, dropout=0.1, recurrent_dropout=0.1))
# Two output units with softmax, matching the two one-hot outcome columns in Y.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

#### Separating train and test data

In [None]:
# The test rows were appended after the training rows, so the first test row
# position marks the boundary between the two splits.
first_test_row = test_index_list[0]

# Everything before the boundary is training data ...
X_highlights_train, Y_highlights_train = X_highlights[:first_test_row], Y[:first_test_row]

# ... and everything from the boundary onwards is test data.
X_highlights_test, Y_highlights_test = X_highlights[first_test_row:], Y[first_test_row:]

#### Running the model
The batch size and the number of epochs can be tuned to optimise training.

In [None]:
# Mini-batch size; the same value is reused when the model is evaluated on the test data.
batch_size = 20
# `epochs` is the Keras 2 name of the former `nb_epoch` argument.
model.fit(X_highlights_train, Y_highlights_train, epochs=10, batch_size=batch_size, verbose=2)

#### Printing test data accuracy

In [None]:
# Evaluate on the held-out rows; the model was compiled with a single metric
# (accuracy), so evaluate() yields the loss followed by the accuracy.
evaluation = model.evaluate(
    X_highlights_test,
    Y_highlights_test,
    batch_size=batch_size,
    verbose=2,
)
score, accuracy = evaluation

print("score: %.2f" % score)
print("accuracy: %.2f" % accuracy)

### Save the model, tokenizer and feature dimension and store them in Ceph

In [None]:
import pickle

# Persist the trained network to the local path that the upload step reads.
model.save("./model")

# Padded sequence length the model was trained on.
feature_dimension = X_highlights_train.shape[1]

# Pickle the fitted tokenizer and the feature dimension next to the model.
for pickle_path, payload in (
    ('./tokenizer.pickle', tokenizer),
    ('./feature_dimension.pickle', feature_dimension),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Save models to S3

In [None]:
import boto3

# Create an S3 session for uploading the saved artefacts to Ceph.
# accessKey, secretKey and endpointUrl are defined in the Spark configuration
# cell earlier in this notebook; endpointUrl must be a complete URL.
session = boto3.Session(
    aws_access_key_id=accessKey,
    aws_secret_access_key=secretKey
)

# NOTE(review): verify=False turns off TLS certificate verification; only keep
# it for plain-http or self-signed test endpoints.
s3 = session.resource('s3', endpoint_url=endpointUrl, verify=False)

# (local file, object key) pairs: the model, the tokenizer and the feature dimension.
uploads = (
    ('./model', 'models/trip_report_model'),
    ('./tokenizer.pickle', 'models/trip_report_tokenizer.pickle'),
    ('./feature_dimension.pickle', 'models/trip_report_feature_dimension.pickle'),
)

# Upload each file to the SENTIMENT bucket.
for local_path, object_key in uploads:
    s3.meta.client.upload_file(local_path, 'SENTIMENT', object_key)

### The model, tokenizer and feature dimension have been saved to S3 (Ceph) as binary files and can be viewed there