#### Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import nltk
import tqdm
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data] C:\Users\dan\AppData\Roaming\nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data] C:\Users\dan\AppData\Roaming\nltk_data...
[nltk_data] Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] C:\Users\dan\AppData\Roaming\nltk_data...
[nltk_data] Package averaged_perceptron_tagger is already up-to-
[nltk_data] date!
[nltk_data] Downloading package stopwords to
[nltk_data] C:\Users\dan\AppData\Roaming\nltk_data...
[nltk_data] Package stopwords is already up-to-date!


True

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [4]:
from collections import defaultdict

Set the random seed, in order to guarentee reproducability across runs (consistency of results)

In [5]:
#Set Random seed
np.random.seed(7)

#### Loading the corpus
10'000 reviews. \_\_label\_\_1 are negative reviews, \_\_label\_\_2 are positive reviews.

In [6]:
Corpus = pd.read_csv("corpus.csv",encoding='latin-1')

In [7]:
print(type(Corpus))
print(Corpus.shape)
print(Corpus[:10])

Corpus = Corpus[:1000]
print(Corpus.shape)


(10000, 2)
 text label
0 Stuning even for the non-gamer: This sound tr... __label__2 
1 The best soundtrack ever to anything.: I'm re... __label__2 
2 Amazing!: This soundtrack is my favorite musi... __label__2 
3 Excellent Soundtrack: I truly like this sound... __label__2 
4 Remember, Pull Your Jaw Off The Floor After H... __label__2 
5 an absolute masterpiece: I am quite sure any ... __label__2 
6 Buyer beware: This is a self-published book, ... __label__1 
7 Glorious story: I loved Whisper of the wicked... __label__2 
8 A FIVE STAR BOOK: I just finished reading Whi... __label__2 
9 Whispers of the Wicked Saints: This was a eas... __label__2 
(1000, 2)


#### Data Pre-processing
This will help in getting better results through the classification algorithms

In [8]:
# Step - 1a : Remove blank rows if any.
print(Corpus.shape)
Corpus['text'].dropna(inplace=True)
print(Corpus.shape)

(1000, 2)
(1000, 2)


In [9]:
# Step - 1b : Change all the text to lower case. This is a normal procedure as it helps "normalizing" the text.
Corpus['text'] = [entry.lower() for entry in Corpus['text']]

In [10]:
# Step - 1c : Tokenization : Each sample (text chunk) from the corpus is broken into a set of words
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]

In [11]:
# Step - 1d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. 
#By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [12]:
first_sample = Corpus['text'][0]
print(first_sample)

['stuning', 'even', 'for', 'the', 'non-gamer', ':', 'this', 'sound', 'track', 'was', 'beautiful', '!', 'it', 'paints', 'the', 'senery', 'in', 'your', 'mind', 'so', 'well', 'i', 'would', 'recomend', 'it', 'even', 'to', 'people', 'who', 'hate', 'video', 'game', 'music', '!', 'i', 'have', 'played', 'the', 'game', 'chrono', 'cross', 'but', 'out', 'of', 'all', 'of', 'the', 'games', 'i', 'have', 'ever', 'played', 'it', 'has', 'the', 'best', 'music', '!', 'it', 'backs', 'away', 'from', 'crude', 'keyboarding', 'and', 'takes', 'a', 'fresher', 'step', 'with', 'grate', 'guitars', 'and', 'soulful', 'orchestras', '.', 'it', 'would', 'impress', 'anyone', 'who', 'cares', 'to', 'listen', '!', '^_^']


In [13]:
Final_words=[]
# Initializing WordNetLemmatizer()
word_Lemmatized = WordNetLemmatizer()
# pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
pos_tag_result = pos_tag(first_sample)
print(pos_tag_result)
for word, tag in pos_tag_result:
 # Below condition is to check for Stop words and consider only alphabets
 if word not in stopwords.words('english') and word.isalpha():
 word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
 Final_words.append(word_Final)
# The final processed set of words for each iteration will be stored in 'text_final'
print('text_final',str(Final_words))

[('stuning', 'VBG'), ('even', 'RB'), ('for', 'IN'), ('the', 'DT'), ('non-gamer', 'JJ'), (':', ':'), ('this', 'DT'), ('sound', 'NN'), ('track', 'NN'), ('was', 'VBD'), ('beautiful', 'JJ'), ('!', '.'), ('it', 'PRP'), ('paints', 'VBZ'), ('the', 'DT'), ('senery', 'NN'), ('in', 'IN'), ('your', 'PRP$'), ('mind', 'NN'), ('so', 'RB'), ('well', 'RB'), ('i', 'VB'), ('would', 'MD'), ('recomend', 'VB'), ('it', 'PRP'), ('even', 'RB'), ('to', 'TO'), ('people', 'NNS'), ('who', 'WP'), ('hate', 'VBP'), ('video', 'NNS'), ('game', 'NN'), ('music', 'NN'), ('!', '.'), ('i', 'NN'), ('have', 'VBP'), ('played', 'VBN'), ('the', 'DT'), ('game', 'NN'), ('chrono', 'NN'), ('cross', 'NN'), ('but', 'CC'), ('out', 'IN'), ('of', 'IN'), ('all', 'DT'), ('of', 'IN'), ('the', 'DT'), ('games', 'NNS'), ('i', 'VBP'), ('have', 'VBP'), ('ever', 'RB'), ('played', 'VBN'), ('it', 'PRP'), ('has', 'VBZ'), ('the', 'DT'), ('best', 'JJS'), ('music', 'NN'), ('!', '.'), ('it', 'PRP'), ('backs', 'VBZ'), ('away', 'RB'), ('from', 'IN'), ('c

In [14]:
#Check if the dataset was already processed before, as it takes several minutes. This is a good practice.

In [15]:
import pickle
import os
import os.path
if os.path.isfile("processed_corpus.pickle"):
 with open('processed_corpus.pickle', 'rb') as f:
 Corpus = pickle.load(f)
else:
 for index,entry in enumerate(tqdm.tqdm(Corpus['text'])):
 # Declaring Empty List to store the words that follow the rules for this step
 Final_words = []
 # Initializing WordNetLemmatizer()
 word_Lemmatized = WordNetLemmatizer()
 # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
 for word, tag in pos_tag(entry):
 # Below condition is to check for Stop words and consider only alphabets
 if word not in stopwords.words('english') and word.isalpha():
 word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
 Final_words.append(word_Final)
 # The final processed set of words for each iteration will be stored in 'text_final'
 Corpus.loc[index,'text_final'] = str(Final_words)
 
 with open("processed_corpus.pickle", "wb") as f:
 pickle.dump(Corpus, f)

In [16]:
print(Corpus['text_final'].head())

0 ['stun', 'even', 'sound', 'track', 'beautiful'...
1 ['best', 'soundtrack', 'ever', 'anything', 're...
2 ['amaze', 'soundtrack', 'favorite', 'music', '...
3 ['excellent', 'soundtrack', 'truly', 'like', '...
4 ['remember', 'pull', 'jaw', 'floor', 'hear', '...
Name: text_final, dtype: object


In [17]:
print(Corpus[:5])

 text label \
0 [stuning, even, for, the, non-gamer, :, this, ... __label__2 
1 [the, best, soundtrack, ever, to, anything, .,... __label__2 
2 [amazing, !, :, this, soundtrack, is, my, favo... __label__2 
3 [excellent, soundtrack, :, i, truly, like, thi... __label__2 
4 [remember, ,, pull, your, jaw, off, the, floor... __label__2 

 text_final 
0 ['stun', 'even', 'sound', 'track', 'beautiful'... 
1 ['best', 'soundtrack', 'ever', 'anything', 're... 
2 ['amaze', 'soundtrack', 'favorite', 'music', '... 
3 ['excellent', 'soundtrack', 'truly', 'like', '... 
4 ['remember', 'pull', 'jaw', 'floor', 'hear', '... 


In [18]:
# Step - 2: Split the model into Train and Test Data set
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)

In [19]:
# Step - 3: Label encode the target variable
#This is done to transform Categorical data of string type in the data set into numerical values
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [20]:
# Step - 4: Vectorize the words by using TF-IDF Vectorizer
#This is done to find how important a word in document is in comaprison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [21]:
# Step - 5: Run different algorithms to classify our data and check their accuracy

# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score -> 81.33333333333333


In [22]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.LinearSVC()
SVM.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
distances = SVM.decision_function(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(list(zip(predictions_SVM,distances))[:5])

SVM Accuracy Score -> 80.0
[(1, 0.13771564364451283), (1, 0.6901297920139712), (0, -0.1981896618752993), (0, -0.4480398990108602), (0, -0.778919669893617)]


Code largely based on https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [23]:
Tfidf_vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
 dtype=, encoding='utf-8', input='content',
 lowercase=True, max_df=1.0, max_features=5000, min_df=1,
 ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 stop_words=None, strip_accents=None, sublinear_tf=False,
 token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
 vocabulary=None)

In [24]:
print(len(Tfidf_vect.get_feature_names()))
print(Tfidf_vect.get_feature_names()[:10])

5000
['aa', 'aaa', 'ab', 'abandon', 'abbreviate', 'abc', 'abdomen', 'abdominal', 'ability', 'abit']


In [25]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(Corpus['text_final'])

In [26]:
X

<1000x5000 sparse matrix of type ''
	with 31808 stored elements in Compressed Sparse Row format>

In [27]:
print(Tfidf_vect.get_stop_words())

None


In [28]:
print(SVM.coef_.shape)
print(type(SVM.coef_))

(1, 5000)



In [29]:
print(SVM.coef_)

[[-0.09050077 0. 0.10122429 ... 0.0352994 -0.07236617
 0. ]]


In [30]:
svm_bias = SVM.intercept_[0]
svm_coef = SVM.coef_.reshape (SVM.coef_.shape[1])
print(svm_coef.shape)
print(type(svm_coef))

(5000,)



In [31]:
test_str = "This is awesome. Very good movie and amazing soundtrack. Not like those old average movies from Tarantino."
test_str_Tfidf = Tfidf_vect.transform([test_str])
print(test_str_Tfidf)

 (0, 4147)	0.5199645135146538
 (0, 3804)	0.3850804174786943
 (0, 2758)	0.2432894135238121
 (0, 2654)	0.238784128756363
 (0, 2466)	0.16675247844565785
 (0, 1770)	0.17679709214511746
 (0, 424)	0.35403773308506453
 (0, 410)	0.3908483818512366
 (0, 184)	0.3577339835390647


In [32]:
test_str_Tfidf

<1x5000 sparse matrix of type ''
	with 9 stored elements in Compressed Sparse Row format>

In [33]:
test_str_Tfidf.__dict__

{'_shape': (1, 5000),
 'data': array([0.51996451, 0.38508042, 0.24328941, 0.23878413, 0.16675248,
 0.17679709, 0.35403773, 0.39084838, 0.35773398]),
 'indices': array([4147, 3804, 2758, 2654, 2466, 1770, 424, 410, 184], dtype=int32),
 'indptr': array([0, 9], dtype=int32),
 'maxprint': 50}

#### test_str tokens weighted with tf-idf

In [34]:
print({k:v for k,v in zip(test_str_Tfidf.indices, test_str_Tfidf.data)})

{4147: 0.5199645135146538, 3804: 0.3850804174786943, 2758: 0.2432894135238121, 2654: 0.238784128756363, 2466: 0.16675247844565785, 1770: 0.17679709214511746, 424: 0.35403773308506453, 410: 0.3908483818512366, 184: 0.3577339835390647}


In [35]:
print({Tfidf_vect.get_feature_names()[k]:v for k,v in zip(test_str_Tfidf.indices, test_str_Tfidf.data)})

{'tarantino': 0.5199645135146538, 'soundtrack': 0.3850804174786943, 'old': 0.2432894135238121, 'movie': 0.238784128756363, 'like': 0.16675247844565785, 'good': 0.17679709214511746, 'awesome': 0.35403773308506453, 'average': 0.3908483818512366, 'amazing': 0.3577339835390647}


#### all svm weights

In [36]:
all_svm_weights={Tfidf_vect.get_feature_names()[k]:svm_coef[k] for k in range(0, svm_coef.shape[0])}
all_nonzero_svm_weights={Tfidf_vect.get_feature_names()[k]:svm_coef[k] for k in range(0, svm_coef.shape[0]) if svm_coef[k]!=0.0}

#### svm weights for the test_str tokens

In [37]:
print({Tfidf_vect.get_feature_names()[k]:svm_coef[k] for k in test_str_Tfidf.indices})

{'tarantino': -0.041343894497876434, 'soundtrack': 0.21409584517162913, 'old': -0.44373803191538447, 'movie': -0.41199304223305466, 'like': -0.20699017901400116, 'good': 1.1069508669130126, 'awesome': 0.8209169581723464, 'average': 0.15341076571124157, 'amazing': 1.006800041750699}


In [38]:
test_str_raw_score_val = svm_bias + sum([svm_coef[k]*v for k,v in zip(test_str_Tfidf.indices, test_str_Tfidf.data)])
print(test_str_raw_score_val)

0.6308574307108017


In [39]:
test_str_prediction_SVM = SVM.predict(test_str_Tfidf)
test_str_distance = SVM.decision_function(test_str_Tfidf)
print(Encoder.classes_[test_str_prediction_SVM[0]])
print(test_str_distance[0])

__label__2 
0.6308574307108017


#### Normalizing SVM predications to [0,1]
https://www.csie.ntu.edu.tw/~cjlin/papers/plattprob.pdf

In [40]:
def sigmoid(x, k=1):
 return 1/(1+ math.exp(-k*x))

In [41]:
print(sigmoid(test_str_distance[0]))

0.6526838565743812
