{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Importing the libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import math" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to\n", "[nltk_data] C:\\Users\\dan\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] C:\\Users\\dan\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] C:\\Users\\dan\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", "[nltk_data] date!\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\dan\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from nltk import pos_tag\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from nltk.corpus import wordnet as wn\n", "import nltk\n", "import tqdm\n", "nltk.download('punkt')\n", "nltk.download('wordnet')\n", "nltk.download('averaged_perceptron_tagger')\n", "nltk.download('stopwords')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn import model_selection, naive_bayes, svm\n", "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from collections import defaultdict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set the random seed in order to guarantee reproducibility across runs (consistent results)." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Set the random seed\n", "np.random.seed(7)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Loading the corpus\n", "10'000 reviews. \\_\\_label\\_\\_1 are negative reviews, \\_\\_label\\_\\_2 are positive reviews." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "Corpus = pd.read_csv(\"corpus.csv\",encoding='latin-1')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "(10000, 2)\n", " text label\n", "0 Stuning even for the non-gamer: This sound tr... __label__2 \n", "1 The best soundtrack ever to anything.: I'm re... __label__2 \n", "2 Amazing!: This soundtrack is my favorite musi... __label__2 \n", "3 Excellent Soundtrack: I truly like this sound... __label__2 \n", "4 Remember, Pull Your Jaw Off The Floor After H... __label__2 \n", "5 an absolute masterpiece: I am quite sure any ... __label__2 \n", "6 Buyer beware: This is a self-published book, ... __label__1 \n", "7 Glorious story: I loved Whisper of the wicked... 
__label__2 \n", "8 A FIVE STAR BOOK: I just finished reading Whi... __label__2 \n", "9 Whispers of the Wicked Saints: This was a eas... __label__2 \n", "(1000, 2)\n" ] } ], "source": [ "print(type(Corpus))\n", "print(Corpus.shape)\n", "print(Corpus[:10])\n", "\n", "Corpus = Corpus[:1000]\n", "print(Corpus.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Data Pre-processing\n", "Cleaning and normalizing the text first helps the classification algorithms achieve better results." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1000, 2)\n", "(1000, 2)\n" ] } ], "source": [ "# Step - 1a : Remove blank rows, if any.\n", "print(Corpus.shape)\n", "Corpus.dropna(subset=['text'], inplace=True)\n", "print(Corpus.shape)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Step - 1b : Change all the text to lower case. This is a standard normalization step.\n", "Corpus['text'] = [entry.lower() for entry in Corpus['text']]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Step - 1c : Tokenization : each sample (text chunk) from the corpus is broken into a list of words\n", "Corpus['text'] = [word_tokenize(entry) for entry in Corpus['text']]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Step - 1d : Remove stop words and non-alphabetic tokens, and perform word lemmatization.\n", "\n", "# WordNetLemmatizer requires POS tags to know whether a word is a noun, verb, adjective, etc.\n", "# By default a word is treated as a noun\n", "tag_map = defaultdict(lambda : wn.NOUN)\n", "tag_map['J'] = wn.ADJ\n", "tag_map['V'] = wn.VERB\n", "tag_map['R'] = wn.ADV" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['stuning', 'even', 'for', 'the', 'non-gamer', ':', 'this', 'sound', 'track', 'was', 'beautiful', '!', 'it', 'paints', 'the', 'senery', 'in', 'your', 'mind', 'so', 'well', 'i', 'would', 'recomend', 'it', 'even', 'to', 'people', 'who', 'hate', 'video', 'game', 'music', '!', 'i', 'have', 'played', 'the', 'game', 'chrono', 'cross', 'but', 'out', 'of', 'all', 'of', 'the', 'games', 'i', 'have', 'ever', 'played', 'it', 'has', 'the', 'best', 'music', '!', 'it', 'backs', 'away', 'from', 'crude', 'keyboarding', 'and', 'takes', 'a', 'fresher', 'step', 'with', 'grate', 'guitars', 'and', 'soulful', 'orchestras', '.', 'it', 'would', 'impress', 'anyone', 'who', 'cares', 'to', 'listen', '!', '^_^']\n" ] } ], "source": [ "first_sample = Corpus['text'][0]\n", "print(first_sample)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('stuning', 'VBG'), ('even', 'RB'), ('for', 'IN'), ('the', 'DT'), ('non-gamer', 'JJ'), (':', ':'), ('this', 'DT'), ('sound', 'NN'), ('track', 'NN'), ('was', 'VBD'), ('beautiful', 'JJ'), ('!', '.'), ('it', 'PRP'), ('paints', 'VBZ'), ('the', 'DT'), ('senery', 'NN'), ('in', 'IN'), ('your', 'PRP$'), ('mind', 'NN'), ('so', 'RB'), ('well', 'RB'), ('i', 'VB'), ('would', 'MD'), ('recomend', 'VB'), ('it', 'PRP'), ('even', 'RB'), ('to', 'TO'), ('people', 'NNS'), ('who', 'WP'), ('hate', 'VBP'), ('video', 'NNS'), ('game', 'NN'), ('music', 'NN'), ('!', '.'), ('i', 'NN'), ('have', 'VBP'), ('played', 
'VBN'), ('the', 'DT'), ('game', 'NN'), ('chrono', 'NN'), ('cross', 'NN'), ('but', 'CC'), ('out', 'IN'), ('of', 'IN'), ('all', 'DT'), ('of', 'IN'), ('the', 'DT'), ('games', 'NNS'), ('i', 'VBP'), ('have', 'VBP'), ('ever', 'RB'), ('played', 'VBN'), ('it', 'PRP'), ('has', 'VBZ'), ('the', 'DT'), ('best', 'JJS'), ('music', 'NN'), ('!', '.'), ('it', 'PRP'), ('backs', 'VBZ'), ('away', 'RB'), ('from', 'IN'), ('crude', 'NN'), ('keyboarding', 'NN'), ('and', 'CC'), ('takes', 'VBZ'), ('a', 'DT'), ('fresher', 'JJ'), ('step', 'NN'), ('with', 'IN'), ('grate', 'JJ'), ('guitars', 'NNS'), ('and', 'CC'), ('soulful', 'JJ'), ('orchestras', 'NNS'), ('.', '.'), ('it', 'PRP'), ('would', 'MD'), ('impress', 'VB'), ('anyone', 'NN'), ('who', 'WP'), ('cares', 'VBZ'), ('to', 'TO'), ('listen', 'VB'), ('!', '.'), ('^_^', 'NN')]\n", "text_final ['stun', 'even', 'sound', 'track', 'beautiful', 'paint', 'senery', 'mind', 'well', 'would', 'recomend', 'even', 'people', 'hate', 'video', 'game', 'music', 'play', 'game', 'chrono', 'cross', 'game', 'ever', 'play', 'best', 'music', 'back', 'away', 'crude', 'keyboarding', 'take', 'fresh', 'step', 'grate', 'guitar', 'soulful', 'orchestra', 'would', 'impress', 'anyone', 'care', 'listen']\n" ] } ], "source": [ "Final_words = []\n", "# Initialize the WordNetLemmatizer\n", "word_Lemmatized = WordNetLemmatizer()\n", "# pos_tag provides the 'tag', i.e. whether the word is a Noun (N), Verb (V) or something else.\n", "pos_tag_result = pos_tag(first_sample)\n", "print(pos_tag_result)\n", "for word, tag in pos_tag_result:\n", "    # Keep only alphabetic tokens that are not stop words\n", "    if word not in stopwords.words('english') and word.isalpha():\n", "        word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])\n", "        Final_words.append(word_Final)\n", "# The final processed list of words is stored in 'Final_words'\n", "print('text_final',str(Final_words))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Check whether the corpus has already been processed and cached, since the processing loop below takes several minutes. Caching intermediate results like this is good practice."
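, "\n", "\n", "# A minimal, illustrative sketch (an assumption, not part of the original pipeline):\n", "# the simple file-exists check in the next cell never invalidates the cache, so one\n", "# option is to key the pickle file name on a hash of the raw CSV. The helper name\n", "# 'corpus_cache_path' is hypothetical.\n", "import hashlib\n", "\n", "def corpus_cache_path(csv_path=\"corpus.csv\"):\n", "    with open(csv_path, \"rb\") as fh:\n", "        digest = hashlib.md5(fh.read()).hexdigest()\n", "    return \"processed_corpus_{}.pickle\".format(digest)"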
] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pickle\n", "import os\n", "import os.path\n", "if os.path.isfile(\"processed_corpus.pickle\"):\n", "    with open('processed_corpus.pickle', 'rb') as f:\n", "        Corpus = pickle.load(f)\n", "else:\n", "    for index,entry in enumerate(tqdm.tqdm(Corpus['text'])):\n", "        # Empty list to store the words that pass the filters for this entry\n", "        Final_words = []\n", "        # Initialize the WordNetLemmatizer\n", "        word_Lemmatized = WordNetLemmatizer()\n", "        # pos_tag provides the 'tag', i.e. whether the word is a Noun (N), Verb (V) or something else.\n", "        for word, tag in pos_tag(entry):\n", "            # Keep only alphabetic tokens that are not stop words\n", "            if word not in stopwords.words('english') and word.isalpha():\n", "                word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])\n", "                Final_words.append(word_Final)\n", "        # The final processed list of words for each entry is stored in 'text_final'\n", "        Corpus.loc[index,'text_final'] = str(Final_words)\n", "\n", "    with open(\"processed_corpus.pickle\", \"wb\") as f:\n", "        pickle.dump(Corpus, f)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 ['stun', 'even', 'sound', 'track', 'beautiful'...\n", "1 ['best', 'soundtrack', 'ever', 'anything', 're...\n", "2 ['amaze', 'soundtrack', 'favorite', 'music', '...\n", "3 ['excellent', 'soundtrack', 'truly', 'like', '...\n", "4 ['remember', 'pull', 'jaw', 'floor', 'hear', '...\n", "Name: text_final, dtype: object\n" ] } ], "source": [ "print(Corpus['text_final'].head())" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " text label \\\n", "0 [stuning, even, for, the, non-gamer, :, this, ... __label__2 \n", "1 [the, best, soundtrack, ever, to, anything, .,... __label__2 \n", "2 [amazing, !, :, this, soundtrack, is, my, favo... __label__2 \n", "3 [excellent, soundtrack, :, i, truly, like, thi... __label__2 \n", "4 [remember, ,, pull, your, jaw, off, the, floor... __label__2 \n", "\n", " text_final \n", "0 ['stun', 'even', 'sound', 'track', 'beautiful'... \n", "1 ['best', 'soundtrack', 'ever', 'anything', 're... \n", "2 ['amaze', 'soundtrack', 'favorite', 'music', '... \n", "3 ['excellent', 'soundtrack', 'truly', 'like', '... \n", "4 ['remember', 'pull', 'jaw', 'floor', 'hear', '... 
\n" ] } ], "source": [ "print(Corpus[:5])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Step - 2: Split the corpus into train and test data sets\n", "Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Step - 3: Label encode the target variable\n", "# This transforms the string-type categorical labels in the data set into numerical values\n", "Encoder = LabelEncoder()\n", "Train_Y = Encoder.fit_transform(Train_Y)\n", "# Reuse the encoding fitted on the training labels for the test labels\n", "Test_Y = Encoder.transform(Test_Y)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Step - 4: Vectorize the words by using TF-IDF Vectorizer\n", "# This measures how important a word is to a document in comparison to the whole corpus\n", "Tfidf_vect = TfidfVectorizer(max_features=5000)\n", "Tfidf_vect.fit(Corpus['text_final'])\n", "\n", "Train_X_Tfidf = Tfidf_vect.transform(Train_X)\n", "Test_X_Tfidf = Tfidf_vect.transform(Test_X)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Naive Bayes Accuracy Score -> 81.33333333333333\n" ] } ], "source": [ "# Step - 5: Run different algorithms to classify our data and check their accuracy\n", "\n", "# Classifier - Algorithm - Naive Bayes\n", "# Fit the classifier on the training dataset\n", "Naive = naive_bayes.MultinomialNB()\n", "Naive.fit(Train_X_Tfidf,Train_Y)\n", "\n", "# Predict the labels on the validation dataset\n", "predictions_NB = Naive.predict(Test_X_Tfidf)\n", "\n", "# Use the accuracy_score function to get the accuracy\n", "print(\"Naive Bayes Accuracy Score -> \",accuracy_score(predictions_NB, Test_Y)*100)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SVM Accuracy Score -> 80.0\n", "[(1, 0.13771564364451283), (1, 0.6901297920139712), (0, -0.1981896618752993), (0, -0.4480398990108602), (0, -0.778919669893617)]\n" ] } ], "source": [ "# Classifier - Algorithm - SVM\n", "# Fit the classifier on the training dataset\n", "SVM = svm.LinearSVC()\n", "SVM.fit(Train_X_Tfidf,Train_Y)\n", "\n", "# Predict the labels on the validation dataset\n", "predictions_SVM = SVM.predict(Test_X_Tfidf)\n", "distances = SVM.decision_function(Test_X_Tfidf)\n", "\n", "# Use the accuracy_score function to get the accuracy\n", "print(\"SVM Accuracy Score -> \",accuracy_score(predictions_SVM, Test_Y)*100)\n", "print(list(zip(predictions_SVM,distances))[:5])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Code largely based on https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", " dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',\n", " lowercase=True, max_df=1.0, max_features=5000, min_df=1,\n", " ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,\n", " stop_words=None, strip_accents=None, sublinear_tf=False,\n", " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n", " vocabulary=None)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Tfidf_vect" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5000\n", "['aa', 'aaa', 'ab', 'abandon', 'abbreviate', 'abc', 'abdomen', 'abdominal', 'ability', 'abit']\n" ] } ], "source": [ "print(len(Tfidf_vect.get_feature_names()))\n", "print(Tfidf_vect.get_feature_names()[:10])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "vectorizer = TfidfVectorizer(max_features=5000)\n", "X = vectorizer.fit_transform(Corpus['text_final'])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "<1000x5000 sparse matrix of type '<class 'numpy.float64'>'\n", "\twith 31808 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "None\n" ] } ], "source": [ "print(Tfidf_vect.get_stop_words())" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1, 5000)\n", "<class 'numpy.ndarray'>\n" ] } ], "source": [ "print(SVM.coef_.shape)\n", "print(type(SVM.coef_))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[-0.09050077 0. 0.10122429 ... 0.0352994 -0.07236617\n", " 0. ]]\n" ] } ], "source": [ "print(SVM.coef_)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(5000,)\n", "<class 'numpy.ndarray'>\n" ] } ], "source": [ "svm_bias = SVM.intercept_[0]\n", "svm_coef = SVM.coef_.reshape(SVM.coef_.shape[1])\n", "print(svm_coef.shape)\n", "print(type(svm_coef))" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " (0, 4147)\t0.5199645135146538\n", " (0, 3804)\t0.3850804174786943\n", " (0, 2758)\t0.2432894135238121\n", " (0, 2654)\t0.238784128756363\n", " (0, 2466)\t0.16675247844565785\n", " (0, 1770)\t0.17679709214511746\n", " (0, 424)\t0.35403773308506453\n", " (0, 410)\t0.3908483818512366\n", " (0, 184)\t0.3577339835390647\n" ] } ], "source": [ "test_str = \"This is awesome. Very good movie and amazing soundtrack. 
Not like those old average movies from Tarantino.\"\n", "test_str_Tfidf = Tfidf_vect.transform([test_str])\n", "print(test_str_Tfidf)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "<1x5000 sparse matrix of type '<class 'numpy.float64'>'\n", "\twith 9 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_str_Tfidf" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'_shape': (1, 5000),\n", " 'data': array([0.51996451, 0.38508042, 0.24328941, 0.23878413, 0.16675248,\n", " 0.17679709, 0.35403773, 0.39084838, 0.35773398]),\n", " 'indices': array([4147, 3804, 2758, 2654, 2466, 1770, 424, 410, 184], dtype=int32),\n", " 'indptr': array([0, 9], dtype=int32),\n", " 'maxprint': 50}" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_str_Tfidf.__dict__" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### test_str tokens weighted with TF-IDF" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{4147: 0.5199645135146538, 3804: 0.3850804174786943, 2758: 0.2432894135238121, 2654: 0.238784128756363, 2466: 0.16675247844565785, 1770: 0.17679709214511746, 424: 0.35403773308506453, 410: 0.3908483818512366, 184: 0.3577339835390647}\n" ] } ], "source": [ "print({k:v for k,v in zip(test_str_Tfidf.indices, test_str_Tfidf.data)})" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'tarantino': 0.5199645135146538, 'soundtrack': 0.3850804174786943, 'old': 0.2432894135238121, 'movie': 0.238784128756363, 'like': 0.16675247844565785, 'good': 0.17679709214511746, 'awesome': 0.35403773308506453, 'average': 0.3908483818512366, 'amazing': 0.3577339835390647}\n" ] } ], "source": [ "print({Tfidf_vect.get_feature_names()[k]:v for k,v in zip(test_str_Tfidf.indices, test_str_Tfidf.data)})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### All SVM weights" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ "all_svm_weights = {Tfidf_vect.get_feature_names()[k]:svm_coef[k] for k in range(0, svm_coef.shape[0])}\n", "all_nonzero_svm_weights = {Tfidf_vect.get_feature_names()[k]:svm_coef[k] for k in range(0, svm_coef.shape[0]) if svm_coef[k]!=0.0}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### SVM weights for the test_str tokens" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'tarantino': -0.041343894497876434, 'soundtrack': 0.21409584517162913, 'old': -0.44373803191538447, 'movie': -0.41199304223305466, 'like': -0.20699017901400116, 'good': 1.1069508669130126, 'awesome': 0.8209169581723464, 'average': 0.15341076571124157, 'amazing': 1.006800041750699}\n" ] } ], "source": [ "print({Tfidf_vect.get_feature_names()[k]:svm_coef[k] for k in test_str_Tfidf.indices})" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6308574307108017\n" ] } ], "source": [ "test_str_raw_score_val = svm_bias + sum([svm_coef[k]*v for k,v in 
zip(test_str_Tfidf.indices, test_str_Tfidf.data)])\n", "print(test_str_raw_score_val)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "__label__2 \n", "0.6308574307108017\n" ] } ], "source": [ "test_str_prediction_SVM = SVM.predict(test_str_Tfidf)\n", "test_str_distance = SVM.decision_function(test_str_Tfidf)\n", "print(Encoder.classes_[test_str_prediction_SVM[0]])\n", "print(test_str_distance[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Normalizing SVM predictions to [0,1]\n", "https://www.csie.ntu.edu.tw/~cjlin/papers/plattprob.pdf" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def sigmoid(x, k=1):\n", "    return 1/(1 + math.exp(-k*x))" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6526838565743812\n" ] } ], "source": [ "print(sigmoid(test_str_distance[0]))" ] }
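, { "cell_type": "markdown", "metadata": {}, "source": [ "The fixed sigmoid above squashes the decision value into (0, 1), but its slope is arbitrary. Platt scaling (the paper linked above) instead fits the sigmoid parameters on held-out data. The cell below is a minimal, illustrative sketch of doing this with scikit-learn's CalibratedClassifierCV; the variable names are assumptions and the cell was not executed as part of this run." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Illustrative sketch (assumption, not the original method): Platt-scale the linear SVM via cross-validation\n", "from sklearn.calibration import CalibratedClassifierCV\n", "\n", "calibrated_svm = CalibratedClassifierCV(svm.LinearSVC(), method='sigmoid', cv=5)\n", "calibrated_svm.fit(Train_X_Tfidf, Train_Y)\n", "\n", "# Probability of the positive class (__label__2) for the test sentence\n", "test_str_proba = calibrated_svm.predict_proba(test_str_Tfidf)[0, 1]\n", "print(test_str_proba)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }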