{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Import necessary depencencies" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import text_normalizer as tn\n", "import model_evaluation_utils as meu\n", "\n", "np.set_printoptions(precision=2, linewidth=80)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load and normalize data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dataset = pd.read_csv(r'movie_reviews.csv')\n", "\n", "reviews = np.array(dataset['review'])\n", "sentiments = np.array(dataset['sentiment'])\n", "\n", "# extract data for model evaluation\n", "test_reviews = reviews[35000:]\n", "test_sentiments = sentiments[35000:]\n", "sample_review_ids = [7626, 3533, 13010]\n", "\n", "# normalize dataset\n", "norm_test_reviews = tn.normalize_corpus(test_reviews)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# Sentiment Analysis with AFINN" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from afinn import Afinn\n", "\n", "afn = Afinn(emoticons=True) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict sentiment for sample reviews" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "REVIEW: no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!\n", "Actual Sentiment: negative\n", "Predicted Sentiment polarity: -7.0\n", "------------------------------------------------------------\n", "REVIEW: I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one.\n", "Actual Sentiment: positive\n", "Predicted Sentiment polarity: 3.0\n", "------------------------------------------------------------\n", "REVIEW: Worst horror film ever but funniest film ever rolled in one you have got to see this film it is so cheap it is unbeliaveble but you have to see it really!!!! 
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict sentiment for sample reviews" ] },
 { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
  "REVIEW: no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!\n",
  "Actual Sentiment: negative\n",
  "Predicted Sentiment polarity: -7.0\n",
  "------------------------------------------------------------\n",
  "REVIEW: I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one.\n",
  "Actual Sentiment: positive\n",
  "Predicted Sentiment polarity: 3.0\n",
  "------------------------------------------------------------\n",
  "REVIEW: Worst horror film ever but funniest film ever rolled in one you have got to see this film it is so cheap it is unbeliaveble but you have to see it really!!!! \n",
  "P.s watch the carrot\n",
  "Actual Sentiment: positive\n",
  "Predicted Sentiment polarity: -3.0\n",
  "------------------------------------------------------------\n" ] } ], "source": [
  "for review, sentiment in zip(test_reviews[sample_review_ids], test_sentiments[sample_review_ids]):\n",
  "    print('REVIEW:', review)\n",
  "    print('Actual Sentiment:', sentiment)\n",
  "    print('Predicted Sentiment polarity:', afn.score(review))\n",
  "    print('-'*60)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict sentiment for test dataset" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
  "sentiment_polarity = [afn.score(review) for review in test_reviews]\n",
  "predicted_sentiments = ['positive' if score >= 1.0 else 'negative' for score in sentiment_polarity]" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluate model performance" ] },
 { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
  "Model Performance metrics:\n",
  "------------------------------\n",
  "Accuracy: 0.71\n",
  "Precision: 0.73\n",
  "Recall: 0.71\n",
  "F1 Score: 0.71\n",
  "\n",
  "Model Classification report:\n",
  "------------------------------\n",
  "             precision    recall  f1-score   support\n",
  "\n",
  "   positive       0.67      0.85      0.75      7510\n",
  "   negative       0.79      0.57      0.67      7490\n",
  "\n",
  "avg / total       0.73      0.71      0.71     15000\n",
  "\n",
  "\n",
  "Prediction Confusion Matrix:\n",
  "------------------------------\n",
  "                 Predicted:         \n",
  "                   positive negative\n",
  "Actual: positive       6376     1134\n",
  "        negative       3189     4301\n" ] } ], "source": [
  "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predicted_sentiments,\n",
  "                                      classes=['positive', 'negative'])" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "# Sentiment Analysis with SentiWordNet" ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
  "Positive Polarity Score: 0.875\n",
  "Negative Polarity Score: 0.125\n",
  "Objective Score: 0.0\n" ] } ], "source": [
  "from nltk.corpus import sentiwordnet as swn\n",
  "\n",
  "awesome = list(swn.senti_synsets('awesome', 'a'))[0]\n",
  "print('Positive Polarity Score:', awesome.pos_score())\n",
  "print('Negative Polarity Score:', awesome.neg_score())\n",
  "print('Objective Score:', awesome.obj_score())" ] },
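 { "cell_type": "markdown", "metadata": {}, "source": [ "A word usually maps to several senti-synsets, one per WordNet sense, and the scores can differ sharply between senses; taking the first (most frequent) sense, as above and in the model below, is a common simplification. A small illustrative sketch (the exact sense inventory depends on your WordNet/SentiWordNet data):" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# a word may have several senses, each with its own polarity scores;\n",
  "# the lexicon model below simply takes the first (most frequent) sense\n",
  "for ss in list(swn.senti_synsets('good', 'a'))[:3]:\n",
  "    print(ss.synset.name(), ss.pos_score(), ss.neg_score(), ss.obj_score())" ] },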
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Build model" ] },
 { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [
  "def analyze_sentiment_sentiwordnet_lexicon(review,\n",
  "                                           verbose=False):\n",
  "\n",
  "    # tokenize and POS tag text tokens\n",
  "    tagged_text = [(token.text, token.tag_) for token in tn.nlp(review)]\n",
  "    pos_score = neg_score = token_count = obj_score = 0\n",
  "    # get wordnet synsets based on POS tags\n",
  "    # get sentiment scores if synsets are found\n",
  "    for word, tag in tagged_text:\n",
  "        ss_set = None\n",
  "        if 'NN' in tag and list(swn.senti_synsets(word, 'n')):\n",
  "            ss_set = list(swn.senti_synsets(word, 'n'))[0]\n",
  "        elif 'VB' in tag and list(swn.senti_synsets(word, 'v')):\n",
  "            ss_set = list(swn.senti_synsets(word, 'v'))[0]\n",
  "        elif 'JJ' in tag and list(swn.senti_synsets(word, 'a')):\n",
  "            ss_set = list(swn.senti_synsets(word, 'a'))[0]\n",
  "        elif 'RB' in tag and list(swn.senti_synsets(word, 'r')):\n",
  "            ss_set = list(swn.senti_synsets(word, 'r'))[0]\n",
  "        # if a senti-synset is found, accumulate its scores\n",
  "        if ss_set:\n",
  "            pos_score += ss_set.pos_score()\n",
  "            neg_score += ss_set.neg_score()\n",
  "            obj_score += ss_set.obj_score()\n",
  "            token_count += 1\n",
  "\n",
  "    # aggregate final scores (guard against reviews with no matched synsets,\n",
  "    # which would otherwise raise ZeroDivisionError)\n",
  "    token_count = max(token_count, 1)\n",
  "    final_score = pos_score - neg_score\n",
  "    norm_final_score = round(float(final_score) / token_count, 2)\n",
  "    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'\n",
  "    if verbose:\n",
  "        norm_obj_score = round(float(obj_score) / token_count, 2)\n",
  "        norm_pos_score = round(float(pos_score) / token_count, 2)\n",
  "        norm_neg_score = round(float(neg_score) / token_count, 2)\n",
  "        # display results in a nice table\n",
  "        sentiment_frame = pd.DataFrame([[final_sentiment, norm_obj_score, norm_pos_score,\n",
  "                                         norm_neg_score, norm_final_score]],\n",
  "                                       columns=pd.MultiIndex(levels=[['SENTIMENT STATS:'],\n",
  "                                                                     ['Predicted Sentiment', 'Objectivity',\n",
  "                                                                      'Positive', 'Negative', 'Overall']],\n",
  "                                                             codes=[[0,0,0,0,0],[0,1,2,3,4]]))\n",
  "        print(sentiment_frame)\n",
  "\n",
  "    return final_sentiment" ] },
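 { "cell_type": "markdown", "metadata": {}, "source": [ "The chain of `if 'NN' in tag ...` checks above is really a Penn-Treebank-to-WordNet POS mapping. Purely as an illustration (the helper names `penn_to_wn` and `first_senti_synset` are ours, not part of the notebook's pipeline), the same dispatch can be written as a small lookup:" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# hypothetical helper: map a Penn Treebank tag prefix to a WordNet POS tag\n",
  "def penn_to_wn(tag):\n",
  "    for prefix, wn_pos in [('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r')]:\n",
  "        if prefix in tag:\n",
  "            return wn_pos\n",
  "    return None\n",
  "\n",
  "# first matching senti-synset for a (word, tag) pair, or None\n",
  "def first_senti_synset(word, tag):\n",
  "    wn_pos = penn_to_wn(tag)\n",
  "    synsets = list(swn.senti_synsets(word, wn_pos)) if wn_pos else []\n",
  "    return synsets[0] if synsets else None\n",
  "\n",
  "print(first_senti_synset('movie', 'NN'))" ] },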
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict sentiment for sample reviews" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
  "REVIEW: no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!\n",
  "Actual Sentiment: negative\n",
  "     SENTIMENT STATS:                                      \n",
  "  Predicted Sentiment Objectivity Positive Negative Overall\n",
  "0            negative        0.76     0.09     0.15   -0.06\n",
  "------------------------------------------------------------\n",
  "REVIEW: I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one.\n",
  "Actual Sentiment: positive\n",
  "     SENTIMENT STATS:                                      \n",
  "  Predicted Sentiment Objectivity Positive Negative Overall\n",
  "0            positive        0.74      0.2     0.06    0.14\n",
  "------------------------------------------------------------\n",
  "REVIEW: Worst horror film ever but funniest film ever rolled in one you have got to see this film it is so cheap it is unbeliaveble but you have to see it really!!!! \n",
  "P.s watch the carrot\n",
  "Actual Sentiment: positive\n",
  "     SENTIMENT STATS:                                      \n",
  "  Predicted Sentiment Objectivity Positive Negative Overall\n",
  "0            positive         0.8     0.14     0.07    0.07\n",
  "------------------------------------------------------------\n" ] } ], "source": [
  "for review, sentiment in zip(test_reviews[sample_review_ids], test_sentiments[sample_review_ids]):\n",
  "    print('REVIEW:', review)\n",
  "    print('Actual Sentiment:', sentiment)\n",
  "    pred = analyze_sentiment_sentiwordnet_lexicon(review, verbose=True)\n",
  "    print('-'*60)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict sentiment for test dataset" ] },
 { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
  "predicted_sentiments = [analyze_sentiment_sentiwordnet_lexicon(review, verbose=False) for review in norm_test_reviews]" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluate model performance" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
  "Model Performance metrics:\n",
  "------------------------------\n",
  "Accuracy: 0.69\n",
  "Precision: 0.69\n",
  "Recall: 0.69\n",
  "F1 Score: 0.68\n",
  "\n",
  "Model Classification report:\n",
  "------------------------------\n",
  "             precision    recall  f1-score   support\n",
  "\n",
  "   positive       0.66      0.76      0.71      7510\n",
  "   negative       0.72      0.61      0.66      7490\n",
  "\n",
  "avg / total       0.69      0.69      0.68     15000\n",
  "\n",
  "\n",
  "Prediction Confusion Matrix:\n",
  "------------------------------\n",
  "                 Predicted:         \n",
  "                   positive negative\n",
  "Actual: positive       5742     1768\n",
  "        negative       2932     4558\n" ] } ], "source": [
  "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predicted_sentiments,\n",
  "                                      classes=['positive', 'negative'])" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "# Sentiment Analysis with VADER" ] },
 { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
  "from nltk.sentiment.vader import SentimentIntensityAnalyzer" ] },
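 { "cell_type": "markdown", "metadata": {}, "source": [ "Unlike AFINN, VADER returns a dict of four scores: `neg`, `neu` and `pos` are the proportions of the text falling into each bucket, while `compound` is a normalized aggregate in [-1, 1]. A quick look at the raw output (illustrative):" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# polarity_scores returns proportions (neg/neu/pos) plus a\n",
  "# normalized 'compound' aggregate in [-1, 1]\n",
  "analyzer = SentimentIntensityAnalyzer()\n",
  "print(analyzer.polarity_scores('The movie was GREAT, I loved it!!!'))" ] },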
\"\n" ] } ], "source": [ "from nltk.sentiment.vader import SentimentIntensityAnalyzer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Build model" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def analyze_sentiment_vader_lexicon(review, \n", " threshold=0.1,\n", " verbose=False):\n", " # pre-process text\n", " review = tn.strip_html_tags(review)\n", " review = tn.remove_accented_chars(review)\n", " review = tn.expand_contractions(review)\n", " \n", " # analyze the sentiment for review\n", " analyzer = SentimentIntensityAnalyzer()\n", " scores = analyzer.polarity_scores(review)\n", " # get aggregate scores and final sentiment\n", " agg_score = scores['compound']\n", " final_sentiment = 'positive' if agg_score >= threshold\\\n", " else 'negative'\n", " if verbose:\n", " # display detailed sentiment statistics\n", " positive = str(round(scores['pos'], 2)*100)+'%'\n", " final = round(agg_score, 2)\n", " negative = str(round(scores['neg'], 2)*100)+'%'\n", " neutral = str(round(scores['neu'], 2)*100)+'%'\n", " sentiment_frame = pd.DataFrame([[final_sentiment, final, positive,\n", " negative, neutral]],\n", " columns=pd.MultiIndex(levels=[['SENTIMENT STATS:'], \n", " ['Predicted Sentiment', 'Polarity Score',\n", " 'Positive', 'Negative', 'Neutral']], \n", " labels=[[0,0,0,0,0],[0,1,2,3,4]]))\n", " print(sentiment_frame)\n", " \n", " return final_sentiment" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict sentiment for sample reviews" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "REVIEW: no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!\n", "Actual Sentiment: negative\n", " SENTIMENT STATS: \n", " Predicted Sentiment Polarity Score Positive Negative Neutral\n", "0 negative -0.8 0.0% 40.0% 60.0%\n", "------------------------------------------------------------\n", "REVIEW: I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one.\n", "Actual Sentiment: positive\n", " SENTIMENT STATS: \n", " Predicted Sentiment Polarity Score Positive Negative Neutral\n", "0 negative -0.16 16.0% 14.000000000000002% 69.0%\n", "------------------------------------------------------------\n", "REVIEW: Worst horror film ever but funniest film ever rolled in one you have got to see this film it is so cheap it is unbeliaveble but you have to see it really!!!! 
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict sentiment for sample reviews" ] },
 { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
  "REVIEW: no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!\n",
  "Actual Sentiment: negative\n",
  "     SENTIMENT STATS:                                          \n",
  "  Predicted Sentiment Polarity Score Positive Negative Neutral\n",
  "0            negative           -0.8     0.0%    40.0%   60.0%\n",
  "------------------------------------------------------------\n",
  "REVIEW: I don't care if some people voted this movie to be bad. If you want the Truth this is a Very Good Movie! It has every thing a movie should have. You really should Get this one.\n",
  "Actual Sentiment: positive\n",
  "     SENTIMENT STATS:                                          \n",
  "  Predicted Sentiment Polarity Score Positive Negative Neutral\n",
  "0            negative          -0.16    16.0%    14.0%   69.0%\n",
  "------------------------------------------------------------\n",
  "REVIEW: Worst horror film ever but funniest film ever rolled in one you have got to see this film it is so cheap it is unbeliaveble but you have to see it really!!!! \n",
  "P.s watch the carrot\n",
  "Actual Sentiment: positive\n",
  "     SENTIMENT STATS:                                          \n",
  "  Predicted Sentiment Polarity Score Positive Negative Neutral\n",
  "0            positive           0.49    11.0%    11.0%   77.0%\n",
  "------------------------------------------------------------\n" ] } ], "source": [
  "for review, sentiment in zip(test_reviews[sample_review_ids], test_sentiments[sample_review_ids]):\n",
  "    print('REVIEW:', review)\n",
  "    print('Actual Sentiment:', sentiment)\n",
  "    pred = analyze_sentiment_vader_lexicon(review, threshold=0.4, verbose=True)\n",
  "    print('-'*60)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict sentiment for test dataset" ] },
 { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
  "predicted_sentiments = [analyze_sentiment_vader_lexicon(review, threshold=0.4, verbose=False) for review in test_reviews]" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluate model performance" ] },
 { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
  "Model Performance metrics:\n",
  "------------------------------\n",
  "Accuracy: 0.71\n",
  "Precision: 0.72\n",
  "Recall: 0.71\n",
  "F1 Score: 0.71\n",
  "\n",
  "Model Classification report:\n",
  "------------------------------\n",
  "             precision    recall  f1-score   support\n",
  "\n",
  "   positive       0.67      0.83      0.74      7510\n",
  "   negative       0.78      0.59      0.67      7490\n",
  "\n",
  "avg / total       0.72      0.71      0.71     15000\n",
  "\n",
  "\n",
  "Prediction Confusion Matrix:\n",
  "------------------------------\n",
  "                 Predicted:         \n",
  "                   positive negative\n",
  "Actual: positive       6235     1275\n",
  "        negative       3068     4422\n" ] } ], "source": [
  "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predicted_sentiments,\n",
  "                                      classes=['positive', 'negative'])" ] }
 ],
 "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } },
 "nbformat": 4,
 "nbformat_minor": 1 }