{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import necessary dependencies"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import text_normalizer as tn\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load and normalize data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" review sentiment\n",
"0 One of the other reviewers has mentioned that ... positive\n",
"1 A wonderful little production.
The... positive\n",
"2 I thought this was a wonderful way to spend ti... positive\n",
"3 Basically there's a family where a little boy ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... positive\n"
]
}
],
"source": [
"dataset = pd.read_csv(r'movie_reviews.csv')\n",
"\n",
"# take a peek at the data\n",
"print(dataset.head())\n",
"reviews = np.array(dataset['review'])\n",
"sentiments = np.array(dataset['sentiment'])\n",
"\n",
"# build train and test datasets\n",
"train_reviews = reviews[:35000]\n",
"train_sentiments = sentiments[:35000]\n",
"test_reviews = reviews[35000:]\n",
"test_sentiments = sentiments[35000:]\n",
"\n",
"# normalize datasets\n",
"norm_train_reviews = tn.normalize_corpus(train_reviews)\n",
"norm_test_reviews = tn.normalize_corpus(test_reviews)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extract features from positive and negative reviews"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(25000, 331) (25000, 331)\n"
]
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# consolidate all normalized reviews\n",
"norm_reviews = norm_train_reviews+norm_test_reviews\n",
"# get tf-idf features for only positive reviews\n",
"positive_reviews = [review for review, sentiment in zip(norm_reviews, sentiments) if sentiment == 'positive']\n",
"ptvf = TfidfVectorizer(use_idf=True, min_df=0.05, max_df=0.95, ngram_range=(1,1), sublinear_tf=True)\n",
"ptvf_features = ptvf.fit_transform(positive_reviews)\n",
"# get tf-idf features for only negative reviews\n",
"negative_reviews = [review for review, sentiment in zip(norm_reviews, sentiments) if sentiment == 'negative']\n",
"ntvf = TfidfVectorizer(use_idf=True, min_df=0.05, max_df=0.95, ngram_range=(1,1), sublinear_tf=True)\n",
"ntvf_features = ntvf.fit_transform(negative_reviews)\n",
"# view feature set dimensions\n",
"print(ptvf_features.shape, ntvf_features.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Topic Modeling on Reviews"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pyLDAvis\n",
"import pyLDAvis.sklearn\n",
"from sklearn.decomposition import NMF\n",
"import topic_model_utils as tmu\n",
"\n",
"pyLDAvis.enable_notebook()\n",
"total_topics = 10"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Display and visualize topics for positive reviews"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Topic #1 without weights\n",
"['like', 'not', 'think', 'really', 'say', 'would', 'get', 'know', 'thing', 'much', 'bad', 'go', 'lot', 'could', 'even']\n",
"\n",
"Topic #2 without weights\n",
"['movie', 'see', 'watch', 'great', 'good', 'one', 'not', 'time', 'ever', 'enjoy', 'recommend', 'make', 'acting', 'like', 'first']\n",
"\n",
"Topic #3 without weights\n",
"['show', 'episode', 'series', 'tv', 'watch', 'dvd', 'first', 'see', 'time', 'one', 'good', 'year', 'remember', 'ever', 'would']\n",
"\n",
"Topic #4 without weights\n",
"['performance', 'role', 'play', 'actor', 'cast', 'good', 'well', 'great', 'character', 'excellent', 'give', 'also', 'support', 'star', 'job']\n",
"\n",
"Topic #5 without weights\n",
"['man', 'young', 'old', 'two', 'get', 'year', 'woman', 'take', 'go', 'come', 'find', 'back', 'girl', 'father', 'friend']\n",
"\n",
"Topic #6 without weights\n",
"['film', 'see', 'one', 'scene', 'make', 'not', 'time', 'director', 'horror', 'music', 'many', 'cinema', 'release', 'work', 'use']\n",
"\n",
"Topic #7 without weights\n",
"['story', 'tell', 'character', 'true', 'book', 'well', 'line', 'base', 'interesting', 'end', 'simple', 'read', 'beautiful', 'main', 'different']\n",
"\n",
"Topic #8 without weights\n",
"['funny', 'comedy', 'laugh', 'humor', 'fun', 'moment', 'line', 'not', 'guy', 'get', 'make', 'lot', 'one', 'time', 'show']\n",
"\n",
"Topic #9 without weights\n",
"['life', 'world', 'people', 'us', 'real', 'live', 'human', 'war', 'many', 'show', 'not', 'way', 'no', 'make', 'feel']\n",
"\n",
"Topic #10 without weights\n",
"['love', 'fall', 'song', 'wonderful', 'beautiful', 'music', 'heart', 'girl', 'would', 'watch', 'great', 'favorite', 'always', 'family', 'woman']\n",
"\n"
]
}
],
"source": [
"# build topic model on positive sentiment review features\n",
"pos_nmf = NMF(n_components=total_topics, \n",
" random_state=42, alpha=0.1, l1_ratio=0.2)\n",
"pos_nmf.fit(ptvf_features) \n",
"# extract features and component weights\n",
"pos_feature_names = ptvf.get_feature_names()\n",
"pos_weights = pos_nmf.components_\n",
"# extract and display topics and their components\n",
"pos_topics = tmu.get_topics_terms_weights(pos_weights, pos_feature_names)\n",
"tmu.print_topics_udf(topics=pos_topics,\n",
" total_topics=total_topics,\n",
" num_terms=15,\n",
" display_weights=False)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"\n",
"\n",
"