{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import necessary dependencies"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import text_normalizer as tn"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load and normalize data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" review sentiment\n",
"0 One of the other reviewers has mentioned that ... positive\n",
"1 A wonderful little production.
The... positive\n",
"2 I thought this was a wonderful way to spend ti... positive\n",
"3 Basically there's a family where a little boy ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... positive\n"
]
}
],
"source": [
"dataset = pd.read_csv(r'movie_reviews.csv')\n",
"\n",
"# take a peek at the data\n",
"print(dataset.head())\n",
"reviews = np.array(dataset['review'])\n",
"sentiments = np.array(dataset['sentiment'])\n",
"\n",
"# build train and test datasets\n",
"train_reviews = reviews[:35000]\n",
"train_sentiments = sentiments[:35000]\n",
"test_reviews = reviews[35000:]\n",
"test_sentiments = sentiments[35000:]\n",
"\n",
"# normalize datasets\n",
"norm_train_reviews = tn.normalize_corpus(train_reviews)\n",
"norm_test_reviews = tn.normalize_corpus(test_reviews)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Build Text Classification Pipeline with The Best Model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.pipeline import make_pipeline\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"# build BOW features on train reviews\n",
"cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))\n",
"cv_train_features = cv.fit_transform(norm_train_reviews)\n",
"# build Logistic Regression model\n",
"lr = LogisticRegression()\n",
"lr.fit(cv_train_features, train_sentiments)\n",
"\n",
"# Build Text Classification Pipeline\n",
"lr_pipeline = make_pipeline(cv, lr)\n",
"\n",
"# save the list of prediction classes (positive, negative)\n",
"classes = list(lr_pipeline.classes_)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# Analyze Model Prediction Probabilities"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['positive', 'negative'], dtype=object)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lr_pipeline.predict(['the lord of the rings is an excellent movie', \n",
" 'i hated the recent movie on tv, it was so bad'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n", " | negative | \n", "positive | \n", "
---|---|---|
0 | \n", "0.169653 | \n", "0.830347 | \n", "
1 | \n", "0.730814 | \n", "0.269186 | \n", "