# *Data Visualization and Statistics*

Gallery of Matplotlib examples: [https://matplotlib.org/gallery.html](https://matplotlib.org/gallery.html)

In [None]:
## First, let's import some packages.

import os
from pprint import pprint
from textblob import TextBlob

import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
# The line above tells Jupyter to display Matplotlib graphics within the notebook.

In [None]:
## Download sample text corpora from GitHub, then unzip.

os.chdir('/sharedfolder/')

!wget -N https://github.com/pcda17/pcda17.github.io/blob/master/week/8/Sample_corpora.zip?raw=true -O Sample_corpora.zip
!unzip -o Sample_corpora.zip

In [None]:
os.chdir('/sharedfolder/Sample_corpora')

os.listdir('./')

In [None]:
!ls Jane_Austen

In [None]:
!ls Herman_Melville

In [None]:
## Loading a Melville novel as a TextBlob object

melville_path = 'Herman_Melville/Moby_Dick.txt'

# Read inside a 'with' block so the file handle is closed as soon as the text is in memory.
with open(melville_path) as melville_file:
    melville_text = melville_file.read().replace('\n', ' ')  # newlines become spaces

melville_blob = TextBlob(melville_text)

In [None]:
## Loading an Austen novel as a TextBlob object

austen_path = 'Jane_Austen/Pride_and_Prejudice.txt'

# Read inside a 'with' block so the file handle is closed as soon as the text is in memory.
with open(austen_path) as austen_file:
    austen_text = austen_file.read().replace('\n', ' ')  # newlines become spaces

austen_blob = TextBlob(austen_text)

In [None]:
## Recall that 'some_textblob_object.words' is a WordList object ...

melville_blob.words[5100:5140]

In [None]:
# ... which we can cast to an ordinary list.

list(melville_blob.words[5100:5140])

In [None]:
## And 'some_textblob_object.sentences' is a list of Sentence objects ...

austen_blob.sentences[100:105]

In [None]:
# ... which we can convert to a list of strings using a list comprehension.

[str(item) for item in austen_blob.sentences[100:105]]

In [None]:
## For reference, here's another example of a list comprehension:

word_list = ['Call', 'me', 'Ishmael.']

uppercase_list = [word.upper() for word in word_list]

uppercase_list

In [None]:
## And one more for good measure:

string_nums = [str(i) for i in range(12)]

string_nums

### ▷ Sentiment analysis with TextBlob

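Details on the movie-review data used to train the NLTK classifier that TextBlob offers as its optional `NaiveBayesAnalyzer` (note that the default `.sentiment` analyzer used below, `PatternAnalyzer`, is lexicon-based rather than trained on this corpus):
[http://www.cs.cornell.edu/people/pabo/movie-review-data/](http://www.cs.cornell.edu/people/pabo/movie-review-data/)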

In [None]:
## Negative sentiment polarity example
# (result between -1 and +1)

# TextBlob is already imported in the first code cell, so it is not re-imported here.
text = "This is a very mean and nasty sentence."

blob = TextBlob(text)

sentiment_score = blob.sentiment.polarity

print(sentiment_score)

In [None]:
## Positive sentiment polarity example
# (result between -1 and +1)

text = "This is a very nice and positive sentence."

blob = TextBlob(text)

sentiment_score = blob.sentiment.polarity

print(sentiment_score)

In [None]:
## Neutral polarity / not enough information

text = "What is this?"

blob = TextBlob(text)

sentiment_score = blob.sentiment.polarity

print(sentiment_score)

In [None]:
## High subjectivity example
# result between 0 and 1

text="This is a very mean and nasty sentence."

blob = TextBlob(text)

sentiment_score = blob.sentiment.subjectivity

print(sentiment_score)

In [None]:
## Low subjectivity example
# result between 0 and 1

text="This sentence states a fact, with an apparently objective adjective."

blob = TextBlob(text)

sentiment_score=blob.sentiment.subjectivity

print(sentiment_score)

### ▷ Plotting Sentiment Values

Let's map sentiment polarity values across the course of a full novel.

In [None]:
## Viewing Pyplot style templates

pprint(plt.style.available)

In [None]:
## Selecting a Pyplot style

plt.style.use('ggplot')

# The 'ggplot' style imitates the R graphing package 'ggplot2.' (http://ggplot2.org)

In [None]:
austen_sentiments = [item.sentiment.polarity for item in austen_blob.sentences]

austen_sentiments[:15]

In [None]:
## Austen sentiment values for first 60 sentences

plt.figure(figsize=(18,8))
plt.plot(austen_sentiments[:60])

# Render the figure explicitly, as the other plotting cells do; this also keeps the
# list of Line2D objects returned by plt.plot() out of the cell output.
plt.show()

In [None]:
austen_blob.sentences[30]

In [None]:
austen_blob.sentences[37]

In [None]:
## Plotting 'Pride and Prejudice' sentence sentiment values over full novel

plt.figure(figsize=(18,8))

plt.plot(austen_sentiments)

plt.show()

In [None]:
## Finding the most 'positive' sentences in 'Pride and Prejudice' and printing them

max_sentiment = max(austen_sentiments)

print(max_sentiment)  # highest polarity score of any sentence
print()

# austen_sentiments holds one polarity score per sentence, in sentence order,
# so each sentence can be paired with its already-computed score.
for sentence, polarity in zip(austen_blob.sentences, austen_sentiments):
    if polarity == max_sentiment:
        print(sentence)
        print()

In [None]:
## Finding the most 'negative' sentences in 'Pride and Prejudice' and printing them

min_sentiment = min(austen_sentiments)

print(min_sentiment) # min sentiment polarity value
print()

# Print every sentence whose polarity equals the minimum (there may be several ties).
for sentence in austen_blob.sentences:
    if sentence.sentiment.polarity == min_sentiment:
        print(sentence)
        print()

In [None]:
## Example: smoothing a list of numbers using the 'pandas' package

some_values = [5, 4, 5, 6, 6, 7, 6, 19, 4, 4, 3, 3, 3, 1, 5, 5, 6, 7, 0]

pandas_series = pd.Series(some_values)

list(pandas_series.rolling(window=4).mean())

In [None]:
## Smoothing our data before plotting

austen_sentiments_pd = pd.Series(austen_sentiments)

austen_sentiments_smooth = austen_sentiments_pd.rolling(window=200).mean()

print(austen_sentiments_smooth[190:220])

In [None]:
## Plotting smoothed sentiment polarity values for each sentence in 'Pride and Prejudice'

plt.figure(figsize=(18,8))

plt.plot(austen_sentiments_smooth)

plt.show()

In [None]:
## Comparing 'Moby Dick' sentiment values

melville_sentiments = [item.sentiment.polarity for item in melville_blob.sentences]

melville_sentiments_pd = pd.Series(melville_sentiments)

melville_sentiments_smooth = melville_sentiments_pd.rolling(window=200).mean()

plt.figure(figsize=(18,8))

plt.plot(melville_sentiments_smooth)

plt.show()

In [None]:
## Finding and printing the most 'negative' sentence in a list of smoothed sentiment values

# The leading entries of the rolling mean are NaN (incomplete window). Series.min() and
# Series.idxmin() skip NaN by default, so no manual slicing or list conversion is needed.
min_sentiment = melville_sentiments_smooth.min()

print(min_sentiment) # min sentiment polarity value
print()

# Index of the first occurrence of the minimum. Each smoothed value averages the window of
# sentences ending at that index, so this is the last sentence of the most negative window.
min_sentiment_index = melville_sentiments_smooth.idxmin() # index position of the 'min_sentiment' value

print(melville_blob.sentences[min_sentiment_index])

In [None]:
## Finding and printing the most 'positive' sentence in a list of smoothed sentiment values

# The leading entries of the rolling mean are NaN (incomplete window). Series.max() and
# Series.idxmax() skip NaN by default, so no manual slicing or list conversion is needed.
max_sentiment = melville_sentiments_smooth.max()

print(max_sentiment) # max sentiment polarity value
print()

# Index of the first occurrence of the maximum. Each smoothed value averages the window of
# sentences ending at that index, so this is the last sentence of the most positive window.
max_sentiment_index = melville_sentiments_smooth.idxmax() # index position of the 'max_sentiment' value

print(melville_blob.sentences[max_sentiment_index])

In [None]:
## Finding and printing the most 'positive' sentence in a list of smoothed sentiment values

# The leading entries of the rolling mean are NaN (incomplete window). Series.max() and
# Series.idxmax() skip NaN by default, so no manual slicing or list conversion is needed.
max_sentiment = austen_sentiments_smooth.max()

print(max_sentiment) # max sentiment polarity value
print()

# Index of the first occurrence of the maximum. Each smoothed value averages the window of
# sentences ending at that index, so this is the last sentence of the most positive window.
max_sentiment_index = austen_sentiments_smooth.idxmax() # index position of the 'max_sentiment' value

print(austen_blob.sentences[max_sentiment_index])

In [None]:
## Finding and printing the most 'negative' sentence in a list of smoothed sentiment values

# The leading entries of the rolling mean are NaN (incomplete window). Series.min() and
# Series.idxmin() skip NaN by default, so no manual slicing or list conversion is needed.
min_sentiment = austen_sentiments_smooth.min()

print(min_sentiment) # min sentiment polarity value
print()

# Index of the first occurrence of the minimum. Each smoothed value averages the window of
# sentences ending at that index, so this is the last sentence of the most negative window.
min_sent_index = austen_sentiments_smooth.idxmin() # index position of the 'min_sentiment' value

print(austen_blob.sentences[min_sent_index])

In [None]:
## Creating a function to expedite the steps we put together above
# This function accepts an optional second argument for the smoothing window size.
# The default window is 200 sentences.

def plot_polarity(text_path, window=200):
    """Plot the rolling-mean sentiment polarity of the sentences in a text file.

    Args:
        text_path: Path to a plain-text file.
        window: Number of consecutive sentences averaged per plotted point (default 200).
    """
    # 'with' closes the file handle as soon as the text has been read.
    with open(text_path) as text_file:
        text_in = text_file.read().replace('\n', ' ')
    blob = TextBlob(text_in)
    sentiments = [sentence.sentiment.polarity for sentence in blob.sentences]
    sentiments_smooth = pd.Series(sentiments).rolling(window).mean()
    plt.figure(figsize=(18, 8))
    plt.plot(sentiments_smooth)
    # Label the axes so the figure can be read on its own.
    plt.xlabel('Sentence index')
    plt.ylabel('Sentiment polarity (rolling mean)')
    plt.show()

In [None]:
!find ./

In [None]:
plot_polarity('George_Eliot/Silas_Marner.txt')

In [None]:
plot_polarity('Joseph_Conrad/Heart_of_Darkness.txt')

### ▷ Plotting smoothed random data (for comparison)

In [None]:
## Plotting completely random data

# A seeded generator makes the figure identical on every "Restart & Run All".
rng = np.random.default_rng(42)
random_vals = rng.random(4000) # 4000 uniform samples from [0, 1)

vals_pd = pd.Series(random_vals)
vals_smooth = vals_pd.rolling(window=200).mean()

plt.figure(figsize=(18,8))
plt.plot(vals_smooth)
plt.show()

### ▷ Working with multiple files

In [None]:
!ls *

In [None]:
os.chdir('/sharedfolder/Sample_corpora/Inaugural_Speeches/')
sorted(os.listdir('./'))

In [None]:
# Keep only the '.txt' speech files so stray entries in the folder (hidden files,
# checkpoint directories) cannot break open() or end up in the list of filenames.
inaugural_filenames = sorted(name for name in os.listdir('./') if name.endswith('.txt'))

inaugural_sentiment_values = []

for filename in inaugural_filenames:
    # 'with' closes each file as soon as it has been read.
    with open(filename) as speech_file:
        inaugural_text = speech_file.read()
    # One polarity score for the whole speech.
    sentiment_polarity_value = TextBlob(inaugural_text).sentiment.polarity
    inaugural_sentiment_values.append(sentiment_polarity_value)

print(inaugural_sentiment_values)

In [None]:
## Creating nicely formatted labels for the sentiment values above

inaugural_labels = [item.replace('.txt','').replace('_', ' ').title() for item in inaugural_filenames]

inaugural_labels

In [None]:
## Plotting presidential inaugural address sentiment values over time

plt.figure(figsize=(20, 8))

# One tick per address: positions, then display labels, rotated by -85 degrees.
plt.xticks(
    range(len(inaugural_sentiment_values)),
    inaugural_labels,
    rotation=-85,
)

plt.ylabel('Sentiment Polarity Value')

plt.plot(inaugural_sentiment_values)

plt.show()

## ▷ Assignment

 For each author in our set of corpora, which is their most 'positive' novel? Their most 'negative'?

## ▷ Sentiment Histograms

In [None]:
os.chdir('/sharedfolder/Sample_corpora/')

In [None]:
# Read the novel inside a 'with' block so the file handle is closed once the text is loaded.
with open('Jane_Austen/Pride_and_Prejudice.txt') as novel_file:
    text_in = novel_file.read().replace('\n', ' ')

blob = TextBlob(text_in)
sentiments = [sentence.sentiment.polarity for sentence in blob.sentences]
plt.figure(figsize=(20,10))
plt.hist(sentiments, bins=25) # distribution of per-sentence polarity scores
plt.show()

In [None]:
# Read the novel inside a 'with' block so the file handle is closed once the text is loaded.
with open('Jane_Austen/Pride_and_Prejudice.txt') as novel_file:
    text_in = novel_file.read().replace('\n', ' ')

blob = TextBlob(text_in)
sentiments = [sentence.sentiment.subjectivity for sentence in blob.sentences]
plt.figure(figsize=(20,10))
plt.hist(sentiments, bins=25) # distribution of per-sentence subjectivity scores
plt.show()

## ▷ Cleaning sentiment values

In [None]:
# Read the novel inside a 'with' block so the file handle is closed once the text is loaded.
with open('Jane_Austen/Pride_and_Prejudice.txt') as novel_file:
    text_in = novel_file.read().replace('\n', ' ')

blob = TextBlob(text_in)
sentiments = [sentence.sentiment.polarity for sentence in blob.sentences]
# Drop scores of exactly 0 before plotting.
sentiments_cleaned = [value for value in sentiments if value!=0]
plt.figure(figsize=(20,10))
plt.hist(sentiments_cleaned, bins=25)
plt.show()

In [None]:
def polarity_histogram_cleaned(text_path, bins=25):
    """Plot a histogram of the non-zero sentence polarity scores in a text file.

    Args:
        text_path: Path to a plain-text file.
        bins: Number of histogram bins (default 25, the value previously hard-coded).
    """
    # 'with' closes the file handle as soon as the text has been read.
    with open(text_path) as text_file:
        text_in = text_file.read().replace('\n', ' ')
    blob = TextBlob(text_in)
    sentiments = [sentence.sentiment.polarity for sentence in blob.sentences]
    # Drop scores of exactly 0 before plotting.
    sentiments_cleaned = [value for value in sentiments if value != 0]
    plt.figure(figsize=(20, 10))
    plt.hist(sentiments_cleaned, bins=bins)
    plt.show()

In [None]:
!find ./

In [None]:
polarity_histogram_cleaned('./Joseph_Conrad/The_Secret_Agent.txt')

## ▷ Comparing Sentiment Distributions

In [None]:
# Load both novels; each file is closed as soon as its text has been read.
with open('Herman_Melville/Moby_Dick.txt') as melville_file:
    melville_blob = TextBlob(melville_file.read().replace('\n', ' '))
with open('Jane_Austen/Pride_and_Prejudice.txt') as austen_file:
    austen_blob = TextBlob(austen_file.read().replace('\n', ' '))

# Per-sentence polarity, with scores of exactly 0 filtered out.
melville_sentiments = [sentence.sentiment.polarity for sentence in melville_blob.sentences]
melville_sentiments_cleaned = [value for value in melville_sentiments if value!=0.0]

austen_sentiments = [sentence.sentiment.polarity for sentence in austen_blob.sentences]
austen_sentiments_cleaned = [value for value in austen_sentiments if value!=0.0]

plt.figure(figsize=(15,8))

# Give both histograms the same 25 bin edges across the polarity range [-1, +1].
# With bins=25 alone, each call derives its edges from its own data's min and max,
# so the overlaid bars would not necessarily cover the same intervals.
shared_bins = np.linspace(-1, 1, 26)

plt.hist(melville_sentiments_cleaned, bins=shared_bins, alpha=0.5, label='Moby Dick')
plt.hist(austen_sentiments_cleaned, bins=shared_bins, alpha=0.5, label='Pride and Prejudice')

plt.legend(loc='upper right')

plt.show()

In [None]:
print(np.mean(melville_sentiments_cleaned))
print(np.mean(austen_sentiments_cleaned))

## ▷ Statistical Tests

In [None]:
## t-test of independent values
# (used to determine whether the means of two *normally distributed* sets of values are significantly different)

# 'stats' is already imported from scipy in the first code cell, so it is not re-imported here.
stats.ttest_ind(melville_sentiments_cleaned, austen_sentiments_cleaned)

In [None]:
## Mann-Whitney U test
# (used to test whether two sets of *non-normally distributed* values are significantly different)

# Compare the zero-filtered lists, matching the histogram, mean and t-test comparisons in this
# section, and state the two-sided alternative explicitly rather than relying on SciPy's
# version-dependent default.
stats.mannwhitneyu(melville_sentiments_cleaned, austen_sentiments_cleaned, alternative='two-sided')

## ▷ Assignment

 Is George Eliot significantly more subjective than Jane Austen?
 Is Herman Melville significantly more 'positive' than Joseph Conrad?

## ▷ Assignment

 Write a function that takes two texts' paths as arguments and 
 (a) plots a histogram comparing their sentences' sentiment distributions
 (b) tests whether their sentiment values are significantly different