In [124]:
# Import the libraries that we're going to use
from nltk.corpus import twitter_samples
from nltk import casual_tokenize, word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tag import StanfordNERTagger

from collections import Counter
import string
import re

from gensim.models.ldamodel import LdaModel
from gensim import corpora

from pycorenlp import StanfordCoreNLP

In [2]:
# Lowercase all text
#
# @param strings
# An array of sentences (not word tokenized)
# @returns an array of lowercased sentences

def lowercase(strings):
 return [i.lower() for i in strings]

In [3]:
# Tokenize into words (using Tweet tokenizer, probably not suitable for non-Tweet text)
#
# @param strings
# An array of sentences
# @returns an array of tokenized sentences (each tokenized sentence is an array, so this returns an array of arrays)

def tokenize_tweets(strings):
 return [casual_tokenize(i) for i in strings]

In [4]:
# Tokenize into words (regular text, not Tweet text)
#
# @param strings
# An array of sentences
# @returns an array of tokenized sentences (each tokenized sentence is an array, so this returns an array of arrays)

def tokenize_regular(strings):
 return [word_tokenize(i) for i in strings]

In [5]:
# Tokenize into sentences (regular text, not Tweet text)
#
# @param text
# A string of text (not broken into sentences)
# @returns an array of sentences

def tokenize_sentences(text):
 return sent_tokenize(text)

In [38]:
# Remove all punctuation from sentences
#
# @param tokenized
# An array of tokenized sentences
# @returns an array of tokenized sentences with no punctuation

def remove_punctuation(tokenized):
 stripped = [[''.join([letter for letter in word if letter not in string.punctuation]) for word in sentence]\
 for sentence in tokenized]
 stripped = [[word for word in sentence if len(word) > 0] for sentence in stripped]
 return [i for i in stripped if len(i) > 0]

In [7]:
# Either remove all digits from sentences or replace them with pound sign
#
# @param tokenized
# An array of tokenized sentences
# @ param replace
# Whether to replace the digits with # or not (default = True)
# @returns an array of sentences with digits removed or replaced

def remove_digits(tokenized,replace=True):
 if replace:
 stripped = [[re.sub('[0123456789]','#',word) for word in sentence] for sentence in tokenized]
 else:
 stripped = [[re.sub('[0123456789]','',word) for word in sentence] for sentence in tokenized]
 stripped = [[word for word in sentence if len(word) > 0 ] for sentence in stripped]
 return [i for i in stripped if len(i) > 0] 

In [8]:
# Stem all words
#
# @param tokenized
# An array of tokenized sentences
# @returns an array of tokenized sentences with all of the words stemmed

def stem_words(tokenized):
 stemmer = PorterStemmer()
 return [[stemmer.stem(word) for word in sentence] for sentence in tokenized]

In [9]:
# Remove words that occur less than a certain number of times
#
# @param tokenized
# An array of tokenized sentences
# @param threshold
# The minimum number of times a word has to occur before it is removed (default = 5)
# @returns an array of tokenized sentences with rare words replaced with 'UNK'

def remove_rare_words(tokenized,threshold=5):
 #count the number of times each word appears in all the sentences
 counter = Counter([word for sentence in tokenized for word in sentence])
 
 #remove words that appear less than the threshold number of times (replace with 'UNK')
 return [[word if counter[word] >= threshold else 'UNK' for word in sentence] for sentence in tokenized]

In [10]:
# Remove links (text that begins with 'http://' or 'https://')
#
# @param tokenized
# An array of tokenized sentences
# @returns an array of tokenized sentences with links replaced with 'LINK'

def remove_links(tokenized):
 return [[word if word[:7] != 'http://' and word[:8] != 'https://' else 'LINK' for word in sentence] \
 for sentence in tokenized]

In [26]:
# Remove all (English) stopwords
#
# @param tokenized
# An array of tokenized sentences
# @returns an array of tokenized sentences with stopwords removed

def remove_stopwords(tokenized):
 stop = set(stopwords.words('english'))
 stripped = [[word for word in sentence if word not in stop] for sentence in tokenized]
 return [i for i in stripped if len(i) > 0] 

In [11]:
# Extract NER relationships from text
# Make sure that you've downloaded the Stanford NER files
# This only extracts persons, organizations, and locations
#
# @param tokenized
# An array of tokenized sentences
# @returns an array of sentences where each sentence is a list of tuples of (word, entity label)

def extract_ner(tokenized):
 # Download these files from https://nlp.stanford.edu/software/
 # Make sure the paths are set correctly
 st = StanfordNERTagger('/Users/laura/software/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',\
 '/Users/laura/software/stanford-ner/stanford-ner.jar') 
 return st.tag_sents(tokenized) #Batch processing is important - speeds it up tremendously!

In [113]:
# Calculate topics from text using LDA
#
# @param tokenized
# An array of tokenized sentences
# @param ignore
# Words to ignore when creating topics
# @param num_topics
# The number of topics to calculate
# @returns
# The dictionary of the corpus in the correct format for the topic model
# The corpus in the correct format for the topic model
# A trained topic model

def topic_modeling(tokenized,ignore=set(),num_topics = 10):
 dictionary = corpora.Dictionary([[word for word in sentence if word not in ignore] for sentence in tokenized])
 corpus = [dictionary.doc2bow(sentence) for sentence in tokenized]
 return (dictionary,corpus,LdaModel(corpus, num_topics=num_topics, id2word=dictionary))

In [146]:
# Calculate sentiment of tokenized sentences
# NOTE: before you do this, you need to make sure that the Stanford CoreNLP server is up and running! To do that:
# 1. Download it - wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-01-31.zip
# 2. Unzip it - unzip stanford-corenlp-full-2018-01-31.zip
# 3. Change to directory - cd stanford-corenlp-full-2018-01-31
# 4. Run it - java -mx5g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 10000000
#
# @param tokenized
# An array of tokenized sentences
# @returns the result for the Stanford CoreNPL Sentiment Analysis tool

def sentiment(tokenized):
 nlp = StanfordCoreNLP('http://localhost:9000')
 
 results = []
 for tokens in tokenized:
 results.append(nlp.annotate(' '.join(tokens),
 properties={
 'annotators': 'sentiment',
 'outputFormat': 'json',
 'timeout': 10000000,
 }))
 return results

In [91]:
# Ok, let's get some sample tweets! (you can replace this with your own data)
strings = twitter_samples.strings('positive_tweets.json') + \
 twitter_samples.strings('negative_tweets.json')

In [187]:
strings[:10]

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!! my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days',
 '@BhaktisBanter @PallaviRuhail This one is irresistible :)\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM',
 "We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI",
 '@Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. Sheep must be buying.',
 'Jgh , but we have to go to Bayan :D bye',
 'As an act of mischievousness, am calling the ETL layer of our in-house warehousing a

In [92]:
# Prepare data
lowercased = lowercase(strings)
tokenized = tokenize_tweets(lowercased)
tokenized = remove_links(tokenized)

In [64]:
# Named Entity Recognition
ner = extract_ner(tokenized)

In [65]:
# Let's look at some entities
for entity in ['PERSON','ORGANIZATION','PLACE']:
 print(entity)
 entities = set()
 counter = Counter()
 for sentence in ner:
 entityStarted = False
 fullEntity = ''
 for (word,entityLabel) in sentence:
 if entityLabel == entity:
 if entityStarted:
 fullEntity += ' ' + word
 else:
 fullEntity = word
 entityStarted = True
 elif entityStarted:
 entities.add(fullEntity)
 counter[fullEntity] += 1
 fullEntity = ''
 entityStarted = False
 if fullEntity != '':
 entities.add(fullEntity)
 counter[fullEntity] += 1
 
 for entity in entities: #Print out what we found
 print(entity,counter[entity])

PERSON
jane 1
miss kang 1
michael woodford 1
bush 1
emma roberts 1
chris gayle 2
kath 2
jonah 1
jumma mubarak 1
lewis 1
miss dubai 1
tom moore 1
joe 1
pete wentz 1
john prescott 1
tom felton 1
anna 1
don ’ 1
miss hannah montana 1
niamh fennell 1
irene 1
darcey connor 1
selena gomez louis tomlinson rita liam payne 1
miss pamela 1
kevin clifton 1
bea miller 1
jessica 2
donna 1
fabian delph 1
chris ellis 1
christophe gans 1
clarke 1
betty miller 1
danny 1
karlie kloss 1
sam smith 1
delph 1
phil 1
orhan pamuk 1
manny 1
obama 1
o sunnies 1
john sheen 1
erica 1
michael jackson 1
michael 2
alex smith 1
miss matt 1
thurston collins 1
tara barkin 1
anna akana 1
hillary clinton 1
mrs wong 1
steven william umboh 1
ORGANIZATION
expedia 1
PLACE


In [66]:
#tokenized = remove_rare_words(tokenized)

In [93]:
tokenized_noStopWords = remove_stopwords(tokenized)
tokenized_noStopWords = remove_punctuation(tokenized_noStopWords)

In [114]:
# Topic modeling
# NOTE: Highly dependent on the number of topics you use

(dictionary,corpus,topics) = topic_modeling(tokenized_noStopWords,ignore = set(['UNK','LINK']),num_topics=5)

In [115]:
topics.print_topics()

[(0,
 '0.009*"cant" + 0.008*"d" + 0.007*"like" + 0.006*"today" + 0.006*"feel" + 0.004*"want" + 0.004*"see" + 0.004*"okay" + 0.004*"im" + 0.004*"makes"'),
 (1,
 '0.023*"im" + 0.011*"want" + 0.008*"like" + 0.008*"kik" + 0.007*"know" + 0.007*"snapchat" + 0.006*"thanks" + 0.004*"tired" + 0.004*"pls" + 0.004*"sick"'),
 (2,
 '0.018*"miss" + 0.009*"sad" + 0.008*"much" + 0.007*"cant" + 0.006*"get" + 0.006*"thank" + 0.006*"good" + 0.006*"u" + 0.006*"one" + 0.006*"oh"'),
 (3,
 '0.017*"please" + 0.015*"follow" + 0.013*"u" + 0.012*"》" + 0.012*"♛" + 0.011*"back" + 0.010*"love" + 0.010*"justinbieber" + 0.007*"day" + 0.007*"sorry"'),
 (4,
 '0.015*"im" + 0.011*"followed" + 0.009*"get" + 0.008*"go" + 0.007*"thanks" + 0.007*"still" + 0.007*"3" + 0.007*"see" + 0.007*"wanna" + 0.006*"please"')]

In [122]:
tokenized[40]

['@joyster2012',
 '@cathstaincliffe',
 'good',
 'for',
 'you',
 ',',
 'girl',
 '!',
 '!',
 'best',
 'wishes',
 ':-)']

In [123]:
topics.get_document_topics(corpus[40])

[(0, 0.028808361042268887),
 (1, 0.028776925978230294),
 (2, 0.88457221079167814),
 (3, 0.028668412518234668),
 (4, 0.029174089669588135)]

In [157]:
# Sentiment analysis
sent = sentiment(tokenized[:10])

In [159]:
for res in sent:
 for s in res["sentences"]:
 print("'%s': %s %s" % (
 " ".join([t["word"] for t in s["tokens"]]),
 s["sentimentValue"], s["sentiment"]))

'#followfriday @france_inte @pkuchly57 @milipol_paris for being top engaged members in my community this week : -RRB-': 2 Neutral
'@lamb2ja hey james !': 2 Neutral
'how odd : / please call our contact centre on 02392441234 and we will be able to assist you :-RRB- many thanks !': 2 Neutral
'@despiteofficial we had a listen last night :-RRB- as you bleed is an amazing track .': 3 Positive
'when are you in scotland ?': 2 Neutral
'!': 2 Neutral
'@ 97sides congrats : -RRB-': 2 Neutral
'yeaaaah yippppy !': 3 Positive
'!': 2 Neutral
'!': 2 Neutral
'my accnt verified rqst has succeed got a blue tick mark on my fb profile :-RRB- in 15 days': 1 Negative
'@bhaktisbanter @pallaviruhail this one is irresistible :-RRB- #flipkartfashionfriday LINK': 3 Positive
'we do n't like to keep our lovely customers waiting for long !': 1 Negative
'we hope you enjoy !': 3 Positive
'happy friday !': 3 Positive
'- lwwf :-RRB- LINK': 2 Neutral
'@impatientraider on second thought , there ' s just not enough time for

In [160]:
sent = sentiment(tokenized[-10:])

In [161]:
for res in sent:
 for s in res["sentences"]:
 print("'%s': %s %s" % (
 " ".join([t["word"] for t in s["tokens"]]),
 s["sentimentValue"], s["sentiment"]))

'i want it to be my birthday already : -LRB-': 2 Neutral
'@louanndavies completely agree .': 2 Neutral
'the press wo n't : -LRB-': 2 Neutral
'im super duper tired : -LRB-': 1 Negative
'having boring time :-LRB- do n't know what to do ...': 1 Negative
'ill be on soon , i promise :-LRB- waaah': 1 Negative
'i wan na change my avi but usanele : -LRB-': 1 Negative
'my puppy broke her foot : -LRB-': 2 Neutral
'where 's all the jaebum baby pictures :-LRB- -LRB-': 2 Neutral
'but but mr ahmad maslan cooks too :-LRB- LINK': 1 Negative
'@eawoman as a hull supporter i am expecting a misserable few weeks : - -LRB-': 2 Neutral


In [177]:
justinbeiber = [sentence for sentence in tokenized if '@justinbieber' in ' '.join(sentence)]

In [180]:
sent = sentiment(justinbeiber)

In [182]:
all_sentiment = []
for res in sent:
 for s in res["sentences"]:
 print("'%s': %s %s" % (
 " ".join([t["word"] for t in s["tokens"]]),
 s["sentimentValue"], s["sentiment"]))
 all_sentiment.append(s['sentiment'])

'@justinbieber :-RRB- always smile': 3 Positive
'@justinbieber can you please follow me @caitecat1209 ♡ ♡ ♡ please jb follow me i love you always and forever .': 3 Positive
'i 'm a belieber ♡ ♡ ♡ :-RRB- i love u': 2 Neutral
'this is why im standing and always being a belieber :-RRB- its all bcause of him @justinbieber 2009 until die : -RRB-': 2 Neutral
''' @justinbieber : :-RRB- '' why baby ?': 1 Negative
'😘 😘 😘': 2 Neutral
'@justinbieber it makesme happy to see this :-RRB- keep smiling we love u': 3 Positive
'@justinbieber when you smile , i smile : -RRB-': 3 Positive
'@justinbieber :-RRB- back at ya': 2 Neutral
'@justinbieber i love u : -RRB-': 2 Neutral
'@justinbieber you are daddy af ... : - -RRB-': 2 Neutral
'justin where are you ?': 2 Neutral
':-LRB- @justinbieber': 2 Neutral
'♛ ♛ ♛ 》 》 》 i love you so much .': 3 Positive
'i beli ̇ eve that he wi ̇ ll follow .': 1 Negative
'please follow me please justi ̇ n @justinbieber :-LRB- x15 .': 1 Negative
'350 》 》 》 see me ♛ ♛ ♛': 1 Negat

In [184]:
all_sentiment.count('Positive')

40

In [185]:
all_sentiment.count('Negative')

162

In [186]:
all_sentiment.count('Neutral')

9