{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# *Unsupervised learning: Latent Dirichlet allocation (LDA) topic modeling*" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Install a Python package for LDA\n", "# http://pythonhosted.org/lda/getting_started.html\n", "\n", "!pip3 install lda" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Importing basic packages\n", "\n", "import os\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Downloading 'Essays' by Ralph Waldo Emerson\n", "\n", "os.chdir('/sharedfolder/')\n", "\n", "!wget http://www.gutenberg.org/cache/epub/16643/pg16643.txt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Loading the text\n", "\n", "text_path = 'pg16643.txt'\n", "\n", "text_data = open(text_path).read()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Dividing the document into segments, with the aim of extracting individual essays\n", "\n", "len(text_data.split('\\n\\n\\n\\n\\n'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Viewing the beginning of each segment to determine which ones to keep\n", "\n", "counter = 0\n", "\n", "for item in text_data.split('\\n\\n\\n\\n\\n'):\n", " print('-----')\n", " print(counter)\n", " print(item[:80])\n", " counter+=1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Creating a list of essays\n", "\n", "document_list = text_data.split('\\n\\n\\n\\n\\n')[9:20]\n", "\n", "print(len(document_list))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Creating a vectorized representation of each essay in the list\n", "\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "vectorizer = CountVectorizer()\n", "\n", "X = vectorizer.fit_transform(document_list) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Viewing a single essay's vector\n", "\n", "sample_essay_vector = X.toarray()[3]\n", "\n", "print(len(sample_essay_vector))\n", "\n", "sample_essay_vector" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Creating a vocabulary list corresponding to the vectors we created above\n", "\n", "vocabulary = vectorizer.get_feature_names()\n", "\n", "vocabulary[8950:8980]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Viewing the 10 most frequent words in a single essay\n", "\n", "print(np.array(vocabulary)[np.argsort(sample_essay_vector)[::-1]][:10])\n", "\n", "print(np.argsort(sample_essay_vector)[::-1][:10]) # corresponding frequency values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Initializing an LDA model: 10 topics and 1000 iterations\n", "\n", "import lda\n", "\n", "model = lda.LDA(n_topics=10, n_iter=1000, random_state=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Fitting the model using our list of vectors\n", "\n", "model.fit(X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Viewing the top 50 words in each 'topic'\n", "\n", "topic_word = model.topic_word_\n", "\n", "n_top_words = 50\n", "\n", "for i, topic_distribution in enumerate(topic_word):\n", " topic_words = np.array(vocabulary)[np.argsort(topic_distribution)][:-(n_top_words+1):-1]\n", " print('Topic {}: {}'.format(i, ' '.join(topic_words)))\n", " print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Repeating the process, removing stop words and punctuation first" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import word_tokenize\n", "\n", "word_tokenize('We are symbols, and inhabit symbols.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Importing NLTK stop words\n", "\n", "from nltk.corpus import stopwords\n", " \n", "stop_words = set(stopwords.words('english'))\n", "\n", "stop_words" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Importing Python punctuation set\n", "\n", "import string\n", "\n", "string.punctuation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Testing tokenization + stop word removal\n", "\n", "sentence = 'We are symbols, and inhabit symbols.'.lower()\n", "\n", "token_list = word_tokenize(sentence)\n", "\n", "sentence_filtered = [item for item in token_list if (item not in stop_words)&(item not in string.punctuation)]\n", "\n", "sentence_filtered" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Tokenizing and removing stop words from our list of essays\n", "\n", "documents_filtered = []\n", "\n", "for document in document_list:\n", " token_list = word_tokenize(document.lower())\n", " tokens_filtered = [item for item in token_list if (item not in stop_words)&(item not in string.punctuation)]\n", " documents_filtered.append(' '.join(tokens_filtered))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Viewing a segment of a preprocessed essay\n", "\n", "documents_filtered[3][2000:2100]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Vectorizing preprocessed essays\n", "\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "vectorizer = CountVectorizer()\n", "\n", "X = vectorizer.fit_transform(documents_filtered) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Creating a vocabulary list corresponding to the vectors we created above\n", "\n", "vocabulary = vectorizer.get_feature_names()\n", "\n", "vocabulary[1140:1160]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Initializing an LDA model: 10 topics and 1000 iterations\n", "\n", "model = lda.LDA(n_topics=10, n_iter=1000, random_state=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Fitting the model using our list of vectors\n", "\n", "model.fit(X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Viewing the top 50 words in each 'topic'\n", "\n", "topic_word = model.topic_word_\n", "\n", "n_top_words = 50\n", "\n", "for i, topic_distribution in enumerate(topic_word):\n", " topic_words = np.array(vocabulary)[np.argsort(topic_distribution)][:-(n_top_words+1):-1]\n", " print('Topic ' + str(i) + ':')\n", " print(' '.join(topic_words))\n", " print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### â–·Assignment\n", "\n", " Modify the code above: Apply a stemming step to each word before vectorizing the text.\n", " See example stemming code in the following cell." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Stemming example\n", "\n", "from nltk.stem.porter import PorterStemmer\n", "\n", "stemmer = PorterStemmer()\n", "\n", "print(stemmer.stem('nature'))\n", "\n", "print(stemmer.stem('natural'))\n", "\n", "print(stemmer.stem('naturalism'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# *Supervised learning: Naive Bayes classification*" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Download sample text corpora from GitHub, then unzip.\n", "\n", "os.chdir('/sharedfolder/')\n", "\n", "## Uncomment the lines below if you need to re-download test corpora we used last week.\n", "\n", "#!wget -N https://github.com/pcda17/pcda17.github.io/blob/master/week/8/Sample_corpora.zip?raw=true -O Sample_corpora.zip\n", "#!unzip -o Sample_corpora.zip" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "os.chdir('/sharedfolder/Sample_corpora')\n", "\n", "os.listdir('./')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Loading Melville novels\n", "\n", "os.chdir('/sharedfolder/Sample_corpora/Herman_Melville/')\n", "\n", "melville_texts = []\n", "\n", "for filename in os.listdir('./'):\n", " text_data = open(filename).read().replace('\\n', ' ')\n", " melville_texts.append(text_data)\n", "\n", "print(len(melville_texts))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Loading Austen novels\n", "\n", "os.chdir('/sharedfolder/Sample_corpora/Jane_Austen/')\n", "\n", "austen_texts = []\n", "\n", "for filename in os.listdir('./'):\n", " text_data = open(filename).read().replace('\\n', ' ')\n", " austen_texts.append(text_data)\n", "\n", "print(len(austen_texts))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Removing the last novel from each list so we can use it to test our classifier\n", "\n", "melville_train_texts = melville_texts[:-1]\n", "austen_train_texts = austen_texts[:-1]\n", "\n", "melville_test_text = melville_texts[-1]\n", "austen_test_text = austen_texts[-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Creating a master list of Melville sentences\n", "\n", "from nltk.tokenize import sent_tokenize\n", "\n", "melville_combined_texts = ' '.join(melville_train_texts)\n", "\n", "melville_sentences = sent_tokenize(melville_combined_texts)\n", "\n", "print(len(melville_sentences))\n", "\n", "melville_sentences[9999]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Extracting 2000 Melville sentences at random for use as a training set\n", "\n", "import random\n", "\n", "melville_train_sentences = random.sample(melville_sentences, 2000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Creating a list of Melville sentences for our test set\n", "\n", "melville_test_sentences = sent_tokenize(melville_test_text)\n", "\n", "print(len(melville_test_sentences))\n", "\n", "melville_test_sentences[997]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Creating a master list of Austen sentences\n", "\n", "austen_combined_texts = ' '.join(austen_train_texts)\n", "\n", "austen_sentences = sent_tokenize(austen_combined_texts)\n", "\n", "print(len(austen_sentences))\n", "\n", "austen_sentences[8979]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Extracting 2000 Austen sentences at random for use as a training set\n", "\n", "austen_train_sentences = random.sample(austen_sentences, 2000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Creating a list of Austen sentences for our test set\n", "\n", "austen_test_sentences = sent_tokenize(austen_test_text)\n", "\n", "print(len(austen_test_sentences))\n", "\n", "austen_test_sentences[1000]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Combing training data\n", "combined_texts = melville_train_sentences + austen_train_sentences\n", "\n", "## Creating list of associated class values: \n", "## 0 for Melville, 1 for Austen\n", "y = [0]*len(melville_train_sentences) + [1]*len(austen_train_sentences)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Creating vectorized training set using our combined sentence list\n", "\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "vectorizer = CountVectorizer()\n", "\n", "X = vectorizer.fit_transform(combined_texts).toarray()\n", "\n", "X.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Training a multinomial naive Bayes classifier\n", "## X is a combined list of Melville and Austen sentences (2000 sentences from each)\n", "## y is a list of classes (0 or 1)\n", "\n", "from sklearn.naive_bayes import MultinomialNB\n", "\n", "classifier = MultinomialNB().fit(X, y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Classifying 5 sentences in our Austen test set\n", "# Recall that 0 means Melville & 1 means Austen\n", "\n", "from pprint import pprint\n", "\n", "input_sentences = austen_test_sentences[3000:3005]\n", "\n", "input_vector = vectorizer.transform(input_sentences) ## Converting a list of string to the same\n", " ## vector format we used for our training set.\n", "pprint(austen_test_sentences[3000:3005])\n", "\n", "classifier.predict(input_vector)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Classifying 5 sentences in our Melville test set\n", "\n", "input_sentences = melville_test_sentences[3000:3005]\n", "\n", "input_vector = vectorizer.transform(input_sentences)\n", "\n", "pprint(melville_test_sentences[3000:3005])\n", "\n", "classifier.predict(input_vector)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### â–·Assignment\n", "\n", " Write a script that prints Austen-like sentences written \n", " by Melville, and Melville-like sentences written by Austen." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }