{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# *Unsupervised learning: Latent Dirichlet allocation (LDA) topic modeling*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Install a Python package for LDA\n",
    "# http://pythonhosted.org/lda/getting_started.html\n",
    "\n",
    "!pip3 install lda"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Importing basic packages\n",
    "\n",
    "import os\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Downloading 'Essays' by Ralph Waldo Emerson\n",
    "\n",
    "os.chdir('/sharedfolder/')\n",
    "\n",
    "!wget http://www.gutenberg.org/cache/epub/16643/pg16643.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Loading the text\n",
    "\n",
    "text_path = 'pg16643.txt'\n",
    "\n",
    "text_data = open(text_path).read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Dividing the document into segments, with the aim of extracting individual essays\n",
    "\n",
    "len(text_data.split('\\n\\n\\n\\n\\n'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Viewing the beginning of each segment to determine which ones to keep\n",
    "\n",
    "counter = 0\n",
    "\n",
    "for item in text_data.split('\\n\\n\\n\\n\\n'):\n",
    "    print('-----')\n",
    "    print(counter)\n",
    "    print(item[:80])\n",
    "    counter+=1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Creating a list of essays\n",
    "\n",
    "document_list = text_data.split('\\n\\n\\n\\n\\n')[9:20]\n",
    "\n",
    "print(len(document_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Creating a vectorized representation of each essay in the list\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "vectorizer = CountVectorizer()\n",
    "\n",
    "X = vectorizer.fit_transform(document_list) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Viewing a single essay's vector\n",
    "\n",
    "sample_essay_vector = X.toarray()[3]\n",
    "\n",
    "print(len(sample_essay_vector))\n",
    "\n",
    "sample_essay_vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Creating a vocabulary list corresponding to the vectors we created above\n",
    "\n",
    "vocabulary = vectorizer.get_feature_names()\n",
    "\n",
    "vocabulary[8950:8980]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Viewing the 10 most frequent words in a single essay\n",
    "\n",
    "print(np.array(vocabulary)[np.argsort(sample_essay_vector)[::-1]][:10])\n",
    "\n",
    "print(np.argsort(sample_essay_vector)[::-1][:10]) # corresponding frequency values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Initializing an LDA model: 10 topics and 1000 iterations\n",
    "\n",
    "import lda\n",
    "\n",
    "model = lda.LDA(n_topics=10, n_iter=1000, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Fitting the model using our list of vectors\n",
    "\n",
    "model.fit(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Viewing the top 50 words in each 'topic'\n",
    "\n",
    "topic_word = model.topic_word_\n",
    "\n",
    "n_top_words = 50\n",
    "\n",
    "for i, topic_distribution in enumerate(topic_word):\n",
    "    topic_words = np.array(vocabulary)[np.argsort(topic_distribution)][:-(n_top_words+1):-1]\n",
    "    print('Topic {}: {}'.format(i, ' '.join(topic_words)))\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Repeating the process, removing stop words and punctuation first"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "word_tokenize('We are symbols, and inhabit symbols.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Importing NLTK stop words\n",
    "\n",
    "from nltk.corpus import stopwords\n",
    " \n",
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "stop_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Importing Python punctuation set\n",
    "\n",
    "import string\n",
    "\n",
    "string.punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Testing tokenization + stop word removal\n",
    "\n",
    "sentence = 'We are symbols, and inhabit symbols.'.lower()\n",
    "\n",
    "token_list = word_tokenize(sentence)\n",
    "\n",
    "sentence_filtered = [item for item in token_list if (item not in stop_words)&(item not in string.punctuation)]\n",
    "\n",
    "sentence_filtered"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Tokenizing and removing stop words from our list of essays\n",
    "\n",
    "documents_filtered = []\n",
    "\n",
    "for document in document_list:\n",
    "    token_list = word_tokenize(document.lower())\n",
    "    tokens_filtered = [item for item in token_list if (item not in stop_words)&(item not in string.punctuation)]\n",
    "    documents_filtered.append(' '.join(tokens_filtered))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Viewing a segment of a preprocessed essay\n",
    "\n",
    "documents_filtered[3][2000:2100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Vectorizing preprocessed essays\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "vectorizer = CountVectorizer()\n",
    "\n",
    "X = vectorizer.fit_transform(documents_filtered) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Creating a vocabulary list corresponding to the vectors we created above\n",
    "\n",
    "vocabulary = vectorizer.get_feature_names()\n",
    "\n",
    "vocabulary[1140:1160]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Initializing an LDA model: 10 topics and 1000 iterations\n",
    "\n",
    "model = lda.LDA(n_topics=10, n_iter=1000, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Fitting the model using our list of vectors\n",
    "\n",
    "model.fit(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Viewing the top 50 words in each 'topic'\n",
    "\n",
    "topic_word = model.topic_word_\n",
    "\n",
    "n_top_words = 50\n",
    "\n",
    "for i, topic_distribution in enumerate(topic_word):\n",
    "    topic_words = np.array(vocabulary)[np.argsort(topic_distribution)][:-(n_top_words+1):-1]\n",
    "    print('Topic ' + str(i) + ':')\n",
    "    print(' '.join(topic_words))\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ▷Assignment\n",
    "\n",
    "    Modify the code above: Apply a stemming step to each word before vectorizing the text.\n",
    "    See example stemming code in the following cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Stemming example\n",
    "\n",
    "from nltk.stem.porter import PorterStemmer\n",
    "\n",
    "stemmer = PorterStemmer()\n",
    "\n",
    "print(stemmer.stem('nature'))\n",
    "\n",
    "print(stemmer.stem('natural'))\n",
    "\n",
    "print(stemmer.stem('naturalism'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# *Supervised learning: Naive Bayes classification*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Download sample text corpora from GitHub, then unzip.\n",
    "\n",
    "os.chdir('/sharedfolder/')\n",
    "\n",
    "## Uncomment the lines below if you need to re-download test corpora we used last week.\n",
    "\n",
    "#!wget -N https://github.com/pcda17/pcda17.github.io/blob/master/week/8/Sample_corpora.zip?raw=true -O Sample_corpora.zip\n",
    "#!unzip -o Sample_corpora.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir('/sharedfolder/Sample_corpora')\n",
    "\n",
    "os.listdir('./')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Loading Melville novels\n",
    "\n",
    "os.chdir('/sharedfolder/Sample_corpora/Herman_Melville/')\n",
    "\n",
    "melville_texts = []\n",
    "\n",
    "for filename in os.listdir('./'):\n",
    "    text_data = open(filename).read().replace('\\n', ' ')\n",
    "    melville_texts.append(text_data)\n",
    "\n",
    "print(len(melville_texts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Loading Austen novels\n",
    "\n",
    "os.chdir('/sharedfolder/Sample_corpora/Jane_Austen/')\n",
    "\n",
    "austen_texts = []\n",
    "\n",
    "for filename in os.listdir('./'):\n",
    "    text_data = open(filename).read().replace('\\n', ' ')\n",
    "    austen_texts.append(text_data)\n",
    "\n",
    "print(len(austen_texts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Removing the last novel from each list so we can use it to test our classifier\n",
    "\n",
    "melville_train_texts = melville_texts[:-1]\n",
    "austen_train_texts = austen_texts[:-1]\n",
    "\n",
    "melville_test_text = melville_texts[-1]\n",
    "austen_test_text = austen_texts[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Creating a master list of Melville sentences\n",
    "\n",
    "from nltk.tokenize import sent_tokenize\n",
    "\n",
    "melville_combined_texts = ' '.join(melville_train_texts)\n",
    "\n",
    "melville_sentences = sent_tokenize(melville_combined_texts)\n",
    "\n",
    "print(len(melville_sentences))\n",
    "\n",
    "melville_sentences[9999]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Extracting 2000 Melville sentences at random for use as a training set\n",
    "\n",
    "import random\n",
    "\n",
    "melville_train_sentences = random.sample(melville_sentences, 2000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Creating a list of Melville sentences for our test set\n",
    "\n",
    "melville_test_sentences = sent_tokenize(melville_test_text)\n",
    "\n",
    "print(len(melville_test_sentences))\n",
    "\n",
    "melville_test_sentences[997]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Creating a master list of Austen sentences\n",
    "\n",
    "austen_combined_texts = ' '.join(austen_train_texts)\n",
    "\n",
    "austen_sentences = sent_tokenize(austen_combined_texts)\n",
    "\n",
    "print(len(austen_sentences))\n",
    "\n",
    "austen_sentences[8979]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Extracting 2000 Austen sentences at random for use as a training set\n",
    "\n",
    "austen_train_sentences = random.sample(austen_sentences, 2000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Creating a list of Austen sentences for our test set\n",
    "\n",
    "austen_test_sentences = sent_tokenize(austen_test_text)\n",
    "\n",
    "print(len(austen_test_sentences))\n",
    "\n",
    "austen_test_sentences[1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Combing training data\n",
    "combined_texts = melville_train_sentences + austen_train_sentences\n",
    "\n",
    "## Creating list of associated class values: \n",
    "## 0 for Melville, 1 for Austen\n",
    "y = [0]*len(melville_train_sentences) + [1]*len(austen_train_sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Creating vectorized training set using our combined sentence list\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "vectorizer = CountVectorizer()\n",
    "\n",
    "X = vectorizer.fit_transform(combined_texts).toarray()\n",
    "\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Training a multinomial naive Bayes classifier\n",
    "## X is a combined list of Melville and Austen sentences (2000 sentences from each)\n",
    "## y is a list of classes (0 or 1)\n",
    "\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "classifier = MultinomialNB().fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Classifying 5 sentences in our Austen test set\n",
    "# Recall that 0 means Melville & 1 means Austen\n",
    "\n",
    "from pprint import pprint\n",
    "\n",
    "input_sentences = austen_test_sentences[3000:3005]\n",
    "\n",
    "input_vector = vectorizer.transform(input_sentences)   ## Converting a list of string to the same\n",
    "                                                       ## vector format we used for our training set.\n",
    "pprint(austen_test_sentences[3000:3005])\n",
    "\n",
    "classifier.predict(input_vector)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Classifying 5 sentences in our Melville test set\n",
    "\n",
    "input_sentences = melville_test_sentences[3000:3005]\n",
    "\n",
    "input_vector = vectorizer.transform(input_sentences)\n",
    "\n",
    "pprint(melville_test_sentences[3000:3005])\n",
    "\n",
    "classifier.predict(input_vector)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ▷Assignment\n",
    "\n",
    "    Write a script that prints Austen-like sentences written \n",
    "    by Melville, and Melville-like sentences written by Austen."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}