{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Mining the Social Web\n", "\n", "## Mining Text Files\n", "\n", "This Jupyter Notebook provides an interactive way to follow along with and explore the examples from the book or video series. The intent behind this notebook is to reinforce the concepts in a fun, convenient, and effective way." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cleaning HTML in Google+ content by stripping out HTML tags and converting HTML entities back to plain-text representations" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup # pip install beautifulsoup4\n", "\n", "def cleanHtml(html):\n", " if html == \"\": return \"\"\n", "\n", " return BeautifulSoup(html, 'html5lib').get_text()\n", "\n", "txt = \"Don't forget about HTML entities and markup when \"+\\\n", " \"mining text!
\"\n", "\n", "print(cleanHtml(txt))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sample data structures used in illustrations for the rest of this chapter" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus = {\n", " 'a' : \"Mr. Green killed Colonel Mustard in the study with the candlestick. \\\n", "Mr. Green is not a very nice fellow.\",\n", " 'b' : \"Professor Plum has a green plant in his study.\",\n", " 'c' : \"Miss Scarlett watered Professor Plum's green plant while he was away \\\n", "from his office last week.\"\n", "}\n", "terms = {\n", " 'a' : [ i.lower() for i in corpus['a'].split() ], \n", " 'b' : [ i.lower() for i in corpus['b'].split() ],\n", " 'c' : [ i.lower() for i in corpus['c'].split() ]\n", " }" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Running TF-IDF on sample data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from math import log\n", "\n", "# Enter in a query term from the corpus variable\n", "QUERY_TERMS = ['mr.', 'green']\n", "\n", "def tf(term, doc, normalize=True):\n", " doc = doc.lower().split()\n", " if normalize:\n", " return doc.count(term.lower()) / float(len(doc))\n", " else:\n", " return doc.count(term.lower()) / 1.0\n", "\n", "def idf(term, corpus):\n", " num_texts_with_term = len([True for text in corpus if term.lower()\n", " in text.lower().split()])\n", "\n", " # tf-idf calc involves multiplying against a tf value less than 0, so it's\n", " # necessary to return a value greater than 1 for consistent scoring.\n", " # (Multiplying two values less than 1 returns a value less than each of\n", " # them.)\n", "\n", " try:\n", " return 1.0 + log(float(len(corpus)) / num_texts_with_term)\n", " except ZeroDivisionError:\n", " return 1.0\n", "\n", "def tf_idf(term, doc, corpus):\n", " return tf(term, doc) * idf(term, corpus)\n", "\n", "corpus = \\\n", " {'a': 'Mr. Green killed Colonel Mustard in the study with the candlestick. \\\n", "Mr. 
"Mr. Green is not a very nice fellow.',\n", "     'b': 'Professor Plum has a green plant in his study.',\n", "     'c': \"Miss Scarlett watered Professor Plum's green plant while he was away \\\n", "from his office last week.\"}\n", "\n", "for (k, v) in sorted(corpus.items()):\n", "    print(k, ':', v)\n", "print()\n", "\n", "# Score queries by calculating cumulative tf_idf score for each term in query\n", "\n", "query_scores = {'a': 0, 'b': 0, 'c': 0}\n", "for term in [t.lower() for t in QUERY_TERMS]:\n", "    for doc in sorted(corpus):\n", "        print('TF({0}): {1}'.format(doc, term), tf(term, corpus[doc]))\n", "    print('IDF: {0}'.format(term), idf(term, corpus.values()))\n", "    print()\n", "\n", "    for doc in sorted(corpus):\n", "        score = tf_idf(term, corpus[doc], corpus.values())\n", "        print('TF-IDF({0}): {1}'.format(doc, term), score)\n", "        query_scores[doc] += score\n", "    print()\n", "\n", "print(\"Overall TF-IDF scores for query '{0}'\".format(' '.join(QUERY_TERMS)))\n", "for (doc, score) in sorted(query_scores.items()):\n", "    print(doc, score)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Exploring text data with NLTK" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Explore some of NLTK's functionality by exploring the data.\n", "# Here are some suggestions for an interactive interpreter session.\n", "\n", "import json\n", "import nltk\n", "\n", "# Download ancillary nltk packages if not already installed\n", "nltk.download('stopwords')\n", "\n", "# Load in human language data from wherever you've saved it\n", "DATA = 'resources/ch05-textfiles/ch05-timoreilly.json'\n", "data = json.loads(open(DATA).read())\n", "\n", "# Combine titles and post content\n", "all_content = \" \".join([ i['title'] + \" \" + i['content'] for i in data ])\n", "\n", "# Approximate bytes of text\n", "print(len(all_content))\n", "\n", "tokens = all_content.split()\n", "text = nltk.Text(tokens)\n", "\n", "# Examples of the appearance of the word \"open\"\n", "text.concordance(\"open\")\n", "\n", "# Frequent collocations in the text (usually meaningful phrases)\n", "text.collocations()\n", "\n", "# Frequency analysis for words of interest\n", "fdist = text.vocab()\n", "print(fdist[\"open\"])\n", "print(fdist[\"source\"])\n", "print(fdist[\"web\"])\n", "print(fdist[\"2.0\"])\n", "\n", "# Number of words in the text\n", "print('Number of tokens:', len(tokens))\n", "\n", "# Number of unique words in the text\n", "print('Number of unique words:', len(fdist.keys()))\n", "\n", "# Common words that aren't stopwords\n", "print('Common words that aren\\'t stopwords')\n", "print([w for (w, _) in fdist.most_common(100)\n", "       if w.lower() not in nltk.corpus.stopwords.words('english')])\n", "\n", "# Long words that aren't URLs\n", "print('Long words that aren\\'t URLs')\n", "print([w for w in fdist.keys() if len(w) > 15 and 'http' not in w])\n", "\n", "# Number of URLs\n", "print('Number of URLs: ',len([w for w in fdist.keys() if 'http' in w]))\n", "\n", "# Top 10 Most Common Words\n", "print('Top 10 Most Common Words')\n", "print(fdist.most_common(10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Querying text data with TF-IDF" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import nltk\n", "\n", "# Provide your own query terms here\n", "\n", "QUERY_TERMS = ['Government']\n", "\n", "# Load in human language data from wherever you've saved it\n",
"DATA = 'resources/ch05-textfiles/ch05-timoreilly.json'\n", "data = json.loads(open(DATA).read())\n", "\n", "# Keep only posts with content so that indexes into 'posts' and\n", "# 'activities' line up when titles are looked up below\n", "posts = [post for post in data if post['content'] != \"\"]\n", "activities = [post['content'].lower().split() for post in posts]\n", "\n", "# TextCollection provides tf, idf, and tf_idf abstractions so\n", "# that we don't have to maintain/compute them ourselves\n", "\n", "tc = nltk.TextCollection(activities)\n", "\n", "relevant_activities = []\n", "\n", "for idx in range(len(activities)):\n", "    score = 0\n", "    for term in [t.lower() for t in QUERY_TERMS]:\n", "        score += tc.tf_idf(term, activities[idx])\n", "    if score > 0:\n", "        relevant_activities.append({'score': score, 'title': posts[idx]['title']})\n", "\n", "# Sort by score and display results\n", "\n", "relevant_activities = sorted(relevant_activities,\n", "                             key=lambda p: p['score'], reverse=True)\n", "for activity in relevant_activities:\n", "    print('Title: {0}'.format(activity['title']))\n", "    print('Score: {0}'.format(activity['score']))\n", "    print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Finding similar documents using cosine similarity\n", "\n", "The dot product of two vectors **A** and **B** can be thought of as a projection of one vector onto the other.\n", "\n", "By measuring how much of **A** is in the same direction as **B**, we get a measure of how similar **A** is to **B**. The idea behind the following exercise is to create vectors for each document in our corpus consisting of the TF-IDF scores of the terms in those documents:\n", "\n", "```\n", "v_1 = [ tf_idf(term_1, doc_1), tf_idf(term_2, doc_1), ..., tf_idf(term_n, doc_1) ]\n", "v_2 = [ tf_idf(term_1, doc_2), tf_idf(term_2, doc_2), ..., tf_idf(term_n, doc_2) ]\n", "```\n", "\n", "The dot product of these vectors:\n", "\n", "$\mathbf{v_1} \cdot \mathbf{v_2} = |\mathbf{v_1}||\mathbf{v_2}|\cos(\theta)$.\n", "\n", "Now you see where the cosine comes in.\n",
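"To make that concrete with a small, made-up example (the numbers here are purely illustrative and not drawn from the corpus): if $\mathbf{v_1} = (1, 2, 0)$ and $\mathbf{v_2} = (2, 4, 0)$, then $\mathbf{v_1} \cdot \mathbf{v_2} = 1 \cdot 2 + 2 \cdot 4 + 0 \cdot 0 = 10$ and $|\mathbf{v_1}||\mathbf{v_2}| = \sqrt{5}\sqrt{20} = 10$, so $\cos(\theta) = 1$ and the two vectors point in exactly the same direction. A vector such as $\mathbf{v_3} = (0, 0, 3)$, which shares no nonzero component with $\mathbf{v_1}$, has a dot product of $0$ with it, and so $\cos(\theta) = 0$.\n", "\n",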
The \"cosine distance\" between $\\mathbf{v1}$ and $\\mathbf{v2}$ is then given by\n", "\n", "$$\n", "d = 1 - \\frac{\\mathbf{v_1} \\cdot \\mathbf{v_2}}{|\\mathbf{v_1}||\\mathbf{v_2}|}\n", "$$" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import nltk\n", "import nltk.cluster\n", "\n", "# Load in human language data from wherever you've saved it\n", "DATA = 'resources/ch05-textfiles/ch05-timoreilly.json'\n", "data = json.loads(open(DATA).read())\n", "\n", "all_posts = [ (i['title'] + \" \" + i['content']).lower().split() for i in data ]\n", "\n", "# Provides tf, idf, and tf_idf abstractions for scoring\n", "\n", "tc = nltk.TextCollection(all_posts)\n", "\n", "# Compute a term-document matrix such that td_matrix[doc_title][term]\n", "# returns a tf-idf score for the term in the document\n", "\n", "td_matrix = {}\n", "for idx in range(len(all_posts)):\n", " post = all_posts[idx]\n", " fdist = nltk.FreqDist(post)\n", "\n", " doc_title = data[idx]['title'].replace('\\n', '')\n", " td_matrix[doc_title] = {}\n", "\n", " for term in fdist.keys():\n", " td_matrix[doc_title][term] = tc.tf_idf(term, post)\n", "\n", "# Build vectors such that term scores are in the same positions...\n", "distances = {}\n", "for title1 in td_matrix.keys():\n", "\n", " distances[title1] = {}\n", " (min_dist, most_similar) = (1.0, ('', ''))\n", "\n", " for title2 in td_matrix.keys():\n", "\n", " # Take care not to mutate the original data structures\n", " # since we're in a loop and need the originals multiple times\n", "\n", " terms1 = td_matrix[title1].copy()\n", " terms2 = td_matrix[title2].copy()\n", "\n", " # Fill in \"gaps\" in each map so vectors of the same length can be computed\n", " for term1 in terms1:\n", " if term1 not in terms2:\n", " terms2[term1] = 0\n", "\n", " for term2 in terms2:\n", " if term2 not in terms1:\n", " terms1[term2] = 0\n", "\n", " # Create vectors from term maps\n", " v1 = [score for (term, score) in sorted(terms1.items())]\n", " v2 = [score for (term, score) in sorted(terms2.items())]\n", "\n", " # Compute similarity amongst documents\n", " distances[title1][title2] = nltk.cluster.util.cosine_distance(v1, v2)\n", "\n", " if title1 == title2:\n", " #print distances[title1][title2]\n", " continue\n", "\n", " if distances[title1][title2] < min_dist:\n", " (min_dist, most_similar) = (distances[title1][title2], title2)\n", "\n", " print(u'Most similar (score: {})\\n{}\\n{}\\n'.format(1-min_dist, title1, \n", " most_similar))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Generating a figure to visually display the cosine similarity between documents" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt # pip install matplotlib\n", "%matplotlib inline\n", "\n", "max_articles = 15\n", "\n", "# Get the titles - the keys to the 'distances' dict\n", "keys = list(distances.keys())\n", "\n", "# Extract the article titles\n", "titles = [l[:40].replace('\\n',' ')+'...' 
"          for l in list(distances.keys())]\n", "\n", "n_articles = len(titles) if len(titles) < max_articles else max_articles\n", "\n", "# Initialize the matrix of appropriate size to store similarity scores\n", "similarity_matrix = np.zeros((n_articles, n_articles))\n", "\n", "# Loop over the cells in the matrix\n", "for i in range(n_articles):\n", "    for j in range(n_articles):\n", "        # Retrieve the cosine distance between articles i and j\n", "        d = distances[keys[i]][keys[j]]\n", "\n", "        # Store the 'similarity' between articles i and j, defined as 1.0 - distance\n", "        similarity_matrix[i, j] = 1.0 - d\n", "\n", "\n", "# Create a figure and axes\n", "fig = plt.figure(figsize=(8,8), dpi=300)\n", "ax = fig.add_subplot(111)\n", "\n", "# Visualize the matrix with colored squares indicating similarity\n", "ax.matshow(similarity_matrix, cmap='Greys', vmin = 0.0, vmax = 0.2)\n", "\n", "# Set regular ticks, one for each article in the collection\n", "ax.set_xticks(range(n_articles))\n", "ax.set_yticks(range(n_articles))\n", "\n", "# Set the tick labels as the article titles\n", "ax.set_xticklabels(titles)\n", "ax.set_yticklabels(titles)\n", "\n", "# Rotate the labels on the x-axis by 90 degrees\n", "plt.xticks(rotation=90);" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Using NLTK to compute bigrams and collocations for a sentence" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import nltk\n", "\n", "sentence = \"Mr. Green killed Colonel Mustard in the study with the \" + \\\n", "           \"candlestick. Mr. Green is not a very nice fellow.\"\n", "\n", "print([bg for bg in nltk.ngrams(sentence.split(), 2)])\n", "txt = nltk.Text(sentence.split())\n", "\n", "txt.collocations()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using NLTK to compute collocations in a similar manner to the nltk.Text.collocations demo functionality" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import nltk\n", "from nltk.metrics import association\n", "\n", "# Load in human language data from wherever you've saved it\n", "DATA = 'resources/ch05-textfiles/ch05-timoreilly.json'\n", "data = json.loads(open(DATA).read())\n", "\n", "# Number of collocations to find\n", "\n", "N = 25\n", "\n", "all_tokens = [token for post in data for token in post['content'].lower().split()]\n", "\n", "finder = nltk.BigramCollocationFinder.from_words(all_tokens)\n", "finder.apply_freq_filter(2)\n", "finder.apply_word_filter(lambda w: w in nltk.corpus.stopwords.words('english'))\n", "scorer = association.BigramAssocMeasures.jaccard\n", "collocations = finder.nbest(scorer, N)\n", "\n", "for collocation in collocations:\n", "    c = ' '.join(collocation)\n", "    print(c)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 1 }