{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup\n", "\n", "Run all imports (lots and a bit ugly, i know) and define some helper functions." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T20:01:16.002489Z", "start_time": "2017-08-22T20:01:15.801349Z" }, "code_folding": [] }, "outputs": [], "source": [ "import sys\n", "import pandas as pd\n", "from gensim.models import Phrases, Word2Vec\n", "from gensim.models.phrases import Phraser\n", "from gensim.models.keyedvectors import KeyedVectors\n", "import nltk \n", "from nltk.corpus import stopwords\n", "import string\n", "import multiprocessing\n", "import itertools\n", "from collections import defaultdict, Mapping, Container\n", "import random\n", "from tqdm import tqdm\n", "from sys import getsizeof\n", "import pickle\n", "import re\n", "import networkx as nx\n", "from itertools import chain\n", "import matplotlib.pyplot as plt\n", "from scipy.spatial.distance import pdist, squareform\n", "from scipy.cluster.hierarchy import linkage, dendrogram\n", "import plotly.plotly as py\n", "from plotly.graph_objs import *\n", "import plotly.figure_factory as FF\n", "import plotly.graph_objs as go\n", "import plotly.tools\n", "import numpy as np\n", "from jinja2 import Template\n", "from sklearn.manifold import TSNE\n", "from sklearn.decomposition import PCA\n", "\n", "# set plotly creds\n", "plotly.tools.set_credentials_file(username='andrewm4894', api_key='YOUR_KEY_HERE')\n", "\n", "# config vars for bq\n", "project_id = \"MY_BQ_PROJECT\"\n", "private_key = \"C:/Users/Andrew/Documents/PATH_TO_YOUR_KEY/MY_KEY.json\"\n", "\n", "# set wider prints for pd\n", "pd.options.display.max_colwidth = 500\n", "\n", "# function to strip html\n", "TAG_RE = re.compile(r'<[^>]+>')\n", "def remove_tags(text):\n", " return TAG_RE.sub('', text)\n", "\n", "# function to print shape of df\n", "def print_dim(df):\n", " print(\"### df SHAPE = \"+str(df.shape)+\" ###\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get Data\n", "\n", "Our data happens to be stored in [Google Bigquery](https://cloud.google.com/bigquery/) - its awesome and i'd reccomend it to everyone!\n", "\n", "So this part might be a bit specific to Bigquery. We also use [jinja templating](http://jinja.pocoo.org/) here to pull from Bigquery one year at a time. This is because the pandas [read_gbq](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_gbq.html) module can only comfortably pull a certain amount of data in one go. So templating by year is essentially a natural way to shard our data pulls. " ] }, { "cell_type": "code", "execution_count": 87, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T13:46:54.814900Z", "start_time": "2017-08-21T13:44:42.586380Z" }, "code_folding": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2012\n", "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 134.1 MB\n", "\n", "Retrieving results...\n", " Got page: 1; 100% done. Elapsed 9.84 s.\n", "Got 17214 rows.\n", "\n", "Total time taken 9.96 s.\n", "Finished at 2017-08-21 14:44:53.\n", "2013\n", "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 134.1 MB\n", "\n", "Retrieving results...\n", " Got page: 1; 100% done. Elapsed 10.27 s.\n", "Got 17881 rows.\n", "\n", "Total time taken 10.37 s.\n", "Finished at 2017-08-21 14:45:04.\n", "2014\n", "Requesting query... 
ok.\n", "Query running...\n", "Query done.\n", "Processed: 134.1 MB\n", "\n", "Retrieving results...\n", " Got page: 1; 100% done. Elapsed 13.45 s.\n", "Got 18760 rows.\n", "\n", "Total time taken 13.55 s.\n", "Finished at 2017-08-21 14:45:18.\n", "2015\n", "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 134.1 MB\n", "\n", "Retrieving results...\n", " Got page: 1; 90% done. Elapsed 16.21 s.\n", " Got page: 2; 100% done. Elapsed 17.94 s.\n", "Got 23819 rows.\n", "\n", "Total time taken 18.07 s.\n", "Finished at 2017-08-21 14:45:37.\n", "2016\n", "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 134.1 MB\n", "\n", "Retrieving results...\n", " Got page: 1; 21% done. Elapsed 12.72 s.\n", " Got page: 2; 43% done. Elapsed 17.02 s.\n", " Got page: 3; 64% done. Elapsed 21.03 s.\n", " Got page: 4; 85% done. Elapsed 25.68 s.\n", " Got page: 5; 100% done. Elapsed 30.06 s.\n", "Got 19279 rows.\n", "\n", "Total time taken 30.21 s.\n", "Finished at 2017-08-21 14:46:08.\n", "2017\n", "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 134.1 MB\n", "\n", "Retrieving results...\n", " Got page: 1; 20% done. Elapsed 13.76 s.\n", " Got page: 2; 40% done. Elapsed 20.51 s.\n", " Got page: 3; 60% done. Elapsed 28.0 s.\n", " Got page: 4; 80% done. Elapsed 35.85 s.\n", " Got page: 5; 100% done. Elapsed 43.53 s.\n", " Got page: 6; 100% done. Elapsed 45.17 s.\n", "Got 21071 rows.\n", "\n", "Total time taken 45.32 s.\n", "Finished at 2017-08-21 14:46:54.\n", "### df SHAPE = (118024, 2) ###\n", " pid \\\n", "0 I8bC0jze55Ow0LZoJSyRYGr1K0M= \n", "1 Oj/mjv0XIpoJPaLX+XCQep65ToU= \n", "2 dOlSCJ9+2t1xLe/zB2Re+H+KEg4= \n", "3 JSUPPhMmW+EG7bk3SwjuGsRIOQs= \n", "4 VUu+U6+AoDr3dM7MjPldJzrom5g= \n", "5 Pf0w8g3yHVzCPGANWb1yFBih/g8= \n", "6 h/CArboWvz7qHyz9tAoqqvKVtGY= \n", "7 2TLuF1x7XFeXoMbgjPYwElsA6tc= \n", "8 gATed2Df6aFdRtMNLbaYC9ZEcnU= \n", "9 srPxWwm5B1Gw2mqvGTYZfzLfLNc= \n", "\n", " text \n", "0 'Bachelorette' Ashley Hebert: How I'm Getting In Shape For My Wedding .

Find out how you can get as fit as Ashley with all of her secret diet and workout tips.

\\n

Bachelorette Ashley Hebert is trying hard to stay fit for her wedding to J.P. Rose... \n", "1 'Teen Mom 2' Star Leah Messer's Dream Wedding — The Details Revealed .

Leah’s fiance Jeremy Calvert gushes about the type of ceremony they can’t wait to have, the guest list and even the honeymoon destination.

\\n

Teen Mom 2 star Leah Messer is happily engaged to her new fiance Leah nags her fiance Jeremy on Twitter about being late — and he responds angrily. Is there trouble in paradise?\\n

Teen Mom 2 star Leah Messer, 19, has been engaged to her fiance Jere... \n", "3 Christian Siriano Designs Wedding Dresses For Nordstrom .

The designer is branching out, into the wedding business! Now you can say ‘I Do’ in one of his coveted designs — get all the details on his latest fashion venture here.

\\n

Former Project Runway winner Christian Siriano, 26, has designed countless dr... \n", "4 Tips & Tricks For A Flawless Valentine’s Day Look Just Like Reese Witherspoon .

Do you want to create a sexy makeup look for a Valentine’s Day date, but you’re sick of the classic smokey eye? We have the perfect look for you! Celebrity makeup artist Jillian Dempsey gives you a how-to for a twist on the over-played smokey eye. Keep reading to see her beauty tricks and shop […]

\\n \n", "5 Courtney Robertson Tries On Wedding Dresses — Engaged To 'Bachelor' Ben? .

This is more proof that Courtney is engaged to ‘Bachelor’ Ben Flajnik. Check out 24 pics of Courtney trying on wedding gowns. Courtney Robertson seems like she’s getting ready to walk down the aisle with Bachelor Ben Flajnik. The 28-year-old model, who is rumored to be Ben’s fiancee, was spotted trying on wedding gowns at […]

\\n \n", "6 Courtney Robertson Shocks Nicki Sterling By Shopping For A Wedding Dress .

The thought that Ben Flajnik may actually be engaged to Courtney is quite unsettling for Nicki.

\\n

Nicki Sterling, who was the third eliminated contestant on The Bachelor, can’t believe that Bio:\nWill Smith is an American actor, producer, and rapper. In the late 1980s, he achieved some fame as a rapper under the name The Fresh Prince. In 1990, his popularity increased when he starred in the TV sitcom The Fresh Prince of Bel-Air. The show ran for six years and ended in 1996 on NBC with syndications on various networks. During his TV career, he also went into films, and then blockbuster movies. His most successful films have b... \n" ] } ], "source": [ "# use jinja2 template to run query for each year to avoid pd gbq crapping out.\n", "\n", "# query to pull from data stored in Google Bigquery\n", "qry_template = '''\n", "select pid, text from \n", "(\n", "select\n", " -- hash the id so it is still an id but more anonymous\n", " sha1(post_id_domain) as pid,\n", " post_content as text \n", "from \n", " hollywoodlife.post_content\n", "where \n", " post_content is not null\n", " and\n", " post_content<>''\n", " and\n", " post_date like '{{ post_year }}%'\n", "group by 1,2\n", "--limit 250 #uncomment when pulling in smaller sample\n", ")\n", "'''\n", "\n", "template_qry = Template( qry_template )\n", "\n", "loop_num = 0\n", "\n", "# loop through each year\n", "for year in range(2012 , 2018):\n", " \n", " print(year)\n", " \n", " # just track if first loop or not to handle the append\n", " loop_num += 1\n", " \n", " # render the template query for the year of the loop\n", " qry_rendered = template_qry.render( post_year = year )\n", " \n", " # pull data from google bigquery\n", " df_tmp = pd.read_gbq( qry_rendered, project_id, private_key = private_key )\n", "\n", " # if first loop then obviously nothing to append as we only have results for the first year\n", " if loop_num == 1:\n", " df = df_tmp\n", " # if not the first year then append this year to all others\n", " else:\n", " # union df's\n", " frames = [df, df_tmp]\n", " df = pd.concat(frames)\n", " \n", " # reset index (assign back, otherwise this is a no-op)\n", " df = df.reset_index(drop=True)\n", "\n", "print_dim(df)\n", "print(df.head(10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now save the data to csv so we don't need to rerun the pull from BigQuery each time." ] }, { "cell_type": "code", "execution_count": 88, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T13:52:46.840610Z", "start_time": "2017-08-21T13:52:40.875830Z" }, "collapsed": true }, "outputs": [], "source": [ "# save data to csv to be read in easily later.\n", "df.to_csv(\"input_data.csv\", encoding = \"utf-8\", index = False)\n", "\n", "# copy df to another dataframe\n", "df_orig = df\n", "del df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load in saved data from file so there's no need to pull from BQ each time. " ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T20:01:39.249427Z", "start_time": "2017-08-22T20:01:34.350944Z" } }, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidtext
0I8bC0jze55Ow0LZoJSyRYGr1K0M=&#039;Bachelorette&#039; Ashley Hebert: How I&#039;m Getting In Shape For My Wedding . <h3>Find out how you can get as fit as Ashley with all of her secret diet and workout tips.</h3>\\n<p><em><a href=\\\"http://www.hollywoodlife.com/tag/The-Bachelorette/\\\">Bachelorette</a></em> <strong><a href=\\\"http://www.hollywoodlife.com/tag/Ashley-Hebert/\\\">Ashley Hebert</a></strong> is trying hard to stay fit for her wedding to <strong><a href=\\\"http://www.hollywoodlife.com/tag/J.P.-Rosenbaum/\\\">J.P. Rose...
1Oj/mjv0XIpoJPaLX+XCQep65ToU=&#039;Teen Mom 2&#039; Star Leah Messer&#039;s Dream Wedding &#8212; The Details Revealed . <h3>Leah&#8217;s fiance Jeremy Calvert gushes about the type of ceremony they can&#8217;t wait to have, the guest list and even the honeymoon destination.</h3>\\n<p><em><a href=\\\"http://www.hollywoodlife.com/tag/Teen-Mom-2/\\\">Teen Mom 2</a></em> star <strong><a href=\\\"http://www.hollywoodlife.com/tag/Leah-Messer/\\\">Leah Messer</a></strong> is happily engaged to her new fiance <strong><a href=\\\"http://w...
2dOlSCJ9+2t1xLe/zB2Re+H+KEg4=Are &#039;Teen Mom&#039; Leah Messer &amp; Jeremy Calvert Already Fighting? . <h3>Leah nags her fiance Jeremy on Twitter about being late &#8212; and he responds angrily. Is there trouble in paradise?</h3>\\n<p><em><a href=\\\"http://www.hollywoodlife.com/tag/Teen-Mom-2/\\\">Teen Mom 2</a></em> star <strong><a href=\\\"http://www.hollywoodlife.com/tag/Leah-Messer/\\\">Leah Messer</a></strong>, 19, has been engaged to her fiance <strong><a href=\\\"http://www.hollywoodlife.com/tag/jeremy-calvert/\\\">Jere...
3JSUPPhMmW+EG7bk3SwjuGsRIOQs=Christian Siriano Designs Wedding Dresses For Nordstrom . <h3>The designer is branching out, into the wedding business! Now you can say &#8216;I Do&#8217; in one of his coveted designs &#8212; get all the details on his latest fashion venture here.</h3>\\n<p>Former <a href=\\\"http://www.hollywoodlife.com/2012/01/13/project-runway-all-stars-episode-2-recap-kara-janx-couture-challenge/\\\" target=\\\"_blank\\\">Project Runway</a> winner <strong>Christian Siriano</strong>, 26, has designed countless dr...
4VUu+U6+AoDr3dM7MjPldJzrom5g=Tips &amp; Tricks For A Flawless Valentine&#8217;s Day Look Just Like Reese Witherspoon . <p>Do you want to create a sexy makeup look for a Valentine&#8217;s Day date, but you&#8217;re sick of the classic smokey eye? We have the perfect look for you! Celebrity makeup artist Jillian Dempsey gives you a how-to for a twist on the over-played smokey eye. Keep reading to see her beauty tricks and shop [&hellip;]</p>\\n
\n", "
" ], "text/plain": [ " pid \\\n", "0 I8bC0jze55Ow0LZoJSyRYGr1K0M= \n", "1 Oj/mjv0XIpoJPaLX+XCQep65ToU= \n", "2 dOlSCJ9+2t1xLe/zB2Re+H+KEg4= \n", "3 JSUPPhMmW+EG7bk3SwjuGsRIOQs= \n", "4 VUu+U6+AoDr3dM7MjPldJzrom5g= \n", "\n", " text \n", "0 'Bachelorette' Ashley Hebert: How I'm Getting In Shape For My Wedding .

Find out how you can get as fit as Ashley with all of her secret diet and workout tips.

\\n

Bachelorette Ashley Hebert is trying hard to stay fit for her wedding to J.P. Rose... \n", "1 'Teen Mom 2' Star Leah Messer's Dream Wedding — The Details Revealed .

Leah’s fiance Jeremy Calvert gushes about the type of ceremony they can’t wait to have, the guest list and even the honeymoon destination.

\\n

Teen Mom 2 star Leah Messer is happily engaged to her new fiance Leah nags her fiance Jeremy on Twitter about being late — and he responds angrily. Is there trouble in paradise?\\n

Teen Mom 2 star Leah Messer, 19, has been engaged to her fiance Jere... \n", "3 Christian Siriano Designs Wedding Dresses For Nordstrom .

The designer is branching out, into the wedding business! Now you can say ‘I Do’ in one of his coveted designs — get all the details on his latest fashion venture here.

\\n

Former Project Runway winner Christian Siriano, 26, has designed countless dr... \n", "4 Tips & Tricks For A Flawless Valentine’s Day Look Just Like Reese Witherspoon .

Do you want to create a sexy makeup look for a Valentine’s Day date, but you’re sick of the classic smokey eye? We have the perfect look for you! Celebrity makeup artist Jillian Dempsey gives you a how-to for a twist on the over-played smokey eye. Keep reading to see her beauty tricks and shop […]

\\n " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load in saved data\n", "df_orig = pd.read_csv(\"input_data.csv\", encoding = \"utf-8\")\n", "df_orig.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Text Preprocessing: Simple Cleaning\n", "\n", "We now have a data frame where our blob of text is often full of embedded html and has not been cleansed in any particular way. \n", "\n", "Here we will:\n", "\n", "* Strip out all html and essentially render the text as it would read. \n", "* Lowercase everything. \n", "* Replace or remove various specific characters that might make things harder on the model. \n", "\n", "p.s. Using [tqdm](https://github.com/tqdm/tqdm) everywhere in this notebook to get progress bar's on loops. Find it really satisfying for some reason :)" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T13:52:59.582356Z", "start_time": "2017-08-21T13:52:58.502108Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████| 118024/118024 [00:01<00:00, 117183.60it/s]\n" ] } ], "source": [ "# set up list for collecting cleaned content into\n", "results_all = []\n", "\n", "# pull the df into a list to loop through (seems quicker then looping through the pd df itself)\n", "raw_data = list(zip(df_orig.pid,df_orig.text))\n", "\n", "# loop through each row of df and clean strip out the html tags\n", "for pid, text in tqdm(raw_data):\n", " # use try block to ignore errors in cleaning, should be fine as have lots of data\n", " try:\n", " result = [pid, remove_tags(text)]\n", " results_all.append(result)\n", " except Exception as e: \n", " # do nothing on error, we are ok to ignore any posts we can't clean up for whatever reason\n", " #print(e)\n", " pass" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now do some kinda ugly and brute force cleaning...\n", "\n", "(I'm sure there is much more elegant and generalized ways to do this but meh) " ] }, { "cell_type": "code", "execution_count": 112, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T13:58:28.736741Z", "start_time": "2017-08-21T13:58:07.747394Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "### df SHAPE = (118024, 2) ###\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidtext
0I8bC0jze55Ow0LZoJSyRYGr1K0M=bachelorette ashley hebert how i m getting in shape for my wedding . find out how you can get as fit as ashley with all of her secret diet and workout tips. bachelorette ashley hebert is trying hard to stay fit for her wedding to j.p . rosenbaum that will most likely take place later this year . ashley shares her wedding workout and diet tips with in touch magazine.
1Oj/mjv0XIpoJPaLX+XCQep65ToU=teen mom star leah messer s dream wedding the details revealed . leah fiance jeremy calvert gushes about the type of ceremony they can t wait to have the guest list and even the honeymoon destination. teen mom star leah messer is happily engaged to her new fiance jeremy calvert . and while she is pregnant reportedly with twins leah 19 and jeremy 22 are already planning their dream wedding.
2dOlSCJ9+2t1xLe/zB2Re+H+KEg4=are teen mom leah messer and jeremy calvert already fighting ? . leah nags her fiance jeremy on twitter about being late and he responds angrily . is there trouble in paradise ? teen mom star leah messer 19 has been engaged to her fiance jeremy calvert 22 for less than a month now and it seems that they are already starting to fight.
3JSUPPhMmW+EG7bk3SwjuGsRIOQs=christian siriano designs wedding dresses for nordstrom . the designer is branching out into the wedding business . now you can say i do in one of his coveted designs get all the details on his latest fashion venture here. former project runway winner christian siriano 26 has designed countless dresses for a-listers like heidi klum fergie taylor swift and sarah hyland to name a few . aside from his namesake collection the designer plans to branch into bridal for nordstrom where we re sure to...
4VUu+U6+AoDr3dM7MjPldJzrom5g=tips and tricks for a flawless valentine day look just like reese witherspoon . do you want to create a sexy makeup look for a valentine day date but you re sick of the classic smokey eye ? we have the perfect look for you . celebrity makeup artist jillian dempsey gives you a how-to for a twist on the over-played smokey eye . keep reading to see her beauty tricks and shop
\n", "
" ], "text/plain": [ " pid \\\n", "0 I8bC0jze55Ow0LZoJSyRYGr1K0M= \n", "1 Oj/mjv0XIpoJPaLX+XCQep65ToU= \n", "2 dOlSCJ9+2t1xLe/zB2Re+H+KEg4= \n", "3 JSUPPhMmW+EG7bk3SwjuGsRIOQs= \n", "4 VUu+U6+AoDr3dM7MjPldJzrom5g= \n", "\n", " text \n", "0 bachelorette ashley hebert how i m getting in shape for my wedding . find out how you can get as fit as ashley with all of her secret diet and workout tips. bachelorette ashley hebert is trying hard to stay fit for her wedding to j.p . rosenbaum that will most likely take place later this year . ashley shares her wedding workout and diet tips with in touch magazine. \n", "1 teen mom star leah messer s dream wedding the details revealed . leah fiance jeremy calvert gushes about the type of ceremony they can t wait to have the guest list and even the honeymoon destination. teen mom star leah messer is happily engaged to her new fiance jeremy calvert . and while she is pregnant reportedly with twins leah 19 and jeremy 22 are already planning their dream wedding. \n", "2 are teen mom leah messer and jeremy calvert already fighting ? . leah nags her fiance jeremy on twitter about being late and he responds angrily . is there trouble in paradise ? teen mom star leah messer 19 has been engaged to her fiance jeremy calvert 22 for less than a month now and it seems that they are already starting to fight. \n", "3 christian siriano designs wedding dresses for nordstrom . the designer is branching out into the wedding business . now you can say i do in one of his coveted designs get all the details on his latest fashion venture here. former project runway winner christian siriano 26 has designed countless dresses for a-listers like heidi klum fergie taylor swift and sarah hyland to name a few . aside from his namesake collection the designer plans to branch into bridal for nordstrom where we re sure to... \n", "4 tips and tricks for a flawless valentine day look just like reese witherspoon . do you want to create a sexy makeup look for a valentine day date but you re sick of the classic smokey eye ? we have the perfect look for you . celebrity makeup artist jillian dempsey gives you a how-to for a twist on the over-played smokey eye . 
keep reading to see her beauty tricks and shop " ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create a df with outputs\n", "df = pd.DataFrame(results_all,columns=['pid','text'])\n", "\n", "# remove some messy artifacts\n", "df.text = df.text.str.replace(\"Bio:\\n\",\" \")\n", "df.text = df.text.str.replace('\\]\\[\\\"',\"\")\n", "df.text = df.text.str.replace(\"’s\",\" \")\n", "df.text = df.text.str.replace(\"‘s\",\" \")\n", "df.text = df.text.str.replace(\"’\",\" \")\n", "df.text = df.text.str.replace(\"‘\",\" \")\n", "df.text = df.text.str.replace(\"“\",\" \")\n", "df.text = df.text.str.replace(\"”\",\" \")\n", "df.text = df.text.str.replace(\"—\",\" \")\n", "df.text = df.text.str.replace(\"'\",\" \")\n", "df.text = df.text.str.replace(\"–\",\" \")\n", "df.text = df.text.str.replace(\"…\",\" \")\n", "\n", "# do some string cleaning directly on the df\n", "df.text = df.text.str.lower() # set all to lower \n", "df.text = df.text.str.replace(\"'s \",\" \") # remove 's from end of words\n", "df.text = df.text.str.replace(\"'s,\",\",\") # remove 's from end of words\n", "df.text = df.text.str.replace(' \\d+ ', ' ') # replace all words that are numbers - they wont be useful for us\n", "df.text = df.text.str.replace(\"' \",\" \") # remove ' \n", "df.text = df.text.str.replace(\" '\",\" \")\n", "df.text = df.text.str.replace(\", \",\" , \")\n", "df.text = df.text.str.replace(\",\",\"\")\n", "df.text = df.text.str.replace(\"!\",\".\")\n", "df.text = df.text.str.replace(\"’s \",\" \")\n", "df.text = df.text.str.replace(\"’s,\",\",\")\n", "df.text = df.text.str.replace(\"’s\",\"\")\n", "df.text = df.text.str.replace('\"','')\n", "df.text = df.text.str.replace(' -- ',' ')\n", "df.text = df.text.str.replace(' ‘',' ')\n", "df.text = df.text.str.replace('’ ',' ')\n", "df.text = df.text.str.replace(': ',' ')\n", "df.text = df.text.str.replace('—',' ')\n", "df.text = df.text.str.replace(r'(\\[.*\\])', ' ') # remove anything inside [] as is usually leftover html junk\n", "df.text = df.text.str.replace('\\n\\n','\\n')\n", "df.text = df.text.str.replace('\\n',' ') # remove newlines within each article as will introduce dirty data later if left in\n", "df.text = df.text.str.replace('❤','love')\n", "df.text = df.text.str.replace('(','( ')\n", "df.text = df.text.str.replace(')',' )')\n", "df.text = df.text.str.replace('\\. ',' . ')\n", "df.text = df.text.str.replace('“',' ')\n", "df.text = df.text.str.replace('”',' ')\n", "df.text = df.text.str.replace('\\xa0',' ')\n", "df.text = df.text.str.replace(' ',' ')\n", "df.text = df.text.str.replace(r'(https://www.instagram.com.*? )', 'instagram ')\n", "df.text = df.text.str.replace(r'(https://instagram.com.*/? )', 'instagram ')\n", "df.text = df.text.str.replace(r'(https://www.twitter.com.*? )', 'twitter ')\n", "df.text = df.text.str.replace(r'(https://twitter.com.*? )', 'twitter ')\n", "df.text = df.text.str.replace(r'(https://www.youtube.com.*? 
)', 'youtube ')\n", "df.text = df.text.str.replace('?',' ?')\n", "df.text = df.text.str.replace('\\\\\\\\n',' ')\n", "df.text = df.text.str.replace('&','and')\n", "df.text = df.text.str.replace('\\\\\\ ',' ')\n", "df.text = df.text.str.replace('’ ',' ’ ')\n", "df.text = df.text.str.replace(' ‘',' ‘ ')\n", "df.text = df.text.str.replace(' pic ',' ')\n", "df.text = df.text.str.replace(' pics ',' ')\n", "\n", "# replace any double white spaces we might be left with\n", "df.text = df.text.str.replace(' ',' ')\n", "\n", "\n", "print_dim(df)\n", "df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pull a random sample of clean and original text to just eyeball if the cleaning is generally working as expected and not leaving anything else obvious worth dealing with. " ] }, { "cell_type": "code", "execution_count": 158, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T12:03:19.360101Z", "start_time": "2017-08-22T12:03:19.306063Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['CM/I7grvh6tnicUrOoy4J7zvZ8w=']\n", "\n", "## CLEANED ##############################\n", "\n", "93192 perrie edwards has a really hot dad & the internet is freaking out about it . so sexy . now we see where perrie edwards got her good looks from . the little mix singer shared a of her dad and let just say the internet isn't freaking out over nothing . he a stud muffin . omg. dads may not be everyone thing but we have to say perrie edwards' dad is incredibly hot . the singer shared a of her dreamy papa on instagram during her 23rd birthday weekend on july 10 and her fans went insane . see the...\n", "Name: text, dtype: object\n", "\n", "## ORIGINAL ##############################\n", "\n", "93192 Perrie Edwards Has A Really Hot Dad & The Internet Is Freaking Out About It .

So sexy! Now we see where Perrie Edwards got her good looks from. The Little Mix singer shared a pic of her dad, and let's just say, the Internet isn't freaking out over nothing! He's a stud muffin.

\\nOMG! Dads may not be everyone's thing, but we have to say Perrie Edwards' dad is incredibly hot! The singer shared a pic of her dreamy papa on Instagra...\n", "Name: text, dtype: object\n" ] } ], "source": [ "# pull a random sample article to look at cleaning results\n", "samp_ind = list(df.pid.sample(1))\n", "#samp_ind = list(['I8bC0jze55Ow0LZoJSyRYGr1K0M='])\n", "print(samp_ind)\n", "print('\\n## CLEANED ##############################\\n')\n", "print(str(df.loc[df['pid'].isin(samp_ind)]['text']))\n", "print('\\n## ORIGINAL ##############################\\n')\n", "print(str(df_orig.loc[df_orig['pid'].isin(samp_ind)]['text']))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Text Preprocessing: Phrase Creation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create sentences with bigram phrases flagged." ] }, { "cell_type": "code", "execution_count": 113, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T17:03:59.455280Z", "start_time": "2017-08-21T14:17:50.652653Z" }, "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|████████████████████████████████| 118024/118024 [2:44:23<00:00, 11.97it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "### The number of sentences is 118024\n", "### The number of words is 11568437\n" ] } ], "source": [ "# get list of documents\n", "documents = list(df['text'])\n", "# get a list of single sentences\n", "#sentences = [s.split('.',1)[0].strip() for s in documents]\n", "sentences = documents # use the whole document as one big sentence (uncomment the line above to use just the first sentence instead)\n", "\n", "# create sentence stream iterator, removing stopwords and punctuation, also remove small sentences\n", "sentence_stream = [[str(i).translate(str.maketrans('','',string.punctuation)) for i in sentence.split(\" \") if i not in stopwords.words('english')] for sentence in tqdm(sentences)]\n", "\n", "# remove small sentences as not much to be learned from them \n", "#sentence_stream = [sentence for sentence in sentence_stream if len(sentence) > 3 ] # only need this if passing sentences as opposed to the full doc\n", "\n", "# create bigram phrases\n", "phrases = Phrases(sentence_stream, min_count=250)\n", "bigram = Phraser(phrases)\n", "\n", "# create list of sentences to feed into the word2vec model\n", "sentences = list(bigram[sentence_stream])\n", "words = [i for j in sentences for i in j]\n", "\n", "# save sentences object\n", "with open('sentences.pickle', 'wb') as handle:\n", " pickle.dump(sentences, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", "\n", "print(\"### The number of sentences is \" + str(len(sentences)))\n", "print(\"### The number of words is \" + str(len(words)))" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "ExecuteTime": { "end_time": "2017-08-19T21:04:33.932531Z", "start_time": "2017-08-19T21:04:30.024749Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "### The number of sentences is 41448\n", "### The number of words is 5863883\n" ] } ], "source": [ "# if loading in saved sentences object\n", "sentences = pickle.load( open( \"sentences.pickle\", \"rb\" ) )\n", "words = [i for j in sentences for i in j]\n", "\n", "print(\"### The number of sentences is \" + str(len(sentences)))\n", "print(\"### The number of words is \" + str(len(words)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's pull a sample of some sentences to see what we now have and if it is looking ok for sending into the actual model
building stage." ] }, { "cell_type": "code", "execution_count": 116, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T17:16:51.519842Z", "start_time": "2017-08-21T17:16:51.504844Z" } }, "outputs": [ { "data": { "text/plain": [ "['kourtney_kardashian forces khloe move divorce kuwtk kuwtk returns khloe_kardashian dealing heavy material literally kourtney forces khloe pack house shared lamar_odom divorce time move keeping_kardashians returns e sunday june 98c brand_new episodes cameras follow ',\n", " 'jared fogle exsubway spokesman plead guilty child porn charges report whoa jared fogle man known face subway expected plead guilty possession child pornography charges according multiple outlets ongoing federal investigation jared fogle 37 reportedly accept deal aug plead guilty charges possession child pornography comes couple weeks alleged ',\n", " 'miley_cyrus refuses release album year patrick way sorry miley fans bangerz hitmaker refusing release album sad pdafilled relationship patrick getting way music hollywoodlifecom_learned exclusive_details miley dropping new_album year bring tissues miley_cyrus 22 ',\n", " 'kim_kardashian robbery planned someone inside entourage shocking report interesting details come light parisian officials get closer finding behind kim_kardashian october robbery wont believe update individual limo company kardashian family used suspected kim_kardashian robbery may inside job according jan report french newspaper le monde knew people arrested important detail emerged one suspects works car company kim used last person driven day attack whoa kim_kardashian robbery suspects french police say individual could given robbers inside information kim staying according newspaper m6 tv also reports suspects may touch kardashian family scary multiple french outlets reporting extent driver involvement attack unclear french police seem think long story short gave robbers least inside info keeping_kardashians star whereabouts terrifying police get closer jean veil kim french attorney said happy reassured developments case were_glad hear source previously reiterated hollywoodlifecom_exclusively kim relieved paris police making headway arrests nice surprise veil also reportedly told france lexpress magazine one hand perhaps mean jewels recovered hand puts end disgraceful speculation people thought clever pretend robbery setup publicity stunt madame kardashian course criminals nabbed kim might face court means nightmare far kim absolutely terrified possibility going court starting entire process source reveals feel hollywoodlifers_think kim case closed soon hope so',\n", " 'kylie_jenner flaunts boobs curvaceous figure waist trainer selfie – got flaunt kylie_jenner put body display another sizzling selfie youngest keeping_kardashians star posed revealing white shirt waist trainer showing family trademark curves kylie_jenner 18 gunning crown kardashian waist trainer ']" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pull some random sentences to see if looking ok\n", "n_samples = 5\n", "sample = random.sample(range(0, len(sentences)), n_samples)\n", "[' '.join(sentences[i]) for i in sample]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get some counts etc that might be useful." 
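As a quicker alternative to the dictionary loop in the next cell, `collections.Counter` gives the same counts in one pass and has a handy `most_common` method. A minimal sketch (assuming the `sentences` list of token lists from above):

```python
from collections import Counter
from itertools import chain

# count every token across all tokenised sentences in one pass
word_counts_alt = Counter(chain.from_iterable(sentences))

# 20 most frequent words
print(word_counts_alt.most_common(20))

# 20 rarest words (ties come back in an arbitrary order)
print(word_counts_alt.most_common()[-20:])
```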
] }, { "cell_type": "code", "execution_count": 117, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T17:17:02.617923Z", "start_time": "2017-08-21T17:16:51.522845Z" }, "code_folding": [], "collapsed": true }, "outputs": [], "source": [ "# get a big list of all words\n", "words_long = list(itertools.chain(*sentences))\n", "# create a big string\n", "words_string = ' '.join(words_long)\n", "# clean up\n", "del words_long\n", "# get word counts into a dict\n", "word_counts = defaultdict(int)\n", "for word in words_string.split():\n", " word_counts[word] += 1" ] }, { "cell_type": "code", "execution_count": 118, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T17:17:02.879253Z", "start_time": "2017-08-21T17:17:02.620907Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "new 49995\n", "one 47279\n", "see 46184\n", "get 36628\n", "like 33597\n", "love 33249\n", "look 31692\n", "time 31661\n", "show 30224\n", "back 28398\n", "also 25550\n", "two 25377\n", "first 25118\n", "know 25030\n", "going 24923\n", "even 24220\n", "think 23700\n", "may 23439\n", "said 23426\n", "watch 22827\n" ] } ], "source": [ "# print top 20 words\n", "for w in sorted(word_counts, key=word_counts.get, reverse=True)[:20]:\n", " print(w, word_counts[w])" ] }, { "cell_type": "code", "execution_count": 119, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T17:17:03.177643Z", "start_time": "2017-08-21T17:17:02.884758Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "syndications 1\n", "zampino 1\n", "follieri 1\n", "tuohy 1\n", "bardo 1\n", "jordangreen 1\n", "popsynthpop 1\n", "brotherlyact 1\n", "javadd 1\n", "mullingar 1\n", "constantinova 1\n", "dobreva 1\n", "bulgariancanadian 1\n", "apr6 1\n", "planetsomerhalder 1\n", "penns 1\n", "tappahannock 1\n", "safarti 1\n", "b’day 1\n", "am…sasha 1\n" ] } ], "source": [ "# print bottom 20 words\n", "for w in sorted(word_counts, key=word_counts.get, reverse=False)[:20]:\n", " print(w, word_counts[w])" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Build Model\n", "\n", "Now we are ready to pass our sentences to [gensim.Word2Vec](https://radimrehurek.com/gensim/models/word2vec.html) to build our own model. \n", "\n", "There are a few key hyper parameters we need to build the model. We are not doing anything fancy like cross validation here. Instead i did a few manual trial and error builds on a smaller sample of sentences until i found a paramter set that generlally made sense to me given the size of the datasrt and our focus. " ] }, { "cell_type": "code", "execution_count": 120, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T17:19:02.571044Z", "start_time": "2017-08-21T17:17:03.182645Z" }, "collapsed": true }, "outputs": [], "source": [ "# train model\n", "model = Word2Vec( \n", " sentences = sentences, \n", " size = 100, \n", " min_count = 250, \n", " window = 10, # use a largish window since passing full document as sentence \n", " iter = 10, \n", " workers = multiprocessing.cpu_count()\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save Model\n", "\n", "Once the model is built we save it to disk so can be loaded back in later for exploration without needing to rebuild each time." 
] }, { "cell_type": "code", "execution_count": 121, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T17:19:02.754922Z", "start_time": "2017-08-21T17:19:02.573045Z" }, "collapsed": true }, "outputs": [], "source": [ "# save model to disk\n", "model.save(\"celeb_word2vec_model\")\n", "\n", "#If you’re finished training a model (=no more updates, only querying), then switch to the gensim.models.KeyedVectors instance in wv\n", "word_vectors = model.wv\n", "#del model\n", "\n", "# save word vectors to disk\n", "word_vectors.save(\"word_vectors\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Model\n", "\n", "Load in the saved model. We can run from here if exploring an already trained and saved model." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T20:02:41.147758Z", "start_time": "2017-08-22T20:02:40.821546Z" }, "collapsed": true }, "outputs": [], "source": [ "# load saved model\n", "model = Word2Vec.load('celeb_word2vec_model')\n", "\n", "# load saved word vectors\n", "word_vectors = KeyedVectors.load('word_vectors')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save Vectors\n", "\n", "We will save the raw vectors out to a nice and easy, human readable, text file. Then read back in the wide matrix of vectors into a pandas dataframe in order to transform it into a long format later for exploration in a [Tableau Public workbook](https://public.tableau.com/profile/andrew5416#!/vizhome/word_vector_explorer/WordVectorExplorer)." ] }, { "cell_type": "code", "execution_count": 123, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T17:19:03.877993Z", "start_time": "2017-08-21T17:19:02.943695Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(6040, 101)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
word123456789...919293949596979899100
0NaN0.423812-0.873024-1.076316-0.523255-0.069986-0.8127390.6328790.1615280.282792...-0.371600-0.553812-0.671738-0.336277-0.3645110.176831-0.2949650.629585-0.3764670.779368
1new2.319383-1.351134-0.5837130.4065624.404606-3.1651662.232082-0.4463042.595640...2.7605892.519782-0.0462843.112629-0.7852551.850350-0.5433550.321961-2.8999332.350732
2one-0.025912-1.830144-0.779826-1.594398-0.478641-1.684386-0.148587-1.6444061.716436...-0.068255-0.3131970.801451-1.983233-0.122030-0.0160980.392162-0.247242-1.243877-1.139465
3see-0.932137-1.764545-5.519857-0.801152-0.065875-0.366409-0.401208-1.6668640.933654...-0.520679-0.7605190.1476500.9230390.284766-1.2035661.997292-0.284277-4.810210-1.662077
4get-1.0380890.575331-2.3495741.1343121.008488-1.1613990.614903-4.204766-0.234601...-0.178508-1.3933330.1068760.3547011.1452871.4718321.583619-0.773887-2.289845-0.899614
\n", "

5 rows × 101 columns

\n", "
" ], "text/plain": [ " word 1 2 3 4 5 6 7 \\\n", "0 NaN 0.423812 -0.873024 -1.076316 -0.523255 -0.069986 -0.812739 0.632879 \n", "1 new 2.319383 -1.351134 -0.583713 0.406562 4.404606 -3.165166 2.232082 \n", "2 one -0.025912 -1.830144 -0.779826 -1.594398 -0.478641 -1.684386 -0.148587 \n", "3 see -0.932137 -1.764545 -5.519857 -0.801152 -0.065875 -0.366409 -0.401208 \n", "4 get -1.038089 0.575331 -2.349574 1.134312 1.008488 -1.161399 0.614903 \n", "\n", " 8 9 ... 91 92 93 94 \\\n", "0 0.161528 0.282792 ... -0.371600 -0.553812 -0.671738 -0.336277 \n", "1 -0.446304 2.595640 ... 2.760589 2.519782 -0.046284 3.112629 \n", "2 -1.644406 1.716436 ... -0.068255 -0.313197 0.801451 -1.983233 \n", "3 -1.666864 0.933654 ... -0.520679 -0.760519 0.147650 0.923039 \n", "4 -4.204766 -0.234601 ... -0.178508 -1.393333 0.106876 0.354701 \n", "\n", " 95 96 97 98 99 100 \n", "0 -0.364511 0.176831 -0.294965 0.629585 -0.376467 0.779368 \n", "1 -0.785255 1.850350 -0.543355 0.321961 -2.899933 2.350732 \n", "2 -0.122030 -0.016098 0.392162 -0.247242 -1.243877 -1.139465 \n", "3 0.284766 -1.203566 1.997292 -0.284277 -4.810210 -1.662077 \n", "4 1.145287 1.471832 1.583619 -0.773887 -2.289845 -0.899614 \n", "\n", "[5 rows x 101 columns]" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# save raw vectors to a text file for exploration later in Tableau\n", "model.wv.save_word2vec_format('celeb_word2vec_wide.txt', binary=False)\n", "\n", "# read back in the wide file into a pandas df\n", "celeb_word2vec_wide = pd.read_csv(\"celeb_word2vec_wide.txt\",sep=' ', skiprows=1, header=None)\n", "# rename cols\n", "celeb_word2vec_wide.rename(columns = {0:'word'}, inplace = True)\n", "\n", "# print dims of the wide df\n", "print(celeb_word2vec_wide.shape)\n", "# looks at the df\n", "celeb_word2vec_wide.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we go from wide format to long format as tools like Tableau prefer this." ] }, { "cell_type": "code", "execution_count": 124, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T17:19:09.301177Z", "start_time": "2017-08-21T17:19:03.881494Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(604000, 3)\n", " word vector_element vector_element_magnitude\n", "0 NaN 1 0.423812\n", "1 new 1 2.319383\n", "2 one 1 -0.025912\n", "3 see 1 -0.932137\n", "4 get 1 -1.038089\n" ] } ], "source": [ "# go from wide to long format using the melt() function\n", "celeb_word2vec_long = celeb_word2vec_wide.melt(id_vars=['word'])\n", "# rename cols\n", "celeb_word2vec_long.rename(columns = {'variable':'vector_element', 'value':'vector_element_magnitude'}, inplace = True)\n", "\n", "# look at what we have\n", "print(celeb_word2vec_long.shape)\n", "print(celeb_word2vec_long.head())\n", "\n", "# save the long format back out to a text file\n", "celeb_word2vec_long.to_csv(\"celeb_word2vec_long.txt\",sep=' ',index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Explore Model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lets just look at a specific vector to see what we actually have - basically an array of positive and negative numbers, all on a similar scale." 
] }, { "cell_type": "code", "execution_count": 159, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T12:05:07.422119Z", "start_time": "2017-08-22T12:05:07.403099Z" } }, "outputs": [ { "data": { "text/plain": [ "array([-0.19621158, -1.70824993, 4.07912397, -4.13144636, 0.70407969,\n", " -0.06511977, -1.10553288, -3.4984827 , -2.40557313, -0.71576977,\n", " 3.52038622, 0.21764068, -0.54872227, -0.64682132, 0.73169208,\n", " -4.50421715, -1.66858566, -0.32605866, -7.28290606, 0.88032979,\n", " 4.97944689, -2.33671474, -2.03136921, 2.16170073, -0.51272494,\n", " -0.39437258, -1.62846637, -1.97175133, 3.41039515, 1.69589567,\n", " -1.25033355, 1.37241948, 0.20805676, 1.65922272, 2.03981185,\n", " 2.12722635, 1.26723588, 0.37178808, 1.61489332, -1.76117992,\n", " 0.75137532, 1.37545943, -0.70764965, 1.47992682, 1.53792179,\n", " 1.55739236, -1.84939837, 0.14028606, 3.16268826, -0.42398441,\n", " -4.79281712, 1.7875241 , 1.3778615 , 1.32047188, -3.06647325,\n", " -0.60717714, -1.01766086, -1.81914115, -1.82842767, 3.33063555,\n", " 0.33426681, -5.13528776, 1.3301748 , 2.58258796, 0.77017248,\n", " -0.89552003, -0.81426936, 4.43586302, -3.69974875, -3.14295745,\n", " 2.82621956, -0.70075619, 0.80255145, -2.20000005, 1.47386432,\n", " 3.01426864, 4.57765579, 2.4331708 , 0.68835354, -2.53468132,\n", " -2.43935299, -0.49032855, -3.15479589, 1.21918011, 0.33573633,\n", " -0.26256818, -2.80820608, 0.97537279, -1.99493766, -2.91135526,\n", " 1.83211803, -4.98132086, 0.98330897, 1.1170578 , -0.02211688,\n", " 1.62615228, -2.11096215, -0.23426078, -1.66096401, -0.82593608], dtype=float32)" ] }, "execution_count": 159, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get a vector\n", "model.wv['justin_bieber']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now lets do the obligatory bit of vector arithmetic to help sense check some of our results. \n", "\n", "Great blog post [here](https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/) that goes into this in more detail.\n", "\n", "What we ask the vectors below is basically:\n", "\n", "\"kim_kardashian\" - \"kanye_west\" + \"brad_pitt\" = ?\n", "\n", "Another way to think of this is \"kanye_west\" is to \"kim_kardashian\" as \"brad_pitt\" is to ?\n", "\n", "(Spoiler alert - ideally we'd like ? to be \"angelina_jolie\" to show the model has in some way understood the similar marriage relationship between the two pairs.)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T19:45:21.549480Z", "start_time": "2017-08-22T19:45:21.477429Z" } }, "outputs": [ { "data": { "text/plain": [ "[('angelina_jolie', 0.7405589818954468)]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# some arithmetic e.g. model.wv.most_similar(positive=['woman', 'king'], negative=['man'])\n", "model.wv.most_similar(positive=['kim_kardashian', 'brad_pitt'], negative=['kanye_west'])[0:1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can also use the vectors to pick odd one out." 
] }, { "cell_type": "code", "execution_count": 127, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T17:19:09.424589Z", "start_time": "2017-08-21T17:19:09.415583Z" } }, "outputs": [ { "data": { "text/plain": [ "'drake'" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.wv.doesnt_match(\"kim chloe kylie drake\".split())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can also compute the similarity between two sets of words." ] }, { "cell_type": "code", "execution_count": 154, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T09:56:00.233966Z", "start_time": "2017-08-22T09:56:00.198920Z" } }, "outputs": [ { "data": { "text/plain": [ "0.77569593611111409" ] }, "execution_count": 154, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.n_similarity(['kim_kardashian', 'khloe_kardashian'], ['kourtney_kardashian', 'kylie_jenner'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And finally we can easily look at the similarity between two vectors." ] }, { "cell_type": "code", "execution_count": 155, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T09:56:27.323522Z", "start_time": "2017-08-22T09:56:27.309513Z" } }, "outputs": [ { "data": { "text/plain": [ "0.6208901907836144" ] }, "execution_count": 155, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.wv.similarity('khloe_kardashian', 'kourtney_kardashian')" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2017-06-12T22:44:08.184697Z", "start_time": "2017-06-12T22:44:08.171689Z" }, "collapsed": true }, "source": [ "## Get Graph\n", "\n", "Next we will build a graph of relationships between words based on a seed word. \n", "\n", "So the idea here is to take a person, find their N nearest neighbours, and for each of them in turn find thier own neighbours and on for S steps. \n", "\n", "At the end of this the idea is that we will have something representing some notion of a network graph with the original seed word at the center." 
] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T20:03:39.966221Z", "start_time": "2017-08-22T20:03:39.790095Z" } }, "outputs": [ { "data": { "text/plain": [ "[['justin_bieber', 'justin', 1],\n", " ['justin', 'biebs', 2],\n", " ['biebs', 'justin', 3],\n", " ['biebs', 'jb', 3],\n", " ['biebs', 'justin_bieber', 3],\n", " ['biebs', 'sel', 3],\n", " ['biebs', 'beliebers', 3],\n", " ['biebs', 'selena', 3],\n", " ['biebs', 'bieber', 3],\n", " ['biebs', 'sofia', 3],\n", " ['biebs', 'pop_star', 3],\n", " ['biebs', 'selena_gomez', 3],\n", " ['justin', 'jb', 2],\n", " ['jb', 'justin', 3],\n", " ['jb', 'biebs', 3],\n", " ['jb', 'justin_bieber', 3],\n", " ['jb', 'sofia', 3],\n", " ['jb', 'sofia_richie', 3],\n", " ['jb', 'beliebers', 3],\n", " ['jb', 'bieber', 3]]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "relationships = []\n", "\n", "seed_word = 'justin_bieber'\n", "topn = 10\n", "\n", "for a in model.most_similar(seed_word , topn = topn):\n", " to_node1 = a[0]\n", " relationships.append([seed_word , to_node1 , 1])\n", " for b in model.most_similar(to_node1 , topn = topn):\n", " to_node2 = b[0]\n", " relationships.append([to_node1 , to_node2, 2])\n", " for c in model.most_similar(to_node2 , topn = topn):\n", " to_node3 = c[0]\n", " relationships.append([to_node2 , to_node3 , 3])\n", "# for d in model.most_similar(to_node3 , topn = topn):\n", "# to_node4 = d[0]\n", "# relationships.append([to_node3 , to_node4 , 4])\n", "# for e in model.most_similar(to_node4 , topn = topn):\n", "# to_node5 = e[0]\n", "# relationships.append([to_node4 , to_node5 , 5])\n", "\n", "relationships[0:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As we are mostly interested in people type relationships, we will do some additional post processing to try clean things up a bit.\n", "\n", "Mainly we will restrict ourselves to words that have a \"_\" in them and thus a going to be the phrases we identifed earlier that are more likley to be people. 
" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T20:03:52.857971Z", "start_time": "2017-08-22T20:03:52.684847Z" } }, "outputs": [ { "data": { "text/plain": [ "[('justin_bieber', 'selena_gomez'),\n", " ('justin_bieber', 'sofia_richie'),\n", " ('justin_bieber', 'austin_mahone'),\n", " ('selena_gomez', 'justin_bieber'),\n", " ('selena_gomez', 'zayn_malik'),\n", " ('justin_bieber', 'selena_gomez'),\n", " ('justin_bieber', 'sofia_richie'),\n", " ('justin_bieber', 'austin_mahone'),\n", " ('pop_star', 'justin_bieber'),\n", " ('selena_gomez', 'justin_bieber'),\n", " ('selena_gomez', 'zayn_malik'),\n", " ('justin_bieber', 'selena_gomez'),\n", " ('selena_gomez', 'justin_bieber'),\n", " ('justin_bieber', 'selena_gomez'),\n", " ('justin_bieber', 'sofia_richie'),\n", " ('justin_bieber', 'austin_mahone'),\n", " ('selena_gomez', 'zayn_malik'),\n", " ('zayn_malik', 'perrie_edwards'),\n", " ('zayn_malik', 'harry_styles'),\n", " ('zayn_malik', 'one_direction')]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# put relationships into a df\n", "df_edges = pd.DataFrame(relationships,columns=[\"src\",\"dst\",\"step\"])\n", "# do some cleaning of things that are probably junk\n", "df_edges = df_edges[df_edges[\"dst\"].str.contains(\"_\") == True]\n", "df_edges = df_edges[df_edges[\"src\"].str.contains(\"_\") == True]\n", "\n", "# add a weight to each edge if we so wished we could calculate something more fancy to put here\n", "df_edges['weight'] = 1\n", "\n", "# make a final list from the clean df\n", "relationships_final = list(zip(df_edges['src'].tolist(),df_edges['dst'].tolist()))\n", "relationships_final[0:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now make a [.gml](https://gephi.org/users/supported-graph-formats/gml-format/) network file for R to read and do some network graphs on." ] }, { "cell_type": "code", "execution_count": 146, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T09:20:55.215024Z", "start_time": "2017-08-22T09:20:55.183001Z" }, "collapsed": true }, "outputs": [], "source": [ "# make a networkx graph and save edges file\n", "G = nx.from_pandas_dataframe(df_edges, 'src', 'dst', ['step','weight'])\n", "\n", "# save the graph as a gml file\n", "nx.write_gml(G, \"edges.gml\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot Graph Using R\n", "\n", "I've used R to do most of the network plots. I tend to find R easier for plots, might be just because i'm generlaly more familiar with it." ] }, { "cell_type": "code", "execution_count": 133, "metadata": { "ExecuteTime": { "end_time": "2017-08-21T17:19:11.747563Z", "start_time": "2017-08-21T17:19:11.640444Z" } }, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use R to run the make_network_graph.R script.\n", "# i dont think this works fully - tend to just run the rscript in rstudio manually\n", "import subprocess\n", "subprocess.call(\"cmd /C \"\"C:\\Program Files\\R\\R-3.3.1\\bin\\Rscript.exe\" \"C:\\\\Users\\\\Andrew\\\\Documents\\\\pmc-analytical-data-mart\\\\celeb_vectors\\\\make_network_graph.R\"\"\")" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Cluster Word Vectors\n", "\n", "Next we will do some clustering of the vectors in our Justin Bieber network. 
" ] }, { "cell_type": "code", "execution_count": 147, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T09:39:40.760427Z", "start_time": "2017-08-22T09:39:40.678346Z" }, "collapsed": true }, "outputs": [], "source": [ "# get a set of all the unique words in the network\n", "labels = list(set(list(chain.from_iterable(relationships_final))))\n", "\n", "# get the vectors relating to the words\n", "data_array = model.wv[labels]" ] }, { "cell_type": "code", "execution_count": 148, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T09:41:15.644201Z", "start_time": "2017-08-22T09:41:15.619178Z" }, "collapsed": true }, "outputs": [], "source": [ "# use the data to get distance matrix\n", "data_dist = pdist(data_array) # computing the distance\n", "data_link = linkage(data_dist) # computing the linkage" ] }, { "cell_type": "code", "execution_count": 149, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T09:41:55.328210Z", "start_time": "2017-08-22T09:41:51.746660Z" } }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 149, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create a dendrogram for input into a heatmap\n", "dendro = FF.create_dendrogram(data_array, orientation='right',\n", " linkagefun=lambda x: linkage(data_array, method = 'ward', metric = 'euclidean')\n", " )\n", "\n", "# create heatmap\n", "dendro_leaves = dendro['layout']['yaxis']['ticktext']\n", "dendro_leaves = list(map(int, dendro_leaves))\n", "heat_data = squareform(data_dist)\n", "heat_data = heat_data[dendro_leaves,:]\n", "heat_data = heat_data[:,dendro_leaves]\n", "\n", "heatmap = Data([\n", " go.Heatmap(\n", " x = labels,\n", " y = labels,\n", " z = heat_data,\n", " colorscale='Pairs',\n", " showscale = False\n", " )\n", "])\n", "\n", "layout = go.Layout(\n", " title = 'Heatmap of ' + seed_word + ' neighbour vectors',\n", " margin=go.Margin(\n", " l=120,\n", " r=120)\n", ")\n", "\n", "fig = go.Figure(data=heatmap, layout=layout)\n", "\n", "py.iplot(fig, filename = 'celeb-vecs-heatmap')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now lets create a dendrogram." ] }, { "cell_type": "code", "execution_count": 150, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T09:42:51.198738Z", "start_time": "2017-08-22T09:42:49.429088Z" }, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 150, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# build a dendrogram\n", "dendro = FF.create_dendrogram(data_array, \n", " orientation='left', \n", " labels=labels,\n", " linkagefun=lambda x: linkage(data_array, method = 'complete', metric = 'euclidean'))\n", "\n", "dendro['layout'].update({'width':800, \n", " 'height':800, \n", " 'title':'Dendrogram of ' + seed_word + ' neighbour vectors',\n", " 'margin':go.Margin(l=130)})\n", "\n", "py.iplot(dendro, filename='celeb-vecs-dendrogram')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lets do another dendrogram but this time just pull a random sample of words." 
] }, { "cell_type": "code", "execution_count": 198, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T14:27:18.126769Z", "start_time": "2017-08-22T14:27:16.224264Z" } }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 198, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample_n_words = [word for word in random.sample( set( word_vectors.vocab ) , 1000 ) if \"_\" in word]\n", "#sample_n_words = [word for word in random.sample( set( word_vectors.vocab ) , 100 ) ]\n", "\n", "# get the vectors relating\n", "sample_data_array = model.wv[sample_n_words]\n", "\n", "dendro = FF.create_dendrogram(sample_data_array, \n", " orientation='left', \n", " labels=sample_n_words,\n", " linkagefun=lambda x: linkage(sample_data_array, method = 'complete', metric = 'euclidean'))\n", "\n", "dendro['layout'].update({'width':800, \n", " 'height':1800, \n", " 'title':'Dendrogram of a random sample of word vectors',\n", " 'margin':go.Margin(l=150)})\n", "\n", "py.iplot(dendro, filename='sample-vecs-dendrogram')" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## t-SNE of Vectors\n", "\n", "Here we will take a sample of words from our trained model and create a [t-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) model to project the vectors into a 2-d space where we can look at them on a scatter plot whereby the distance between the points is repesentiative of their distance in the higher dimensional space of our Word2Vec model.\n", "\n", "Note: We first do a [PCA](https://en.wikipedia.org/wiki/Principal_component_analysis) on the vectors as t-SNE works best with dozens of features as opposed to hundreds. So the PCA gets us from 100 long word vectors to vectors of the top 20 principle components. \n", "\n", "Also - i've plotted the text labels which makes the graph look very messy. Best way to use it then is to zoom in and out on different sections." 
] }, { "cell_type": "code", "execution_count": 151, "metadata": { "ExecuteTime": { "end_time": "2017-08-22T09:47:51.685622Z", "start_time": "2017-08-22T09:47:13.475641Z" }, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Explained variation (PCA): 0.7729213864172766\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 151, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sample some words from the trained model\n", "sample_pct = 0.2\n", "sample_words = random.sample( set( word_vectors.vocab ) , round( len( word_vectors.vocab ) * sample_pct ) )\n", "#sample_words = list(set([i[0] for i in relationships_final])) # uncomment this if you want to use our network for the t-SNE\n", "sample_vectors = model.wv[ sample_words ]\n", "\n", "# do PCA\n", "pca_n = PCA( n_components = 50 )\n", "pca_result = pca_n.fit_transform( sample_vectors )\n", "# print how much of the variation the top components explain\n", "print( 'Explained variation (PCA): {}'.format( np.sum( pca_n.explained_variance_ratio_ ) ) )\n", "\n", "# do t-SNE\n", "X = pca_result\n", "tsne = TSNE( n_components = 2 )\n", "X_tsne = tsne.fit_transform( X )\n", "\n", "# plot the t-SNE\n", "trace = go.Scatter(\n", " x = X_tsne[:, 0],\n", " y = X_tsne[:, 1],\n", " mode = 'text',\n", " text = sample_words\n", ")\n", "data = [ trace ]\n", "py.iplot( data, filename='celeb-vecs-tsne' )" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" }, "toc": { "colors": { "hover_highlight": "#DAA520", "navigate_num": "#000000", "navigate_text": "#333333", "running_highlight": "#FF0000", "selected_highlight": "#FFD700", "sidebar_border": "#EEEEEE", "wrapper_background": "#FFFFFF" }, "moveMenuLeft": true, "nav_menu": { "height": "102px", "width": "252px" }, "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 4, "toc_cell": false, "toc_position": { "height": "762px", "left": "0px", "right": "1382px", "top": "32px", "width": "212px" }, "toc_section_display": "block", "toc_window_display": true, "widenNotebook": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "position": { "height": "445px", "left": "910px", "right": "20px", "top": "120px", "width": "307px" }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }