{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# CS579: Lecture 13 \n", "\n", "**Demographic Inference II**\n", "\n", "*[Dr. Aron Culotta](http://cs.iit.edu/~culotta)* \n", "*[Illinois Institute of Technology](http://iit.edu)*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Gender Classification\n", "\n", "Let's build a classifier to predict whether a Twitter user is male/female.\n", "\n", "We'll collect \"labeled\" training data using Census name list." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**1.) Collect Census names. **" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "found 4014 female and 1146 male names\n", "male name sample: ['stephen', 'mark', 'sammy', 'thanh', 'wallace']\n", "female name sample: ['marion', 'regena', 'kathryne', 'ashely', 'rosanna']\n" ] } ], "source": [ "# Fetch male/female names from Census.\n", "\n", "import requests\n", "\n", "def get_census_names():\n", " \"\"\" Fetch a list of common male/female names from the census.\n", " For ambiguous names, we select the more frequent gender.\"\"\"\n", " males = requests.get('http://www2.census.gov/topics/genealogy/1990surnames/dist.male.first').text.split('\\n')\n", " females = requests.get('http://www2.census.gov/topics/genealogy/1990surnames/dist.female.first').text.split('\\n')\n", " males_pct = dict([(m.split()[0].lower(), float(m.split()[1]))\n", " for m in males if m])\n", " females_pct = dict([(f.split()[0].lower(), float(f.split()[1]))\n", " for f in females if f])\n", " male_names = set([m for m in males_pct if m not in females_pct or\n", " males_pct[m] > females_pct[m]])\n", " female_names = set([f for f in females_pct if f not in males_pct or\n", " females_pct[f] > males_pct[f]]) \n", " return male_names, female_names\n", "\n", "male_names, female_names = get_census_names()\n", "print('found %d female and %d male names' % (len(female_names), len(male_names)))\n", "print('male name sample:', list(male_names)[:5])\n", "print('female name sample:', list(female_names)[:5])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**2.) Sample 5K tweets with names on the Census list. **" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Construct TwitterAPI object.\n", "\n", "import configparser\n", "from TwitterAPI import TwitterAPI\n", "\n", "def get_twitter(config_file):\n", " config = configparser.ConfigParser()\n", " config.read(config_file)\n", " twitter = TwitterAPI(\n", " config.get('twitter', 'consumer_key'),\n", " config.get('twitter', 'consumer_secret'),\n", " config.get('twitter', 'access_token'),\n", " config.get('twitter', 'access_token_secret'))\n", " return twitter\n", "\n", "twitter = get_twitter('twitter.cfg')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "found 100 tweets\n", "found 200 tweets\n", "found 300 tweets\n", "found 400 tweets\n", "found 500 tweets\n", "found 600 tweets\n", "found 700 tweets\n", "found 800 tweets\n", "found 900 tweets\n", "found 1000 tweets\n", "found 1100 tweets\n", "found 1200 tweets\n", "found 1300 tweets\n", "found 1400 tweets\n", "found 1500 tweets\n", "found 1600 tweets\n", "found 1700 tweets\n", "found 1800 tweets\n", "found 1900 tweets\n", "found 2000 tweets\n", "found 2100 tweets\n", "found 2200 tweets\n", "found 2300 tweets\n", "found 2400 tweets\n", "found 2500 tweets\n", "found 2600 tweets\n", "found 2700 tweets\n", "found 2800 tweets\n", "found 2900 tweets\n", "found 3000 tweets\n", "found 3100 tweets\n", "found 3200 tweets\n", "found 3300 tweets\n", "found 3400 tweets\n", "found 3500 tweets\n", "found 3600 tweets\n", "found 3700 tweets\n", "found 3800 tweets\n", "found 3900 tweets\n", "found 4000 tweets\n", "found 4100 tweets\n", "found 4200 tweets\n", "found 4300 tweets\n", "found 4400 tweets\n", "found 4500 tweets\n", "found 4600 tweets\n", "found 4700 tweets\n", "found 4800 tweets\n", "found 4900 tweets\n", "found 5000 tweets\n" ] } ], "source": [ "# Sample U.S. tweets with names from Census. \n", "import sys\n", "\n", "def get_first_name(tweet):\n", " if 'user' in tweet and 'name' in tweet['user']:\n", " parts = tweet['user']['name'].split()\n", " if len(parts) > 0:\n", " return parts[0].lower()\n", "\n", "def sample_tweets(twitter, limit, male_names, female_names):\n", " tweets = []\n", " while True:\n", " try:\n", " # Restrict to U.S.\n", " for response in twitter.request('statuses/filter',\n", " {'locations':'-124.637,24.548,-66.993,48.9974'}):\n", " if 'user' in response:\n", " name = get_first_name(response)\n", " if name in male_names or name in female_names:\n", " tweets.append(response)\n", " if len(tweets) % 100 == 0:\n", " print('found %d tweets' % len(tweets))\n", " if len(tweets) >= limit:\n", " return tweets\n", " except:\n", " print(\"Unexpected error:\", sys.exc_info()[0])\n", " return tweets\n", " \n", "tweets = sample_tweets(twitter, 5000, male_names, female_names)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# optionally read from disk\n", "# import pickle\n", "# tweets = pickle.load(open('tweets.pkl', 'rb'))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sampled 5000 tweets\n", "top names: [('michael', 63), ('mike', 60), ('david', 57), ('matt', 48), ('chris', 48), ('john', 46), ('joe', 40), ('ryan', 39), ('mark', 38), ('brian', 35)]\n" ] } ], "source": [ "from collections import Counter\n", "\n", "print('sampled %d tweets' % len(tweets))\n", "print('top names:', Counter(get_first_name(t) for t in tweets).most_common(10))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Save these tweets.\n", "import pickle\n", "pickle.dump(tweets, open('tweets.pkl', 'wb'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**3.) Tokenize tweets. **" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test tweet:\n", "\tscreen_name=mickeystrand\n", "\tname=Mickey Strand\n", "\tdescr=Portrait Photographer, Beyond the Cut portrait project, WW2 Portrait project, Instructor & Mentor. Retired Navy Combat Camera Chief.\n", "\ttext=Working on upcoming course description for a Moab fine art shooting intense workshop.\n" ] } ], "source": [ "test_tweet = tweets[1]\n", "print('test tweet:\\n\\tscreen_name=%s\\n\\tname=%s\\n\\tdescr=%s\\n\\ttext=%s' %\n", " (test_tweet['user']['screen_name'],\n", " test_tweet['user']['name'],\n", " test_tweet['user']['description'],\n", " test_tweet['text']))" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import re\n", "\n", "def tokenize(string, lowercase, keep_punctuation, prefix,\n", " collapse_urls, collapse_mentions):\n", " \"\"\" Split a tweet into tokens.\"\"\"\n", " if not string:\n", " return []\n", " if lowercase:\n", " string = string.lower()\n", " tokens = []\n", " if collapse_urls:\n", " string = re.sub('http\\S+', 'THIS_IS_A_URL', string)\n", " if collapse_mentions:\n", " string = re.sub('@\\S+', 'THIS_IS_A_MENTION', string)\n", " if keep_punctuation:\n", " tokens = string.split()\n", " else:\n", " tokens = re.sub('\\W+', ' ', string).split()\n", " if prefix:\n", " tokens = ['%s%s' % (prefix, t) for t in tokens]\n", " return tokens" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['d=portrait',\n", " 'd=photographer',\n", " 'd=beyond',\n", " 'd=the',\n", " 'd=cut',\n", " 'd=portrait',\n", " 'd=project',\n", " 'd=ww2',\n", " 'd=portrait',\n", " 'd=project',\n", " 'd=instructor',\n", " 'd=mentor',\n", " 'd=retired',\n", " 'd=navy',\n", " 'd=combat',\n", " 'd=camera',\n", " 'd=chief']" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenize(test_tweet['user']['description'], lowercase=True,\n", " keep_punctuation=False, prefix='d=',\n", " collapse_urls=True, collapse_mentions=True)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['d=apple', 'd=banana', 'd=went', 'd=to', 'd=the', 'd=store']" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenize('apple-banana went to the store!', lowercase=True,\n", " keep_punctuation=False, prefix='d=',\n", " collapse_urls=True, collapse_mentions=True)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['t=working',\n", " 't=on',\n", " 't=upcoming',\n", " 't=course',\n", " 't=description',\n", " 't=for',\n", " 't=a',\n", " 't=moab',\n", " 't=fine',\n", " 't=art',\n", " 't=shooting',\n", " 't=intense',\n", " 't=workshop.']" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenize(test_tweet['text'], lowercase=True, keep_punctuation=True,\n", " prefix='t=',\n", " collapse_urls=True, collapse_mentions=False)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def tweet2tokens(tweet, use_descr=True, lowercase=True,\n", " keep_punctuation=True, descr_prefix='d=',\n", " collapse_urls=True, collapse_mentions=True):\n", " \"\"\" Convert a tweet into a list of tokens, from the tweet text and optionally the\n", " user description. \"\"\"\n", " tokens = tokenize(tweet['text'], lowercase, keep_punctuation, None,\n", " collapse_urls, collapse_mentions)\n", " if use_descr:\n", " tokens.extend(tokenize(tweet['user']['description'], lowercase,\n", " keep_punctuation, descr_prefix,\n", " collapse_urls, collapse_mentions))\n", " return tokens" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['working',\n", " 'on',\n", " 'upcoming',\n", " 'course',\n", " 'description',\n", " 'for',\n", " 'a',\n", " 'moab',\n", " 'fine',\n", " 'art',\n", " 'shooting',\n", " 'intense',\n", " 'workshop.',\n", " 'd=portrait',\n", " 'd=photographer,',\n", " 'd=beyond',\n", " 'd=the',\n", " 'd=cut',\n", " 'd=portrait',\n", " 'd=project,',\n", " 'd=ww2',\n", " 'd=portrait',\n", " 'd=project,',\n", " 'd=instructor',\n", " 'd=&',\n", " 'd=mentor.',\n", " 'd=retired',\n", " 'd=navy',\n", " 'd=combat',\n", " 'd=camera',\n", " 'd=chief.']" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tweet2tokens(test_tweet)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "use_descr=True lower=True punct=True prefix=d= url=True mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop. d=portrait d=photographer, d=beyond d=the d=cut d=portrait d=project, d=ww2 d=portrait d=project, d=instructor d=& d=mentor. d=retired d=navy d=combat d=camera d=chief. \n", "----\n", "\n", "use_descr=True lower=True punct=True prefix=d= url=True mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop. d=portrait d=photographer, d=beyond d=the d=cut d=portrait d=project, d=ww2 d=portrait d=project, d=instructor d=& d=mentor. d=retired d=navy d=combat d=camera d=chief. \n", "----\n", "\n", "use_descr=True lower=True punct=True prefix=d= url=False mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop. d=portrait d=photographer, d=beyond d=the d=cut d=portrait d=project, d=ww2 d=portrait d=project, d=instructor d=& d=mentor. d=retired d=navy d=combat d=camera d=chief. \n", "----\n", "\n", "use_descr=True lower=True punct=True prefix=d= url=False mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop. d=portrait d=photographer, d=beyond d=the d=cut d=portrait d=project, d=ww2 d=portrait d=project, d=instructor d=& d=mentor. d=retired d=navy d=combat d=camera d=chief. \n", "----\n", "\n", "use_descr=True lower=True punct=True prefix= url=True mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop. portrait photographer, beyond the cut portrait project, ww2 portrait project, instructor & mentor. retired navy combat camera chief. \n", "----\n", "\n", "use_descr=True lower=True punct=True prefix= url=True mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop. portrait photographer, beyond the cut portrait project, ww2 portrait project, instructor & mentor. retired navy combat camera chief. \n", "----\n", "\n", "use_descr=True lower=True punct=True prefix= url=False mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop. portrait photographer, beyond the cut portrait project, ww2 portrait project, instructor & mentor. retired navy combat camera chief. \n", "----\n", "\n", "use_descr=True lower=True punct=True prefix= url=False mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop. portrait photographer, beyond the cut portrait project, ww2 portrait project, instructor & mentor. retired navy combat camera chief. \n", "----\n", "\n", "use_descr=True lower=True punct=False prefix=d= url=True mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop d=portrait d=photographer d=beyond d=the d=cut d=portrait d=project d=ww2 d=portrait d=project d=instructor d=mentor d=retired d=navy d=combat d=camera d=chief \n", "----\n", "\n", "use_descr=True lower=True punct=False prefix=d= url=True mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop d=portrait d=photographer d=beyond d=the d=cut d=portrait d=project d=ww2 d=portrait d=project d=instructor d=mentor d=retired d=navy d=combat d=camera d=chief \n", "----\n", "\n", "use_descr=True lower=True punct=False prefix=d= url=False mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop d=portrait d=photographer d=beyond d=the d=cut d=portrait d=project d=ww2 d=portrait d=project d=instructor d=mentor d=retired d=navy d=combat d=camera d=chief \n", "----\n", "\n", "use_descr=True lower=True punct=False prefix=d= url=False mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop d=portrait d=photographer d=beyond d=the d=cut d=portrait d=project d=ww2 d=portrait d=project d=instructor d=mentor d=retired d=navy d=combat d=camera d=chief \n", "----\n", "\n", "use_descr=True lower=True punct=False prefix= url=True mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop portrait photographer beyond the cut portrait project ww2 portrait project instructor mentor retired navy combat camera chief \n", "----\n", "\n", "use_descr=True lower=True punct=False prefix= url=True mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop portrait photographer beyond the cut portrait project ww2 portrait project instructor mentor retired navy combat camera chief \n", "----\n", "\n", "use_descr=True lower=True punct=False prefix= url=False mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop portrait photographer beyond the cut portrait project ww2 portrait project instructor mentor retired navy combat camera chief \n", "----\n", "\n", "use_descr=True lower=True punct=False prefix= url=False mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop portrait photographer beyond the cut portrait project ww2 portrait project instructor mentor retired navy combat camera chief \n", "----\n", "\n", "use_descr=True lower=False punct=True prefix=d= url=True mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. d=Portrait d=Photographer, d=Beyond d=the d=Cut d=portrait d=project, d=WW2 d=Portrait d=project, d=Instructor d=& d=Mentor. d=Retired d=Navy d=Combat d=Camera d=Chief. \n", "----\n", "\n", "use_descr=True lower=False punct=True prefix=d= url=True mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. d=Portrait d=Photographer, d=Beyond d=the d=Cut d=portrait d=project, d=WW2 d=Portrait d=project, d=Instructor d=& d=Mentor. d=Retired d=Navy d=Combat d=Camera d=Chief. \n", "----\n", "\n", "use_descr=True lower=False punct=True prefix=d= url=False mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. d=Portrait d=Photographer, d=Beyond d=the d=Cut d=portrait d=project, d=WW2 d=Portrait d=project, d=Instructor d=& d=Mentor. d=Retired d=Navy d=Combat d=Camera d=Chief. \n", "----\n", "\n", "use_descr=True lower=False punct=True prefix=d= url=False mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. d=Portrait d=Photographer, d=Beyond d=the d=Cut d=portrait d=project, d=WW2 d=Portrait d=project, d=Instructor d=& d=Mentor. d=Retired d=Navy d=Combat d=Camera d=Chief. \n", "----\n", "\n", "use_descr=True lower=False punct=True prefix= url=True mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. Portrait Photographer, Beyond the Cut portrait project, WW2 Portrait project, Instructor & Mentor. Retired Navy Combat Camera Chief. \n", "----\n", "\n", "use_descr=True lower=False punct=True prefix= url=True mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. Portrait Photographer, Beyond the Cut portrait project, WW2 Portrait project, Instructor & Mentor. Retired Navy Combat Camera Chief. \n", "----\n", "\n", "use_descr=True lower=False punct=True prefix= url=False mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. Portrait Photographer, Beyond the Cut portrait project, WW2 Portrait project, Instructor & Mentor. Retired Navy Combat Camera Chief. \n", "----\n", "\n", "use_descr=True lower=False punct=True prefix= url=False mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. Portrait Photographer, Beyond the Cut portrait project, WW2 Portrait project, Instructor & Mentor. Retired Navy Combat Camera Chief. \n", "----\n", "\n", "use_descr=True lower=False punct=False prefix=d= url=True mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop d=Portrait d=Photographer d=Beyond d=the d=Cut d=portrait d=project d=WW2 d=Portrait d=project d=Instructor d=Mentor d=Retired d=Navy d=Combat d=Camera d=Chief \n", "----\n", "\n", "use_descr=True lower=False punct=False prefix=d= url=True mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop d=Portrait d=Photographer d=Beyond d=the d=Cut d=portrait d=project d=WW2 d=Portrait d=project d=Instructor d=Mentor d=Retired d=Navy d=Combat d=Camera d=Chief \n", "----\n", "\n", "use_descr=True lower=False punct=False prefix=d= url=False mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop d=Portrait d=Photographer d=Beyond d=the d=Cut d=portrait d=project d=WW2 d=Portrait d=project d=Instructor d=Mentor d=Retired d=Navy d=Combat d=Camera d=Chief \n", "----\n", "\n", "use_descr=True lower=False punct=False prefix=d= url=False mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop d=Portrait d=Photographer d=Beyond d=the d=Cut d=portrait d=project d=WW2 d=Portrait d=project d=Instructor d=Mentor d=Retired d=Navy d=Combat d=Camera d=Chief \n", "----\n", "\n", "use_descr=True lower=False punct=False prefix= url=True mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop Portrait Photographer Beyond the Cut portrait project WW2 Portrait project Instructor Mentor Retired Navy Combat Camera Chief \n", "----\n", "\n", "use_descr=True lower=False punct=False prefix= url=True mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop Portrait Photographer Beyond the Cut portrait project WW2 Portrait project Instructor Mentor Retired Navy Combat Camera Chief \n", "----\n", "\n", "use_descr=True lower=False punct=False prefix= url=False mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop Portrait Photographer Beyond the Cut portrait project WW2 Portrait project Instructor Mentor Retired Navy Combat Camera Chief \n", "----\n", "\n", "use_descr=True lower=False punct=False prefix= url=False mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop Portrait Photographer Beyond the Cut portrait project WW2 Portrait project Instructor Mentor Retired Navy Combat Camera Chief \n", "----\n", "\n", "use_descr=False lower=True punct=True prefix=d= url=True mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=True punct=True prefix=d= url=True mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=True punct=True prefix=d= url=False mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=True punct=True prefix=d= url=False mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=True punct=True prefix= url=True mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=True punct=True prefix= url=True mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=True punct=True prefix= url=False mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=True punct=True prefix= url=False mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=True punct=False prefix=d= url=True mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=True punct=False prefix=d= url=True mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=True punct=False prefix=d= url=False mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=True punct=False prefix=d= url=False mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=True punct=False prefix= url=True mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=True punct=False prefix= url=True mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=True punct=False prefix= url=False mention=True\n", "working on upcoming course description for a moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=True punct=False prefix= url=False mention=False\n", "working on upcoming course description for a moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=False punct=True prefix=d= url=True mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=False punct=True prefix=d= url=True mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=False punct=True prefix=d= url=False mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=False punct=True prefix=d= url=False mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=False punct=True prefix= url=True mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=False punct=True prefix= url=True mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=False punct=True prefix= url=False mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=False punct=True prefix= url=False mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop. \n", "----\n", "\n", "use_descr=False lower=False punct=False prefix=d= url=True mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=False punct=False prefix=d= url=True mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=False punct=False prefix=d= url=False mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=False punct=False prefix=d= url=False mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=False punct=False prefix= url=True mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=False punct=False prefix= url=True mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=False punct=False prefix= url=False mention=True\n", "Working on upcoming course description for a Moab fine art shooting intense workshop \n", "----\n", "\n", "use_descr=False lower=False punct=False prefix= url=False mention=False\n", "Working on upcoming course description for a Moab fine art shooting intense workshop \n", "----\n", "\n" ] } ], "source": [ "# for enumerating all possible arguments of tweet2tokens\n", "# https://docs.python.org/2/library/itertools.html#itertools.product\n", "from itertools import product\n", "\n", "use_descr_opts = [True, False]\n", "lowercase_opts = [True, False]\n", "keep_punctuation_opts = [True, False]\n", "descr_prefix_opts = ['d=', '']\n", "url_opts = [True, False]\n", "mention_opts = [True, False]\n", "\n", "argnames = ['use_descr', 'lower', 'punct', 'prefix', 'url', 'mention']\n", "option_iter = product(use_descr_opts, lowercase_opts,\n", " keep_punctuation_opts,\n", " descr_prefix_opts, url_opts,\n", " mention_opts)\n", "for options in option_iter:\n", " print(' '.join('%s=%s' % (name, opt) \n", " for name, opt in zip(argnames, options)))\n", " print\n", " print(' '.join(tweet2tokens(test_tweet, *options)), '\\n----\\n')" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Let's tokenize all tweets.\n", "tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,\n", " keep_punctuation=False, descr_prefix='d=',\n", " collapse_urls=True, collapse_mentions=True)\n", " for t in tweets]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['working',\n", " 'on',\n", " 'upcoming',\n", " 'course',\n", " 'description',\n", " 'for',\n", " 'a',\n", " 'moab',\n", " 'fine',\n", " 'art',\n", " 'shooting',\n", " 'intense',\n", " 'workshop',\n", " 'd=portrait',\n", " 'd=photographer',\n", " 'd=beyond',\n", " 'd=the',\n", " 'd=cut',\n", " 'd=portrait',\n", " 'd=project',\n", " 'd=ww2',\n", " 'd=portrait',\n", " 'd=project',\n", " 'd=instructor',\n", " 'd=mentor',\n", " 'd=retired',\n", " 'd=navy',\n", " 'd=combat',\n", " 'd=camera',\n", " 'd=chief']" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokens_list[1]" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[10]" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections import defaultdict\n", "d = defaultdict(lambda: [])\n", "d['cat'].append(10)\n", "d['cat']\n", "#v = {}\n", "#v['cat'].append(10)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Store these in a sparse matrix.\n", "\n", "#1) Create a vocabulary (dict from term->index)\n", "\n", "# https://docs.python.org/2/library/collections.html#collections.defaultdict\n", "from collections import defaultdict\n", "\n", "def make_vocabulary(tokens_list):\n", " vocabulary = defaultdict(lambda: len(vocabulary)) # If term not present, assign next int.\n", " for tokens in tokens_list:\n", " for token in tokens:\n", " vocabulary[token] # looking up a key; defaultdict takes care of assigning it a value.\n", " print('%d unique terms in vocabulary' % len(vocabulary))\n", " return vocabulary" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20234 unique terms in vocabulary\n" ] } ], "source": [ "vocabulary = make_vocabulary(tokens_list)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('what', 0),\n", " ('THIS_IS_A_URL', 1),\n", " ('d=god', 2),\n", " ('d=the', 3),\n", " ('d=jonas', 4),\n", " ('d=brothers', 5),\n", " ('d=hold', 6),\n", " ('d=a', 7),\n", " ('d=special', 8),\n", " ('d=place', 9)]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# term->index\n", "list(vocabulary.items())[:10]" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "29731 unique terms in vocabulary\n" ] } ], "source": [ "# How big is vocabulary if we keep punctuation?\n", "tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,\n", " keep_punctuation=True, descr_prefix='d=',\n", " collapse_urls=True, collapse_mentions=True)\n", " for t in tweets]\n", "\n", "vocabulary = make_vocabulary(tokens_list)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "32591 unique terms in vocabulary\n" ] } ], "source": [ "# How big is vocabulary if we keep punctuation and urls?\n", "tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,\n", " keep_punctuation=True, descr_prefix='d=',\n", " collapse_urls=False, collapse_mentions=True)\n", " for t in tweets]\n", "\n", "vocabulary = make_vocabulary(tokens_list)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "36909 unique terms in vocabulary\n" ] } ], "source": [ "# How big is vocabulary if we keep punctuation and urls and mentions?\n", "tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,\n", " keep_punctuation=True, descr_prefix='d=',\n", " collapse_urls=False, collapse_mentions=False)\n", " for t in tweets]\n", "\n", "vocabulary = make_vocabulary(tokens_list)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Vector Matrix\n", "\n", "Create a matrix $X$ where $X[i,j]$ is the frequency of term $j$ in tweet $i$.\n", "\n", "$$\n", "X = \\begin{pmatrix}\n", "~ & \\hbox{term}_1 & \\hbox{term}_2 & \\hbox{term}_3 & \\hbox{term}_4 \\\\\n", "\\hbox{tweet}_1 & 1 & 0 & 0 & 0 \\\\\n", "\\hbox{tweet}_2 & 0 & 0 & 0 & 2 \\\\\n", "\\hbox{tweet}_3 & 1 & 1 & 0 & 0 \\\\\n", "\\end{pmatrix}\n", "$$\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sparse Matrices\n", "\n", "$$\n", "X = \\begin{pmatrix}\n", "~ & \\hbox{term}_1 & \\hbox{term}_2 & \\hbox{term}_3 & \\hbox{term}_4 \\\\\n", "\\hbox{tweet}_1 & 1 & 0 & 0 & 0 \\\\\n", "\\hbox{tweet}_2 & 0 & 0 & 0 & 2 \\\\\n", "\\hbox{tweet}_3 & 1 & 1 & 0 & 0 \\\\\n", "\\end{pmatrix}\n", "$$\n", "\n", "$X$ is mostly $0$ for text problems." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## List of List (LIL) Matrix\n", "\n", "Store a linked list of (index, value) pairs for each row.\n", "\n", "$$\n", "X = \\begin{pmatrix}\n", "\\hbox{tweet}_1 & (0, 1)\\\\\n", "\\hbox{tweet}_2 & (3,2)\\\\\n", "\\hbox{tweet}_3 & (0,1), (1,1)\\\\\n", "\\end{pmatrix}\n", "$$\n", "\n", "**Advantage:** Fast to construct: append to list in constant time.\n", "\n", "**Disadvantage:** Slow random access for matrix-vector product.\n", "\n", "E.g., $\\hat{z} = X\\cdot \\hat{\\beta}$ to classify tweets using a learned weight vector $\\beta$\n", "\n", "$\\hat{z}[i] = \\sum_j X[i,j] * \\beta[j]$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compressed Sparse Row (CSR) Matrix\n", "\n", "\n", "$$\n", "X = \\begin{pmatrix}\n", "~ & \\hbox{term}_1 & \\hbox{term}_2 & \\hbox{term}_3 & \\hbox{term}_4 \\\\\n", "\\hbox{tweet}_1 & 1 & 0 & 0 & 0 \\\\\n", "\\hbox{tweet}_2 & 0 & 0 & 0 & 2 \\\\\n", "\\hbox{tweet}_3 & 1 & 1 & 0 & 0 \\\\\n", "\\hbox{tweet}_4 & 1 & 0 & 0 & 0 \\\\\n", "\\end{pmatrix}\n", "$$\n", "\n", "CSR Matrix is an object with three attributes: \n", "- **val:** $\\{1,2,1,1,1\\}$           *list of all non-zero values* \n", "- **col_ind:** $\\{0,3,0,1,0\\}$   *column index for each non-zero value* (e.g., first non-zero value (1) is in column 0) \n", "- **row_ptr:** $\\{0,1,2,4\\}$     *index into **col_ind** where each row starts* (e.g., tweet3, term1 corresponds to col_ind[2])\n", "\n", "Allows efficient row access (good for us, since each row is a tweet)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Convert features to a sparse matrix X.\n", "# X[i,j] is the frequency of term j in tweet i\n", "# \n", "from scipy.sparse import lil_matrix\n", "\n", "def make_feature_matrix(tokens_list, vocabulary):\n", " X = lil_matrix((len(tweets), len(vocabulary)))\n", " for i, tokens in enumerate(tokens_list):\n", " for token in tokens:\n", " j = vocabulary[token]\n", " X[i,j] += 1\n", " return X.tocsr() # convert to CSR for more efficient random access." ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape of X: (5000, 36909)\n" ] } ], "source": [ "X = make_feature_matrix(tokens_list, vocabulary)\n", "print('shape of X:', X.shape)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "collapsed": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on csr_matrix in module scipy.sparse.csr object:\n", "\n", "class csr_matrix(scipy.sparse.compressed._cs_matrix, scipy.sparse.sputils.IndexMixin)\n", " | Compressed Sparse Row matrix\n", " | \n", " | This can be instantiated in several ways:\n", " | csr_matrix(D)\n", " | with a dense matrix or rank-2 ndarray D\n", " | \n", " | csr_matrix(S)\n", " | with another sparse matrix S (equivalent to S.tocsr())\n", " | \n", " | csr_matrix((M, N), [dtype])\n", " | to construct an empty matrix with shape (M, N)\n", " | dtype is optional, defaulting to dtype='d'.\n", " | \n", " | csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])\n", " | where ``data``, ``row_ind`` and ``col_ind`` satisfy the\n", " | relationship ``a[row_ind[k], col_ind[k]] = data[k]``.\n", " | \n", " | csr_matrix((data, indices, indptr), [shape=(M, N)])\n", " | is the standard CSR representation where the column indices for\n", " | row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their\n", " | corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``.\n", " | If the shape parameter is not supplied, the matrix dimensions\n", " | are inferred from the index arrays.\n", " | \n", " | Attributes\n", " | ----------\n", " | dtype : dtype\n", " | Data type of the matrix\n", " | shape : 2-tuple\n", " | Shape of the matrix\n", " | ndim : int\n", " | Number of dimensions (this is always 2)\n", " | nnz\n", " | Number of nonzero elements\n", " | data\n", " | CSR format data array of the matrix\n", " | indices\n", " | CSR format index array of the matrix\n", " | indptr\n", " | CSR format index pointer array of the matrix\n", " | has_sorted_indices\n", " | Whether indices are sorted\n", " | \n", " | Notes\n", " | -----\n", " | \n", " | Sparse matrices can be used in arithmetic operations: they support\n", " | addition, subtraction, multiplication, division, and matrix power.\n", " | \n", " | Advantages of the CSR format\n", " | - efficient arithmetic operations CSR + CSR, CSR * CSR, etc.\n", " | - efficient row slicing\n", " | - fast matrix vector products\n", " | \n", " | Disadvantages of the CSR format\n", " | - slow column slicing operations (consider CSC)\n", " | - changes to the sparsity structure are expensive (consider LIL or DOK)\n", " | \n", " | Examples\n", " | --------\n", " | \n", " | >>> import numpy as np\n", " | >>> from scipy.sparse import csr_matrix\n", " | >>> csr_matrix((3, 4), dtype=np.int8).toarray()\n", " | array([[0, 0, 0, 0],\n", " | [0, 0, 0, 0],\n", " | [0, 0, 0, 0]], dtype=int8)\n", " | \n", " | >>> row = np.array([0, 0, 1, 2, 2, 2])\n", " | >>> col = np.array([0, 2, 2, 0, 1, 2])\n", " | >>> data = np.array([1, 2, 3, 4, 5, 6])\n", " | >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()\n", " | array([[1, 0, 2],\n", " | [0, 0, 3],\n", " | [4, 5, 6]])\n", " | \n", " | >>> indptr = np.array([0, 2, 3, 6])\n", " | >>> indices = np.array([0, 2, 2, 0, 1, 2])\n", " | >>> data = np.array([1, 2, 3, 4, 5, 6])\n", " | >>> csr_matrix((data, indices, indptr), shape=(3, 3)).toarray()\n", " | array([[1, 0, 2],\n", " | [0, 0, 3],\n", " | [4, 5, 6]])\n", " | \n", " | As an example of how to construct a CSR matrix incrementally,\n", " | the following snippet builds a term-document matrix from texts:\n", " | \n", " | >>> docs = [[\"hello\", \"world\", \"hello\"], [\"goodbye\", \"cruel\", \"world\"]]\n", " | >>> indptr = [0]\n", " | >>> indices = []\n", " | >>> data = []\n", " | >>> vocabulary = {}\n", " | >>> for d in docs:\n", " | ... for term in d:\n", " | ... index = vocabulary.setdefault(term, len(vocabulary))\n", " | ... indices.append(index)\n", " | ... data.append(1)\n", " | ... indptr.append(len(indices))\n", " | ...\n", " | >>> csr_matrix((data, indices, indptr), dtype=int).toarray()\n", " | array([[2, 1, 0, 0],\n", " | [0, 1, 1, 1]])\n", " | \n", " | Method resolution order:\n", " | csr_matrix\n", " | scipy.sparse.compressed._cs_matrix\n", " | scipy.sparse.data._data_matrix\n", " | scipy.sparse.base.spmatrix\n", " | scipy.sparse.data._minmax_mixin\n", " | scipy.sparse.sputils.IndexMixin\n", " | builtins.object\n", " | \n", " | Methods defined here:\n", " | \n", " | __getitem__(self, key)\n", " | \n", " | getcol(self, i)\n", " | Returns a copy of column i of the matrix, as a (m x 1)\n", " | CSR matrix (column vector).\n", " | \n", " | getrow(self, i)\n", " | Returns a copy of row i of the matrix, as a (1 x n)\n", " | CSR matrix (row vector).\n", " | \n", " | tobsr(self, blocksize=None, copy=True)\n", " | Convert this matrix to Block Sparse Row format.\n", " | \n", " | With copy=False, the data/indices may be shared between this matrix and\n", " | the resultant bsr_matrix.\n", " | \n", " | When blocksize=(R, C) is provided, it will be used for construction of\n", " | the bsr_matrix.\n", " | \n", " | tocsc(self, copy=False)\n", " | Convert this matrix to Compressed Sparse Column format.\n", " | \n", " | With copy=False, the data/indices may be shared between this matrix and\n", " | the resultant csc_matrix.\n", " | \n", " | tocsr(self, copy=False)\n", " | Convert this matrix to Compressed Sparse Row format.\n", " | \n", " | With copy=False, the data/indices may be shared between this matrix and\n", " | the resultant csr_matrix.\n", " | \n", " | tolil(self, copy=False)\n", " | Convert this matrix to LInked List format.\n", " | \n", " | With copy=False, the data/indices may be shared between this matrix and\n", " | the resultant lil_matrix.\n", " | \n", " | transpose(self, axes=None, copy=False)\n", " | Reverses the dimensions of the sparse matrix.\n", " | \n", " | Parameters\n", " | ----------\n", " | axes : None, optional\n", " | This argument is in the signature *solely* for NumPy\n", " | compatibility reasons. Do not pass in anything except\n", " | for the default value.\n", " | copy : bool, optional\n", " | Indicates whether or not attributes of `self` should be\n", " | copied whenever possible. The degree to which attributes\n", " | are copied varies depending on the type of sparse matrix\n", " | being used.\n", " | \n", " | Returns\n", " | -------\n", " | p : `self` with the dimensions reversed.\n", " | \n", " | See Also\n", " | --------\n", " | np.matrix.transpose : NumPy's implementation of 'transpose'\n", " | for matrices\n", " | \n", " | ----------------------------------------------------------------------\n", " | Data and other attributes defined here:\n", " | \n", " | format = 'csr'\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from scipy.sparse.compressed._cs_matrix:\n", " | \n", " | __add__(self, other)\n", " | \n", " | __eq__(self, other)\n", " | Return self==value.\n", " | \n", " | __ge__(self, other)\n", " | Return self>=value.\n", " | \n", " | __gt__(self, other)\n", " | Return self>value.\n", " | \n", " | __init__(self, arg1, shape=None, dtype=None, copy=False)\n", " | Initialize self. See help(type(self)) for accurate signature.\n", " | \n", " | __le__(self, other)\n", " | Return self<=value.\n", " | \n", " | __lt__(self, other)\n", " | Return self>> import numpy as np\n", " | >>> from scipy.sparse import csr_matrix\n", " | >>> A = csr_matrix([[1, 2, 0], [0, 0, 3], [4, 0, 5]])\n", " | >>> v = np.array([1, 0, -1])\n", " | >>> A.dot(v)\n", " | array([ 1, -3, -1], dtype=int64)\n", " | \n", " | getH(self)\n", " | # Renamed conjtranspose() -> getH() for compatibility with dense matrices\n", " | \n", " | get_shape(self)\n", " | \n", " | getformat(self)\n", " | \n", " | getmaxprint(self)\n", " | \n", " | mean(self, axis=None, dtype=None, out=None)\n", " | Compute the arithmetic mean along the specified axis.\n", " | \n", " | Returns the average of the matrix elements. The average is taken\n", " | over all elements in the matrix by default, otherwise over the\n", " | specified axis. `float64` intermediate and return values are used\n", " | for integer inputs.\n", " | \n", " | Parameters\n", " | ----------\n", " | axis : {-2, -1, 0, 1, None} optional\n", " | Axis along which the mean is computed. The default is to compute\n", " | the mean of all elements in the matrix (i.e. `axis` = `None`).\n", " | dtype : data-type, optional\n", " | Type to use in computing the mean. For integer inputs, the default\n", " | is `float64`; for floating point inputs, it is the same as the\n", " | input dtype.\n", " | \n", " | .. versionadded: 0.18.0\n", " | \n", " | out : np.matrix, optional\n", " | Alternative output matrix in which to place the result. It must\n", " | have the same shape as the expected output, but the type of the\n", " | output values will be cast if necessary.\n", " | \n", " | .. versionadded: 0.18.0\n", " | \n", " | Returns\n", " | -------\n", " | m : np.matrix\n", " | \n", " | See Also\n", " | --------\n", " | np.matrix.mean : NumPy's implementation of 'mean' for matrices\n", " | \n", " | nonzero(self)\n", " | nonzero indices\n", " | \n", " | Returns a tuple of arrays (row,col) containing the indices\n", " | of the non-zero elements of the matrix.\n", " | \n", " | Examples\n", " | --------\n", " | >>> from scipy.sparse import csr_matrix\n", " | >>> A = csr_matrix([[1,2,0],[0,0,3],[4,0,5]])\n", " | >>> A.nonzero()\n", " | (array([0, 0, 1, 2, 2]), array([0, 1, 2, 0, 2]))\n", " | \n", " | reshape(self, shape, order='C')\n", " | Gives a new shape to a sparse matrix without changing its data.\n", " | \n", " | Parameters\n", " | ----------\n", " | shape : length-2 tuple of ints\n", " | The new shape should be compatible with the original shape.\n", " | order : 'C', optional\n", " | This argument is in the signature *solely* for NumPy\n", " | compatibility reasons. Do not pass in anything except\n", " | for the default value, as this argument is not used.\n", " | \n", " | Returns\n", " | -------\n", " | reshaped_matrix : `self` with the new dimensions of `shape`\n", " | \n", " | See Also\n", " | --------\n", " | np.matrix.reshape : NumPy's implementation of 'reshape' for matrices\n", " | \n", " | set_shape(self, shape)\n", " | \n", " | setdiag(self, values, k=0)\n", " | Set diagonal or off-diagonal elements of the array.\n", " | \n", " | Parameters\n", " | ----------\n", " | values : array_like\n", " | New values of the diagonal elements.\n", " | \n", " | Values may have any length. If the diagonal is longer than values,\n", " | then the remaining diagonal entries will not be set. If values if\n", " | longer than the diagonal, then the remaining values are ignored.\n", " | \n", " | If a scalar value is given, all of the diagonal is set to it.\n", " | \n", " | k : int, optional\n", " | Which off-diagonal to set, corresponding to elements a[i,i+k].\n", " | Default: 0 (the main diagonal).\n", " | \n", " | todense(self, order=None, out=None)\n", " | Return a dense matrix representation of this matrix.\n", " | \n", " | Parameters\n", " | ----------\n", " | order : {'C', 'F'}, optional\n", " | Whether to store multi-dimensional data in C (row-major)\n", " | or Fortran (column-major) order in memory. The default\n", " | is 'None', indicating the NumPy default of C-ordered.\n", " | Cannot be specified in conjunction with the `out`\n", " | argument.\n", " | \n", " | out : ndarray, 2-dimensional, optional\n", " | If specified, uses this array (or `numpy.matrix`) as the\n", " | output buffer instead of allocating a new array to\n", " | return. The provided array must have the same shape and\n", " | dtype as the sparse matrix on which you are calling the\n", " | method.\n", " | \n", " | Returns\n", " | -------\n", " | arr : numpy.matrix, 2-dimensional\n", " | A NumPy matrix object with the same shape and containing\n", " | the same data represented by the sparse matrix, with the\n", " | requested memory order. If `out` was passed and was an\n", " | array (rather than a `numpy.matrix`), it will be filled\n", " | with the appropriate values and returned wrapped in a\n", " | `numpy.matrix` object that shares the same memory.\n", " | \n", " | todia(self, copy=False)\n", " | Convert this matrix to sparse DIAgonal format.\n", " | \n", " | With copy=False, the data/indices may be shared between this matrix and\n", " | the resultant dia_matrix.\n", " | \n", " | todok(self, copy=False)\n", " | Convert this matrix to Dictionary Of Keys format.\n", " | \n", " | With copy=False, the data/indices may be shared between this matrix and\n", " | the resultant dok_matrix.\n", " | \n", " | ----------------------------------------------------------------------\n", " | Data descriptors inherited from scipy.sparse.base.spmatrix:\n", " | \n", " | __dict__\n", " | dictionary for instance variables (if defined)\n", " | \n", " | __weakref__\n", " | list of weak references to the object (if defined)\n", " | \n", " | nnz\n", " | Number of stored values, including explicit zeros.\n", " | \n", " | See also\n", " | --------\n", " | count_nonzero : Number of non-zero entries\n", " | \n", " | shape\n", " | \n", " | ----------------------------------------------------------------------\n", " | Data and other attributes inherited from scipy.sparse.base.spmatrix:\n", " | \n", " | __array_priority__ = 10.1\n", " | \n", " | ndim = 2\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from scipy.sparse.data._minmax_mixin:\n", " | \n", " | max(self, axis=None, out=None)\n", " | Return the maximum of the matrix or maximum along an axis.\n", " | This takes all elements into account, not just the non-zero ones.\n", " | \n", " | Parameters\n", " | ----------\n", " | axis : {-2, -1, 0, 1, None} optional\n", " | Axis along which the sum is computed. The default is to\n", " | compute the maximum over all the matrix elements, returning\n", " | a scalar (i.e. `axis` = `None`).\n", " | \n", " | out : None, optional\n", " | This argument is in the signature *solely* for NumPy\n", " | compatibility reasons. Do not pass in anything except\n", " | for the default value, as this argument is not used.\n", " | \n", " | Returns\n", " | -------\n", " | amax : coo_matrix or scalar\n", " | Maximum of `a`. If `axis` is None, the result is a scalar value.\n", " | If `axis` is given, the result is a sparse.coo_matrix of dimension\n", " | ``a.ndim - 1``.\n", " | \n", " | See Also\n", " | --------\n", " | min : The minimum value of a sparse matrix along a given axis.\n", " | np.matrix.max : NumPy's implementation of 'max' for matrices\n", " | \n", " | min(self, axis=None, out=None)\n", " | Return the minimum of the matrix or maximum along an axis.\n", " | This takes all elements into account, not just the non-zero ones.\n", " | \n", " | Parameters\n", " | ----------\n", " | axis : {-2, -1, 0, 1, None} optional\n", " | Axis along which the sum is computed. The default is to\n", " | compute the minimum over all the matrix elements, returning\n", " | a scalar (i.e. `axis` = `None`).\n", " | \n", " | out : None, optional\n", " | This argument is in the signature *solely* for NumPy\n", " | compatibility reasons. Do not pass in anything except for\n", " | the default value, as this argument is not used.\n", " | \n", " | Returns\n", " | -------\n", " | amin : coo_matrix or scalar\n", " | Minimum of `a`. If `axis` is None, the result is a scalar value.\n", " | If `axis` is given, the result is a sparse.coo_matrix of dimension\n", " | ``a.ndim - 1``.\n", " | \n", " | See Also\n", " | --------\n", " | max : The maximum value of a sparse matrix along a given axis.\n", " | np.matrix.min : NumPy's implementation of 'min' for matrices\n", "\n" ] } ], "source": [ "help(X)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "<1x36909 sparse matrix of type ''\n", "\twith 28 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# How is tweet stored?\n", "X[1]" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on method nonzero in module scipy.sparse.base:\n", "\n", "nonzero() method of scipy.sparse.csr.csr_matrix instance\n", " nonzero indices\n", " \n", " Returns a tuple of arrays (row,col) containing the indices\n", " of the non-zero elements of the matrix.\n", " \n", " Examples\n", " --------\n", " >>> from scipy.sparse import csr_matrix\n", " >>> A = csr_matrix([[1,2,0],[0,0,3],[4,0,5]])\n", " >>> A.nonzero()\n", " (array([0, 0, 1, 2, 2]), array([0, 1, 2, 0, 2]))\n", "\n" ] } ], "source": [ "help(X[1].nonzero)" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0], dtype=int32),\n", " array([ 4, 15, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,\n", " 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55], dtype=int32))" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X[1].nonzero()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 4, 15, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,\n", " 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55], dtype=int32)" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# non-zero indices of terms used in tweet 1.\n", "X[1].nonzero()[1] # col_ind" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", " 1., 1., 3., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1.,\n", " 1., 1.])" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# term counts for tweet 1.\n", "X[1].data # \"val\"" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "d=&\n", "1.0\n" ] } ], "source": [ "# What word does each term index correspond to?\n", "# Convert term->index dict into index->term dict\n", "index2term = {i: t for t, i in vocabulary.items()}\n", "print(index2term[15])\n", "print(X[1, 15])\n", "# So, the term \"for\" (index 29) appears in user 200's tweet two times" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "d=cut\n", "1.0\n" ] } ], "source": [ "# d=and appears one time.\n", "print(index2term[46])\n", "print(X[1, 46])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How do CSR matrices access row values?\n", "\n", "Recall:\n", "\n", "CSR Matrix is an object with three attributes: \n", "- **val:** $\\{1,2,1,1\\}$           *list of all non-zero values* \n", "- **col_ind:** $\\{0,3,0,1\\}$   *column index for each non-zero value* (e.g., first non-zero value (1) is in column 0) \n", "- **row_ptr (ind_ptr):** $\\{0,1,2\\}$     *index into **col_ind** where each row starts* (e.g., tweet3, term1 corresponds to col_ind[2])" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([200, 300, 400])" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Recall: numpy array slices.\n", "import numpy as np\n", "a = np.array([0, 100, 200, 300, 400, 500])\n", "a[2:5] # get elements at positions 2,3,4" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tweet 1 starts at col_ind= 30\n", "tweet 2 starts at col_ind= 58\n", "so, the columns that are non-zero for tweet 1 are:\n", "[ 4 15 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52\n", " 53 54 55]\n", "and the data associated with those cells are:\n", "[ 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 3. 1. 1.\n", " 1. 2. 1. 1. 1. 1. 1. 1. 1. 1.]\n" ] } ], "source": [ "print('tweet 1 starts at col_ind=', X.indptr[1])\n", "print('tweet 2 starts at col_ind=', X.indptr[2])\n", "print('so, the columns that are non-zero for tweet 1 are:')\n", "print(X.indices[X.indptr[1]:X.indptr[2]])\n", "print('and the data associated with those cells are:')\n", "print(X.data[X.indptr[1]:X.indptr[2]])" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tweet 0:\n", " (0, 0)\t1.0\n", " (0, 1)\t1.0\n", " (0, 2)\t1.0\n", " (0, 3)\t1.0\n", " (0, 4)\t1.0\n", " (0, 5)\t2.0\n", " (0, 6)\t1.0\n", " (0, 7)\t1.0\n", " (0, 8)\t1.0\n", " (0, 9)\t1.0\n", " (0, 10)\t1.0\n", " (0, 11)\t1.0\n", " (0, 12)\t1.0\n", " (0, 13)\t1.0\n", " (0, 14)\t1.0\n", " (0, 15)\t1.0\n", " (0, 16)\t1.0\n", " (0, 17)\t1.0\n", " (0, 18)\t1.0\n", " (0, 19)\t1.0\n", " (0, 20)\t1.0\n", " (0, 21)\t1.0\n", " (0, 22)\t1.0\n", " (0, 23)\t1.0\n", " (0, 24)\t1.0\n", " (0, 25)\t1.0\n", " (0, 26)\t1.0\n", " (0, 27)\t1.0\n", " (0, 28)\t1.0\n", " (0, 29)\t1.0 \n", "\n", "tweet 1:\n", " (0, 4)\t1.0\n", " (0, 15)\t1.0\n", " (0, 30)\t1.0\n", " (0, 31)\t1.0\n", " (0, 32)\t1.0\n", " (0, 33)\t1.0\n", " (0, 34)\t1.0\n", " (0, 35)\t1.0\n", " (0, 36)\t1.0\n", " (0, 37)\t1.0\n", " (0, 38)\t1.0\n", " (0, 39)\t1.0\n", " (0, 40)\t1.0\n", " (0, 41)\t1.0\n", " (0, 42)\t1.0\n", " (0, 43)\t3.0\n", " (0, 44)\t1.0\n", " (0, 45)\t1.0\n", " (0, 46)\t1.0\n", " (0, 47)\t2.0\n", " (0, 48)\t1.0\n", " (0, 49)\t1.0\n", " (0, 50)\t1.0\n", " (0, 51)\t1.0\n", " (0, 52)\t1.0\n", " (0, 53)\t1.0\n", " (0, 54)\t1.0\n", " (0, 55)\t1.0 \n", "\n", "tweet 2:\n", " (0, 56)\t1.0\n", " (0, 57)\t1.0\n", " (0, 58)\t1.0\n", " (0, 59)\t1.0\n", " (0, 60)\t1.0\n", " (0, 61)\t2.0\n", " (0, 62)\t2.0\n", " (0, 63)\t1.0\n", " (0, 64)\t1.0\n", " (0, 65)\t1.0\n", " (0, 66)\t1.0\n", " (0, 67)\t1.0\n", " (0, 68)\t1.0\n", " (0, 69)\t1.0\n", " (0, 70)\t1.0\n", " (0, 71)\t1.0\n" ] } ], "source": [ "print('tweet 0:\\n', X[0], '\\n')\n", "print('tweet 1:\\n', X[1], '\\n')\n", "print('tweet 2:\\n', X[2])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Efficient matrix vector product:**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$$ z = X * \\beta $$" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "X * beta for tweet 1= 31.0\n", "which is the same as the sum 31.0, since beta=[1...1]\n" ] } ], "source": [ "# Compute z = X * \\beta, where X is a CSR matrix.\n", "import numpy as np\n", "beta = np.ones(len(vocabulary)) # assume Beta = vector of 1s\n", "z = np.zeros(len(tweets))\n", "for i in range(len(tweets)): # for each row.\n", " for j in range(X.indptr[i], X.indptr[i+1]): # for each col.\n", " colidx = X.indices[j]\n", " z[i] += beta[colidx] * X.data[j]\n", "print('X * beta for tweet 1=', z[1])\n", "print('which is the same as the sum %.1f, since beta=[1...1]' %\n", " X[1].sum())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**4.) Create a list of gender labels.**" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "gender labels: Counter({0: 2861, 1: 2139})\n" ] } ], "source": [ "# y is a 1d numpy array of gender labels.\n", "# Let 1=Female, 0=Male.\n", "import numpy as np\n", "\n", "def get_gender(tweet, male_names, female_names):\n", " name = get_first_name(tweet)\n", " if name in female_names:\n", " return 1\n", " elif name in male_names:\n", " return 0\n", " else:\n", " return -1\n", " \n", "y = np.array([get_gender(t, male_names, female_names) for t in tweets])\n", "print('gender labels:', Counter(y))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**5.) Fit a Logistic Regression classifier to predict gender from profile/tweet.**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#Sports Guy" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Do 5-fold cross-validation\n", "# http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html\n", "from sklearn.model_selection import KFold\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score, confusion_matrix\n", "\n", "def do_cross_val(X, y, nfolds):\n", " \"\"\" Compute average cross-validation acccuracy.\"\"\"\n", " cv = KFold(n_splits=nfolds, random_state=42, shuffle=True)\n", " accuracies = []\n", " for train_idx, test_idx in cv.split(X):\n", " clf = LogisticRegression()\n", " clf.fit(X[train_idx], y[train_idx])\n", " predicted = clf.predict(X[test_idx])\n", " acc = accuracy_score(y[test_idx], predicted)\n", " accuracies.append(acc)\n", " avg = np.mean(accuracies)\n", " print(np.std(accuracies))\n", " print(accuracies)\n", " return avg" ] }, { "cell_type": "code", "execution_count": 92, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.0155897402159\n", "[0.69299999999999995, 0.69099999999999995, 0.70199999999999996, 0.72299999999999998, 0.72899999999999998]\n", "avg accuracy 0.7076\n" ] } ], "source": [ "print('avg accuracy', do_cross_val(X, y, 5))" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CSR TIME\n" ] }, { "data": { "text/plain": [ "0.5853505079867318" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fitting model with CSR much, much faster than with LIL.\n", "from timeit import timeit\n", "print('CSR TIME')\n", "timeit(\"do_cross_val(X.tocsr(), y, 2)\", number=5,\n", " setup=\"from __main__ import do_cross_val, X, y\")" ] }, { "cell_type": "code", "execution_count": 122, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LIL TIME\n" ] }, { "data": { "text/plain": [ "214.12728118896484" ] }, "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print('LIL TIME')\n", "timeit(\"do_cross_val(X.tolil(), y, 2)\", number=5,\n", " setup=\"from __main__ import do_cross_val, X, y\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(See more about vectorization of arithmetic operations: \n", "https://en.wikipedia.org/wiki/Automatic_vectorization )" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# How does tokenization affect accuracy?\n", "# Collapse urls and mentions; ignore description prefix.\n", "def run_all(tweets, use_descr=True, lowercase=True,\n", " keep_punctuation=True, descr_prefix=None,\n", " collapse_urls=True, collapse_mentions=True):\n", " \n", " tokens_list = [tweet2tokens(t, use_descr, lowercase,\n", " keep_punctuation, descr_prefix,\n", " collapse_urls, collapse_mentions)\n", " for t in tweets]\n", " vocabulary = make_vocabulary(tokens_list)\n", " X = make_feature_matrix(tokens_list, vocabulary)\n", " acc = do_cross_val(X, y, 5)\n", " print('acc=', acc)\n", " return acc" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "use_descr=True\tlower=True\tpunct=True\tprefix=d=\turl=True\tmention=True\n", "29796 unique terms in vocabulary\n", "acc= 0.7114\n", "use_descr=True\tlower=True\tpunct=True\tprefix=d=\turl=True\tmention=False\n", "33113 unique terms in vocabulary\n", "acc= 0.7148\n", "use_descr=True\tlower=True\tpunct=True\tprefix=d=\turl=False\tmention=True\n", "32465 unique terms in vocabulary\n", "acc= 0.7114\n", "use_descr=True\tlower=True\tpunct=True\tprefix=d=\turl=False\tmention=False\n", "35782 unique terms in vocabulary\n", "acc= 0.7156\n", "use_descr=True\tlower=True\tpunct=True\tprefix=\turl=True\tmention=True\n", "26212 unique terms in vocabulary\n", "acc= 0.7072\n", "use_descr=True\tlower=True\tpunct=True\tprefix=\turl=True\tmention=False\n", "29479 unique terms in vocabulary\n", "acc= 0.7116\n", "use_descr=True\tlower=True\tpunct=True\tprefix=\turl=False\tmention=True\n", "28882 unique terms in vocabulary\n", "acc= 0.7076\n", "use_descr=True\tlower=True\tpunct=True\tprefix=\turl=False\tmention=False\n", "32149 unique terms in vocabulary\n", "acc= 0.711\n", "use_descr=True\tlower=True\tpunct=False\tprefix=d=\turl=True\tmention=True\n", "20542 unique terms in vocabulary\n", "acc= 0.711\n", "use_descr=True\tlower=True\tpunct=False\tprefix=d=\turl=True\tmention=False\n", "23768 unique terms in vocabulary\n", "acc= 0.7142\n", "use_descr=True\tlower=True\tpunct=False\tprefix=d=\turl=False\tmention=True\n", "23264 unique terms in vocabulary\n", "acc= 0.709\n", "use_descr=True\tlower=True\tpunct=False\tprefix=d=\turl=False\tmention=False\n", "26488 unique terms in vocabulary\n", "acc= 0.7094\n", "use_descr=True\tlower=True\tpunct=False\tprefix=\turl=True\tmention=True\n", "16861 unique terms in vocabulary\n", "acc= 0.705\n", "use_descr=True\tlower=True\tpunct=False\tprefix=\turl=True\tmention=False\n", "20006 unique terms in vocabulary\n", "acc= 0.7074\n", "use_descr=True\tlower=True\tpunct=False\tprefix=\turl=False\tmention=True\n", "19575 unique terms in vocabulary\n", "acc= 0.7022\n", "use_descr=True\tlower=True\tpunct=False\tprefix=\turl=False\tmention=False\n", "22716 unique terms in vocabulary\n", "acc= 0.7076\n", "use_descr=True\tlower=False\tpunct=True\tprefix=d=\turl=True\tmention=True\n", "33755 unique terms in vocabulary\n", "acc= 0.706\n", "use_descr=True\tlower=False\tpunct=True\tprefix=d=\turl=True\tmention=False\n", "37078 unique terms in vocabulary\n", "acc= 0.7086\n", "use_descr=True\tlower=False\tpunct=True\tprefix=d=\turl=False\tmention=True\n", "36424 unique terms in vocabulary\n", "acc= 0.7042\n", "use_descr=True\tlower=False\tpunct=True\tprefix=d=\turl=False\tmention=False\n", "39747 unique terms in vocabulary\n", "acc= 0.7074\n", "use_descr=True\tlower=False\tpunct=True\tprefix=\turl=True\tmention=True\n", "30045 unique terms in vocabulary\n", "acc= 0.7042\n", "use_descr=True\tlower=False\tpunct=True\tprefix=\turl=True\tmention=False\n", "33335 unique terms in vocabulary\n", "acc= 0.7064\n", "use_descr=True\tlower=False\tpunct=True\tprefix=\turl=False\tmention=True\n", "32715 unique terms in vocabulary\n", "acc= 0.7056\n", "use_descr=True\tlower=False\tpunct=True\tprefix=\turl=False\tmention=False\n", "36005 unique terms in vocabulary\n", "acc= 0.7094\n", "use_descr=True\tlower=False\tpunct=False\tprefix=d=\turl=True\tmention=True\n", "24880 unique terms in vocabulary\n", "acc= 0.7178\n", "use_descr=True\tlower=False\tpunct=False\tprefix=d=\turl=True\tmention=False\n", "28163 unique terms in vocabulary\n", "acc= 0.7198\n", "use_descr=True\tlower=False\tpunct=False\tprefix=d=\turl=False\tmention=True\n", "27638 unique terms in vocabulary\n", "acc= 0.7166\n", "use_descr=True\tlower=False\tpunct=False\tprefix=d=\turl=False\tmention=False\n", "30918 unique terms in vocabulary\n", "acc= 0.72\n", "use_descr=True\tlower=False\tpunct=False\tprefix=\turl=True\tmention=True\n", "20916 unique terms in vocabulary\n", "acc= 0.711\n", "use_descr=True\tlower=False\tpunct=False\tprefix=\turl=True\tmention=False\n", "24134 unique terms in vocabulary\n", "acc= 0.7144\n", "use_descr=True\tlower=False\tpunct=False\tprefix=\turl=False\tmention=True\n", "23663 unique terms in vocabulary\n", "acc= 0.7108\n", "use_descr=True\tlower=False\tpunct=False\tprefix=\turl=False\tmention=False\n", "26877 unique terms in vocabulary\n", "acc= 0.7166\n", "use_descr=False\tlower=True\tpunct=True\tprefix=d=\turl=True\tmention=True\n", "14313 unique terms in vocabulary\n", "acc= 0.5978\n", "use_descr=False\tlower=True\tpunct=True\tprefix=d=\turl=True\tmention=False\n", "16688 unique terms in vocabulary\n", "acc= 0.6106\n", "use_descr=False\tlower=True\tpunct=True\tprefix=d=\turl=False\tmention=True\n", "16765 unique terms in vocabulary\n", "acc= 0.6044\n", "use_descr=False\tlower=True\tpunct=True\tprefix=d=\turl=False\tmention=False\n", "19140 unique terms in vocabulary\n", "acc= 0.6132\n", "use_descr=False\tlower=True\tpunct=True\tprefix=\turl=True\tmention=True\n", "14313 unique terms in vocabulary\n", "acc= 0.5978\n", "use_descr=False\tlower=True\tpunct=True\tprefix=\turl=True\tmention=False\n", "16688 unique terms in vocabulary\n", "acc= 0.6106\n", "use_descr=False\tlower=True\tpunct=True\tprefix=\turl=False\tmention=True\n", "16765 unique terms in vocabulary\n", "acc= 0.6044\n", "use_descr=False\tlower=True\tpunct=True\tprefix=\turl=False\tmention=False\n", "19140 unique terms in vocabulary\n", "acc= 0.6132\n", "use_descr=False\tlower=True\tpunct=False\tprefix=d=\turl=True\tmention=True\n", "10007 unique terms in vocabulary\n", "acc= 0.5962\n", "use_descr=False\tlower=True\tpunct=False\tprefix=d=\turl=True\tmention=False\n", "12322 unique terms in vocabulary\n", "acc= 0.61\n", "use_descr=False\tlower=True\tpunct=False\tprefix=d=\turl=False\tmention=True\n", "12464 unique terms in vocabulary\n", "acc= 0.6076\n", "use_descr=False\tlower=True\tpunct=False\tprefix=d=\turl=False\tmention=False\n", "14779 unique terms in vocabulary\n", "acc= 0.6116\n", "use_descr=False\tlower=True\tpunct=False\tprefix=\turl=True\tmention=True\n", "10007 unique terms in vocabulary\n", "acc= 0.5962\n", "use_descr=False\tlower=True\tpunct=False\tprefix=\turl=True\tmention=False\n", "12322 unique terms in vocabulary\n", "acc= 0.61\n", "use_descr=False\tlower=True\tpunct=False\tprefix=\turl=False\tmention=True\n", "12464 unique terms in vocabulary\n", "acc= 0.6076\n", "use_descr=False\tlower=True\tpunct=False\tprefix=\turl=False\tmention=False\n", "14779 unique terms in vocabulary\n", "acc= 0.6116\n", "use_descr=False\tlower=False\tpunct=True\tprefix=d=\turl=True\tmention=True\n", "16153 unique terms in vocabulary\n", "acc= 0.5906\n", "use_descr=False\tlower=False\tpunct=True\tprefix=d=\turl=True\tmention=False\n", "18532 unique terms in vocabulary\n", "acc= 0.6102\n", "use_descr=False\tlower=False\tpunct=True\tprefix=d=\turl=False\tmention=True\n", "18605 unique terms in vocabulary\n", "acc= 0.6028\n", "use_descr=False\tlower=False\tpunct=True\tprefix=d=\turl=False\tmention=False\n", "20984 unique terms in vocabulary\n", "acc= 0.6086\n", "use_descr=False\tlower=False\tpunct=True\tprefix=\turl=True\tmention=True\n", "16153 unique terms in vocabulary\n", "acc= 0.5906\n", "use_descr=False\tlower=False\tpunct=True\tprefix=\turl=True\tmention=False\n", "18532 unique terms in vocabulary\n", "acc= 0.6102\n", "use_descr=False\tlower=False\tpunct=True\tprefix=\turl=False\tmention=True\n", "18605 unique terms in vocabulary\n", "acc= 0.6028\n", "use_descr=False\tlower=False\tpunct=True\tprefix=\turl=False\tmention=False\n", "20984 unique terms in vocabulary\n", "acc= 0.6086\n", "use_descr=False\tlower=False\tpunct=False\tprefix=d=\turl=True\tmention=True\n", "12036 unique terms in vocabulary\n", "acc= 0.5844\n", "use_descr=False\tlower=False\tpunct=False\tprefix=d=\turl=True\tmention=False\n", "14368 unique terms in vocabulary\n", "acc= 0.5956\n", "use_descr=False\tlower=False\tpunct=False\tprefix=d=\turl=False\tmention=True\n", "14493 unique terms in vocabulary\n", "acc= 0.5926\n", "use_descr=False\tlower=False\tpunct=False\tprefix=d=\turl=False\tmention=False\n", "16825 unique terms in vocabulary\n", "acc= 0.5962\n", "use_descr=False\tlower=False\tpunct=False\tprefix=\turl=True\tmention=True\n", "12036 unique terms in vocabulary\n", "acc= 0.5844\n", "use_descr=False\tlower=False\tpunct=False\tprefix=\turl=True\tmention=False\n", "14368 unique terms in vocabulary\n", "acc= 0.5956\n", "use_descr=False\tlower=False\tpunct=False\tprefix=\turl=False\tmention=True\n", "14493 unique terms in vocabulary\n", "acc= 0.5926\n", "use_descr=False\tlower=False\tpunct=False\tprefix=\turl=False\tmention=False\n", "16825 unique terms in vocabulary\n", "acc= 0.5962\n" ] } ], "source": [ "argnames = ['use_descr', 'lower', 'punct', 'prefix', 'url', 'mention']\n", "option_iter = product(use_descr_opts, lowercase_opts,\n", " keep_punctuation_opts,\n", " descr_prefix_opts, url_opts,\n", " mention_opts)\n", "results = []\n", "for options in option_iter:\n", " option_str = '\\t'.join('%s=%s' % (name, opt) for name, opt\n", " in zip(argnames, options))\n", " print(option_str)\n", " acc = run_all(tweets, *options)\n", " results.append((acc, options))\n", " print" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7200 use_descr=True lower=False punct=False prefix=d= url=False mention=False\n", "0.7198 use_descr=True lower=False punct=False prefix=d= url=True mention=False\n", "0.7178 use_descr=True lower=False punct=False prefix=d= url=True mention=True\n", "0.7166 use_descr=True lower=False punct=False prefix=d= url=False mention=True\n", "0.7166 use_descr=True lower=False punct=False prefix= url=False mention=False\n", "0.7156 use_descr=True lower=True punct=True prefix=d= url=False mention=False\n", "0.7148 use_descr=True lower=True punct=True prefix=d= url=True mention=False\n", "0.7144 use_descr=True lower=False punct=False prefix= url=True mention=False\n", "0.7142 use_descr=True lower=True punct=False prefix=d= url=True mention=False\n", "0.7116 use_descr=True lower=True punct=True prefix= url=True mention=False\n", "0.7114 use_descr=True lower=True punct=True prefix=d= url=True mention=True\n", "0.7114 use_descr=True lower=True punct=True prefix=d= url=False mention=True\n", "0.7110 use_descr=True lower=True punct=False prefix=d= url=True mention=True\n", "0.7110 use_descr=True lower=True punct=True prefix= url=False mention=False\n", "0.7110 use_descr=True lower=False punct=False prefix= url=True mention=True\n", "0.7108 use_descr=True lower=False punct=False prefix= url=False mention=True\n", "0.7094 use_descr=True lower=True punct=False prefix=d= url=False mention=False\n", "0.7094 use_descr=True lower=False punct=True prefix= url=False mention=False\n", "0.7090 use_descr=True lower=True punct=False prefix=d= url=False mention=True\n", "0.7086 use_descr=True lower=False punct=True prefix=d= url=True mention=False\n", "0.7076 use_descr=True lower=True punct=True prefix= url=False mention=True\n", "0.7076 use_descr=True lower=True punct=False prefix= url=False mention=False\n", "0.7074 use_descr=True lower=True punct=False prefix= url=True mention=False\n", "0.7074 use_descr=True lower=False punct=True prefix=d= url=False mention=False\n", "0.7072 use_descr=True lower=True punct=True prefix= url=True mention=True\n", "0.7064 use_descr=True lower=False punct=True prefix= url=True mention=False\n", "0.7060 use_descr=True lower=False punct=True prefix=d= url=True mention=True\n", "0.7056 use_descr=True lower=False punct=True prefix= url=False mention=True\n", "0.7050 use_descr=True lower=True punct=False prefix= url=True mention=True\n", "0.7042 use_descr=True lower=False punct=True prefix=d= url=False mention=True\n", "0.7042 use_descr=True lower=False punct=True prefix= url=True mention=True\n", "0.7022 use_descr=True lower=True punct=False prefix= url=False mention=True\n", "0.6132 use_descr=False lower=True punct=True prefix=d= url=False mention=False\n", "0.6132 use_descr=False lower=True punct=True prefix= url=False mention=False\n", "0.6116 use_descr=False lower=True punct=False prefix=d= url=False mention=False\n", "0.6116 use_descr=False lower=True punct=False prefix= url=False mention=False\n", "0.6106 use_descr=False lower=True punct=True prefix=d= url=True mention=False\n", "0.6106 use_descr=False lower=True punct=True prefix= url=True mention=False\n", "0.6102 use_descr=False lower=False punct=True prefix=d= url=True mention=False\n", "0.6102 use_descr=False lower=False punct=True prefix= url=True mention=False\n", "0.6100 use_descr=False lower=True punct=False prefix=d= url=True mention=False\n", "0.6100 use_descr=False lower=True punct=False prefix= url=True mention=False\n", "0.6086 use_descr=False lower=False punct=True prefix=d= url=False mention=False\n", "0.6086 use_descr=False lower=False punct=True prefix= url=False mention=False\n", "0.6076 use_descr=False lower=True punct=False prefix=d= url=False mention=True\n", "0.6076 use_descr=False lower=True punct=False prefix= url=False mention=True\n", "0.6044 use_descr=False lower=True punct=True prefix=d= url=False mention=True\n", "0.6044 use_descr=False lower=True punct=True prefix= url=False mention=True\n", "0.6028 use_descr=False lower=False punct=True prefix=d= url=False mention=True\n", "0.6028 use_descr=False lower=False punct=True prefix= url=False mention=True\n", "0.5978 use_descr=False lower=True punct=True prefix=d= url=True mention=True\n", "0.5978 use_descr=False lower=True punct=True prefix= url=True mention=True\n", "0.5962 use_descr=False lower=True punct=False prefix=d= url=True mention=True\n", "0.5962 use_descr=False lower=True punct=False prefix= url=True mention=True\n", "0.5962 use_descr=False lower=False punct=False prefix=d= url=False mention=False\n", "0.5962 use_descr=False lower=False punct=False prefix= url=False mention=False\n", "0.5956 use_descr=False lower=False punct=False prefix=d= url=True mention=False\n", "0.5956 use_descr=False lower=False punct=False prefix= url=True mention=False\n", "0.5926 use_descr=False lower=False punct=False prefix=d= url=False mention=True\n", "0.5926 use_descr=False lower=False punct=False prefix= url=False mention=True\n", "0.5906 use_descr=False lower=False punct=True prefix=d= url=True mention=True\n", "0.5906 use_descr=False lower=False punct=True prefix= url=True mention=True\n", "0.5844 use_descr=False lower=False punct=False prefix=d= url=True mention=True\n", "0.5844 use_descr=False lower=False punct=False prefix= url=True mention=True\n" ] } ], "source": [ "for r in sorted(results, reverse=True):\n", " print('%.4f' % r[0], ' '.join('%s=%s' % (name, opt) for name, opt in zip(argnames, r[1])))" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": false }, "outputs": [], "source": [ "idx2word = dict((v,k) for k,v in vocabulary.items())" ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "top weighted terms for female class:\n", "('d=mom', 1.8794005883225817)\n", "('d=mother', 1.7879217848719102)\n", "('d=mom,', 1.6898641235543843)\n", "('d=✨', 1.4921097973960653)\n", "('d=wife,', 1.3136066681226575)\n", "('d=girl', 1.2785485084279851)\n", "('makes', 1.1343331260117135)\n", "('🙄', 1.127231631083933)\n", "('d=she', 1.1163831736733825)\n", "('d=httr.', 1.1047908737851388)\n", "('💕', 1.0950857840221102)\n", "('d=has', 1.0660393539063231)\n", "('loving', 1.0659908199830699)\n", "('d=alumna.', 1.0263939807173448)\n", "('d=mother,', 1.0222786681547364)\n", "('d=mama', 0.99262669785202895)\n", "('d=mom.', 0.95003253953833566)\n", "('d=❤️', 0.94194513761458232)\n", "('d=woman', 0.89938204611590611)\n", "('d=cat', 0.89301696818502463)\n", "\n", "top weighted terms for male class:\n", "('d=father', -1.5209778787652677)\n", "('d=husband,', -1.2633912999304031)\n", "('d=father,', -1.224224446924508)\n", "('d=dad', -1.1694784834867893)\n", "('d=fan.', -1.1438101776635889)\n", "('d=former', -1.1106439116218867)\n", "('d=when', -1.088660992801703)\n", "('d=musician', -0.97744036684698099)\n", "('d=twitter', -0.97378594433553467)\n", "('god', -0.97276940734276773)\n", "('d=dad,', -0.93343031634893492)\n", "('coming', -0.89861822992156226)\n", "('it!', -0.89310050869682611)\n", "('d=tech', -0.88925768127771254)\n", "('d=contributor', -0.87922009935841272)\n", "('d=#trurebels', -0.8779892573977276)\n", "('dude', -0.87183702647870243)\n", "('days', -0.86162692417795617)\n", "('d=southern', -0.86154110918134996)\n", "('d=guy', -0.85732007984228786)\n" ] } ], "source": [ "# Fit model on all data and print top coef.\n", "model = LogisticRegression()\n", "model.fit(X,y)\n", "# Get the learned coefficients for the Positive class.\n", "coef = model.coef_[0]\n", "# Sort them in descending order.\n", "top_coef_ind = np.argsort(coef)[::-1][:20]\n", "# Get the names of those features.\n", "top_coef_terms = [idx2word[i] for i in top_coef_ind]\n", "# Get the weights of those features\n", "top_coef = coef[top_coef_ind]\n", "# Print the top 10.\n", "print('top weighted terms for female class:')\n", "print('\\n'.join(str(x) for x in zip(top_coef_terms, top_coef)))\n", "\n", "# repeat for males\n", "top_coef_ind = np.argsort(coef)[:20]\n", "top_coef_terms = [idx2word[i] for i in top_coef_ind]\n", "top_coef = coef[top_coef_ind]\n", "print('\\ntop weighted terms for male class:')\n", "print('\\n'.join(str(x) for x in zip(top_coef_terms, top_coef)))\n" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD8CAYAAAB+UHOxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHYNJREFUeJzt3XtwXGeZ5/HvY7W6dZctS7YVX2LnQm6bGxEmAZYKMzC5\nLEtgJuyEqV0CQ9ZbDNTu7C7UJktVhqVqt2B2ZxZmoMh6IJOEpUggkI1nMZMJlyVkIBclOI5jk1hx\nnESyZUmWdVe3Wt3P/nGOTFuR1Iq61Wqd/n2quvrcdN5HR+r36fO+7znH3B0REak8a1Y6ABERWRlK\nACIiFUoJQESkQikBiIhUKCUAEZEKpQQgIlKhlABERCqUEoCISIVSAhARqVCxQndgZluB+4CNgAO7\n3f0rs7Yx4CvAjcAE8DF3fzbfvltbW3379u2FhigiUjGeeeaZAXdvW8y2BScAYBr4j+7+rJk1As+Y\n2aPufjBnmxuA88PX24Gvh+8L2r59O52dnUUIUUSkMpjZq4vdtuAmIHc/PvNt3t1HgUPA5lmb3QTc\n54EngLVm1l5o2SIisnRF7QMws+3AlcCTs1ZtBl7Pme/mjUliZh+7zKzTzDr7+/uLGZ6IiOQoWgIw\nswbg+8CfuvvIUvfj7rvdvcPdO9raFtWMJSIiS1CUBGBm1QSV/7fd/QdzbNIDbM2Z3xIuExGRFVJw\nAghH+HwTOOTufznPZnuAj1rgamDY3Y8XWraIiCxdMUYBvRP4V8DzZrYvXPafgW0A7n4XsJdgCGgX\nwTDQjxehXBERKUDBCcDdHwcszzYOfKrQskREpHh0JbCISBl59OAJ7vr5yyUpSwlARKSM/PQ3fXzz\n8VdKUpYSgIhIhVICEBEpK16ykpQARETKzIKjaopICUBEpEIpAYiIVCglABGRMuKl6wJQAhARKTdW\nok4AJQARkQqlBCAiUkbUBCQiUsGsRANBlQBERCqUEoCISIVSAhARKSOuW0GIiFQuDQMVEZFlpQQg\nIlJGNAxURKSC6W6gIiKyrIqSAMzsbjPrM7MD86y/1syGzWxf+LqzGOWKiERNCVuAiBVpP/cAXwXu\nW2CbX7j7+4tUnohIZFmJhgEV5QzA3R8DBouxLxERKY1S9gFcY2bPmdmPzOyS+TYys11m1mlmnf39\n/SUMT0SkspQqATwLnO3ulwN/Dfyf+TZ0993u3uHuHW1tbSUKT0SkPERuGKi7j7j7WDi9F6g2s9ZS\nlC0iInMrSQIws00W9mqY2c6w3JOlKFtEROZWlFFAZvYd4Fqg1cy6gT8DqgHc/S7gZuCTZjYNTAK3\nuJfyREdEZHUo5c3gipIA3P0jedZ/lWCYqIiI5KGbwYmIyLJSAhARKSdRGwUkIiLlRwlARKTMqA9A\nRESWlRKAiEgZKeX4eCUAEZEyYyV6JIwSgIhIhVICEBEpI6W8SYISgIhImdEoIBERWVZKACIiFUoJ\nQESkjGgYqIhIBStRF4ASgIhIpVICEBEpI5F7JrCIiCxO1p01a3QlsIhIxcm6s6ZEFwIoAYiIlJFM\n1qlSAhARqTyZLKurCcjM7jazPjM7MM96M7O/MrMuM9tvZm8tRrkiIlHj7pSo/i/aGcA9wPULrL8B\nOD987QK+XqRyRUQiJeNO1Wo6A3D3x4DBBTa5CbjPA08Aa82svRhli4hESSYbvU7gzcDrOfPd4bI3\nMLNdZtZpZp39/f0lCU5EpFy4s7rOAIrJ3Xe7e4e7d7S1ta10OCIiJRWcAZSmrFIlgB5ga878lnCZ\niIjkyETwOoA9wEfD0UBXA8PufrxEZYuIrBpewk7gWDF2YmbfAa4FWs2sG/gzoBrA3e8C9gI3Al3A\nBPDxYpQrIhI1mayTiK2iBODuH8mz3oFPFaMsEZEoy/gquxBMRESKI5t1qiLWCSwiIouQXW0XgomI\nSHFkso5FbBSQiIgsQtZ1N1ARkYqUreQrgUVEKlk265ToBEAJQESknKSmsyRiVSUpSwlARKSMTKYz\n1MWVAEREKs7kVIZaJQARkcri7kymM9RUKwGIiFSUZDoLQK0SgIhIZZlMZwCorS5N1awEICJSJoYn\n0wA011WXpDwlABGRMjE4PgXAurp4ScpTAhARKRNDE0oAIiIVaeYMoKVeCUBEpKKcmjkDUAIQEaks\ng+NpqquMel0IJiJSWU6OpVhXF9fzAEREKs2RgXG2t9aXrLyiJAAzu97MXjSzLjO7fY71HzOzfjPb\nF75uK0a5IiJR4e4cPjHK+RsaSlZmrNAdmFkV8DXgfUA38LSZ7XH3g7M2fcDdP11oeSIiUdQ/mmIk\nOV3SBFCMM4CdQJe7H3H3KeB+4KYi7FdEpGJ09Y0BcN6GxpKVWYwEsBl4PWe+O1w22x+Y2X4ze9DM\nts63MzPbZWadZtbZ399fhPBERMrfywPjAJy3ys4AFuPvgO3ufhnwKHDvfBu6+25373D3jra2thKF\nJyKysl7qHaUhEWNjU6JkZRYjAfQAud/ot4TLTnP3k+6eCme/AVxVhHJFRCIhm3Ue7xrgsi3NJRsC\nCsVJAE8D55vZDjOLA7cAe3I3MLP2nNkPAIeKUK6ISCT8/HA/rwyMc/NVW0pabsGjgNx92sw+DTwC\nVAF3u/sLZvYFoNPd9wD/1sw+AEwDg8DHCi1XRCQq7n78FdoaE7z/srNKWm7BCQDA3fcCe2ctuzNn\n+g7gjmKUJSISJb96+SS/ODzAHTdcSDxW2mtzdSWwiMgKGZ5Mc+fDB2hvruHWd2wveflFOQMQEZE3\nZ2JqmtvufZrDfWP87cfeVrIHwefSGYCISIkl0xk+cU8nz7x6iq/+0ZW858INKxKHEoCISAmNJNP8\n6/s6eeKVk/zlv7ii5B2/udQEJCJSIv/YNcBnvvccfaMpvvQHl/HBK+e6aULpKAGIiCwjd2d/9zDf\nfvJVvvdMN9vX1/PArqvp2N6y0qEpAYiILJdfv3aKL/zfg/z6tSFqqtfwL99+NnfceCF18fKoessj\nChGRiMhmnaeODvK1n3Xxi8MDbGhM8Pl/fjEfvHIza+tK86zfxVICEBEpUGo6w69fG+KXL5/k+890\n0zM0SXNtNZ+97gJufcd2GhLlWdWWZ1QiImXM3Tl6coJHD/by5JFBnnxlkLHUNGsMdu5o4bPXXcDv\nXLSBpprqlQ51QUoAIiILmM5k+U3vKAePjXDwePD6zfERRpLTAOxorecDV5zFtW9p4+071tNcV96V\nfi4lABGR0ORUhpf7xzh0fIQXjo1w6PgIz/cMMzGVAaC2uooL2xt5/+VncVF7E++5oI0t6+pWOOql\nUwIQkYqRyTr9oymOD09yYiTJ0ZMTHOkf4+jABK8NTnBiNIl7sO1MZX/zVVu46ux1XLq5mbPX11O1\npnT3619uSgAiEgmp6Qx9Iyl6R5IcH07SOzwZvifpHQne+0ZTZLJ+xs+1NiTY0VrHO85bz9kt9Zy3\noYELNjWyozValf1clABEpKy5OxNTGXpHkpwYDiv3kSTHhyfpHU7ROzJJ73CSgbGpN/xsXbyK9uYa\n2ptreed5rWxqqmFTcw3tzcH75rW1ZTc0s5SUAESk5JLpDOOpaSamMgxPphmeTHNyfIqB0RR9oyn6\nRpL0j6V49eQEJ0aSpKazb9jH2rpqNjUFlfmlm5vZ1FR7umJvb65hY3MNjYlYSR+xuNooAYhIwaYz\nWYYm0/QOBxX3ybEpBsZSHB+a5MRIioGxFEOTaUYm04wk0yTTb6zQZ1RXGRsaa2htiHPZlmbamzey\nviHBxqYEm5pq2dRcw6amGmrjpb99ctQoAYjIG+Q2u7w2OMHxoSSnJqY4OTbFqYkpBseD19DkFEMT\naUbDIZGzNdbE2NhUQ1tDgvM3NNBcW01jTYy1dXEaEjFq41U011aztraadfVx2hoSNNdWsybibe/l\nQglApMJMZ7L0j6XoHU5yYuS3beonwve+kaAZZiz1xkq9Pl7Fuvo46+rirG+Ic25bPWvrgvm1ddVs\nbErQFn57b6mP01jmF0JVOiUAkQgZT02fHvEyM/rlxMiZI2EGxlLMGghDvGoNG5oSbGqq4aKzmnh3\nQ4JNzTVsaEywtaWOzWtraamPr8hTq2T5FCUBmNn1wFeAKuAb7v7FWesTwH3AVcBJ4A/d/Wgxyhap\nRP2jKV4bnOCVgXGe7x5if88wR/rHGZ5Mv2HbppoY7c21bGyu4cJNjWxqCjpIZ0bEbGqqoaU+rs7S\nClRwAjCzKuBrwPuAbuBpM9vj7gdzNvsEcMrdzzOzW4AvAX9YaNkiUTc1nT19ZWrwGuU3vSNnDHms\nj1dxyeZm3n9ZO1vW1QUjYHIqd3WWynyKcQawE+hy9yMAZnY/cBOQmwBuAj4fTj8IfNXMzN1nnYiK\nVLbpTJZXByf4x64BfvR8L8+8doqpcAhkPLaGCzY2cu0FG7hwUyPntjWwtaWWHa0Nkb9gSZZHMRLA\nZuD1nPlu4O3zbePu02Y2DKwHBmbvzMx2AbsAtm3bVoTwRFZeJuucHE/RPxoMkewbTXFsaJLB8Sn6\nRpMMjE7RMzRJ70jy9JWq57TV89Grz+bSLc1c3N7EjtZ6YlV6jLcUT9l1Arv7bmA3QEdHh84QpGxk\nss7IZJrBiSn6R1OMJacZS00zkkwzPJE+fUFT7mssNc1oMthmrvPdxpoYbY0JWusT7NzRwua1tWxb\nX0fH2es4p62h9L+kVJRiJIAeYGvO/JZw2VzbdJtZDGgm6AwWWTbuTjKdZSw1zcTU9OnKeGIquAJ1\n5krUialMUJFPppmYyjA5lWEinWEsGYxvD15pxsM7Qs6ntjoc015XTVNtNVtb6mhMxGisidFcW01r\nY4LWhuDV1pjgrLU1JGJqn5eVU4wE8DRwvpntIKjobwH+aNY2e4BbgV8BNwM/Vfu/zDadyTI+lWFi\naprx1DTjqaCSHp+aeZ97+UzlPp7KnN5mIpyePdxxPtVVRmNNNXXxKuriVdTGYzQmgouYGmtiNNYE\nFzA11VSzrr6atoYammpj1CeCZc211cRjap6R1aXgBBC26X8aeIRgGOjd7v6CmX0B6HT3PcA3gW+Z\nWRcwSJAkJCJmKu6RyTSnJqYYT2WYTIffrlMZRlPBN+iRyZlv0tOMpYLtZ75557s9wGy11VXUJ2LU\nJ6qojwfv6xvibIvXUZ+ooi4eoyERoy5RFbzHY9THq2iqnankY9TFg33UxatIxNZoGKRUnKL0Abj7\nXmDvrGV35kwngQ8XoywpnmzWGQ0r4Jk269FkmpHkTKUcvI8mpxlLpRlPBd/OJ9NZxlJpxpLBt/Cp\nOW7UNZeGsDlkptJtrq2mvbnm9O0BGhLVQYUerp+puM+syIPKW6NeRApXdp3A8ua4B5X4TCdkbmU+\nNKtDcmSO+XxNJI2JGE21QcU8UyG31FfRkGg43WQyU2EHzSPx09/Aa6uD5pSgco9pBItImVECKDOZ\nrDM0McXx4SR9o0n6R1MMjE0xEnZIDk8EzSz9oykGx4Mbcy1UicfWGM21QRt1U201LfVxdrTWn142\nszx3viGs9BsS+qYtEmVKACWUzTr9Y8H472NDSY4NTdJ9aoITIyleHQzue35qYmrO4YKJ2BoaEjGa\n66pZVxfnnLZ63rajhfX18dOdkE3hCJTcyrwuXqW2bRGZkxLAMkimM7zYO8rRk+N09Y3R1TfG0ZMT\nvHpy/PTDpWc0JmLBk4nW1XLltrWsrw/uojhzOX9bOHRQN+ESkWJTAihAOpPl8IkxXjoxyosnRnmp\nN3jvGZo8/S1+jcH21nq2tdRx9TktnNNaz1lra4NXcy3NdbpdroisDCWARZqcyrC/e4gDx4Kbcu17\nfYijA+NMhw3wsTXGuW0NXLF1LTdftSXnXi11+vYuImVJCWAe2ayzv2eYx17q57GX+tn3+tDpyr61\nIc7lW9byvos3cuGmRi7cFNynRRcCichqogSQI5t1nuse4u9f6OXv9h3j2HASM7h0czO3/dNzeNv2\ndVy6pZkNjTUrHaqISMGUAAg6bX/wbA/fePwIR/rHia0x3nV+K5+57gLe/ZY2WhsSKx2iiEjRVXQC\nSGey3PerV/nGL45wfDjJRe1N/I8PX877LtqozlkRibyKTQCPHx7gzocPcGRgnJ3bW/iLD1/ONeeu\n15h5EakYFZcARpNp/tveQ3znqdfZ0VrP33y0g/detEEVv4hUnIpKAF19Y9x279O8OjjBbe/awX/4\nvbdQF6+oQyAiclrF1H5PHx3kj+95mkRsDQ/suoadO1pWOiQRkRVVEQmgbyTJrvs6aW1I8K1P7GTL\nurqVDklEZMVFPgG4O599cD8TUxke+DdXqfIXEQlF/tLVh/cd4+cv9XPHDRfylo2NKx2OiEjZiHQC\nyGSdv/7pYS5qb+Kj12xf6XBERMpKpBPAY4f7ebl/nE9eey5r9GATEZEzRDoBPPRsD401Ma67ZONK\nhyIiUnYKSgBm1mJmj5rZ4fB93TzbZcxsX/jaU0iZi5VMZ3jkhV4+dOVmEjHdjllEZLZCzwBuB37i\n7ucDPwnn5zLp7leErw8UWOaiHDo+Qmo6yzvObS1FcSIiq06hCeAm4N5w+l7ggwXur2j2dw8DcNmW\n5hWORESkPBWaADa6+/FwuheYr7G9xsw6zewJMytJkniue4i2xgRnra0tRXEiIqtO3gvBzOzHwKY5\nVn0ud8bd3cx8nt2c7e49ZnYO8FMze97dX56nvF3ALoBt27blC29ePacm2bG+fsk/LyISdXkTgLu/\nd751ZnbCzNrd/biZtQN98+yjJ3w/Ymb/D7gSmDMBuPtuYDdAR0fHfAklr96RJJduVvOPiMh8Cm0C\n2gPcGk7fCjw8ewMzW2dmiXC6FXgncLDAchfk7vQOJ9X8IyKygEITwBeB95nZYeC94Txm1mFm3wi3\nuQjoNLPngJ8BX3T3ZU0Aw5NpUtNZNjbp2b0iIvMp6GZw7n4S+N05lncCt4XTvwQuLaScN6t/NAXA\nhkY9y1dEZD6RvBJ4YGwKgPX18RWORESkfEUyAYylpgForNGD3UVE5hPJBDAymQagoSbyjzsQEVmy\naCaAZJAA1tbqDEBEZD6RTACp6SwANdW6CZyIyHyimQDSQQKIxyL564mIFEUka8jUdIbYGqNKD4ER\nEZlXRBNAloS+/YuILCiStWRqOkNC7f8iIguKZgJI6wxARCSfSNaSagISEckvkrVkajqj5wCLiOQR\n0QSQJVEdyV9NRKRoIllLqg9ARCS/SNaSagISEckvkglgKqMzABGRfCJZS6bSWd0GQkQkj0jWkll3\n1phuAyEispBIJgAHUP0vIrKgSCYAXPW/iEg+BSUAM/uwmb1gZlkz61hgu+vN7EUz6zKz2wspU0RE\niqPQM4ADwO8Dj823gZlVAV8DbgAuBj5iZhcXWO6CPCh3OYsQEVn1CnporrsfgryV7U6gy92PhNve\nD9wEHCyk7DxxqQlIRCSPUvQBbAZez5nvDpctm+AMYDlLEBFZ/fKeAZjZj4FNc6z6nLs/XOyAzGwX\nsAtg27ZtS9qHqxNYRCSvvAnA3d9bYBk9wNac+S3hsvnK2w3sBujo6PClFOi4+gBERPIoRRPQ08D5\nZrbDzOLALcCe5SxQZwAiIvkVOgz0Q2bWDVwD/NDMHgmXn2VmewHcfRr4NPAIcAj4rru/UFjYiwlu\n2UsQEVnVCh0F9BDw0BzLjwE35szvBfYWUtabi6tUJYmIrF7RvBIYMJ0CiIgsKJIJwN01DFREJI9o\nJgDUBSAikk80E4DrQjARkXyimQBw9QGIiOQRyQQAOgMQEcknkglAw0BFRPKLZgJAZwAiIvlEMwHo\nmZAiInlFMgGArgMQEcknkglAN4MTEckvmgkA9QGIiOQTzQTgug5ARCSfSCYA0BmAiEg+kUwAugxA\nRCS/aCYAdQKLiOQV0QSgZwKLiOQTzQSw0gGIiKwCkUwA6HbQIiJ5RTIBBA+EUQYQEVlIQQnAzD5s\nZi+YWdbMOhbY7qiZPW9m+8yss5AyFx9bKUoREVm9YgX+/AHg94H/tYht3+PuAwWWtyiu+0GLiORV\nUAJw90NA2Y240TOBRUTyK1UfgAP/YGbPmNmuZS9MncAiInnlPQMwsx8Dm+ZY9Tl3f3iR5bzL3XvM\nbAPwqJn9xt0fm6e8XcAugG3bti1y92dydB2AiEg+eROAu7+30ELcvSd87zOzh4CdwJwJwN13A7sB\nOjo6ltSYf/0lm7iovXGJ0YqIVIZCO4HzMrN6YI27j4bTvwd8YTnL/PItVy7n7kVEIqHQYaAfMrNu\n4Brgh2b2SLj8LDPbG262EXjczJ4DngJ+6O5/X0i5IiJSuEJHAT0EPDTH8mPAjeH0EeDyQsoREZHi\ni+SVwCIikp8SgIhIhVICEBGpUEoAIiIVSglARKRCKQGIiFQoK+c7Z5pZP/DqEn+8FSjJ3UcLoBiL\nYzXECKsjTsVYHCsZ49nu3raYDcs6ARTCzDrdfd5nFJQDxVgcqyFGWB1xKsbiWA0xgpqAREQqlhKA\niEiFinIC2L3SASyCYiyO1RAjrI44FWNxrIYYo9sHICIiC4vyGYCIiCwgcgnAzK43sxfNrMvMbl+B\n8o+a2fNmts/MOsNlLWb2qJkdDt/XhcvNzP4qjHW/mb01Zz+3htsfNrNbixDX3WbWZ2YHcpYVLS4z\nuyr8vbvCn33Tj2SbJ8bPm1lPeDz3mdmNOevuCMt70cyuy1k+5/+Ame0wsyfD5Q+YWXwJMW41s5+Z\n2UEze8HM/l24vGyO5QIxls2xNLMaM3vKzJ4LY/wvC+3XzBLhfFe4fvtSYy9CjPeY2Ss5x/GKcPmK\nfG4K4u6ReQFVwMvAOUAceA64uMQxHAVaZy37c+D2cPp24Evh9I3AjwieYX818GS4vAU4Er6vC6fX\nFRjXu4G3AgeWIy6CZz1cHf7Mj4AbihTj54HPzLHtxeHfNwHsCP/uVQv9DwDfBW4Jp+8CPrmEGNuB\nt4bTjcBLYSxlcywXiLFsjmX4uzWE09XAk+HvPOd+gT8B7gqnbwEeWGrsRYjxHuDmObZfkc9NIa+o\nnQHsBLrc/Yi7TwH3AzetcEwQxHBvOH0v8MGc5fd54AlgrZm1A9cBj7r7oLufAh4Fri8kAA+ewTy4\nHHGF65rc/QkP/qvvy9lXoTHO5ybgfndPufsrQBfB33/O/4Hwm9XvAA/O8fu+mRiPu/uz4fQocAjY\nTBkdywVinE/Jj2V4PMbC2erw5QvsN/f4Pgj8bhjHm4q9SDHOZ0U+N4WIWgLYDLyeM9/Nwv/4y8GB\nfzCzZyx4wD3ARnc/Hk73EjwlDeaPt1S/R7Hi2hxOL1e8nw5Pqe+eaVpZQozrgSF3ny5WjGEzxJUE\n3wzL8ljOihHK6FiaWZWZ7QP6CCrFlxfY7+lYwvXDYRzL+hmaHaO7zxzH/xoex/9pZonZMS4yluX+\n3OQVtQRQDt7l7m8FbgA+ZWbvzl0ZZvqyG3pVrnEBXwfOBa4AjgN/sbLhBMysAfg+8KfuPpK7rlyO\n5RwxltWxdPeMu18BbCH4xn7hSsYzl9kxmtk/Ae4giPVtBM06/2kFQyxI1BJAD7A1Z35LuKxk3L0n\nfO8jeFzmTuBEeLpH+N4Xbj5fvKX6PYoVV084XfR43f1E+CHMAn9DcDyXEuNJglPy2Kzlb5qZVRNU\nrN929x+Ei8vqWM4VYzkeyzCuIeBnBM8Wn2+/p2MJ1zeHcZTkM5QT4/VhE5u7ewr4W5Z+HJftc7No\nxe5UWMkXwTOOjxB0Bs10/FxSwvLrgcac6V8StN3/d87sIPzzcPqfcWan0VP+206jVwg6jNaF0y1F\niG87Z3awFi0u3tiZdWORYmzPmf73BO29AJdwZuffEYKOv3n/B4DvcWYH458sIT4jaKv98qzlZXMs\nF4ixbI4l0AasDadrgV8A759vv8CnOLMT+LtLjb0IMbbnHOcvA19c6c/NkuuEUhZWkl8o6Il/iaA9\n8XMlLvuc8B/tOeCFmfIJ2ip/AhwGfpzzxzfga2GszwMdOfv6Y4IOrS7g40WI7TsEp/1pgrbGTxQz\nLqADOBD+zFcJLzIsQozfCmPYD+zhzErsc2F5L5IzemK+/4Hw7/NUGPv3gMQSYnwXQfPOfmBf+Lqx\nnI7lAjGWzbEELgN+HcZyALhzof0CNeF8V7j+nKXGXoQYfxoexwPA/+a3I4VW5HNTyEtXAouIVKio\n9QGIiMgiKQGIiFQoJQARkQqlBCAiUqGUAEREKpQSgIhIhVICEBGpUEoAIiIV6v8DkX2FEpFoGGYA\nAAAASUVORK5CYII=\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "plt.figure()\n", "plt.plot(sorted(coef))\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 105, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.27289331178589837" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coef[vocabulary['dress']]" ] }, { "cell_type": "code", "execution_count": 110, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "-0.35541530381124387" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coef[vocabulary['she']]" ] }, { "cell_type": "code", "execution_count": 112, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.46293652100681332" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coef[vocabulary['he']]" ] }, { "cell_type": "code", "execution_count": 111, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "-0.2128136378053142" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coef[vocabulary['the']] # ?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Error Analysis\n", "\n", "- Which ones do we get wrong?\n", "- Are there obvious reasons?" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 0 }