{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CS579: Lecture 13  \n",
    "\n",
    "**Demographic Inference II**\n",
    "\n",
    "*[Dr. Aron Culotta](http://cs.iit.edu/~culotta)*  \n",
    "*[Illinois Institute of Technology](http://iit.edu)*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Gender Classification\n",
    "\n",
    "Let's build a classifier to predict whether a Twitter user is male/female.\n",
    "\n",
    "We'll collect \"labeled\" training data using Census name list."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**1.) Collect Census names. **"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "found 4014 female and 1146 male names\n",
      "male name sample: ['stephen', 'mark', 'sammy', 'thanh', 'wallace']\n",
      "female name sample: ['marion', 'regena', 'kathryne', 'ashely', 'rosanna']\n"
     ]
    }
   ],
   "source": [
    "# Fetch male/female names from Census.\n",
    "\n",
    "import requests\n",
    "\n",
    "def get_census_names():\n",
    "    \"\"\" Fetch a list of common male/female names from the census.\n",
    "    For ambiguous names, we select the more frequent gender.\"\"\"\n",
    "    males = requests.get('http://www2.census.gov/topics/genealogy/1990surnames/dist.male.first').text.split('\\n')\n",
    "    females = requests.get('http://www2.census.gov/topics/genealogy/1990surnames/dist.female.first').text.split('\\n')\n",
    "    males_pct = dict([(m.split()[0].lower(), float(m.split()[1]))\n",
    "                  for m in males if m])\n",
    "    females_pct = dict([(f.split()[0].lower(), float(f.split()[1]))\n",
    "                    for f in females if f])\n",
    "    male_names = set([m for m in males_pct if m not in females_pct or\n",
    "                  males_pct[m] > females_pct[m]])\n",
    "    female_names = set([f for f in females_pct if f not in males_pct or\n",
    "                  females_pct[f] > males_pct[f]])    \n",
    "    return male_names, female_names\n",
    "\n",
    "male_names, female_names = get_census_names()\n",
    "print('found %d female and %d male names' % (len(female_names), len(male_names)))\n",
    "print('male name sample:', list(male_names)[:5])\n",
    "print('female name sample:', list(female_names)[:5])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**2.) Sample 5K tweets with names on the Census list. **"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Construct TwitterAPI object.\n",
    "\n",
    "import configparser\n",
    "from TwitterAPI import TwitterAPI\n",
    "\n",
    "def get_twitter(config_file):\n",
    "    config = configparser.ConfigParser()\n",
    "    config.read(config_file)\n",
    "    twitter = TwitterAPI(\n",
    "                   config.get('twitter', 'consumer_key'),\n",
    "                   config.get('twitter', 'consumer_secret'),\n",
    "                   config.get('twitter', 'access_token'),\n",
    "                   config.get('twitter', 'access_token_secret'))\n",
    "    return twitter\n",
    "\n",
    "twitter = get_twitter('twitter.cfg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "found 100 tweets\n",
      "found 200 tweets\n",
      "found 300 tweets\n",
      "found 400 tweets\n",
      "found 500 tweets\n",
      "found 600 tweets\n",
      "found 700 tweets\n",
      "found 800 tweets\n",
      "found 900 tweets\n",
      "found 1000 tweets\n",
      "found 1100 tweets\n",
      "found 1200 tweets\n",
      "found 1300 tweets\n",
      "found 1400 tweets\n",
      "found 1500 tweets\n",
      "found 1600 tweets\n",
      "found 1700 tweets\n",
      "found 1800 tweets\n",
      "found 1900 tweets\n",
      "found 2000 tweets\n",
      "found 2100 tweets\n",
      "found 2200 tweets\n",
      "found 2300 tweets\n",
      "found 2400 tweets\n",
      "found 2500 tweets\n",
      "found 2600 tweets\n",
      "found 2700 tweets\n",
      "found 2800 tweets\n",
      "found 2900 tweets\n",
      "found 3000 tweets\n",
      "found 3100 tweets\n",
      "found 3200 tweets\n",
      "found 3300 tweets\n",
      "found 3400 tweets\n",
      "found 3500 tweets\n",
      "found 3600 tweets\n",
      "found 3700 tweets\n",
      "found 3800 tweets\n",
      "found 3900 tweets\n",
      "found 4000 tweets\n",
      "found 4100 tweets\n",
      "found 4200 tweets\n",
      "found 4300 tweets\n",
      "found 4400 tweets\n",
      "found 4500 tweets\n",
      "found 4600 tweets\n",
      "found 4700 tweets\n",
      "found 4800 tweets\n",
      "found 4900 tweets\n",
      "found 5000 tweets\n"
     ]
    }
   ],
   "source": [
    "# Sample U.S. tweets with names from Census. \n",
    "import sys\n",
    "\n",
    "def get_first_name(tweet):\n",
    "    if 'user' in tweet and 'name' in tweet['user']:\n",
    "        parts = tweet['user']['name'].split()\n",
    "        if len(parts) > 0:\n",
    "            return parts[0].lower()\n",
    "\n",
    "def sample_tweets(twitter, limit, male_names, female_names):\n",
    "    tweets = []\n",
    "    while True:\n",
    "        try:\n",
    "            # Restrict to U.S.\n",
    "            for response in twitter.request('statuses/filter',\n",
    "                        {'locations':'-124.637,24.548,-66.993,48.9974'}):\n",
    "                if 'user' in response:\n",
    "                    name = get_first_name(response)\n",
    "                    if name in male_names or name in female_names:\n",
    "                        tweets.append(response)\n",
    "                        if len(tweets) % 100 == 0:\n",
    "                            print('found %d tweets' % len(tweets))\n",
    "                        if len(tweets) >= limit:\n",
    "                            return tweets\n",
    "        except:\n",
    "            print(\"Unexpected error:\", sys.exc_info()[0])\n",
    "    return tweets\n",
    "        \n",
    "tweets = sample_tweets(twitter, 5000, male_names, female_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# optionally read from disk\n",
    "# import pickle\n",
    "# tweets = pickle.load(open('tweets.pkl', 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sampled 5000 tweets\n",
      "top names: [('michael', 63), ('mike', 60), ('david', 57), ('matt', 48), ('chris', 48), ('john', 46), ('joe', 40), ('ryan', 39), ('mark', 38), ('brian', 35)]\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "\n",
    "print('sampled %d tweets' % len(tweets))\n",
    "print('top names:', Counter(get_first_name(t) for t in tweets).most_common(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Save these tweets.\n",
    "import pickle\n",
    "pickle.dump(tweets, open('tweets.pkl', 'wb'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**3.) Tokenize tweets. **"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test tweet:\n",
      "\tscreen_name=mickeystrand\n",
      "\tname=Mickey Strand\n",
      "\tdescr=Portrait Photographer, Beyond the Cut portrait project, WW2 Portrait project, Instructor & Mentor. Retired Navy Combat Camera Chief.\n",
      "\ttext=Working on upcoming course description for a Moab fine art shooting intense workshop.\n"
     ]
    }
   ],
   "source": [
    "test_tweet = tweets[1]\n",
    "print('test tweet:\\n\\tscreen_name=%s\\n\\tname=%s\\n\\tdescr=%s\\n\\ttext=%s' %\n",
    "      (test_tweet['user']['screen_name'],\n",
    "       test_tweet['user']['name'],\n",
    "       test_tweet['user']['description'],\n",
    "       test_tweet['text']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def tokenize(string, lowercase, keep_punctuation, prefix,\n",
    "             collapse_urls, collapse_mentions):\n",
    "    \"\"\" Split a tweet into tokens.\"\"\"\n",
    "    if not string:\n",
    "        return []\n",
    "    if lowercase:\n",
    "        string = string.lower()\n",
    "    tokens = []\n",
    "    if collapse_urls:\n",
    "        string = re.sub('http\\S+', 'THIS_IS_A_URL', string)\n",
    "    if collapse_mentions:\n",
    "        string = re.sub('@\\S+', 'THIS_IS_A_MENTION', string)\n",
    "    if keep_punctuation:\n",
    "        tokens = string.split()\n",
    "    else:\n",
    "        tokens = re.sub('\\W+', ' ', string).split()\n",
    "    if prefix:\n",
    "        tokens = ['%s%s' % (prefix, t) for t in tokens]\n",
    "    return tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['d=portrait',\n",
       " 'd=photographer',\n",
       " 'd=beyond',\n",
       " 'd=the',\n",
       " 'd=cut',\n",
       " 'd=portrait',\n",
       " 'd=project',\n",
       " 'd=ww2',\n",
       " 'd=portrait',\n",
       " 'd=project',\n",
       " 'd=instructor',\n",
       " 'd=mentor',\n",
       " 'd=retired',\n",
       " 'd=navy',\n",
       " 'd=combat',\n",
       " 'd=camera',\n",
       " 'd=chief']"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenize(test_tweet['user']['description'], lowercase=True,\n",
    "         keep_punctuation=False, prefix='d=',\n",
    "         collapse_urls=True, collapse_mentions=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['d=apple', 'd=banana', 'd=went', 'd=to', 'd=the', 'd=store']"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenize('apple-banana went to the store!', lowercase=True,\n",
    "         keep_punctuation=False, prefix='d=',\n",
    "         collapse_urls=True, collapse_mentions=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['t=working',\n",
       " 't=on',\n",
       " 't=upcoming',\n",
       " 't=course',\n",
       " 't=description',\n",
       " 't=for',\n",
       " 't=a',\n",
       " 't=moab',\n",
       " 't=fine',\n",
       " 't=art',\n",
       " 't=shooting',\n",
       " 't=intense',\n",
       " 't=workshop.']"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenize(test_tweet['text'], lowercase=True, keep_punctuation=True,\n",
    "         prefix='t=',\n",
    "         collapse_urls=True, collapse_mentions=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def tweet2tokens(tweet, use_descr=True, lowercase=True,\n",
    "                 keep_punctuation=True, descr_prefix='d=',\n",
    "                 collapse_urls=True, collapse_mentions=True):\n",
    "    \"\"\" Convert a tweet into a list of tokens, from the tweet text and optionally the\n",
    "    user description. \"\"\"\n",
    "    tokens = tokenize(tweet['text'], lowercase, keep_punctuation, None,\n",
    "                       collapse_urls, collapse_mentions)\n",
    "    if use_descr:\n",
    "        tokens.extend(tokenize(tweet['user']['description'], lowercase,\n",
    "                               keep_punctuation, descr_prefix,\n",
    "                               collapse_urls, collapse_mentions))\n",
    "    return tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['working',\n",
       " 'on',\n",
       " 'upcoming',\n",
       " 'course',\n",
       " 'description',\n",
       " 'for',\n",
       " 'a',\n",
       " 'moab',\n",
       " 'fine',\n",
       " 'art',\n",
       " 'shooting',\n",
       " 'intense',\n",
       " 'workshop.',\n",
       " 'd=portrait',\n",
       " 'd=photographer,',\n",
       " 'd=beyond',\n",
       " 'd=the',\n",
       " 'd=cut',\n",
       " 'd=portrait',\n",
       " 'd=project,',\n",
       " 'd=ww2',\n",
       " 'd=portrait',\n",
       " 'd=project,',\n",
       " 'd=instructor',\n",
       " 'd=&',\n",
       " 'd=mentor.',\n",
       " 'd=retired',\n",
       " 'd=navy',\n",
       " 'd=combat',\n",
       " 'd=camera',\n",
       " 'd=chief.']"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tweet2tokens(test_tweet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "use_descr=True  lower=True  punct=True  prefix=d=  url=True  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop.  d=portrait  d=photographer,  d=beyond  d=the  d=cut  d=portrait  d=project,  d=ww2  d=portrait  d=project,  d=instructor  d=&  d=mentor.  d=retired  d=navy  d=combat  d=camera  d=chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=True  prefix=d=  url=True  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop.  d=portrait  d=photographer,  d=beyond  d=the  d=cut  d=portrait  d=project,  d=ww2  d=portrait  d=project,  d=instructor  d=&  d=mentor.  d=retired  d=navy  d=combat  d=camera  d=chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=True  prefix=d=  url=False  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop.  d=portrait  d=photographer,  d=beyond  d=the  d=cut  d=portrait  d=project,  d=ww2  d=portrait  d=project,  d=instructor  d=&  d=mentor.  d=retired  d=navy  d=combat  d=camera  d=chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=True  prefix=d=  url=False  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop.  d=portrait  d=photographer,  d=beyond  d=the  d=cut  d=portrait  d=project,  d=ww2  d=portrait  d=project,  d=instructor  d=&  d=mentor.  d=retired  d=navy  d=combat  d=camera  d=chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=True  prefix=  url=True  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop.  portrait  photographer,  beyond  the  cut  portrait  project,  ww2  portrait  project,  instructor  &  mentor.  retired  navy  combat  camera  chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=True  prefix=  url=True  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop.  portrait  photographer,  beyond  the  cut  portrait  project,  ww2  portrait  project,  instructor  &  mentor.  retired  navy  combat  camera  chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=True  prefix=  url=False  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop.  portrait  photographer,  beyond  the  cut  portrait  project,  ww2  portrait  project,  instructor  &  mentor.  retired  navy  combat  camera  chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=True  prefix=  url=False  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop.  portrait  photographer,  beyond  the  cut  portrait  project,  ww2  portrait  project,  instructor  &  mentor.  retired  navy  combat  camera  chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=False  prefix=d=  url=True  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop  d=portrait  d=photographer  d=beyond  d=the  d=cut  d=portrait  d=project  d=ww2  d=portrait  d=project  d=instructor  d=mentor  d=retired  d=navy  d=combat  d=camera  d=chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=False  prefix=d=  url=True  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop  d=portrait  d=photographer  d=beyond  d=the  d=cut  d=portrait  d=project  d=ww2  d=portrait  d=project  d=instructor  d=mentor  d=retired  d=navy  d=combat  d=camera  d=chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=False  prefix=d=  url=False  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop  d=portrait  d=photographer  d=beyond  d=the  d=cut  d=portrait  d=project  d=ww2  d=portrait  d=project  d=instructor  d=mentor  d=retired  d=navy  d=combat  d=camera  d=chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=False  prefix=d=  url=False  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop  d=portrait  d=photographer  d=beyond  d=the  d=cut  d=portrait  d=project  d=ww2  d=portrait  d=project  d=instructor  d=mentor  d=retired  d=navy  d=combat  d=camera  d=chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=False  prefix=  url=True  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop  portrait  photographer  beyond  the  cut  portrait  project  ww2  portrait  project  instructor  mentor  retired  navy  combat  camera  chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=False  prefix=  url=True  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop  portrait  photographer  beyond  the  cut  portrait  project  ww2  portrait  project  instructor  mentor  retired  navy  combat  camera  chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=False  prefix=  url=False  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop  portrait  photographer  beyond  the  cut  portrait  project  ww2  portrait  project  instructor  mentor  retired  navy  combat  camera  chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=True  punct=False  prefix=  url=False  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop  portrait  photographer  beyond  the  cut  portrait  project  ww2  portrait  project  instructor  mentor  retired  navy  combat  camera  chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=True  prefix=d=  url=True  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop.  d=Portrait  d=Photographer,  d=Beyond  d=the  d=Cut  d=portrait  d=project,  d=WW2  d=Portrait  d=project,  d=Instructor  d=&  d=Mentor.  d=Retired  d=Navy  d=Combat  d=Camera  d=Chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=True  prefix=d=  url=True  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop.  d=Portrait  d=Photographer,  d=Beyond  d=the  d=Cut  d=portrait  d=project,  d=WW2  d=Portrait  d=project,  d=Instructor  d=&  d=Mentor.  d=Retired  d=Navy  d=Combat  d=Camera  d=Chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=True  prefix=d=  url=False  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop.  d=Portrait  d=Photographer,  d=Beyond  d=the  d=Cut  d=portrait  d=project,  d=WW2  d=Portrait  d=project,  d=Instructor  d=&  d=Mentor.  d=Retired  d=Navy  d=Combat  d=Camera  d=Chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=True  prefix=d=  url=False  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop.  d=Portrait  d=Photographer,  d=Beyond  d=the  d=Cut  d=portrait  d=project,  d=WW2  d=Portrait  d=project,  d=Instructor  d=&  d=Mentor.  d=Retired  d=Navy  d=Combat  d=Camera  d=Chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=True  prefix=  url=True  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop.  Portrait  Photographer,  Beyond  the  Cut  portrait  project,  WW2  Portrait  project,  Instructor  &  Mentor.  Retired  Navy  Combat  Camera  Chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=True  prefix=  url=True  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop.  Portrait  Photographer,  Beyond  the  Cut  portrait  project,  WW2  Portrait  project,  Instructor  &  Mentor.  Retired  Navy  Combat  Camera  Chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=True  prefix=  url=False  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop.  Portrait  Photographer,  Beyond  the  Cut  portrait  project,  WW2  Portrait  project,  Instructor  &  Mentor.  Retired  Navy  Combat  Camera  Chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=True  prefix=  url=False  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop.  Portrait  Photographer,  Beyond  the  Cut  portrait  project,  WW2  Portrait  project,  Instructor  &  Mentor.  Retired  Navy  Combat  Camera  Chief. \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=False  prefix=d=  url=True  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop  d=Portrait  d=Photographer  d=Beyond  d=the  d=Cut  d=portrait  d=project  d=WW2  d=Portrait  d=project  d=Instructor  d=Mentor  d=Retired  d=Navy  d=Combat  d=Camera  d=Chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=False  prefix=d=  url=True  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop  d=Portrait  d=Photographer  d=Beyond  d=the  d=Cut  d=portrait  d=project  d=WW2  d=Portrait  d=project  d=Instructor  d=Mentor  d=Retired  d=Navy  d=Combat  d=Camera  d=Chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=False  prefix=d=  url=False  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop  d=Portrait  d=Photographer  d=Beyond  d=the  d=Cut  d=portrait  d=project  d=WW2  d=Portrait  d=project  d=Instructor  d=Mentor  d=Retired  d=Navy  d=Combat  d=Camera  d=Chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=False  prefix=d=  url=False  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop  d=Portrait  d=Photographer  d=Beyond  d=the  d=Cut  d=portrait  d=project  d=WW2  d=Portrait  d=project  d=Instructor  d=Mentor  d=Retired  d=Navy  d=Combat  d=Camera  d=Chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=False  prefix=  url=True  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop  Portrait  Photographer  Beyond  the  Cut  portrait  project  WW2  Portrait  project  Instructor  Mentor  Retired  Navy  Combat  Camera  Chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=False  prefix=  url=True  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop  Portrait  Photographer  Beyond  the  Cut  portrait  project  WW2  Portrait  project  Instructor  Mentor  Retired  Navy  Combat  Camera  Chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=False  prefix=  url=False  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop  Portrait  Photographer  Beyond  the  Cut  portrait  project  WW2  Portrait  project  Instructor  Mentor  Retired  Navy  Combat  Camera  Chief \n",
      "----\n",
      "\n",
      "use_descr=True  lower=False  punct=False  prefix=  url=False  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop  Portrait  Photographer  Beyond  the  Cut  portrait  project  WW2  Portrait  project  Instructor  Mentor  Retired  Navy  Combat  Camera  Chief \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=True  prefix=d=  url=True  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=True  prefix=d=  url=True  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=True  prefix=d=  url=False  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=True  prefix=d=  url=False  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=True  prefix=  url=True  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=True  prefix=  url=True  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=True  prefix=  url=False  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=True  prefix=  url=False  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=False  prefix=d=  url=True  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=False  prefix=d=  url=True  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=False  prefix=d=  url=False  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=False  prefix=d=  url=False  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=False  prefix=  url=True  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=False  prefix=  url=True  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=False  prefix=  url=False  mention=True\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=True  punct=False  prefix=  url=False  mention=False\n",
      "working  on  upcoming  course  description  for  a  moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=True  prefix=d=  url=True  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=True  prefix=d=  url=True  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=True  prefix=d=  url=False  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=True  prefix=d=  url=False  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=True  prefix=  url=True  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=True  prefix=  url=True  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=True  prefix=  url=False  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=True  prefix=  url=False  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop. \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=False  prefix=d=  url=True  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=False  prefix=d=  url=True  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=False  prefix=d=  url=False  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=False  prefix=d=  url=False  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=False  prefix=  url=True  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=False  prefix=  url=True  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=False  prefix=  url=False  mention=True\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n",
      "use_descr=False  lower=False  punct=False  prefix=  url=False  mention=False\n",
      "Working  on  upcoming  course  description  for  a  Moab  fine  art  shooting  intense  workshop \n",
      "----\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# for enumerating all possible arguments of tweet2tokens\n",
    "# https://docs.python.org/2/library/itertools.html#itertools.product\n",
    "from itertools import product\n",
    "\n",
    "use_descr_opts = [True, False]\n",
    "lowercase_opts = [True, False]\n",
    "keep_punctuation_opts = [True, False]\n",
    "descr_prefix_opts = ['d=', '']\n",
    "url_opts = [True, False]\n",
    "mention_opts = [True, False]\n",
    "\n",
    "argnames = ['use_descr', 'lower', 'punct', 'prefix', 'url', 'mention']\n",
    "option_iter = product(use_descr_opts, lowercase_opts,\n",
    "                       keep_punctuation_opts,\n",
    "                       descr_prefix_opts, url_opts,\n",
    "                       mention_opts)\n",
    "for options in option_iter:\n",
    "    print('  '.join('%s=%s' % (name, opt) \n",
    "                    for name, opt in zip(argnames, options)))\n",
    "    print\n",
    "    print('  '.join(tweet2tokens(test_tweet, *options)), '\\n----\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Let's tokenize all tweets.\n",
    "tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,\n",
    "                            keep_punctuation=False, descr_prefix='d=',\n",
    "                            collapse_urls=True, collapse_mentions=True)\n",
    "              for t in tweets]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['working',\n",
       " 'on',\n",
       " 'upcoming',\n",
       " 'course',\n",
       " 'description',\n",
       " 'for',\n",
       " 'a',\n",
       " 'moab',\n",
       " 'fine',\n",
       " 'art',\n",
       " 'shooting',\n",
       " 'intense',\n",
       " 'workshop',\n",
       " 'd=portrait',\n",
       " 'd=photographer',\n",
       " 'd=beyond',\n",
       " 'd=the',\n",
       " 'd=cut',\n",
       " 'd=portrait',\n",
       " 'd=project',\n",
       " 'd=ww2',\n",
       " 'd=portrait',\n",
       " 'd=project',\n",
       " 'd=instructor',\n",
       " 'd=mentor',\n",
       " 'd=retired',\n",
       " 'd=navy',\n",
       " 'd=combat',\n",
       " 'd=camera',\n",
       " 'd=chief']"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokens_list[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[10]"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from collections import defaultdict\n",
    "d = defaultdict(lambda: [])\n",
    "d['cat'].append(10)\n",
    "d['cat']\n",
    "#v = {}\n",
    "#v['cat'].append(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Store these in a sparse matrix.\n",
    "\n",
    "#1) Create a vocabulary (dict from term->index)\n",
    "\n",
    "# https://docs.python.org/2/library/collections.html#collections.defaultdict\n",
    "from collections import defaultdict\n",
    "\n",
    "def make_vocabulary(tokens_list):\n",
    "    vocabulary = defaultdict(lambda: len(vocabulary))  # If term not present, assign next int.\n",
    "    for tokens in tokens_list:\n",
    "        for token in tokens:\n",
    "            vocabulary[token]  # looking up a key; defaultdict takes care of assigning it a value.\n",
    "    print('%d unique terms in vocabulary' % len(vocabulary))\n",
    "    return vocabulary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20234 unique terms in vocabulary\n"
     ]
    }
   ],
   "source": [
    "vocabulary = make_vocabulary(tokens_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('what', 0),\n",
       " ('THIS_IS_A_URL', 1),\n",
       " ('d=god', 2),\n",
       " ('d=the', 3),\n",
       " ('d=jonas', 4),\n",
       " ('d=brothers', 5),\n",
       " ('d=hold', 6),\n",
       " ('d=a', 7),\n",
       " ('d=special', 8),\n",
       " ('d=place', 9)]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# term->index\n",
    "list(vocabulary.items())[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "29731 unique terms in vocabulary\n"
     ]
    }
   ],
   "source": [
    "# How big is vocabulary if we keep punctuation?\n",
    "tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,\n",
    "                            keep_punctuation=True, descr_prefix='d=',\n",
    "                            collapse_urls=True, collapse_mentions=True)\n",
    "              for t in tweets]\n",
    "\n",
    "vocabulary = make_vocabulary(tokens_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32591 unique terms in vocabulary\n"
     ]
    }
   ],
   "source": [
    "# How big is vocabulary if we keep punctuation and urls?\n",
    "tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,\n",
    "                            keep_punctuation=True, descr_prefix='d=',\n",
    "                            collapse_urls=False, collapse_mentions=True)\n",
    "              for t in tweets]\n",
    "\n",
    "vocabulary = make_vocabulary(tokens_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "36909 unique terms in vocabulary\n"
     ]
    }
   ],
   "source": [
    "# How big is vocabulary if we keep punctuation and urls and mentions?\n",
    "tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,\n",
    "                            keep_punctuation=True, descr_prefix='d=',\n",
    "                            collapse_urls=False, collapse_mentions=False)\n",
    "              for t in tweets]\n",
    "\n",
    "vocabulary = make_vocabulary(tokens_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Vector Matrix\n",
    "\n",
    "Create a matrix $X$ where $X[i,j]$ is the frequency of term $j$ in tweet $i$.\n",
    "\n",
    "$$\n",
    "X = \\begin{pmatrix}\n",
    "~ & \\hbox{term}_1 & \\hbox{term}_2 & \\hbox{term}_3 & \\hbox{term}_4 \\\\\n",
    "\\hbox{tweet}_1 & 1  &  0  &  0 & 0 \\\\\n",
    "\\hbox{tweet}_2 & 0  &  0  &  0 & 2 \\\\\n",
    "\\hbox{tweet}_3 & 1  &  1  &  0 & 0 \\\\\n",
    "\\end{pmatrix}\n",
    "$$\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sparse Matrices\n",
    "\n",
    "$$\n",
    "X = \\begin{pmatrix}\n",
    "~ & \\hbox{term}_1 & \\hbox{term}_2 & \\hbox{term}_3 & \\hbox{term}_4 \\\\\n",
    "\\hbox{tweet}_1 & 1  &  0  &  0 & 0 \\\\\n",
    "\\hbox{tweet}_2 & 0  &  0  &  0 & 2 \\\\\n",
    "\\hbox{tweet}_3 & 1  &  1  &  0 & 0 \\\\\n",
    "\\end{pmatrix}\n",
    "$$\n",
    "\n",
    "$X$ is mostly $0$ for text problems."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## List of List (LIL) Matrix\n",
    "\n",
    "Store a linked list of (index, value) pairs for each row.\n",
    "\n",
    "$$\n",
    "X = \\begin{pmatrix}\n",
    "\\hbox{tweet}_1 & (0, 1)\\\\\n",
    "\\hbox{tweet}_2 & (3,2)\\\\\n",
    "\\hbox{tweet}_3 & (0,1), (1,1)\\\\\n",
    "\\end{pmatrix}\n",
    "$$\n",
    "\n",
    "**Advantage:** Fast to construct: append to list in constant time.\n",
    "\n",
    "**Disadvantage:** Slow random access for matrix-vector product.\n",
    "\n",
    "E.g., $\\hat{z} = X\\cdot \\hat{\\beta}$ to classify tweets using a learned weight vector $\\beta$\n",
    "\n",
    "$\\hat{z}[i] = \\sum_j X[i,j] * \\beta[j]$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compressed Sparse Row (CSR) Matrix\n",
    "\n",
    "\n",
    "$$\n",
    "X = \\begin{pmatrix}\n",
    "~ & \\hbox{term}_1 & \\hbox{term}_2 & \\hbox{term}_3 & \\hbox{term}_4 \\\\\n",
    "\\hbox{tweet}_1 & 1  &  0  &  0 & 0 \\\\\n",
    "\\hbox{tweet}_2 & 0  &  0  &  0 & 2 \\\\\n",
    "\\hbox{tweet}_3 & 1  &  1  &  0 & 0 \\\\\n",
    "\\hbox{tweet}_4 & 1  &  0  &  0 & 0 \\\\\n",
    "\\end{pmatrix}\n",
    "$$\n",
    "\n",
    "CSR Matrix is an object with three attributes: \n",
    "- **val:** $\\{1,2,1,1,1\\}$  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; *list of all non-zero values*  \n",
    "- **col_ind:** $\\{0,3,0,1,0\\}$ &nbsp; *column index for each non-zero value* (e.g., first non-zero value (1) is in column 0) \n",
    "- **row_ptr:** $\\{0,1,2,4\\}$ &nbsp;&nbsp;&nbsp; *index into **col_ind** where each row starts* (e.g., tweet3, term1 corresponds to col_ind[2])\n",
    "\n",
    "Allows efficient row access (good for us, since each row is a tweet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Convert features to a sparse matrix X.\n",
    "# X[i,j] is the frequency of term j in tweet i\n",
    "# \n",
    "from scipy.sparse import lil_matrix\n",
    "\n",
    "def make_feature_matrix(tokens_list, vocabulary):\n",
    "    X = lil_matrix((len(tweets), len(vocabulary)))\n",
    "    for i, tokens in enumerate(tokens_list):\n",
    "        for token in tokens:\n",
    "            j = vocabulary[token]\n",
    "            X[i,j] += 1\n",
    "    return X.tocsr()  # convert to CSR for more efficient random access."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "shape of X: (5000, 36909)\n"
     ]
    }
   ],
   "source": [
    "X = make_feature_matrix(tokens_list, vocabulary)\n",
    "print('shape of X:', X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on csr_matrix in module scipy.sparse.csr object:\n",
      "\n",
      "class csr_matrix(scipy.sparse.compressed._cs_matrix, scipy.sparse.sputils.IndexMixin)\n",
      " |  Compressed Sparse Row matrix\n",
      " |  \n",
      " |  This can be instantiated in several ways:\n",
      " |      csr_matrix(D)\n",
      " |          with a dense matrix or rank-2 ndarray D\n",
      " |  \n",
      " |      csr_matrix(S)\n",
      " |          with another sparse matrix S (equivalent to S.tocsr())\n",
      " |  \n",
      " |      csr_matrix((M, N), [dtype])\n",
      " |          to construct an empty matrix with shape (M, N)\n",
      " |          dtype is optional, defaulting to dtype='d'.\n",
      " |  \n",
      " |      csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])\n",
      " |          where ``data``, ``row_ind`` and ``col_ind`` satisfy the\n",
      " |          relationship ``a[row_ind[k], col_ind[k]] = data[k]``.\n",
      " |  \n",
      " |      csr_matrix((data, indices, indptr), [shape=(M, N)])\n",
      " |          is the standard CSR representation where the column indices for\n",
      " |          row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their\n",
      " |          corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``.\n",
      " |          If the shape parameter is not supplied, the matrix dimensions\n",
      " |          are inferred from the index arrays.\n",
      " |  \n",
      " |  Attributes\n",
      " |  ----------\n",
      " |  dtype : dtype\n",
      " |      Data type of the matrix\n",
      " |  shape : 2-tuple\n",
      " |      Shape of the matrix\n",
      " |  ndim : int\n",
      " |      Number of dimensions (this is always 2)\n",
      " |  nnz\n",
      " |      Number of nonzero elements\n",
      " |  data\n",
      " |      CSR format data array of the matrix\n",
      " |  indices\n",
      " |      CSR format index array of the matrix\n",
      " |  indptr\n",
      " |      CSR format index pointer array of the matrix\n",
      " |  has_sorted_indices\n",
      " |      Whether indices are sorted\n",
      " |  \n",
      " |  Notes\n",
      " |  -----\n",
      " |  \n",
      " |  Sparse matrices can be used in arithmetic operations: they support\n",
      " |  addition, subtraction, multiplication, division, and matrix power.\n",
      " |  \n",
      " |  Advantages of the CSR format\n",
      " |    - efficient arithmetic operations CSR + CSR, CSR * CSR, etc.\n",
      " |    - efficient row slicing\n",
      " |    - fast matrix vector products\n",
      " |  \n",
      " |  Disadvantages of the CSR format\n",
      " |    - slow column slicing operations (consider CSC)\n",
      " |    - changes to the sparsity structure are expensive (consider LIL or DOK)\n",
      " |  \n",
      " |  Examples\n",
      " |  --------\n",
      " |  \n",
      " |  >>> import numpy as np\n",
      " |  >>> from scipy.sparse import csr_matrix\n",
      " |  >>> csr_matrix((3, 4), dtype=np.int8).toarray()\n",
      " |  array([[0, 0, 0, 0],\n",
      " |         [0, 0, 0, 0],\n",
      " |         [0, 0, 0, 0]], dtype=int8)\n",
      " |  \n",
      " |  >>> row = np.array([0, 0, 1, 2, 2, 2])\n",
      " |  >>> col = np.array([0, 2, 2, 0, 1, 2])\n",
      " |  >>> data = np.array([1, 2, 3, 4, 5, 6])\n",
      " |  >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()\n",
      " |  array([[1, 0, 2],\n",
      " |         [0, 0, 3],\n",
      " |         [4, 5, 6]])\n",
      " |  \n",
      " |  >>> indptr = np.array([0, 2, 3, 6])\n",
      " |  >>> indices = np.array([0, 2, 2, 0, 1, 2])\n",
      " |  >>> data = np.array([1, 2, 3, 4, 5, 6])\n",
      " |  >>> csr_matrix((data, indices, indptr), shape=(3, 3)).toarray()\n",
      " |  array([[1, 0, 2],\n",
      " |         [0, 0, 3],\n",
      " |         [4, 5, 6]])\n",
      " |  \n",
      " |  As an example of how to construct a CSR matrix incrementally,\n",
      " |  the following snippet builds a term-document matrix from texts:\n",
      " |  \n",
      " |  >>> docs = [[\"hello\", \"world\", \"hello\"], [\"goodbye\", \"cruel\", \"world\"]]\n",
      " |  >>> indptr = [0]\n",
      " |  >>> indices = []\n",
      " |  >>> data = []\n",
      " |  >>> vocabulary = {}\n",
      " |  >>> for d in docs:\n",
      " |  ...     for term in d:\n",
      " |  ...         index = vocabulary.setdefault(term, len(vocabulary))\n",
      " |  ...         indices.append(index)\n",
      " |  ...         data.append(1)\n",
      " |  ...     indptr.append(len(indices))\n",
      " |  ...\n",
      " |  >>> csr_matrix((data, indices, indptr), dtype=int).toarray()\n",
      " |  array([[2, 1, 0, 0],\n",
      " |         [0, 1, 1, 1]])\n",
      " |  \n",
      " |  Method resolution order:\n",
      " |      csr_matrix\n",
      " |      scipy.sparse.compressed._cs_matrix\n",
      " |      scipy.sparse.data._data_matrix\n",
      " |      scipy.sparse.base.spmatrix\n",
      " |      scipy.sparse.data._minmax_mixin\n",
      " |      scipy.sparse.sputils.IndexMixin\n",
      " |      builtins.object\n",
      " |  \n",
      " |  Methods defined here:\n",
      " |  \n",
      " |  __getitem__(self, key)\n",
      " |  \n",
      " |  getcol(self, i)\n",
      " |      Returns a copy of column i of the matrix, as a (m x 1)\n",
      " |      CSR matrix (column vector).\n",
      " |  \n",
      " |  getrow(self, i)\n",
      " |      Returns a copy of row i of the matrix, as a (1 x n)\n",
      " |      CSR matrix (row vector).\n",
      " |  \n",
      " |  tobsr(self, blocksize=None, copy=True)\n",
      " |      Convert this matrix to Block Sparse Row format.\n",
      " |      \n",
      " |      With copy=False, the data/indices may be shared between this matrix and\n",
      " |      the resultant bsr_matrix.\n",
      " |      \n",
      " |      When blocksize=(R, C) is provided, it will be used for construction of\n",
      " |      the bsr_matrix.\n",
      " |  \n",
      " |  tocsc(self, copy=False)\n",
      " |      Convert this matrix to Compressed Sparse Column format.\n",
      " |      \n",
      " |      With copy=False, the data/indices may be shared between this matrix and\n",
      " |      the resultant csc_matrix.\n",
      " |  \n",
      " |  tocsr(self, copy=False)\n",
      " |      Convert this matrix to Compressed Sparse Row format.\n",
      " |      \n",
      " |      With copy=False, the data/indices may be shared between this matrix and\n",
      " |      the resultant csr_matrix.\n",
      " |  \n",
      " |  tolil(self, copy=False)\n",
      " |      Convert this matrix to LInked List format.\n",
      " |      \n",
      " |      With copy=False, the data/indices may be shared between this matrix and\n",
      " |      the resultant lil_matrix.\n",
      " |  \n",
      " |  transpose(self, axes=None, copy=False)\n",
      " |      Reverses the dimensions of the sparse matrix.\n",
      " |      \n",
      " |      Parameters\n",
      " |      ----------\n",
      " |      axes : None, optional\n",
      " |          This argument is in the signature *solely* for NumPy\n",
      " |          compatibility reasons. Do not pass in anything except\n",
      " |          for the default value.\n",
      " |      copy : bool, optional\n",
      " |          Indicates whether or not attributes of `self` should be\n",
      " |          copied whenever possible. The degree to which attributes\n",
      " |          are copied varies depending on the type of sparse matrix\n",
      " |          being used.\n",
      " |      \n",
      " |      Returns\n",
      " |      -------\n",
      " |      p : `self` with the dimensions reversed.\n",
      " |      \n",
      " |      See Also\n",
      " |      --------\n",
      " |      np.matrix.transpose : NumPy's implementation of 'transpose'\n",
      " |                            for matrices\n",
      " |  \n",
      " |  ----------------------------------------------------------------------\n",
      " |  Data and other attributes defined here:\n",
      " |  \n",
      " |  format = 'csr'\n",
      " |  \n",
      " |  ----------------------------------------------------------------------\n",
      " |  Methods inherited from scipy.sparse.compressed._cs_matrix:\n",
      " |  \n",
      " |  __add__(self, other)\n",
      " |  \n",
      " |  __eq__(self, other)\n",
      " |      Return self==value.\n",
      " |  \n",
      " |  __ge__(self, other)\n",
      " |      Return self>=value.\n",
      " |  \n",
      " |  __gt__(self, other)\n",
      " |      Return self>value.\n",
      " |  \n",
      " |  __init__(self, arg1, shape=None, dtype=None, copy=False)\n",
      " |      Initialize self.  See help(type(self)) for accurate signature.\n",
      " |  \n",
      " |  __le__(self, other)\n",
      " |      Return self<=value.\n",
      " |  \n",
      " |  __lt__(self, other)\n",
      " |      Return self<value.\n",
      " |  \n",
      " |  __ne__(self, other)\n",
      " |      Return self!=value.\n",
      " |  \n",
      " |  __radd__(self, other)\n",
      " |  \n",
      " |  __rsub__(self, other)\n",
      " |  \n",
      " |  __setitem__(self, index, x)\n",
      " |  \n",
      " |  __sub__(self, other)\n",
      " |  \n",
      " |  check_format(self, full_check=True)\n",
      " |      check whether the matrix format is valid\n",
      " |      \n",
      " |      Parameters\n",
      " |      ----------\n",
      " |      full_check : bool, optional\n",
      " |          If `True`, rigorous check, O(N) operations. Otherwise\n",
      " |          basic check, O(1) operations (default True).\n",
      " |  \n",
      " |  diagonal(self)\n",
      " |      Returns the main diagonal of the matrix\n",
      " |  \n",
      " |  eliminate_zeros(self)\n",
      " |      Remove zero entries from the matrix\n",
      " |      \n",
      " |      This is an *in place* operation\n",
      " |  \n",
      " |  getnnz(self, axis=None)\n",
      " |      Number of stored values, including explicit zeros.\n",
      " |      \n",
      " |      Parameters\n",
      " |      ----------\n",
      " |      axis : None, 0, or 1\n",
      " |          Select between the number of values across the whole matrix, in\n",
      " |          each column, or in each row.\n",
      " |      \n",
      " |      See also\n",
      " |      --------\n",
      " |      count_nonzero : Number of non-zero entries\n",
      " |  \n",
      " |  maximum(self, other)\n",
      " |  \n",
      " |  minimum(self, other)\n",
      " |  \n",
      " |  multiply(self, other)\n",
      " |      Point-wise multiplication by another matrix, vector, or\n",
      " |      scalar.\n",
      " |  \n",
      " |  prune(self)\n",
      " |      Remove empty space after all non-zero elements.\n",
      " |  \n",
      " |  sort_indices(self)\n",
      " |      Sort the indices of this matrix *in place*\n",
      " |  \n",
      " |  sorted_indices(self)\n",
      " |      Return a copy of this matrix with sorted indices\n",
      " |  \n",
      " |  sum(self, axis=None, dtype=None, out=None)\n",
      " |      Sum the matrix elements over a given axis.\n",
      " |      \n",
      " |      Parameters\n",
      " |      ----------\n",
      " |      axis : {-2, -1, 0, 1, None} optional\n",
      " |          Axis along which the sum is computed. The default is to\n",
      " |          compute the sum of all the matrix elements, returning a scalar\n",
      " |          (i.e. `axis` = `None`).\n",
      " |      dtype : dtype, optional\n",
      " |          The type of the returned matrix and of the accumulator in which\n",
      " |          the elements are summed.  The dtype of `a` is used by default\n",
      " |          unless `a` has an integer dtype of less precision than the default\n",
      " |          platform integer.  In that case, if `a` is signed then the platform\n",
      " |          integer is used while if `a` is unsigned then an unsigned integer\n",
      " |          of the same precision as the platform integer is used.\n",
      " |      \n",
      " |          .. versionadded: 0.18.0\n",
      " |      \n",
      " |      out : np.matrix, optional\n",
      " |          Alternative output matrix in which to place the result. It must\n",
      " |          have the same shape as the expected output, but the type of the\n",
      " |          output values will be cast if necessary.\n",
      " |      \n",
      " |          .. versionadded: 0.18.0\n",
      " |      \n",
      " |      Returns\n",
      " |      -------\n",
      " |      sum_along_axis : np.matrix\n",
      " |          A matrix with the same shape as `self`, with the specified\n",
      " |          axis removed.\n",
      " |      \n",
      " |      See Also\n",
      " |      --------\n",
      " |      np.matrix.sum : NumPy's implementation of 'sum' for matrices\n",
      " |  \n",
      " |  sum_duplicates(self)\n",
      " |      Eliminate duplicate matrix entries by adding them together\n",
      " |      \n",
      " |      The is an *in place* operation\n",
      " |  \n",
      " |  toarray(self, order=None, out=None)\n",
      " |      See the docstring for `spmatrix.toarray`.\n",
      " |  \n",
      " |  tocoo(self, copy=True)\n",
      " |      Convert this matrix to COOrdinate format.\n",
      " |      \n",
      " |      With copy=False, the data/indices may be shared between this matrix and\n",
      " |      the resultant coo_matrix.\n",
      " |  \n",
      " |  ----------------------------------------------------------------------\n",
      " |  Data descriptors inherited from scipy.sparse.compressed._cs_matrix:\n",
      " |  \n",
      " |  has_canonical_format\n",
      " |      Determine whether the matrix has sorted indices and no duplicates\n",
      " |      \n",
      " |      Returns\n",
      " |          - True: if the above applies\n",
      " |          - False: otherwise\n",
      " |      \n",
      " |      has_canonical_format implies has_sorted_indices, so if the latter flag\n",
      " |      is False, so will the former be; if the former is found True, the\n",
      " |      latter flag is also set.\n",
      " |  \n",
      " |  has_sorted_indices\n",
      " |      Determine whether the matrix has sorted indices\n",
      " |      \n",
      " |      Returns\n",
      " |          - True: if the indices of the matrix are in sorted order\n",
      " |          - False: otherwise\n",
      " |  \n",
      " |  ----------------------------------------------------------------------\n",
      " |  Data and other attributes inherited from scipy.sparse.compressed._cs_matrix:\n",
      " |  \n",
      " |  __hash__ = None\n",
      " |  \n",
      " |  ----------------------------------------------------------------------\n",
      " |  Methods inherited from scipy.sparse.data._data_matrix:\n",
      " |  \n",
      " |  __abs__(self)\n",
      " |  \n",
      " |  __imul__(self, other)\n",
      " |  \n",
      " |  __itruediv__(self, other)\n",
      " |  \n",
      " |  __neg__(self)\n",
      " |  \n",
      " |  arcsin(self)\n",
      " |      Element-wise arcsin.\n",
      " |      \n",
      " |      See numpy.arcsin for more information.\n",
      " |  \n",
      " |  arcsinh(self)\n",
      " |      Element-wise arcsinh.\n",
      " |      \n",
      " |      See numpy.arcsinh for more information.\n",
      " |  \n",
      " |  arctan(self)\n",
      " |      Element-wise arctan.\n",
      " |      \n",
      " |      See numpy.arctan for more information.\n",
      " |  \n",
      " |  arctanh(self)\n",
      " |      Element-wise arctanh.\n",
      " |      \n",
      " |      See numpy.arctanh for more information.\n",
      " |  \n",
      " |  astype(self, t)\n",
      " |  \n",
      " |  ceil(self)\n",
      " |      Element-wise ceil.\n",
      " |      \n",
      " |      See numpy.ceil for more information.\n",
      " |  \n",
      " |  conj(self)\n",
      " |  \n",
      " |  copy(self)\n",
      " |      Returns a copy of this matrix.\n",
      " |      \n",
      " |      No data/indices will be shared between the returned value and current\n",
      " |      matrix.\n",
      " |  \n",
      " |  count_nonzero(self)\n",
      " |      Number of non-zero entries, equivalent to\n",
      " |      \n",
      " |      np.count_nonzero(a.toarray())\n",
      " |      \n",
      " |      Unlike getnnz() and the nnz property, which return the number of stored\n",
      " |      entries (the length of the data attribute), this method counts the\n",
      " |      actual number of non-zero entries in data.\n",
      " |  \n",
      " |  deg2rad(self)\n",
      " |      Element-wise deg2rad.\n",
      " |      \n",
      " |      See numpy.deg2rad for more information.\n",
      " |  \n",
      " |  expm1(self)\n",
      " |      Element-wise expm1.\n",
      " |      \n",
      " |      See numpy.expm1 for more information.\n",
      " |  \n",
      " |  floor(self)\n",
      " |      Element-wise floor.\n",
      " |      \n",
      " |      See numpy.floor for more information.\n",
      " |  \n",
      " |  log1p(self)\n",
      " |      Element-wise log1p.\n",
      " |      \n",
      " |      See numpy.log1p for more information.\n",
      " |  \n",
      " |  power(self, n, dtype=None)\n",
      " |      This function performs element-wise power.\n",
      " |      \n",
      " |      Parameters\n",
      " |      ----------\n",
      " |      n : n is a scalar\n",
      " |      \n",
      " |      dtype : If dtype is not specified, the current dtype will be preserved.\n",
      " |  \n",
      " |  rad2deg(self)\n",
      " |      Element-wise rad2deg.\n",
      " |      \n",
      " |      See numpy.rad2deg for more information.\n",
      " |  \n",
      " |  rint(self)\n",
      " |      Element-wise rint.\n",
      " |      \n",
      " |      See numpy.rint for more information.\n",
      " |  \n",
      " |  sign(self)\n",
      " |      Element-wise sign.\n",
      " |      \n",
      " |      See numpy.sign for more information.\n",
      " |  \n",
      " |  sin(self)\n",
      " |      Element-wise sin.\n",
      " |      \n",
      " |      See numpy.sin for more information.\n",
      " |  \n",
      " |  sinh(self)\n",
      " |      Element-wise sinh.\n",
      " |      \n",
      " |      See numpy.sinh for more information.\n",
      " |  \n",
      " |  sqrt(self)\n",
      " |      Element-wise sqrt.\n",
      " |      \n",
      " |      See numpy.sqrt for more information.\n",
      " |  \n",
      " |  tan(self)\n",
      " |      Element-wise tan.\n",
      " |      \n",
      " |      See numpy.tan for more information.\n",
      " |  \n",
      " |  tanh(self)\n",
      " |      Element-wise tanh.\n",
      " |      \n",
      " |      See numpy.tanh for more information.\n",
      " |  \n",
      " |  trunc(self)\n",
      " |      Element-wise trunc.\n",
      " |      \n",
      " |      See numpy.trunc for more information.\n",
      " |  \n",
      " |  ----------------------------------------------------------------------\n",
      " |  Data descriptors inherited from scipy.sparse.data._data_matrix:\n",
      " |  \n",
      " |  dtype\n",
      " |  \n",
      " |  ----------------------------------------------------------------------\n",
      " |  Methods inherited from scipy.sparse.base.spmatrix:\n",
      " |  \n",
      " |  __bool__(self)\n",
      " |  \n",
      " |  __div__(self, other)\n",
      " |  \n",
      " |  __getattr__(self, attr)\n",
      " |  \n",
      " |  __iadd__(self, other)\n",
      " |  \n",
      " |  __idiv__(self, other)\n",
      " |  \n",
      " |  __isub__(self, other)\n",
      " |  \n",
      " |  __iter__(self)\n",
      " |  \n",
      " |  __len__(self)\n",
      " |      # What should len(sparse) return? For consistency with dense matrices,\n",
      " |      # perhaps it should be the number of rows?  But for some uses the number of\n",
      " |      # non-zeros is more important.  For now, raise an exception!\n",
      " |  \n",
      " |  __matmul__(self, other)\n",
      " |  \n",
      " |  __mul__(self, other)\n",
      " |      interpret other and call one of the following\n",
      " |      \n",
      " |      self._mul_scalar()\n",
      " |      self._mul_vector()\n",
      " |      self._mul_multivector()\n",
      " |      self._mul_sparse_matrix()\n",
      " |  \n",
      " |  __nonzero__ = __bool__(self)\n",
      " |  \n",
      " |  __numpy_ufunc__(self, func, method, pos, inputs, **kwargs)\n",
      " |      Method for compatibility with NumPy's ufuncs and dot\n",
      " |      functions.\n",
      " |  \n",
      " |  __pow__(self, other)\n",
      " |  \n",
      " |  __rdiv__(self, other)\n",
      " |  \n",
      " |  __repr__(self)\n",
      " |      Return repr(self).\n",
      " |  \n",
      " |  __rmatmul__(self, other)\n",
      " |  \n",
      " |  __rmul__(self, other)\n",
      " |  \n",
      " |  __rtruediv__(self, other)\n",
      " |  \n",
      " |  __str__(self)\n",
      " |      Return str(self).\n",
      " |  \n",
      " |  __truediv__(self, other)\n",
      " |  \n",
      " |  asformat(self, format)\n",
      " |      Return this matrix in a given sparse format\n",
      " |      \n",
      " |      Parameters\n",
      " |      ----------\n",
      " |      format : {string, None}\n",
      " |          desired sparse matrix format\n",
      " |              - None for no format conversion\n",
      " |              - \"csr\" for csr_matrix format\n",
      " |              - \"csc\" for csc_matrix format\n",
      " |              - \"lil\" for lil_matrix format\n",
      " |              - \"dok\" for dok_matrix format and so on\n",
      " |  \n",
      " |  asfptype(self)\n",
      " |      Upcast matrix to a floating point format (if necessary)\n",
      " |  \n",
      " |  conjugate(self)\n",
      " |  \n",
      " |  dot(self, other)\n",
      " |      Ordinary dot product\n",
      " |      \n",
      " |      Examples\n",
      " |      --------\n",
      " |      >>> import numpy as np\n",
      " |      >>> from scipy.sparse import csr_matrix\n",
      " |      >>> A = csr_matrix([[1, 2, 0], [0, 0, 3], [4, 0, 5]])\n",
      " |      >>> v = np.array([1, 0, -1])\n",
      " |      >>> A.dot(v)\n",
      " |      array([ 1, -3, -1], dtype=int64)\n",
      " |  \n",
      " |  getH(self)\n",
      " |      # Renamed conjtranspose() -> getH() for compatibility with dense matrices\n",
      " |  \n",
      " |  get_shape(self)\n",
      " |  \n",
      " |  getformat(self)\n",
      " |  \n",
      " |  getmaxprint(self)\n",
      " |  \n",
      " |  mean(self, axis=None, dtype=None, out=None)\n",
      " |      Compute the arithmetic mean along the specified axis.\n",
      " |      \n",
      " |      Returns the average of the matrix elements. The average is taken\n",
      " |      over all elements in the matrix by default, otherwise over the\n",
      " |      specified axis. `float64` intermediate and return values are used\n",
      " |      for integer inputs.\n",
      " |      \n",
      " |      Parameters\n",
      " |      ----------\n",
      " |      axis : {-2, -1, 0, 1, None} optional\n",
      " |          Axis along which the mean is computed. The default is to compute\n",
      " |          the mean of all elements in the matrix (i.e. `axis` = `None`).\n",
      " |      dtype : data-type, optional\n",
      " |          Type to use in computing the mean. For integer inputs, the default\n",
      " |          is `float64`; for floating point inputs, it is the same as the\n",
      " |          input dtype.\n",
      " |      \n",
      " |          .. versionadded: 0.18.0\n",
      " |      \n",
      " |      out : np.matrix, optional\n",
      " |          Alternative output matrix in which to place the result. It must\n",
      " |          have the same shape as the expected output, but the type of the\n",
      " |          output values will be cast if necessary.\n",
      " |      \n",
      " |          .. versionadded: 0.18.0\n",
      " |      \n",
      " |      Returns\n",
      " |      -------\n",
      " |      m : np.matrix\n",
      " |      \n",
      " |      See Also\n",
      " |      --------\n",
      " |      np.matrix.mean : NumPy's implementation of 'mean' for matrices\n",
      " |  \n",
      " |  nonzero(self)\n",
      " |      nonzero indices\n",
      " |      \n",
      " |      Returns a tuple of arrays (row,col) containing the indices\n",
      " |      of the non-zero elements of the matrix.\n",
      " |      \n",
      " |      Examples\n",
      " |      --------\n",
      " |      >>> from scipy.sparse import csr_matrix\n",
      " |      >>> A = csr_matrix([[1,2,0],[0,0,3],[4,0,5]])\n",
      " |      >>> A.nonzero()\n",
      " |      (array([0, 0, 1, 2, 2]), array([0, 1, 2, 0, 2]))\n",
      " |  \n",
      " |  reshape(self, shape, order='C')\n",
      " |      Gives a new shape to a sparse matrix without changing its data.\n",
      " |      \n",
      " |      Parameters\n",
      " |      ----------\n",
      " |      shape : length-2 tuple of ints\n",
      " |          The new shape should be compatible with the original shape.\n",
      " |      order : 'C', optional\n",
      " |          This argument is in the signature *solely* for NumPy\n",
      " |          compatibility reasons. Do not pass in anything except\n",
      " |          for the default value, as this argument is not used.\n",
      " |      \n",
      " |      Returns\n",
      " |      -------\n",
      " |      reshaped_matrix : `self` with the new dimensions of `shape`\n",
      " |      \n",
      " |      See Also\n",
      " |      --------\n",
      " |      np.matrix.reshape : NumPy's implementation of 'reshape' for matrices\n",
      " |  \n",
      " |  set_shape(self, shape)\n",
      " |  \n",
      " |  setdiag(self, values, k=0)\n",
      " |      Set diagonal or off-diagonal elements of the array.\n",
      " |      \n",
      " |      Parameters\n",
      " |      ----------\n",
      " |      values : array_like\n",
      " |          New values of the diagonal elements.\n",
      " |      \n",
      " |          Values may have any length.  If the diagonal is longer than values,\n",
      " |          then the remaining diagonal entries will not be set.  If values if\n",
      " |          longer than the diagonal, then the remaining values are ignored.\n",
      " |      \n",
      " |          If a scalar value is given, all of the diagonal is set to it.\n",
      " |      \n",
      " |      k : int, optional\n",
      " |          Which off-diagonal to set, corresponding to elements a[i,i+k].\n",
      " |          Default: 0 (the main diagonal).\n",
      " |  \n",
      " |  todense(self, order=None, out=None)\n",
      " |      Return a dense matrix representation of this matrix.\n",
      " |      \n",
      " |      Parameters\n",
      " |      ----------\n",
      " |      order : {'C', 'F'}, optional\n",
      " |          Whether to store multi-dimensional data in C (row-major)\n",
      " |          or Fortran (column-major) order in memory. The default\n",
      " |          is 'None', indicating the NumPy default of C-ordered.\n",
      " |          Cannot be specified in conjunction with the `out`\n",
      " |          argument.\n",
      " |      \n",
      " |      out : ndarray, 2-dimensional, optional\n",
      " |          If specified, uses this array (or `numpy.matrix`) as the\n",
      " |          output buffer instead of allocating a new array to\n",
      " |          return. The provided array must have the same shape and\n",
      " |          dtype as the sparse matrix on which you are calling the\n",
      " |          method.\n",
      " |      \n",
      " |      Returns\n",
      " |      -------\n",
      " |      arr : numpy.matrix, 2-dimensional\n",
      " |          A NumPy matrix object with the same shape and containing\n",
      " |          the same data represented by the sparse matrix, with the\n",
      " |          requested memory order. If `out` was passed and was an\n",
      " |          array (rather than a `numpy.matrix`), it will be filled\n",
      " |          with the appropriate values and returned wrapped in a\n",
      " |          `numpy.matrix` object that shares the same memory.\n",
      " |  \n",
      " |  todia(self, copy=False)\n",
      " |      Convert this matrix to sparse DIAgonal format.\n",
      " |      \n",
      " |      With copy=False, the data/indices may be shared between this matrix and\n",
      " |      the resultant dia_matrix.\n",
      " |  \n",
      " |  todok(self, copy=False)\n",
      " |      Convert this matrix to Dictionary Of Keys format.\n",
      " |      \n",
      " |      With copy=False, the data/indices may be shared between this matrix and\n",
      " |      the resultant dok_matrix.\n",
      " |  \n",
      " |  ----------------------------------------------------------------------\n",
      " |  Data descriptors inherited from scipy.sparse.base.spmatrix:\n",
      " |  \n",
      " |  __dict__\n",
      " |      dictionary for instance variables (if defined)\n",
      " |  \n",
      " |  __weakref__\n",
      " |      list of weak references to the object (if defined)\n",
      " |  \n",
      " |  nnz\n",
      " |      Number of stored values, including explicit zeros.\n",
      " |      \n",
      " |      See also\n",
      " |      --------\n",
      " |      count_nonzero : Number of non-zero entries\n",
      " |  \n",
      " |  shape\n",
      " |  \n",
      " |  ----------------------------------------------------------------------\n",
      " |  Data and other attributes inherited from scipy.sparse.base.spmatrix:\n",
      " |  \n",
      " |  __array_priority__ = 10.1\n",
      " |  \n",
      " |  ndim = 2\n",
      " |  \n",
      " |  ----------------------------------------------------------------------\n",
      " |  Methods inherited from scipy.sparse.data._minmax_mixin:\n",
      " |  \n",
      " |  max(self, axis=None, out=None)\n",
      " |      Return the maximum of the matrix or maximum along an axis.\n",
      " |      This takes all elements into account, not just the non-zero ones.\n",
      " |      \n",
      " |      Parameters\n",
      " |      ----------\n",
      " |      axis : {-2, -1, 0, 1, None} optional\n",
      " |          Axis along which the sum is computed. The default is to\n",
      " |          compute the maximum over all the matrix elements, returning\n",
      " |          a scalar (i.e. `axis` = `None`).\n",
      " |      \n",
      " |      out : None, optional\n",
      " |          This argument is in the signature *solely* for NumPy\n",
      " |          compatibility reasons. Do not pass in anything except\n",
      " |          for the default value, as this argument is not used.\n",
      " |      \n",
      " |      Returns\n",
      " |      -------\n",
      " |      amax : coo_matrix or scalar\n",
      " |          Maximum of `a`. If `axis` is None, the result is a scalar value.\n",
      " |          If `axis` is given, the result is a sparse.coo_matrix of dimension\n",
      " |          ``a.ndim - 1``.\n",
      " |      \n",
      " |      See Also\n",
      " |      --------\n",
      " |      min : The minimum value of a sparse matrix along a given axis.\n",
      " |      np.matrix.max : NumPy's implementation of 'max' for matrices\n",
      " |  \n",
      " |  min(self, axis=None, out=None)\n",
      " |      Return the minimum of the matrix or maximum along an axis.\n",
      " |      This takes all elements into account, not just the non-zero ones.\n",
      " |      \n",
      " |      Parameters\n",
      " |      ----------\n",
      " |      axis : {-2, -1, 0, 1, None} optional\n",
      " |          Axis along which the sum is computed. The default is to\n",
      " |          compute the minimum over all the matrix elements, returning\n",
      " |          a scalar (i.e. `axis` = `None`).\n",
      " |      \n",
      " |      out : None, optional\n",
      " |          This argument is in the signature *solely* for NumPy\n",
      " |          compatibility reasons. Do not pass in anything except for\n",
      " |          the default value, as this argument is not used.\n",
      " |      \n",
      " |      Returns\n",
      " |      -------\n",
      " |      amin : coo_matrix or scalar\n",
      " |          Minimum of `a`. If `axis` is None, the result is a scalar value.\n",
      " |          If `axis` is given, the result is a sparse.coo_matrix of dimension\n",
      " |          ``a.ndim - 1``.\n",
      " |      \n",
      " |      See Also\n",
      " |      --------\n",
      " |      max : The maximum value of a sparse matrix along a given axis.\n",
      " |      np.matrix.min : NumPy's implementation of 'min' for matrices\n",
      "\n"
     ]
    }
   ],
   "source": [
    "help(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1x36909 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 28 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How is tweet stored?\n",
    "X[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on method nonzero in module scipy.sparse.base:\n",
      "\n",
      "nonzero() method of scipy.sparse.csr.csr_matrix instance\n",
      "    nonzero indices\n",
      "    \n",
      "    Returns a tuple of arrays (row,col) containing the indices\n",
      "    of the non-zero elements of the matrix.\n",
      "    \n",
      "    Examples\n",
      "    --------\n",
      "    >>> from scipy.sparse import csr_matrix\n",
      "    >>> A = csr_matrix([[1,2,0],[0,0,3],[4,0,5]])\n",
      "    >>> A.nonzero()\n",
      "    (array([0, 0, 1, 2, 2]), array([0, 1, 2, 0, 2]))\n",
      "\n"
     ]
    }
   ],
   "source": [
    "help(X[1].nonzero)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0], dtype=int32),\n",
       " array([ 4, 15, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,\n",
       "        45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55], dtype=int32))"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X[1].nonzero()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 4, 15, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,\n",
       "       45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55], dtype=int32)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# non-zero indices of terms used in tweet 1.\n",
    "X[1].nonzero()[1]  # col_ind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,\n",
       "        1.,  1.,  3.,  1.,  1.,  1.,  2.,  1.,  1.,  1.,  1.,  1.,  1.,\n",
       "        1.,  1.])"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# term counts for tweet 1.\n",
    "X[1].data  # \"val\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "d=&\n",
      "1.0\n"
     ]
    }
   ],
   "source": [
    "# What word does each term index correspond to?\n",
    "# Convert term->index dict into index->term dict\n",
    "index2term = {i: t for t, i in vocabulary.items()}\n",
    "print(index2term[15])\n",
    "print(X[1, 15])\n",
    "# So, the term \"for\" (index 29) appears in user 200's tweet two times"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "d=cut\n",
      "1.0\n"
     ]
    }
   ],
   "source": [
    "# d=and appears one time.\n",
    "print(index2term[46])\n",
    "print(X[1, 46])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How do CSR matrices access row values?\n",
    "\n",
    "Recall:\n",
    "\n",
    "CSR Matrix is an object with three attributes: \n",
    "- **val:** $\\{1,2,1,1\\}$  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; *list of all non-zero values*  \n",
    "- **col_ind:** $\\{0,3,0,1\\}$ &nbsp; *column index for each non-zero value* (e.g., first non-zero value (1) is in column 0) \n",
    "- **row_ptr (ind_ptr):** $\\{0,1,2\\}$ &nbsp;&nbsp;&nbsp; *index into **col_ind** where each row starts* (e.g., tweet3, term1 corresponds to col_ind[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([200, 300, 400])"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Recall: numpy array slices.\n",
    "import numpy as np\n",
    "a = np.array([0, 100, 200, 300, 400, 500])\n",
    "a[2:5]  # get elements at positions 2,3,4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tweet 1 starts at col_ind= 30\n",
      "tweet 2 starts at col_ind= 58\n",
      "so, the columns that are non-zero for tweet 1 are:\n",
      "[ 4 15 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52\n",
      " 53 54 55]\n",
      "and the data associated with those cells are:\n",
      "[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  3.  1.  1.\n",
      "  1.  2.  1.  1.  1.  1.  1.  1.  1.  1.]\n"
     ]
    }
   ],
   "source": [
    "print('tweet 1 starts at col_ind=', X.indptr[1])\n",
    "print('tweet 2 starts at col_ind=', X.indptr[2])\n",
    "print('so, the columns that are non-zero for tweet 1 are:')\n",
    "print(X.indices[X.indptr[1]:X.indptr[2]])\n",
    "print('and the data associated with those cells are:')\n",
    "print(X.data[X.indptr[1]:X.indptr[2]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tweet 0:\n",
      "   (0, 0)\t1.0\n",
      "  (0, 1)\t1.0\n",
      "  (0, 2)\t1.0\n",
      "  (0, 3)\t1.0\n",
      "  (0, 4)\t1.0\n",
      "  (0, 5)\t2.0\n",
      "  (0, 6)\t1.0\n",
      "  (0, 7)\t1.0\n",
      "  (0, 8)\t1.0\n",
      "  (0, 9)\t1.0\n",
      "  (0, 10)\t1.0\n",
      "  (0, 11)\t1.0\n",
      "  (0, 12)\t1.0\n",
      "  (0, 13)\t1.0\n",
      "  (0, 14)\t1.0\n",
      "  (0, 15)\t1.0\n",
      "  (0, 16)\t1.0\n",
      "  (0, 17)\t1.0\n",
      "  (0, 18)\t1.0\n",
      "  (0, 19)\t1.0\n",
      "  (0, 20)\t1.0\n",
      "  (0, 21)\t1.0\n",
      "  (0, 22)\t1.0\n",
      "  (0, 23)\t1.0\n",
      "  (0, 24)\t1.0\n",
      "  (0, 25)\t1.0\n",
      "  (0, 26)\t1.0\n",
      "  (0, 27)\t1.0\n",
      "  (0, 28)\t1.0\n",
      "  (0, 29)\t1.0 \n",
      "\n",
      "tweet 1:\n",
      "   (0, 4)\t1.0\n",
      "  (0, 15)\t1.0\n",
      "  (0, 30)\t1.0\n",
      "  (0, 31)\t1.0\n",
      "  (0, 32)\t1.0\n",
      "  (0, 33)\t1.0\n",
      "  (0, 34)\t1.0\n",
      "  (0, 35)\t1.0\n",
      "  (0, 36)\t1.0\n",
      "  (0, 37)\t1.0\n",
      "  (0, 38)\t1.0\n",
      "  (0, 39)\t1.0\n",
      "  (0, 40)\t1.0\n",
      "  (0, 41)\t1.0\n",
      "  (0, 42)\t1.0\n",
      "  (0, 43)\t3.0\n",
      "  (0, 44)\t1.0\n",
      "  (0, 45)\t1.0\n",
      "  (0, 46)\t1.0\n",
      "  (0, 47)\t2.0\n",
      "  (0, 48)\t1.0\n",
      "  (0, 49)\t1.0\n",
      "  (0, 50)\t1.0\n",
      "  (0, 51)\t1.0\n",
      "  (0, 52)\t1.0\n",
      "  (0, 53)\t1.0\n",
      "  (0, 54)\t1.0\n",
      "  (0, 55)\t1.0 \n",
      "\n",
      "tweet 2:\n",
      "   (0, 56)\t1.0\n",
      "  (0, 57)\t1.0\n",
      "  (0, 58)\t1.0\n",
      "  (0, 59)\t1.0\n",
      "  (0, 60)\t1.0\n",
      "  (0, 61)\t2.0\n",
      "  (0, 62)\t2.0\n",
      "  (0, 63)\t1.0\n",
      "  (0, 64)\t1.0\n",
      "  (0, 65)\t1.0\n",
      "  (0, 66)\t1.0\n",
      "  (0, 67)\t1.0\n",
      "  (0, 68)\t1.0\n",
      "  (0, 69)\t1.0\n",
      "  (0, 70)\t1.0\n",
      "  (0, 71)\t1.0\n"
     ]
    }
   ],
   "source": [
    "print('tweet 0:\\n', X[0], '\\n')\n",
    "print('tweet 1:\\n', X[1], '\\n')\n",
    "print('tweet 2:\\n', X[2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Efficient matrix vector product:**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "$$ z = X * \\beta $$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X * beta for tweet 1= 31.0\n",
      "which is the same as the sum 31.0, since beta=[1...1]\n"
     ]
    }
   ],
   "source": [
    "# Compute z = X * \\beta, where X is a CSR matrix.\n",
    "import numpy as np\n",
    "beta = np.ones(len(vocabulary))  # assume Beta = vector of 1s\n",
    "z = np.zeros(len(tweets))\n",
    "for i in range(len(tweets)):  # for each row.\n",
    "    for j in range(X.indptr[i], X.indptr[i+1]): # for each col.\n",
    "        colidx = X.indices[j]\n",
    "        z[i] += beta[colidx] * X.data[j]\n",
    "print('X * beta for tweet 1=', z[1])\n",
    "print('which is the same as the sum %.1f, since beta=[1...1]' %\n",
    "      X[1].sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**4.) Create a list of gender labels.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gender labels: Counter({0: 2861, 1: 2139})\n"
     ]
    }
   ],
   "source": [
    "# y is a 1d numpy array of gender labels.\n",
    "# Let 1=Female, 0=Male.\n",
    "import numpy as np\n",
    "\n",
    "def get_gender(tweet, male_names, female_names):\n",
    "    name = get_first_name(tweet)\n",
    "    if name in female_names:\n",
    "        return 1\n",
    "    elif name in male_names:\n",
    "        return 0\n",
    "    else:\n",
    "        return -1\n",
    "    \n",
    "y = np.array([get_gender(t, male_names, female_names) for t in tweets])\n",
    "print('gender labels:', Counter(y))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**5.) Fit a Logistic Regression classifier to predict gender from profile/tweet.**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#Sports Guy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Do 5-fold cross-validation\n",
    "# http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix\n",
    "\n",
    "def do_cross_val(X, y, nfolds):\n",
    "    \"\"\" Compute average cross-validation acccuracy.\"\"\"\n",
    "    cv = KFold(n_splits=nfolds, random_state=42, shuffle=True)\n",
    "    accuracies = []\n",
    "    for train_idx, test_idx in cv.split(X):\n",
    "        clf = LogisticRegression()\n",
    "        clf.fit(X[train_idx], y[train_idx])\n",
    "        predicted = clf.predict(X[test_idx])\n",
    "        acc = accuracy_score(y[test_idx], predicted)\n",
    "        accuracies.append(acc)\n",
    "    avg = np.mean(accuracies)\n",
    "    print(np.std(accuracies))\n",
    "    print(accuracies)\n",
    "    return avg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0155897402159\n",
      "[0.69299999999999995, 0.69099999999999995, 0.70199999999999996, 0.72299999999999998, 0.72899999999999998]\n",
      "avg accuracy 0.7076\n"
     ]
    }
   ],
   "source": [
    "print('avg accuracy', do_cross_val(X, y, 5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CSR TIME\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.5853505079867318"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fitting model with CSR much, much faster than with LIL.\n",
    "from timeit import timeit\n",
    "print('CSR TIME')\n",
    "timeit(\"do_cross_val(X.tocsr(), y, 2)\", number=5,\n",
    "       setup=\"from __main__ import do_cross_val, X, y\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LIL TIME\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "214.12728118896484"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print('LIL TIME')\n",
    "timeit(\"do_cross_val(X.tolil(), y, 2)\", number=5,\n",
    "       setup=\"from __main__ import do_cross_val, X, y\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(See more about vectorization of arithmetic operations: \n",
    "https://en.wikipedia.org/wiki/Automatic_vectorization )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# How does tokenization affect accuracy?\n",
    "# Collapse urls and mentions; ignore description prefix.\n",
    "def run_all(tweets, use_descr=True, lowercase=True,\n",
    "            keep_punctuation=True, descr_prefix=None,\n",
    "            collapse_urls=True, collapse_mentions=True):\n",
    "    \n",
    "    tokens_list = [tweet2tokens(t, use_descr, lowercase,\n",
    "                            keep_punctuation, descr_prefix,\n",
    "                            collapse_urls, collapse_mentions)\n",
    "                  for t in tweets]\n",
    "    vocabulary = make_vocabulary(tokens_list)\n",
    "    X = make_feature_matrix(tokens_list, vocabulary)\n",
    "    acc = do_cross_val(X, y, 5)\n",
    "    print('acc=', acc)\n",
    "    return acc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "use_descr=True\tlower=True\tpunct=True\tprefix=d=\turl=True\tmention=True\n",
      "29796 unique terms in vocabulary\n",
      "acc= 0.7114\n",
      "use_descr=True\tlower=True\tpunct=True\tprefix=d=\turl=True\tmention=False\n",
      "33113 unique terms in vocabulary\n",
      "acc= 0.7148\n",
      "use_descr=True\tlower=True\tpunct=True\tprefix=d=\turl=False\tmention=True\n",
      "32465 unique terms in vocabulary\n",
      "acc= 0.7114\n",
      "use_descr=True\tlower=True\tpunct=True\tprefix=d=\turl=False\tmention=False\n",
      "35782 unique terms in vocabulary\n",
      "acc= 0.7156\n",
      "use_descr=True\tlower=True\tpunct=True\tprefix=\turl=True\tmention=True\n",
      "26212 unique terms in vocabulary\n",
      "acc= 0.7072\n",
      "use_descr=True\tlower=True\tpunct=True\tprefix=\turl=True\tmention=False\n",
      "29479 unique terms in vocabulary\n",
      "acc= 0.7116\n",
      "use_descr=True\tlower=True\tpunct=True\tprefix=\turl=False\tmention=True\n",
      "28882 unique terms in vocabulary\n",
      "acc= 0.7076\n",
      "use_descr=True\tlower=True\tpunct=True\tprefix=\turl=False\tmention=False\n",
      "32149 unique terms in vocabulary\n",
      "acc= 0.711\n",
      "use_descr=True\tlower=True\tpunct=False\tprefix=d=\turl=True\tmention=True\n",
      "20542 unique terms in vocabulary\n",
      "acc= 0.711\n",
      "use_descr=True\tlower=True\tpunct=False\tprefix=d=\turl=True\tmention=False\n",
      "23768 unique terms in vocabulary\n",
      "acc= 0.7142\n",
      "use_descr=True\tlower=True\tpunct=False\tprefix=d=\turl=False\tmention=True\n",
      "23264 unique terms in vocabulary\n",
      "acc= 0.709\n",
      "use_descr=True\tlower=True\tpunct=False\tprefix=d=\turl=False\tmention=False\n",
      "26488 unique terms in vocabulary\n",
      "acc= 0.7094\n",
      "use_descr=True\tlower=True\tpunct=False\tprefix=\turl=True\tmention=True\n",
      "16861 unique terms in vocabulary\n",
      "acc= 0.705\n",
      "use_descr=True\tlower=True\tpunct=False\tprefix=\turl=True\tmention=False\n",
      "20006 unique terms in vocabulary\n",
      "acc= 0.7074\n",
      "use_descr=True\tlower=True\tpunct=False\tprefix=\turl=False\tmention=True\n",
      "19575 unique terms in vocabulary\n",
      "acc= 0.7022\n",
      "use_descr=True\tlower=True\tpunct=False\tprefix=\turl=False\tmention=False\n",
      "22716 unique terms in vocabulary\n",
      "acc= 0.7076\n",
      "use_descr=True\tlower=False\tpunct=True\tprefix=d=\turl=True\tmention=True\n",
      "33755 unique terms in vocabulary\n",
      "acc= 0.706\n",
      "use_descr=True\tlower=False\tpunct=True\tprefix=d=\turl=True\tmention=False\n",
      "37078 unique terms in vocabulary\n",
      "acc= 0.7086\n",
      "use_descr=True\tlower=False\tpunct=True\tprefix=d=\turl=False\tmention=True\n",
      "36424 unique terms in vocabulary\n",
      "acc= 0.7042\n",
      "use_descr=True\tlower=False\tpunct=True\tprefix=d=\turl=False\tmention=False\n",
      "39747 unique terms in vocabulary\n",
      "acc= 0.7074\n",
      "use_descr=True\tlower=False\tpunct=True\tprefix=\turl=True\tmention=True\n",
      "30045 unique terms in vocabulary\n",
      "acc= 0.7042\n",
      "use_descr=True\tlower=False\tpunct=True\tprefix=\turl=True\tmention=False\n",
      "33335 unique terms in vocabulary\n",
      "acc= 0.7064\n",
      "use_descr=True\tlower=False\tpunct=True\tprefix=\turl=False\tmention=True\n",
      "32715 unique terms in vocabulary\n",
      "acc= 0.7056\n",
      "use_descr=True\tlower=False\tpunct=True\tprefix=\turl=False\tmention=False\n",
      "36005 unique terms in vocabulary\n",
      "acc= 0.7094\n",
      "use_descr=True\tlower=False\tpunct=False\tprefix=d=\turl=True\tmention=True\n",
      "24880 unique terms in vocabulary\n",
      "acc= 0.7178\n",
      "use_descr=True\tlower=False\tpunct=False\tprefix=d=\turl=True\tmention=False\n",
      "28163 unique terms in vocabulary\n",
      "acc= 0.7198\n",
      "use_descr=True\tlower=False\tpunct=False\tprefix=d=\turl=False\tmention=True\n",
      "27638 unique terms in vocabulary\n",
      "acc= 0.7166\n",
      "use_descr=True\tlower=False\tpunct=False\tprefix=d=\turl=False\tmention=False\n",
      "30918 unique terms in vocabulary\n",
      "acc= 0.72\n",
      "use_descr=True\tlower=False\tpunct=False\tprefix=\turl=True\tmention=True\n",
      "20916 unique terms in vocabulary\n",
      "acc= 0.711\n",
      "use_descr=True\tlower=False\tpunct=False\tprefix=\turl=True\tmention=False\n",
      "24134 unique terms in vocabulary\n",
      "acc= 0.7144\n",
      "use_descr=True\tlower=False\tpunct=False\tprefix=\turl=False\tmention=True\n",
      "23663 unique terms in vocabulary\n",
      "acc= 0.7108\n",
      "use_descr=True\tlower=False\tpunct=False\tprefix=\turl=False\tmention=False\n",
      "26877 unique terms in vocabulary\n",
      "acc= 0.7166\n",
      "use_descr=False\tlower=True\tpunct=True\tprefix=d=\turl=True\tmention=True\n",
      "14313 unique terms in vocabulary\n",
      "acc= 0.5978\n",
      "use_descr=False\tlower=True\tpunct=True\tprefix=d=\turl=True\tmention=False\n",
      "16688 unique terms in vocabulary\n",
      "acc= 0.6106\n",
      "use_descr=False\tlower=True\tpunct=True\tprefix=d=\turl=False\tmention=True\n",
      "16765 unique terms in vocabulary\n",
      "acc= 0.6044\n",
      "use_descr=False\tlower=True\tpunct=True\tprefix=d=\turl=False\tmention=False\n",
      "19140 unique terms in vocabulary\n",
      "acc= 0.6132\n",
      "use_descr=False\tlower=True\tpunct=True\tprefix=\turl=True\tmention=True\n",
      "14313 unique terms in vocabulary\n",
      "acc= 0.5978\n",
      "use_descr=False\tlower=True\tpunct=True\tprefix=\turl=True\tmention=False\n",
      "16688 unique terms in vocabulary\n",
      "acc= 0.6106\n",
      "use_descr=False\tlower=True\tpunct=True\tprefix=\turl=False\tmention=True\n",
      "16765 unique terms in vocabulary\n",
      "acc= 0.6044\n",
      "use_descr=False\tlower=True\tpunct=True\tprefix=\turl=False\tmention=False\n",
      "19140 unique terms in vocabulary\n",
      "acc= 0.6132\n",
      "use_descr=False\tlower=True\tpunct=False\tprefix=d=\turl=True\tmention=True\n",
      "10007 unique terms in vocabulary\n",
      "acc= 0.5962\n",
      "use_descr=False\tlower=True\tpunct=False\tprefix=d=\turl=True\tmention=False\n",
      "12322 unique terms in vocabulary\n",
      "acc= 0.61\n",
      "use_descr=False\tlower=True\tpunct=False\tprefix=d=\turl=False\tmention=True\n",
      "12464 unique terms in vocabulary\n",
      "acc= 0.6076\n",
      "use_descr=False\tlower=True\tpunct=False\tprefix=d=\turl=False\tmention=False\n",
      "14779 unique terms in vocabulary\n",
      "acc= 0.6116\n",
      "use_descr=False\tlower=True\tpunct=False\tprefix=\turl=True\tmention=True\n",
      "10007 unique terms in vocabulary\n",
      "acc= 0.5962\n",
      "use_descr=False\tlower=True\tpunct=False\tprefix=\turl=True\tmention=False\n",
      "12322 unique terms in vocabulary\n",
      "acc= 0.61\n",
      "use_descr=False\tlower=True\tpunct=False\tprefix=\turl=False\tmention=True\n",
      "12464 unique terms in vocabulary\n",
      "acc= 0.6076\n",
      "use_descr=False\tlower=True\tpunct=False\tprefix=\turl=False\tmention=False\n",
      "14779 unique terms in vocabulary\n",
      "acc= 0.6116\n",
      "use_descr=False\tlower=False\tpunct=True\tprefix=d=\turl=True\tmention=True\n",
      "16153 unique terms in vocabulary\n",
      "acc= 0.5906\n",
      "use_descr=False\tlower=False\tpunct=True\tprefix=d=\turl=True\tmention=False\n",
      "18532 unique terms in vocabulary\n",
      "acc= 0.6102\n",
      "use_descr=False\tlower=False\tpunct=True\tprefix=d=\turl=False\tmention=True\n",
      "18605 unique terms in vocabulary\n",
      "acc= 0.6028\n",
      "use_descr=False\tlower=False\tpunct=True\tprefix=d=\turl=False\tmention=False\n",
      "20984 unique terms in vocabulary\n",
      "acc= 0.6086\n",
      "use_descr=False\tlower=False\tpunct=True\tprefix=\turl=True\tmention=True\n",
      "16153 unique terms in vocabulary\n",
      "acc= 0.5906\n",
      "use_descr=False\tlower=False\tpunct=True\tprefix=\turl=True\tmention=False\n",
      "18532 unique terms in vocabulary\n",
      "acc= 0.6102\n",
      "use_descr=False\tlower=False\tpunct=True\tprefix=\turl=False\tmention=True\n",
      "18605 unique terms in vocabulary\n",
      "acc= 0.6028\n",
      "use_descr=False\tlower=False\tpunct=True\tprefix=\turl=False\tmention=False\n",
      "20984 unique terms in vocabulary\n",
      "acc= 0.6086\n",
      "use_descr=False\tlower=False\tpunct=False\tprefix=d=\turl=True\tmention=True\n",
      "12036 unique terms in vocabulary\n",
      "acc= 0.5844\n",
      "use_descr=False\tlower=False\tpunct=False\tprefix=d=\turl=True\tmention=False\n",
      "14368 unique terms in vocabulary\n",
      "acc= 0.5956\n",
      "use_descr=False\tlower=False\tpunct=False\tprefix=d=\turl=False\tmention=True\n",
      "14493 unique terms in vocabulary\n",
      "acc= 0.5926\n",
      "use_descr=False\tlower=False\tpunct=False\tprefix=d=\turl=False\tmention=False\n",
      "16825 unique terms in vocabulary\n",
      "acc= 0.5962\n",
      "use_descr=False\tlower=False\tpunct=False\tprefix=\turl=True\tmention=True\n",
      "12036 unique terms in vocabulary\n",
      "acc= 0.5844\n",
      "use_descr=False\tlower=False\tpunct=False\tprefix=\turl=True\tmention=False\n",
      "14368 unique terms in vocabulary\n",
      "acc= 0.5956\n",
      "use_descr=False\tlower=False\tpunct=False\tprefix=\turl=False\tmention=True\n",
      "14493 unique terms in vocabulary\n",
      "acc= 0.5926\n",
      "use_descr=False\tlower=False\tpunct=False\tprefix=\turl=False\tmention=False\n",
      "16825 unique terms in vocabulary\n",
      "acc= 0.5962\n"
     ]
    }
   ],
   "source": [
    "argnames = ['use_descr', 'lower', 'punct', 'prefix', 'url', 'mention']\n",
    "option_iter = product(use_descr_opts, lowercase_opts,\n",
    "                       keep_punctuation_opts,\n",
    "                       descr_prefix_opts, url_opts,\n",
    "                       mention_opts)\n",
    "results = []\n",
    "for options in option_iter:\n",
    "    option_str = '\\t'.join('%s=%s' % (name, opt) for name, opt\n",
    "                           in zip(argnames, options))\n",
    "    print(option_str)\n",
    "    acc = run_all(tweets, *options)\n",
    "    results.append((acc, options))\n",
    "    print"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7200 use_descr=True  lower=False  punct=False  prefix=d=  url=False  mention=False\n",
      "0.7198 use_descr=True  lower=False  punct=False  prefix=d=  url=True  mention=False\n",
      "0.7178 use_descr=True  lower=False  punct=False  prefix=d=  url=True  mention=True\n",
      "0.7166 use_descr=True  lower=False  punct=False  prefix=d=  url=False  mention=True\n",
      "0.7166 use_descr=True  lower=False  punct=False  prefix=  url=False  mention=False\n",
      "0.7156 use_descr=True  lower=True  punct=True  prefix=d=  url=False  mention=False\n",
      "0.7148 use_descr=True  lower=True  punct=True  prefix=d=  url=True  mention=False\n",
      "0.7144 use_descr=True  lower=False  punct=False  prefix=  url=True  mention=False\n",
      "0.7142 use_descr=True  lower=True  punct=False  prefix=d=  url=True  mention=False\n",
      "0.7116 use_descr=True  lower=True  punct=True  prefix=  url=True  mention=False\n",
      "0.7114 use_descr=True  lower=True  punct=True  prefix=d=  url=True  mention=True\n",
      "0.7114 use_descr=True  lower=True  punct=True  prefix=d=  url=False  mention=True\n",
      "0.7110 use_descr=True  lower=True  punct=False  prefix=d=  url=True  mention=True\n",
      "0.7110 use_descr=True  lower=True  punct=True  prefix=  url=False  mention=False\n",
      "0.7110 use_descr=True  lower=False  punct=False  prefix=  url=True  mention=True\n",
      "0.7108 use_descr=True  lower=False  punct=False  prefix=  url=False  mention=True\n",
      "0.7094 use_descr=True  lower=True  punct=False  prefix=d=  url=False  mention=False\n",
      "0.7094 use_descr=True  lower=False  punct=True  prefix=  url=False  mention=False\n",
      "0.7090 use_descr=True  lower=True  punct=False  prefix=d=  url=False  mention=True\n",
      "0.7086 use_descr=True  lower=False  punct=True  prefix=d=  url=True  mention=False\n",
      "0.7076 use_descr=True  lower=True  punct=True  prefix=  url=False  mention=True\n",
      "0.7076 use_descr=True  lower=True  punct=False  prefix=  url=False  mention=False\n",
      "0.7074 use_descr=True  lower=True  punct=False  prefix=  url=True  mention=False\n",
      "0.7074 use_descr=True  lower=False  punct=True  prefix=d=  url=False  mention=False\n",
      "0.7072 use_descr=True  lower=True  punct=True  prefix=  url=True  mention=True\n",
      "0.7064 use_descr=True  lower=False  punct=True  prefix=  url=True  mention=False\n",
      "0.7060 use_descr=True  lower=False  punct=True  prefix=d=  url=True  mention=True\n",
      "0.7056 use_descr=True  lower=False  punct=True  prefix=  url=False  mention=True\n",
      "0.7050 use_descr=True  lower=True  punct=False  prefix=  url=True  mention=True\n",
      "0.7042 use_descr=True  lower=False  punct=True  prefix=d=  url=False  mention=True\n",
      "0.7042 use_descr=True  lower=False  punct=True  prefix=  url=True  mention=True\n",
      "0.7022 use_descr=True  lower=True  punct=False  prefix=  url=False  mention=True\n",
      "0.6132 use_descr=False  lower=True  punct=True  prefix=d=  url=False  mention=False\n",
      "0.6132 use_descr=False  lower=True  punct=True  prefix=  url=False  mention=False\n",
      "0.6116 use_descr=False  lower=True  punct=False  prefix=d=  url=False  mention=False\n",
      "0.6116 use_descr=False  lower=True  punct=False  prefix=  url=False  mention=False\n",
      "0.6106 use_descr=False  lower=True  punct=True  prefix=d=  url=True  mention=False\n",
      "0.6106 use_descr=False  lower=True  punct=True  prefix=  url=True  mention=False\n",
      "0.6102 use_descr=False  lower=False  punct=True  prefix=d=  url=True  mention=False\n",
      "0.6102 use_descr=False  lower=False  punct=True  prefix=  url=True  mention=False\n",
      "0.6100 use_descr=False  lower=True  punct=False  prefix=d=  url=True  mention=False\n",
      "0.6100 use_descr=False  lower=True  punct=False  prefix=  url=True  mention=False\n",
      "0.6086 use_descr=False  lower=False  punct=True  prefix=d=  url=False  mention=False\n",
      "0.6086 use_descr=False  lower=False  punct=True  prefix=  url=False  mention=False\n",
      "0.6076 use_descr=False  lower=True  punct=False  prefix=d=  url=False  mention=True\n",
      "0.6076 use_descr=False  lower=True  punct=False  prefix=  url=False  mention=True\n",
      "0.6044 use_descr=False  lower=True  punct=True  prefix=d=  url=False  mention=True\n",
      "0.6044 use_descr=False  lower=True  punct=True  prefix=  url=False  mention=True\n",
      "0.6028 use_descr=False  lower=False  punct=True  prefix=d=  url=False  mention=True\n",
      "0.6028 use_descr=False  lower=False  punct=True  prefix=  url=False  mention=True\n",
      "0.5978 use_descr=False  lower=True  punct=True  prefix=d=  url=True  mention=True\n",
      "0.5978 use_descr=False  lower=True  punct=True  prefix=  url=True  mention=True\n",
      "0.5962 use_descr=False  lower=True  punct=False  prefix=d=  url=True  mention=True\n",
      "0.5962 use_descr=False  lower=True  punct=False  prefix=  url=True  mention=True\n",
      "0.5962 use_descr=False  lower=False  punct=False  prefix=d=  url=False  mention=False\n",
      "0.5962 use_descr=False  lower=False  punct=False  prefix=  url=False  mention=False\n",
      "0.5956 use_descr=False  lower=False  punct=False  prefix=d=  url=True  mention=False\n",
      "0.5956 use_descr=False  lower=False  punct=False  prefix=  url=True  mention=False\n",
      "0.5926 use_descr=False  lower=False  punct=False  prefix=d=  url=False  mention=True\n",
      "0.5926 use_descr=False  lower=False  punct=False  prefix=  url=False  mention=True\n",
      "0.5906 use_descr=False  lower=False  punct=True  prefix=d=  url=True  mention=True\n",
      "0.5906 use_descr=False  lower=False  punct=True  prefix=  url=True  mention=True\n",
      "0.5844 use_descr=False  lower=False  punct=False  prefix=d=  url=True  mention=True\n",
      "0.5844 use_descr=False  lower=False  punct=False  prefix=  url=True  mention=True\n"
     ]
    }
   ],
   "source": [
    "for r in sorted(results, reverse=True):\n",
    "    print('%.4f' % r[0], '  '.join('%s=%s' % (name, opt) for name, opt in zip(argnames, r[1])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "idx2word = dict((v,k) for k,v in vocabulary.items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "top weighted terms for female class:\n",
      "('d=mom', 1.8794005883225817)\n",
      "('d=mother', 1.7879217848719102)\n",
      "('d=mom,', 1.6898641235543843)\n",
      "('d=✨', 1.4921097973960653)\n",
      "('d=wife,', 1.3136066681226575)\n",
      "('d=girl', 1.2785485084279851)\n",
      "('makes', 1.1343331260117135)\n",
      "('🙄', 1.127231631083933)\n",
      "('d=she', 1.1163831736733825)\n",
      "('d=httr.', 1.1047908737851388)\n",
      "('💕', 1.0950857840221102)\n",
      "('d=has', 1.0660393539063231)\n",
      "('loving', 1.0659908199830699)\n",
      "('d=alumna.', 1.0263939807173448)\n",
      "('d=mother,', 1.0222786681547364)\n",
      "('d=mama', 0.99262669785202895)\n",
      "('d=mom.', 0.95003253953833566)\n",
      "('d=❤️', 0.94194513761458232)\n",
      "('d=woman', 0.89938204611590611)\n",
      "('d=cat', 0.89301696818502463)\n",
      "\n",
      "top weighted terms for male class:\n",
      "('d=father', -1.5209778787652677)\n",
      "('d=husband,', -1.2633912999304031)\n",
      "('d=father,', -1.224224446924508)\n",
      "('d=dad', -1.1694784834867893)\n",
      "('d=fan.', -1.1438101776635889)\n",
      "('d=former', -1.1106439116218867)\n",
      "('d=when', -1.088660992801703)\n",
      "('d=musician', -0.97744036684698099)\n",
      "('d=twitter', -0.97378594433553467)\n",
      "('god', -0.97276940734276773)\n",
      "('d=dad,', -0.93343031634893492)\n",
      "('coming', -0.89861822992156226)\n",
      "('it!', -0.89310050869682611)\n",
      "('d=tech', -0.88925768127771254)\n",
      "('d=contributor', -0.87922009935841272)\n",
      "('d=#trurebels', -0.8779892573977276)\n",
      "('dude', -0.87183702647870243)\n",
      "('days', -0.86162692417795617)\n",
      "('d=southern', -0.86154110918134996)\n",
      "('d=guy', -0.85732007984228786)\n"
     ]
    }
   ],
   "source": [
    "# Fit model on all data and print top coef.\n",
    "model = LogisticRegression()\n",
    "model.fit(X,y)\n",
    "# Get the learned coefficients for the Positive class.\n",
    "coef = model.coef_[0]\n",
    "# Sort them in descending order.\n",
    "top_coef_ind = np.argsort(coef)[::-1][:20]\n",
    "# Get the names of those features.\n",
    "top_coef_terms = [idx2word[i] for i in top_coef_ind]\n",
    "# Get the weights of those features\n",
    "top_coef = coef[top_coef_ind]\n",
    "# Print the top 10.\n",
    "print('top weighted terms for female class:')\n",
    "print('\\n'.join(str(x) for x in zip(top_coef_terms, top_coef)))\n",
    "\n",
    "# repeat for males\n",
    "top_coef_ind = np.argsort(coef)[:20]\n",
    "top_coef_terms = [idx2word[i] for i in top_coef_ind]\n",
    "top_coef = coef[top_coef_ind]\n",
    "print('\\ntop weighted terms for male class:')\n",
    "print('\\n'.join(str(x) for x in zip(top_coef_terms, top_coef)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD8CAYAAAB+UHOxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHYNJREFUeJzt3XtwXGeZ5/HvY7W6dZctS7YVX2LnQm6bGxEmAZYKMzC5\nLEtgJuyEqV0CQ9ZbDNTu7C7UJktVhqVqt2B2ZxZmoMh6IJOEpUggkI1nMZMJlyVkIBclOI5jk1hx\nnESyZUmWdVe3Wt3P/nGOTFuR1Iq61Wqd/n2quvrcdN5HR+r36fO+7znH3B0REak8a1Y6ABERWRlK\nACIiFUoJQESkQikBiIhUKCUAEZEKpQQgIlKhlABERCqUEoCISIVSAhARqVCxQndgZluB+4CNgAO7\n3f0rs7Yx4CvAjcAE8DF3fzbfvltbW3379u2FhigiUjGeeeaZAXdvW8y2BScAYBr4j+7+rJk1As+Y\n2aPufjBnmxuA88PX24Gvh+8L2r59O52dnUUIUUSkMpjZq4vdtuAmIHc/PvNt3t1HgUPA5lmb3QTc\n54EngLVm1l5o2SIisnRF7QMws+3AlcCTs1ZtBl7Pme/mjUliZh+7zKzTzDr7+/uLGZ6IiOQoWgIw\nswbg+8CfuvvIUvfj7rvdvcPdO9raFtWMJSIiS1CUBGBm1QSV/7fd/QdzbNIDbM2Z3xIuExGRFVJw\nAghH+HwTOOTufznPZnuAj1rgamDY3Y8XWraIiCxdMUYBvRP4V8DzZrYvXPafgW0A7n4XsJdgCGgX\nwTDQjxehXBERKUDBCcDdHwcszzYOfKrQskREpHh0JbCISBl59OAJ7vr5yyUpSwlARKSM/PQ3fXzz\n8VdKUpYSgIhIhVICEBEpK16ykpQARETKzIKjaopICUBEpEIpAYiIVCglABGRMuKl6wJQAhARKTdW\nok4AJQARkQqlBCAiUkbUBCQiUsGsRANBlQBERCqUEoCISIVSAhARKSOuW0GIiFQuDQMVEZFlpQQg\nIlJGNAxURKSC6W6gIiKyrIqSAMzsbjPrM7MD86y/1syGzWxf+LqzGOWKiERNCVuAiBVpP/cAXwXu\nW2CbX7j7+4tUnohIZFmJhgEV5QzA3R8DBouxLxERKY1S9gFcY2bPmdmPzOyS+TYys11m1mlmnf39\n/SUMT0SkspQqATwLnO3ulwN/Dfyf+TZ0993u3uHuHW1tbSUKT0SkPERuGKi7j7j7WDi9F6g2s9ZS\nlC0iInMrSQIws00W9mqY2c6w3JOlKFtEROZWlFFAZvYd4Fqg1cy6gT8DqgHc/S7gZuCTZjYNTAK3\nuJfyREdEZHUo5c3gipIA3P0jedZ/lWCYqIiI5KGbwYmIyLJSAhARKSdRGwUkIiLlRwlARKTMqA9A\nRESWlRKAiEgZKeX4eCUAEZEyYyV6JIwSgIhIhVICEBEpI6W8SYISgIhImdEoIBERWVZKACIiFUoJ\nQESkjGgYqIhIBStRF4ASgIhIpVICEBEpI5F7JrCIiCxO1p01a3QlsIhIxcm6s6ZEFwIoAYiIlJFM\n1qlSAhARqTyZLKurCcjM7jazPjM7MM96M7O/MrMuM9tvZm8tRrkiIlHj7pSo/i/aGcA9wPULrL8B\nOD987QK+XqRyRUQiJeNO1Wo6A3D3x4DBBTa5CbjPA08Aa82svRhli4hESSYbvU7gzcDrOfPd4bI3\nMLNdZtZpZp39/f0lCU5EpFy4s7rOAIrJ3Xe7e4e7d7S1ta10OCIiJRWcAZSmrFIlgB5ga878lnCZ\niIjkyETwOoA9wEfD0UBXA8PufrxEZYuIrBpewk7gWDF2YmbfAa4FWs2sG/gzoBrA3e8C9gI3Al3A\nBPDxYpQrIhI1mayTiK2iBODuH8mz3oFPFaMsEZEoy/gquxBMRESKI5t1qiLWCSwiIouQXW0XgomI\nSHFkso5FbBSQiIgsQtZ1N1ARkYqUreQrgUVEKlk265ToBEAJQESknKSmsyRiVSUpSwlARKSMTKYz\n1MWVAEREKs7kVIZaJQARkcri7kymM9RUKwGIiFSUZDoLQK0SgIhIZZlMZwCorS5N1awEICJSJoYn\n0wA011WXpDwlABGRMjE4PgXAurp4ScpTAhARKRNDE0oAIiIVaeYMoKVeCUBEpKKcmjkDUAIQEaks\ng+NpqquMel0IJiJSWU6OpVhXF9fzAEREKs2RgXG2t9aXrLyiJAAzu97MXjSzLjO7fY71HzOzfjPb\nF75uK0a5IiJR4e4cPjHK+RsaSlZmrNAdmFkV8DXgfUA38LSZ7XH3g7M2fcDdP11oeSIiUdQ/mmIk\nOV3SBFCMM4CdQJe7H3H3KeB+4KYi7FdEpGJ09Y0BcN6GxpKVWYwEsBl4PWe+O1w22x+Y2X4ze9DM\nts63MzPbZWadZtbZ399fhPBERMrfywPjAJy3ys4AFuPvgO3ufhnwKHDvfBu6+25373D3jra2thKF\nJyKysl7qHaUhEWNjU6JkZRYjAfQAud/ot4TLTnP3k+6eCme/AVxVhHJFRCIhm3Ue7xrgsi3NJRsC\nCsVJAE8D55vZDjOLA7cAe3I3MLP2nNkPAIeKUK6ISCT8/HA/rwyMc/NVW0pabsGjgNx92sw+DTwC\nVAF3u/sLZvYFoNPd9wD/1sw+AEwDg8DHCi1XRCQq7n78FdoaE7z/srNKWm7BCQDA3fcCe2ctuzNn\n+g7gjmKUJSISJb96+SS/ODzAHTdcSDxW2mtzdSWwiMgKGZ5Mc+fDB2hvruHWd2wveflFOQMQEZE3\nZ2JqmtvufZrDfWP87cfeVrIHwefSGYCISIkl0xk+cU8nz7x6iq/+0ZW858INKxKHEoCISAmNJNP8\n6/s6eeKVk/zlv7ii5B2/udQEJCJSIv/YNcBnvvccfaMpvvQHl/HBK+e6aULpKAGIiCwjd2d/9zDf\nfvJVvvdMN9vX1/PArqvp2N6y0qEpAYiILJdfv3aKL/zfg/z6tSFqqtfwL99+NnfceCF18fKoessj\nChGRiMhmnaeODvK1n3Xxi8MDbGhM8Pl/fjEfvHIza+tK86zfxVICEBEpUGo6w69fG+KXL5/k+890\n0zM0SXNtNZ+97gJufcd2GhLlWdWWZ1QiImXM3Tl6coJHD/by5JFBnnxlkLHUNGsMdu5o4bPXXcDv\nXLSBpprqlQ51QUoAIiILmM5k+U3vKAePjXDwePD6zfERRpLTAOxorecDV5zFtW9p4+071tNcV96V\nfi4lABGR0ORUhpf7xzh0fIQXjo1w6PgIz/cMMzGVAaC2uooL2xt5/+VncVF7E++5oI0t6+pWOOql\nUwIQkYqRyTr9oymOD09yYiTJ0ZMTHOkf4+jABK8NTnBiNIl7sO1MZX/zVVu46ux1XLq5mbPX11O1\npnT3619uSgAiEgmp6Qx9Iyl6R5IcH07SOzwZvifpHQne+0ZTZLJ+xs+1NiTY0VrHO85bz9kt9Zy3\noYELNjWyozValf1clABEpKy5OxNTGXpHkpwYDiv3kSTHhyfpHU7ROzJJ73CSgbGpN/xsXbyK9uYa\n2ptreed5rWxqqmFTcw3tzcH75rW1ZTc0s5SUAESk5JLpDOOpaSamMgxPphmeTHNyfIqB0RR9oyn6\nRpL0j6V49eQEJ0aSpKazb9jH2rpqNjUFlfmlm5vZ1FR7umJvb65hY3MNjYlYSR+xuNooAYhIwaYz\nWYYm0/QOBxX3ybEpBsZSHB+a5MRIioGxFEOTaUYm04wk0yTTb6zQZ1RXGRsaa2htiHPZlmbamzey\nviHBxqYEm5pq2dRcw6amGmrjpb99ctQoAYjIG+Q2u7w2OMHxoSSnJqY4OTbFqYkpBseD19DkFEMT\naUbDIZGzNdbE2NhUQ1tDgvM3NNBcW01jTYy1dXEaEjFq41U011aztraadfVx2hoSNNdWsybibe/l\nQglApMJMZ7L0j6XoHU5yYuS3beonwve+kaAZZiz1xkq9Pl7Fuvo46+rirG+Ic25bPWvrgvm1ddVs\nbErQFn57b6mP01jmF0JVOiUAkQgZT02fHvEyM/rlxMiZI2EGxlLMGghDvGoNG5oSbGqq4aKzmnh3\nQ4JNzTVsaEywtaWOzWtraamPr8hTq2T5FCUBmNn1wFeAKuAb7v7FWesTwH3AVcBJ4A/d/Wgxyhap\nRP2jKV4bnOCVgXGe7x5if88wR/rHGZ5Mv2HbppoY7c21bGyu4cJNjWxqCjpIZ0bEbGqqoaU+rs7S\nClRwAjCzKuBrwPuAbuBpM9vj7gdzNvsEcMrdzzOzW4AvAX9YaNkiUTc1nT19ZWrwGuU3vSNnDHms\nj1dxyeZm3n9ZO1vW1QUjYHIqd3WWynyKcQawE+hy9yMAZnY/cBOQmwBuAj4fTj8IfNXMzN1nnYiK\nVLbpTJZXByf4x64BfvR8L8+8doqpcAhkPLaGCzY2cu0FG7hwUyPntjWwtaWWHa0Nkb9gSZZHMRLA\nZuD1nPlu4O3zbePu02Y2DKwHBmbvzMx2AbsAtm3bVoTwRFZeJuucHE/RPxoMkewbTXFsaJLB8Sn6\nRpMMjE7RMzRJ70jy9JWq57TV89Grz+bSLc1c3N7EjtZ6YlV6jLcUT9l1Arv7bmA3QEdHh84QpGxk\nss7IZJrBiSn6R1OMJacZS00zkkwzPJE+fUFT7mssNc1oMthmrvPdxpoYbY0JWusT7NzRwua1tWxb\nX0fH2es4p62h9L+kVJRiJIAeYGvO/JZw2VzbdJtZDGgm6AwWWTbuTjKdZSw1zcTU9OnKeGIquAJ1\n5krUialMUJFPppmYyjA5lWEinWEsGYxvD15pxsM7Qs6ntjoc015XTVNtNVtb6mhMxGisidFcW01r\nY4LWhuDV1pjgrLU1JGJqn5eVU4wE8DRwvpntIKjobwH+aNY2e4BbgV8BNwM/Vfu/zDadyTI+lWFi\naprx1DTjqaCSHp+aeZ97+UzlPp7KnN5mIpyePdxxPtVVRmNNNXXxKuriVdTGYzQmgouYGmtiNNYE\nFzA11VSzrr6atoYammpj1CeCZc211cRjap6R1aXgBBC26X8aeIRgGOjd7v6CmX0B6HT3PcA3gW+Z\nWRcwSJAkJCJmKu6RyTSnJqYYT2WYTIffrlMZRlPBN+iRyZlv0tOMpYLtZ75557s9wGy11VXUJ2LU\nJ6qojwfv6xvibIvXUZ+ooi4eoyERoy5RFbzHY9THq2iqnankY9TFg33UxatIxNZoGKRUnKL0Abj7\nXmDvrGV35kwngQ8XoywpnmzWGQ0r4Jk269FkmpHkTKUcvI8mpxlLpRlPBd/OJ9NZxlJpxpLBt/Cp\nOW7UNZeGsDlkptJtrq2mvbnm9O0BGhLVQYUerp+puM+syIPKW6NeRApXdp3A8ua4B5X4TCdkbmU+\nNKtDcmSO+XxNJI2JGE21QcU8UyG31FfRkGg43WQyU2EHzSPx09/Aa6uD5pSgco9pBItImVECKDOZ\nrDM0McXx4SR9o0n6R1MMjE0xEnZIDk8EzSz9oykGx4Mbcy1UicfWGM21QRt1U201LfVxdrTWn142\nszx3viGs9BsS+qYtEmVKACWUzTr9Y8H472NDSY4NTdJ9aoITIyleHQzue35qYmrO4YKJ2BoaEjGa\n66pZVxfnnLZ63rajhfX18dOdkE3hCJTcyrwuXqW2bRGZkxLAMkimM7zYO8rRk+N09Y3R1TfG0ZMT\nvHpy/PTDpWc0JmLBk4nW1XLltrWsrw/uojhzOX9bOHRQN+ESkWJTAihAOpPl8IkxXjoxyosnRnmp\nN3jvGZo8/S1+jcH21nq2tdRx9TktnNNaz1lra4NXcy3NdbpdroisDCWARZqcyrC/e4gDx4Kbcu17\nfYijA+NMhw3wsTXGuW0NXLF1LTdftSXnXi11+vYuImVJCWAe2ayzv2eYx17q57GX+tn3+tDpyr61\nIc7lW9byvos3cuGmRi7cFNynRRcCichqogSQI5t1nuse4u9f6OXv9h3j2HASM7h0czO3/dNzeNv2\ndVy6pZkNjTUrHaqISMGUAAg6bX/wbA/fePwIR/rHia0x3nV+K5+57gLe/ZY2WhsSKx2iiEjRVXQC\nSGey3PerV/nGL45wfDjJRe1N/I8PX877LtqozlkRibyKTQCPHx7gzocPcGRgnJ3bW/iLD1/ONeeu\n15h5EakYFZcARpNp/tveQ3znqdfZ0VrP33y0g/detEEVv4hUnIpKAF19Y9x279O8OjjBbe/awX/4\nvbdQF6+oQyAiclrF1H5PHx3kj+95mkRsDQ/suoadO1pWOiQRkRVVEQmgbyTJrvs6aW1I8K1P7GTL\nurqVDklEZMVFPgG4O599cD8TUxke+DdXqfIXEQlF/tLVh/cd4+cv9XPHDRfylo2NKx2OiEjZiHQC\nyGSdv/7pYS5qb+Kj12xf6XBERMpKpBPAY4f7ebl/nE9eey5r9GATEZEzRDoBPPRsD401Ma67ZONK\nhyIiUnYKSgBm1mJmj5rZ4fB93TzbZcxsX/jaU0iZi5VMZ3jkhV4+dOVmEjHdjllEZLZCzwBuB37i\n7ucDPwnn5zLp7leErw8UWOaiHDo+Qmo6yzvObS1FcSIiq06hCeAm4N5w+l7ggwXur2j2dw8DcNmW\n5hWORESkPBWaADa6+/FwuheYr7G9xsw6zewJMytJkniue4i2xgRnra0tRXEiIqtO3gvBzOzHwKY5\nVn0ud8bd3cx8nt2c7e49ZnYO8FMze97dX56nvF3ALoBt27blC29ePacm2bG+fsk/LyISdXkTgLu/\nd751ZnbCzNrd/biZtQN98+yjJ3w/Ymb/D7gSmDMBuPtuYDdAR0fHfAklr96RJJduVvOPiMh8Cm0C\n2gPcGk7fCjw8ewMzW2dmiXC6FXgncLDAchfk7vQOJ9X8IyKygEITwBeB95nZYeC94Txm1mFm3wi3\nuQjoNLPngJ8BX3T3ZU0Aw5NpUtNZNjbp2b0iIvMp6GZw7n4S+N05lncCt4XTvwQuLaScN6t/NAXA\nhkY9y1dEZD6RvBJ4YGwKgPX18RWORESkfEUyAYylpgForNGD3UVE5hPJBDAymQagoSbyjzsQEVmy\naCaAZJAA1tbqDEBEZD6RTACp6SwANdW6CZyIyHyimQDSQQKIxyL564mIFEUka8jUdIbYGqNKD4ER\nEZlXRBNAloS+/YuILCiStWRqOkNC7f8iIguKZgJI6wxARCSfSNaSagISEckvkrVkajqj5wCLiOQR\n0QSQJVEdyV9NRKRoIllLqg9ARCS/SNaSagISEckvkglgKqMzABGRfCJZS6bSWd0GQkQkj0jWkll3\n1phuAyEispBIJgAHUP0vIrKgSCYAXPW/iEg+BSUAM/uwmb1gZlkz61hgu+vN7EUz6zKz2wspU0RE\niqPQM4ADwO8Dj823gZlVAV8DbgAuBj5iZhcXWO6CPCh3OYsQEVn1CnporrsfgryV7U6gy92PhNve\nD9wEHCyk7DxxqQlIRCSPUvQBbAZez5nvDpctm+AMYDlLEBFZ/fKeAZjZj4FNc6z6nLs/XOyAzGwX\nsAtg27ZtS9qHqxNYRCSvvAnA3d9bYBk9wNac+S3hsvnK2w3sBujo6PClFOi4+gBERPIoRRPQ08D5\nZrbDzOLALcCe5SxQZwAiIvkVOgz0Q2bWDVwD/NDMHgmXn2VmewHcfRr4NPAIcAj4rru/UFjYiwlu\n2UsQEVnVCh0F9BDw0BzLjwE35szvBfYWUtabi6tUJYmIrF7RvBIYMJ0CiIgsKJIJwN01DFREJI9o\nJgDUBSAikk80E4DrQjARkXyimQBw9QGIiOQRyQQAOgMQEcknkglAw0BFRPKLZgJAZwAiIvlEMwHo\nmZAiInlFMgGArgMQEcknkglAN4MTEckvmgkA9QGIiOQTzQTgug5ARCSfSCYA0BmAiEg+kUwAugxA\nRCS/aCYAdQKLiOQV0QSgZwKLiOQTzQSw0gGIiKwCkUwA6HbQIiJ5RTIBBA+EUQYQEVlIQQnAzD5s\nZi+YWdbMOhbY7qiZPW9m+8yss5AyFx9bKUoREVm9YgX+/AHg94H/tYht3+PuAwWWtyiu+0GLiORV\nUAJw90NA2Y240TOBRUTyK1UfgAP/YGbPmNmuZS9MncAiInnlPQMwsx8Dm+ZY9Tl3f3iR5bzL3XvM\nbAPwqJn9xt0fm6e8XcAugG3bti1y92dydB2AiEg+eROAu7+30ELcvSd87zOzh4CdwJwJwN13A7sB\nOjo6ltSYf/0lm7iovXGJ0YqIVIZCO4HzMrN6YI27j4bTvwd8YTnL/PItVy7n7kVEIqHQYaAfMrNu\n4Brgh2b2SLj8LDPbG262EXjczJ4DngJ+6O5/X0i5IiJSuEJHAT0EPDTH8mPAjeH0EeDyQsoREZHi\ni+SVwCIikp8SgIhIhVICEBGpUEoAIiIVSglARKRCKQGIiFQoK+c7Z5pZP/DqEn+8FSjJ3UcLoBiL\nYzXECKsjTsVYHCsZ49nu3raYDcs6ARTCzDrdfd5nFJQDxVgcqyFGWB1xKsbiWA0xgpqAREQqlhKA\niEiFinIC2L3SASyCYiyO1RAjrI44FWNxrIYYo9sHICIiC4vyGYCIiCwgcgnAzK43sxfNrMvMbl+B\n8o+a2fNmts/MOsNlLWb2qJkdDt/XhcvNzP4qjHW/mb01Zz+3htsfNrNbixDX3WbWZ2YHcpYVLS4z\nuyr8vbvCn33Tj2SbJ8bPm1lPeDz3mdmNOevuCMt70cyuy1k+5/+Ame0wsyfD5Q+YWXwJMW41s5+Z\n2UEze8HM/l24vGyO5QIxls2xNLMaM3vKzJ4LY/wvC+3XzBLhfFe4fvtSYy9CjPeY2Ss5x/GKcPmK\nfG4K4u6ReQFVwMvAOUAceA64uMQxHAVaZy37c+D2cPp24Evh9I3AjwieYX818GS4vAU4Er6vC6fX\nFRjXu4G3AgeWIy6CZz1cHf7Mj4AbihTj54HPzLHtxeHfNwHsCP/uVQv9DwDfBW4Jp+8CPrmEGNuB\nt4bTjcBLYSxlcywXiLFsjmX4uzWE09XAk+HvPOd+gT8B7gqnbwEeWGrsRYjxHuDmObZfkc9NIa+o\nnQHsBLrc/Yi7TwH3AzetcEwQxHBvOH0v8MGc5fd54AlgrZm1A9cBj7r7oLufAh4Fri8kAA+ewTy4\nHHGF65rc/QkP/qvvy9lXoTHO5ybgfndPufsrQBfB33/O/4Hwm9XvAA/O8fu+mRiPu/uz4fQocAjY\nTBkdywVinE/Jj2V4PMbC2erw5QvsN/f4Pgj8bhjHm4q9SDHOZ0U+N4WIWgLYDLyeM9/Nwv/4y8GB\nfzCzZyx4wD3ARnc/Hk73EjwlDeaPt1S/R7Hi2hxOL1e8nw5Pqe+eaVpZQozrgSF3ny5WjGEzxJUE\n3wzL8ljOihHK6FiaWZWZ7QP6CCrFlxfY7+lYwvXDYRzL+hmaHaO7zxzH/xoex/9pZonZMS4yluX+\n3OQVtQRQDt7l7m8FbgA+ZWbvzl0ZZvqyG3pVrnEBXwfOBa4AjgN/sbLhBMysAfg+8KfuPpK7rlyO\n5RwxltWxdPeMu18BbCH4xn7hSsYzl9kxmtk/Ae4giPVtBM06/2kFQyxI1BJAD7A1Z35LuKxk3L0n\nfO8jeFzmTuBEeLpH+N4Xbj5fvKX6PYoVV084XfR43f1E+CHMAn9DcDyXEuNJglPy2Kzlb5qZVRNU\nrN929x+Ei8vqWM4VYzkeyzCuIeBnBM8Wn2+/p2MJ1zeHcZTkM5QT4/VhE5u7ewr4W5Z+HJftc7No\nxe5UWMkXwTOOjxB0Bs10/FxSwvLrgcac6V8StN3/d87sIPzzcPqfcWan0VP+206jVwg6jNaF0y1F\niG87Z3awFi0u3tiZdWORYmzPmf73BO29AJdwZuffEYKOv3n/B4DvcWYH458sIT4jaKv98qzlZXMs\nF4ixbI4l0AasDadrgV8A759vv8CnOLMT+LtLjb0IMbbnHOcvA19c6c/NkuuEUhZWkl8o6Il/iaA9\n8XMlLvuc8B/tOeCFmfIJ2ip/AhwGfpzzxzfga2GszwMdOfv6Y4IOrS7g40WI7TsEp/1pgrbGTxQz\nLqADOBD+zFcJLzIsQozfCmPYD+zhzErsc2F5L5IzemK+/4Hw7/NUGPv3gMQSYnwXQfPOfmBf+Lqx\nnI7lAjGWzbEELgN+HcZyALhzof0CNeF8V7j+nKXGXoQYfxoexwPA/+a3I4VW5HNTyEtXAouIVKio\n9QGIiMgiKQGIiFQoJQARkQqlBCAiUqGUAEREKpQSgIhIhVICEBGpUEoAIiIV6v8DkX2FEpFoGGYA\nAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x110cc6048>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "plt.figure()\n",
    "plt.plot(sorted(coef))\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.27289331178589837"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "coef[vocabulary['dress']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-0.35541530381124387"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "coef[vocabulary['she']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.46293652100681332"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "coef[vocabulary['he']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-0.2128136378053142"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "coef[vocabulary['the']]  # ?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Error Analysis\n",
    "\n",
    "- Which ones do we get wrong?\n",
    "- Are there obvious reasons?"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}