{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using Scattertext to visualize Emoji usage by gender and heritage on Twitter\n",
    "\n",
    "## DDSEA17: Understanding Cultures and Perspectives through Text and Emoji Visualization\n",
    "\n",
    "## @jasonkessler\n",
    "\n",
    "https://github.com/JasonKessler/scattertext\n",
    "\n",
    "\n",
    "Cite as:\n",
    "Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.\n",
    "\n",
    "Link to preprint: https://arxiv.org/abs/1703.00565\n",
    "\n",
    "```\n",
    "@article{kessler2017scattertext,\n",
    "  author    = {Kessler, Jason S.},\n",
    "  title     = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},\n",
    "  booktitle = {ACL System Demonstrations},\n",
    "  year      = {2017},\n",
    "}\n",
    "```\n",
    "\n",
    "Data is from http://followthehashtag.com/datasets/\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>.container { width:98% !important; }</style>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "import io, json\n",
    "from zipfile import ZipFile\n",
    "import urllib.request\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import agefromname\n",
    "import nltk\n",
    "\n",
    "import imp\n",
    "import scattertext as st\n",
    "from scattertext import tweet_tokenzier_factory\n",
    "from scattertext.termranking import OncePerDocFrequencyRanker\n",
    "\n",
    "from IPython.display import IFrame\n",
    "from IPython.core.display import display, HTML\n",
    "# Inject a CSS rule that widens the notebook's .container element to 98% of the page.\n",
    "display(HTML(\"<style>.container { width:98% !important; }</style>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "try:\n",
    "    # Reuse the local cache written by an earlier run of this cell.\n",
    "    df = pd.read_csv('usa_tweets.csv.gz')\n",
    "except FileNotFoundError:\n",
    "    # Only a missing cache triggers the download; any other failure (including\n",
    "    # KeyboardInterrupt) now propagates instead of being silently swallowed.\n",
    "    with urllib.request.urlopen(\n",
    "            'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'\n",
    "    ) as response:\n",
    "        archive = io.BytesIO(response.read())\n",
    "    # Context managers close both the archive and the extracted workbook handle.\n",
    "    with ZipFile(archive) as zf, zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx') as workbook:\n",
    "        df = pd.read_excel(workbook)\n",
    "    # first_name: first whitespace-separated token of 'User Name', lower-cased.\n",
    "    # last_name: last token, lower-cased, only when there are at least two tokens.\n",
    "    # Values that are not strings (or have too few tokens) are passed through unchanged.\n",
    "    df['first_name'] = df['User Name'].apply(\n",
    "        lambda x: x.split()[0].lower() if isinstance(x, str) and len(x.split()) > 0 else x)\n",
    "    df['last_name'] = df['User Name'].apply(\n",
    "        lambda x: x.split()[-1].lower() if isinstance(x, str) and len(x.split()) > 1 else x)\n",
    "    # Cache the augmented frame so later runs skip the download.\n",
    "    df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>first_name</th>\n",
       "      <th>last_name</th>\n",
       "      <th>User Name</th>\n",
       "      <th>Nickname</th>\n",
       "      <th>Tweet content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>bill</td>\n",
       "      <td>schulhoff</td>\n",
       "      <td>Bill Schulhoff</td>\n",
       "      <td>BillSchulhoff</td>\n",
       "      <td>Wind 3.2 mph NNE. Barometer 30.20 in, Rising s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>daniele</td>\n",
       "      <td>polis</td>\n",
       "      <td>Daniele Polis</td>\n",
       "      <td>danipolis</td>\n",
       "      <td>Pausa pro café antes de embarcar no próximo vô...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>kasey</td>\n",
       "      <td>jacobs</td>\n",
       "      <td>Kasey Jacobs</td>\n",
       "      <td>KJacobs27</td>\n",
       "      <td>Good. Morning. #morning #Saturday #diner #VT #...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  first_name  last_name       User Name       Nickname  \\\n",
       "0       bill  schulhoff  Bill Schulhoff  BillSchulhoff   \n",
       "1    daniele      polis   Daniele Polis      danipolis   \n",
       "2      kasey     jacobs    Kasey Jacobs      KJacobs27   \n",
       "\n",
       "                                       Tweet content  \n",
       "0  Wind 3.2 mph NNE. Barometer 30.20 in, Rising s...  \n",
       "1  Pausa pro café antes de embarcar no próximo vô...  \n",
       "2  Good. Morning. #morning #Saturday #diner #VT #...  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first three tweets with the derived name columns.\n",
    "df[['first_name', 'last_name', 'User Name', 'Nickname', 'Tweet content']].head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>hi</th>\n",
       "      <th>lo</th>\n",
       "      <th>prob</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>first_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>aaban</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>9.574095e-01</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>aabha</th>\n",
       "      <td>0.121295</td>\n",
       "      <td>-1.387779e-17</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>aabid</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>5.628005e-01</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  hi            lo  prob\n",
       "first_name                              \n",
       "aaban       1.000000  9.574095e-01   1.0\n",
       "aabha       0.121295 -1.387779e-17   0.0\n",
       "aabid       1.000000  5.628005e-01   1.0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Table indexed by first name with the columns hi, lo and prob (see the output below).\n",
    "name_gender_model = agefromname.AgeFromName()\n",
    "male_prob = name_gender_model.get_all_name_male_prob()\n",
    "male_prob.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def label_gender(male_probability):\n",
    "    \"\"\"Return 'm' when the male probability exceeds 0.9, 'f' when it is below 0.1, else '?'.\"\"\"\n",
    "    if male_probability > 0.9:\n",
    "        return 'm'\n",
    "    if male_probability < 0.1:\n",
    "        return 'f'\n",
    "    return '?'\n",
    "\n",
    "# Join each tweet to its first name's row in male_prob (default inner join).\n",
    "df_aug = df.merge(male_prob, left_on='first_name', right_index=True)\n",
    "df_aug['gender'] = df_aug['prob'].apply(label_gender)\n",
    "# Keep only the rows labelled 'm' or 'f' and persist them.\n",
    "has_clear_gender = df_aug['gender'].isin(['m', 'f'])\n",
    "df_mf = df_aug[has_clear_gender]\n",
    "df_mf.to_csv('emoji_data.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Rebind df_mf to the contents of 'emoji_data.csv' (the file written by the previous cell).\n",
    "df_mf = pd.read_csv('emoji_data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gender</th>\n",
       "      <th>first_name</th>\n",
       "      <th>User Name</th>\n",
       "      <th>Nickname</th>\n",
       "      <th>Tweet content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>m</td>\n",
       "      <td>bill</td>\n",
       "      <td>Bill Schulhoff</td>\n",
       "      <td>BillSchulhoff</td>\n",
       "      <td>Wind 3.2 mph NNE. Barometer 30.20 in, Rising s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>m</td>\n",
       "      <td>bill</td>\n",
       "      <td>Bill S Kenney</td>\n",
       "      <td>BillSKenney</td>\n",
       "      <td>Planning the new focuslabllc website with the ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>m</td>\n",
       "      <td>bill</td>\n",
       "      <td>Bill Pendley</td>\n",
       "      <td>BILLPENDLEY</td>\n",
       "      <td>#bibleverseoftheday @ Bill The Mortgage Guy   ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>m</td>\n",
       "      <td>bill</td>\n",
       "      <td>Bill Culver</td>\n",
       "      <td>rilla6969</td>\n",
       "      <td>Start Wars Dark Side Challenge race number one...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>m</td>\n",
       "      <td>bill</td>\n",
       "      <td>Bill Esparza</td>\n",
       "      <td>streetgourmetla</td>\n",
       "      <td>Spinach fusilli by @chef_timothy. A pre #mexic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>m</td>\n",
       "      <td>bill</td>\n",
       "      <td>Bill Meadows</td>\n",
       "      <td>BillMeadows305</td>\n",
       "      <td>https://t.co/N8E5aTvIIN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  gender first_name       User Name         Nickname  \\\n",
       "0      m       bill  Bill Schulhoff    BillSchulhoff   \n",
       "1      m       bill   Bill S Kenney      BillSKenney   \n",
       "2      m       bill    Bill Pendley      BILLPENDLEY   \n",
       "3      m       bill     Bill Culver        rilla6969   \n",
       "4      m       bill    Bill Esparza  streetgourmetla   \n",
       "5      m       bill    Bill Meadows   BillMeadows305   \n",
       "\n",
       "                                       Tweet content  \n",
       "0  Wind 3.2 mph NNE. Barometer 30.20 in, Rising s...  \n",
       "1  Planning the new focuslabllc website with the ...  \n",
       "2  #bibleverseoftheday @ Bill The Mortgage Guy   ...  \n",
       "3  Start Wars Dark Side Challenge race number one...  \n",
       "4  Spinach fusilli by @chef_timothy. A pre #mexic...  \n",
       "5                            https://t.co/N8E5aTvIIN  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First six gender-labelled tweets.\n",
    "df_mf[['gender', 'first_name', 'User Name', 'Nickname', 'Tweet content']].head(6)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Gender breakdown of Twitter Users\n",
    "### According to Pew, the gender breakdown of American adults on Twitter was about even\n",
    "- 24% of online men, 25% of online women\n",
    "- Source: http://www.pewinternet.org/2016/11/11/social-media-update-2016/\n",
    "\n",
    "### However, among users with a gender-identifying first name, about 56% appear to be male"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "m    28159\n",
      "f    21844\n",
      "Name: gender, dtype: int64\n",
      "m    0.563146\n",
      "f    0.436854\n",
      "Name: gender, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Deduplicate on (Nickname, gender) so each account is counted once rather than once per tweet.\n",
    "# The counts are computed a single time and reused for both the absolute and relative views.\n",
    "user_gender_counts = df_mf[['Nickname', 'gender']].drop_duplicates()['gender'].value_counts()\n",
    "print(user_gender_counts)\n",
    "print(user_gender_counts / user_gender_counts.sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    Wind 3.2 mph NNE. Barometer 30.20 in, Rising s...\n",
       "1    Planning the new focuslabllc website with the ...\n",
       "2    #bibleverseoftheday @ Bill The Mortgage Guy   ...\n",
       "3    Start Wars Dark Side Challenge race number one...\n",
       "4    Spinach fusilli by @chef_timothy. A pre #mexic...\n",
       "Name: Tweet content, dtype: object"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First five raw tweet texts.\n",
    "df_mf['Tweet content'].head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Wrap NLTK's TweetTokenizer with Scattertext's factory and parse every tweet with it.\n",
    "# (`tweet_tokenzier_factory` is the spelling this notebook imports from scattertext.)\n",
    "tweet_tokenizer = nltk.tokenize.TweetTokenizer()\n",
    "nlp = st.tweet_tokenzier_factory(tweet_tokenizer)\n",
    "df_mf['parse'] = df_mf['Tweet content'].apply(nlp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Build the corpus from the 'parse' column, categorised by 'gender',\n",
    "# using Scattertext's emoji-only feature extractor.\n",
    "emoji_features = st.FeatsFromSpacyDocOnlyEmoji()\n",
    "corpus_factory = st.CorpusFromParsedDocuments(\n",
    "    df_mf,\n",
    "    category_col='gender',\n",
    "    parsed_col='parse',\n",
    "    feats_from_spacy_doc=emoji_features\n",
    ")\n",
    "corpus = corpus_factory.build()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"700\"\n",
       "            src=\"output/emoji_gender_scattertext.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x1279e26d8>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One label per tweet, 'User Name (@Nickname) Date', passed to the explorer as `metadata`.\n",
    "metadata = (df_mf['User Name']\n",
    "            + ' (@' + df_mf['Nickname'] + ') '\n",
    "            + df_mf['Date'].astype(str))\n",
    "html = st.produce_scattertext_explorer(\n",
    "    corpus,\n",
    "    category='f',\n",
    "    category_name='Female',\n",
    "    not_category_name='Male',\n",
    "    use_full_doc=True,\n",
    "    term_ranker=OncePerDocFrequencyRanker,\n",
    "    sort_by_dist=False,\n",
    "    metadata=metadata,\n",
    "    width_in_pixels=1000\n",
    ")\n",
    "\n",
    "file_name = 'output/emoji_gender_scattertext.html'\n",
    "# Write through a context manager so the file is flushed and closed before the IFrame loads it.\n",
    "with open(file_name, 'wb') as html_file:\n",
    "    html_file.write(html.encode('utf-8'))\n",
    "IFrame(src=file_name, width=1200, height=700)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"700\"\n",
       "            src=\"output/emoji_gender_lorp.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x1533b4d30>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same female-vs-male comparison, rendered with produce_fightin_words_explorer.\n",
    "html = st.produce_fightin_words_explorer(corpus,\n",
    "                                         category='f',\n",
    "                                         category_name='Female',\n",
    "                                         not_category_name='Male',\n",
    "                                         term_ranker=OncePerDocFrequencyRanker,\n",
    "                                         metadata=metadata)\n",
    "file_name = 'output/emoji_gender_lorp.html'\n",
    "# Write through a context manager so the file is flushed and closed before the IFrame loads it.\n",
    "with open(file_name, 'wb') as html_file:\n",
    "    html_file.write(html.encode('utf-8'))\n",
    "IFrame(src=file_name, width=1200, height=700)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use US Census data to find last names that are associated with a particular heritage\n",
    "From https://www.census.gov/data/developers/data-sets/surnames.2010.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from urllib.request import urlopen\n",
    "\n",
    "url ='https://api.census.gov/data/2010/surname?get=NAME,COUNT,CUM_PROP100K,PCT2PRACE,PCTAIAN,PCTAPI,PCTBLACK,PCTHISPANIC,PCTWHITE,PROP100K&RANK=1:200000'\n",
    "with urlopen(url) as f:\n",
    "    raw = f.read().decode('utf-8')\n",
    "rows = json.loads(raw)\n",
    "\n",
    "# rows[0] is the header row; '(S)' cells are replaced by 0 so every column casts to float.\n",
    "name_df = (pd.DataFrame(rows[1:], columns=rows[0])\n",
    "           .set_index('NAME')\n",
    "           .replace('(S)', 0)\n",
    "           .astype(float)\n",
    "           .reset_index())\n",
    "# Lower-case the surnames and use them as the index.\n",
    "name_df['NAME'] = name_df['NAME'].apply(str.lower)\n",
    "name_df = name_df.set_index('NAME')\n",
    "\n",
    "# Census column suffix (after 'PCT') -> readable label; suffixes not listed map to NaN.\n",
    "HERITAGE_LABELS = {'AIAN': 'Native American',\n",
    "                   'API': 'Asian',\n",
    "                   'BLACK': 'African American',\n",
    "                   'HISPANIC': 'Hispanic',\n",
    "                   'WHITE': 'White'}\n",
    "\n",
    "def dominant_heritage(row, threshold=85):\n",
    "    \"\"\"Label a surname row by its largest PCT* column.\n",
    "\n",
    "    Returns the HERITAGE_LABELS entry for the largest PCT* column when that\n",
    "    value is strictly greater than `threshold`; otherwise (or when the column\n",
    "    has no entry in HERITAGE_LABELS) returns NaN.\n",
    "    \"\"\"\n",
    "    # Compute the maximum once per row; the tuple order (value, column) makes max() compare by value first.\n",
    "    share, column = max((v, k) for k, v in row.items() if k[:3] == 'PCT')\n",
    "    if share > threshold:\n",
    "        return HERITAGE_LABELS.get(column[3:], np.nan)\n",
    "    return np.nan\n",
    "\n",
    "name_df['heritage'] = name_df.apply(dominant_heritage, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>COUNT</th>\n",
       "      <th>CUM_PROP100K</th>\n",
       "      <th>PCT2PRACE</th>\n",
       "      <th>PCTAIAN</th>\n",
       "      <th>PCTAPI</th>\n",
       "      <th>PCTBLACK</th>\n",
       "      <th>PCTHISPANIC</th>\n",
       "      <th>PCTWHITE</th>\n",
       "      <th>PROP100K</th>\n",
       "      <th>RANK</th>\n",
       "      <th>heritage</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NAME</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>garcia</th>\n",
       "      <td>1166120.0</td>\n",
       "      <td>3400.12</td>\n",
       "      <td>0.26</td>\n",
       "      <td>0.47</td>\n",
       "      <td>1.41</td>\n",
       "      <td>0.45</td>\n",
       "      <td>92.03</td>\n",
       "      <td>5.38</td>\n",
       "      <td>395.32</td>\n",
       "      <td>6.0</td>\n",
       "      <td>Hispanic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rodriguez</th>\n",
       "      <td>1094924.0</td>\n",
       "      <td>4543.50</td>\n",
       "      <td>0.18</td>\n",
       "      <td>0.18</td>\n",
       "      <td>0.57</td>\n",
       "      <td>0.54</td>\n",
       "      <td>93.77</td>\n",
       "      <td>4.75</td>\n",
       "      <td>371.19</td>\n",
       "      <td>9.0</td>\n",
       "      <td>Hispanic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>martinez</th>\n",
       "      <td>1060159.0</td>\n",
       "      <td>4902.90</td>\n",
       "      <td>0.22</td>\n",
       "      <td>0.51</td>\n",
       "      <td>0.60</td>\n",
       "      <td>0.49</td>\n",
       "      <td>92.91</td>\n",
       "      <td>5.28</td>\n",
       "      <td>359.40</td>\n",
       "      <td>10.0</td>\n",
       "      <td>Hispanic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hernandez</th>\n",
       "      <td>1043281.0</td>\n",
       "      <td>5256.58</td>\n",
       "      <td>0.16</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.60</td>\n",
       "      <td>0.36</td>\n",
       "      <td>94.89</td>\n",
       "      <td>3.79</td>\n",
       "      <td>353.68</td>\n",
       "      <td>11.0</td>\n",
       "      <td>Hispanic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>lopez</th>\n",
       "      <td>874523.0</td>\n",
       "      <td>5553.05</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.38</td>\n",
       "      <td>1.02</td>\n",
       "      <td>0.57</td>\n",
       "      <td>92.92</td>\n",
       "      <td>4.86</td>\n",
       "      <td>296.47</td>\n",
       "      <td>12.0</td>\n",
       "      <td>Hispanic</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               COUNT  CUM_PROP100K  PCT2PRACE  PCTAIAN  PCTAPI  PCTBLACK  \\\n",
       "NAME                                                                       \n",
       "garcia     1166120.0       3400.12       0.26     0.47    1.41      0.45   \n",
       "rodriguez  1094924.0       4543.50       0.18     0.18    0.57      0.54   \n",
       "martinez   1060159.0       4902.90       0.22     0.51    0.60      0.49   \n",
       "hernandez  1043281.0       5256.58       0.16     0.19    0.60      0.36   \n",
       "lopez       874523.0       5553.05       0.25     0.38    1.02      0.57   \n",
       "\n",
       "           PCTHISPANIC  PCTWHITE  PROP100K  RANK  heritage  \n",
       "NAME                                                        \n",
       "garcia           92.03      5.38    395.32   6.0  Hispanic  \n",
       "rodriguez        93.77      4.75    371.19   9.0  Hispanic  \n",
       "martinez         92.91      5.28    359.40  10.0  Hispanic  \n",
       "hernandez        94.89      3.79    353.68  11.0  Hispanic  \n",
       "lopez            92.92      4.86    296.47  12.0  Hispanic  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Five highest-COUNT surnames whose PCTHISPANIC value exceeds 85.\n",
    "mostly_hispanic = name_df['PCTHISPANIC'] > 85\n",
    "name_df[mostly_hispanic].sort_values(by='COUNT', ascending=False).head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>COUNT</th>\n",
       "      <th>CUM_PROP100K</th>\n",
       "      <th>PCT2PRACE</th>\n",
       "      <th>PCTAIAN</th>\n",
       "      <th>PCTAPI</th>\n",
       "      <th>PCTBLACK</th>\n",
       "      <th>PCTHISPANIC</th>\n",
       "      <th>PCTWHITE</th>\n",
       "      <th>PROP100K</th>\n",
       "      <th>RANK</th>\n",
       "      <th>heritage</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NAME</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>washington</th>\n",
       "      <td>177386.0</td>\n",
       "      <td>20370.63</td>\n",
       "      <td>3.78</td>\n",
       "      <td>0.68</td>\n",
       "      <td>0.30</td>\n",
       "      <td>87.53</td>\n",
       "      <td>2.54</td>\n",
       "      <td>5.17</td>\n",
       "      <td>60.14</td>\n",
       "      <td>145.0</td>\n",
       "      <td>African American</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pierre</th>\n",
       "      <td>33913.0</td>\n",
       "      <td>41272.38</td>\n",
       "      <td>2.23</td>\n",
       "      <td>0.92</td>\n",
       "      <td>0.31</td>\n",
       "      <td>86.74</td>\n",
       "      <td>2.75</td>\n",
       "      <td>7.05</td>\n",
       "      <td>11.50</td>\n",
       "      <td>1026.0</td>\n",
       "      <td>African American</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>smalls</th>\n",
       "      <td>12435.0</td>\n",
       "      <td>53820.35</td>\n",
       "      <td>2.76</td>\n",
       "      <td>0.28</td>\n",
       "      <td>0.23</td>\n",
       "      <td>90.49</td>\n",
       "      <td>2.46</td>\n",
       "      <td>3.78</td>\n",
       "      <td>4.22</td>\n",
       "      <td>2888.0</td>\n",
       "      <td>African American</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>jeanbaptiste</th>\n",
       "      <td>7915.0</td>\n",
       "      <td>59139.71</td>\n",
       "      <td>2.50</td>\n",
       "      <td>0.13</td>\n",
       "      <td>0.21</td>\n",
       "      <td>94.04</td>\n",
       "      <td>2.15</td>\n",
       "      <td>0.97</td>\n",
       "      <td>2.68</td>\n",
       "      <td>4483.0</td>\n",
       "      <td>African American</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>diallo</th>\n",
       "      <td>7502.0</td>\n",
       "      <td>59784.56</td>\n",
       "      <td>1.55</td>\n",
       "      <td>0.09</td>\n",
       "      <td>0.12</td>\n",
       "      <td>95.64</td>\n",
       "      <td>0.76</td>\n",
       "      <td>1.84</td>\n",
       "      <td>2.54</td>\n",
       "      <td>4730.0</td>\n",
       "      <td>African American</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 COUNT  CUM_PROP100K  PCT2PRACE  PCTAIAN  PCTAPI  PCTBLACK  \\\n",
       "NAME                                                                         \n",
       "washington    177386.0      20370.63       3.78     0.68    0.30     87.53   \n",
       "pierre         33913.0      41272.38       2.23     0.92    0.31     86.74   \n",
       "smalls         12435.0      53820.35       2.76     0.28    0.23     90.49   \n",
       "jeanbaptiste    7915.0      59139.71       2.50     0.13    0.21     94.04   \n",
       "diallo          7502.0      59784.56       1.55     0.09    0.12     95.64   \n",
       "\n",
       "              PCTHISPANIC  PCTWHITE  PROP100K    RANK          heritage  \n",
       "NAME                                                                     \n",
       "washington           2.54      5.17     60.14   145.0  African American  \n",
       "pierre               2.75      7.05     11.50  1026.0  African American  \n",
       "smalls               2.46      3.78      4.22  2888.0  African American  \n",
       "jeanbaptiste         2.15      0.97      2.68  4483.0  African American  \n",
       "diallo               0.76      1.84      2.54  4730.0  African American  "
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Five highest-COUNT surnames whose PCTBLACK value exceeds 85.\n",
    "mostly_black = name_df['PCTBLACK'] > 85\n",
    "name_df[mostly_black].sort_values(by='COUNT', ascending=False).head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "heritage\n",
       "Native American         98523.0\n",
       "African American      1343997.0\n",
       "Asian                 7336493.0\n",
       "Hispanic             36248570.0\n",
       "White                92111160.0\n",
       "N/A                 128528485.0\n",
       "Name: COUNT, dtype: float64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total COUNT per heritage label, with unlabeled surnames grouped under 'N/A'.\n",
    "name_df.fillna('N/A').groupby('heritage')['COUNT'].sum().sort_values()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Self-identified race in the US (via: https://en.wikipedia.org/wiki/Race_and_ethnicity_in_the_United_States)\n",
    "- White alone: 72.4%\n",
    "- Hispanic and Latino Americans (of any race): 16.3%\n",
    "- Black or African American: 12.6%\n",
    "- Asian: 4.8%\n",
    "- Native American and Alaska Natives: 0.9%\n",
    "- Native Hawaiians and Other Pacific Islanders: 0.2%\n",
    "- Two or more races: 2.9%\n",
    "- Some other race: 6.2%"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "heritage\n",
       "Native American     0.000718\n",
       "African American    0.009800\n",
       "Asian               0.053497\n",
       "Hispanic            0.264320\n",
       "White               0.671664\n",
       "Name: COUNT, dtype: float64"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Share of COUNT per heritage label among labeled surnames.\n",
    "# The grouped totals are computed once instead of evaluating the whole chain twice.\n",
    "heritage_counts = name_df.dropna().groupby('heritage')['COUNT'].sum()\n",
    "heritage_counts.sort_values() / heritage_counts.sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Caution!!\n",
    "- Hispanics are overrepresented (26.4% of Americans w/ heritage-identifiable names vs 16.3% of population)\n",
    "- African Americans are far underrepresented (1.0% of Americans w/ heritage-identifiable names vs 12.6% of population)\n",
    "- Whites and Asians have shares similar to their shares of the population\n",
    "- Native Americans are very difficult to identify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Inner join of df_mf.last_name against the index of name_df, keeping only\n",
    "# rows whose name has a non-null heritage label.\n",
    "heritage_by_name = name_df[['heritage']].dropna()\n",
    "df_mf_heritage = df_mf.merge(\n",
    "    heritage_by_name,\n",
    "    left_on='last_name',\n",
    "    right_index=True,\n",
    "    how='inner',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "White               22176\n",
       "Hispanic             9188\n",
       "Asian                1249\n",
       "African American      166\n",
       "Native American         8\n",
       "Name: heritage, dtype: int64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of merged rows carrying each heritage label.\n",
    "(df_mf_heritage['heritage']\n",
    " .value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# One binary label column per heritage: the heritage name where it matches,\n",
    "# otherwise 'Not <heritage>'. np.where does this in a single vectorized pass\n",
    "# instead of a row-wise .apply(lambda ...) repeated for every label.\n",
    "for heritage_label in ('White', 'Hispanic', 'Asian'):\n",
    "    df_mf_heritage['Is-' + heritage_label] = np.where(\n",
    "        df_mf_heritage['heritage'] == heritage_label,\n",
    "        heritage_label,\n",
    "        'Not ' + heritage_label,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Emojis white people like"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"700\"\n",
       "            src=\"output/emoji_white_v_all.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x11e89a4e0>"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Label attached to each document in the explorer: user name, @nickname and date.\n",
    "metadata = (df_mf_heritage['User Name']\n",
    "            + ' (@' + df_mf_heritage['Nickname'] + ') '\n",
    "            + df_mf_heritage['Date'].astype(str))\n",
    "html = st.produce_scattertext_explorer(\n",
    "    # Corpus over the 'parse' column, split on Is-White, with\n",
    "    # st.FeatsFromSpacyDocOnlyEmoji as the feature extractor.\n",
    "    corpus=st.CorpusFromParsedDocuments(\n",
    "        df_mf_heritage,\n",
    "        parsed_col='parse',\n",
    "        category_col='Is-White',\n",
    "        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()\n",
    "    ).build(),\n",
    "    category='White',\n",
    "    category_name='White',\n",
    "    not_category_name='Not-White',\n",
    "    use_full_doc=True,\n",
    "    term_ranker=OncePerDocFrequencyRanker,\n",
    "    sort_by_dist=False,\n",
    "    metadata=metadata,\n",
    "    width_in_pixels=1000,\n",
    "    max_docs_per_category=1000\n",
    ")\n",
    "\n",
    "file_name = 'output/emoji_white_v_all.html'\n",
    "# Close the file deterministically instead of leaving the open handle\n",
    "# to the garbage collector before the IFrame reads the page.\n",
    "with open(file_name, 'wb') as html_file:\n",
    "    html_file.write(html.encode('utf-8'))\n",
    "IFrame(src=file_name, width=1200, height=700)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Emojis Hispanic people like"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"700\"\n",
       "            src=\"output/emoji_hispanic_v_all.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x154054be0>"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Label attached to each document in the explorer: user name, @nickname and date.\n",
    "metadata = (df_mf_heritage['User Name']\n",
    "            + ' (@' + df_mf_heritage['Nickname'] + ') '\n",
    "            + df_mf_heritage['Date'].astype(str))\n",
    "html = st.produce_scattertext_explorer(\n",
    "    # Corpus over the 'parse' column, split on Is-Hispanic, with\n",
    "    # st.FeatsFromSpacyDocOnlyEmoji as the feature extractor.\n",
    "    corpus=st.CorpusFromParsedDocuments(\n",
    "        df_mf_heritage,\n",
    "        parsed_col='parse',\n",
    "        category_col='Is-Hispanic',\n",
    "        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()\n",
    "    ).build(),\n",
    "    category='Hispanic',\n",
    "    category_name='Hispanic',\n",
    "    not_category_name='Not-Hispanic',\n",
    "    use_full_doc=True,\n",
    "    term_ranker=OncePerDocFrequencyRanker,\n",
    "    sort_by_dist=False,\n",
    "    metadata=metadata,\n",
    "    width_in_pixels=1000,\n",
    "    max_docs_per_category=1000\n",
    ")\n",
    "\n",
    "file_name = 'output/emoji_hispanic_v_all.html'\n",
    "# Close the file deterministically instead of leaving the open handle\n",
    "# to the garbage collector before the IFrame reads the page.\n",
    "with open(file_name, 'wb') as html_file:\n",
    "    html_file.write(html.encode('utf-8'))\n",
    "IFrame(src=file_name, width=1200, height=700)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [py36]",
   "language": "python",
   "name": "Python [py36]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}