{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using Scattertext to Visualize Emoji Usage by Language (e.g., English or Spanish) on Twitter\n",
    "\n",
    "## DDSEA17: Understanding Cultures and Perspectives through Text and Emoji Visualization\n",
    "\n",
    "## @jasonkessler\n",
    "\n",
    "https://github.com/JasonKessler/scattertext\n",
    "\n",
    "\n",
    "Cite as:\n",
    "Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.\n",
    "\n",
    "Link to preprint: https://arxiv.org/abs/1703.00565\n",
    "\n",
    "```\n",
    "@article{kessler2017scattertext,\n",
    "  author    = {Kessler, Jason S.},\n",
    "  title     = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},\n",
    "  booktitle = {ACL System Demonstrations},\n",
    "  year      = {2017},\n",
    "}\n",
    "```\n",
    "\n",
    "Data is from http://followthehashtag.com/datasets/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>.container { width:98% !important; }</style>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import io\n",
    "import os\n",
    "import urllib.request\n",
    "from zipfile import ZipFile\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scattertext as st\n",
    "import spacy\n",
    "\n",
    "from IPython.display import IFrame\n",
    "from IPython.core.display import display, HTML\n",
    "# Widen the notebook container so the embedded explorer is not clipped.\n",
    "container_css = HTML(\"<style>.container { width:98% !important; }</style>\")\n",
    "display(container_css)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the English pipeline through spacy.load: the spacy.en attribute path\n",
    "# only exists on spaCy 1.x, while spacy.load('en') works on 1.x and 2.x.\n",
    "nlp = spacy.load('en')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Reuse the local cache when it is readable; otherwise download the\n",
    "# followthehashtag archive, derive the name columns and write the cache.\n",
    "try:\n",
    "    df = pd.read_csv('usa_tweets.csv.gz')\n",
    "except Exception:  # not a bare except, so Ctrl-C no longer triggers a download\n",
    "    with ZipFile(io.BytesIO(urllib.request.urlopen(\n",
    "            'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'\n",
    "    ).read())) as zf:\n",
    "        df = pd.read_excel(zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))\n",
    "    # Lower-cased first/last tokens of the display name; non-string or\n",
    "    # too-short values are passed through unchanged.\n",
    "    df['first_name'] = df['User Name'].apply(\n",
    "        lambda x: x.split()[0].lower() if isinstance(x, str) and len(x.split()) > 0 else x)\n",
    "    df['last_name'] = df['User Name'].apply(\n",
    "        lambda x: x.split()[-1].lower() if isinstance(x, str) and len(x.split()) > 1 else x)\n",
    "    # Write the cache once, after the derived columns exist.\n",
    "    df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')\n",
    "# Parse every tweet with spaCy; the corpus builders read this column.\n",
    "df['parse'] = df['Tweet content'].apply(nlp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "en    172206\n",
       "es     27062\n",
       "fr      1695\n",
       "it       959\n",
       "pt       737\n",
       "de       636\n",
       "da       442\n",
       "nl       282\n",
       "ru       277\n",
       "sv       271\n",
       "fi       125\n",
       "tr        91\n",
       "hu        37\n",
       "Name: Tweet language (ISO 639-1), dtype: int64"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of tweets per ISO 639-1 language code.\n",
    "language_codes = df['Tweet language (ISO 639-1)']\n",
    "language_codes.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Binary category columns: the language code itself, or 'non-<code>' for\n",
    "# every other tweet (including rows with a missing language).\n",
    "tweet_language = df['Tweet language (ISO 639-1)']\n",
    "for label_col, code in [('english', 'en'), ('spanish', 'es')]:\n",
    "    df[label_col] = tweet_language.eq(code).map({True: code, False: 'non-' + code})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['en', 'non-en']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the emoji-only corpus in this cell so it runs on a fresh kernel;\n",
    "# no other cell in the notebook defines `corpus`.\n",
    "corpus = st.CorpusFromParsedDocuments(\n",
    "    df,\n",
    "    parsed_col='parse',\n",
    "    category_col='english',\n",
    "    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()\n",
    ").build()\n",
    "corpus.get_categories()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"700\"\n",
       "            src=\"output/emoji_english-v-non.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x111e16f60>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Label shown beside each tweet in the explorer: display name, handle, date.\n",
    "metadata = (df['User Name']\n",
    "            + ' (@' + df['Nickname'] + ') '\n",
    "            + df['Date'].astype(str))\n",
    "\n",
    "# Emoji-only features, English vs. every other language; each emoji is\n",
    "# counted at most once per tweet (OncePerDocFrequencyRanker).\n",
    "html = st.produce_scattertext_explorer(\n",
    "    corpus = st.CorpusFromParsedDocuments(\n",
    "        df,\n",
    "        parsed_col='parse',\n",
    "        category_col='english',\n",
    "        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()\n",
    "    ).build(),\n",
    "    category='en',\n",
    "    category_name='English',\n",
    "    not_category_name='Non-English',\n",
    "    use_full_doc=True,\n",
    "    term_ranker=st.OncePerDocFrequencyRanker,\n",
    "    sort_by_dist=False,\n",
    "    metadata=metadata,\n",
    "    width_in_pixels=1000\n",
    ")\n",
    "file_name = 'output/emoji_english-v-non.html'\n",
    "# Create the output directory on a fresh checkout and close the file\n",
    "# deterministically instead of leaking the handle.\n",
    "os.makedirs(os.path.dirname(file_name), exist_ok=True)\n",
    "with open(file_name, 'wb') as html_file:\n",
    "    html_file.write(html.encode('utf-8'))\n",
    "IFrame(src=file_name, width = 1200, height=700)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"700\"\n",
       "            src=\"output/emoji_spanish-v-non.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x106aaae10>"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Label shown beside each tweet in the explorer: display name, handle, date.\n",
    "metadata = (df['User Name']\n",
    "            + ' (@' + df['Nickname'] + ') '\n",
    "            + df['Date'].astype(str))\n",
    "\n",
    "# Emoji-only features, Spanish vs. every other language; each emoji is\n",
    "# counted at most once per tweet (OncePerDocFrequencyRanker).\n",
    "html = st.produce_scattertext_explorer(\n",
    "    corpus = st.CorpusFromParsedDocuments(\n",
    "        df,\n",
    "        parsed_col='parse',\n",
    "        category_col='spanish',\n",
    "        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()\n",
    "    ).build(),\n",
    "    category='es',\n",
    "    category_name='Spanish',\n",
    "    not_category_name='Non-Spanish',\n",
    "    use_full_doc=True,\n",
    "    term_ranker=st.OncePerDocFrequencyRanker,\n",
    "    sort_by_dist=False,\n",
    "    metadata=metadata,\n",
    "    width_in_pixels=1000\n",
    ")\n",
    "file_name = 'output/emoji_spanish-v-non.html'\n",
    "# Create the output directory on a fresh checkout and close the file\n",
    "# deterministically instead of leaking the handle.\n",
    "os.makedirs(os.path.dirname(file_name), exist_ok=True)\n",
    "with open(file_name, 'wb') as html_file:\n",
    "    html_file.write(html.encode('utf-8'))\n",
    "IFrame(src=file_name, width = 1200, height=700)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [py36]",
   "language": "python",
   "name": "Python [py36]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}