{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Using Scattertext to Visualize Emoji Usage by Language (e.g., English or Spanish) on Twitter\n",
"\n",
"## DDSEA17: Understanding Cultures and Perspectives through Text and Emjoi Visualization\n",
"\n",
"## @jasonkessler\n",
"\n",
"https://github.com/JasonKessler/scattertext\n",
"\n",
"\n",
"Cite as:\n",
"Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.\n",
"\n",
"Link to preprint: https://arxiv.org/abs/1703.00565\n",
"\n",
"`\n",
"@article{kessler2017scattertext,\n",
" author = {Kessler, Jason S.},\n",
" title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},\n",
" booktitle = {ACL System Demonstrations},\n",
" year = {2017},\n",
"}\n",
"`\n",
"\n",
"Data is from http://followthehashtag.com/datasets/"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import scattertext as st\n",
"import pandas as pd\n",
"import numpy as np\n",
"import spacy\n",
"\n",
"from IPython.display import IFrame\n",
"from IPython.core.display import display, HTML\n",
"display(HTML(\"\"))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"nlp = spacy.en.English()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"try:\n",
" df = pd.read_csv('usa_tweets.csv.gz')\n",
"except:\n",
" with ZipFile(io.BytesIO(urllib.request.urlopen(\n",
" 'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'\n",
" ).read())) as zf:\n",
" df = pd.read_excel(zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))\n",
" df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')\n",
" df['first_name'] = df['User Name'].apply(\n",
" lambda x: x.split()[0].lower() if type(x) == str and len(x.split()) > 0 else x)\n",
" df['last_name'] = df['User Name'].apply(\n",
" lambda x: x.split()[-1].lower() if type(x) == str and len(x.split()) > 1 else x)\n",
" df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')\n",
"df['parse'] = df['Tweet content'].apply(nlp)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"en 172206\n",
"es 27062\n",
"fr 1695\n",
"it 959\n",
"pt 737\n",
"de 636\n",
"da 442\n",
"nl 282\n",
"ru 277\n",
"sv 271\n",
"fi 125\n",
"tr 91\n",
"hu 37\n",
"Name: Tweet language (ISO 639-1), dtype: int64"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Tweet language (ISO 639-1)'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df['english'] = df['Tweet language (ISO 639-1)'].apply(lambda x: 'en' if x == 'en' else 'non-en')\n",
"df['spanish'] = df['Tweet language (ISO 639-1)'].apply(lambda x: 'es' if x == 'es' else 'non-es')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['en', 'non-en']"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"corpus.get_categories()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metadata= (df['User Name']\n",
" + ' (@' + df['Nickname'] + ') '\n",
" + df['Date'].astype(str))\n",
"\n",
"html = st.produce_scattertext_explorer(\n",
" corpus = st.CorpusFromParsedDocuments(\n",
" df,\n",
" parsed_col='parse',\n",
" category_col='english',\n",
" feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()\n",
" ).build(),\n",
" category='en',\n",
" category_name='English',\n",
" not_category_name='Non-English',\n",
" use_full_doc=True,\n",
" term_ranker=st.OncePerDocFrequencyRanker,\n",
" sort_by_dist=False,\n",
" metadata=metadata,\n",
" width_in_pixels=1000\n",
")\n",
"file_name = 'output/emoji_english-v-non.html'\n",
"open(file_name, 'wb').write(html.encode('utf-8'))\n",
"IFrame(src=file_name, width = 1200, height=700)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metadata= (df['User Name']\n",
" + ' (@' + df['Nickname'] + ') '\n",
" + df['Date'].astype(str))\n",
"\n",
"html = st.produce_scattertext_explorer(\n",
" corpus = st.CorpusFromParsedDocuments(\n",
" df,\n",
" parsed_col='parse',\n",
" category_col='spanish',\n",
" feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()\n",
" ).build(),\n",
" category='es',\n",
" category_name='Spanish',\n",
" not_category_name='Non-Spanish',\n",
" use_full_doc=True,\n",
" term_ranker=st.OncePerDocFrequencyRanker,\n",
" sort_by_dist=False,\n",
" metadata=metadata,\n",
" width_in_pixels=1000\n",
")\n",
"file_name = 'output/emoji_spanish-v-non.html'\n",
"open(file_name, 'wb').write(html.encode('utf-8'))\n",
"IFrame(src=file_name, width = 1200, height=700)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [py36]",
"language": "python",
"name": "Python [py36]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}