{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Using Scattertext to visualize Emoji usage by nationality on Twitter\n",
"\n",
"## DDSEA17: Understanding Cultures and Perspectives through Text and Emoji Visualization\n",
"\n",
"## @jasonkessler\n",
"\n",
"https://github.com/JasonKessler/scattertext\n",
"\n",
"\n",
"Cite as:\n",
"Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.\n",
"\n",
"Link to preprint: https://arxiv.org/abs/1703.00565\n",
"\n",
"```\n",
"@article{kessler2017scattertext,\n",
" author = {Kessler, Jason S.},\n",
" title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},\n",
" booktitle = {ACL System Demonstrations},\n",
" year = {2017},\n",
"}\n",
"```\n",
"\n",
"Data is from http://followthehashtag.com/datasets/"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%matplotlib inline\n",
"import io, json\n",
"from zipfile import ZipFile\n",
"import urllib.request\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import agefromname\n",
"import nltk\n",
"import spacy\n",
"\n",
"import scattertext as st\n",
"from scattertext import tweet_tokenzier_factory\n",
"from scattertext.termranking import OncePerDocFrequencyRanker\n",
"\n",
"from IPython.display import IFrame\n",
"from IPython.core.display import display, HTML\n",
"display(HTML(\"\"))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Build the spaCy English pipeline once; the tweet loaders apply `nlp` to each tweet's text.\n",
"nlp = spacy.en.English()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def load_us_tweets():\n",
"    \"\"\"Return the US geolocated-tweet DataFrame with a spaCy `parse` column.\n",
"\n",
"    The raw table is read from the local cache `usa_tweets.csv.gz` when it can\n",
"    be opened. Otherwise the followthehashtag.com zip archive is downloaded,\n",
"    its spreadsheet is loaded, and the raw table is written to the cache.\n",
"    The `parse` column (the result of calling `nlp` on each 'Tweet content'\n",
"    value) is always computed after loading and is never written to the cache.\n",
"    \"\"\"\n",
"    cache_path = 'usa_tweets.csv.gz'\n",
"    try:\n",
"        df = pd.read_csv(cache_path)\n",
"    except OSError:\n",
"        # Cache missing or unreadable: fall back to downloading the archive.\n",
"        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.\n",
"        url = 'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'\n",
"        # Close the HTTP response and the zip member deterministically.\n",
"        with urllib.request.urlopen(url) as response:\n",
"            archive_bytes = response.read()\n",
"        with ZipFile(io.BytesIO(archive_bytes)) as zf:\n",
"            with zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx') as sheet_file:\n",
"                df = pd.read_excel(sheet_file)\n",
"        df.to_csv(cache_path, index=False, compression='gzip')\n",
"    df['parse'] = df['Tweet content'].apply(nlp)\n",
"    return df\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def load_jp_tweets():\n",
"    \"\"\"Return the Tokyo geolocated-tweet DataFrame with a spaCy `parse` column.\n",
"\n",
"    The raw table is read from the local cache `jp_tweets.csv.gz` when it can\n",
"    be opened. Otherwise the followthehashtag.com zip archive is downloaded,\n",
"    the 'Stream' sheet of its spreadsheet is loaded, and the raw table is\n",
"    written to the cache.\n",
"\n",
"    The `parse` column is computed after loading and after the cache is\n",
"    written: parsed documents written to CSV are stored as plain text, so a\n",
"    cached `parse` column could not be read back as parsed documents.\n",
"    \"\"\"\n",
"    cache_path = 'jp_tweets.csv.gz'\n",
"    try:\n",
"        df = pd.read_csv(cache_path)\n",
"    except OSError:\n",
"        # Cache missing or unreadable: fall back to downloading the archive.\n",
"        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.\n",
"        url = 'http://followthehashtag.com/content/uploads/Tokyo-Geolocated-tweets-free-dataset-Followthehashtag.zip'\n",
"        # Close the HTTP response and the zip member deterministically.\n",
"        with urllib.request.urlopen(url) as response:\n",
"            archive_bytes = response.read()\n",
"        with ZipFile(io.BytesIO(archive_bytes)) as zf:\n",
"            with zf.open('export_dashboard_geocode_35_6894875_139_69170639999993_52km_2016_04_21_20_09_45.xlsx') as sheet_file:\n",
"                # NOTE(review): newer pandas releases spell this keyword `sheet_name`;\n",
"                # confirm against the installed version before upgrading.\n",
"                df = pd.read_excel(sheet_file, sheetname='Stream')\n",
"        df.to_csv(cache_path, index=False, compression='gzip')\n",
"    # Always (re)build the parses so callers get parsed documents, not cached text.\n",
"    df['parse'] = df['Tweet content'].apply(nlp)\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def load_uk_tweets():\n",
"    \"\"\"Return the UK geolocated-tweet DataFrame with a spaCy `parse` column.\n",
"\n",
"    The raw table is read from the local cache `uk_tweets.csv.gz` when it can\n",
"    be opened. Otherwise the followthehashtag.com zip archive is downloaded,\n",
"    the 'Stream' sheet of its spreadsheet is loaded, and the raw table is\n",
"    written to the cache.\n",
"\n",
"    The `parse` column is computed after loading and after the cache is\n",
"    written: parsed documents written to CSV are stored as plain text, so a\n",
"    cached `parse` column could not be read back as parsed documents.\n",
"    \"\"\"\n",
"    cache_path = 'uk_tweets.csv.gz'\n",
"    try:\n",
"        df = pd.read_csv(cache_path)\n",
"    except OSError:\n",
"        # Cache missing or unreadable: fall back to downloading the archive.\n",
"        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.\n",
"        url = 'http://followthehashtag.com/content/uploads/UK-Geolocated-tweets-free-dataset-Followthehashtag.zip'\n",
"        # Close the HTTP response and the zip member deterministically.\n",
"        with urllib.request.urlopen(url) as response:\n",
"            archive_bytes = response.read()\n",
"        with ZipFile(io.BytesIO(archive_bytes)) as zf:\n",
"            with zf.open('export_dashboard_x_uk_x_filter_nativeretweets_geocode_55_378051_3_43597299999999_750km_2016_04_21_10_32_03.xlsx') as sheet_file:\n",
"                # NOTE(review): newer pandas releases spell this keyword `sheet_name`;\n",
"                # confirm against the installed version before upgrading.\n",
"                df = pd.read_excel(sheet_file, sheetname='Stream')\n",
"        df.to_csv(cache_path, index=False, compression='gzip')\n",
"    # Always (re)build the parses so callers get parsed documents, not cached text.\n",
"    df['parse'] = df['Tweet content'].apply(nlp)\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Stack the Japan, US and UK tweet frames (in that order) into one DataFrame.\n",
"df = pd.concat([\n",
"    load_tweets()\n",
"    for load_tweets in (load_jp_tweets, load_us_tweets, load_uk_tweets)\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"JP 182331\n",
"US 178609\n",
"GB 155695\n",
"MX 20293\n",
"IE 10580\n",
"Name: Country, dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Five most frequent values of the Country column.\n",
"df['Country'].value_counts().head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ensure tweets originate in the US, the UK, or Japan"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Keep non-English tweets tagged JP and English tweets tagged US or GB.\n",
"df_clean = df[\n",
"    ((df['Country'] == 'JP') & (df['Tweet language (ISO 639-1)'] != 'en'))\n",
"    | (df['Country'].isin(['US', 'GB']) & (df['Tweet language (ISO 639-1)'] == 'en'))\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def twitter_metadata(df):\n",
"    \"\"\"Build one label per row of the form 'User Name (@Nickname) Date'.\n",
"\n",
"    Reads the 'User Name', 'Nickname' and 'Date' columns of `df` and returns\n",
"    their element-wise string concatenation; 'Date' is cast to str first.\n",
"    \"\"\"\n",
"    handle = ' (@' + df['Nickname'] + ') '\n",
"    date_text = df['Date'].astype(str)\n",
"    return df['User Name'] + handle + date_text\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Japan vs. US explorer, using st.FeatsFromSpacyDocOnlyEmoji as the feature extractor.\n",
"# Select the rows once so the corpus and its metadata come from the same subset.\n",
"us_jp_tweets = df_clean[df_clean.Country.isin(['US', 'JP'])]\n",
"html = st.produce_scattertext_explorer(\n",
"    corpus=st.CorpusFromParsedDocuments(\n",
"        us_jp_tweets,\n",
"        parsed_col='parse',\n",
"        category_col='Country',\n",
"        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()\n",
"    ).build(),\n",
"    category='JP',\n",
"    category_name='Japan',\n",
"    not_category_name='US',\n",
"    use_full_doc=True,\n",
"    term_ranker=st.OncePerDocFrequencyRanker,\n",
"    sort_by_dist=False,\n",
"    metadata=twitter_metadata(us_jp_tweets),\n",
"    width_in_pixels=1000\n",
")\n",
"file_name = 'output/emoji_japan-v-us.html'\n",
"# Close the file before the IFrame loads it so the full page is on disk.\n",
"with open(file_name, 'wb') as html_file:\n",
"    html_file.write(html.encode('utf-8'))\n",
"IFrame(src=file_name, width=1200, height=700)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# UK (Country == 'GB') vs. US explorer, using st.FeatsFromSpacyDocOnlyEmoji as the feature extractor.\n",
"# Select the rows once so the corpus and its metadata come from the same subset.\n",
"us_gb_tweets = df_clean[df_clean.Country.isin(['US', 'GB'])]\n",
"html = st.produce_scattertext_explorer(\n",
"    corpus=st.CorpusFromParsedDocuments(\n",
"        us_gb_tweets,\n",
"        parsed_col='parse',\n",
"        category_col='Country',\n",
"        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()\n",
"    ).build(),\n",
"    category='GB',\n",
"    category_name='UK',\n",
"    not_category_name='US',\n",
"    use_full_doc=True,\n",
"    term_ranker=st.OncePerDocFrequencyRanker,\n",
"    sort_by_dist=False,\n",
"    metadata=twitter_metadata(us_gb_tweets),\n",
"    width_in_pixels=1000\n",
")\n",
"file_name = 'output/emoji_uk-v-us.html'\n",
"# Close the file before the IFrame loads it so the full page is on disk.\n",
"with open(file_name, 'wb') as html_file:\n",
"    html_file.write(html.encode('utf-8'))\n",
"# Reuse file_name so the written path and the displayed path cannot drift apart.\n",
"IFrame(src=file_name, width=1200, height=700)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [py36]",
"language": "python",
"name": "Python [py36]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}