{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using Scattertext to Visualize Emoji Usage by Language (e.g., English or Spanish) on Twitter\n",
    "\n",
    "## DDSEA17: Understanding Cultures and Perspectives through Text and Emoji Visualization\n",
    "\n",
    "## @jasonkessler\n",
    "\n",
    "https://github.com/JasonKessler/scattertext\n",
    "\n",
    "\n",
    "Cite as:\n",
    "Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.\n",
    "\n",
    "Link to preprint: https://arxiv.org/abs/1703.00565\n",
    "\n",
    "```\n",
    "@article{kessler2017scattertext,\n",
    " author = {Kessler, Jason S.},\n",
    " title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},\n",
    " booktitle = {ACL System Demonstrations},\n",
    " year = {2017},\n",
    "}\n",
    "```\n",
    "\n",
    "Data is from http://followthehashtag.com/datasets/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       ""
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import io\n",
    "import os\n",
    "import urllib.request\n",
    "from zipfile import ZipFile\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scattertext as st\n",
    "import spacy\n",
    "from IPython.display import HTML, IFrame, display\n",
    "\n",
    "display(HTML(\"\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Local cache of the tweet table, and where to fetch it from when the cache is absent.\n",
    "TWEETS_CACHE = 'usa_tweets.csv.gz'\n",
    "TWEETS_ZIP_URL = 'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'\n",
    "TWEETS_SHEET = 'dashboard_x_usa_x_filter_nativeretweets.xlsx'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# NOTE(review): `spacy.en` looks like the spaCy 1.x entry point; newer spaCy\n",
    "# releases load models through `spacy.load(...)` -- confirm the pinned version.\n",
    "nlp = spacy.en.English()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and parse the tweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def name_token(user_name, position, min_tokens):\n",
    "    \"\"\"Return the lower-cased whitespace-separated token of `user_name` at `position`.\n",
    "\n",
    "    Values that are not strings, or that split into fewer than `min_tokens`\n",
    "    tokens, are returned unchanged.\n",
    "    \"\"\"\n",
    "    if isinstance(user_name, str):\n",
    "        tokens = user_name.split()\n",
    "        if len(tokens) >= min_tokens:\n",
    "            return tokens[position].lower()\n",
    "    return user_name\n",
    "\n",
    "\n",
    "def load_tweets():\n",
    "    \"\"\"Return the tweet table, downloading and caching it when no cache is readable.\n",
    "\n",
    "    Reads `TWEETS_CACHE` when possible. Otherwise downloads the zip archive at\n",
    "    `TWEETS_ZIP_URL`, reads the `TWEETS_SHEET` workbook from it, adds the\n",
    "    `first_name` / `last_name` columns derived from `User Name`, and writes the\n",
    "    result to `TWEETS_CACHE` as gzipped CSV.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return pd.read_csv(TWEETS_CACHE)\n",
    "    except OSError:\n",
    "        # Only a missing/unreadable cache triggers the download; other errors\n",
    "        # (typos, interrupts, parse failures) are no longer silently swallowed.\n",
    "        pass\n",
    "    with urllib.request.urlopen(TWEETS_ZIP_URL) as response:\n",
    "        archive = io.BytesIO(response.read())\n",
    "    with ZipFile(archive) as zf:\n",
    "        tweets = pd.read_excel(zf.open(TWEETS_SHEET))\n",
    "    tweets['first_name'] = tweets['User Name'].apply(lambda name: name_token(name, 0, 1))\n",
    "    tweets['last_name'] = tweets['User Name'].apply(lambda name: name_token(name, -1, 2))\n",
    "    tweets.to_csv(TWEETS_CACHE, index=False, compression='gzip')\n",
    "    return tweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = load_tweets()\n",
    "df['parse'] = df['Tweet content'].apply(nlp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "en 172206\n",
       "es 27062\n",
       "fr 1695\n",
       "it 959\n",
       "pt 737\n",
       "de 636\n",
       "da 442\n",
       "nl 282\n",
       "ru 277\n",
       "sv 271\n",
       "fi 125\n",
       "tr 91\n",
       "hu 37\n",
       "Name: Tweet language (ISO 639-1), dtype: int64"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Tweet language (ISO 639-1)'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Binary category columns: the language of interest vs. everything else.\n",
    "df['english'] = np.where(df['Tweet language (ISO 639-1)'] == 'en', 'en', 'non-en')\n",
    "df['spanish'] = np.where(df['Tweet language (ISO 639-1)'] == 'es', 'es', 'non-es')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Emoji usage: one language vs. the rest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_emoji_corpus(tweets, category_col):\n",
    "    \"\"\"Build a Scattertext corpus from the parsed tweets using emoji-only features.\n",
    "\n",
    "    `tweets` must have a `parse` column of spaCy documents; `category_col`\n",
    "    names the column holding each tweet's category label.\n",
    "    \"\"\"\n",
    "    return st.CorpusFromParsedDocuments(\n",
    "        tweets,\n",
    "        parsed_col='parse',\n",
    "        category_col=category_col,\n",
    "        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()\n",
    "    ).build()\n",
    "\n",
    "\n",
    "def save_emoji_explorer(corpus, category, category_name, not_category_name,\n",
    "                        metadata, file_name):\n",
    "    \"\"\"Render a Scattertext explorer, write it to `file_name`, and return an IFrame of it.\n",
    "\n",
    "    `corpus`, `category`, `category_name`, `not_category_name` and `metadata`\n",
    "    are forwarded to `st.produce_scattertext_explorer`. The HTML is written\n",
    "    UTF-8 encoded; the directory part of `file_name` is created if needed.\n",
    "    \"\"\"\n",
    "    html = st.produce_scattertext_explorer(\n",
    "        corpus=corpus,\n",
    "        category=category,\n",
    "        category_name=category_name,\n",
    "        not_category_name=not_category_name,\n",
    "        use_full_doc=True,\n",
    "        term_ranker=st.OncePerDocFrequencyRanker,\n",
    "        sort_by_dist=False,\n",
    "        metadata=metadata,\n",
    "        width_in_pixels=1000\n",
    "    )\n",
    "    out_dir = os.path.dirname(file_name)\n",
    "    if out_dir:\n",
    "        # A fresh checkout has no output directory yet.\n",
    "        os.makedirs(out_dir, exist_ok=True)\n",
    "    with open(file_name, 'wb') as out_file:\n",
    "        out_file.write(html.encode('utf-8'))\n",
    "    return IFrame(src=file_name, width=1200, height=700)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-tweet label passed to the explorers as document metadata.\n",
    "metadata = (df['User Name']\n",
    "            + ' (@' + df['Nickname'] + ') '\n",
    "            + df['Date'].astype(str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['en', 'non-en']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "corpus = build_emoji_corpus(df, 'english')\n",
    "corpus.get_categories()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "save_emoji_explorer(\n",
    "    corpus,\n",
    "    category='en',\n",
    "    category_name='English',\n",
    "    not_category_name='Non-English',\n",
    "    metadata=metadata,\n",
    "    file_name='output/emoji_english-v-non.html'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "save_emoji_explorer(\n",
    "    build_emoji_corpus(df, 'spanish'),\n",
    "    category='es',\n",
    "    category_name='Spanish',\n",
    "    not_category_name='Non-Spanish',\n",
    "    metadata=metadata,\n",
    "    file_name='output/emoji_spanish-v-non.html'\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [py36]",
   "language": "python",
   "name": "Python [py36]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}