{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Using Scattertext to visualize Emoji usage by gender and heritage on Twitter\n", "\n", "## DDSEA17: Understanding Cultures and Perspectives through Text and Emoji Visualization\n", "\n", "## @jasonkessler\n", "\n", "https://github.com/JasonKessler/scattertext\n", "\n", "\n", "Cite as:\n", "Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.\n", "\n", "Link to preprint: https://arxiv.org/abs/1703.00565\n", "\n", "`\n", "@article{kessler2017scattertext,\n", " author = {Kessler, Jason S.},\n", " title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},\n", " booktitle = {ACL System Demonstrations},\n", " year = {2017},\n", "}\n", "`\n", "\n", "Data is from http://followthehashtag.com/datasets/\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "import io, json\n", "from zipfile import ZipFile\n", "import urllib.request\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import agefromname\n", "import nltk\n", "\n", "import imp\n", "import scattertext as st\n", "from scattertext import tweet_tokenzier_factory\n", "from scattertext.termranking import OncePerDocFrequencyRanker\n", "\n", "from IPython.display import IFrame\n", "from IPython.core.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [
"TWEETS_CACHE = 'usa_tweets.csv.gz'\n",
"TWEETS_URL = 'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'\n",
"\n",
"\n",
"def name_token(user_name, position, min_tokens):\n",
"    \"\"\"Return the lower-cased whitespace token of `user_name` at `position`.\n",
"\n",
"    Values that are not strings, or that split into fewer than `min_tokens`\n",
"    tokens, are returned unchanged.\n",
"    \"\"\"\n",
"    if isinstance(user_name, str):\n",
"        tokens = user_name.split()\n",
"        if len(tokens) >= min_tokens:\n",
"            return tokens[position].lower()\n",
"    return user_name\n",
"\n",
"\n",
"try:\n",
"    # Fast path: reuse the cache written by an earlier run of this cell.\n",
"    df = pd.read_csv(TWEETS_CACHE)\n",
"except FileNotFoundError:\n",
"    # Only a missing cache triggers the download. A bare `except:` would also\n",
"    # hide typos, corrupt files and Ctrl-C behind a slow network fetch.\n",
"    with urllib.request.urlopen(TWEETS_URL) as response:\n",
"        archive = io.BytesIO(response.read())\n",
"    with ZipFile(archive) as zf:\n",
"        df = pd.read_excel(zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))\n",
"    # The first token is the first name; the last token only counts as a\n",
"    # last name when the display name has at least two tokens.\n",
"    df['first_name'] = df['User Name'].apply(lambda name: name_token(name, 0, 1))\n",
"    df['last_name'] = df['User Name'].apply(lambda name: name_token(name, -1, 2))\n",
"    df.to_csv(TWEETS_CACHE, index=False, compression='gzip')"
] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameUser NameNicknameTweet content
0billschulhoffBill SchulhoffBillSchulhoffWind 3.2 mph NNE. Barometer 30.20 in, Rising s...
1danielepolisDaniele PolisdanipolisPausa pro café antes de embarcar no próximo vô...
2kaseyjacobsKasey JacobsKJacobs27Good. Morning. #morning #Saturday #diner #VT #...
\n", "
" ], "text/plain": [ " first_name last_name User Name Nickname \\\n", "0 bill schulhoff Bill Schulhoff BillSchulhoff \n", "1 daniele polis Daniele Polis danipolis \n", "2 kasey jacobs Kasey Jacobs KJacobs27 \n", "\n", " Tweet content \n", "0 Wind 3.2 mph NNE. Barometer 30.20 in, Rising s... \n", "1 Pausa pro café antes de embarcar no próximo vô... \n", "2 Good. Morning. #morning #Saturday #diner #VT #... " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Peek at the name columns derived from each account's display name.\n",
"preview_cols = ['first_name', 'last_name', 'User Name', 'Nickname', 'Tweet content']\n",
"df[preview_cols].head(3)"
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hiloprob
first_name
aaban1.0000009.574095e-011.0
aabha0.121295-1.387779e-170.0
aabid1.0000005.628005e-011.0
\n", "
" ], "text/plain": [ " hi lo prob\n", "first_name \n", "aaban 1.000000 9.574095e-01 1.0\n", "aabha 0.121295 -1.387779e-17 0.0\n", "aabid 1.000000 5.628005e-01 1.0" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Per-first-name probability of being male, indexed by lower-cased name.\n",
"male_prob = agefromname.AgeFromName().get_all_name_male_prob()\n",
"male_prob.head(3)"
] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [
"# Inner join: tweets whose first name has no entry in `male_prob` are dropped.\n",
"df_aug = df.merge(male_prob, left_on='first_name', right_index=True)\n",
"\n",
"# Label only near-certain names; everything in between is '?'.\n",
"is_male = df_aug['prob'] > 0.9\n",
"is_female = df_aug['prob'] < 0.1\n",
"df_aug['gender'] = np.select([is_male, is_female], ['m', 'f'], default='?')\n",
"\n",
"# Keep the confidently labelled rows and cache them for the next cell.\n",
"df_mf = df_aug[is_male | is_female]\n",
"df_mf.to_csv('emoji_data.csv', index=False)"
] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [
"# Reload from the cache so later cells work on a fresh frame, not a slice of df_aug.\n",
"df_mf = pd.read_csv('emoji_data.csv')"
] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderfirst_nameUser NameNicknameTweet content
0mbillBill SchulhoffBillSchulhoffWind 3.2 mph NNE. Barometer 30.20 in, Rising s...
1mbillBill S KenneyBillSKenneyPlanning the new focuslabllc website with the ...
2mbillBill PendleyBILLPENDLEY#bibleverseoftheday @ Bill The Mortgage Guy ...
3mbillBill Culverrilla6969Start Wars Dark Side Challenge race number one...
4mbillBill EsparzastreetgourmetlaSpinach fusilli by @chef_timothy. A pre #mexic...
5mbillBill MeadowsBillMeadows305https://t.co/N8E5aTvIIN
\n", "
" ], "text/plain": [ " gender first_name User Name Nickname \\\n", "0 m bill Bill Schulhoff BillSchulhoff \n", "1 m bill Bill S Kenney BillSKenney \n", "2 m bill Bill Pendley BILLPENDLEY \n", "3 m bill Bill Culver rilla6969 \n", "4 m bill Bill Esparza streetgourmetla \n", "5 m bill Bill Meadows BillMeadows305 \n", "\n", " Tweet content \n", "0 Wind 3.2 mph NNE. Barometer 30.20 in, Rising s... \n", "1 Planning the new focuslabllc website with the ... \n", "2 #bibleverseoftheday @ Bill The Mortgage Guy ... \n", "3 Start Wars Dark Side Challenge race number one... \n", "4 Spinach fusilli by @chef_timothy. A pre #mexic... \n", "5 https://t.co/N8E5aTvIIN " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
"df_mf[['gender', 'first_name', 'User Name', 'Nickname', 'Tweet content']].head(6)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Gender breakdown of Twitter Users\n", "### According to Pew, the gender breakdown of American adults on Twitter was about even\n", "- 24% of online men, 25% of online women\n", "- Source: http://www.pewinternet.org/2016/11/11/social-media-update-2016/\n", "\n", "### However, among users with a gender-identifying first name, about 56% appear to be male" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "m 28159\n", "f 21844\n", "Name: gender, dtype: int64\n", "m 0.563146\n", "f 0.436854\n", "Name: gender, dtype: float64\n" ] } ], "source": [
"# Count each account once so prolific tweeters are not over-represented.\n",
"# Computed a single time and reused for both the counts and the shares.\n",
"users_by_gender = df_mf[['Nickname', 'gender']].drop_duplicates()['gender'].value_counts()\n",
"print(users_by_gender)\n",
"print(users_by_gender / users_by_gender.sum())"
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 Wind 3.2 mph NNE. Barometer 30.20 in, Rising s...\n", "1 Planning the new focuslabllc website with the ...\n", "2 #bibleverseoftheday @ Bill The Mortgage Guy ...\n", "3 Start Wars Dark Side Challenge race number one...\n", "4 Spinach fusilli by @chef_timothy. A pre #mexic...\n", "Name: Tweet content, dtype: object" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [
"df_mf['Tweet content'].head(5)"
] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [
"# NOTE: `tweet_tokenzier_factory` is spelled this way in the scattertext import above.\n",
"nlp = st.tweet_tokenzier_factory(nltk.tokenize.TweetTokenizer())\n",
"df_mf['parse'] = df_mf['Tweet content'].apply(nlp)"
] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [
"# Corpus over the parsed tweets, categorised by gender, with the emoji-only\n",
"# feature extractor.\n",
"corpus = st.CorpusFromParsedDocuments(\n",
"    df_mf,\n",
"    parsed_col='parse',\n",
"    category_col='gender',\n",
"    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()\n",
").build()"
] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Per-tweet label: display name, handle and date. Also reused by the next cell.\n",
"metadata = (df_mf['User Name']\n",
"            + ' (@' + df_mf['Nickname'] + ') '\n",
"            + df_mf['Date'].astype(str))\n",
"html = st.produce_scattertext_explorer(\n",
"    corpus,\n",
"    category='f',\n",
"    category_name='Female',\n",
"    not_category_name='Male',\n",
"    use_full_doc=True,\n",
"    term_ranker=OncePerDocFrequencyRanker,\n",
"    sort_by_dist=False,\n",
"    metadata=metadata,\n",
"    width_in_pixels=1000\n",
")\n",
"\n",
"file_name = 'output/emoji_gender_scattertext.html'\n",
"# `with` closes (and flushes) the file before the IFrame tries to load it.\n",
"with open(file_name, 'wb') as html_file:\n",
"    html_file.write(html.encode('utf-8'))\n",
"IFrame(src=file_name, width=1200, height=700)"
] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type":
"execute_result" } ], "source": [
"html = st.produce_fightin_words_explorer(\n",
"    corpus,\n",
"    category='f',\n",
"    category_name='Female',\n",
"    not_category_name='Male',\n",
"    term_ranker=OncePerDocFrequencyRanker,\n",
"    metadata=metadata\n",
")\n",
"file_name = 'output/emoji_gender_lorp.html'\n",
"# `with` closes (and flushes) the file before the IFrame tries to load it.\n",
"with open(file_name, 'wb') as html_file:\n",
"    html_file.write(html.encode('utf-8'))\n",
"IFrame(src=file_name, width=1200, height=700)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Use US Census data to find last names that are associated with a particular heritage\n", "From https://www.census.gov/data/developers/data-sets/surnames.2010.html" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [
"# A surname gets a heritage label only when a single PCT* column exceeds\n",
"# this percentage.\n",
"HERITAGE_MIN_PCT = 85\n",
"# PCT* column suffix -> display label. Suffixes not listed here (e.g. '2PRACE')\n",
"# end up as NaN.\n",
"HERITAGE_LABELS = {'AIAN': 'Native American',\n",
"                   'API': 'Asian',\n",
"                   'BLACK': 'African American',\n",
"                   'HISPANIC': 'Hispanic',\n",
"                   'WHITE': 'White'}\n",
"\n",
"url ='https://api.census.gov/data/2010/surname?get=NAME,COUNT,CUM_PROP100K,PCT2PRACE,PCTAIAN,PCTAPI,PCTBLACK,PCTHISPANIC,PCTWHITE,PROP100K&RANK=1:200000'\n",
"# urllib.request is imported in the first code cell; keep the connection open\n",
"# only for as long as it takes to read the payload.\n",
"with urllib.request.urlopen(url) as response:\n",
"    rows = json.loads(response.read().decode('utf-8'))\n",
"\n",
"# rows[0] is the header row. '(S)' cells are replaced with 0 so that every\n",
"# column can be cast to float.\n",
"name_df = (pd.DataFrame(rows[1:], columns=rows[0])\n",
"           .set_index('NAME')\n",
"           .replace('(S)', 0)\n",
"           .astype(float))\n",
"# Lower-case the surnames to match df['last_name'].\n",
"name_df.index = name_df.index.str.lower()\n",
"\n",
"# Dominant PCT* column per surname, computed once and column-wise instead of\n",
"# twice per row inside an apply().\n",
"pct_cols = [col for col in name_df.columns if col[:3] == 'PCT']\n",
"top_pct = name_df[pct_cols].max(axis=1)\n",
"top_suffix = name_df[pct_cols].idxmax(axis=1).str[3:]\n",
"name_df['heritage'] = top_suffix.where(top_pct > HERITAGE_MIN_PCT).map(HERITAGE_LABELS)"
] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COUNTCUM_PROP100KPCT2PRACEPCTAIANPCTAPIPCTBLACKPCTHISPANICPCTWHITEPROP100KRANKheritage
NAME
garcia1166120.03400.120.260.471.410.4592.035.38395.326.0Hispanic
rodriguez1094924.04543.500.180.180.570.5493.774.75371.199.0Hispanic
martinez1060159.04902.900.220.510.600.4992.915.28359.4010.0Hispanic
hernandez1043281.05256.580.160.190.600.3694.893.79353.6811.0Hispanic
lopez874523.05553.050.250.381.020.5792.924.86296.4712.0Hispanic
\n", "
" ], "text/plain": [ " COUNT CUM_PROP100K PCT2PRACE PCTAIAN PCTAPI PCTBLACK \\\n", "NAME \n", "garcia 1166120.0 3400.12 0.26 0.47 1.41 0.45 \n", "rodriguez 1094924.0 4543.50 0.18 0.18 0.57 0.54 \n", "martinez 1060159.0 4902.90 0.22 0.51 0.60 0.49 \n", "hernandez 1043281.0 5256.58 0.16 0.19 0.60 0.36 \n", "lopez 874523.0 5553.05 0.25 0.38 1.02 0.57 \n", "\n", " PCTHISPANIC PCTWHITE PROP100K RANK heritage \n", "NAME \n", "garcia 92.03 5.38 395.32 6.0 Hispanic \n", "rodriguez 93.77 4.75 371.19 9.0 Hispanic \n", "martinez 92.91 5.28 359.40 10.0 Hispanic \n", "hernandez 94.89 3.79 353.68 11.0 Hispanic \n", "lopez 92.92 4.86 296.47 12.0 Hispanic " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Five most common surnames whose PCTHISPANIC value is above 85.\n",
"mostly_hispanic = name_df.loc[name_df['PCTHISPANIC'] > 85]\n",
"mostly_hispanic.sort_values(by='COUNT', ascending=False).head(5)"
] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COUNTCUM_PROP100KPCT2PRACEPCTAIANPCTAPIPCTBLACKPCTHISPANICPCTWHITEPROP100KRANKheritage
NAME
washington177386.020370.633.780.680.3087.532.545.1760.14145.0African American
pierre33913.041272.382.230.920.3186.742.757.0511.501026.0African American
smalls12435.053820.352.760.280.2390.492.463.784.222888.0African American
jeanbaptiste7915.059139.712.500.130.2194.042.150.972.684483.0African American
diallo7502.059784.561.550.090.1295.640.761.842.544730.0African American
\n", "
" ], "text/plain": [ " COUNT CUM_PROP100K PCT2PRACE PCTAIAN PCTAPI PCTBLACK \\\n", "NAME \n", "washington 177386.0 20370.63 3.78 0.68 0.30 87.53 \n", "pierre 33913.0 41272.38 2.23 0.92 0.31 86.74 \n", "smalls 12435.0 53820.35 2.76 0.28 0.23 90.49 \n", "jeanbaptiste 7915.0 59139.71 2.50 0.13 0.21 94.04 \n", "diallo 7502.0 59784.56 1.55 0.09 0.12 95.64 \n", "\n", " PCTHISPANIC PCTWHITE PROP100K RANK heritage \n", "NAME \n", "washington 2.54 5.17 60.14 145.0 African American \n", "pierre 2.75 7.05 11.50 1026.0 African American \n", "smalls 2.46 3.78 4.22 2888.0 African American \n", "jeanbaptiste 2.15 0.97 2.68 4483.0 African American \n", "diallo 0.76 1.84 2.54 4730.0 African American " ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Five most common surnames whose PCTBLACK value is above 85.\n",
"mostly_black = name_df.loc[name_df['PCTBLACK'] > 85]\n",
"mostly_black.sort_values(by='COUNT', ascending=False).head(5)"
] }, { "cell_type": "code", "execution_count": 18, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "heritage\n", "Native American 98523.0\n", "African American 1343997.0\n", "Asian 7336493.0\n", "Hispanic 36248570.0\n", "White 92111160.0\n", "N/A 128528485.0\n", "Name: COUNT, dtype: float64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Sum of the COUNT column per heritage label; surnames without a label are\n",
"# grouped under 'N/A'.\n",
"count_by_heritage = name_df.fillna('N/A').groupby('heritage')['COUNT'].sum()\n",
"count_by_heritage.sort_values()"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Self-identified race in the US (via: https://en.wikipedia.org/wiki/Race_and_ethnicity_in_the_United_States)\n", "- White alone: 72.4%\n", "- Hispanic and Latino Americans (of any race): 16.3%\n", "- Black or African American: 12.6%\n", "- Asian: 4.8%\n", "- Native American and Alaska Natives: 0.9%\n", "- Native Hawaiians and Other Pacific Islanders: 0.2%\n", "- Two or more races: 2.9%\n", "- Some other race: 6.2%" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "heritage\n", "Native American 0.000718\n",
"African American 0.009800\n", "Asian 0.053497\n", "Hispanic 0.264320\n", "White 0.671664\n", "Name: COUNT, dtype: float64" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Share of each heritage among surnames that received a label. The grouped\n",
"# sum is computed once and reused as the denominator.\n",
"labelled_counts = name_df.dropna().groupby('heritage')['COUNT'].sum().sort_values()\n",
"labelled_counts / labelled_counts.sum()"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Caution!!\n", "- Hispanics are overrepresented (26.4% of Americans w/ heritage-identifiable names vs 16.3% of population)\n", "- African Americans are far underrepresented (1.0% of Americans w/ heritage-identifiable names vs 12.6% of population)\n", "- Whites and Asians have shares similar to their shares of the population\n", "- Native Americans are very difficult to identify" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [
"# Inner join on last name: tweets whose surname has no heritage label are dropped.\n",
"df_mf_heritage = pd.merge(df_mf, name_df[['heritage']].dropna(),\n",
"                          left_on='last_name', right_index=True, how='inner')"
] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "White 22176\n", "Hispanic 9188\n", "Asian 1249\n", "African American 166\n", "Native American 8\n", "Name: heritage, dtype: int64" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [
"df_mf_heritage['heritage'].value_counts()"
] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": true }, "outputs": [], "source": [
"# One-vs-rest columns ('White' / 'Not White', ...) used as corpus categories.\n",
"for heritage in ('White', 'Hispanic', 'Asian'):\n",
"    df_mf_heritage['Is-' + heritage] = df_mf_heritage['heritage'].where(\n",
"        df_mf_heritage['heritage'] == heritage, 'Not ' + heritage)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Emojis white people like" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data":
{ "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [
"def show_heritage_emoji_explorer(heritage, file_name):\n",
"    \"\"\"Build, save and embed an emoji Scattertext plot of `heritage` vs. the rest.\n",
"\n",
"    `heritage` must have a matching 'Is-<heritage>' column in df_mf_heritage\n",
"    (e.g. 'White' -> 'Is-White'). The explorer HTML is written to `file_name`\n",
"    and an IFrame pointing at that file is returned for display.\n",
"    \"\"\"\n",
"    # Kept local on purpose: the gender cells own the global `metadata`, and\n",
"    # overwriting it with this differently sized series would break them on re-run.\n",
"    doc_labels = (df_mf_heritage['User Name']\n",
"                  + ' (@' + df_mf_heritage['Nickname'] + ') '\n",
"                  + df_mf_heritage['Date'].astype(str))\n",
"    heritage_corpus = st.CorpusFromParsedDocuments(\n",
"        df_mf_heritage,\n",
"        parsed_col='parse',\n",
"        category_col='Is-' + heritage,\n",
"        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()\n",
"    ).build()\n",
"    html = st.produce_scattertext_explorer(\n",
"        corpus=heritage_corpus,\n",
"        category=heritage,\n",
"        category_name=heritage,\n",
"        not_category_name='Not-' + heritage,\n",
"        use_full_doc=True,\n",
"        term_ranker=OncePerDocFrequencyRanker,\n",
"        sort_by_dist=False,\n",
"        metadata=doc_labels,\n",
"        width_in_pixels=1000,\n",
"        max_docs_per_category=1000\n",
"    )\n",
"    # `with` closes (and flushes) the file before the IFrame tries to load it.\n",
"    with open(file_name, 'wb') as html_file:\n",
"        html_file.write(html.encode('utf-8'))\n",
"    return IFrame(src=file_name, width=1200, height=700)\n",
"\n",
"\n",
"show_heritage_emoji_explorer('White', 'output/emoji_white_v_all.html')"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Emojis Hispanic people like" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Same plot as above for Hispanic surnames; the helper is defined in the previous code cell.\n",
"show_heritage_emoji_explorer('Hispanic', 'output/emoji_hispanic_v_all.html')"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [py36]", "language": "python", "name": "Python [py36]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }