{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Analysis of Tweets about Ireland's Eighth Amendment Referendum"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pyspark\n",
    "from pyspark.sql import SQLContext\n",
    "\n",
    "# Add the elasticsearch-hadoop jar. Set this before the SparkContext is created so the\n",
    "# jar is on the classpath when the context starts.\n",
    "os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/jovyan/elasticsearch-hadoop-6.2.2.jar pyspark-shell'\n",
    "\n",
    "conf = pyspark.SparkConf()\n",
    "# Point to the master.\n",
    "conf.setMaster(\"spark://tweetsets.library.gwu.edu:7101\")\n",
    "conf.setAppName(\"ireland-8th-analysis\")\n",
    "conf.set(\"spark.driver.bindAddress\", \"0.0.0.0\")\n",
    "# Don't hog all of the cores.\n",
    "conf.set(\"spark.cores.max\", \"3\")\n",
    "# Specify a port for the block manager (which runs as part of the worker).\n",
    "# The range 7003-7028 is set to be open in the Spark worker container.\n",
    "conf.set(\"spark.blockManager.port\", \"7003\")\n",
    "\n",
    "# Create the context. getOrCreate() returns the context already running in this kernel\n",
    "# if there is one, so re-running this cell does not fail on a second SparkContext.\n",
    "# Note: when an existing context is reused, the settings in conf above are not re-applied.\n",
    "sc = pyspark.SparkContext.getOrCreate(conf=conf)\n",
    "\n",
    "# Configure for ElasticSearch cluster and index.\n",
    "# es.read.field.as.array.include lists the fields the connector should read as arrays.\n",
    "es_conf = {\"es.nodes\": \"tweetsets.library.gwu.edu\",\n",
    "           \"es.port\": \"9200\",\n",
    "           \"es.resource\": \"tweets-ba2157/doc\",\n",
    "           \"es.read.field.as.array.include\": \"hashtags,text,urls\"}\n",
    "\n",
    "# Load the index as a DataFrame and expose it to Spark SQL as the table \"tweets\".\n",
    "sqlContext = SQLContext(sc)\n",
    "tweets_df = sqlContext.read.format(\"org.elasticsearch.spark.sql\").options(**es_conf).load()\n",
    "tweets_df.createOrReplaceTempView(\"tweets\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "478303"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total number of rows in tweets_df, the DataFrame loaded from the Elasticsearch index.\n",
    "tweets_df.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top hashtags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------+--------------+\n",
      "|hashtag            |count(hashtag)|\n",
      "+-------------------+--------------+\n",
      "|repealthe8th       |84179         |\n",
      "|together4yes       |57253         |\n",
      "|savethe8th         |43656         |\n",
      "|8thref             |35935         |\n",
      "|togetherforyes     |18763         |\n",
      "|lovebothvoteno     |18290         |\n",
      "|voteyes            |8245          |\n",
      "|loveboth           |8109          |\n",
      "|latelate           |5920          |\n",
      "|men4yes            |5884          |\n",
      "|votenotoabortion   |5732          |\n",
      "|latelateshow       |5584          |\n",
      "|voteno             |5324          |\n",
      "|prolife            |4596          |\n",
      "|hometovote         |4342          |\n",
      "|lifecanvass        |3637          |\n",
      "|trustwomen         |3483          |\n",
      "|repeal             |3450          |\n",
      "|ireland            |3192          |\n",
      "|mybodymychoice     |2903          |\n",
      "|私は黙らない0428         |2879          |\n",
      "|repealth8th        |2614          |\n",
      "|jointherebellion   |2602          |\n",
      "|abortion           |2463          |\n",
      "|repeal8th          |2258          |\n",
      "|rtept              |1722          |\n",
      "|repealtheeighth    |1601          |\n",
      "|register4yes       |1445          |\n",
      "|prochoice          |1423          |\n",
      "|repealfacts        |1360          |\n",
      "|standupforlife     |1322          |\n",
      "|itstime            |1293          |\n",
      "|8thamendment       |1283          |\n",
      "|praytoendabortion  |1196          |\n",
      "|womensayno2abortion|1050          |\n",
      "|wakeupireland      |1049          |\n",
      "|ourfuture          |1049          |\n",
      "|savelives          |980           |\n",
      "|menforyes          |957           |\n",
      "|scotref            |921           |\n",
      "|chooselife         |912           |\n",
      "|yes                |899           |\n",
      "|corksaysyes        |893           |\n",
      "|tinylivesatstake   |863           |\n",
      "|time4choice        |832           |\n",
      "|istandwithnicola   |820           |\n",
      "|repealmobile       |758           |\n",
      "|votenoroadshow     |757           |\n",
      "|loveboats          |736           |\n",
      "|studentsforchoice  |646           |\n",
      "+-------------------+--------------+\n",
      "only showing top 50 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "hashtags_df = sqlContext.sql(\"SELECT hashtag, count(hashtag) from (SELECT explode(hashtags) hashtag FROM tweets) group by hashtag order by count(hashtag) desc\")\n",
    "hashtags_df.show(50, truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top users by all tweet types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------+-----------------------+\n",
      "|user_screen_name|count(user_screen_name)|\n",
      "+----------------+-----------------------+\n",
      "|kaydnan         |3449                   |\n",
      "|Irishprolifer   |2783                   |\n",
      "|ShannonBlue     |2433                   |\n",
      "|mobyrne100      |2375                   |\n",
      "|EmmaMurphy12150 |2278                   |\n",
      "|Declan1497      |2225                   |\n",
      "|MaryOGrady8     |2209                   |\n",
      "|BernadetteComm1 |2164                   |\n",
      "|Donnchadh32     |1823                   |\n",
      "|EamonReilly_com |1727                   |\n",
      "|Paul71          |1700                   |\n",
      "|christi85573643 |1627                   |\n",
      "|rosecaroline9   |1626                   |\n",
      "|MeathRight2Life |1594                   |\n",
      "|DLTogether4Yes  |1587                   |\n",
      "|MandyGall7      |1559                   |\n",
      "|laurathornton30 |1545                   |\n",
      "|MaryThorn85     |1528                   |\n",
      "|babydollirish2  |1474                   |\n",
      "|ebt51           |1430                   |\n",
      "|ExposeMediaBias |1419                   |\n",
      "|WolfeTone15     |1397                   |\n",
      "|IsabelCorcoran5 |1236                   |\n",
      "|JanetOS_        |1164                   |\n",
      "|renemccoll      |1103                   |\n",
      "|seamus6346      |1079                   |\n",
      "|ElaineYoung94   |1039                   |\n",
      "|daraghnoel      |873                    |\n",
      "|pnolan26        |872                    |\n",
      "|PadraigSagart   |809                    |\n",
      "|NursepollyRgn   |809                    |\n",
      "|DrCollins10     |807                    |\n",
      "|corkmankeane    |789                    |\n",
      "|Together4yes    |773                    |\n",
      "|marybuckley549  |764                    |\n",
      "|8threfbot       |763                    |\n",
      "|Berlinnaeus     |757                    |\n",
      "|Thebfromtuam    |754                    |\n",
      "|loveboth8       |741                    |\n",
      "|GiveBackMy_Mind |730                    |\n",
      "|renebatt26      |723                    |\n",
      "|Savethe8thInfo  |721                    |\n",
      "|theRallyforLife |718                    |\n",
      "|TeilHarder      |716                    |\n",
      "|Colmogorman     |700                    |\n",
      "|paddylepage     |685                    |\n",
      "|repeal_shield   |681                    |\n",
      "|paddyearly      |679                    |\n",
      "|firstlady10000  |678                    |\n",
      "|IrelandStandUp  |664                    |\n",
      "+----------------+-----------------------+\n",
      "only showing top 50 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "screen_name_df = sqlContext.sql(\"SELECT user_screen_name, count(user_screen_name) from tweets group by user_screen_name order by count(user_screen_name) desc\")\n",
    "screen_name_df.show(50, truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top users by original tweets only"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------+-----------------------+\n",
      "|user_screen_name|count(user_screen_name)|\n",
      "+----------------+-----------------------+\n",
      "|ebt51           |626                    |\n",
      "|loveboth8       |546                    |\n",
      "|Savethe8thInfo  |516                    |\n",
      "|testisfidelis   |417                    |\n",
      "|Together4yes    |287                    |\n",
      "|Thebfromtuam    |263                    |\n",
      "|LoveLifeLove8th |246                    |\n",
      "|AllPassingThing |218                    |\n",
      "|lifeinstitute   |209                    |\n",
      "|JanetOS_        |183                    |\n",
      "|MarieAFlaherty  |148                    |\n",
      "|InHerIrishShoes |145                    |\n",
      "|PacifistIreland |143                    |\n",
      "|repeal8thfunds  |142                    |\n",
      "|RosForChoice    |142                    |\n",
      "|tvcritics       |135                    |\n",
      "|StopRepealHate  |125                    |\n",
      "|MandyGall7      |122                    |\n",
      "|TarynDeVere     |122                    |\n",
      "|paddyearly      |110                    |\n",
      "|AnnieKatelynch  |109                    |\n",
      "|rocknrollok     |109                    |\n",
      "|LawlessRoisin   |106                    |\n",
      "|TFYEastCork     |106                    |\n",
      "|EmmaMurphy12150 |102                    |\n",
      "|laurathornton30 |99                     |\n",
      "|TheUSI          |99                     |\n",
      "|Donnchadh32     |98                     |\n",
      "|GendercideNews  |98                     |\n",
      "|prolifecampaign |97                     |\n",
      "|Irishprolifer   |94                     |\n",
      "|TipperaryForYes |94                     |\n",
      "|LeanneWoodfull  |91                     |\n",
      "|speakofIMELDA   |90                     |\n",
      "|john_mcguirk    |89                     |\n",
      "|Colmogorman     |89                     |\n",
      "|ZazaFL          |88                     |\n",
      "|DroghedaT4Y     |87                     |\n",
      "|IrelandStandUp  |87                     |\n",
      "|SpotlightEire   |84                     |\n",
      "|AmnestyIreland  |82                     |\n",
      "|StopViolenceIRL |81                     |\n",
      "|Paul71          |81                     |\n",
      "|CoraSherlock    |79                     |\n",
      "|WingnutParody   |75                     |\n",
      "|Berlinnaeus     |75                     |\n",
      "|RepealExpose    |73                     |\n",
      "|NualaDonnellan  |73                     |\n",
      "|ExposeMediaBias |72                     |\n",
      "|SK4Repeal       |71                     |\n",
      "+----------------+-----------------------+\n",
      "only showing top 50 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "screen_name_orig_df = sqlContext.sql(\"SELECT user_screen_name, count(user_screen_name) from tweets where tweet_type='original' group by user_screen_name order by count(user_screen_name) desc\")\n",
    "screen_name_orig_df.show(50, truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top URLs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+\n",
      "|url                                                                                                                                                              |count(url)|\n",
      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+\n",
      "|http://checktheregister.ie                                                                                                                                       |569       |\n",
      "|http://undecided8.org                                                                                                                                            |314       |\n",
      "|http://www.irishtimes.com/opinion/anti-abortion-posters-fail-to-take-account-of-life-1.3470187                                                                   |294       |\n",
      "|http://bit.ly/2keysma                                                                                                                                            |292       |\n",
      "|http://www.checktheregister.ie                                                                                                                                   |279       |\n",
      "|http://jrnl.ie/3986043t                                                                                                                                          |163       |\n",
      "|http://crowdfund.togetherforyes.ie                                                                                                                               |150       |\n",
      "|http://www.checktheregister.ie/publicpages/default.aspx?uilang=                                                                                                  |137       |\n",
      "|http://youtu.be/itsxbbkp-tq                                                                                                                                      |135       |\n",
      "|http://adoption.ie/wp-content/uploads/2018/04/ara-position-paper-on-8th-amendment.pdf                                                                            |131       |\n",
      "|http://www.irishtimes.com/news/politics/obstetricians-body-recommends-yes-vote-in-abortion-referendum-1.3473125                                                  |127       |\n",
      "|http://www.thejournal.ie/together-for-yes-crowdfunding-3957637-apr2018/                                                                                          |114       |\n",
      "|http://goo.gl/3qa2n5                                                                                                                                             |113       |\n",
      "|http://www.save8.ie/donate                                                                                                                                       |109       |\n",
      "|http://twibbon.com/support/loveboth-vote-no/twitter                                                                                                              |101       |\n",
      "|http://bit.ly/2jp6u0l                                                                                                                                            |93        |\n",
      "|http://bit.ly/2itifbo                                                                                                                                            |91        |\n",
      "|http://www.youtube.com/watch?v=itsxbbkp-tq&sns=tw                                                                                                                |84        |\n",
      "|http://save8.ie                                                                                                                                                  |82        |\n",
      "|http://www.irishtimes.com/opinion/graham-linehan-men-must-play-their-part-in-repealing-the-eighth-1.3481645                                                      |77        |\n",
      "|http://xytex.com                                                                                                                                                 |76        |\n",
      "|http://philippaandneil.wordpress.com                                                                                                                             |71        |\n",
      "|http://tinyurl.com/ybfp9gpz                                                                                                                                      |71        |\n",
      "|http://trinitynews.ie/katie-ascoughs-cynical-worldview/                                                                                                          |70        |\n",
      "|http://goo.gl/7p2f9x                                                                                                                                             |69        |\n",
      "|http://bit.ly/2rjqt0t                                                                                                                                            |69        |\n",
      "|http://bit.ly/2hzhf4f                                                                                                                                            |69        |\n",
      "|http://clarechampion.ie/i-was-abandoned-by-the-irish-state/                                                                                                      |67        |\n",
      "|http://www.togetherforyes.ie/donate                                                                                                                              |66        |\n",
      "|http://togetherforyes.causevox.com/                                                                                                                              |66        |\n",
      "|http://youtu.be/cw_ylrol_70                                                                                                                                      |63        |\n",
      "|http://www.jpands.org/vol22no4/coleman.pdf                                                                                                                       |62        |\n",
      "|http://www.togetherforyes.ie/canvassing-information-contacts/                                                                                                    |61        |\n",
      "|http://youtu.be/8acbuqzkq80                                                                                                                                      |60        |\n",
      "|http://www.the-pool.com/health/wombs-etc/2018/17/caroline-o-donoghue-one-month-before-ireland-abortion-referendum                                                |58        |\n",
      "|http://bit.ly/2rsxltw                                                                                                                                            |58        |\n",
      "|http://www.liveaction.org/news/abortion-facility-misleads-women-deceiving-depictions-fetal-development/?utm_content=70471828&utm_medium=social&utm_source=twitter|58        |\n",
      "|http://www.irishtimes.com/news/ireland/irish-news/eighth-amendment-causing-uncertainty-for-doctors-gynaecologist-1.3478274                                       |57        |\n",
      "|http://www.togetherforyes.ie/register4yes/                                                                                                                       |57        |\n",
      "|http://www.irishtimes.com/culture/music/u2-support-repeal-of-eighth-amendment-on-eve-of-new-tour-1.3481713                                                       |56        |\n",
      "|http://bit.ly/2qcfkxx                                                                                                                                            |55        |\n",
      "|http://youtu.be/s4pk1d0ob1w                                                                                                                                      |52        |\n",
      "|http://unitedtrusts.com                                                                                                                                          |52        |\n",
      "|http://www.irishtimes.com/news/social-affairs/heart-failure-patient-unable-to-get-abortion-as-life-not-at-immediate-risk-1.3475728                               |52        |\n",
      "|http://www.thetimes.co.uk/article/my-options-were-to-feel-my-unborn-child-die-or-watch-her-die-jb8r5jqdz?sharetoken=1bd1e8800606f849d71f461b26aed059             |52        |\n",
      "|http://crowdfund.togetherforyes.ie/                                                                                                                              |50        |\n",
      "|http://hometovote.com                                                                                                                                            |49        |\n",
      "|http://chooselife2018.ie                                                                                                                                         |48        |\n",
      "|http://www.pop.org/many-american-women-felt-pressured-abortions-study-finds/                                                                                     |48        |\n",
      "|http://goo.gl/hhp1ds                                                                                                                                             |48        |\n",
      "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+\n",
      "only showing top 50 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "urls_df = sqlContext.sql(\"SELECT url, count(url) from (SELECT explode(urls) url FROM tweets) where not url like 'http://twitter.com%' group by url order by count(url) desc\")\n",
    "urls_df.show(50, truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top timezones"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------------+---------------------+\n",
      "|user_time_zone            |count(user_time_zone)|\n",
      "+--------------------------+---------------------+\n",
      "|Dublin                    |113521               |\n",
      "|Pacific Time (US & Canada)|37240                |\n",
      "|London                    |30191                |\n",
      "|Amsterdam                 |19888                |\n",
      "|Casablanca                |15636                |\n",
      "|Europe/Dublin             |8694                 |\n",
      "|Eastern Time (US & Canada)|7565                 |\n",
      "|Hawaii                    |4303                 |\n",
      "|Central Time (US & Canada)|3470                 |\n",
      "|Europe/London             |1890                 |\n",
      "+--------------------------+---------------------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tz_df = sqlContext.sql(\"SELECT user_time_zone, count(user_time_zone) FROM tweets group by user_time_zone order by count(user_time_zone) desc\")\n",
    "tz_df.show(10, truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top user languages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------+--------------------+\n",
      "|user_language|count(user_language)|\n",
      "+-------------+--------------------+\n",
      "|en           |441308              |\n",
      "|en-gb        |19521               |\n",
      "|ja           |4473                |\n",
      "|en-GB        |4193                |\n",
      "|es           |2195                |\n",
      "|fr           |1582                |\n",
      "|ga           |992                 |\n",
      "|de           |878                 |\n",
      "|it           |812                 |\n",
      "|ru           |517                 |\n",
      "+-------------+--------------------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "lang_df = sqlContext.sql(\"SELECT user_language, count(user_language) FROM tweets group by user_language order by count(user_language) desc\")\n",
    "lang_df.show(10, truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top retweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------------------------------------------------------------------------------------+-------------------------------+\n",
      "|concat(https://twitter.com/, retweeted_quoted_screen_name, /status/, retweet_quoted_status_id)|count(retweet_quoted_status_id)|\n",
      "+----------------------------------------------------------------------------------------------+-------------------------------+\n",
      "|https://twitter.com/Together4yes/status/993174221265174529                                    |4549                           |\n",
      "|https://twitter.com/markohalloran/status/987318846980751360                                   |2251                           |\n",
      "|https://twitter.com/campaignforleo/status/987646457045020672                                  |1958                           |\n",
      "|https://twitter.com/amyhuberman/status/987303602514530304                                     |1614                           |\n",
      "|https://twitter.com/Sarah_Hyland/status/993520504052092928                                    |1474                           |\n",
      "|https://twitter.com/campaignforleo/status/988744931501133825                                  |1261                           |\n",
      "|https://twitter.com/NursepollyRgn/status/985280763942916096                                   |1222                           |\n",
      "|https://twitter.com/SimonHarrisTD/status/989998862164164609                                   |1221                           |\n",
      "|https://twitter.com/Iam_here_2018/status/989056513753874433                                   |1128                           |\n",
      "|https://twitter.com/RealJamesWoods/status/993575065332600834                                  |998                            |\n",
      "|https://twitter.com/davidmcw/status/988357892481929216                                        |994                            |\n",
      "|https://twitter.com/Stephanenny/status/986860375667888128                                     |942                            |\n",
      "|https://twitter.com/itsclairekane/status/985071490709110784                                   |904                            |\n",
      "|https://twitter.com/Together4yes/status/984890051837472768                                    |883                            |\n",
      "|https://twitter.com/sineadgleeson/status/992389180461404161                                   |855                            |\n",
      "|https://twitter.com/obianuju/status/993398334772703232                                        |841                            |\n",
      "|https://twitter.com/aoifegracemoore/status/986295031408578560                                 |840                            |\n",
      "|https://twitter.com/Cllr_Campbell/status/988738546612817920                                   |835                            |\n",
      "|https://twitter.com/annakatclarke/status/990078608650723329                                   |815                            |\n",
      "|https://twitter.com/Longford4Repeal/status/990262959472508933                                 |803                            |\n",
      "|https://twitter.com/DervalORourke/status/985531080580231169                                   |746                            |\n",
      "|https://twitter.com/campaignforleo/status/987644583440994304                                  |727                            |\n",
      "|https://twitter.com/LdnIrishARC/status/988313775144136704                                     |678                            |\n",
      "|https://twitter.com/adrianshanahan/status/988365011096625152                                  |671                            |\n",
      "|https://twitter.com/LilaGraceRose/status/992155086049624064                                   |666                            |\n",
      "+----------------------------------------------------------------------------------------------+-------------------------------+\n",
      "only showing top 25 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "rt_df = sqlContext.sql(\"SELECT CONCAT('https://twitter.com/', retweeted_quoted_screen_name, '/status/', retweet_quoted_status_id), count(retweet_quoted_status_id) FROM tweets group by retweet_quoted_status_id, retweeted_quoted_screen_name order by count(retweet_quoted_status_id) desc\")\n",
    "rt_df.show(25, truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top trigrams (combinations of 3 words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------------------------------------+-----+\n",
      "|ngrams                                        |count|\n",
      "+----------------------------------------------+-----+\n",
      "|vote women yes                                |362  |\n",
      "|voting women yes                              |243  |\n",
      "|help please support                           |122  |\n",
      "|vote voting yes                               |117  |\n",
      "|8th abortion amendment                        |104  |\n",
      "|@josephamadigan @simonharristd @together4yes  |99   |\n",
      "|care change compassion                        |88   |\n",
      "|@conmurphysport @gordonwdarcy @kevinmcgahern  |85   |\n",
      "|@andyleeboxing @conmurphysport @gordonwdarcy  |83   |\n",
      "|irish referendum urged                        |78   |\n",
      "|@gordonwdarcy @kevinmcgahern @richiesadlier   |75   |\n",
      "|registered sure vote                          |74   |\n",
      "|please privilege right                        |74   |\n",
      "|perfect please privilege                      |73   |\n",
      "|life perfect please                           |72   |\n",
      "|@simonharristd @together4yes abortion         |72   |\n",
      "|child every life                              |72   |\n",
      "|@kevinmcgahern @richiesadlier @together4yes   |71   |\n",
      "|emigrants irish referendum                    |70   |\n",
      "|today vote yes                                |69   |\n",
      "|every life perfect                            |69   |\n",
      "|abortion emigrants irish                      |68   |\n",
      "|@campaignforleo @josephamadigan @simonharristd|67   |\n",
      "|people person rejected                        |65   |\n",
      "|little people person                          |65   |\n",
      "|voting woman yes                              |64   |\n",
      "|vote voting women                             |62   |\n",
      "|@ire201261 @irishredale1916 @ivorysiobhan     |58   |\n",
      "|ireland life vote                             |58   |\n",
      "|@richardbrutontd @senatornoone @simonharristd |56   |\n",
      "|1 2 3                                         |55   |\n",
      "|together yes €500,000                         |53   |\n",
      "|woman women yes                               |52   |\n",
      "|sometimes telling try                         |52   |\n",
      "|register today vote                           |51   |\n",
      "|form garda get                                |50   |\n",
      "|referendum register registered                |49   |\n",
      "|oversimplify posters sometimes                |49   |\n",
      "|posters sometimes telling                     |48   |\n",
      "|@nealerichmond @richardbrutontd @simonharristd|48   |\n",
      "|@ireland @ldnirishu4l @siobhka                |48   |\n",
      "|neatly oversimplify posters                   |47   |\n",
      "|register registered sure                      |47   |\n",
      "|@nwci @orlanwci @repealeight                  |47   |\n",
      "|raise together yes                            |46   |\n",
      "|vote woman yes                                |46   |\n",
      "|8th @together4yes amendment                   |46   |\n",
      "|daily latest thanks                           |46   |\n",
      "|last night people                             |45   |\n",
      "|support vote yes                              |45   |\n",
      "+----------------------------------------------+-----+\n",
      "only showing top 50 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.ml.feature import RegexTokenizer, NGram, StopWordsRemover\n",
    "from pyspark.sql.functions import sort_array, udf, explode\n",
    "from pyspark.sql.types import ArrayType, StringType\n",
    "\n",
    "\n",
    "# Text: one row per distinct tweet text (the text field is read as an array, hence explode)\n",
    "text_df = tweets_df.select(explode(\"text\").alias(\"text\")).distinct()\n",
    "\n",
    "# Tokenize. By default RegexTokenizer treats the pattern as the gap between tokens, so this splits on\n",
    "# whitespace and swallows punctuation or a possessive ('s, ’s) directly before it and any ‘ directly after it.\n",
    "# The pattern is a raw string so that \\. and \\s reach the regex engine unchanged instead of relying on\n",
    "# Python's handling of unrecognised string escapes (which emits a warning).\n",
    "tokenizer = RegexTokenizer(pattern=r\"([:\\.!?,]|'s|’s)*\\s+[‘]*\", inputCol=\"text\", outputCol=\"words\")\n",
    "tokenized_df = tokenizer.transform(text_df)\n",
    "\n",
    "# Stopwords: Spark's default English list plus tweet noise ('rt', '&amp;', ...) and extra contractions\n",
    "stop_words = StopWordsRemover.loadDefaultStopWords('english')\n",
    "stop_words.extend(['rt', ' ', '-', '&amp;', 'it’s', '', 'may', 'see', 'want', 'i’m', 'us', 'make', \"we've\", \"you're\", \"you've\", \"don't\", \"i’ve\", 'it', 'they’re', 'don’t', 'lets', 'add'])\n",
    "remover = StopWordsRemover(inputCol=\"words\", outputCol=\"filtered_words\", stopWords=stop_words)\n",
    "filtered_df = remover.transform(tokenized_df)\n",
    "\n",
    "# Remove hashtags and URLs and dupes\n",
    "def clean(arr):\n",
    "    \"\"\"Return the unique items of arr that are neither hashtags nor URLs.\n",
    "\n",
    "    Items starting with '#' or 'http' are dropped. The survivors are collected\n",
    "    in a set, so duplicates collapse and the returned list has no defined order.\n",
    "    \"\"\"\n",
    "    excluded_prefixes = ('#', 'http')\n",
    "    kept = {word for word in arr if not word.startswith(excluded_prefixes)}\n",
    "    return list(kept)\n",
    "\n",
    "# Expose clean() to Spark as a UDF returning array<string> and apply it to the stop-word-filtered tokens\n",
    "clean_udf = udf(clean, ArrayType(StringType()))\n",
    "clean_df = filtered_df.withColumn(\"clean_words\", clean_udf(filtered_df[\"filtered_words\"]))\n",
    "\n",
    "# Sort each row's words so the n-grams below are built over a canonical word order\n",
    "sorted_df = clean_df.select(sort_array('clean_words').alias('sorted_words'))\n",
    "\n",
    "# Build 3-word n-grams over the sorted words, one output row per n-gram\n",
    "ngram = NGram(n=3, inputCol=\"sorted_words\", outputCol=\"ngrams\")\n",
    "ngram_df = ngram.transform(sorted_df).select(explode('ngrams').alias('ngrams'))\n",
    "\n",
    "# Count occurrences of each trigram and show the 50 most frequent\n",
    "trigram_counts = ngram_df.groupBy('ngrams').count()\n",
    "trigram_counts.orderBy('count', ascending=False).show(50, truncate=False)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}