{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Analysis of Tweets from Ireland 8th" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setup" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
 "import os\n",
 "import pyspark\n",
 "from pyspark.sql import SQLContext\n",
 "\n",
 "# Add the elasticsearch-hadoop jar (set before the SparkContext below is created).\n",
 "os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/jovyan/elasticsearch-hadoop-6.2.2.jar pyspark-shell'\n",
 "conf = pyspark.SparkConf()\n",
 "\n",
 "# Point to the master.\n",
 "conf.setMaster(\"spark://tweetsets.library.gwu.edu:7101\")\n",
 "conf.setAppName(\"ireland-8th-analysis\")\n",
 "conf.set(\"spark.driver.bindAddress\", \"0.0.0.0\")\n",
 "# Don't hog all of the cores.\n",
 "conf.set(\"spark.cores.max\", \"3\")\n",
 "# Specify a port for the block manager (which runs as part of the worker). The range 7003-7028 is set \n",
 "# to be open in the Spark worker container.\n",
 "conf.set(\"spark.blockManager.port\", \"7003\")\n",
 "\n",
 "# Create the context. getOrCreate reuses an already-running context, so re-running\n",
 "# this cell in the same kernel does not fail with a second-SparkContext error.\n",
 "sc = pyspark.SparkContext.getOrCreate(conf=conf)\n",
 "\n",
 "# Configure for ElasticSearch cluster and index.\n",
 "es_conf = {\"es.nodes\": \"tweetsets.library.gwu.edu\",\n",
 "           \"es.port\": \"9200\",\n",
 "           \"es.resource\": \"tweets-ba2157/doc\",\n",
 "           \"es.read.field.as.array.include\": \"hashtags,text,urls\"}\n",
 "\n",
 "# Load the index as a DataFrame and expose it to SQL as the `tweets` view\n",
 "# that the query cells below select from.\n",
 "sqlContext = SQLContext(sc)\n",
 "tweets_df = sqlContext.read.format(\"org.elasticsearch.spark.sql\").options(**es_conf).load()\n",
 "tweets_df.createOrReplaceTempView(\"tweets\")"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Count" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "478303" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tweets_df.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top hashtags" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------------------+--------------+\n", "|hashtag |count(hashtag)|\n", "+-------------------+--------------+\n", "|repealthe8th |84179 |\n", "|together4yes |57253 |\n", "|savethe8th |43656 |\n", "|8thref |35935 |\n", "|togetherforyes |18763 |\n", "|lovebothvoteno |18290 |\n", "|voteyes |8245 |\n", "|loveboth |8109 |\n", "|latelate |5920 |\n", "|men4yes |5884 |\n", "|votenotoabortion |5732 |\n", "|latelateshow |5584 |\n", "|voteno |5324 |\n", "|prolife |4596 |\n", "|hometovote |4342 |\n", "|lifecanvass |3637 |\n", "|trustwomen |3483 |\n", "|repeal |3450 |\n", "|ireland |3192 |\n", "|mybodymychoice |2903 |\n", "|私は黙らない0428 |2879 |\n", "|repealth8th |2614 |\n", "|jointherebellion |2602 |\n", "|abortion |2463 |\n", "|repeal8th |2258 |\n", "|rtept |1722 |\n", "|repealtheeighth |1601 |\n", "|register4yes |1445 |\n", 
"|prochoice |1423 |\n", "|repealfacts |1360 |\n", "|standupforlife |1322 |\n", "|itstime |1293 |\n", "|8thamendment |1283 |\n", "|praytoendabortion |1196 |\n", "|womensayno2abortion|1050 |\n", "|wakeupireland |1049 |\n", "|ourfuture |1049 |\n", "|savelives |980 |\n", "|menforyes |957 |\n", "|scotref |921 |\n", "|chooselife |912 |\n", "|yes |899 |\n", "|corksaysyes |893 |\n", "|tinylivesatstake |863 |\n", "|time4choice |832 |\n", "|istandwithnicola |820 |\n", "|repealmobile |758 |\n", "|votenoroadshow |757 |\n", "|loveboats |736 |\n", "|studentsforchoice |646 |\n", "+-------------------+--------------+\n", "only showing top 50 rows\n", "\n" ] } ], "source": [ "hashtags_df = sqlContext.sql(\"SELECT hashtag, count(hashtag) from (SELECT explode(hashtags) hashtag FROM tweets) group by hashtag order by count(hashtag) desc\")\n", "hashtags_df.show(50, truncate=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top users by all tweet types" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----------------+-----------------------+\n", "|user_screen_name|count(user_screen_name)|\n", "+----------------+-----------------------+\n", "|kaydnan |3449 |\n", "|Irishprolifer |2783 |\n", "|ShannonBlue |2433 |\n", "|mobyrne100 |2375 |\n", "|EmmaMurphy12150 |2278 |\n", "|Declan1497 |2225 |\n", "|MaryOGrady8 |2209 |\n", "|BernadetteComm1 |2164 |\n", "|Donnchadh32 |1823 |\n", "|EamonReilly_com |1727 |\n", "|Paul71 |1700 |\n", "|christi85573643 |1627 |\n", "|rosecaroline9 |1626 |\n", "|MeathRight2Life |1594 |\n", "|DLTogether4Yes |1587 |\n", "|MandyGall7 |1559 |\n", "|laurathornton30 |1545 |\n", "|MaryThorn85 |1528 |\n", "|babydollirish2 |1474 |\n", "|ebt51 |1430 |\n", "|ExposeMediaBias |1419 |\n", "|WolfeTone15 |1397 |\n", "|IsabelCorcoran5 |1236 |\n", "|JanetOS_ |1164 |\n", "|renemccoll |1103 |\n", "|seamus6346 |1079 |\n", "|ElaineYoung94 |1039 |\n", "|daraghnoel |873 |\n", 
"|pnolan26 |872 |\n", "|PadraigSagart |809 |\n", "|NursepollyRgn |809 |\n", "|DrCollins10 |807 |\n", "|corkmankeane |789 |\n", "|Together4yes |773 |\n", "|marybuckley549 |764 |\n", "|8threfbot |763 |\n", "|Berlinnaeus |757 |\n", "|Thebfromtuam |754 |\n", "|loveboth8 |741 |\n", "|GiveBackMy_Mind |730 |\n", "|renebatt26 |723 |\n", "|Savethe8thInfo |721 |\n", "|theRallyforLife |718 |\n", "|TeilHarder |716 |\n", "|Colmogorman |700 |\n", "|paddylepage |685 |\n", "|repeal_shield |681 |\n", "|paddyearly |679 |\n", "|firstlady10000 |678 |\n", "|IrelandStandUp |664 |\n", "+----------------+-----------------------+\n", "only showing top 50 rows\n", "\n" ] } ], "source": [ "screen_name_df = sqlContext.sql(\"SELECT user_screen_name, count(user_screen_name) from tweets group by user_screen_name order by count(user_screen_name) desc\")\n", "screen_name_df.show(50, truncate=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top users by original tweets only" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----------------+-----------------------+\n", "|user_screen_name|count(user_screen_name)|\n", "+----------------+-----------------------+\n", "|ebt51 |626 |\n", "|loveboth8 |546 |\n", "|Savethe8thInfo |516 |\n", "|testisfidelis |417 |\n", "|Together4yes |287 |\n", "|Thebfromtuam |263 |\n", "|LoveLifeLove8th |246 |\n", "|AllPassingThing |218 |\n", "|lifeinstitute |209 |\n", "|JanetOS_ |183 |\n", "|MarieAFlaherty |148 |\n", "|InHerIrishShoes |145 |\n", "|PacifistIreland |143 |\n", "|repeal8thfunds |142 |\n", "|RosForChoice |142 |\n", "|tvcritics |135 |\n", "|StopRepealHate |125 |\n", "|MandyGall7 |122 |\n", "|TarynDeVere |122 |\n", "|paddyearly |110 |\n", "|AnnieKatelynch |109 |\n", "|rocknrollok |109 |\n", "|LawlessRoisin |106 |\n", "|TFYEastCork |106 |\n", "|EmmaMurphy12150 |102 |\n", "|laurathornton30 |99 |\n", "|TheUSI |99 |\n", "|Donnchadh32 |98 |\n", "|GendercideNews 
|98 |\n", "|prolifecampaign |97 |\n", "|Irishprolifer |94 |\n", "|TipperaryForYes |94 |\n", "|LeanneWoodfull |91 |\n", "|speakofIMELDA |90 |\n", "|john_mcguirk |89 |\n", "|Colmogorman |89 |\n", "|ZazaFL |88 |\n", "|DroghedaT4Y |87 |\n", "|IrelandStandUp |87 |\n", "|SpotlightEire |84 |\n", "|AmnestyIreland |82 |\n", "|StopViolenceIRL |81 |\n", "|Paul71 |81 |\n", "|CoraSherlock |79 |\n", "|WingnutParody |75 |\n", "|Berlinnaeus |75 |\n", "|RepealExpose |73 |\n", "|NualaDonnellan |73 |\n", "|ExposeMediaBias |72 |\n", "|SK4Repeal |71 |\n", "+----------------+-----------------------+\n", "only showing top 50 rows\n", "\n" ] } ], "source": [ "screen_name_orig_df = sqlContext.sql(\"SELECT user_screen_name, count(user_screen_name) from tweets where tweet_type='original' group by user_screen_name order by count(user_screen_name) desc\")\n", "screen_name_orig_df.show(50, truncate=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top URLs" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+\n", "|url |count(url)|\n", "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+\n", "|http://checktheregister.ie |569 |\n", "|http://undecided8.org |314 |\n", "|http://www.irishtimes.com/opinion/anti-abortion-posters-fail-to-take-account-of-life-1.3470187 |294 |\n", "|http://bit.ly/2keysma |292 |\n", "|http://www.checktheregister.ie |279 |\n", "|http://jrnl.ie/3986043t |163 |\n", "|http://crowdfund.togetherforyes.ie |150 |\n", "|http://www.checktheregister.ie/publicpages/default.aspx?uilang= |137 |\n", "|http://youtu.be/itsxbbkp-tq |135 |\n", 
"|http://adoption.ie/wp-content/uploads/2018/04/ara-position-paper-on-8th-amendment.pdf |131 |\n", "|http://www.irishtimes.com/news/politics/obstetricians-body-recommends-yes-vote-in-abortion-referendum-1.3473125 |127 |\n", "|http://www.thejournal.ie/together-for-yes-crowdfunding-3957637-apr2018/ |114 |\n", "|http://goo.gl/3qa2n5 |113 |\n", "|http://www.save8.ie/donate |109 |\n", "|http://twibbon.com/support/loveboth-vote-no/twitter |101 |\n", "|http://bit.ly/2jp6u0l |93 |\n", "|http://bit.ly/2itifbo |91 |\n", "|http://www.youtube.com/watch?v=itsxbbkp-tq&sns=tw |84 |\n", "|http://save8.ie |82 |\n", "|http://www.irishtimes.com/opinion/graham-linehan-men-must-play-their-part-in-repealing-the-eighth-1.3481645 |77 |\n", "|http://xytex.com |76 |\n", "|http://philippaandneil.wordpress.com |71 |\n", "|http://tinyurl.com/ybfp9gpz |71 |\n", "|http://trinitynews.ie/katie-ascoughs-cynical-worldview/ |70 |\n", "|http://goo.gl/7p2f9x |69 |\n", "|http://bit.ly/2rjqt0t |69 |\n", "|http://bit.ly/2hzhf4f |69 |\n", "|http://clarechampion.ie/i-was-abandoned-by-the-irish-state/ |67 |\n", "|http://www.togetherforyes.ie/donate |66 |\n", "|http://togetherforyes.causevox.com/ |66 |\n", "|http://youtu.be/cw_ylrol_70 |63 |\n", "|http://www.jpands.org/vol22no4/coleman.pdf |62 |\n", "|http://www.togetherforyes.ie/canvassing-information-contacts/ |61 |\n", "|http://youtu.be/8acbuqzkq80 |60 |\n", "|http://www.the-pool.com/health/wombs-etc/2018/17/caroline-o-donoghue-one-month-before-ireland-abortion-referendum |58 |\n", "|http://bit.ly/2rsxltw |58 |\n", "|http://www.liveaction.org/news/abortion-facility-misleads-women-deceiving-depictions-fetal-development/?utm_content=70471828&utm_medium=social&utm_source=twitter|58 |\n", "|http://www.irishtimes.com/news/ireland/irish-news/eighth-amendment-causing-uncertainty-for-doctors-gynaecologist-1.3478274 |57 |\n", "|http://www.togetherforyes.ie/register4yes/ |57 |\n", 
"|http://www.irishtimes.com/culture/music/u2-support-repeal-of-eighth-amendment-on-eve-of-new-tour-1.3481713 |56 |\n", "|http://bit.ly/2qcfkxx |55 |\n", "|http://youtu.be/s4pk1d0ob1w |52 |\n", "|http://unitedtrusts.com |52 |\n", "|http://www.irishtimes.com/news/social-affairs/heart-failure-patient-unable-to-get-abortion-as-life-not-at-immediate-risk-1.3475728 |52 |\n", "|http://www.thetimes.co.uk/article/my-options-were-to-feel-my-unborn-child-die-or-watch-her-die-jb8r5jqdz?sharetoken=1bd1e8800606f849d71f461b26aed059 |52 |\n", "|http://crowdfund.togetherforyes.ie/ |50 |\n", "|http://hometovote.com |49 |\n", "|http://chooselife2018.ie |48 |\n", "|http://www.pop.org/many-american-women-felt-pressured-abortions-study-finds/ |48 |\n", "|http://goo.gl/hhp1ds |48 |\n", "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+\n", "only showing top 50 rows\n", "\n" ] } ], "source": [ "urls_df = sqlContext.sql(\"SELECT url, count(url) from (SELECT explode(urls) url FROM tweets) where not url like 'http://twitter.com%' group by url order by count(url) desc\")\n", "urls_df.show(50, truncate=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top timezones" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------------+---------------------+\n", "|user_time_zone |count(user_time_zone)|\n", "+--------------------------+---------------------+\n", "|Dublin |113521 |\n", "|Pacific Time (US & Canada)|37240 |\n", "|London |30191 |\n", "|Amsterdam |19888 |\n", "|Casablanca |15636 |\n", "|Europe/Dublin |8694 |\n", "|Eastern Time (US & Canada)|7565 |\n", "|Hawaii |4303 |\n", "|Central Time (US & Canada)|3470 |\n", "|Europe/London |1890 |\n", "+--------------------------+---------------------+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ 
"tz_df = sqlContext.sql(\"SELECT user_time_zone, count(user_time_zone) FROM tweets group by user_time_zone order by count(user_time_zone) desc\")\n", "tz_df.show(10, truncate=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top user languages" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------------+--------------------+\n", "|user_language|count(user_language)|\n", "+-------------+--------------------+\n", "|en |441308 |\n", "|en-gb |19521 |\n", "|ja |4473 |\n", "|en-GB |4193 |\n", "|es |2195 |\n", "|fr |1582 |\n", "|ga |992 |\n", "|de |878 |\n", "|it |812 |\n", "|ru |517 |\n", "+-------------+--------------------+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ "lang_df = sqlContext.sql(\"SELECT user_language, count(user_language) FROM tweets group by user_language order by count(user_language) desc\")\n", "lang_df.show(10, truncate=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top retweets" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----------------------------------------------------------------------------------------------+-------------------------------+\n", "|concat(https://twitter.com/, retweeted_quoted_screen_name, /status/, retweet_quoted_status_id)|count(retweet_quoted_status_id)|\n", "+----------------------------------------------------------------------------------------------+-------------------------------+\n", "|https://twitter.com/Together4yes/status/993174221265174529 |4549 |\n", "|https://twitter.com/markohalloran/status/987318846980751360 |2251 |\n", "|https://twitter.com/campaignforleo/status/987646457045020672 |1958 |\n", "|https://twitter.com/amyhuberman/status/987303602514530304 |1614 |\n", "|https://twitter.com/Sarah_Hyland/status/993520504052092928 |1474 |\n", 
"|https://twitter.com/campaignforleo/status/988744931501133825 |1261 |\n", "|https://twitter.com/NursepollyRgn/status/985280763942916096 |1222 |\n", "|https://twitter.com/SimonHarrisTD/status/989998862164164609 |1221 |\n", "|https://twitter.com/Iam_here_2018/status/989056513753874433 |1128 |\n", "|https://twitter.com/RealJamesWoods/status/993575065332600834 |998 |\n", "|https://twitter.com/davidmcw/status/988357892481929216 |994 |\n", "|https://twitter.com/Stephanenny/status/986860375667888128 |942 |\n", "|https://twitter.com/itsclairekane/status/985071490709110784 |904 |\n", "|https://twitter.com/Together4yes/status/984890051837472768 |883 |\n", "|https://twitter.com/sineadgleeson/status/992389180461404161 |855 |\n", "|https://twitter.com/obianuju/status/993398334772703232 |841 |\n", "|https://twitter.com/aoifegracemoore/status/986295031408578560 |840 |\n", "|https://twitter.com/Cllr_Campbell/status/988738546612817920 |835 |\n", "|https://twitter.com/annakatclarke/status/990078608650723329 |815 |\n", "|https://twitter.com/Longford4Repeal/status/990262959472508933 |803 |\n", "|https://twitter.com/DervalORourke/status/985531080580231169 |746 |\n", "|https://twitter.com/campaignforleo/status/987644583440994304 |727 |\n", "|https://twitter.com/LdnIrishARC/status/988313775144136704 |678 |\n", "|https://twitter.com/adrianshanahan/status/988365011096625152 |671 |\n", "|https://twitter.com/LilaGraceRose/status/992155086049624064 |666 |\n", "+----------------------------------------------------------------------------------------------+-------------------------------+\n", "only showing top 25 rows\n", "\n" ] } ], "source": [ "rt_df = sqlContext.sql(\"SELECT CONCAT('https://twitter.com/', retweeted_quoted_screen_name, '/status/', retweet_quoted_status_id), count(retweet_quoted_status_id) FROM tweets group by retweet_quoted_status_id, retweeted_quoted_screen_name order by count(retweet_quoted_status_id) desc\")\n", "rt_df.show(25, truncate=False)" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "### Top trigrams (combinations of 3 words)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----------------------------------------------+-----+\n", "|ngrams |count|\n", "+----------------------------------------------+-----+\n", "|vote women yes |362 |\n", "|voting women yes |243 |\n", "|help please support |122 |\n", "|vote voting yes |117 |\n", "|8th abortion amendment |104 |\n", "|@josephamadigan @simonharristd @together4yes |99 |\n", "|care change compassion |88 |\n", "|@conmurphysport @gordonwdarcy @kevinmcgahern |85 |\n", "|@andyleeboxing @conmurphysport @gordonwdarcy |83 |\n", "|irish referendum urged |78 |\n", "|@gordonwdarcy @kevinmcgahern @richiesadlier |75 |\n", "|registered sure vote |74 |\n", "|please privilege right |74 |\n", "|perfect please privilege |73 |\n", "|life perfect please |72 |\n", "|@simonharristd @together4yes abortion |72 |\n", "|child every life |72 |\n", "|@kevinmcgahern @richiesadlier @together4yes |71 |\n", "|emigrants irish referendum |70 |\n", "|today vote yes |69 |\n", "|every life perfect |69 |\n", "|abortion emigrants irish |68 |\n", "|@campaignforleo @josephamadigan @simonharristd|67 |\n", "|people person rejected |65 |\n", "|little people person |65 |\n", "|voting woman yes |64 |\n", "|vote voting women |62 |\n", "|@ire201261 @irishredale1916 @ivorysiobhan |58 |\n", "|ireland life vote |58 |\n", "|@richardbrutontd @senatornoone @simonharristd |56 |\n", "|1 2 3 |55 |\n", "|together yes €500,000 |53 |\n", "|woman women yes |52 |\n", "|sometimes telling try |52 |\n", "|register today vote |51 |\n", "|form garda get |50 |\n", "|referendum register registered |49 |\n", "|oversimplify posters sometimes |49 |\n", "|posters sometimes telling |48 |\n", "|@nealerichmond @richardbrutontd @simonharristd|48 |\n", "|@ireland @ldnirishu4l @siobhka |48 |\n", "|neatly oversimplify posters |47 |\n", "|register 
registered sure |47 |\n", "|@nwci @orlanwci @repealeight |47 |\n", "|raise together yes |46 |\n", "|vote woman yes |46 |\n", "|8th @together4yes amendment |46 |\n", "|daily latest thanks |46 |\n", "|last night people |45 |\n", "|support vote yes |45 |\n", "+----------------------------------------------+-----+\n", "only showing top 50 rows\n", "\n" ] } ], "source": [
 "from pyspark.ml.feature import RegexTokenizer, NGram, StopWordsRemover\n",
 "from pyspark.sql.functions import sort_array, udf, explode\n",
 "from pyspark.sql.types import ArrayType, StringType\n",
 "\n",
 "\n",
 "# Text (using distinct)\n",
 "text_df = tweets_df.select(explode(\"text\").alias(\"text\")).distinct()\n",
 "\n",
 "# Tokenize. The pattern is a raw string so the regex escapes are passed to the\n",
 "# tokenizer exactly as written, without relying on Python passing an unrecognized\n",
 "# backslash escape through unchanged.\n",
 "tokenizer = RegexTokenizer(pattern=r\"([:\\.!?,]|'s|’s)*\\s+[‘]*\", inputCol=\"text\", outputCol=\"words\")\n",
 "tokenized_df = tokenizer.transform(text_df)\n",
 "\n",
 "# Stopwords\n",
 "stop_words = StopWordsRemover.loadDefaultStopWords('english')\n",
 "stop_words.extend(['rt', ' ', '-', '&', 'it’s', '', 'may', 'see', 'want', 'i’m', 'us', 'make', \"we've\", \"you're\", \"you've\", \"don't\", \"i’ve\", 'it', 'they’re', 'don’t', 'lets', 'add'])\n",
 "remover = StopWordsRemover(inputCol=\"words\", outputCol=\"filtered_words\", stopWords=stop_words)\n",
 "filtered_df = remover.transform(tokenized_df)\n",
 "\n",
 "# Remove hashtags and URLs and dupes\n",
 "def clean(arr):\n",
 "    \"\"\"Return the distinct tokens of ``arr`` that are neither hashtags nor URLs.\n",
 "\n",
 "    Tokens starting with '#' or 'http' are dropped. Duplicates are removed by\n",
 "    collecting into a set, so the order of the returned list is arbitrary; the\n",
 "    caller sorts it afterwards.\n",
 "    \"\"\"\n",
 "    return list({item for item in arr if not item.startswith(('#', 'http'))})\n",
 "\n",
 "clean_udf = udf(clean, ArrayType(StringType()))\n",
 "clean_df = filtered_df.withColumn(\"clean_words\", clean_udf(filtered_df.filtered_words))\n",
 "\n",
 "# Sort the words.\n",
 "# NOTE: tokens are de-duplicated and sorted before the n-grams are built, so each\n",
 "# \"trigram\" is three words adjacent in a tweet's sorted set of distinct words,\n",
 "# not three consecutive words of the original text.\n",
 "sorted_df = clean_df.select(sort_array('clean_words').alias('sorted_words'))\n",
 "\n",
 "ngram = NGram(n=3, inputCol=\"sorted_words\", outputCol=\"ngrams\")\n",
 "ngram_df = ngram.transform(sorted_df).select(explode('ngrams').alias('ngrams'))\n",
 "ngram_df.groupBy('ngrams').count().orderBy('count', ascending=False).show(50, truncate=False)\n"
 ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }