{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Infobox Counts from XML Dumps\n", "- extract infobox data from dump files\n", "- create and populate `population_infobox` table\n", "- report infobox counts" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n", "%run -i 'data-defaults.py'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "WIKIPEDIA_XML_DUMPS = ['enwiki-20190401-pages-articles-multistream.xml.bz2',\n", " 'enwiki-20190420-pages-articles-multistream.xml.bz2']\n", "\n", "INFOBOX_REGEX = re.compile(r'\\{ *infobox ', re.IGNORECASE)\n", "\n", "def count_infobox(entity, date):\n", " page_text = entity.revision.text._VALUE\n", " count = len(INFOBOX_REGEX.findall(page_text))\n", " return Row(page_id=entity.id, infobox_count=count, dt=date)\n", "\n", "infobox_rdd = sc.emptyRDD()\n", "for file in WIKIPEDIA_XML_DUMPS:\n", " wikipedia = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='page').load(file)\n", " dump_date = re.search(r'.*(\\d{8}).*',file).group(1)\n", " articles = wikipedia\\\n", " .filter(\"ns = '0'\")\\\n", " .filter(\"redirect._title is null\") \\\n", " .filter(\"revision.text._VALUE is not null\") \\\n", " .filter(\"length(revision.text._VALUE) > 0\")\n", " daily_counts = sqlContext.createDataFrame(articles.rdd.map(lambda entity: count_infobox(entity, dump_date)))\n", " infobox_rdd = infobox_rdd.union(daily_counts.rdd)\n", "\n", "infobox_merged = sqlContext.createDataFrame(infobox_rdd)\n", "infobox = infobox_merged.toPandas()\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dtinfobox_countpage_id
020190401012
120190401125
220190401039
3201904011290
4201904012303
5201904010305
6201904012307
7201904011308
8201904010309
9201904011316
10201904012324
11201904011330
12201904012332
13201904010334
14201904010336
15201904011339
16201904011340
17201904011344
18201904011358
19201904010359
20201904010569
21201904010572
22201904010573
23201904010579
24201904010580
25201904011586
26201904010590
27201904010593
28201904011594
29201904011595
............
1168687720190420160552524
1168687820190420160552528
1168687920190420060552534
1168688020190420060552561
1168688120190420160552576
1168688220190420160552592
1168688320190420160552597
1168688420190420060552601
1168688520190420160552606
1168688620190420060552607
1168688720190420160552611
1168688820190420160552619
1168688920190420060552630
1168689020190420160552632
1168689120190420160552643
1168689220190420160552649
1168689320190420160552656
1168689420190420160552665
1168689520190420160552674
1168689620190420160552678
1168689720190420160552679
1168689820190420160552689
1168689920190420160552690
1168690020190420160552709
1168690120190420160552711
1168690220190420160552724
1168690320190420160552730
1168690420190420060552736
1168690520190420160552745
1168690620190420160552764
\n", "

11686907 rows × 3 columns

\n", "
" ], "text/plain": [ " dt infobox_count page_id\n", "0 20190401 0 12\n", "1 20190401 1 25\n", "2 20190401 0 39\n", "3 20190401 1 290\n", "4 20190401 2 303\n", "5 20190401 0 305\n", "6 20190401 2 307\n", "7 20190401 1 308\n", "8 20190401 0 309\n", "9 20190401 1 316\n", "10 20190401 2 324\n", "11 20190401 1 330\n", "12 20190401 2 332\n", "13 20190401 0 334\n", "14 20190401 0 336\n", "15 20190401 1 339\n", "16 20190401 1 340\n", "17 20190401 1 344\n", "18 20190401 1 358\n", "19 20190401 0 359\n", "20 20190401 0 569\n", "21 20190401 0 572\n", "22 20190401 0 573\n", "23 20190401 0 579\n", "24 20190401 0 580\n", "25 20190401 1 586\n", "26 20190401 0 590\n", "27 20190401 0 593\n", "28 20190401 1 594\n", "29 20190401 1 595\n", "... ... ... ...\n", "11686877 20190420 1 60552524\n", "11686878 20190420 1 60552528\n", "11686879 20190420 0 60552534\n", "11686880 20190420 0 60552561\n", "11686881 20190420 1 60552576\n", "11686882 20190420 1 60552592\n", "11686883 20190420 1 60552597\n", "11686884 20190420 0 60552601\n", "11686885 20190420 1 60552606\n", "11686886 20190420 0 60552607\n", "11686887 20190420 1 60552611\n", "11686888 20190420 1 60552619\n", "11686889 20190420 0 60552630\n", "11686890 20190420 1 60552632\n", "11686891 20190420 1 60552643\n", "11686892 20190420 1 60552649\n", "11686893 20190420 1 60552656\n", "11686894 20190420 1 60552665\n", "11686895 20190420 1 60552674\n", "11686896 20190420 1 60552678\n", "11686897 20190420 1 60552679\n", "11686898 20190420 1 60552689\n", "11686899 20190420 1 60552690\n", "11686900 20190420 1 60552709\n", "11686901 20190420 1 60552711\n", "11686902 20190420 1 60552724\n", "11686903 20190420 1 60552730\n", "11686904 20190420 0 60552736\n", "11686905 20190420 1 60552745\n", "11686906 20190420 1 60552764\n", "\n", "[11686907 rows x 3 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "infobox" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DataFrame[]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# write infobox counts data to a table for later use\n", "infobox_merged.createOrReplaceTempView(\"temp_infobox_count\")\n", "sqlContext.sql(\"DROP TABLE IF EXISTS ryanmax.infobox_count\")\n", "sqlContext.sql(\"CREATE TABLE ryanmax.infobox_count AS SELECT * FROM temp_infobox_count\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dtinfobox_countpage_id
5011201904014910577
6537201904015813696
491472019040134102962
491652019040127102985
495652019040129103782
1623902019040152373334
1648862019040134381506
1696332019040128397295
2769362019040178782670
2852512019040132841648
29370320190401239871024
3093852019040134931088
3247312019040127996290
34673020190401341086718
35084220190401351104268
35086520190401361104332
35090620190401351104505
35091020190401351104515
35091820190401381104564
35128820190401351106128
35129420190401331106178
35131120190401331106241
35132620190401261106299
35134620190401291106375
35150620190401281106937
35775420190401341130287
35859720190401361133844
35868820190401391134236
50670420190401281847173
52628720190401801945597
............
11306108201904203155569842
11308223201904203555595865
113099292019042012355619912
113159452019042017955697297
11321659201904202655763651
11325516201904203055809734
11332444201904202755887267
11342183201904202856002822
113531832019042011956142356
113579782019042012956192995
113716952019042014556363828
113774012019042014256434290
113854232019042013056532039
113859042019042012656538063
113891342019042013656577813
114136322019042033456887146
114142052019042014556891883
11435248201904202857147301
114724652019042013957437661
114758312019042012257482185
114797262019042014157536902
114852872019042013657620451
114866732019042014657637971
114921472019042012857706926
115017512019042012657839087
11506322201904207157897975
11520191201904202758072995
11537552201904203058344889
11564773201904207558742596
11652475201904204360089397
\n", "

729 rows × 3 columns

\n", "
" ], "text/plain": [ " dt infobox_count page_id\n", "5011 20190401 49 10577\n", "6537 20190401 58 13696\n", "49147 20190401 34 102962\n", "49165 20190401 27 102985\n", "49565 20190401 29 103782\n", "162390 20190401 52 373334\n", "164886 20190401 34 381506\n", "169633 20190401 28 397295\n", "276936 20190401 78 782670\n", "285251 20190401 32 841648\n", "293703 20190401 239 871024\n", "309385 20190401 34 931088\n", "324731 20190401 27 996290\n", "346730 20190401 34 1086718\n", "350842 20190401 35 1104268\n", "350865 20190401 36 1104332\n", "350906 20190401 35 1104505\n", "350910 20190401 35 1104515\n", "350918 20190401 38 1104564\n", "351288 20190401 35 1106128\n", "351294 20190401 33 1106178\n", "351311 20190401 33 1106241\n", "351326 20190401 26 1106299\n", "351346 20190401 29 1106375\n", "351506 20190401 28 1106937\n", "357754 20190401 34 1130287\n", "358597 20190401 36 1133844\n", "358688 20190401 39 1134236\n", "506704 20190401 28 1847173\n", "526287 20190401 80 1945597\n", "... ... ... ...\n", "11306108 20190420 31 55569842\n", "11308223 20190420 35 55595865\n", "11309929 20190420 123 55619912\n", "11315945 20190420 179 55697297\n", "11321659 20190420 26 55763651\n", "11325516 20190420 30 55809734\n", "11332444 20190420 27 55887267\n", "11342183 20190420 28 56002822\n", "11353183 20190420 119 56142356\n", "11357978 20190420 129 56192995\n", "11371695 20190420 145 56363828\n", "11377401 20190420 142 56434290\n", "11385423 20190420 130 56532039\n", "11385904 20190420 126 56538063\n", "11389134 20190420 136 56577813\n", "11413632 20190420 334 56887146\n", "11414205 20190420 145 56891883\n", "11435248 20190420 28 57147301\n", "11472465 20190420 139 57437661\n", "11475831 20190420 122 57482185\n", "11479726 20190420 141 57536902\n", "11485287 20190420 136 57620451\n", "11486673 20190420 146 57637971\n", "11492147 20190420 128 57706926\n", "11501751 20190420 126 57839087\n", "11506322 20190420 71 57897975\n", "11520191 20190420 27 58072995\n", "11537552 20190420 30 58344889\n", "11564773 20190420 75 58742596\n", "11652475 20190420 43 60089397\n", "\n", "[729 rows x 3 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "infobox.query('infobox_count>25')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Rate of change between dump days" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rate of change: 0.00149698163222\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pages_w_infoboxextract_date
0583908320190401
1584782420190420
\n", "
" ], "text/plain": [ " pages_w_infobox extract_date\n", "0 5839083 20190401\n", "1 5847824 20190420" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# calculate rate of change\n", "q = \"\"\"\n", "SELECT COUNT(DISTINCT page_id) AS pages_w_infobox, dt AS extract_date\n", "FROM \n", " ryanmax.infobox_count\n", "GROUP BY extract_date\n", "ORDER BY extract_date\n", "\"\"\"\n", "ibs = spark.sql(q)\n", "ibs_pd = sqlContext.createDataFrame(ibs.rdd).toPandas()\n", "print('Rate of change: ',(ibs_pd['pages_w_infobox'][1] - ibs_pd['pages_w_infobox'][0])/ibs_pd['pages_w_infobox'][0])\n", "ibs_pd\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---------------+\n", "|pages_w_infobox|\n", "+---------------+\n", "| 5847824|\n", "+---------------+\n", "\n" ] } ], "source": [ "# write data for 2019-04-20 and report out link count\n", "ibs = spark.sql(\"SELECT * FROM ryanmax.infobox_count WHERE dt = '20190420'\")\n", "ibs_df = sqlContext.createDataFrame(ibs.rdd)\n", "ibs_df.createOrReplaceTempView(\"temp_infobox\")\n", "sqlContext.sql(\"DROP TABLE IF EXISTS ryanmax.population_infobox\")\n", "sqlContext.sql(\"CREATE TABLE ryanmax.population_infobox AS SELECT * FROM temp_infobox\")\n", "spark.sql(\"select COUNT(DISTINCT page_id) AS pages_w_infobox from ryanmax.population_infobox\").show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Infobox Counts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Count of pages with infoboxes in W pages with external links" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---------------+\n", "|pages_w_infobox|\n", "+---------------+\n", "| 5209322|\n", "+---------------+\n", "\n" ] } ], "source": [ "# Count of pages with infoboxes in W pages with external links\n", "w_infobox_query = \"\"\"\n", "SELECT COUNT(DISTINCT page_id) AS pages_w_infobox\n", "FROM \n", " ryanmax.population_infobox\n", "WHERE page_id IN \n", " (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)\n", "\"\"\"\n", "\n", "w_infobox = spark.sql(w_infobox_query)\n", "w_infobox.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Count of pages with infoboxes in WP:M pages with external links" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---------------+\n", "|pages_w_infobox|\n", "+---------------+\n", "| 32597|\n", "+---------------+\n", "\n" ] } ], "source": [ "# Count of pages with infoboxes in WP:M pages with external links\n", "pm_infobox_query = \"\"\"\n", "SELECT COUNT(DISTINCT page_id) AS pages_w_infobox\n", "FROM \n", " ryanmax.population_infobox\n", "WHERE page_id IN \n", " (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", "\"\"\"\n", "\n", "pm_infobox = spark.sql(pm_infobox_query)\n", "pm_infobox.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Number of infoboxes per page in W pages with external links" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+---+---------------------+--------------------+--------------------+-----------------+\n", "|min|max|average_infobox_count|median_infobox_count|stddev_infobox_count|iqr_infobox_count|\n", "+---+---+---------------------+--------------------+--------------------+-----------------+\n", "| 0|346| 0.67| 1.0| 0.8573307965874148| 1.0|\n", "+---+---+---------------------+--------------------+--------------------+-----------------+\n", "\n" ] } ], "source": [ "# Number of infoboxes per page in W pages with external links\n", "w_infobox_cnts_query = \"\"\"\n", "SELECT MIN(infobox_count) as min,\n", " MAX(infobox_count) as max, \n", " CAST(AVG(infobox_count) AS DECIMAL(10,2)) AS average_infobox_count,\n", " PERCENTILE(infobox_count,0.5) AS median_infobox_count,\n", " STDDEV(infobox_count) as stddev_infobox_count,\n", " (PERCENTILE(infobox_count,0.75) - PERCENTILE(infobox_count,0.25)) as iqr_infobox_count\n", "FROM \n", " ryanmax.population_infobox\n", "WHERE page_id IN \n", " (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)\n", "\"\"\"\n", "\n", "w_infobox_cnts = spark.sql(w_infobox_cnts_query)\n", "w_infobox_cnts.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Number of infoboxes per page in WP:M pages with external links" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+---+---------------------+--------------------+--------------------+-----------------+\n", "|min|max|average_infobox_count|median_infobox_count|stddev_infobox_count|iqr_infobox_count|\n", "+---+---+---------------------+--------------------+--------------------+-----------------+\n", "| 0| 11| 0.49| 0.0| 0.5235718191828268| 1.0|\n", "+---+---+---------------------+--------------------+--------------------+-----------------+\n", "\n" ] } ], "source": [ "# Number of infoboxes per page in WP:M pages with external links\n", "pm_infobox_cnts_query = \"\"\"\n", "SELECT MIN(infobox_count) as min,\n", " MAX(infobox_count) as max, \n", " CAST(AVG(infobox_count) AS DECIMAL(10,2)) AS average_infobox_count,\n", " PERCENTILE(infobox_count,0.5) AS median_infobox_count,\n", " STDDEV(infobox_count) as stddev_infobox_count,\n", " (PERCENTILE(infobox_count,0.75) - PERCENTILE(infobox_count,0.25)) as iqr_infobox_count\n", "FROM \n", " ryanmax.population_infobox\n", "WHERE page_id IN \n", " (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", "\"\"\"\n", "\n", "pm_infobox_cnts = spark.sql(pm_infobox_cnts_query)\n", "pm_infobox_cnts.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark large: spark-xml jar and local venv path ", "language": "python", "name": "spark-ryanmax" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }