{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Infobox Counts from XML Dumps\n", "- extract infobox data from dump files\n", "- create and populate `population_infobox` table\n", "- report infobox counts" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n", "%run -i 'data-defaults.py'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "WIKIPEDIA_XML_DUMPS = ['enwiki-20190401-pages-articles-multistream.xml.bz2',\n", " 'enwiki-20190420-pages-articles-multistream.xml.bz2']\n", "\n", "INFOBOX_REGEX = re.compile(r'\\{ *infobox ', re.IGNORECASE)\n", "\n", "def count_infobox(entity, date):\n", " page_text = entity.revision.text._VALUE\n", " count = len(INFOBOX_REGEX.findall(page_text))\n", " return Row(page_id=entity.id, infobox_count=count, dt=date)\n", "\n", "infobox_rdd = sc.emptyRDD()\n", "for file in WIKIPEDIA_XML_DUMPS:\n", " wikipedia = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='page').load(file)\n", " dump_date = re.search(r'.*(\\d{8}).*',file).group(1)\n", " articles = wikipedia\\\n", " .filter(\"ns = '0'\")\\\n", " .filter(\"redirect._title is null\") \\\n", " .filter(\"revision.text._VALUE is not null\") \\\n", " .filter(\"length(revision.text._VALUE) > 0\")\n", " daily_counts = sqlContext.createDataFrame(articles.rdd.map(lambda entity: count_infobox(entity, dump_date)))\n", " infobox_rdd = infobox_rdd.union(daily_counts.rdd)\n", "\n", "infobox_merged = sqlContext.createDataFrame(infobox_rdd)\n", "infobox = infobox_merged.toPandas()\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>dt</th>\n", " <th>infobox_count</th>\n", " <th>page_id</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>12</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>20190401</td>\n", " <td>1</td>\n", " <td>25</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>39</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>20190401</td>\n", " <td>1</td>\n", " <td>290</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>20190401</td>\n", " <td>2</td>\n", " <td>303</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>305</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>20190401</td>\n", " <td>2</td>\n", " <td>307</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>20190401</td>\n", " <td>1</td>\n", " <td>308</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>309</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>20190401</td>\n", " <td>1</td>\n", " <td>316</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>20190401</td>\n", " <td>2</td>\n", " <td>324</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>20190401</td>\n", " <td>1</td>\n", " <td>330</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>20190401</td>\n", " <td>2</td>\n", " <td>332</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>334</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>336</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>20190401</td>\n", " <td>1</td>\n", " <td>339</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>20190401</td>\n", " <td>1</td>\n", " <td>340</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>20190401</td>\n", " <td>1</td>\n", " <td>344</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>20190401</td>\n", " <td>1</td>\n", " <td>358</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>359</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>569</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>572</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>573</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>579</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>580</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>20190401</td>\n", " <td>1</td>\n", " <td>586</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>590</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>20190401</td>\n", " <td>0</td>\n", " <td>593</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>20190401</td>\n", " <td>1</td>\n", " <td>594</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>20190401</td>\n", " <td>1</td>\n", " <td>595</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>11686877</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552524</td>\n", " </tr>\n", " <tr>\n", " <th>11686878</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552528</td>\n", " </tr>\n", " <tr>\n", " <th>11686879</th>\n", " <td>20190420</td>\n", " <td>0</td>\n", " <td>60552534</td>\n", " </tr>\n", " <tr>\n", " <th>11686880</th>\n", " <td>20190420</td>\n", " <td>0</td>\n", " <td>60552561</td>\n", " </tr>\n", " <tr>\n", " <th>11686881</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552576</td>\n", " </tr>\n", " <tr>\n", " <th>11686882</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552592</td>\n", " </tr>\n", " <tr>\n", " <th>11686883</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552597</td>\n", " </tr>\n", " <tr>\n", " <th>11686884</th>\n", " <td>20190420</td>\n", " <td>0</td>\n", " <td>60552601</td>\n", " </tr>\n", " <tr>\n", " <th>11686885</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552606</td>\n", " </tr>\n", " <tr>\n", " <th>11686886</th>\n", " <td>20190420</td>\n", " <td>0</td>\n", " <td>60552607</td>\n", " </tr>\n", " <tr>\n", " <th>11686887</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552611</td>\n", " </tr>\n", " <tr>\n", " <th>11686888</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552619</td>\n", " </tr>\n", " <tr>\n", " <th>11686889</th>\n", " <td>20190420</td>\n", " <td>0</td>\n", " <td>60552630</td>\n", " </tr>\n", " <tr>\n", " <th>11686890</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552632</td>\n", " </tr>\n", " <tr>\n", " <th>11686891</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552643</td>\n", " </tr>\n", " <tr>\n", " <th>11686892</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552649</td>\n", " </tr>\n", " <tr>\n", " <th>11686893</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552656</td>\n", " </tr>\n", " <tr>\n", " <th>11686894</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552665</td>\n", " </tr>\n", " <tr>\n", " <th>11686895</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552674</td>\n", " </tr>\n", " <tr>\n", " <th>11686896</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552678</td>\n", " </tr>\n", " <tr>\n", " <th>11686897</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552679</td>\n", " </tr>\n", " <tr>\n", " <th>11686898</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552689</td>\n", " </tr>\n", " <tr>\n", " <th>11686899</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552690</td>\n", " </tr>\n", " <tr>\n", " <th>11686900</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552709</td>\n", " </tr>\n", " <tr>\n", " <th>11686901</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552711</td>\n", " </tr>\n", " <tr>\n", " <th>11686902</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552724</td>\n", " </tr>\n", " <tr>\n", " <th>11686903</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552730</td>\n", " </tr>\n", " <tr>\n", " <th>11686904</th>\n", " <td>20190420</td>\n", " <td>0</td>\n", " <td>60552736</td>\n", " </tr>\n", " <tr>\n", " <th>11686905</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552745</td>\n", " </tr>\n", " <tr>\n", " <th>11686906</th>\n", " <td>20190420</td>\n", " <td>1</td>\n", " <td>60552764</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>11686907 rows × 3 columns</p>\n", "</div>" ], "text/plain": [ " dt infobox_count page_id\n", "0 20190401 0 12\n", "1 20190401 1 25\n", "2 20190401 0 39\n", "3 20190401 1 290\n", "4 20190401 2 303\n", "5 20190401 0 305\n", "6 20190401 2 307\n", "7 20190401 1 308\n", "8 20190401 0 309\n", "9 20190401 1 316\n", "10 20190401 2 324\n", "11 20190401 1 330\n", "12 20190401 2 332\n", "13 20190401 0 334\n", "14 20190401 0 336\n", "15 20190401 1 339\n", "16 20190401 1 340\n", "17 20190401 1 344\n", "18 20190401 1 358\n", "19 20190401 0 359\n", "20 20190401 0 569\n", "21 20190401 0 572\n", "22 20190401 0 573\n", "23 20190401 0 579\n", "24 20190401 0 580\n", "25 20190401 1 586\n", "26 20190401 0 590\n", "27 20190401 0 593\n", "28 20190401 1 594\n", "29 20190401 1 595\n", "... ... ... ...\n", "11686877 20190420 1 60552524\n", "11686878 20190420 1 60552528\n", "11686879 20190420 0 60552534\n", "11686880 20190420 0 60552561\n", "11686881 20190420 1 60552576\n", "11686882 20190420 1 60552592\n", "11686883 20190420 1 60552597\n", "11686884 20190420 0 60552601\n", "11686885 20190420 1 60552606\n", "11686886 20190420 0 60552607\n", "11686887 20190420 1 60552611\n", "11686888 20190420 1 60552619\n", "11686889 20190420 0 60552630\n", "11686890 20190420 1 60552632\n", "11686891 20190420 1 60552643\n", "11686892 20190420 1 60552649\n", "11686893 20190420 1 60552656\n", "11686894 20190420 1 60552665\n", "11686895 20190420 1 60552674\n", "11686896 20190420 1 60552678\n", "11686897 20190420 1 60552679\n", "11686898 20190420 1 60552689\n", "11686899 20190420 1 60552690\n", "11686900 20190420 1 60552709\n", "11686901 20190420 1 60552711\n", "11686902 20190420 1 60552724\n", "11686903 20190420 1 60552730\n", "11686904 20190420 0 60552736\n", "11686905 20190420 1 60552745\n", "11686906 20190420 1 60552764\n", "\n", "[11686907 rows x 3 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "infobox" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DataFrame[]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# write infobox counts data to a table for later use\n", "infobox_merged.createOrReplaceTempView(\"temp_infobox_count\")\n", "sqlContext.sql(\"DROP TABLE IF EXISTS ryanmax.infobox_count\")\n", "sqlContext.sql(\"CREATE TABLE ryanmax.infobox_count AS SELECT * FROM temp_infobox_count\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>dt</th>\n", " <th>infobox_count</th>\n", " <th>page_id</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>5011</th>\n", " <td>20190401</td>\n", " <td>49</td>\n", " <td>10577</td>\n", " </tr>\n", " <tr>\n", " <th>6537</th>\n", " <td>20190401</td>\n", " <td>58</td>\n", " <td>13696</td>\n", " </tr>\n", " <tr>\n", " <th>49147</th>\n", " <td>20190401</td>\n", " <td>34</td>\n", " <td>102962</td>\n", " </tr>\n", " <tr>\n", " <th>49165</th>\n", " <td>20190401</td>\n", " <td>27</td>\n", " <td>102985</td>\n", " </tr>\n", " <tr>\n", " <th>49565</th>\n", " <td>20190401</td>\n", " <td>29</td>\n", " <td>103782</td>\n", " </tr>\n", " <tr>\n", " <th>162390</th>\n", " <td>20190401</td>\n", " <td>52</td>\n", " <td>373334</td>\n", " </tr>\n", " <tr>\n", " <th>164886</th>\n", " <td>20190401</td>\n", " <td>34</td>\n", " <td>381506</td>\n", " </tr>\n", " <tr>\n", " <th>169633</th>\n", " <td>20190401</td>\n", " <td>28</td>\n", " <td>397295</td>\n", " </tr>\n", " <tr>\n", " <th>276936</th>\n", " <td>20190401</td>\n", " <td>78</td>\n", " <td>782670</td>\n", " </tr>\n", " <tr>\n", " <th>285251</th>\n", " <td>20190401</td>\n", " <td>32</td>\n", " <td>841648</td>\n", " </tr>\n", " <tr>\n", " <th>293703</th>\n", " <td>20190401</td>\n", " <td>239</td>\n", " <td>871024</td>\n", " </tr>\n", " <tr>\n", " <th>309385</th>\n", " <td>20190401</td>\n", " <td>34</td>\n", " <td>931088</td>\n", " </tr>\n", " <tr>\n", " <th>324731</th>\n", " <td>20190401</td>\n", " <td>27</td>\n", " <td>996290</td>\n", " </tr>\n", " <tr>\n", " <th>346730</th>\n", " <td>20190401</td>\n", " <td>34</td>\n", " <td>1086718</td>\n", " </tr>\n", " <tr>\n", " <th>350842</th>\n", " <td>20190401</td>\n", " <td>35</td>\n", " <td>1104268</td>\n", " </tr>\n", " <tr>\n", " <th>350865</th>\n", " <td>20190401</td>\n", " <td>36</td>\n", " <td>1104332</td>\n", " </tr>\n", " <tr>\n", " <th>350906</th>\n", " <td>20190401</td>\n", " <td>35</td>\n", " <td>1104505</td>\n", " </tr>\n", " <tr>\n", " <th>350910</th>\n", " <td>20190401</td>\n", " <td>35</td>\n", " <td>1104515</td>\n", " </tr>\n", " <tr>\n", " <th>350918</th>\n", " <td>20190401</td>\n", " <td>38</td>\n", " <td>1104564</td>\n", " </tr>\n", " <tr>\n", " <th>351288</th>\n", " <td>20190401</td>\n", " <td>35</td>\n", " <td>1106128</td>\n", " </tr>\n", " <tr>\n", " <th>351294</th>\n", " <td>20190401</td>\n", " <td>33</td>\n", " <td>1106178</td>\n", " </tr>\n", " <tr>\n", " <th>351311</th>\n", " <td>20190401</td>\n", " <td>33</td>\n", " <td>1106241</td>\n", " </tr>\n", " <tr>\n", " <th>351326</th>\n", " <td>20190401</td>\n", " <td>26</td>\n", " <td>1106299</td>\n", " </tr>\n", " <tr>\n", " <th>351346</th>\n", " <td>20190401</td>\n", " <td>29</td>\n", " <td>1106375</td>\n", " </tr>\n", " <tr>\n", " <th>351506</th>\n", " <td>20190401</td>\n", " <td>28</td>\n", " <td>1106937</td>\n", " </tr>\n", " <tr>\n", " <th>357754</th>\n", " <td>20190401</td>\n", " <td>34</td>\n", " <td>1130287</td>\n", " </tr>\n", " <tr>\n", " <th>358597</th>\n", " <td>20190401</td>\n", " <td>36</td>\n", " <td>1133844</td>\n", " </tr>\n", " <tr>\n", " <th>358688</th>\n", " <td>20190401</td>\n", " <td>39</td>\n", " <td>1134236</td>\n", " </tr>\n", " <tr>\n", " <th>506704</th>\n", " <td>20190401</td>\n", " <td>28</td>\n", " <td>1847173</td>\n", " </tr>\n", " <tr>\n", " <th>526287</th>\n", " <td>20190401</td>\n", " <td>80</td>\n", " <td>1945597</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>11306108</th>\n", " <td>20190420</td>\n", " <td>31</td>\n", " <td>55569842</td>\n", " </tr>\n", " <tr>\n", " <th>11308223</th>\n", " <td>20190420</td>\n", " <td>35</td>\n", " <td>55595865</td>\n", " </tr>\n", " <tr>\n", " <th>11309929</th>\n", " <td>20190420</td>\n", " <td>123</td>\n", " <td>55619912</td>\n", " </tr>\n", " <tr>\n", " <th>11315945</th>\n", " <td>20190420</td>\n", " <td>179</td>\n", " <td>55697297</td>\n", " </tr>\n", " <tr>\n", " <th>11321659</th>\n", " <td>20190420</td>\n", " <td>26</td>\n", " <td>55763651</td>\n", " </tr>\n", " <tr>\n", " <th>11325516</th>\n", " <td>20190420</td>\n", " <td>30</td>\n", " <td>55809734</td>\n", " </tr>\n", " <tr>\n", " <th>11332444</th>\n", " <td>20190420</td>\n", " <td>27</td>\n", " <td>55887267</td>\n", " </tr>\n", " <tr>\n", " <th>11342183</th>\n", " <td>20190420</td>\n", " <td>28</td>\n", " <td>56002822</td>\n", " </tr>\n", " <tr>\n", " <th>11353183</th>\n", " <td>20190420</td>\n", " <td>119</td>\n", " <td>56142356</td>\n", " </tr>\n", " <tr>\n", " <th>11357978</th>\n", " <td>20190420</td>\n", " <td>129</td>\n", " <td>56192995</td>\n", " </tr>\n", " <tr>\n", " <th>11371695</th>\n", " <td>20190420</td>\n", " <td>145</td>\n", " <td>56363828</td>\n", " </tr>\n", " <tr>\n", " <th>11377401</th>\n", " <td>20190420</td>\n", " <td>142</td>\n", " <td>56434290</td>\n", " </tr>\n", " <tr>\n", " <th>11385423</th>\n", " <td>20190420</td>\n", " <td>130</td>\n", " <td>56532039</td>\n", " </tr>\n", " <tr>\n", " <th>11385904</th>\n", " <td>20190420</td>\n", " <td>126</td>\n", " <td>56538063</td>\n", " </tr>\n", " <tr>\n", " <th>11389134</th>\n", " <td>20190420</td>\n", " <td>136</td>\n", " <td>56577813</td>\n", " </tr>\n", " <tr>\n", " <th>11413632</th>\n", " <td>20190420</td>\n", " <td>334</td>\n", " <td>56887146</td>\n", " </tr>\n", " <tr>\n", " <th>11414205</th>\n", " <td>20190420</td>\n", " <td>145</td>\n", " <td>56891883</td>\n", " </tr>\n", " <tr>\n", " <th>11435248</th>\n", " <td>20190420</td>\n", " <td>28</td>\n", " <td>57147301</td>\n", " </tr>\n", " <tr>\n", " <th>11472465</th>\n", " <td>20190420</td>\n", " <td>139</td>\n", " <td>57437661</td>\n", " </tr>\n", " <tr>\n", " <th>11475831</th>\n", " <td>20190420</td>\n", " <td>122</td>\n", " <td>57482185</td>\n", " </tr>\n", " <tr>\n", " <th>11479726</th>\n", " <td>20190420</td>\n", " <td>141</td>\n", " <td>57536902</td>\n", " </tr>\n", " <tr>\n", " <th>11485287</th>\n", " <td>20190420</td>\n", " <td>136</td>\n", " <td>57620451</td>\n", " </tr>\n", " <tr>\n", " <th>11486673</th>\n", " <td>20190420</td>\n", " <td>146</td>\n", " <td>57637971</td>\n", " </tr>\n", " <tr>\n", " <th>11492147</th>\n", " <td>20190420</td>\n", " <td>128</td>\n", " <td>57706926</td>\n", " </tr>\n", " <tr>\n", " <th>11501751</th>\n", " <td>20190420</td>\n", " <td>126</td>\n", " <td>57839087</td>\n", " </tr>\n", " <tr>\n", " <th>11506322</th>\n", " <td>20190420</td>\n", " <td>71</td>\n", " <td>57897975</td>\n", " </tr>\n", " <tr>\n", " <th>11520191</th>\n", " <td>20190420</td>\n", " <td>27</td>\n", " <td>58072995</td>\n", " </tr>\n", " <tr>\n", " <th>11537552</th>\n", " <td>20190420</td>\n", " <td>30</td>\n", " <td>58344889</td>\n", " </tr>\n", " <tr>\n", " <th>11564773</th>\n", " <td>20190420</td>\n", " <td>75</td>\n", " <td>58742596</td>\n", " </tr>\n", " <tr>\n", " <th>11652475</th>\n", " <td>20190420</td>\n", " <td>43</td>\n", " <td>60089397</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>729 rows × 3 columns</p>\n", "</div>" ], "text/plain": [ " dt infobox_count page_id\n", "5011 20190401 49 10577\n", "6537 20190401 58 13696\n", "49147 20190401 34 102962\n", "49165 20190401 27 102985\n", "49565 20190401 29 103782\n", "162390 20190401 52 373334\n", "164886 20190401 34 381506\n", "169633 20190401 28 397295\n", "276936 20190401 78 782670\n", "285251 20190401 32 841648\n", "293703 20190401 239 871024\n", "309385 20190401 34 931088\n", "324731 20190401 27 996290\n", "346730 20190401 34 1086718\n", "350842 20190401 35 1104268\n", "350865 20190401 36 1104332\n", "350906 20190401 35 1104505\n", "350910 20190401 35 1104515\n", "350918 20190401 38 1104564\n", "351288 20190401 35 1106128\n", "351294 20190401 33 1106178\n", "351311 20190401 33 1106241\n", "351326 20190401 26 1106299\n", "351346 20190401 29 1106375\n", "351506 20190401 28 1106937\n", "357754 20190401 34 1130287\n", "358597 20190401 36 1133844\n", "358688 20190401 39 1134236\n", "506704 20190401 28 1847173\n", "526287 20190401 80 1945597\n", "... ... ... ...\n", "11306108 20190420 31 55569842\n", "11308223 20190420 35 55595865\n", "11309929 20190420 123 55619912\n", "11315945 20190420 179 55697297\n", "11321659 20190420 26 55763651\n", "11325516 20190420 30 55809734\n", "11332444 20190420 27 55887267\n", "11342183 20190420 28 56002822\n", "11353183 20190420 119 56142356\n", "11357978 20190420 129 56192995\n", "11371695 20190420 145 56363828\n", "11377401 20190420 142 56434290\n", "11385423 20190420 130 56532039\n", "11385904 20190420 126 56538063\n", "11389134 20190420 136 56577813\n", "11413632 20190420 334 56887146\n", "11414205 20190420 145 56891883\n", "11435248 20190420 28 57147301\n", "11472465 20190420 139 57437661\n", "11475831 20190420 122 57482185\n", "11479726 20190420 141 57536902\n", "11485287 20190420 136 57620451\n", "11486673 20190420 146 57637971\n", "11492147 20190420 128 57706926\n", "11501751 20190420 126 57839087\n", "11506322 20190420 71 57897975\n", "11520191 20190420 27 58072995\n", "11537552 20190420 30 58344889\n", "11564773 20190420 75 58742596\n", "11652475 20190420 43 60089397\n", "\n", "[729 rows x 3 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "infobox.query('infobox_count>25')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Rate of change between dump days" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rate of change: 0.00149698163222\n" ] }, { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>pages_w_infobox</th>\n", " <th>extract_date</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>5839083</td>\n", " <td>20190401</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>5847824</td>\n", " <td>20190420</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " pages_w_infobox extract_date\n", "0 5839083 20190401\n", "1 5847824 20190420" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# calculate rate of change\n", "q = \"\"\"\n", "SELECT COUNT(DISTINCT page_id) AS pages_w_infobox, dt AS extract_date\n", "FROM \n", " ryanmax.infobox_count\n", "GROUP BY extract_date\n", "ORDER BY extract_date\n", "\"\"\"\n", "ibs = spark.sql(q)\n", "ibs_pd = sqlContext.createDataFrame(ibs.rdd).toPandas()\n", "print('Rate of change: ',(ibs_pd['pages_w_infobox'][1] - ibs_pd['pages_w_infobox'][0])/ibs_pd['pages_w_infobox'][0])\n", "ibs_pd\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---------------+\n", "|pages_w_infobox|\n", "+---------------+\n", "| 5847824|\n", "+---------------+\n", "\n" ] } ], "source": [ "# write data for 2019-04-20 and report out link count\n", "ibs = spark.sql(\"SELECT * FROM ryanmax.infobox_count WHERE dt = '20190420'\")\n", "ibs_df = sqlContext.createDataFrame(ibs.rdd)\n", "ibs_df.createOrReplaceTempView(\"temp_infobox\")\n", "sqlContext.sql(\"DROP TABLE IF EXISTS ryanmax.population_infobox\")\n", "sqlContext.sql(\"CREATE TABLE ryanmax.population_infobox AS SELECT * FROM temp_infobox\")\n", "spark.sql(\"select COUNT(DISTINCT page_id) AS pages_w_infobox from ryanmax.population_infobox\").show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Infobox Counts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Count of pages with infoboxes in W pages with external links" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---------------+\n", "|pages_w_infobox|\n", "+---------------+\n", "| 5209322|\n", "+---------------+\n", "\n" ] } ], "source": [ "# Count of pages with infoboxes in W pages with external links\n", "w_infobox_query = \"\"\"\n", "SELECT COUNT(DISTINCT page_id) AS pages_w_infobox\n", "FROM \n", " ryanmax.population_infobox\n", "WHERE page_id IN \n", " (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)\n", "\"\"\"\n", "\n", "w_infobox = spark.sql(w_infobox_query)\n", "w_infobox.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Count of pages with infoboxes in WP:M pages with external links" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---------------+\n", "|pages_w_infobox|\n", "+---------------+\n", "| 32597|\n", "+---------------+\n", "\n" ] } ], "source": [ "# Count of pages with infoboxes in WP:M pages with external links\n", "pm_infobox_query = \"\"\"\n", "SELECT COUNT(DISTINCT page_id) AS pages_w_infobox\n", "FROM \n", " ryanmax.population_infobox\n", "WHERE page_id IN \n", " (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", "\"\"\"\n", "\n", "pm_infobox = spark.sql(pm_infobox_query)\n", "pm_infobox.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Number of infoboxes per page in W pages with external links" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+---+---------------------+--------------------+--------------------+-----------------+\n", "|min|max|average_infobox_count|median_infobox_count|stddev_infobox_count|iqr_infobox_count|\n", "+---+---+---------------------+--------------------+--------------------+-----------------+\n", "| 0|346| 0.67| 1.0| 0.8573307965874148| 1.0|\n", "+---+---+---------------------+--------------------+--------------------+-----------------+\n", "\n" ] } ], "source": [ "# Number of infoboxes per page in W pages with external links\n", "w_infobox_cnts_query = \"\"\"\n", "SELECT MIN(infobox_count) as min,\n", " MAX(infobox_count) as max, \n", " CAST(AVG(infobox_count) AS DECIMAL(10,2)) AS average_infobox_count,\n", " PERCENTILE(infobox_count,0.5) AS median_infobox_count,\n", " STDDEV(infobox_count) as stddev_infobox_count,\n", " (PERCENTILE(infobox_count,0.75) - PERCENTILE(infobox_count,0.25)) as iqr_infobox_count\n", "FROM \n", " ryanmax.population_infobox\n", "WHERE page_id IN \n", " (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)\n", "\"\"\"\n", "\n", "w_infobox_cnts = spark.sql(w_infobox_cnts_query)\n", "w_infobox_cnts.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Number of infoboxes per page in WP:M pages with external links" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+---+---------------------+--------------------+--------------------+-----------------+\n", "|min|max|average_infobox_count|median_infobox_count|stddev_infobox_count|iqr_infobox_count|\n", "+---+---+---------------------+--------------------+--------------------+-----------------+\n", "| 0| 11| 0.49| 0.0| 0.5235718191828268| 1.0|\n", "+---+---+---------------------+--------------------+--------------------+-----------------+\n", "\n" ] } ], "source": [ "# Number of infoboxes per page in WP:M pages with external links\n", "pm_infobox_cnts_query = \"\"\"\n", "SELECT MIN(infobox_count) as min,\n", " MAX(infobox_count) as max, \n", " CAST(AVG(infobox_count) AS DECIMAL(10,2)) AS average_infobox_count,\n", " PERCENTILE(infobox_count,0.5) AS median_infobox_count,\n", " STDDEV(infobox_count) as stddev_infobox_count,\n", " (PERCENTILE(infobox_count,0.75) - PERCENTILE(infobox_count,0.25)) as iqr_infobox_count\n", "FROM \n", " ryanmax.population_infobox\n", "WHERE page_id IN \n", " (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", "\"\"\"\n", "\n", "pm_infobox_cnts = spark.sql(pm_infobox_cnts_query)\n", "pm_infobox_cnts.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark large: spark-xml jar and local venv path ", "language": "python", "name": "spark-ryanmax" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }