{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Infobox Counts from XML Dumps\n",
    "- extract infobox data from dump files\n",
    "- create and populate `population_infobox` table\n",
    "- report infobox counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n",
    "%run -i 'data-defaults.py'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "WIKIPEDIA_XML_DUMPS = ['enwiki-20190401-pages-articles-multistream.xml.bz2',\n",
    "                       'enwiki-20190420-pages-articles-multistream.xml.bz2']\n",
    "\n",
    "INFOBOX_REGEX = re.compile(r'\\{ *infobox ', re.IGNORECASE)\n",
    "\n",
    "def count_infobox(entity, date):\n",
    "    page_text = entity.revision.text._VALUE\n",
    "    count = len(INFOBOX_REGEX.findall(page_text))\n",
    "    return Row(page_id=entity.id, infobox_count=count, dt=date)\n",
    "\n",
    "infobox_rdd = sc.emptyRDD()\n",
    "for file in WIKIPEDIA_XML_DUMPS:\n",
    "    wikipedia = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='page').load(file)\n",
    "    dump_date = re.search(r'.*(\\d{8}).*',file).group(1)\n",
    "    articles = wikipedia\\\n",
    "        .filter(\"ns = '0'\")\\\n",
    "        .filter(\"redirect._title is null\") \\\n",
    "        .filter(\"revision.text._VALUE is not null\") \\\n",
    "        .filter(\"length(revision.text._VALUE) > 0\")\n",
    "    daily_counts = sqlContext.createDataFrame(articles.rdd.map(lambda entity: count_infobox(entity, dump_date)))\n",
    "    infobox_rdd = infobox_rdd.union(daily_counts.rdd)\n",
    "\n",
    "infobox_merged = sqlContext.createDataFrame(infobox_rdd)\n",
    "infobox = infobox_merged.toPandas()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dt</th>\n",
       "      <th>infobox_count</th>\n",
       "      <th>page_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20190401</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>20190401</td>\n",
       "      <td>1</td>\n",
       "      <td>290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20190401</td>\n",
       "      <td>2</td>\n",
       "      <td>303</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>305</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>20190401</td>\n",
       "      <td>2</td>\n",
       "      <td>307</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>20190401</td>\n",
       "      <td>1</td>\n",
       "      <td>308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>309</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>20190401</td>\n",
       "      <td>1</td>\n",
       "      <td>316</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>20190401</td>\n",
       "      <td>2</td>\n",
       "      <td>324</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>20190401</td>\n",
       "      <td>1</td>\n",
       "      <td>330</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>20190401</td>\n",
       "      <td>2</td>\n",
       "      <td>332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>334</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>336</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>20190401</td>\n",
       "      <td>1</td>\n",
       "      <td>339</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>20190401</td>\n",
       "      <td>1</td>\n",
       "      <td>340</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>20190401</td>\n",
       "      <td>1</td>\n",
       "      <td>344</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>20190401</td>\n",
       "      <td>1</td>\n",
       "      <td>358</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>359</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>569</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>572</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>573</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>579</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>580</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>20190401</td>\n",
       "      <td>1</td>\n",
       "      <td>586</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>590</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>20190401</td>\n",
       "      <td>0</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>20190401</td>\n",
       "      <td>1</td>\n",
       "      <td>594</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>20190401</td>\n",
       "      <td>1</td>\n",
       "      <td>595</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686877</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552524</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686878</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552528</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686879</th>\n",
       "      <td>20190420</td>\n",
       "      <td>0</td>\n",
       "      <td>60552534</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686880</th>\n",
       "      <td>20190420</td>\n",
       "      <td>0</td>\n",
       "      <td>60552561</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686881</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552576</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686882</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552592</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686883</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552597</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686884</th>\n",
       "      <td>20190420</td>\n",
       "      <td>0</td>\n",
       "      <td>60552601</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686885</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552606</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686886</th>\n",
       "      <td>20190420</td>\n",
       "      <td>0</td>\n",
       "      <td>60552607</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686887</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552611</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686888</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552619</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686889</th>\n",
       "      <td>20190420</td>\n",
       "      <td>0</td>\n",
       "      <td>60552630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686890</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686891</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552643</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686892</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552649</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686893</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552656</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686894</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552665</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686895</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552674</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686896</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686897</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552679</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686898</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552689</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686899</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552690</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686900</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552709</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686901</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686902</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552724</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686903</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552730</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686904</th>\n",
       "      <td>20190420</td>\n",
       "      <td>0</td>\n",
       "      <td>60552736</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686905</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552745</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11686906</th>\n",
       "      <td>20190420</td>\n",
       "      <td>1</td>\n",
       "      <td>60552764</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>11686907 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                dt  infobox_count   page_id\n",
       "0         20190401              0        12\n",
       "1         20190401              1        25\n",
       "2         20190401              0        39\n",
       "3         20190401              1       290\n",
       "4         20190401              2       303\n",
       "5         20190401              0       305\n",
       "6         20190401              2       307\n",
       "7         20190401              1       308\n",
       "8         20190401              0       309\n",
       "9         20190401              1       316\n",
       "10        20190401              2       324\n",
       "11        20190401              1       330\n",
       "12        20190401              2       332\n",
       "13        20190401              0       334\n",
       "14        20190401              0       336\n",
       "15        20190401              1       339\n",
       "16        20190401              1       340\n",
       "17        20190401              1       344\n",
       "18        20190401              1       358\n",
       "19        20190401              0       359\n",
       "20        20190401              0       569\n",
       "21        20190401              0       572\n",
       "22        20190401              0       573\n",
       "23        20190401              0       579\n",
       "24        20190401              0       580\n",
       "25        20190401              1       586\n",
       "26        20190401              0       590\n",
       "27        20190401              0       593\n",
       "28        20190401              1       594\n",
       "29        20190401              1       595\n",
       "...            ...            ...       ...\n",
       "11686877  20190420              1  60552524\n",
       "11686878  20190420              1  60552528\n",
       "11686879  20190420              0  60552534\n",
       "11686880  20190420              0  60552561\n",
       "11686881  20190420              1  60552576\n",
       "11686882  20190420              1  60552592\n",
       "11686883  20190420              1  60552597\n",
       "11686884  20190420              0  60552601\n",
       "11686885  20190420              1  60552606\n",
       "11686886  20190420              0  60552607\n",
       "11686887  20190420              1  60552611\n",
       "11686888  20190420              1  60552619\n",
       "11686889  20190420              0  60552630\n",
       "11686890  20190420              1  60552632\n",
       "11686891  20190420              1  60552643\n",
       "11686892  20190420              1  60552649\n",
       "11686893  20190420              1  60552656\n",
       "11686894  20190420              1  60552665\n",
       "11686895  20190420              1  60552674\n",
       "11686896  20190420              1  60552678\n",
       "11686897  20190420              1  60552679\n",
       "11686898  20190420              1  60552689\n",
       "11686899  20190420              1  60552690\n",
       "11686900  20190420              1  60552709\n",
       "11686901  20190420              1  60552711\n",
       "11686902  20190420              1  60552724\n",
       "11686903  20190420              1  60552730\n",
       "11686904  20190420              0  60552736\n",
       "11686905  20190420              1  60552745\n",
       "11686906  20190420              1  60552764\n",
       "\n",
       "[11686907 rows x 3 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "infobox"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# write infobox counts data to a table for later use\n",
    "infobox_merged.createOrReplaceTempView(\"temp_infobox_count\")\n",
    "sqlContext.sql(\"DROP TABLE IF EXISTS ryanmax.infobox_count\")\n",
    "sqlContext.sql(\"CREATE TABLE ryanmax.infobox_count AS SELECT * FROM temp_infobox_count\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dt</th>\n",
       "      <th>infobox_count</th>\n",
       "      <th>page_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5011</th>\n",
       "      <td>20190401</td>\n",
       "      <td>49</td>\n",
       "      <td>10577</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6537</th>\n",
       "      <td>20190401</td>\n",
       "      <td>58</td>\n",
       "      <td>13696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49147</th>\n",
       "      <td>20190401</td>\n",
       "      <td>34</td>\n",
       "      <td>102962</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49165</th>\n",
       "      <td>20190401</td>\n",
       "      <td>27</td>\n",
       "      <td>102985</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49565</th>\n",
       "      <td>20190401</td>\n",
       "      <td>29</td>\n",
       "      <td>103782</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>162390</th>\n",
       "      <td>20190401</td>\n",
       "      <td>52</td>\n",
       "      <td>373334</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>164886</th>\n",
       "      <td>20190401</td>\n",
       "      <td>34</td>\n",
       "      <td>381506</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>169633</th>\n",
       "      <td>20190401</td>\n",
       "      <td>28</td>\n",
       "      <td>397295</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>276936</th>\n",
       "      <td>20190401</td>\n",
       "      <td>78</td>\n",
       "      <td>782670</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>285251</th>\n",
       "      <td>20190401</td>\n",
       "      <td>32</td>\n",
       "      <td>841648</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>293703</th>\n",
       "      <td>20190401</td>\n",
       "      <td>239</td>\n",
       "      <td>871024</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>309385</th>\n",
       "      <td>20190401</td>\n",
       "      <td>34</td>\n",
       "      <td>931088</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>324731</th>\n",
       "      <td>20190401</td>\n",
       "      <td>27</td>\n",
       "      <td>996290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>346730</th>\n",
       "      <td>20190401</td>\n",
       "      <td>34</td>\n",
       "      <td>1086718</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350842</th>\n",
       "      <td>20190401</td>\n",
       "      <td>35</td>\n",
       "      <td>1104268</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350865</th>\n",
       "      <td>20190401</td>\n",
       "      <td>36</td>\n",
       "      <td>1104332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350906</th>\n",
       "      <td>20190401</td>\n",
       "      <td>35</td>\n",
       "      <td>1104505</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350910</th>\n",
       "      <td>20190401</td>\n",
       "      <td>35</td>\n",
       "      <td>1104515</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350918</th>\n",
       "      <td>20190401</td>\n",
       "      <td>38</td>\n",
       "      <td>1104564</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>351288</th>\n",
       "      <td>20190401</td>\n",
       "      <td>35</td>\n",
       "      <td>1106128</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>351294</th>\n",
       "      <td>20190401</td>\n",
       "      <td>33</td>\n",
       "      <td>1106178</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>351311</th>\n",
       "      <td>20190401</td>\n",
       "      <td>33</td>\n",
       "      <td>1106241</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>351326</th>\n",
       "      <td>20190401</td>\n",
       "      <td>26</td>\n",
       "      <td>1106299</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>351346</th>\n",
       "      <td>20190401</td>\n",
       "      <td>29</td>\n",
       "      <td>1106375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>351506</th>\n",
       "      <td>20190401</td>\n",
       "      <td>28</td>\n",
       "      <td>1106937</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>357754</th>\n",
       "      <td>20190401</td>\n",
       "      <td>34</td>\n",
       "      <td>1130287</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>358597</th>\n",
       "      <td>20190401</td>\n",
       "      <td>36</td>\n",
       "      <td>1133844</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>358688</th>\n",
       "      <td>20190401</td>\n",
       "      <td>39</td>\n",
       "      <td>1134236</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>506704</th>\n",
       "      <td>20190401</td>\n",
       "      <td>28</td>\n",
       "      <td>1847173</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>526287</th>\n",
       "      <td>20190401</td>\n",
       "      <td>80</td>\n",
       "      <td>1945597</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11306108</th>\n",
       "      <td>20190420</td>\n",
       "      <td>31</td>\n",
       "      <td>55569842</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11308223</th>\n",
       "      <td>20190420</td>\n",
       "      <td>35</td>\n",
       "      <td>55595865</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11309929</th>\n",
       "      <td>20190420</td>\n",
       "      <td>123</td>\n",
       "      <td>55619912</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11315945</th>\n",
       "      <td>20190420</td>\n",
       "      <td>179</td>\n",
       "      <td>55697297</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11321659</th>\n",
       "      <td>20190420</td>\n",
       "      <td>26</td>\n",
       "      <td>55763651</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11325516</th>\n",
       "      <td>20190420</td>\n",
       "      <td>30</td>\n",
       "      <td>55809734</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11332444</th>\n",
       "      <td>20190420</td>\n",
       "      <td>27</td>\n",
       "      <td>55887267</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11342183</th>\n",
       "      <td>20190420</td>\n",
       "      <td>28</td>\n",
       "      <td>56002822</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11353183</th>\n",
       "      <td>20190420</td>\n",
       "      <td>119</td>\n",
       "      <td>56142356</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11357978</th>\n",
       "      <td>20190420</td>\n",
       "      <td>129</td>\n",
       "      <td>56192995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11371695</th>\n",
       "      <td>20190420</td>\n",
       "      <td>145</td>\n",
       "      <td>56363828</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11377401</th>\n",
       "      <td>20190420</td>\n",
       "      <td>142</td>\n",
       "      <td>56434290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11385423</th>\n",
       "      <td>20190420</td>\n",
       "      <td>130</td>\n",
       "      <td>56532039</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11385904</th>\n",
       "      <td>20190420</td>\n",
       "      <td>126</td>\n",
       "      <td>56538063</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11389134</th>\n",
       "      <td>20190420</td>\n",
       "      <td>136</td>\n",
       "      <td>56577813</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11413632</th>\n",
       "      <td>20190420</td>\n",
       "      <td>334</td>\n",
       "      <td>56887146</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11414205</th>\n",
       "      <td>20190420</td>\n",
       "      <td>145</td>\n",
       "      <td>56891883</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11435248</th>\n",
       "      <td>20190420</td>\n",
       "      <td>28</td>\n",
       "      <td>57147301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11472465</th>\n",
       "      <td>20190420</td>\n",
       "      <td>139</td>\n",
       "      <td>57437661</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11475831</th>\n",
       "      <td>20190420</td>\n",
       "      <td>122</td>\n",
       "      <td>57482185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11479726</th>\n",
       "      <td>20190420</td>\n",
       "      <td>141</td>\n",
       "      <td>57536902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11485287</th>\n",
       "      <td>20190420</td>\n",
       "      <td>136</td>\n",
       "      <td>57620451</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11486673</th>\n",
       "      <td>20190420</td>\n",
       "      <td>146</td>\n",
       "      <td>57637971</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11492147</th>\n",
       "      <td>20190420</td>\n",
       "      <td>128</td>\n",
       "      <td>57706926</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11501751</th>\n",
       "      <td>20190420</td>\n",
       "      <td>126</td>\n",
       "      <td>57839087</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11506322</th>\n",
       "      <td>20190420</td>\n",
       "      <td>71</td>\n",
       "      <td>57897975</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11520191</th>\n",
       "      <td>20190420</td>\n",
       "      <td>27</td>\n",
       "      <td>58072995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11537552</th>\n",
       "      <td>20190420</td>\n",
       "      <td>30</td>\n",
       "      <td>58344889</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11564773</th>\n",
       "      <td>20190420</td>\n",
       "      <td>75</td>\n",
       "      <td>58742596</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11652475</th>\n",
       "      <td>20190420</td>\n",
       "      <td>43</td>\n",
       "      <td>60089397</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>729 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                dt  infobox_count   page_id\n",
       "5011      20190401             49     10577\n",
       "6537      20190401             58     13696\n",
       "49147     20190401             34    102962\n",
       "49165     20190401             27    102985\n",
       "49565     20190401             29    103782\n",
       "162390    20190401             52    373334\n",
       "164886    20190401             34    381506\n",
       "169633    20190401             28    397295\n",
       "276936    20190401             78    782670\n",
       "285251    20190401             32    841648\n",
       "293703    20190401            239    871024\n",
       "309385    20190401             34    931088\n",
       "324731    20190401             27    996290\n",
       "346730    20190401             34   1086718\n",
       "350842    20190401             35   1104268\n",
       "350865    20190401             36   1104332\n",
       "350906    20190401             35   1104505\n",
       "350910    20190401             35   1104515\n",
       "350918    20190401             38   1104564\n",
       "351288    20190401             35   1106128\n",
       "351294    20190401             33   1106178\n",
       "351311    20190401             33   1106241\n",
       "351326    20190401             26   1106299\n",
       "351346    20190401             29   1106375\n",
       "351506    20190401             28   1106937\n",
       "357754    20190401             34   1130287\n",
       "358597    20190401             36   1133844\n",
       "358688    20190401             39   1134236\n",
       "506704    20190401             28   1847173\n",
       "526287    20190401             80   1945597\n",
       "...            ...            ...       ...\n",
       "11306108  20190420             31  55569842\n",
       "11308223  20190420             35  55595865\n",
       "11309929  20190420            123  55619912\n",
       "11315945  20190420            179  55697297\n",
       "11321659  20190420             26  55763651\n",
       "11325516  20190420             30  55809734\n",
       "11332444  20190420             27  55887267\n",
       "11342183  20190420             28  56002822\n",
       "11353183  20190420            119  56142356\n",
       "11357978  20190420            129  56192995\n",
       "11371695  20190420            145  56363828\n",
       "11377401  20190420            142  56434290\n",
       "11385423  20190420            130  56532039\n",
       "11385904  20190420            126  56538063\n",
       "11389134  20190420            136  56577813\n",
       "11413632  20190420            334  56887146\n",
       "11414205  20190420            145  56891883\n",
       "11435248  20190420             28  57147301\n",
       "11472465  20190420            139  57437661\n",
       "11475831  20190420            122  57482185\n",
       "11479726  20190420            141  57536902\n",
       "11485287  20190420            136  57620451\n",
       "11486673  20190420            146  57637971\n",
       "11492147  20190420            128  57706926\n",
       "11501751  20190420            126  57839087\n",
       "11506322  20190420             71  57897975\n",
       "11520191  20190420             27  58072995\n",
       "11537552  20190420             30  58344889\n",
       "11564773  20190420             75  58742596\n",
       "11652475  20190420             43  60089397\n",
       "\n",
       "[729 rows x 3 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "infobox.query('infobox_count>25')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Rate of change between dump days"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rate of change:  0.00149698163222\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pages_w_infobox</th>\n",
       "      <th>extract_date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5839083</td>\n",
       "      <td>20190401</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5847824</td>\n",
       "      <td>20190420</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pages_w_infobox extract_date\n",
       "0          5839083     20190401\n",
       "1          5847824     20190420"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate rate of change\n",
    "q = \"\"\"\n",
    "SELECT COUNT(DISTINCT page_id) AS pages_w_infobox, dt AS extract_date\n",
    "FROM \n",
    "    ryanmax.infobox_count\n",
    "GROUP BY extract_date\n",
    "ORDER BY extract_date\n",
    "\"\"\"\n",
    "ibs = spark.sql(q)\n",
    "ibs_pd = sqlContext.createDataFrame(ibs.rdd).toPandas()\n",
    "print('Rate of change: ',(ibs_pd['pages_w_infobox'][1] - ibs_pd['pages_w_infobox'][0])/ibs_pd['pages_w_infobox'][0])\n",
    "ibs_pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------------+\n",
      "|pages_w_infobox|\n",
      "+---------------+\n",
      "|        5847824|\n",
      "+---------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# write data for 2019-04-20 and report out link count\n",
    "ibs = spark.sql(\"SELECT * FROM ryanmax.infobox_count WHERE dt = '20190420'\")\n",
    "ibs_df = sqlContext.createDataFrame(ibs.rdd)\n",
    "ibs_df.createOrReplaceTempView(\"temp_infobox\")\n",
    "sqlContext.sql(\"DROP TABLE IF EXISTS ryanmax.population_infobox\")\n",
    "sqlContext.sql(\"CREATE TABLE ryanmax.population_infobox AS SELECT * FROM temp_infobox\")\n",
    "spark.sql(\"select COUNT(DISTINCT page_id) AS pages_w_infobox from ryanmax.population_infobox\").show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Infobox Counts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Count of pages with infoboxes in W pages with external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------------+\n",
      "|pages_w_infobox|\n",
      "+---------------+\n",
      "|        5209322|\n",
      "+---------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Count of pages with infoboxes in W pages with external links\n",
    "w_infobox_query = \"\"\"\n",
    "SELECT COUNT(DISTINCT page_id) AS pages_w_infobox\n",
    "FROM \n",
    "    ryanmax.population_infobox\n",
    "WHERE page_id IN \n",
    "    (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)\n",
    "\"\"\"\n",
    "\n",
    "w_infobox = spark.sql(w_infobox_query)\n",
    "w_infobox.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Count of pages with infoboxes in WP:M pages with external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------------+\n",
      "|pages_w_infobox|\n",
      "+---------------+\n",
      "|          32597|\n",
      "+---------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Count of pages with infoboxes in WP:M pages with external links\n",
    "pm_infobox_query = \"\"\"\n",
    "SELECT COUNT(DISTINCT page_id) AS pages_w_infobox\n",
    "FROM \n",
    "    ryanmax.population_infobox\n",
    "WHERE page_id IN \n",
    "    (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n",
    "\"\"\"\n",
    "\n",
    "pm_infobox = spark.sql(pm_infobox_query)\n",
    "pm_infobox.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Number of infoboxes per page in W pages with external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+---+---------------------+--------------------+--------------------+-----------------+\n",
      "|min|max|average_infobox_count|median_infobox_count|stddev_infobox_count|iqr_infobox_count|\n",
      "+---+---+---------------------+--------------------+--------------------+-----------------+\n",
      "|  0|346|                 0.67|                 1.0|  0.8573307965874148|              1.0|\n",
      "+---+---+---------------------+--------------------+--------------------+-----------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Number of infoboxes per page in W pages with external links\n",
    "w_infobox_cnts_query = \"\"\"\n",
    "SELECT MIN(infobox_count) as min,\n",
    "    MAX(infobox_count) as max, \n",
    "    CAST(AVG(infobox_count) AS DECIMAL(10,2)) AS average_infobox_count,\n",
    "    PERCENTILE(infobox_count,0.5) AS median_infobox_count,\n",
    "    STDDEV(infobox_count) as stddev_infobox_count,\n",
    "    (PERCENTILE(infobox_count,0.75) - PERCENTILE(infobox_count,0.25)) as iqr_infobox_count\n",
    "FROM \n",
    "    ryanmax.population_infobox\n",
    "WHERE page_id IN \n",
    "    (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)\n",
    "\"\"\"\n",
    "\n",
    "w_infobox_cnts = spark.sql(w_infobox_cnts_query)\n",
    "w_infobox_cnts.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Number of infoboxes per page in WP:M pages with external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+---+---------------------+--------------------+--------------------+-----------------+\n",
      "|min|max|average_infobox_count|median_infobox_count|stddev_infobox_count|iqr_infobox_count|\n",
      "+---+---+---------------------+--------------------+--------------------+-----------------+\n",
      "|  0| 11|                 0.49|                 0.0|  0.5235718191828268|              1.0|\n",
      "+---+---+---------------------+--------------------+--------------------+-----------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Number of infoboxes per page in WP:M pages with external links\n",
    "pm_infobox_cnts_query = \"\"\"\n",
    "SELECT MIN(infobox_count) as min,\n",
    "    MAX(infobox_count) as max, \n",
    "    CAST(AVG(infobox_count) AS DECIMAL(10,2)) AS average_infobox_count,\n",
    "    PERCENTILE(infobox_count,0.5) AS median_infobox_count,\n",
    "    STDDEV(infobox_count) as stddev_infobox_count,\n",
    "    (PERCENTILE(infobox_count,0.75) - PERCENTILE(infobox_count,0.25)) as iqr_infobox_count\n",
    "FROM \n",
    "    ryanmax.population_infobox\n",
    "WHERE page_id IN \n",
    "    (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n",
    "\"\"\"\n",
    "\n",
    "pm_infobox_cnts = spark.sql(pm_infobox_cnts_query)\n",
    "pm_infobox_cnts.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PySpark large: spark-xml jar and local venv path ",
   "language": "python",
   "name": "spark-ryanmax"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}