{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pageviews for W:PM and W pages with external links\n",
    "- pageview data source: [`wmf.pageview_hourly`](https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Traffic/Pageview_hourly) table\n",
    "- counts exclude bots/spiders (agent_type = 'user')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n",
    "%run -i 'data-defaults.py'\n",
    "# pandas float formatting\n",
    "pd.options.display.float_format = '{:.4f}'.format"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Total pageviews for WP:M pages with external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>access_method</th>\n",
       "      <th>total_pageviews</th>\n",
       "      <th>daily_average_pageviews</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>desktop</td>\n",
       "      <td>62650292</td>\n",
       "      <td>1957821.6250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>mobile web</td>\n",
       "      <td>124214376</td>\n",
       "      <td>3881699.2500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>mobile app</td>\n",
       "      <td>1150383</td>\n",
       "      <td>35949.4688</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  access_method  total_pageviews  daily_average_pageviews\n",
       "0       desktop         62650292             1957821.6250\n",
       "1    mobile web        124214376             3881699.2500\n",
       "2    mobile app          1150383               35949.4688"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wpm_query = \"\"\"\n",
    "SELECT access_method, SUM(view_count) AS total_pageviews, SUM(view_count)/{} AS daily_average_pageviews\n",
    "FROM wmf.pageview_hourly\n",
    "WHERE project = 'en.wikipedia'\n",
    "AND page_id IN\n",
    "    (SELECT DISTINCT page_id \n",
    "    FROM ryanmax.population_wpm_pages_with_extlinks)\n",
    "AND agent_type = 'user'\n",
    "AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'\n",
    "AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'\n",
    "GROUP BY access_method\n",
    "\"\"\"\n",
    "wpm_pageviews = spark.sql(wpm_query.format(days_in_study, start_date_string, end_date_string))\n",
    "wpm_pageviews_rdd = wpm_pageviews.rdd\n",
    "wpm_pageviews_df = sqlContext.createDataFrame(wpm_pageviews_rdd)\n",
    "wpm_pageviews_pandas = wpm_pageviews_df.toPandas()\n",
    "\n",
    "wpm_pageviews_pandas"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Total pageviews for W pages with external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>access_method</th>\n",
       "      <th>total_pageviews</th>\n",
       "      <th>daily_average_pageviews</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>desktop</td>\n",
       "      <td>3134600739</td>\n",
       "      <td>97956273.0938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>mobile web</td>\n",
       "      <td>4120797860</td>\n",
       "      <td>128774933.1250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>mobile app</td>\n",
       "      <td>54845509</td>\n",
       "      <td>1713922.1562</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  access_method  total_pageviews  daily_average_pageviews\n",
       "0       desktop       3134600739            97956273.0938\n",
       "1    mobile web       4120797860           128774933.1250\n",
       "2    mobile app         54845509             1713922.1562"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "w_query = \"\"\"\n",
    "SELECT access_method, SUM(view_count) AS total_pageviews, SUM(view_count)/{} AS daily_average_pageviews\n",
    "FROM wmf.pageview_hourly\n",
    "WHERE project = 'en.wikipedia'\n",
    "AND page_id IN\n",
    "    (SELECT DISTINCT page_id \n",
    "    FROM ryanmax.population_w_pages_with_extlinks)\n",
    "AND agent_type = 'user'\n",
    "AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'\n",
    "AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'\n",
    "GROUP BY access_method\n",
    "\"\"\"\n",
    "w_pageviews = spark.sql(w_query.format(days_in_study,start_date_string, end_date_string))\n",
    "w_pageviews_rdd = w_pageviews.rdd\n",
    "w_pageviews_df = sqlContext.createDataFrame(w_pageviews_rdd)\n",
    "w_pageviews_pandas = w_pageviews_df.toPandas()\n",
    "\n",
    "w_pageviews_pandas"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Top WP:M pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>page_id</th>\n",
       "      <th>page_title</th>\n",
       "      <th>total_pageviews</th>\n",
       "      <th>daily_average_pageviews</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>43573275</td>\n",
       "      <td>Elizabeth Holmes</td>\n",
       "      <td>1349527</td>\n",
       "      <td>42172.7188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>41779862</td>\n",
       "      <td>Theranos</td>\n",
       "      <td>546969</td>\n",
       "      <td>17092.7812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>58911</td>\n",
       "      <td>Measles</td>\n",
       "      <td>388340</td>\n",
       "      <td>12135.6250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>18079</td>\n",
       "      <td>Leonardo da Vinci</td>\n",
       "      <td>380421</td>\n",
       "      <td>11888.1562</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>27546</td>\n",
       "      <td>Sexual intercourse</td>\n",
       "      <td>371290</td>\n",
       "      <td>11602.8125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>37556</td>\n",
       "      <td>Asperger syndrome</td>\n",
       "      <td>336547</td>\n",
       "      <td>10517.0938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>791546</td>\n",
       "      <td>Ketogenic diet</td>\n",
       "      <td>319326</td>\n",
       "      <td>9978.9375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>4488176</td>\n",
       "      <td>Factitious disorder imposed on another</td>\n",
       "      <td>314542</td>\n",
       "      <td>9829.4375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>4501</td>\n",
       "      <td>Black Death</td>\n",
       "      <td>308057</td>\n",
       "      <td>9626.7812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>56880920</td>\n",
       "      <td>Ramesh Balwani</td>\n",
       "      <td>305280</td>\n",
       "      <td>9540.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>52135</td>\n",
       "      <td>Pneumonia</td>\n",
       "      <td>292179</td>\n",
       "      <td>9130.5938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>1232575</td>\n",
       "      <td>Suicide methods</td>\n",
       "      <td>290488</td>\n",
       "      <td>9077.7500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>44990</td>\n",
       "      <td>Ryan White</td>\n",
       "      <td>260600</td>\n",
       "      <td>8143.7500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>56483</td>\n",
       "      <td>Tourette syndrome</td>\n",
       "      <td>245082</td>\n",
       "      <td>7658.8125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>50601</td>\n",
       "      <td>Cystic fibrosis</td>\n",
       "      <td>241014</td>\n",
       "      <td>7531.6875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1232085</td>\n",
       "      <td>Cannabidiol</td>\n",
       "      <td>240093</td>\n",
       "      <td>7502.9062</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>149223</td>\n",
       "      <td>Borderline personality disorder</td>\n",
       "      <td>239415</td>\n",
       "      <td>7481.7188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>26652964</td>\n",
       "      <td>Intersex</td>\n",
       "      <td>229836</td>\n",
       "      <td>7182.3750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>63522</td>\n",
       "      <td>Crohn's disease</td>\n",
       "      <td>226383</td>\n",
       "      <td>7074.4688</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>30653</td>\n",
       "      <td>Tuberculosis</td>\n",
       "      <td>222910</td>\n",
       "      <td>6965.9375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>4531</td>\n",
       "      <td>Bipolar disorder</td>\n",
       "      <td>221744</td>\n",
       "      <td>6929.5000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>10024</td>\n",
       "      <td>MDMA</td>\n",
       "      <td>206809</td>\n",
       "      <td>6462.7812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>50603</td>\n",
       "      <td>Multiple sclerosis</td>\n",
       "      <td>206073</td>\n",
       "      <td>6439.7812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>17537</td>\n",
       "      <td>Lysergic acid diethylamide</td>\n",
       "      <td>198790</td>\n",
       "      <td>6212.1875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>1481886</td>\n",
       "      <td>Cannabis (drug)</td>\n",
       "      <td>197614</td>\n",
       "      <td>6175.4375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>5069516</td>\n",
       "      <td>HIV/AIDS</td>\n",
       "      <td>196495</td>\n",
       "      <td>6140.4688</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>141915</td>\n",
       "      <td>Fentanyl</td>\n",
       "      <td>195843</td>\n",
       "      <td>6120.0938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>7701</td>\n",
       "      <td>Cocaine</td>\n",
       "      <td>195679</td>\n",
       "      <td>6114.9688</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>244113</td>\n",
       "      <td>Lyme disease</td>\n",
       "      <td>192052</td>\n",
       "      <td>6001.6250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>27790</td>\n",
       "      <td>Schizophrenia</td>\n",
       "      <td>189400</td>\n",
       "      <td>5918.7500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>158400</td>\n",
       "      <td>Sepsis</td>\n",
       "      <td>186060</td>\n",
       "      <td>5814.3750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>22228064</td>\n",
       "      <td>Parkinson's disease</td>\n",
       "      <td>183926</td>\n",
       "      <td>5747.6875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>26743</td>\n",
       "      <td>Sigmund Freud</td>\n",
       "      <td>182681</td>\n",
       "      <td>5708.7812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>482534</td>\n",
       "      <td>Gabapentin</td>\n",
       "      <td>181754</td>\n",
       "      <td>5679.8125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>318049</td>\n",
       "      <td>Fibromyalgia</td>\n",
       "      <td>179980</td>\n",
       "      <td>5624.3750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>42043</td>\n",
       "      <td>Project MKUltra</td>\n",
       "      <td>175753</td>\n",
       "      <td>5492.2812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>101965</td>\n",
       "      <td>Psoriasis</td>\n",
       "      <td>175601</td>\n",
       "      <td>5487.5312</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>40017873</td>\n",
       "      <td>Diabetes mellitus</td>\n",
       "      <td>175558</td>\n",
       "      <td>5486.1875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>83406</td>\n",
       "      <td>Paracetamol</td>\n",
       "      <td>174939</td>\n",
       "      <td>5466.8438</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>491494</td>\n",
       "      <td>Ankylosing spondylitis</td>\n",
       "      <td>173697</td>\n",
       "      <td>5428.0312</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>37195273</td>\n",
       "      <td>Trypophobia</td>\n",
       "      <td>172821</td>\n",
       "      <td>5400.6562</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>65847</td>\n",
       "      <td>Vitiligo</td>\n",
       "      <td>172270</td>\n",
       "      <td>5383.4375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>25</td>\n",
       "      <td>Autism</td>\n",
       "      <td>169124</td>\n",
       "      <td>5285.1250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>11038318</td>\n",
       "      <td>Methamphetamine</td>\n",
       "      <td>166435</td>\n",
       "      <td>5201.0938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>443800</td>\n",
       "      <td>Shingles</td>\n",
       "      <td>164727</td>\n",
       "      <td>5147.7188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>649100</td>\n",
       "      <td>Adderall</td>\n",
       "      <td>164180</td>\n",
       "      <td>5130.6250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>250858</td>\n",
       "      <td>Tramadol</td>\n",
       "      <td>162858</td>\n",
       "      <td>5089.3125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>20556798</td>\n",
       "      <td>Myocardial infarction</td>\n",
       "      <td>160886</td>\n",
       "      <td>5027.6875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>19375577</td>\n",
       "      <td>Amyotrophic lateral sclerosis</td>\n",
       "      <td>160844</td>\n",
       "      <td>5026.3750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>234806</td>\n",
       "      <td>Diazepam</td>\n",
       "      <td>158085</td>\n",
       "      <td>4940.1562</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     page_id                              page_title  total_pageviews  \\\n",
       "0   43573275                        Elizabeth Holmes          1349527   \n",
       "1   41779862                                Theranos           546969   \n",
       "2      58911                                 Measles           388340   \n",
       "3      18079                       Leonardo da Vinci           380421   \n",
       "4      27546                      Sexual intercourse           371290   \n",
       "5      37556                       Asperger syndrome           336547   \n",
       "6     791546                          Ketogenic diet           319326   \n",
       "7    4488176  Factitious disorder imposed on another           314542   \n",
       "8       4501                             Black Death           308057   \n",
       "9   56880920                          Ramesh Balwani           305280   \n",
       "10     52135                               Pneumonia           292179   \n",
       "11   1232575                         Suicide methods           290488   \n",
       "12     44990                              Ryan White           260600   \n",
       "13     56483                       Tourette syndrome           245082   \n",
       "14     50601                         Cystic fibrosis           241014   \n",
       "15   1232085                             Cannabidiol           240093   \n",
       "16    149223         Borderline personality disorder           239415   \n",
       "17  26652964                                Intersex           229836   \n",
       "18     63522                         Crohn's disease           226383   \n",
       "19     30653                            Tuberculosis           222910   \n",
       "20      4531                        Bipolar disorder           221744   \n",
       "21     10024                                    MDMA           206809   \n",
       "22     50603                      Multiple sclerosis           206073   \n",
       "23     17537              Lysergic acid diethylamide           198790   \n",
       "24   1481886                         Cannabis (drug)           197614   \n",
       "25   5069516                                HIV/AIDS           196495   \n",
       "26    141915                                Fentanyl           195843   \n",
       "27      7701                                 Cocaine           195679   \n",
       "28    244113                            Lyme disease           192052   \n",
       "29     27790                           Schizophrenia           189400   \n",
       "30    158400                                  Sepsis           186060   \n",
       "31  22228064                     Parkinson's disease           183926   \n",
       "32     26743                           Sigmund Freud           182681   \n",
       "33    482534                              Gabapentin           181754   \n",
       "34    318049                            Fibromyalgia           179980   \n",
       "35     42043                         Project MKUltra           175753   \n",
       "36    101965                               Psoriasis           175601   \n",
       "37  40017873                       Diabetes mellitus           175558   \n",
       "38     83406                             Paracetamol           174939   \n",
       "39    491494                  Ankylosing spondylitis           173697   \n",
       "40  37195273                             Trypophobia           172821   \n",
       "41     65847                                Vitiligo           172270   \n",
       "42        25                                  Autism           169124   \n",
       "43  11038318                         Methamphetamine           166435   \n",
       "44    443800                                Shingles           164727   \n",
       "45    649100                                Adderall           164180   \n",
       "46    250858                                Tramadol           162858   \n",
       "47  20556798                   Myocardial infarction           160886   \n",
       "48  19375577           Amyotrophic lateral sclerosis           160844   \n",
       "49    234806                                Diazepam           158085   \n",
       "\n",
       "    daily_average_pageviews  \n",
       "0                42172.7188  \n",
       "1                17092.7812  \n",
       "2                12135.6250  \n",
       "3                11888.1562  \n",
       "4                11602.8125  \n",
       "5                10517.0938  \n",
       "6                 9978.9375  \n",
       "7                 9829.4375  \n",
       "8                 9626.7812  \n",
       "9                 9540.0000  \n",
       "10                9130.5938  \n",
       "11                9077.7500  \n",
       "12                8143.7500  \n",
       "13                7658.8125  \n",
       "14                7531.6875  \n",
       "15                7502.9062  \n",
       "16                7481.7188  \n",
       "17                7182.3750  \n",
       "18                7074.4688  \n",
       "19                6965.9375  \n",
       "20                6929.5000  \n",
       "21                6462.7812  \n",
       "22                6439.7812  \n",
       "23                6212.1875  \n",
       "24                6175.4375  \n",
       "25                6140.4688  \n",
       "26                6120.0938  \n",
       "27                6114.9688  \n",
       "28                6001.6250  \n",
       "29                5918.7500  \n",
       "30                5814.3750  \n",
       "31                5747.6875  \n",
       "32                5708.7812  \n",
       "33                5679.8125  \n",
       "34                5624.3750  \n",
       "35                5492.2812  \n",
       "36                5487.5312  \n",
       "37                5486.1875  \n",
       "38                5466.8438  \n",
       "39                5428.0312  \n",
       "40                5400.6562  \n",
       "41                5383.4375  \n",
       "42                5285.1250  \n",
       "43                5201.0938  \n",
       "44                5147.7188  \n",
       "45                5130.6250  \n",
       "46                5089.3125  \n",
       "47                5027.6875  \n",
       "48                5026.3750  \n",
       "49                4940.1562  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# most visited WP:M pages with external links\n",
    "top_wpm_query = \"\"\"\n",
    "SELECT pageview_hourly.page_id, population_page_titles_20190420.page_title, SUM(pageview_hourly.view_count) AS total_pageviews, SUM(pageview_hourly.view_count)/{} AS daily_average_pageviews\n",
    "FROM wmf.pageview_hourly\n",
    "LEFT JOIN ryanmax.population_page_titles_20190420 ON pageview_hourly.page_id = population_page_titles_20190420.page_id\n",
    "WHERE project = 'en.wikipedia'\n",
    "AND pageview_hourly.page_id IN\n",
    "    (SELECT DISTINCT page_id \n",
    "    FROM ryanmax.population_wpm_pages_with_extlinks)\n",
    "AND agent_type = 'user'\n",
    "AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'\n",
    "AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'\n",
    "GROUP BY pageview_hourly.page_id, population_page_titles_20190420.page_title \n",
    "ORDER BY total_pageviews DESC\n",
    "LIMIT 50\n",
    "\"\"\"\n",
    "\n",
    "top_wpm = spark.sql(top_wpm_query.format(days_in_study,start_date_string, end_date_string))\n",
    "top_wpm.limit(50).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PySpark - YARN",
   "language": "python",
   "name": "spark_yarn_pyspark"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}