{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# WP:M Page Classes/Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n",
    "%run -i 'data-defaults.py'\n",
    "# pandas float formatting\n",
    "pd.options.display.float_format = '{:.4f}'.format"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Total event count for each WP:M class (FA, A, GA, B, C, Start, Stub) by event type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>action</th>\n",
       "      <th>extClick</th>\n",
       "      <th>fnClick</th>\n",
       "      <th>fnHover</th>\n",
       "      <th>upClick</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>category</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>B-Class_medicine_articles</th>\n",
       "      <td>197327.0000</td>\n",
       "      <td>332716.0000</td>\n",
       "      <td>608502.0000</td>\n",
       "      <td>19064.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C-Class_medicine_articles</th>\n",
       "      <td>212225.0000</td>\n",
       "      <td>238659.0000</td>\n",
       "      <td>413065.0000</td>\n",
       "      <td>12359.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Category-Class_medicine_articles</th>\n",
       "      <td>12.0000</td>\n",
       "      <td>6.0000</td>\n",
       "      <td>13.0000</td>\n",
       "      <td>nan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Disambig-Class_medicine_articles</th>\n",
       "      <td>164.0000</td>\n",
       "      <td>27.0000</td>\n",
       "      <td>29.0000</td>\n",
       "      <td>2.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FA-Class_medicine_articles</th>\n",
       "      <td>14235.0000</td>\n",
       "      <td>29790.0000</td>\n",
       "      <td>70200.0000</td>\n",
       "      <td>2152.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FL-Class_medicine_articles</th>\n",
       "      <td>425.0000</td>\n",
       "      <td>911.0000</td>\n",
       "      <td>1260.0000</td>\n",
       "      <td>40.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GA-Class_medicine_articles</th>\n",
       "      <td>26627.0000</td>\n",
       "      <td>65354.0000</td>\n",
       "      <td>139347.0000</td>\n",
       "      <td>8036.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>List-Class_medicine_articles</th>\n",
       "      <td>12431.0000</td>\n",
       "      <td>10669.0000</td>\n",
       "      <td>18497.0000</td>\n",
       "      <td>187.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Redirect-Class_medicine_articles</th>\n",
       "      <td>148.0000</td>\n",
       "      <td>98.0000</td>\n",
       "      <td>106.0000</td>\n",
       "      <td>2.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Start-Class_medicine_articles</th>\n",
       "      <td>291457.0000</td>\n",
       "      <td>190972.0000</td>\n",
       "      <td>287123.0000</td>\n",
       "      <td>6710.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Stub-Class_medicine_articles</th>\n",
       "      <td>70427.0000</td>\n",
       "      <td>17996.0000</td>\n",
       "      <td>21208.0000</td>\n",
       "      <td>700.0000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "action                              extClick     fnClick     fnHover  \\\n",
       "category                                                               \n",
       "B-Class_medicine_articles        197327.0000 332716.0000 608502.0000   \n",
       "C-Class_medicine_articles        212225.0000 238659.0000 413065.0000   \n",
       "Category-Class_medicine_articles     12.0000      6.0000     13.0000   \n",
       "Disambig-Class_medicine_articles    164.0000     27.0000     29.0000   \n",
       "FA-Class_medicine_articles        14235.0000  29790.0000  70200.0000   \n",
       "FL-Class_medicine_articles          425.0000    911.0000   1260.0000   \n",
       "GA-Class_medicine_articles        26627.0000  65354.0000 139347.0000   \n",
       "List-Class_medicine_articles      12431.0000  10669.0000  18497.0000   \n",
       "Redirect-Class_medicine_articles    148.0000     98.0000    106.0000   \n",
       "Start-Class_medicine_articles    291457.0000 190972.0000 287123.0000   \n",
       "Stub-Class_medicine_articles      70427.0000  17996.0000  21208.0000   \n",
       "\n",
       "action                              upClick  \n",
       "category                                     \n",
       "B-Class_medicine_articles        19064.0000  \n",
       "C-Class_medicine_articles        12359.0000  \n",
       "Category-Class_medicine_articles        nan  \n",
       "Disambig-Class_medicine_articles     2.0000  \n",
       "FA-Class_medicine_articles        2152.0000  \n",
       "FL-Class_medicine_articles          40.0000  \n",
       "GA-Class_medicine_articles        8036.0000  \n",
       "List-Class_medicine_articles       187.0000  \n",
       "Redirect-Class_medicine_articles     2.0000  \n",
       "Start-Class_medicine_articles     6710.0000  \n",
       "Stub-Class_medicine_articles       700.0000  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total event count for each WP:M class (FA, A, GA, B, C, Start, Stub) by event type\n",
    "pm_category_events_query = \"\"\"\n",
    "SELECT projmed_categories.category, action, count(*) count\n",
    "FROM \n",
    "    citationusage, \n",
    "    (SELECT DISTINCT page_id, category \n",
    "    FROM ryanmax.projmed_categories \n",
    "    WHERE projmed_categories.category LIKE '%Class_medicine_articles%'\n",
    "    AND to_date(projmed_categories.dt) = '2019-04-20'\n",
    "    )\n",
    "    AS projmed_categories\n",
    "WHERE citationusage.page_id = projmed_categories.page_id\n",
    "    AND wiki = 'enwiki'\n",
    "    AND citationusage.page_id IN \n",
    "        (SELECT page_id \n",
    "        FROM ryanmax.population_wpm_pages_with_extlinks\n",
    "        )\n",
    "    {}\n",
    "    AND to_date(event_time) >= '{}'\n",
    "    AND to_date(event_time) <= '{}'\n",
    "    AND useragent_is_bot = FALSE\n",
    "GROUP BY projmed_categories.category, action\n",
    "ORDER BY projmed_categories.category, action\n",
    "\"\"\"\n",
    "\n",
    "pm_category_events = spark.sql(\n",
    "    pm_category_events_query.format(\n",
    "        event_exclusion_sql, start_date_string, end_date_string\n",
    "    ))\n",
    "pm_category_events_rdd = pm_category_events.rdd\n",
    "pm_category_events_df = sqlContext.createDataFrame(pm_category_events_rdd)\n",
    "pm_category_events_pandas = pm_category_events_df.toPandas()\n",
    "pm_category_events_pandas.pivot(index='category', columns='action', values='count')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Daily average event counts for each WP:M class (FA, A, GA, B, C, Start, Stub)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Totals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>total_events</th>\n",
       "      <th>daily_average</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>category</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>B-Class_medicine_articles</th>\n",
       "      <td>1157609</td>\n",
       "      <td>36175.2812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C-Class_medicine_articles</th>\n",
       "      <td>876308</td>\n",
       "      <td>27384.6250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Category-Class_medicine_articles</th>\n",
       "      <td>31</td>\n",
       "      <td>0.9688</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Disambig-Class_medicine_articles</th>\n",
       "      <td>222</td>\n",
       "      <td>6.9375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FA-Class_medicine_articles</th>\n",
       "      <td>116377</td>\n",
       "      <td>3636.7812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FL-Class_medicine_articles</th>\n",
       "      <td>2636</td>\n",
       "      <td>82.3750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GA-Class_medicine_articles</th>\n",
       "      <td>239364</td>\n",
       "      <td>7480.1250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>List-Class_medicine_articles</th>\n",
       "      <td>41784</td>\n",
       "      <td>1305.7500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Redirect-Class_medicine_articles</th>\n",
       "      <td>354</td>\n",
       "      <td>11.0625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Start-Class_medicine_articles</th>\n",
       "      <td>776262</td>\n",
       "      <td>24258.1875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Stub-Class_medicine_articles</th>\n",
       "      <td>110331</td>\n",
       "      <td>3447.8438</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        count              \n",
       "                                 total_events daily_average\n",
       "category                                                   \n",
       "B-Class_medicine_articles             1157609    36175.2812\n",
       "C-Class_medicine_articles              876308    27384.6250\n",
       "Category-Class_medicine_articles           31        0.9688\n",
       "Disambig-Class_medicine_articles          222        6.9375\n",
       "FA-Class_medicine_articles             116377     3636.7812\n",
       "FL-Class_medicine_articles               2636       82.3750\n",
       "GA-Class_medicine_articles             239364     7480.1250\n",
       "List-Class_medicine_articles            41784     1305.7500\n",
       "Redirect-Class_medicine_articles          354       11.0625\n",
       "Start-Class_medicine_articles          776262    24258.1875\n",
       "Stub-Class_medicine_articles           110331     3447.8438"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pm_category_events_pandas.groupby(['category']).agg(\n",
    "    [\n",
    "        ('total_events','sum'),\n",
    "        ('daily_average', lambda x: sum(x)/days_in_study)\n",
    "    ])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### By event type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>action</th>\n",
       "      <th>extClick</th>\n",
       "      <th>fnClick</th>\n",
       "      <th>fnHover</th>\n",
       "      <th>upClick</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>category</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>B-Class_medicine_articles</th>\n",
       "      <td>6166.4688</td>\n",
       "      <td>10397.3750</td>\n",
       "      <td>19015.6875</td>\n",
       "      <td>595.7500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C-Class_medicine_articles</th>\n",
       "      <td>6632.0312</td>\n",
       "      <td>7458.0938</td>\n",
       "      <td>12908.2812</td>\n",
       "      <td>386.2188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Category-Class_medicine_articles</th>\n",
       "      <td>0.3750</td>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.4062</td>\n",
       "      <td>nan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Disambig-Class_medicine_articles</th>\n",
       "      <td>5.1250</td>\n",
       "      <td>0.8438</td>\n",
       "      <td>0.9062</td>\n",
       "      <td>0.0625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FA-Class_medicine_articles</th>\n",
       "      <td>444.8438</td>\n",
       "      <td>930.9375</td>\n",
       "      <td>2193.7500</td>\n",
       "      <td>67.2500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FL-Class_medicine_articles</th>\n",
       "      <td>13.2812</td>\n",
       "      <td>28.4688</td>\n",
       "      <td>39.3750</td>\n",
       "      <td>1.2500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GA-Class_medicine_articles</th>\n",
       "      <td>832.0938</td>\n",
       "      <td>2042.3125</td>\n",
       "      <td>4354.5938</td>\n",
       "      <td>251.1250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>List-Class_medicine_articles</th>\n",
       "      <td>388.4688</td>\n",
       "      <td>333.4062</td>\n",
       "      <td>578.0312</td>\n",
       "      <td>5.8438</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Redirect-Class_medicine_articles</th>\n",
       "      <td>4.6250</td>\n",
       "      <td>3.0625</td>\n",
       "      <td>3.3125</td>\n",
       "      <td>0.0625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Start-Class_medicine_articles</th>\n",
       "      <td>9108.0312</td>\n",
       "      <td>5967.8750</td>\n",
       "      <td>8972.5938</td>\n",
       "      <td>209.6875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Stub-Class_medicine_articles</th>\n",
       "      <td>2200.8438</td>\n",
       "      <td>562.3750</td>\n",
       "      <td>662.7500</td>\n",
       "      <td>21.8750</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "action                            extClick    fnClick    fnHover  upClick\n",
       "category                                                                 \n",
       "B-Class_medicine_articles        6166.4688 10397.3750 19015.6875 595.7500\n",
       "C-Class_medicine_articles        6632.0312  7458.0938 12908.2812 386.2188\n",
       "Category-Class_medicine_articles    0.3750     0.1875     0.4062      nan\n",
       "Disambig-Class_medicine_articles    5.1250     0.8438     0.9062   0.0625\n",
       "FA-Class_medicine_articles        444.8438   930.9375  2193.7500  67.2500\n",
       "FL-Class_medicine_articles         13.2812    28.4688    39.3750   1.2500\n",
       "GA-Class_medicine_articles        832.0938  2042.3125  4354.5938 251.1250\n",
       "List-Class_medicine_articles      388.4688   333.4062   578.0312   5.8438\n",
       "Redirect-Class_medicine_articles    4.6250     3.0625     3.3125   0.0625\n",
       "Start-Class_medicine_articles    9108.0312  5967.8750  8972.5938 209.6875\n",
       "Stub-Class_medicine_articles     2200.8438   562.3750   662.7500  21.8750"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pm_category_events_pandas['average'] = pm_category_events_pandas['count'].map(lambda x: x/days_in_study)\n",
    "pm_category_events_pandas.pivot(index='category', columns='action', values='average')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Count of pages with external links for each WP:M class (FA, A, GA, B, C, Start, Stub)\n",
    "**numbers will not match [1] because we're limiting to namespace 0 pages with external links**\n",
    "\n",
    "[1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>pages_w_links</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Start-Class_medicine_articles</td>\n",
       "      <td>14484</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Stub-Class_medicine_articles</td>\n",
       "      <td>9765</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C-Class_medicine_articles</td>\n",
       "      <td>5332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>B-Class_medicine_articles</td>\n",
       "      <td>2173</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>List-Class_medicine_articles</td>\n",
       "      <td>452</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>GA-Class_medicine_articles</td>\n",
       "      <td>240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>FA-Class_medicine_articles</td>\n",
       "      <td>62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Disambig-Class_medicine_articles</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Redirect-Class_medicine_articles</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>FL-Class_medicine_articles</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Category-Class_medicine_articles</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            category  pages_w_links\n",
       "0      Start-Class_medicine_articles          14484\n",
       "1       Stub-Class_medicine_articles           9765\n",
       "2          C-Class_medicine_articles           5332\n",
       "3          B-Class_medicine_articles           2173\n",
       "4       List-Class_medicine_articles            452\n",
       "5         GA-Class_medicine_articles            240\n",
       "6         FA-Class_medicine_articles             62\n",
       "7   Disambig-Class_medicine_articles             17\n",
       "8   Redirect-Class_medicine_articles             13\n",
       "9         FL-Class_medicine_articles             12\n",
       "10  Category-Class_medicine_articles              1"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count of pages with external links for each WP:M class (FA, A, GA, B, C, Start, Stub)\n",
    "# numbers will not match [1] because we're limiting to namespace 0 pages with external links\n",
    "# [1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics\n",
    "pm_category_pages = \"\"\"\n",
    "SELECT category, COUNT(DISTINCT page_id) AS pages_w_links \n",
    "FROM ryanmax.projmed_categories \n",
    "WHERE category LIKE '%Class_medicine_articles%'\n",
    "AND to_date(projmed_categories.dt) = '2019-04-20'\n",
    "AND page_id IN \n",
    "            (SELECT page_id \n",
    "            FROM ryanmax.population_wpm_pages_with_extlinks\n",
    "            )\n",
    "GROUP BY category\n",
    "ORDER BY COUNT(*) DESC\n",
    "\"\"\"\n",
    "pm_cat_counts = spark.sql(pm_category_pages)\n",
    "cats = sqlContext.createDataFrame(pm_cat_counts.rdd)\n",
    "cats.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pageloads for each WP:M class (FA, A, GA, B, C, Start, Stub)\n",
    "**numbers will be higher than overall WP:M pageloads since one page may have more than one category**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>total_pageloads</th>\n",
       "      <th>daily_average</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>B-Class_medicine_articles</td>\n",
       "      <td>57892215</td>\n",
       "      <td>1809131.7188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>C-Class_medicine_articles</td>\n",
       "      <td>51507495</td>\n",
       "      <td>1609609.2188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Category-Class_medicine_articles</td>\n",
       "      <td>259</td>\n",
       "      <td>8.0938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Disambig-Class_medicine_articles</td>\n",
       "      <td>36927</td>\n",
       "      <td>1153.9688</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FA-Class_medicine_articles</td>\n",
       "      <td>4389844</td>\n",
       "      <td>137182.6250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>FL-Class_medicine_articles</td>\n",
       "      <td>121691</td>\n",
       "      <td>3802.8438</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>GA-Class_medicine_articles</td>\n",
       "      <td>9350159</td>\n",
       "      <td>292192.4688</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>List-Class_medicine_articles</td>\n",
       "      <td>1801806</td>\n",
       "      <td>56306.4375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Redirect-Class_medicine_articles</td>\n",
       "      <td>25308</td>\n",
       "      <td>790.8750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Start-Class_medicine_articles</td>\n",
       "      <td>54741621</td>\n",
       "      <td>1710675.6562</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Stub-Class_medicine_articles</td>\n",
       "      <td>8056357</td>\n",
       "      <td>251761.1562</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            category  total_pageloads  daily_average\n",
       "0          B-Class_medicine_articles         57892215   1809131.7188\n",
       "1          C-Class_medicine_articles         51507495   1609609.2188\n",
       "2   Category-Class_medicine_articles              259         8.0938\n",
       "3   Disambig-Class_medicine_articles            36927      1153.9688\n",
       "4         FA-Class_medicine_articles          4389844    137182.6250\n",
       "5         FL-Class_medicine_articles           121691      3802.8438\n",
       "6         GA-Class_medicine_articles          9350159    292192.4688\n",
       "7       List-Class_medicine_articles          1801806     56306.4375\n",
       "8   Redirect-Class_medicine_articles            25308       790.8750\n",
       "9      Start-Class_medicine_articles         54741621   1710675.6562\n",
       "10      Stub-Class_medicine_articles          8056357    251761.1562"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pageloads for each WP:M class (FA, A, GA, B, C, Start, Stub)\n",
    "# numbers will be higher than overall WP:M pageloads since one page may have more than one category\n",
    "pm_category_pageloads_query = \"\"\"\n",
    "SELECT projmed_categories.category, sum(view_count) AS total_pageloads, sum(view_count)/{} AS daily_average\n",
    "FROM \n",
    "    wmf.pageview_hourly, \n",
    "    (SELECT DISTINCT page_id, category \n",
    "    FROM ryanmax.projmed_categories \n",
    "    WHERE projmed_categories.category LIKE '%Class_medicine_articles%'\n",
    "    AND to_date(projmed_categories.dt) = '2019-04-20'\n",
    "    ) \n",
    "    AS projmed_categories\n",
    "WHERE pageview_hourly.page_id = projmed_categories.page_id\n",
    "    AND pageview_hourly.project = 'en.wikipedia'\n",
    "    AND pageview_hourly.agent_type = 'user'\n",
    "    AND pageview_hourly.page_id IN \n",
    "        (SELECT page_id \n",
    "        FROM ryanmax.population_wpm_pages_with_extlinks\n",
    "        )\n",
    "    AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'\n",
    "    AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'\n",
    "    GROUP BY projmed_categories.category\n",
    "    ORDER BY projmed_categories.category\n",
    "\"\"\"\n",
    "\n",
    "pm_category_pageloads = spark.sql(\n",
    "    pm_category_pageloads_query.format(\n",
    "        days_in_study, start_date_string, end_date_string\n",
    "    ))\n",
    "pm_category_pageloads.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Count of external links for each WP:M class (FA, A, GA, B, C, Start, Stub)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>num_extlinks</th>\n",
       "      <th>avg_extlinks_per_page</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>B-Class_medicine_articles</td>\n",
       "      <td>216159</td>\n",
       "      <td>99.4749</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>C-Class_medicine_articles</td>\n",
       "      <td>256660</td>\n",
       "      <td>48.1358</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Category-Class_medicine_articles</td>\n",
       "      <td>135</td>\n",
       "      <td>135.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Disambig-Class_medicine_articles</td>\n",
       "      <td>85</td>\n",
       "      <td>5.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FA-Class_medicine_articles</td>\n",
       "      <td>13914</td>\n",
       "      <td>224.4194</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>FL-Class_medicine_articles</td>\n",
       "      <td>2588</td>\n",
       "      <td>215.6667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>GA-Class_medicine_articles</td>\n",
       "      <td>32793</td>\n",
       "      <td>136.6375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>List-Class_medicine_articles</td>\n",
       "      <td>34091</td>\n",
       "      <td>75.4226</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Redirect-Class_medicine_articles</td>\n",
       "      <td>119</td>\n",
       "      <td>9.1538</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Start-Class_medicine_articles</td>\n",
       "      <td>310574</td>\n",
       "      <td>21.4426</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Stub-Class_medicine_articles</td>\n",
       "      <td>76668</td>\n",
       "      <td>7.8513</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            category  num_extlinks  avg_extlinks_per_page\n",
       "0          B-Class_medicine_articles        216159                99.4749\n",
       "1          C-Class_medicine_articles        256660                48.1358\n",
       "2   Category-Class_medicine_articles           135               135.0000\n",
       "3   Disambig-Class_medicine_articles            85                 5.0000\n",
       "4         FA-Class_medicine_articles         13914               224.4194\n",
       "5         FL-Class_medicine_articles          2588               215.6667\n",
       "6         GA-Class_medicine_articles         32793               136.6375\n",
       "7       List-Class_medicine_articles         34091                75.4226\n",
       "8   Redirect-Class_medicine_articles           119                 9.1538\n",
       "9      Start-Class_medicine_articles        310574                21.4426\n",
       "10      Stub-Class_medicine_articles         76668                 7.8513"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count of external links for each WP:M class (FA, A, GA, B, C, Start, Stub)\n",
    "pm_category_links_query = \"\"\"\n",
    "SELECT projmed_categories.category, \n",
    "    COUNT(DISTINCT el_from, el_to) AS num_extlinks, \n",
    "    COUNT(DISTINCT el_from, el_to)/COUNT(DISTINCT el_from) as avg_extlinks_per_page\n",
    "FROM \n",
    "    ryanmax.population_externallinks, \n",
    "    (SELECT DISTINCT page_id, category \n",
    "    FROM ryanmax.projmed_categories \n",
    "    WHERE projmed_categories.category LIKE '%Class_medicine_articles%'\n",
    "    AND to_date(projmed_categories.dt) = '2019-04-20'\n",
    "    ) \n",
    "    AS projmed_categories\n",
    "WHERE population_externallinks.el_from = projmed_categories.page_id\n",
    "    AND population_externallinks.el_from IN \n",
    "        (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n",
    "    GROUP BY projmed_categories.category\n",
    "    ORDER BY projmed_categories.category\n",
    "\"\"\"\n",
    "\n",
    "pm_category_links = spark.sql(pm_category_links_query)\n",
    "pm_category_links.toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PySpark large: spark-xml jar and local venv path ",
   "language": "python",
   "name": "spark-ryanmax"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}