{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Top Hostnames\n",
    "- report externallinks hostname counts for W and WP:M pages\n",
    "- report various hostname event data for W and WP:M pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n",
    "%run -i 'data-defaults.py'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# parse hostnames from links\n",
    "# includes extraction for web.archive.org links\n",
    "parse_host = \"\"\"\n",
    "    PARSE_URL(REGEXP_REPLACE(LOWER({}),'^https?://web.archive.org/web/[^/]+/',''),'HOST')\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hostnames present in pages with external links"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Top 20 hostnames found in WP:M pages with external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>www.ncbi.nlm.nih.gov</td>\n",
       "      <td>238688</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>doi.org</td>\n",
       "      <td>183761</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>www.worldcat.org</td>\n",
       "      <td>25434</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>books.google.com</td>\n",
       "      <td>24491</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>www.google.com</td>\n",
       "      <td>11074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>apps.who.int</td>\n",
       "      <td>12816</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>www.icd9data.com</td>\n",
       "      <td>8693</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>www.wikidata.org</td>\n",
       "      <td>8069</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>amigo.geneontology.org</td>\n",
       "      <td>6909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>www.nlm.nih.gov</td>\n",
       "      <td>6420</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>www.cdc.gov</td>\n",
       "      <td>6285</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>www.jstor.org</td>\n",
       "      <td>4302</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>scholar.google.com</td>\n",
       "      <td>2850</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>www.who.int</td>\n",
       "      <td>5350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>www.nytimes.com</td>\n",
       "      <td>4545</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>www.omim.org</td>\n",
       "      <td>4415</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>adsabs.harvard.edu</td>\n",
       "      <td>4255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>www.emedicine.com</td>\n",
       "      <td>3477</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>www.diseasesdatabase.com</td>\n",
       "      <td>3105</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>id.loc.gov</td>\n",
       "      <td>3008</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        host   count\n",
       "0       www.ncbi.nlm.nih.gov  238688\n",
       "1                    doi.org  183761\n",
       "2           www.worldcat.org   25434\n",
       "3           books.google.com   24491\n",
       "4             www.google.com   11074\n",
       "5               apps.who.int   12816\n",
       "6           www.icd9data.com    8693\n",
       "7           www.wikidata.org    8069\n",
       "8     amigo.geneontology.org    6909\n",
       "9            www.nlm.nih.gov    6420\n",
       "10               www.cdc.gov    6285\n",
       "11             www.jstor.org    4302\n",
       "12        scholar.google.com    2850\n",
       "13               www.who.int    5350\n",
       "14           www.nytimes.com    4545\n",
       "15              www.omim.org    4415\n",
       "16        adsabs.harvard.edu    4255\n",
       "17         www.emedicine.com    3477\n",
       "18  www.diseasesdatabase.com    3105\n",
       "19                id.loc.gov    3008"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count of top 20 hostnames in WP:M pages with external links\n",
    "top_hosts_wpm_query = \"\"\"\n",
    "SELECT {} AS host, COUNT(DISTINCT el_from, el_to) AS count \n",
    "FROM ryanmax.population_externallinks\n",
    "WHERE el_from IN \n",
    "    (SELECT DISTINCT page_id AS el_from\n",
    "    FROM ryanmax.population_wpm_pages_with_extlinks\n",
    "    )\n",
    "GROUP BY host\n",
    "ORDER BY COUNT(*) DESC\n",
    "LIMIT 20\n",
    "\"\"\"\n",
    "top_hosts_wpm = spark.sql(\n",
    "    top_hosts_wpm_query.format(\n",
    "        parse_host.format('el_to')\n",
    "    ))\n",
    "top_hosts_wpm.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Top 20 hostnames found in W pages with external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>tools.wmflabs.org</td>\n",
       "      <td>2118863</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>www.google.com</td>\n",
       "      <td>1763467</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>doi.org</td>\n",
       "      <td>1199818</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>www.ncbi.nlm.nih.gov</td>\n",
       "      <td>1122007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>www.worldcat.org</td>\n",
       "      <td>1061459</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>books.google.com</td>\n",
       "      <td>1231622</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>scholar.google.com</td>\n",
       "      <td>437109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>www.wikidata.org</td>\n",
       "      <td>805309</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>www.jstor.org</td>\n",
       "      <td>548764</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>ssd.jpl.nasa.gov</td>\n",
       "      <td>592901</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>minorplanetcenter.net</td>\n",
       "      <td>556758</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>viaf.org</td>\n",
       "      <td>535724</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>id.loc.gov</td>\n",
       "      <td>450557</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>www.imdb.com</td>\n",
       "      <td>418623</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>isni.org</td>\n",
       "      <td>413584</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>www.gbif.org</td>\n",
       "      <td>351989</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>www.nytimes.com</td>\n",
       "      <td>343951</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>www.youtube.com</td>\n",
       "      <td>330816</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>eol.org</td>\n",
       "      <td>304728</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>www.webcitation.org</td>\n",
       "      <td>301718</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     host    count\n",
       "0       tools.wmflabs.org  2118863\n",
       "1          www.google.com  1763467\n",
       "2                 doi.org  1199818\n",
       "3    www.ncbi.nlm.nih.gov  1122007\n",
       "4        www.worldcat.org  1061459\n",
       "5        books.google.com  1231622\n",
       "6      scholar.google.com   437109\n",
       "7        www.wikidata.org   805309\n",
       "8           www.jstor.org   548764\n",
       "9        ssd.jpl.nasa.gov   592901\n",
       "10  minorplanetcenter.net   556758\n",
       "11               viaf.org   535724\n",
       "12             id.loc.gov   450557\n",
       "13           www.imdb.com   418623\n",
       "14               isni.org   413584\n",
       "15           www.gbif.org   351989\n",
       "16        www.nytimes.com   343951\n",
       "17        www.youtube.com   330816\n",
       "18                eol.org   304728\n",
       "19    www.webcitation.org   301718"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count of top 20 hostnames in WP:M pages with external links\n",
    "top_hosts_w_query = \"\"\"\n",
    "SELECT {} AS host, COUNT(DISTINCT el_from, el_to) AS count \n",
    "FROM ryanmax.population_externallinks\n",
    "WHERE el_from IN \n",
    "    (SELECT DISTINCT page_id AS el_from\n",
    "    FROM ryanmax.population_w_pages_with_extlinks\n",
    "    )\n",
    "GROUP BY host\n",
    "ORDER BY COUNT(*) DESC\n",
    "LIMIT 20\n",
    "\"\"\"\n",
    "top_hosts_w = spark.sql(\n",
    "    top_hosts_w_query.format(\n",
    "        parse_host.format('el_to')\n",
    "    ))\n",
    "top_hosts_w.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### web.archive.org links found in WP:M pages with external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>web.archive.org</td>\n",
       "      <td>50390</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              host  count\n",
       "0  web.archive.org  50390"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count of web.archive.org links in WP:M pages with external links\n",
    "archive_org_wpm_query = \"\"\"\n",
    "SELECT PARSE_URL(LOWER(el_to),'HOST') AS host, COUNT(DISTINCT el_from, el_to) AS count \n",
    "FROM ryanmax.population_externallinks\n",
    "WHERE el_from IN \n",
    "    (SELECT DISTINCT page_id AS el_from\n",
    "    FROM ryanmax.population_wpm_pages_with_extlinks\n",
    "    )\n",
    "AND LOWER(el_to) RLIKE '^https?://web.archive.org/web/'\n",
    "AND PARSE_URL(LOWER(el_to),'HOST') = 'web.archive.org'\n",
    "GROUP BY host\n",
    "\"\"\"\n",
    "archive_org_wpm = spark.sql(archive_org_wpm_query)\n",
    "archive_org_wpm.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### web.archive.org links found in W pages with external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>web.archive.org</td>\n",
       "      <td>4416436</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              host    count\n",
       "0  web.archive.org  4416436"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count of web.archive.org links in W pages with external links\n",
    "archive_org_w_query = \"\"\"\n",
    "SELECT PARSE_URL(LOWER(el_to),'HOST') AS host, COUNT(DISTINCT el_from, el_to) AS count \n",
    "FROM ryanmax.population_externallinks\n",
    "WHERE el_from IN \n",
    "    (SELECT DISTINCT page_id AS el_from\n",
    "    FROM ryanmax.population_w_pages_with_extlinks\n",
    "    )\n",
    "AND LOWER(el_to) RLIKE '^https?://web.archive.org/web/'\n",
    "AND PARSE_URL(LOWER(el_to),'HOST') = 'web.archive.org'\n",
    "GROUP BY host\n",
    "\"\"\"\n",
    "archive_org_w = spark.sql(archive_org_w_query)\n",
    "archive_org_w.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Event Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### all event types"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Total event count for top 20 hostnames\n",
    "- limited to WP:M pages with external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>total_events</th>\n",
       "      <th>daily_average_events</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>en.wikipedia.org</td>\n",
       "      <td>1860686</td>\n",
       "      <td>58146.43750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>en.m.wikipedia.org</td>\n",
       "      <td>627309</td>\n",
       "      <td>19603.40625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>www.ncbi.nlm.nih.gov</td>\n",
       "      <td>86432</td>\n",
       "      <td>2701.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>doi.org</td>\n",
       "      <td>74930</td>\n",
       "      <td>2341.56250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>books.google.com</td>\n",
       "      <td>19729</td>\n",
       "      <td>616.53125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>www.drugs.com</td>\n",
       "      <td>16639</td>\n",
       "      <td>519.96875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>www.who.int</td>\n",
       "      <td>15105</td>\n",
       "      <td>472.03125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>translate.googleusercontent.com</td>\n",
       "      <td>10775</td>\n",
       "      <td>336.71875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>theranos.com</td>\n",
       "      <td>10660</td>\n",
       "      <td>333.12500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>www.cdc.gov</td>\n",
       "      <td>10119</td>\n",
       "      <td>316.21875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>chemapps.stolaf.edu</td>\n",
       "      <td>7613</td>\n",
       "      <td>237.90625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>apps.who.int</td>\n",
       "      <td>6384</td>\n",
       "      <td>199.50000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>www.nlm.nih.gov</td>\n",
       "      <td>6291</td>\n",
       "      <td>196.59375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>www.fda.gov</td>\n",
       "      <td>5101</td>\n",
       "      <td>159.40625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>www.nytimes.com</td>\n",
       "      <td>5005</td>\n",
       "      <td>156.40625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>www.commonchemistry.org</td>\n",
       "      <td>4604</td>\n",
       "      <td>143.87500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>www.accessdata.fda.gov</td>\n",
       "      <td>4352</td>\n",
       "      <td>136.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>www.youtube.com</td>\n",
       "      <td>4154</td>\n",
       "      <td>129.81250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>emedicine.medscape.com</td>\n",
       "      <td>3791</td>\n",
       "      <td>118.46875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>pubchem.ncbi.nlm.nih.gov</td>\n",
       "      <td>3469</td>\n",
       "      <td>108.40625</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               host  total_events  daily_average_events\n",
       "0                  en.wikipedia.org       1860686           58146.43750\n",
       "1                en.m.wikipedia.org        627309           19603.40625\n",
       "2              www.ncbi.nlm.nih.gov         86432            2701.00000\n",
       "3                           doi.org         74930            2341.56250\n",
       "4                  books.google.com         19729             616.53125\n",
       "5                     www.drugs.com         16639             519.96875\n",
       "6                       www.who.int         15105             472.03125\n",
       "7   translate.googleusercontent.com         10775             336.71875\n",
       "8                      theranos.com         10660             333.12500\n",
       "9                       www.cdc.gov         10119             316.21875\n",
       "10              chemapps.stolaf.edu          7613             237.90625\n",
       "11                     apps.who.int          6384             199.50000\n",
       "12                  www.nlm.nih.gov          6291             196.59375\n",
       "13                      www.fda.gov          5101             159.40625\n",
       "14                  www.nytimes.com          5005             156.40625\n",
       "15          www.commonchemistry.org          4604             143.87500\n",
       "16           www.accessdata.fda.gov          4352             136.00000\n",
       "17                  www.youtube.com          4154             129.81250\n",
       "18           emedicine.medscape.com          3791             118.46875\n",
       "19         pubchem.ncbi.nlm.nih.gov          3469             108.40625"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total event count for top 20 hostnames\n",
    "# limited to WP:M pages with external links\n",
    "top_hosts_wpm_query = \"\"\"\n",
    "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n",
    "FROM citationusage \n",
    "WHERE wiki = 'enwiki'\n",
    "AND page_id IN \n",
    "    (SELECT DISTINCT page_id \n",
    "    FROM ryanmax.population_wpm_pages_with_extlinks\n",
    "    )\n",
    "{}\n",
    "AND to_date(event_time) >= '{}'\n",
    "AND to_date(event_time) <= '{}'\n",
    "AND useragent_is_bot = FALSE\n",
    "GROUP BY host\n",
    "ORDER BY COUNT(*) DESC\n",
    "LIMIT 20\n",
    "\"\"\"\n",
    "top_hosts_wpm_events = spark.sql(\n",
    "    top_hosts_wpm_query.format(\n",
    "        parse_host.format('link_url'),\n",
    "        days_in_study,\n",
    "        event_exclusion_sql, start_date_string, end_date_string\n",
    "    ))\n",
    "top_hosts_wpm_events.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Total event count for top 20 hostnames\n",
    "- limited to W pages with external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>total_events</th>\n",
       "      <th>daily_average_events</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>en.wikipedia.org</td>\n",
       "      <td>42334009</td>\n",
       "      <td>1.322938e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>en.m.wikipedia.org</td>\n",
       "      <td>17636814</td>\n",
       "      <td>5.511504e+05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>www.imdb.com</td>\n",
       "      <td>3050694</td>\n",
       "      <td>9.533419e+04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>tools.wmflabs.org</td>\n",
       "      <td>1064153</td>\n",
       "      <td>3.325478e+04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>www.youtube.com</td>\n",
       "      <td>601433</td>\n",
       "      <td>1.879478e+04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>books.google.com</td>\n",
       "      <td>573246</td>\n",
       "      <td>1.791394e+04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>www.espncricinfo.com</td>\n",
       "      <td>379919</td>\n",
       "      <td>1.187247e+04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>doi.org</td>\n",
       "      <td>350714</td>\n",
       "      <td>1.095981e+04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>www.nytimes.com</td>\n",
       "      <td>301568</td>\n",
       "      <td>9.424000e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>www.bbc.co.uk</td>\n",
       "      <td>227540</td>\n",
       "      <td>7.110625e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>www.theguardian.com</td>\n",
       "      <td>225815</td>\n",
       "      <td>7.056719e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>translate.googleusercontent.com</td>\n",
       "      <td>184858</td>\n",
       "      <td>5.776812e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>www.instagram.com</td>\n",
       "      <td>182944</td>\n",
       "      <td>5.717000e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>twitter.com</td>\n",
       "      <td>180994</td>\n",
       "      <td>5.656062e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>www.google.com</td>\n",
       "      <td>166936</td>\n",
       "      <td>5.216750e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>archive.org</td>\n",
       "      <td>162174</td>\n",
       "      <td>5.067938e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>www.ncbi.nlm.nih.gov</td>\n",
       "      <td>157171</td>\n",
       "      <td>4.911594e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>www.allmusic.com</td>\n",
       "      <td>155615</td>\n",
       "      <td>4.862969e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>www.rottentomatoes.com</td>\n",
       "      <td>140717</td>\n",
       "      <td>4.397406e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>www.biblegateway.com</td>\n",
       "      <td>136031</td>\n",
       "      <td>4.250969e+03</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               host  total_events  daily_average_events\n",
       "0                  en.wikipedia.org      42334009          1.322938e+06\n",
       "1                en.m.wikipedia.org      17636814          5.511504e+05\n",
       "2                      www.imdb.com       3050694          9.533419e+04\n",
       "3                 tools.wmflabs.org       1064153          3.325478e+04\n",
       "4                   www.youtube.com        601433          1.879478e+04\n",
       "5                  books.google.com        573246          1.791394e+04\n",
       "6              www.espncricinfo.com        379919          1.187247e+04\n",
       "7                           doi.org        350714          1.095981e+04\n",
       "8                   www.nytimes.com        301568          9.424000e+03\n",
       "9                     www.bbc.co.uk        227540          7.110625e+03\n",
       "10              www.theguardian.com        225815          7.056719e+03\n",
       "11  translate.googleusercontent.com        184858          5.776812e+03\n",
       "12                www.instagram.com        182944          5.717000e+03\n",
       "13                      twitter.com        180994          5.656062e+03\n",
       "14                   www.google.com        166936          5.216750e+03\n",
       "15                      archive.org        162174          5.067938e+03\n",
       "16             www.ncbi.nlm.nih.gov        157171          4.911594e+03\n",
       "17                 www.allmusic.com        155615          4.862969e+03\n",
       "18           www.rottentomatoes.com        140717          4.397406e+03\n",
       "19             www.biblegateway.com        136031          4.250969e+03"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total event count for top 20 hostnames by event type\n",
    "# limited to W pages with external links\n",
    "top_hosts_w_query = \"\"\"\n",
    "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n",
    "FROM citationusage \n",
    "WHERE wiki = 'enwiki'\n",
    "AND page_id IN \n",
    "    (SELECT DISTINCT page_id \n",
    "    FROM ryanmax.population_w_pages_with_extlinks\n",
    "    )\n",
    "{}\n",
    "AND to_date(event_time) >= '{}'\n",
    "AND to_date(event_time) <= '{}'\n",
    "AND useragent_is_bot = FALSE\n",
    "GROUP BY host\n",
    "ORDER BY COUNT(*) DESC\n",
    "LIMIT 20\n",
    "\"\"\"\n",
    "top_hosts_w_events = spark.sql(\n",
    "    top_hosts_w_query.format(\n",
    "        parse_host.format('link_url'),\n",
    "        days_in_study,\n",
    "        event_exclusion_sql, start_date_string, end_date_string\n",
    "    ))\n",
    "top_hosts_w_events.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### extClick events"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Total event count for top 20 hostnames\n",
    "- limited to WP:M pages with external links\n",
    "- further limited to extClick events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>total_events</th>\n",
       "      <th>daily_average_events</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>www.ncbi.nlm.nih.gov</td>\n",
       "      <td>86432</td>\n",
       "      <td>2701.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>doi.org</td>\n",
       "      <td>74930</td>\n",
       "      <td>2341.56250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>books.google.com</td>\n",
       "      <td>19725</td>\n",
       "      <td>616.40625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>www.drugs.com</td>\n",
       "      <td>16639</td>\n",
       "      <td>519.96875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>www.who.int</td>\n",
       "      <td>15105</td>\n",
       "      <td>472.03125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>theranos.com</td>\n",
       "      <td>10660</td>\n",
       "      <td>333.12500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>www.cdc.gov</td>\n",
       "      <td>10119</td>\n",
       "      <td>316.21875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>chemapps.stolaf.edu</td>\n",
       "      <td>7613</td>\n",
       "      <td>237.90625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>apps.who.int</td>\n",
       "      <td>6384</td>\n",
       "      <td>199.50000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>www.nlm.nih.gov</td>\n",
       "      <td>6291</td>\n",
       "      <td>196.59375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>www.fda.gov</td>\n",
       "      <td>5101</td>\n",
       "      <td>159.40625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>www.nytimes.com</td>\n",
       "      <td>5005</td>\n",
       "      <td>156.40625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>www.commonchemistry.org</td>\n",
       "      <td>4604</td>\n",
       "      <td>143.87500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>www.accessdata.fda.gov</td>\n",
       "      <td>4352</td>\n",
       "      <td>136.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>www.youtube.com</td>\n",
       "      <td>4154</td>\n",
       "      <td>129.81250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>emedicine.medscape.com</td>\n",
       "      <td>3791</td>\n",
       "      <td>118.46875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>translate.googleusercontent.com</td>\n",
       "      <td>3553</td>\n",
       "      <td>111.03125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>pubchem.ncbi.nlm.nih.gov</td>\n",
       "      <td>3469</td>\n",
       "      <td>108.40625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>www.cancer.gov</td>\n",
       "      <td>3225</td>\n",
       "      <td>100.78125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>www.drugbank.ca</td>\n",
       "      <td>2648</td>\n",
       "      <td>82.75000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               host  total_events  daily_average_events\n",
       "0              www.ncbi.nlm.nih.gov         86432            2701.00000\n",
       "1                           doi.org         74930            2341.56250\n",
       "2                  books.google.com         19725             616.40625\n",
       "3                     www.drugs.com         16639             519.96875\n",
       "4                       www.who.int         15105             472.03125\n",
       "5                      theranos.com         10660             333.12500\n",
       "6                       www.cdc.gov         10119             316.21875\n",
       "7               chemapps.stolaf.edu          7613             237.90625\n",
       "8                      apps.who.int          6384             199.50000\n",
       "9                   www.nlm.nih.gov          6291             196.59375\n",
       "10                      www.fda.gov          5101             159.40625\n",
       "11                  www.nytimes.com          5005             156.40625\n",
       "12          www.commonchemistry.org          4604             143.87500\n",
       "13           www.accessdata.fda.gov          4352             136.00000\n",
       "14                  www.youtube.com          4154             129.81250\n",
       "15           emedicine.medscape.com          3791             118.46875\n",
       "16  translate.googleusercontent.com          3553             111.03125\n",
       "17         pubchem.ncbi.nlm.nih.gov          3469             108.40625\n",
       "18                   www.cancer.gov          3225             100.78125\n",
       "19                  www.drugbank.ca          2648              82.75000"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total event count for top 20 hostnames\n",
    "# limited to WP:M pages with external links\n",
    "# further limited to extClick events\n",
    "top_hosts_wpm_query = \"\"\"\n",
    "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n",
    "FROM citationusage \n",
    "WHERE wiki = 'enwiki'\n",
    "AND action = 'extClick'\n",
    "AND page_id IN \n",
    "    (SELECT DISTINCT page_id \n",
    "    FROM ryanmax.population_wpm_pages_with_extlinks\n",
    "    )\n",
    "{}\n",
    "AND to_date(event_time) >= '{}'\n",
    "AND to_date(event_time) <= '{}'\n",
    "AND useragent_is_bot = FALSE\n",
    "GROUP BY host\n",
    "ORDER BY COUNT(*) DESC\n",
    "LIMIT 20\n",
    "\"\"\"\n",
    "top_hosts_wpm_events = spark.sql(\n",
    "    top_hosts_wpm_query.format(\n",
    "        parse_host.format('link_url'),\n",
    "        days_in_study,\n",
    "        event_exclusion_sql, start_date_string, end_date_string\n",
    "    ))\n",
    "top_hosts_wpm_events.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Total event count for top 20 hostnames by event type\n",
    "- limited to W pages with external links\n",
    "- further limited to extClick events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>total_events</th>\n",
       "      <th>daily_average_events</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>www.imdb.com</td>\n",
       "      <td>3050694</td>\n",
       "      <td>95334.18750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>tools.wmflabs.org</td>\n",
       "      <td>1064153</td>\n",
       "      <td>33254.78125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>www.youtube.com</td>\n",
       "      <td>601433</td>\n",
       "      <td>18794.78125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>books.google.com</td>\n",
       "      <td>573062</td>\n",
       "      <td>17908.18750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>www.espncricinfo.com</td>\n",
       "      <td>379919</td>\n",
       "      <td>11872.46875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>doi.org</td>\n",
       "      <td>350714</td>\n",
       "      <td>10959.81250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>www.nytimes.com</td>\n",
       "      <td>301568</td>\n",
       "      <td>9424.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>www.bbc.co.uk</td>\n",
       "      <td>227540</td>\n",
       "      <td>7110.62500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>www.theguardian.com</td>\n",
       "      <td>225815</td>\n",
       "      <td>7056.71875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>www.instagram.com</td>\n",
       "      <td>182944</td>\n",
       "      <td>5717.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>twitter.com</td>\n",
       "      <td>180994</td>\n",
       "      <td>5656.06250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>www.google.com</td>\n",
       "      <td>166936</td>\n",
       "      <td>5216.75000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>archive.org</td>\n",
       "      <td>162153</td>\n",
       "      <td>5067.28125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>www.ncbi.nlm.nih.gov</td>\n",
       "      <td>157171</td>\n",
       "      <td>4911.59375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>www.allmusic.com</td>\n",
       "      <td>155615</td>\n",
       "      <td>4862.96875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>www.rottentomatoes.com</td>\n",
       "      <td>140717</td>\n",
       "      <td>4397.40625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>www.findagrave.com</td>\n",
       "      <td>130021</td>\n",
       "      <td>4063.15625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>news.bbc.co.uk</td>\n",
       "      <td>129189</td>\n",
       "      <td>4037.15625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>www.telegraph.co.uk</td>\n",
       "      <td>115366</td>\n",
       "      <td>3605.18750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>www.facebook.com</td>\n",
       "      <td>111813</td>\n",
       "      <td>3494.15625</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      host  total_events  daily_average_events\n",
       "0             www.imdb.com       3050694           95334.18750\n",
       "1        tools.wmflabs.org       1064153           33254.78125\n",
       "2          www.youtube.com        601433           18794.78125\n",
       "3         books.google.com        573062           17908.18750\n",
       "4     www.espncricinfo.com        379919           11872.46875\n",
       "5                  doi.org        350714           10959.81250\n",
       "6          www.nytimes.com        301568            9424.00000\n",
       "7            www.bbc.co.uk        227540            7110.62500\n",
       "8      www.theguardian.com        225815            7056.71875\n",
       "9        www.instagram.com        182944            5717.00000\n",
       "10             twitter.com        180994            5656.06250\n",
       "11          www.google.com        166936            5216.75000\n",
       "12             archive.org        162153            5067.28125\n",
       "13    www.ncbi.nlm.nih.gov        157171            4911.59375\n",
       "14        www.allmusic.com        155615            4862.96875\n",
       "15  www.rottentomatoes.com        140717            4397.40625\n",
       "16      www.findagrave.com        130021            4063.15625\n",
       "17          news.bbc.co.uk        129189            4037.15625\n",
       "18     www.telegraph.co.uk        115366            3605.18750\n",
       "19        www.facebook.com        111813            3494.15625"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total event count for top 20 hostnames by event type\n",
    "# limited to W pages with external links\n",
    "# further limited to extClick events\n",
    "top_hosts_w_query = \"\"\"\n",
    "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n",
    "FROM citationusage \n",
    "WHERE wiki = 'enwiki'\n",
    "AND action = 'extClick'\n",
    "AND page_id IN \n",
    "    (SELECT DISTINCT page_id \n",
    "    FROM ryanmax.population_w_pages_with_extlinks\n",
    "    )\n",
    "{}\n",
    "AND to_date(event_time) >= '{}'\n",
    "AND to_date(event_time) <= '{}'\n",
    "AND useragent_is_bot = FALSE\n",
    "GROUP BY host\n",
    "ORDER BY COUNT(*) DESC\n",
    "LIMIT 20\n",
    "\"\"\"\n",
    "top_hosts_w_events = spark.sql(\n",
    "    top_hosts_w_query.format(\n",
    "        parse_host.format('link_url'),\n",
    "        days_in_study,\n",
    "        event_exclusion_sql, start_date_string, end_date_string\n",
    "    ))\n",
    "top_hosts_w_events.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### limited to fnHover events"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Total event count for top 20 hostnames\n",
    "- limited to WP:M pages with external links\n",
    "- further limited to fnHover events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>total_events</th>\n",
       "      <th>daily_average_events</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>en.wikipedia.org</td>\n",
       "      <td>1465057</td>\n",
       "      <td>45783.03125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>en.m.wikipedia.org</td>\n",
       "      <td>89821</td>\n",
       "      <td>2806.90625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>translate.googleusercontent.com</td>\n",
       "      <td>4032</td>\n",
       "      <td>126.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>None</td>\n",
       "      <td>698</td>\n",
       "      <td>21.81250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>www.translatoruser-int.com</td>\n",
       "      <td>149</td>\n",
       "      <td>4.65625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>papago.naver.net</td>\n",
       "      <td>99</td>\n",
       "      <td>3.09375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>www.biblegateway.com</td>\n",
       "      <td>39</td>\n",
       "      <td>1.21875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>z5h64q92x9.net</td>\n",
       "      <td>21</td>\n",
       "      <td>0.65625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>scholar.google.com</td>\n",
       "      <td>17</td>\n",
       "      <td>0.53125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>emedien3.sub.uni-hamburg.de</td>\n",
       "      <td>12</td>\n",
       "      <td>0.37500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>en.wikipedi0.org</td>\n",
       "      <td>5</td>\n",
       "      <td>0.15625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>translate.baiducontent.com</td>\n",
       "      <td>4</td>\n",
       "      <td>0.12500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>en.m.wikipedi0.org</td>\n",
       "      <td>3</td>\n",
       "      <td>0.09375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>www.gpedia.com</td>\n",
       "      <td>2</td>\n",
       "      <td>0.06250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>en.wikipedia.org.</td>\n",
       "      <td>2</td>\n",
       "      <td>0.06250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>books.google.com</td>\n",
       "      <td>1</td>\n",
       "      <td>0.03125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>speechpanel.readspeaker.com</td>\n",
       "      <td>1</td>\n",
       "      <td>0.03125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>nl.wikiludia.com</td>\n",
       "      <td>1</td>\n",
       "      <td>0.03125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>en.wk.jsproxy.tk</td>\n",
       "      <td>1</td>\n",
       "      <td>0.03125</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               host  total_events  daily_average_events\n",
       "0                  en.wikipedia.org       1465057           45783.03125\n",
       "1                en.m.wikipedia.org         89821            2806.90625\n",
       "2   translate.googleusercontent.com          4032             126.00000\n",
       "3                              None           698              21.81250\n",
       "4        www.translatoruser-int.com           149               4.65625\n",
       "5                  papago.naver.net            99               3.09375\n",
       "6              www.biblegateway.com            39               1.21875\n",
       "7                    z5h64q92x9.net            21               0.65625\n",
       "8                scholar.google.com            17               0.53125\n",
       "9       emedien3.sub.uni-hamburg.de            12               0.37500\n",
       "10                 en.wikipedi0.org             5               0.15625\n",
       "11       translate.baiducontent.com             4               0.12500\n",
       "12               en.m.wikipedi0.org             3               0.09375\n",
       "13                   www.gpedia.com             2               0.06250\n",
       "14                en.wikipedia.org.             2               0.06250\n",
       "15                 books.google.com             1               0.03125\n",
       "16      speechpanel.readspeaker.com             1               0.03125\n",
       "17                 nl.wikiludia.com             1               0.03125\n",
       "18                 en.wk.jsproxy.tk             1               0.03125"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total event count for top 20 hostnames\n",
    "# limited to WP:M pages with external links\n",
    "# further limited to fnHover events\n",
    "top_hosts_wpm_query = \"\"\"\n",
    "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n",
    "FROM citationusage \n",
    "WHERE wiki = 'enwiki'\n",
    "AND action = 'fnHover'\n",
    "AND page_id IN \n",
    "    (SELECT DISTINCT page_id \n",
    "    FROM ryanmax.population_wpm_pages_with_extlinks\n",
    "    )\n",
    "{}\n",
    "AND to_date(event_time) >= '{}'\n",
    "AND to_date(event_time) <= '{}'\n",
    "AND useragent_is_bot = FALSE\n",
    "GROUP BY host\n",
    "ORDER BY COUNT(*) DESC\n",
    "LIMIT 20\n",
    "\"\"\"\n",
    "top_hosts_wpm_events = spark.sql(\n",
    "    top_hosts_wpm_query.format(\n",
    "        parse_host.format('link_url'),\n",
    "        days_in_study,\n",
    "        event_exclusion_sql, start_date_string, end_date_string\n",
    "    ))\n",
    "top_hosts_wpm_events.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Total event count for top 20 hostnames by event type\n",
    "- limited to W pages with external links\n",
    "- further limited to fnHover events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>total_events</th>\n",
       "      <th>daily_average_events</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>en.wikipedia.org</td>\n",
       "      <td>33809467</td>\n",
       "      <td>1.056546e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>en.m.wikipedia.org</td>\n",
       "      <td>2015284</td>\n",
       "      <td>6.297762e+04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>translate.googleusercontent.com</td>\n",
       "      <td>53714</td>\n",
       "      <td>1.678562e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>None</td>\n",
       "      <td>27224</td>\n",
       "      <td>8.507500e+02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>www.biblegateway.com</td>\n",
       "      <td>16145</td>\n",
       "      <td>5.045312e+02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>www.translatoruser-int.com</td>\n",
       "      <td>2063</td>\n",
       "      <td>6.446875e+01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>papago.naver.net</td>\n",
       "      <td>1250</td>\n",
       "      <td>3.906250e+01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>z5h64q92x9.net</td>\n",
       "      <td>327</td>\n",
       "      <td>1.021875e+01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>en.wikipedi0.org</td>\n",
       "      <td>246</td>\n",
       "      <td>7.687500e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>emedien3.sub.uni-hamburg.de</td>\n",
       "      <td>169</td>\n",
       "      <td>5.281250e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>books.google.com</td>\n",
       "      <td>104</td>\n",
       "      <td>3.250000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>translate.baiducontent.com</td>\n",
       "      <td>102</td>\n",
       "      <td>3.187500e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>www.gutenberg.org</td>\n",
       "      <td>80</td>\n",
       "      <td>2.500000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>fsfe.org</td>\n",
       "      <td>32</td>\n",
       "      <td>1.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>en.wikipedia.org.</td>\n",
       "      <td>29</td>\n",
       "      <td>9.062500e-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>wn.rsarchive.org</td>\n",
       "      <td>28</td>\n",
       "      <td>8.750000e-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>en.wk.jsproxy.tk</td>\n",
       "      <td>28</td>\n",
       "      <td>8.750000e-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>en.wikiludia.com</td>\n",
       "      <td>26</td>\n",
       "      <td>8.125000e-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>en.wikipedia.nom.rs</td>\n",
       "      <td>13</td>\n",
       "      <td>4.062500e-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>www.gpedia.com</td>\n",
       "      <td>12</td>\n",
       "      <td>3.750000e-01</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               host  total_events  daily_average_events\n",
       "0                  en.wikipedia.org      33809467          1.056546e+06\n",
       "1                en.m.wikipedia.org       2015284          6.297762e+04\n",
       "2   translate.googleusercontent.com         53714          1.678562e+03\n",
       "3                              None         27224          8.507500e+02\n",
       "4              www.biblegateway.com         16145          5.045312e+02\n",
       "5        www.translatoruser-int.com          2063          6.446875e+01\n",
       "6                  papago.naver.net          1250          3.906250e+01\n",
       "7                    z5h64q92x9.net           327          1.021875e+01\n",
       "8                  en.wikipedi0.org           246          7.687500e+00\n",
       "9       emedien3.sub.uni-hamburg.de           169          5.281250e+00\n",
       "10                 books.google.com           104          3.250000e+00\n",
       "11       translate.baiducontent.com           102          3.187500e+00\n",
       "12                www.gutenberg.org            80          2.500000e+00\n",
       "13                         fsfe.org            32          1.000000e+00\n",
       "14                en.wikipedia.org.            29          9.062500e-01\n",
       "15                 wn.rsarchive.org            28          8.750000e-01\n",
       "16                 en.wk.jsproxy.tk            28          8.750000e-01\n",
       "17                 en.wikiludia.com            26          8.125000e-01\n",
       "18              en.wikipedia.nom.rs            13          4.062500e-01\n",
       "19                   www.gpedia.com            12          3.750000e-01"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total event count for top 20 hostnames by event type\n",
    "# limited to W pages with external links\n",
    "# further limited to fnHover events\n",
    "top_hosts_w_query = \"\"\"\n",
    "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n",
    "FROM citationusage \n",
    "WHERE wiki = 'enwiki'\n",
    "AND action = 'fnHover'\n",
    "AND page_id IN \n",
    "    (SELECT DISTINCT page_id \n",
    "    FROM ryanmax.population_w_pages_with_extlinks\n",
    "    )\n",
    "{}\n",
    "AND to_date(event_time) >= '{}'\n",
    "AND to_date(event_time) <= '{}'\n",
    "AND useragent_is_bot = FALSE\n",
    "GROUP BY host\n",
    "ORDER BY COUNT(*) DESC\n",
    "LIMIT 20\n",
    "\"\"\"\n",
    "top_hosts_w_events = spark.sql(\n",
    "    top_hosts_w_query.format(\n",
    "        parse_host.format('link_url'),\n",
    "        days_in_study,\n",
    "        event_exclusion_sql, start_date_string, end_date_string\n",
    "    ))\n",
    "top_hosts_w_events.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### limited to extClick, fnClick OR upClick events"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Total event count for top 20 hostnames\n",
    "- limited to WP:M pages with external links\n",
    "- further limited to extClick, fnClick OR upClick events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>total_events</th>\n",
       "      <th>daily_average_events</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>en.m.wikipedia.org</td>\n",
       "      <td>537488</td>\n",
       "      <td>16796.50000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>en.wikipedia.org</td>\n",
       "      <td>395629</td>\n",
       "      <td>12363.40625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>www.ncbi.nlm.nih.gov</td>\n",
       "      <td>86432</td>\n",
       "      <td>2701.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>doi.org</td>\n",
       "      <td>74930</td>\n",
       "      <td>2341.56250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>books.google.com</td>\n",
       "      <td>19728</td>\n",
       "      <td>616.50000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>www.drugs.com</td>\n",
       "      <td>16639</td>\n",
       "      <td>519.96875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>www.who.int</td>\n",
       "      <td>15105</td>\n",
       "      <td>472.03125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>theranos.com</td>\n",
       "      <td>10660</td>\n",
       "      <td>333.12500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>www.cdc.gov</td>\n",
       "      <td>10119</td>\n",
       "      <td>316.21875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>chemapps.stolaf.edu</td>\n",
       "      <td>7613</td>\n",
       "      <td>237.90625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>translate.googleusercontent.com</td>\n",
       "      <td>6743</td>\n",
       "      <td>210.71875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>apps.who.int</td>\n",
       "      <td>6384</td>\n",
       "      <td>199.50000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>www.nlm.nih.gov</td>\n",
       "      <td>6291</td>\n",
       "      <td>196.59375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>www.fda.gov</td>\n",
       "      <td>5101</td>\n",
       "      <td>159.40625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>www.nytimes.com</td>\n",
       "      <td>5005</td>\n",
       "      <td>156.40625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>www.commonchemistry.org</td>\n",
       "      <td>4604</td>\n",
       "      <td>143.87500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>www.accessdata.fda.gov</td>\n",
       "      <td>4352</td>\n",
       "      <td>136.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>www.youtube.com</td>\n",
       "      <td>4154</td>\n",
       "      <td>129.81250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>emedicine.medscape.com</td>\n",
       "      <td>3791</td>\n",
       "      <td>118.46875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>pubchem.ncbi.nlm.nih.gov</td>\n",
       "      <td>3469</td>\n",
       "      <td>108.40625</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               host  total_events  daily_average_events\n",
       "0                en.m.wikipedia.org        537488           16796.50000\n",
       "1                  en.wikipedia.org        395629           12363.40625\n",
       "2              www.ncbi.nlm.nih.gov         86432            2701.00000\n",
       "3                           doi.org         74930            2341.56250\n",
       "4                  books.google.com         19728             616.50000\n",
       "5                     www.drugs.com         16639             519.96875\n",
       "6                       www.who.int         15105             472.03125\n",
       "7                      theranos.com         10660             333.12500\n",
       "8                       www.cdc.gov         10119             316.21875\n",
       "9               chemapps.stolaf.edu          7613             237.90625\n",
       "10  translate.googleusercontent.com          6743             210.71875\n",
       "11                     apps.who.int          6384             199.50000\n",
       "12                  www.nlm.nih.gov          6291             196.59375\n",
       "13                      www.fda.gov          5101             159.40625\n",
       "14                  www.nytimes.com          5005             156.40625\n",
       "15          www.commonchemistry.org          4604             143.87500\n",
       "16           www.accessdata.fda.gov          4352             136.00000\n",
       "17                  www.youtube.com          4154             129.81250\n",
       "18           emedicine.medscape.com          3791             118.46875\n",
       "19         pubchem.ncbi.nlm.nih.gov          3469             108.40625"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total event count for top 20 hostnames\n",
    "# limited to WP:M pages with external links\n",
    "# further limited to extClick, fnClick OR upClick events\n",
    "top_hosts_wpm_query = \"\"\"\n",
    "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n",
    "FROM citationusage \n",
    "WHERE wiki = 'enwiki'\n",
    "AND action in ('extClick', 'upClick','fnClick')\n",
    "AND page_id IN \n",
    "    (SELECT DISTINCT page_id \n",
    "    FROM ryanmax.population_wpm_pages_with_extlinks\n",
    "    )\n",
    "{}\n",
    "AND to_date(event_time) >= '{}'\n",
    "AND to_date(event_time) <= '{}'\n",
    "AND useragent_is_bot = FALSE\n",
    "GROUP BY host\n",
    "ORDER BY COUNT(*) DESC\n",
    "LIMIT 20\n",
    "\"\"\"\n",
    "top_hosts_wpm_events = spark.sql(\n",
    "    top_hosts_wpm_query.format(\n",
    "        parse_host.format('link_url'),\n",
    "        days_in_study,\n",
    "        event_exclusion_sql, start_date_string, end_date_string\n",
    "    ))\n",
    "top_hosts_wpm_events.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Total event count for top 20 hostnames by event type\n",
    "- limited to W pages with external links\n",
    "- further limited to extClick, fnClick OR upClick events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>total_events</th>\n",
       "      <th>daily_average_events</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>en.m.wikipedia.org</td>\n",
       "      <td>15621530</td>\n",
       "      <td>488172.81250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>en.wikipedia.org</td>\n",
       "      <td>8524542</td>\n",
       "      <td>266391.93750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>www.imdb.com</td>\n",
       "      <td>3050694</td>\n",
       "      <td>95334.18750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>tools.wmflabs.org</td>\n",
       "      <td>1064153</td>\n",
       "      <td>33254.78125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>www.youtube.com</td>\n",
       "      <td>601433</td>\n",
       "      <td>18794.78125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>books.google.com</td>\n",
       "      <td>573142</td>\n",
       "      <td>17910.68750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>www.espncricinfo.com</td>\n",
       "      <td>379919</td>\n",
       "      <td>11872.46875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>doi.org</td>\n",
       "      <td>350714</td>\n",
       "      <td>10959.81250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>www.nytimes.com</td>\n",
       "      <td>301568</td>\n",
       "      <td>9424.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>www.bbc.co.uk</td>\n",
       "      <td>227540</td>\n",
       "      <td>7110.62500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>www.theguardian.com</td>\n",
       "      <td>225815</td>\n",
       "      <td>7056.71875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>www.instagram.com</td>\n",
       "      <td>182944</td>\n",
       "      <td>5717.00000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>twitter.com</td>\n",
       "      <td>180994</td>\n",
       "      <td>5656.06250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>www.google.com</td>\n",
       "      <td>166936</td>\n",
       "      <td>5216.75000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>archive.org</td>\n",
       "      <td>162164</td>\n",
       "      <td>5067.62500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>www.ncbi.nlm.nih.gov</td>\n",
       "      <td>157171</td>\n",
       "      <td>4911.59375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>www.allmusic.com</td>\n",
       "      <td>155615</td>\n",
       "      <td>4862.96875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>www.rottentomatoes.com</td>\n",
       "      <td>140717</td>\n",
       "      <td>4397.40625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>translate.googleusercontent.com</td>\n",
       "      <td>131144</td>\n",
       "      <td>4098.25000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>www.findagrave.com</td>\n",
       "      <td>130021</td>\n",
       "      <td>4063.15625</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               host  total_events  daily_average_events\n",
       "0                en.m.wikipedia.org      15621530          488172.81250\n",
       "1                  en.wikipedia.org       8524542          266391.93750\n",
       "2                      www.imdb.com       3050694           95334.18750\n",
       "3                 tools.wmflabs.org       1064153           33254.78125\n",
       "4                   www.youtube.com        601433           18794.78125\n",
       "5                  books.google.com        573142           17910.68750\n",
       "6              www.espncricinfo.com        379919           11872.46875\n",
       "7                           doi.org        350714           10959.81250\n",
       "8                   www.nytimes.com        301568            9424.00000\n",
       "9                     www.bbc.co.uk        227540            7110.62500\n",
       "10              www.theguardian.com        225815            7056.71875\n",
       "11                www.instagram.com        182944            5717.00000\n",
       "12                      twitter.com        180994            5656.06250\n",
       "13                   www.google.com        166936            5216.75000\n",
       "14                      archive.org        162164            5067.62500\n",
       "15             www.ncbi.nlm.nih.gov        157171            4911.59375\n",
       "16                 www.allmusic.com        155615            4862.96875\n",
       "17           www.rottentomatoes.com        140717            4397.40625\n",
       "18  translate.googleusercontent.com        131144            4098.25000\n",
       "19               www.findagrave.com        130021            4063.15625"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total event count for top 20 hostnames by event type\n",
    "# limited to W pages with external links\n",
    "# further limited to extClick, fnClick OR upClick events\n",
    "top_hosts_w_query = \"\"\"\n",
    "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n",
    "FROM citationusage \n",
    "WHERE wiki = 'enwiki'\n",
    "AND action in ('extClick', 'upClick','fnClick')\n",
    "AND page_id IN \n",
    "    (SELECT DISTINCT page_id \n",
    "    FROM ryanmax.population_w_pages_with_extlinks\n",
    "    )\n",
    "{}\n",
    "AND to_date(event_time) >= '{}'\n",
    "AND to_date(event_time) <= '{}'\n",
    "AND useragent_is_bot = FALSE\n",
    "GROUP BY host\n",
    "ORDER BY COUNT(*) DESC\n",
    "LIMIT 20\n",
    "\"\"\"\n",
    "top_hosts_w_events = spark.sql(\n",
    "    top_hosts_w_query.format(\n",
    "        parse_host.format('link_url'),\n",
    "        days_in_study,\n",
    "        event_exclusion_sql, start_date_string, end_date_string\n",
    "    ))\n",
    "top_hosts_w_events.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top wikipedia.org links by event type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>action</th>\n",
       "      <th>extClick</th>\n",
       "      <th>fnClick</th>\n",
       "      <th>fnHover</th>\n",
       "      <th>upClick</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>host</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>ar.wikipedia.org</th>\n",
       "      <td>276.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>de.wikipedia.org</th>\n",
       "      <td>1798.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>en.m.wikipedia.org</th>\n",
       "      <td>NaN</td>\n",
       "      <td>16044275.0</td>\n",
       "      <td>2105104.0</td>\n",
       "      <td>114695.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>en.wikipedia.org</th>\n",
       "      <td>262.0</td>\n",
       "      <td>7875877.0</td>\n",
       "      <td>35274475.0</td>\n",
       "      <td>1043977.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>eo.wikipedia.org</th>\n",
       "      <td>280.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>es.wikipedia.org</th>\n",
       "      <td>785.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fr.wikipedia.org</th>\n",
       "      <td>1804.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>it.wikipedia.org</th>\n",
       "      <td>646.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ja.wikipedia.org</th>\n",
       "      <td>1973.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nl.wikipedia.org</th>\n",
       "      <td>1062.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pt.wikipedia.org</th>\n",
       "      <td>383.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ru.wikipedia.org</th>\n",
       "      <td>1092.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>simple.wikipedia.org</th>\n",
       "      <td>563.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>www.wikipedia.org</th>\n",
       "      <td>941.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zh.wikipedia.org</th>\n",
       "      <td>1171.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "action                extClick     fnClick     fnHover    upClick\n",
       "host                                                             \n",
       "ar.wikipedia.org         276.0         NaN         NaN        NaN\n",
       "de.wikipedia.org        1798.0         NaN         NaN        NaN\n",
       "en.m.wikipedia.org         NaN  16044275.0   2105104.0   114695.0\n",
       "en.wikipedia.org         262.0   7875877.0  35274475.0  1043977.0\n",
       "eo.wikipedia.org         280.0         NaN         NaN        NaN\n",
       "es.wikipedia.org         785.0         NaN         NaN        NaN\n",
       "fr.wikipedia.org        1804.0         NaN         NaN        NaN\n",
       "it.wikipedia.org         646.0         NaN         NaN        NaN\n",
       "ja.wikipedia.org        1973.0         NaN         NaN        NaN\n",
       "nl.wikipedia.org        1062.0         NaN         NaN        NaN\n",
       "pt.wikipedia.org         383.0         NaN         NaN        NaN\n",
       "ru.wikipedia.org        1092.0         NaN         NaN        NaN\n",
       "simple.wikipedia.org     563.0         NaN         NaN        NaN\n",
       "www.wikipedia.org        941.0         NaN         NaN        NaN\n",
       "zh.wikipedia.org        1171.0         NaN         NaN        NaN"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# wikipedia.org links by event type\n",
    "top_wiki_hosts_query = \"\"\"\n",
    "SELECT {} AS host, action, COUNT(*) AS count \n",
    "FROM citationusage \n",
    "WHERE wiki = 'enwiki'\n",
    "AND lower(parse_url(link_url,'HOST')) like '%wikipedia.org'\n",
    "AND (\n",
    "        page_id IN \n",
    "        (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)\n",
    "        OR page_id IN\n",
    "        (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n",
    "    )\n",
    "{}\n",
    "AND to_date(event_time) >= '{}'\n",
    "AND to_date(event_time) <= '{}'\n",
    "AND useragent_is_bot = FALSE\n",
    "GROUP BY host, action\n",
    "ORDER BY COUNT(*) DESC\n",
    "LIMIT 20\n",
    "\"\"\"\n",
    "top_wiki_hosts_events = spark.sql(\n",
    "    top_wiki_hosts_query.format(\n",
    "        parse_host.format('link_url'),\n",
    "        event_exclusion_sql, start_date_string, end_date_string\n",
    "    ))\n",
    "top_wiki_hosts_events_rdd = top_wiki_hosts_events.rdd\n",
    "top_wiki_hosts_events_df = sqlContext.createDataFrame(top_wiki_hosts_events_rdd)\n",
    "top_wiki_hosts_events_pandas = top_wiki_hosts_events_df.toPandas()\n",
    "\n",
    "top_wiki_hosts_events_pandas.pivot(index='host', columns='action', values='count')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### web.archive.org links by event type for W:PM pages withn external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>action</th>\n",
       "      <th>extClick</th>\n",
       "      <th>fnClick</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>host</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>web.archive.org</th>\n",
       "      <td>51898</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "action           extClick  fnClick\n",
       "host                              \n",
       "web.archive.org     51898        3"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# web.archive.org links by event type for W:PM pages withn external links\n",
    "archive_org_wpm_query = \"\"\"\n",
    "SELECT PARSE_URL(LOWER(link_url),'HOST') AS host, action, COUNT(*) AS count \n",
    "FROM citationusage \n",
    "WHERE wiki = 'enwiki'\n",
    "AND LOWER(link_url) RLIKE '^https?://web.archive.org/web/'\n",
    "AND PARSE_URL(LOWER(link_url),'HOST') = 'web.archive.org'\n",
    "AND page_id IN\n",
    "        (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n",
    "{}\n",
    "AND to_date(event_time) >= '{}'\n",
    "AND to_date(event_time) <= '{}'\n",
    "AND useragent_is_bot = FALSE\n",
    "GROUP BY host, action\n",
    "\"\"\"\n",
    "archive_org_wpm_events = spark.sql(\n",
    "    archive_org_wpm_query.format(\n",
    "        event_exclusion_sql, start_date_string, end_date_string\n",
    "    ))\n",
    "archive_org_wpm_events_rdd = archive_org_wpm_events.rdd\n",
    "archive_org_wpm_events_df = sqlContext.createDataFrame(archive_org_wpm_events_rdd)\n",
    "archive_org_wpm_events_pandas = archive_org_wpm_events_df.toPandas()\n",
    "\n",
    "archive_org_wpm_events_pandas.pivot(index='host', columns='action', values='count')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### web.archive.org links by event type for W pages withn external links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>action</th>\n",
       "      <th>extClick</th>\n",
       "      <th>fnClick</th>\n",
       "      <th>fnHover</th>\n",
       "      <th>upClick</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>host</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>web.archive.org</th>\n",
       "      <td>1963023</td>\n",
       "      <td>31</td>\n",
       "      <td>57</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "action           extClick  fnClick  fnHover  upClick\n",
       "host                                                \n",
       "web.archive.org   1963023       31       57        1"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# web.archive.org links by event type for W pages withn external links\n",
    "archive_org_w_query = \"\"\"\n",
    "SELECT PARSE_URL(LOWER(link_url),'HOST') AS host, action, COUNT(*) AS count \n",
    "FROM citationusage \n",
    "WHERE wiki = 'enwiki'\n",
    "AND LOWER(link_url) RLIKE '^https?://web.archive.org/web/'\n",
    "AND PARSE_URL(LOWER(link_url),'HOST') = 'web.archive.org'\n",
    "AND page_id IN\n",
    "        (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)\n",
    "{}\n",
    "AND to_date(event_time) >= '{}'\n",
    "AND to_date(event_time) <= '{}'\n",
    "AND useragent_is_bot = FALSE\n",
    "GROUP BY host, action\n",
    "\"\"\"\n",
    "archive_org_w_events = spark.sql(\n",
    "    archive_org_w_query.format(\n",
    "        event_exclusion_sql, start_date_string, end_date_string\n",
    "    ))\n",
    "archive_org_w_events_rdd = archive_org_w_events.rdd\n",
    "archive_org_w_events_df = sqlContext.createDataFrame(archive_org_w_events_rdd)\n",
    "archive_org_w_events_pandas = archive_org_w_events_df.toPandas()\n",
    "\n",
    "archive_org_w_events_pandas.pivot(index='host', columns='action', values='count')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PySpark - YARN",
   "language": "python",
   "name": "spark_yarn_pyspark"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}