{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exploration of tools.wmflabs.org click events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n",
    "%run -i 'data-defaults.py'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## tools.wmflabs.org link counts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2019-07-26 [count of distinct page/externallink to tools.wmflabs.org](https://quarry.wmflabs.org/query/37908)\n",
    "\n",
    "select count(distinct el_from, el_to) from externallinks where el_to like '%tools.wmflabs.org%';\n",
    "\n",
    "\n",
    "| count(distinct el_from, el_to) |\n",
    "|--------------------------------|\n",
    "|                       12804396 |"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### count of tools.wmflabs.org links in 20190420 externallinks table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+\n",
      "|   links|\n",
      "+--------+\n",
      "|12664610|\n",
      "+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# count of tools.wmflabs.org links in 20190420 externallinks table\n",
    "query = \"\"\"\n",
    "select count(distinct el_from, el_to) AS links \n",
    "FROM ryanmax.population_externallinks \n",
    "WHERE LOWER(el_to) LIKE '%tools.wmflabs.org%'\n",
    "\"\"\"\n",
    "spark.sql(query).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>link_path</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>tools.wmflabs.org/geohack</td>\n",
       "      <td>1053717</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>tools.wmflabs.org/os</td>\n",
       "      <td>15816</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>tools.wmflabs.org/osm4wiki</td>\n",
       "      <td>11249</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>tools.wmflabs.org/isin</td>\n",
       "      <td>2239</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>tools.wmflabs.org/kmlexport</td>\n",
       "      <td>1966</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>tools.wmflabs.org/timescale</td>\n",
       "      <td>1780</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>tools.wmflabs.org/bibleversefinder2</td>\n",
       "      <td>1331</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>tools.wmflabs.org/ftl</td>\n",
       "      <td>1115</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>tools.wmflabs.org/scholia</td>\n",
       "      <td>435</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>tools.wmflabs.org/panoviewer</td>\n",
       "      <td>184</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>tools.wmflabs.org/dupdet</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>tools.wmflabs.org/wikidata-externalid-url</td>\n",
       "      <td>46</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>tools.wmflabs.org/citeplato</td>\n",
       "      <td>41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>tools.wmflabs.org/zoomviewer</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>tools.wmflabs.org/copyvios</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>tools.wmflabs.org/wiwosm</td>\n",
       "      <td>28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>tools.wmflabs.org/family</td>\n",
       "      <td>26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>tools.wmflabs.org/reasonator</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>tools.wmflabs.org/makeref</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>tools.wmflabs.org/citations</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>tools.wmflabs.org/dplbot</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>tools.wmflabs.org/dispenser</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>tools.wmflabs.org/refill</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>tools.wmflabs.org/hashtags</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>tools.wmflabs.org/wikivoyage</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>tools.wmflabs.org/magnustools</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>tools.wmflabs.org/citation-template-filling</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>tools.wmflabs.org/denkmalliste</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>tools.wmflabs.org/templatecount</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>tools.wmflabs.org/pageviews</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>tools.wmflabs.org/citer</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      link_path    count\n",
       "0                     tools.wmflabs.org/geohack  1053717\n",
       "1                          tools.wmflabs.org/os    15816\n",
       "2                    tools.wmflabs.org/osm4wiki    11249\n",
       "3                        tools.wmflabs.org/isin     2239\n",
       "4                   tools.wmflabs.org/kmlexport     1966\n",
       "5                   tools.wmflabs.org/timescale     1780\n",
       "6           tools.wmflabs.org/bibleversefinder2     1331\n",
       "7                         tools.wmflabs.org/ftl     1115\n",
       "8                     tools.wmflabs.org/scholia      435\n",
       "9                  tools.wmflabs.org/panoviewer      184\n",
       "10                     tools.wmflabs.org/dupdet       50\n",
       "11    tools.wmflabs.org/wikidata-externalid-url       46\n",
       "12                  tools.wmflabs.org/citeplato       41\n",
       "13                 tools.wmflabs.org/zoomviewer       37\n",
       "14                   tools.wmflabs.org/copyvios       30\n",
       "15                     tools.wmflabs.org/wiwosm       28\n",
       "16                     tools.wmflabs.org/family       26\n",
       "17                 tools.wmflabs.org/reasonator       21\n",
       "18                    tools.wmflabs.org/makeref       13\n",
       "19                  tools.wmflabs.org/citations       11\n",
       "20                     tools.wmflabs.org/dplbot        9\n",
       "21                  tools.wmflabs.org/dispenser        6\n",
       "22                     tools.wmflabs.org/refill        5\n",
       "23                   tools.wmflabs.org/hashtags        4\n",
       "24                 tools.wmflabs.org/wikivoyage        3\n",
       "25                tools.wmflabs.org/magnustools        3\n",
       "26  tools.wmflabs.org/citation-template-filling        2\n",
       "27               tools.wmflabs.org/denkmalliste        1\n",
       "28              tools.wmflabs.org/templatecount        1\n",
       "29                  tools.wmflabs.org/pageviews        1\n",
       "30                      tools.wmflabs.org/citer        1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count of tools.wmflabs.org events by tools.wmflabs.org/subdirectory/\n",
    "\n",
    "events_query = \"\"\"\n",
    "select CONCAT('tools.wmflabs.org/',REGEXP_REPLACE(link_url,'.*tools.wmflabs.org/([a-zA-Z0-9\\-_]+)/?.*','$1')) as link_path, count(*) AS count \n",
    "FROM citationusage\n",
    "WHERE wiki = 'enwiki'\n",
    "AND useragent_is_bot = FALSE\n",
    "AND LOWER(link_url) LIKE '%://tools.wmflabs.org%' \n",
    "GROUP BY link_path\n",
    "ORDER BY count DESC\n",
    "LIMIT 100\n",
    "\"\"\"\n",
    "events = spark.sql(events_query)\n",
    "events.toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PySpark - YARN",
   "language": "python",
   "name": "spark_yarn_pyspark"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}