{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploration of tools.wmflabs.org click events" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n", "%run -i 'data-defaults.py'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## tools.wmflabs.org link counts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2019-07-26 [count of distinct page/externallink to tools.wmflabs.org](https://quarry.wmflabs.org/query/37908)\n", "\n", "select count(distinct el_from, el_to) from externallinks where el_to like '%tools.wmflabs.org%';\n", "\n", "\n", "| count(distinct el_from, el_to) |\n", "|--------------------------------|\n", "| 12804396 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### count of tools.wmflabs.org links in 20190420 externallinks table" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+\n", "| links|\n", "+--------+\n", "|12664610|\n", "+--------+\n", "\n" ] } ], "source": [ "# count of tools.wmflabs.org links in 20190420 externallinks table\n", "query = \"\"\"\n", "select count(distinct el_from, el_to) AS links \n", "FROM ryanmax.population_externallinks \n", "WHERE LOWER(el_to) LIKE '%tools.wmflabs.org%'\n", "\"\"\"\n", "spark.sql(query).show()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>link_path</th>\n", " <th>count</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>tools.wmflabs.org/geohack</td>\n", " <td>1053717</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>tools.wmflabs.org/os</td>\n", " <td>15816</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>tools.wmflabs.org/osm4wiki</td>\n", " <td>11249</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>tools.wmflabs.org/isin</td>\n", " <td>2239</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>tools.wmflabs.org/kmlexport</td>\n", " <td>1966</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>tools.wmflabs.org/timescale</td>\n", " <td>1780</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>tools.wmflabs.org/bibleversefinder2</td>\n", " <td>1331</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>tools.wmflabs.org/ftl</td>\n", " <td>1115</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>tools.wmflabs.org/scholia</td>\n", " <td>435</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>tools.wmflabs.org/panoviewer</td>\n", " <td>184</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>tools.wmflabs.org/dupdet</td>\n", " <td>50</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>tools.wmflabs.org/wikidata-externalid-url</td>\n", " <td>46</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>tools.wmflabs.org/citeplato</td>\n", " <td>41</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>tools.wmflabs.org/zoomviewer</td>\n", " <td>37</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>tools.wmflabs.org/copyvios</td>\n", " <td>30</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>tools.wmflabs.org/wiwosm</td>\n", " <td>28</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>tools.wmflabs.org/family</td>\n", " <td>26</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>tools.wmflabs.org/reasonator</td>\n", " <td>21</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>tools.wmflabs.org/makeref</td>\n", " <td>13</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>tools.wmflabs.org/citations</td>\n", " <td>11</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>tools.wmflabs.org/dplbot</td>\n", " <td>9</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>tools.wmflabs.org/dispenser</td>\n", " <td>6</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>tools.wmflabs.org/refill</td>\n", " <td>5</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>tools.wmflabs.org/hashtags</td>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>tools.wmflabs.org/wikivoyage</td>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>tools.wmflabs.org/magnustools</td>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>tools.wmflabs.org/citation-template-filling</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>tools.wmflabs.org/denkmalliste</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>tools.wmflabs.org/templatecount</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>tools.wmflabs.org/pageviews</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>30</th>\n", " <td>tools.wmflabs.org/citer</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " link_path count\n", "0 tools.wmflabs.org/geohack 1053717\n", "1 tools.wmflabs.org/os 15816\n", "2 tools.wmflabs.org/osm4wiki 11249\n", "3 tools.wmflabs.org/isin 2239\n", "4 tools.wmflabs.org/kmlexport 1966\n", "5 tools.wmflabs.org/timescale 1780\n", "6 tools.wmflabs.org/bibleversefinder2 1331\n", "7 tools.wmflabs.org/ftl 1115\n", "8 tools.wmflabs.org/scholia 435\n", "9 tools.wmflabs.org/panoviewer 184\n", "10 tools.wmflabs.org/dupdet 50\n", "11 tools.wmflabs.org/wikidata-externalid-url 46\n", "12 tools.wmflabs.org/citeplato 41\n", "13 tools.wmflabs.org/zoomviewer 37\n", "14 tools.wmflabs.org/copyvios 30\n", "15 tools.wmflabs.org/wiwosm 28\n", "16 tools.wmflabs.org/family 26\n", "17 tools.wmflabs.org/reasonator 21\n", "18 tools.wmflabs.org/makeref 13\n", "19 tools.wmflabs.org/citations 11\n", "20 tools.wmflabs.org/dplbot 9\n", "21 tools.wmflabs.org/dispenser 6\n", "22 tools.wmflabs.org/refill 5\n", "23 tools.wmflabs.org/hashtags 4\n", "24 tools.wmflabs.org/wikivoyage 3\n", "25 tools.wmflabs.org/magnustools 3\n", "26 tools.wmflabs.org/citation-template-filling 2\n", "27 tools.wmflabs.org/denkmalliste 1\n", "28 tools.wmflabs.org/templatecount 1\n", "29 tools.wmflabs.org/pageviews 1\n", "30 tools.wmflabs.org/citer 1" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of tools.wmflabs.org events by tools.wmflabs.org/subdirectory/\n", "\n", "events_query = \"\"\"\n", "select CONCAT('tools.wmflabs.org/',REGEXP_REPLACE(link_url,'.*tools.wmflabs.org/([a-zA-Z0-9\\-_]+)/?.*','$1')) as link_path, count(*) AS count \n", "FROM citationusage\n", "WHERE wiki = 'enwiki'\n", "AND useragent_is_bot = FALSE\n", "AND LOWER(link_url) LIKE '%://tools.wmflabs.org%' \n", "GROUP BY link_path\n", "ORDER BY count DESC\n", "LIMIT 100\n", "\"\"\"\n", "events = spark.sql(events_query)\n", "events.toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark - YARN", "language": "python", "name": "spark_yarn_pyspark" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }