{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploration of tools.wmflabs.org click events" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n", "%run -i 'data-defaults.py'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## tools.wmflabs.org link counts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2019-07-26 [count of distinct page/externallink to tools.wmflabs.org](https://quarry.wmflabs.org/query/37908)\n", "\n", "select count(distinct el_from, el_to) from externallinks where el_to like '%tools.wmflabs.org%';\n", "\n", "\n", "| count(distinct el_from, el_to) |\n", "|--------------------------------|\n", "| 12804396 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### count of tools.wmflabs.org links in 20190420 externallinks table" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+\n", "| links|\n", "+--------+\n", "|12664610|\n", "+--------+\n", "\n" ] } ], "source": [ "# count of tools.wmflabs.org links in 20190420 externallinks table\n", "query = \"\"\"\n", "select count(distinct el_from, el_to) AS links \n", "FROM ryanmax.population_externallinks \n", "WHERE LOWER(el_to) LIKE '%tools.wmflabs.org%'\n", "\"\"\"\n", "spark.sql(query).show()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
link_pathcount
0tools.wmflabs.org/geohack1053717
1tools.wmflabs.org/os15816
2tools.wmflabs.org/osm4wiki11249
3tools.wmflabs.org/isin2239
4tools.wmflabs.org/kmlexport1966
5tools.wmflabs.org/timescale1780
6tools.wmflabs.org/bibleversefinder21331
7tools.wmflabs.org/ftl1115
8tools.wmflabs.org/scholia435
9tools.wmflabs.org/panoviewer184
10tools.wmflabs.org/dupdet50
11tools.wmflabs.org/wikidata-externalid-url46
12tools.wmflabs.org/citeplato41
13tools.wmflabs.org/zoomviewer37
14tools.wmflabs.org/copyvios30
15tools.wmflabs.org/wiwosm28
16tools.wmflabs.org/family26
17tools.wmflabs.org/reasonator21
18tools.wmflabs.org/makeref13
19tools.wmflabs.org/citations11
20tools.wmflabs.org/dplbot9
21tools.wmflabs.org/dispenser6
22tools.wmflabs.org/refill5
23tools.wmflabs.org/hashtags4
24tools.wmflabs.org/wikivoyage3
25tools.wmflabs.org/magnustools3
26tools.wmflabs.org/citation-template-filling2
27tools.wmflabs.org/denkmalliste1
28tools.wmflabs.org/templatecount1
29tools.wmflabs.org/pageviews1
30tools.wmflabs.org/citer1
\n", "
" ], "text/plain": [ " link_path count\n", "0 tools.wmflabs.org/geohack 1053717\n", "1 tools.wmflabs.org/os 15816\n", "2 tools.wmflabs.org/osm4wiki 11249\n", "3 tools.wmflabs.org/isin 2239\n", "4 tools.wmflabs.org/kmlexport 1966\n", "5 tools.wmflabs.org/timescale 1780\n", "6 tools.wmflabs.org/bibleversefinder2 1331\n", "7 tools.wmflabs.org/ftl 1115\n", "8 tools.wmflabs.org/scholia 435\n", "9 tools.wmflabs.org/panoviewer 184\n", "10 tools.wmflabs.org/dupdet 50\n", "11 tools.wmflabs.org/wikidata-externalid-url 46\n", "12 tools.wmflabs.org/citeplato 41\n", "13 tools.wmflabs.org/zoomviewer 37\n", "14 tools.wmflabs.org/copyvios 30\n", "15 tools.wmflabs.org/wiwosm 28\n", "16 tools.wmflabs.org/family 26\n", "17 tools.wmflabs.org/reasonator 21\n", "18 tools.wmflabs.org/makeref 13\n", "19 tools.wmflabs.org/citations 11\n", "20 tools.wmflabs.org/dplbot 9\n", "21 tools.wmflabs.org/dispenser 6\n", "22 tools.wmflabs.org/refill 5\n", "23 tools.wmflabs.org/hashtags 4\n", "24 tools.wmflabs.org/wikivoyage 3\n", "25 tools.wmflabs.org/magnustools 3\n", "26 tools.wmflabs.org/citation-template-filling 2\n", "27 tools.wmflabs.org/denkmalliste 1\n", "28 tools.wmflabs.org/templatecount 1\n", "29 tools.wmflabs.org/pageviews 1\n", "30 tools.wmflabs.org/citer 1" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of tools.wmflabs.org events by tools.wmflabs.org/subdirectory/\n", "\n", "events_query = \"\"\"\n", "select CONCAT('tools.wmflabs.org/',REGEXP_REPLACE(link_url,'.*tools.wmflabs.org/([a-zA-Z0-9\\-_]+)/?.*','$1')) as link_path, count(*) AS count \n", "FROM citationusage\n", "WHERE wiki = 'enwiki'\n", "AND useragent_is_bot = FALSE\n", "AND LOWER(link_url) LIKE '%://tools.wmflabs.org%' \n", "GROUP BY link_path\n", "ORDER BY count DESC\n", "LIMIT 100\n", "\"\"\"\n", "events = spark.sql(events_query)\n", "events.toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark - YARN", "language": "python", "name": "spark_yarn_pyspark" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }