{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Top Hostnames\n", "- report externallinks hostname counts for W and WP:M pages\n", "- report various hostname event data for W and WP:M pages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n", "%run -i 'data-defaults.py'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# parse hostnames from links\n", "# includes extraction for web.archive.org links\n", "parse_host = \"\"\"\n", " PARSE_URL(REGEXP_REPLACE(LOWER({}),'^https?://web.archive.org/web/[^/]+/',''),'HOST')\n", "\"\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Hostnames present in pages with external links" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Top 20 hostnames found in WP:M pages with external links" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hostcount
0www.ncbi.nlm.nih.gov238688
1doi.org183761
2www.worldcat.org25434
3books.google.com24491
4www.google.com11074
5apps.who.int12816
6www.icd9data.com8693
7www.wikidata.org8069
8amigo.geneontology.org6909
9www.nlm.nih.gov6420
10www.cdc.gov6285
11www.jstor.org4302
12scholar.google.com2850
13www.who.int5350
14www.nytimes.com4545
15www.omim.org4415
16adsabs.harvard.edu4255
17www.emedicine.com3477
18www.diseasesdatabase.com3105
19id.loc.gov3008
\n", "
" ], "text/plain": [ " host count\n", "0 www.ncbi.nlm.nih.gov 238688\n", "1 doi.org 183761\n", "2 www.worldcat.org 25434\n", "3 books.google.com 24491\n", "4 www.google.com 11074\n", "5 apps.who.int 12816\n", "6 www.icd9data.com 8693\n", "7 www.wikidata.org 8069\n", "8 amigo.geneontology.org 6909\n", "9 www.nlm.nih.gov 6420\n", "10 www.cdc.gov 6285\n", "11 www.jstor.org 4302\n", "12 scholar.google.com 2850\n", "13 www.who.int 5350\n", "14 www.nytimes.com 4545\n", "15 www.omim.org 4415\n", "16 adsabs.harvard.edu 4255\n", "17 www.emedicine.com 3477\n", "18 www.diseasesdatabase.com 3105\n", "19 id.loc.gov 3008" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of top 20 hostnames in WP:M pages with external links\n", "top_hosts_wpm_query = \"\"\"\n", "SELECT {} AS host, COUNT(DISTINCT el_from, el_to) AS count \n", "FROM ryanmax.population_externallinks\n", "WHERE el_from IN \n", " (SELECT DISTINCT page_id AS el_from\n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_wpm = spark.sql(\n", " top_hosts_wpm_query.format(\n", " parse_host.format('el_to')\n", " ))\n", "top_hosts_wpm.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Top 20 hostnames found in W pages with external links" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hostcount
0tools.wmflabs.org2118863
1www.google.com1763467
2doi.org1199818
3www.ncbi.nlm.nih.gov1122007
4www.worldcat.org1061459
5books.google.com1231622
6scholar.google.com437109
7www.wikidata.org805309
8www.jstor.org548764
9ssd.jpl.nasa.gov592901
10minorplanetcenter.net556758
11viaf.org535724
12id.loc.gov450557
13www.imdb.com418623
14isni.org413584
15www.gbif.org351989
16www.nytimes.com343951
17www.youtube.com330816
18eol.org304728
19www.webcitation.org301718
\n", "
" ], "text/plain": [ " host count\n", "0 tools.wmflabs.org 2118863\n", "1 www.google.com 1763467\n", "2 doi.org 1199818\n", "3 www.ncbi.nlm.nih.gov 1122007\n", "4 www.worldcat.org 1061459\n", "5 books.google.com 1231622\n", "6 scholar.google.com 437109\n", "7 www.wikidata.org 805309\n", "8 www.jstor.org 548764\n", "9 ssd.jpl.nasa.gov 592901\n", "10 minorplanetcenter.net 556758\n", "11 viaf.org 535724\n", "12 id.loc.gov 450557\n", "13 www.imdb.com 418623\n", "14 isni.org 413584\n", "15 www.gbif.org 351989\n", "16 www.nytimes.com 343951\n", "17 www.youtube.com 330816\n", "18 eol.org 304728\n", "19 www.webcitation.org 301718" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of top 20 hostnames in WP:M pages with external links\n", "top_hosts_w_query = \"\"\"\n", "SELECT {} AS host, COUNT(DISTINCT el_from, el_to) AS count \n", "FROM ryanmax.population_externallinks\n", "WHERE el_from IN \n", " (SELECT DISTINCT page_id AS el_from\n", " FROM ryanmax.population_w_pages_with_extlinks\n", " )\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_w = spark.sql(\n", " top_hosts_w_query.format(\n", " parse_host.format('el_to')\n", " ))\n", "top_hosts_w.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### web.archive.org links found in WP:M pages with external links" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hostcount
0web.archive.org50390
\n", "
" ], "text/plain": [ " host count\n", "0 web.archive.org 50390" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of web.archive.org links in WP:M pages with external links\n", "archive_org_wpm_query = \"\"\"\n", "SELECT PARSE_URL(LOWER(el_to),'HOST') AS host, COUNT(DISTINCT el_from, el_to) AS count \n", "FROM ryanmax.population_externallinks\n", "WHERE el_from IN \n", " (SELECT DISTINCT page_id AS el_from\n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "AND LOWER(el_to) RLIKE '^https?://web.archive.org/web/'\n", "AND PARSE_URL(LOWER(el_to),'HOST') = 'web.archive.org'\n", "GROUP BY host\n", "\"\"\"\n", "archive_org_wpm = spark.sql(archive_org_wpm_query)\n", "archive_org_wpm.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### web.archive.org links found in W pages with external links" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hostcount
0web.archive.org4416436
\n", "
" ], "text/plain": [ " host count\n", "0 web.archive.org 4416436" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of web.archive.org links in W pages with external links\n", "archive_org_w_query = \"\"\"\n", "SELECT PARSE_URL(LOWER(el_to),'HOST') AS host, COUNT(DISTINCT el_from, el_to) AS count \n", "FROM ryanmax.population_externallinks\n", "WHERE el_from IN \n", " (SELECT DISTINCT page_id AS el_from\n", " FROM ryanmax.population_w_pages_with_extlinks\n", " )\n", "AND LOWER(el_to) RLIKE '^https?://web.archive.org/web/'\n", "AND PARSE_URL(LOWER(el_to),'HOST') = 'web.archive.org'\n", "GROUP BY host\n", "\"\"\"\n", "archive_org_w = spark.sql(archive_org_w_query)\n", "archive_org_w.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Event Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### all event types" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames\n", "- limited to WP:M pages with external links" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hosttotal_eventsdaily_average_events
0en.wikipedia.org186068658146.43750
1en.m.wikipedia.org62730919603.40625
2www.ncbi.nlm.nih.gov864322701.00000
3doi.org749302341.56250
4books.google.com19729616.53125
5www.drugs.com16639519.96875
6www.who.int15105472.03125
7translate.googleusercontent.com10775336.71875
8theranos.com10660333.12500
9www.cdc.gov10119316.21875
10chemapps.stolaf.edu7613237.90625
11apps.who.int6384199.50000
12www.nlm.nih.gov6291196.59375
13www.fda.gov5101159.40625
14www.nytimes.com5005156.40625
15www.commonchemistry.org4604143.87500
16www.accessdata.fda.gov4352136.00000
17www.youtube.com4154129.81250
18emedicine.medscape.com3791118.46875
19pubchem.ncbi.nlm.nih.gov3469108.40625
\n", "
" ], "text/plain": [ " host total_events daily_average_events\n", "0 en.wikipedia.org 1860686 58146.43750\n", "1 en.m.wikipedia.org 627309 19603.40625\n", "2 www.ncbi.nlm.nih.gov 86432 2701.00000\n", "3 doi.org 74930 2341.56250\n", "4 books.google.com 19729 616.53125\n", "5 www.drugs.com 16639 519.96875\n", "6 www.who.int 15105 472.03125\n", "7 translate.googleusercontent.com 10775 336.71875\n", "8 theranos.com 10660 333.12500\n", "9 www.cdc.gov 10119 316.21875\n", "10 chemapps.stolaf.edu 7613 237.90625\n", "11 apps.who.int 6384 199.50000\n", "12 www.nlm.nih.gov 6291 196.59375\n", "13 www.fda.gov 5101 159.40625\n", "14 www.nytimes.com 5005 156.40625\n", "15 www.commonchemistry.org 4604 143.87500\n", "16 www.accessdata.fda.gov 4352 136.00000\n", "17 www.youtube.com 4154 129.81250\n", "18 emedicine.medscape.com 3791 118.46875\n", "19 pubchem.ncbi.nlm.nih.gov 3469 108.40625" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames\n", "# limited to WP:M pages with external links\n", "top_hosts_wpm_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_wpm_events = spark.sql(\n", " top_hosts_wpm_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_wpm_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames\n", "- limited to W pages with external links" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hosttotal_eventsdaily_average_events
0en.wikipedia.org423340091.322938e+06
1en.m.wikipedia.org176368145.511504e+05
2www.imdb.com30506949.533419e+04
3tools.wmflabs.org10641533.325478e+04
4www.youtube.com6014331.879478e+04
5books.google.com5732461.791394e+04
6www.espncricinfo.com3799191.187247e+04
7doi.org3507141.095981e+04
8www.nytimes.com3015689.424000e+03
9www.bbc.co.uk2275407.110625e+03
10www.theguardian.com2258157.056719e+03
11translate.googleusercontent.com1848585.776812e+03
12www.instagram.com1829445.717000e+03
13twitter.com1809945.656062e+03
14www.google.com1669365.216750e+03
15archive.org1621745.067938e+03
16www.ncbi.nlm.nih.gov1571714.911594e+03
17www.allmusic.com1556154.862969e+03
18www.rottentomatoes.com1407174.397406e+03
19www.biblegateway.com1360314.250969e+03
\n", "
" ], "text/plain": [ " host total_events daily_average_events\n", "0 en.wikipedia.org 42334009 1.322938e+06\n", "1 en.m.wikipedia.org 17636814 5.511504e+05\n", "2 www.imdb.com 3050694 9.533419e+04\n", "3 tools.wmflabs.org 1064153 3.325478e+04\n", "4 www.youtube.com 601433 1.879478e+04\n", "5 books.google.com 573246 1.791394e+04\n", "6 www.espncricinfo.com 379919 1.187247e+04\n", "7 doi.org 350714 1.095981e+04\n", "8 www.nytimes.com 301568 9.424000e+03\n", "9 www.bbc.co.uk 227540 7.110625e+03\n", "10 www.theguardian.com 225815 7.056719e+03\n", "11 translate.googleusercontent.com 184858 5.776812e+03\n", "12 www.instagram.com 182944 5.717000e+03\n", "13 twitter.com 180994 5.656062e+03\n", "14 www.google.com 166936 5.216750e+03\n", "15 archive.org 162174 5.067938e+03\n", "16 www.ncbi.nlm.nih.gov 157171 4.911594e+03\n", "17 www.allmusic.com 155615 4.862969e+03\n", "18 www.rottentomatoes.com 140717 4.397406e+03\n", "19 www.biblegateway.com 136031 4.250969e+03" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames by event type\n", "# limited to W pages with external links\n", "top_hosts_w_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_w_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_w_events = spark.sql(\n", " top_hosts_w_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_w_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### extClick events" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames\n", "- limited to WP:M pages with external links\n", "- further limited to extClick events" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hosttotal_eventsdaily_average_events
0www.ncbi.nlm.nih.gov864322701.00000
1doi.org749302341.56250
2books.google.com19725616.40625
3www.drugs.com16639519.96875
4www.who.int15105472.03125
5theranos.com10660333.12500
6www.cdc.gov10119316.21875
7chemapps.stolaf.edu7613237.90625
8apps.who.int6384199.50000
9www.nlm.nih.gov6291196.59375
10www.fda.gov5101159.40625
11www.nytimes.com5005156.40625
12www.commonchemistry.org4604143.87500
13www.accessdata.fda.gov4352136.00000
14www.youtube.com4154129.81250
15emedicine.medscape.com3791118.46875
16translate.googleusercontent.com3553111.03125
17pubchem.ncbi.nlm.nih.gov3469108.40625
18www.cancer.gov3225100.78125
19www.drugbank.ca264882.75000
\n", "
" ], "text/plain": [ " host total_events daily_average_events\n", "0 www.ncbi.nlm.nih.gov 86432 2701.00000\n", "1 doi.org 74930 2341.56250\n", "2 books.google.com 19725 616.40625\n", "3 www.drugs.com 16639 519.96875\n", "4 www.who.int 15105 472.03125\n", "5 theranos.com 10660 333.12500\n", "6 www.cdc.gov 10119 316.21875\n", "7 chemapps.stolaf.edu 7613 237.90625\n", "8 apps.who.int 6384 199.50000\n", "9 www.nlm.nih.gov 6291 196.59375\n", "10 www.fda.gov 5101 159.40625\n", "11 www.nytimes.com 5005 156.40625\n", "12 www.commonchemistry.org 4604 143.87500\n", "13 www.accessdata.fda.gov 4352 136.00000\n", "14 www.youtube.com 4154 129.81250\n", "15 emedicine.medscape.com 3791 118.46875\n", "16 translate.googleusercontent.com 3553 111.03125\n", "17 pubchem.ncbi.nlm.nih.gov 3469 108.40625\n", "18 www.cancer.gov 3225 100.78125\n", "19 www.drugbank.ca 2648 82.75000" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames\n", "# limited to WP:M pages with external links\n", "# further limited to extClick events\n", "top_hosts_wpm_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND action = 'extClick'\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_wpm_events = spark.sql(\n", " top_hosts_wpm_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_wpm_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames by event type\n", "- limited to W pages with external links\n", "- further limited to extClick events" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hosttotal_eventsdaily_average_events
0www.imdb.com305069495334.18750
1tools.wmflabs.org106415333254.78125
2www.youtube.com60143318794.78125
3books.google.com57306217908.18750
4www.espncricinfo.com37991911872.46875
5doi.org35071410959.81250
6www.nytimes.com3015689424.00000
7www.bbc.co.uk2275407110.62500
8www.theguardian.com2258157056.71875
9www.instagram.com1829445717.00000
10twitter.com1809945656.06250
11www.google.com1669365216.75000
12archive.org1621535067.28125
13www.ncbi.nlm.nih.gov1571714911.59375
14www.allmusic.com1556154862.96875
15www.rottentomatoes.com1407174397.40625
16www.findagrave.com1300214063.15625
17news.bbc.co.uk1291894037.15625
18www.telegraph.co.uk1153663605.18750
19www.facebook.com1118133494.15625
\n", "
" ], "text/plain": [ " host total_events daily_average_events\n", "0 www.imdb.com 3050694 95334.18750\n", "1 tools.wmflabs.org 1064153 33254.78125\n", "2 www.youtube.com 601433 18794.78125\n", "3 books.google.com 573062 17908.18750\n", "4 www.espncricinfo.com 379919 11872.46875\n", "5 doi.org 350714 10959.81250\n", "6 www.nytimes.com 301568 9424.00000\n", "7 www.bbc.co.uk 227540 7110.62500\n", "8 www.theguardian.com 225815 7056.71875\n", "9 www.instagram.com 182944 5717.00000\n", "10 twitter.com 180994 5656.06250\n", "11 www.google.com 166936 5216.75000\n", "12 archive.org 162153 5067.28125\n", "13 www.ncbi.nlm.nih.gov 157171 4911.59375\n", "14 www.allmusic.com 155615 4862.96875\n", "15 www.rottentomatoes.com 140717 4397.40625\n", "16 www.findagrave.com 130021 4063.15625\n", "17 news.bbc.co.uk 129189 4037.15625\n", "18 www.telegraph.co.uk 115366 3605.18750\n", "19 www.facebook.com 111813 3494.15625" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames by event type\n", "# limited to W pages with external links\n", "# further limited to extClick events\n", "top_hosts_w_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND action = 'extClick'\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_w_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_w_events = spark.sql(\n", " top_hosts_w_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_w_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### limited to fnHover events" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames\n", "- limited to WP:M pages with external links\n", "- further limited to fnHover events" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hosttotal_eventsdaily_average_events
0en.wikipedia.org146505745783.03125
1en.m.wikipedia.org898212806.90625
2translate.googleusercontent.com4032126.00000
3None69821.81250
4www.translatoruser-int.com1494.65625
5papago.naver.net993.09375
6www.biblegateway.com391.21875
7z5h64q92x9.net210.65625
8scholar.google.com170.53125
9emedien3.sub.uni-hamburg.de120.37500
10en.wikipedi0.org50.15625
11translate.baiducontent.com40.12500
12en.m.wikipedi0.org30.09375
13www.gpedia.com20.06250
14en.wikipedia.org.20.06250
15books.google.com10.03125
16speechpanel.readspeaker.com10.03125
17nl.wikiludia.com10.03125
18en.wk.jsproxy.tk10.03125
\n", "
" ], "text/plain": [ " host total_events daily_average_events\n", "0 en.wikipedia.org 1465057 45783.03125\n", "1 en.m.wikipedia.org 89821 2806.90625\n", "2 translate.googleusercontent.com 4032 126.00000\n", "3 None 698 21.81250\n", "4 www.translatoruser-int.com 149 4.65625\n", "5 papago.naver.net 99 3.09375\n", "6 www.biblegateway.com 39 1.21875\n", "7 z5h64q92x9.net 21 0.65625\n", "8 scholar.google.com 17 0.53125\n", "9 emedien3.sub.uni-hamburg.de 12 0.37500\n", "10 en.wikipedi0.org 5 0.15625\n", "11 translate.baiducontent.com 4 0.12500\n", "12 en.m.wikipedi0.org 3 0.09375\n", "13 www.gpedia.com 2 0.06250\n", "14 en.wikipedia.org. 2 0.06250\n", "15 books.google.com 1 0.03125\n", "16 speechpanel.readspeaker.com 1 0.03125\n", "17 nl.wikiludia.com 1 0.03125\n", "18 en.wk.jsproxy.tk 1 0.03125" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames\n", "# limited to WP:M pages with external links\n", "# further limited to fnHover events\n", "top_hosts_wpm_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND action = 'fnHover'\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_wpm_events = spark.sql(\n", " top_hosts_wpm_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_wpm_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames by event type\n", "- limited to W pages with external links\n", "- further limited to fnHover events" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hosttotal_eventsdaily_average_events
0en.wikipedia.org338094671.056546e+06
1en.m.wikipedia.org20152846.297762e+04
2translate.googleusercontent.com537141.678562e+03
3None272248.507500e+02
4www.biblegateway.com161455.045312e+02
5www.translatoruser-int.com20636.446875e+01
6papago.naver.net12503.906250e+01
7z5h64q92x9.net3271.021875e+01
8en.wikipedi0.org2467.687500e+00
9emedien3.sub.uni-hamburg.de1695.281250e+00
10books.google.com1043.250000e+00
11translate.baiducontent.com1023.187500e+00
12www.gutenberg.org802.500000e+00
13fsfe.org321.000000e+00
14en.wikipedia.org.299.062500e-01
15wn.rsarchive.org288.750000e-01
16en.wk.jsproxy.tk288.750000e-01
17en.wikiludia.com268.125000e-01
18en.wikipedia.nom.rs134.062500e-01
19www.gpedia.com123.750000e-01
\n", "
" ], "text/plain": [ " host total_events daily_average_events\n", "0 en.wikipedia.org 33809467 1.056546e+06\n", "1 en.m.wikipedia.org 2015284 6.297762e+04\n", "2 translate.googleusercontent.com 53714 1.678562e+03\n", "3 None 27224 8.507500e+02\n", "4 www.biblegateway.com 16145 5.045312e+02\n", "5 www.translatoruser-int.com 2063 6.446875e+01\n", "6 papago.naver.net 1250 3.906250e+01\n", "7 z5h64q92x9.net 327 1.021875e+01\n", "8 en.wikipedi0.org 246 7.687500e+00\n", "9 emedien3.sub.uni-hamburg.de 169 5.281250e+00\n", "10 books.google.com 104 3.250000e+00\n", "11 translate.baiducontent.com 102 3.187500e+00\n", "12 www.gutenberg.org 80 2.500000e+00\n", "13 fsfe.org 32 1.000000e+00\n", "14 en.wikipedia.org. 29 9.062500e-01\n", "15 wn.rsarchive.org 28 8.750000e-01\n", "16 en.wk.jsproxy.tk 28 8.750000e-01\n", "17 en.wikiludia.com 26 8.125000e-01\n", "18 en.wikipedia.nom.rs 13 4.062500e-01\n", "19 www.gpedia.com 12 3.750000e-01" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames by event type\n", "# limited to W pages with external links\n", "# further limited to fnHover events\n", "top_hosts_w_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND action = 'fnHover'\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_w_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_w_events = spark.sql(\n", " top_hosts_w_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_w_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### limited to extClick, fnClick OR upClick events" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames\n", "- limited to WP:M pages with external links\n", "- further limited to extClick, fnClick OR upClick events" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hosttotal_eventsdaily_average_events
0en.m.wikipedia.org53748816796.50000
1en.wikipedia.org39562912363.40625
2www.ncbi.nlm.nih.gov864322701.00000
3doi.org749302341.56250
4books.google.com19728616.50000
5www.drugs.com16639519.96875
6www.who.int15105472.03125
7theranos.com10660333.12500
8www.cdc.gov10119316.21875
9chemapps.stolaf.edu7613237.90625
10translate.googleusercontent.com6743210.71875
11apps.who.int6384199.50000
12www.nlm.nih.gov6291196.59375
13www.fda.gov5101159.40625
14www.nytimes.com5005156.40625
15www.commonchemistry.org4604143.87500
16www.accessdata.fda.gov4352136.00000
17www.youtube.com4154129.81250
18emedicine.medscape.com3791118.46875
19pubchem.ncbi.nlm.nih.gov3469108.40625
\n", "
" ], "text/plain": [ " host total_events daily_average_events\n", "0 en.m.wikipedia.org 537488 16796.50000\n", "1 en.wikipedia.org 395629 12363.40625\n", "2 www.ncbi.nlm.nih.gov 86432 2701.00000\n", "3 doi.org 74930 2341.56250\n", "4 books.google.com 19728 616.50000\n", "5 www.drugs.com 16639 519.96875\n", "6 www.who.int 15105 472.03125\n", "7 theranos.com 10660 333.12500\n", "8 www.cdc.gov 10119 316.21875\n", "9 chemapps.stolaf.edu 7613 237.90625\n", "10 translate.googleusercontent.com 6743 210.71875\n", "11 apps.who.int 6384 199.50000\n", "12 www.nlm.nih.gov 6291 196.59375\n", "13 www.fda.gov 5101 159.40625\n", "14 www.nytimes.com 5005 156.40625\n", "15 www.commonchemistry.org 4604 143.87500\n", "16 www.accessdata.fda.gov 4352 136.00000\n", "17 www.youtube.com 4154 129.81250\n", "18 emedicine.medscape.com 3791 118.46875\n", "19 pubchem.ncbi.nlm.nih.gov 3469 108.40625" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames\n", "# limited to WP:M pages with external links\n", "# further limited to extClick, fnClick OR upClick events\n", "top_hosts_wpm_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND action in ('extClick', 'upClick','fnClick')\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_wpm_events = spark.sql(\n", " top_hosts_wpm_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_wpm_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames by event type\n", "- limited to W pages with external links\n", "- further limited to extClick, fnClick OR upClick events" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hosttotal_eventsdaily_average_events
0en.m.wikipedia.org15621530488172.81250
1en.wikipedia.org8524542266391.93750
2www.imdb.com305069495334.18750
3tools.wmflabs.org106415333254.78125
4www.youtube.com60143318794.78125
5books.google.com57314217910.68750
6www.espncricinfo.com37991911872.46875
7doi.org35071410959.81250
8www.nytimes.com3015689424.00000
9www.bbc.co.uk2275407110.62500
10www.theguardian.com2258157056.71875
11www.instagram.com1829445717.00000
12twitter.com1809945656.06250
13www.google.com1669365216.75000
14archive.org1621645067.62500
15www.ncbi.nlm.nih.gov1571714911.59375
16www.allmusic.com1556154862.96875
17www.rottentomatoes.com1407174397.40625
18translate.googleusercontent.com1311444098.25000
19www.findagrave.com1300214063.15625
\n", "
" ], "text/plain": [ " host total_events daily_average_events\n", "0 en.m.wikipedia.org 15621530 488172.81250\n", "1 en.wikipedia.org 8524542 266391.93750\n", "2 www.imdb.com 3050694 95334.18750\n", "3 tools.wmflabs.org 1064153 33254.78125\n", "4 www.youtube.com 601433 18794.78125\n", "5 books.google.com 573142 17910.68750\n", "6 www.espncricinfo.com 379919 11872.46875\n", "7 doi.org 350714 10959.81250\n", "8 www.nytimes.com 301568 9424.00000\n", "9 www.bbc.co.uk 227540 7110.62500\n", "10 www.theguardian.com 225815 7056.71875\n", "11 www.instagram.com 182944 5717.00000\n", "12 twitter.com 180994 5656.06250\n", "13 www.google.com 166936 5216.75000\n", "14 archive.org 162164 5067.62500\n", "15 www.ncbi.nlm.nih.gov 157171 4911.59375\n", "16 www.allmusic.com 155615 4862.96875\n", "17 www.rottentomatoes.com 140717 4397.40625\n", "18 translate.googleusercontent.com 131144 4098.25000\n", "19 www.findagrave.com 130021 4063.15625" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames by event type\n", "# limited to W pages with external links\n", "# further limited to extClick, fnClick OR upClick events\n", "top_hosts_w_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND action in ('extClick', 'upClick','fnClick')\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_w_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_w_events = spark.sql(\n", " top_hosts_w_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_w_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top wikipedia.org links by event type" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
actionextClickfnClickfnHoverupClick
host
ar.wikipedia.org276.0NaNNaNNaN
de.wikipedia.org1798.0NaNNaNNaN
en.m.wikipedia.orgNaN16044275.02105104.0114695.0
en.wikipedia.org262.07875877.035274475.01043977.0
eo.wikipedia.org280.0NaNNaNNaN
es.wikipedia.org785.0NaNNaNNaN
fr.wikipedia.org1804.0NaNNaNNaN
it.wikipedia.org646.0NaNNaNNaN
ja.wikipedia.org1973.0NaNNaNNaN
nl.wikipedia.org1062.0NaNNaNNaN
pt.wikipedia.org383.0NaNNaNNaN
ru.wikipedia.org1092.0NaNNaNNaN
simple.wikipedia.org563.0NaNNaNNaN
www.wikipedia.org941.0NaNNaNNaN
zh.wikipedia.org1171.0NaNNaNNaN
\n", "
" ], "text/plain": [ "action extClick fnClick fnHover upClick\n", "host \n", "ar.wikipedia.org 276.0 NaN NaN NaN\n", "de.wikipedia.org 1798.0 NaN NaN NaN\n", "en.m.wikipedia.org NaN 16044275.0 2105104.0 114695.0\n", "en.wikipedia.org 262.0 7875877.0 35274475.0 1043977.0\n", "eo.wikipedia.org 280.0 NaN NaN NaN\n", "es.wikipedia.org 785.0 NaN NaN NaN\n", "fr.wikipedia.org 1804.0 NaN NaN NaN\n", "it.wikipedia.org 646.0 NaN NaN NaN\n", "ja.wikipedia.org 1973.0 NaN NaN NaN\n", "nl.wikipedia.org 1062.0 NaN NaN NaN\n", "pt.wikipedia.org 383.0 NaN NaN NaN\n", "ru.wikipedia.org 1092.0 NaN NaN NaN\n", "simple.wikipedia.org 563.0 NaN NaN NaN\n", "www.wikipedia.org 941.0 NaN NaN NaN\n", "zh.wikipedia.org 1171.0 NaN NaN NaN" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# wikipedia.org links by event type\n", "top_wiki_hosts_query = \"\"\"\n", "SELECT {} AS host, action, COUNT(*) AS count \n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND lower(parse_url(link_url,'HOST')) like '%wikipedia.org'\n", "AND (\n", " page_id IN \n", " (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)\n", " OR page_id IN\n", " (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host, action\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_wiki_hosts_events = spark.sql(\n", " top_wiki_hosts_query.format(\n", " parse_host.format('link_url'),\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_wiki_hosts_events_rdd = top_wiki_hosts_events.rdd\n", "top_wiki_hosts_events_df = sqlContext.createDataFrame(top_wiki_hosts_events_rdd)\n", "top_wiki_hosts_events_pandas = top_wiki_hosts_events_df.toPandas()\n", "\n", "top_wiki_hosts_events_pandas.pivot(index='host', columns='action', values='count')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### web.archive.org links by event type for W:PM pages withn external links" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
actionextClickfnClick
host
web.archive.org518983
\n", "
" ], "text/plain": [ "action extClick fnClick\n", "host \n", "web.archive.org 51898 3" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# web.archive.org links by event type for W:PM pages withn external links\n", "archive_org_wpm_query = \"\"\"\n", "SELECT PARSE_URL(LOWER(link_url),'HOST') AS host, action, COUNT(*) AS count \n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND LOWER(link_url) RLIKE '^https?://web.archive.org/web/'\n", "AND PARSE_URL(LOWER(link_url),'HOST') = 'web.archive.org'\n", "AND page_id IN\n", " (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host, action\n", "\"\"\"\n", "archive_org_wpm_events = spark.sql(\n", " archive_org_wpm_query.format(\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "archive_org_wpm_events_rdd = archive_org_wpm_events.rdd\n", "archive_org_wpm_events_df = sqlContext.createDataFrame(archive_org_wpm_events_rdd)\n", "archive_org_wpm_events_pandas = archive_org_wpm_events_df.toPandas()\n", "\n", "archive_org_wpm_events_pandas.pivot(index='host', columns='action', values='count')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### web.archive.org links by event type for W pages withn external links" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
actionextClickfnClickfnHoverupClick
host
web.archive.org196302331571
\n", "
" ], "text/plain": [ "action extClick fnClick fnHover upClick\n", "host \n", "web.archive.org 1963023 31 57 1" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# web.archive.org links by event type for W pages withn external links\n", "archive_org_w_query = \"\"\"\n", "SELECT PARSE_URL(LOWER(link_url),'HOST') AS host, action, COUNT(*) AS count \n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND LOWER(link_url) RLIKE '^https?://web.archive.org/web/'\n", "AND PARSE_URL(LOWER(link_url),'HOST') = 'web.archive.org'\n", "AND page_id IN\n", " (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host, action\n", "\"\"\"\n", "archive_org_w_events = spark.sql(\n", " archive_org_w_query.format(\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "archive_org_w_events_rdd = archive_org_w_events.rdd\n", "archive_org_w_events_df = sqlContext.createDataFrame(archive_org_w_events_rdd)\n", "archive_org_w_events_pandas = archive_org_w_events_df.toPandas()\n", "\n", "archive_org_w_events_pandas.pivot(index='host', columns='action', values='count')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark - YARN", "language": "python", "name": "spark_yarn_pyspark" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }