{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Top Hostnames\n", "- report externallinks hostname counts for W and WP:M pages\n", "- report various hostname event data for W and WP:M pages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n", "%run -i 'data-defaults.py'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# parse hostnames from links\n", "# includes extraction for web.archive.org links\n", "parse_host = \"\"\"\n", " PARSE_URL(REGEXP_REPLACE(LOWER({}),'^https?://web.archive.org/web/[^/]+/',''),'HOST')\n", "\"\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Hostnames present in pages with external links" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Top 20 hostnames found in WP:M pages with external links" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>host</th>\n", " <th>count</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>www.ncbi.nlm.nih.gov</td>\n", " <td>238688</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>doi.org</td>\n", " <td>183761</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>www.worldcat.org</td>\n", " <td>25434</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>books.google.com</td>\n", " <td>24491</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>www.google.com</td>\n", " <td>11074</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>apps.who.int</td>\n", " <td>12816</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>www.icd9data.com</td>\n", " <td>8693</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>www.wikidata.org</td>\n", " <td>8069</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>amigo.geneontology.org</td>\n", " <td>6909</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>www.nlm.nih.gov</td>\n", " <td>6420</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>www.cdc.gov</td>\n", " <td>6285</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>www.jstor.org</td>\n", " <td>4302</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>scholar.google.com</td>\n", " <td>2850</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>www.who.int</td>\n", " <td>5350</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>www.nytimes.com</td>\n", " <td>4545</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>www.omim.org</td>\n", " <td>4415</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>adsabs.harvard.edu</td>\n", " <td>4255</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>www.emedicine.com</td>\n", " <td>3477</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>www.diseasesdatabase.com</td>\n", " <td>3105</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>id.loc.gov</td>\n", " <td>3008</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " host count\n", "0 www.ncbi.nlm.nih.gov 238688\n", "1 doi.org 183761\n", "2 www.worldcat.org 25434\n", "3 books.google.com 24491\n", "4 www.google.com 11074\n", "5 apps.who.int 12816\n", "6 www.icd9data.com 8693\n", "7 www.wikidata.org 8069\n", "8 amigo.geneontology.org 6909\n", "9 www.nlm.nih.gov 6420\n", "10 www.cdc.gov 6285\n", "11 www.jstor.org 4302\n", "12 scholar.google.com 2850\n", "13 www.who.int 5350\n", "14 www.nytimes.com 4545\n", "15 www.omim.org 4415\n", "16 adsabs.harvard.edu 4255\n", "17 www.emedicine.com 3477\n", "18 www.diseasesdatabase.com 3105\n", "19 id.loc.gov 3008" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of top 20 hostnames in WP:M pages with external links\n", "top_hosts_wpm_query = \"\"\"\n", "SELECT {} AS host, COUNT(DISTINCT el_from, el_to) AS count \n", "FROM ryanmax.population_externallinks\n", "WHERE el_from IN \n", " (SELECT DISTINCT page_id AS el_from\n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_wpm = spark.sql(\n", " top_hosts_wpm_query.format(\n", " parse_host.format('el_to')\n", " ))\n", "top_hosts_wpm.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Top 20 hostnames found in W pages with external links" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>host</th>\n", " <th>count</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>tools.wmflabs.org</td>\n", " <td>2118863</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>www.google.com</td>\n", " <td>1763467</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>doi.org</td>\n", " <td>1199818</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>www.ncbi.nlm.nih.gov</td>\n", " <td>1122007</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>www.worldcat.org</td>\n", " <td>1061459</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>books.google.com</td>\n", " <td>1231622</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>scholar.google.com</td>\n", " <td>437109</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>www.wikidata.org</td>\n", " <td>805309</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>www.jstor.org</td>\n", " <td>548764</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>ssd.jpl.nasa.gov</td>\n", " <td>592901</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>minorplanetcenter.net</td>\n", " <td>556758</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>viaf.org</td>\n", " <td>535724</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>id.loc.gov</td>\n", " <td>450557</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>www.imdb.com</td>\n", " <td>418623</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>isni.org</td>\n", " <td>413584</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>www.gbif.org</td>\n", " <td>351989</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>www.nytimes.com</td>\n", " <td>343951</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>www.youtube.com</td>\n", " <td>330816</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>eol.org</td>\n", " <td>304728</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>www.webcitation.org</td>\n", " <td>301718</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " host count\n", "0 tools.wmflabs.org 2118863\n", "1 www.google.com 1763467\n", "2 doi.org 1199818\n", "3 www.ncbi.nlm.nih.gov 1122007\n", "4 www.worldcat.org 1061459\n", "5 books.google.com 1231622\n", "6 scholar.google.com 437109\n", "7 www.wikidata.org 805309\n", "8 www.jstor.org 548764\n", "9 ssd.jpl.nasa.gov 592901\n", "10 minorplanetcenter.net 556758\n", "11 viaf.org 535724\n", "12 id.loc.gov 450557\n", "13 www.imdb.com 418623\n", "14 isni.org 413584\n", "15 www.gbif.org 351989\n", "16 www.nytimes.com 343951\n", "17 www.youtube.com 330816\n", "18 eol.org 304728\n", "19 www.webcitation.org 301718" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of top 20 hostnames in WP:M pages with external links\n", "top_hosts_w_query = \"\"\"\n", "SELECT {} AS host, COUNT(DISTINCT el_from, el_to) AS count \n", "FROM ryanmax.population_externallinks\n", "WHERE el_from IN \n", " (SELECT DISTINCT page_id AS el_from\n", " FROM ryanmax.population_w_pages_with_extlinks\n", " )\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_w = spark.sql(\n", " top_hosts_w_query.format(\n", " parse_host.format('el_to')\n", " ))\n", "top_hosts_w.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### web.archive.org links found in WP:M pages with external links" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>host</th>\n", " <th>count</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>web.archive.org</td>\n", " <td>50390</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " host count\n", "0 web.archive.org 50390" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of web.archive.org links in WP:M pages with external links\n", "archive_org_wpm_query = \"\"\"\n", "SELECT PARSE_URL(LOWER(el_to),'HOST') AS host, COUNT(DISTINCT el_from, el_to) AS count \n", "FROM ryanmax.population_externallinks\n", "WHERE el_from IN \n", " (SELECT DISTINCT page_id AS el_from\n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "AND LOWER(el_to) RLIKE '^https?://web.archive.org/web/'\n", "AND PARSE_URL(LOWER(el_to),'HOST') = 'web.archive.org'\n", "GROUP BY host\n", "\"\"\"\n", "archive_org_wpm = spark.sql(archive_org_wpm_query)\n", "archive_org_wpm.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### web.archive.org links found in W pages with external links" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>host</th>\n", " <th>count</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>web.archive.org</td>\n", " <td>4416436</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " host count\n", "0 web.archive.org 4416436" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of web.archive.org links in W pages with external links\n", "archive_org_w_query = \"\"\"\n", "SELECT PARSE_URL(LOWER(el_to),'HOST') AS host, COUNT(DISTINCT el_from, el_to) AS count \n", "FROM ryanmax.population_externallinks\n", "WHERE el_from IN \n", " (SELECT DISTINCT page_id AS el_from\n", " FROM ryanmax.population_w_pages_with_extlinks\n", " )\n", "AND LOWER(el_to) RLIKE '^https?://web.archive.org/web/'\n", "AND PARSE_URL(LOWER(el_to),'HOST') = 'web.archive.org'\n", "GROUP BY host\n", "\"\"\"\n", "archive_org_w = spark.sql(archive_org_w_query)\n", "archive_org_w.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Event Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### all event types" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames\n", "- limited to WP:M pages with external links" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>host</th>\n", " <th>total_events</th>\n", " <th>daily_average_events</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>en.wikipedia.org</td>\n", " <td>1860686</td>\n", " <td>58146.43750</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>en.m.wikipedia.org</td>\n", " <td>627309</td>\n", " <td>19603.40625</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>www.ncbi.nlm.nih.gov</td>\n", " <td>86432</td>\n", " <td>2701.00000</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>doi.org</td>\n", " <td>74930</td>\n", " <td>2341.56250</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>books.google.com</td>\n", " <td>19729</td>\n", " <td>616.53125</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>www.drugs.com</td>\n", " <td>16639</td>\n", " <td>519.96875</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>www.who.int</td>\n", " <td>15105</td>\n", " <td>472.03125</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>translate.googleusercontent.com</td>\n", " <td>10775</td>\n", " <td>336.71875</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>theranos.com</td>\n", " <td>10660</td>\n", " <td>333.12500</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>www.cdc.gov</td>\n", " <td>10119</td>\n", " <td>316.21875</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>chemapps.stolaf.edu</td>\n", " <td>7613</td>\n", " <td>237.90625</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>apps.who.int</td>\n", " <td>6384</td>\n", " <td>199.50000</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>www.nlm.nih.gov</td>\n", " <td>6291</td>\n", " <td>196.59375</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>www.fda.gov</td>\n", " <td>5101</td>\n", " <td>159.40625</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>www.nytimes.com</td>\n", " <td>5005</td>\n", " <td>156.40625</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>www.commonchemistry.org</td>\n", " <td>4604</td>\n", " <td>143.87500</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>www.accessdata.fda.gov</td>\n", " <td>4352</td>\n", " <td>136.00000</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>www.youtube.com</td>\n", " <td>4154</td>\n", " <td>129.81250</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>emedicine.medscape.com</td>\n", " <td>3791</td>\n", " <td>118.46875</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>pubchem.ncbi.nlm.nih.gov</td>\n", " <td>3469</td>\n", " <td>108.40625</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " host total_events daily_average_events\n", "0 en.wikipedia.org 1860686 58146.43750\n", "1 en.m.wikipedia.org 627309 19603.40625\n", "2 www.ncbi.nlm.nih.gov 86432 2701.00000\n", "3 doi.org 74930 2341.56250\n", "4 books.google.com 19729 616.53125\n", "5 www.drugs.com 16639 519.96875\n", "6 www.who.int 15105 472.03125\n", "7 translate.googleusercontent.com 10775 336.71875\n", "8 theranos.com 10660 333.12500\n", "9 www.cdc.gov 10119 316.21875\n", "10 chemapps.stolaf.edu 7613 237.90625\n", "11 apps.who.int 6384 199.50000\n", "12 www.nlm.nih.gov 6291 196.59375\n", "13 www.fda.gov 5101 159.40625\n", "14 www.nytimes.com 5005 156.40625\n", "15 www.commonchemistry.org 4604 143.87500\n", "16 www.accessdata.fda.gov 4352 136.00000\n", "17 www.youtube.com 4154 129.81250\n", "18 emedicine.medscape.com 3791 118.46875\n", "19 pubchem.ncbi.nlm.nih.gov 3469 108.40625" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames\n", "# limited to WP:M pages with external links\n", "top_hosts_wpm_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_wpm_events = spark.sql(\n", " top_hosts_wpm_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_wpm_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames\n", "- limited to W pages with external links" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>host</th>\n", " <th>total_events</th>\n", " <th>daily_average_events</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>en.wikipedia.org</td>\n", " <td>42334009</td>\n", " <td>1.322938e+06</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>en.m.wikipedia.org</td>\n", " <td>17636814</td>\n", " <td>5.511504e+05</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>www.imdb.com</td>\n", " <td>3050694</td>\n", " <td>9.533419e+04</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>tools.wmflabs.org</td>\n", " <td>1064153</td>\n", " <td>3.325478e+04</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>www.youtube.com</td>\n", " <td>601433</td>\n", " <td>1.879478e+04</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>books.google.com</td>\n", " <td>573246</td>\n", " <td>1.791394e+04</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>www.espncricinfo.com</td>\n", " <td>379919</td>\n", " <td>1.187247e+04</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>doi.org</td>\n", " <td>350714</td>\n", " <td>1.095981e+04</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>www.nytimes.com</td>\n", " <td>301568</td>\n", " <td>9.424000e+03</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>www.bbc.co.uk</td>\n", " <td>227540</td>\n", " <td>7.110625e+03</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>www.theguardian.com</td>\n", " <td>225815</td>\n", " <td>7.056719e+03</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>translate.googleusercontent.com</td>\n", " <td>184858</td>\n", " <td>5.776812e+03</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>www.instagram.com</td>\n", " <td>182944</td>\n", " <td>5.717000e+03</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>twitter.com</td>\n", " <td>180994</td>\n", " <td>5.656062e+03</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>www.google.com</td>\n", " <td>166936</td>\n", " <td>5.216750e+03</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>archive.org</td>\n", " <td>162174</td>\n", " <td>5.067938e+03</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>www.ncbi.nlm.nih.gov</td>\n", " <td>157171</td>\n", " <td>4.911594e+03</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>www.allmusic.com</td>\n", " <td>155615</td>\n", " <td>4.862969e+03</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>www.rottentomatoes.com</td>\n", " <td>140717</td>\n", " <td>4.397406e+03</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>www.biblegateway.com</td>\n", " <td>136031</td>\n", " <td>4.250969e+03</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " host total_events daily_average_events\n", "0 en.wikipedia.org 42334009 1.322938e+06\n", "1 en.m.wikipedia.org 17636814 5.511504e+05\n", "2 www.imdb.com 3050694 9.533419e+04\n", "3 tools.wmflabs.org 1064153 3.325478e+04\n", "4 www.youtube.com 601433 1.879478e+04\n", "5 books.google.com 573246 1.791394e+04\n", "6 www.espncricinfo.com 379919 1.187247e+04\n", "7 doi.org 350714 1.095981e+04\n", "8 www.nytimes.com 301568 9.424000e+03\n", "9 www.bbc.co.uk 227540 7.110625e+03\n", "10 www.theguardian.com 225815 7.056719e+03\n", "11 translate.googleusercontent.com 184858 5.776812e+03\n", "12 www.instagram.com 182944 5.717000e+03\n", "13 twitter.com 180994 5.656062e+03\n", "14 www.google.com 166936 5.216750e+03\n", "15 archive.org 162174 5.067938e+03\n", "16 www.ncbi.nlm.nih.gov 157171 4.911594e+03\n", "17 www.allmusic.com 155615 4.862969e+03\n", "18 www.rottentomatoes.com 140717 4.397406e+03\n", "19 www.biblegateway.com 136031 4.250969e+03" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames by event type\n", "# limited to W pages with external links\n", "top_hosts_w_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_w_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_w_events = spark.sql(\n", " top_hosts_w_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_w_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### extClick events" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames\n", "- limited to WP:M pages with external links\n", "- further limited to extClick events" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>host</th>\n", " <th>total_events</th>\n", " <th>daily_average_events</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>www.ncbi.nlm.nih.gov</td>\n", " <td>86432</td>\n", " <td>2701.00000</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>doi.org</td>\n", " <td>74930</td>\n", " <td>2341.56250</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>books.google.com</td>\n", " <td>19725</td>\n", " <td>616.40625</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>www.drugs.com</td>\n", " <td>16639</td>\n", " <td>519.96875</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>www.who.int</td>\n", " <td>15105</td>\n", " <td>472.03125</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>theranos.com</td>\n", " <td>10660</td>\n", " <td>333.12500</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>www.cdc.gov</td>\n", " <td>10119</td>\n", " <td>316.21875</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>chemapps.stolaf.edu</td>\n", " <td>7613</td>\n", " <td>237.90625</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>apps.who.int</td>\n", " <td>6384</td>\n", " <td>199.50000</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>www.nlm.nih.gov</td>\n", " <td>6291</td>\n", " <td>196.59375</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>www.fda.gov</td>\n", " <td>5101</td>\n", " <td>159.40625</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>www.nytimes.com</td>\n", " <td>5005</td>\n", " <td>156.40625</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>www.commonchemistry.org</td>\n", " <td>4604</td>\n", " <td>143.87500</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>www.accessdata.fda.gov</td>\n", " <td>4352</td>\n", " <td>136.00000</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>www.youtube.com</td>\n", " <td>4154</td>\n", " <td>129.81250</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>emedicine.medscape.com</td>\n", " <td>3791</td>\n", " <td>118.46875</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>translate.googleusercontent.com</td>\n", " <td>3553</td>\n", " <td>111.03125</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>pubchem.ncbi.nlm.nih.gov</td>\n", " <td>3469</td>\n", " <td>108.40625</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>www.cancer.gov</td>\n", " <td>3225</td>\n", " <td>100.78125</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>www.drugbank.ca</td>\n", " <td>2648</td>\n", " <td>82.75000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " host total_events daily_average_events\n", "0 www.ncbi.nlm.nih.gov 86432 2701.00000\n", "1 doi.org 74930 2341.56250\n", "2 books.google.com 19725 616.40625\n", "3 www.drugs.com 16639 519.96875\n", "4 www.who.int 15105 472.03125\n", "5 theranos.com 10660 333.12500\n", "6 www.cdc.gov 10119 316.21875\n", "7 chemapps.stolaf.edu 7613 237.90625\n", "8 apps.who.int 6384 199.50000\n", "9 www.nlm.nih.gov 6291 196.59375\n", "10 www.fda.gov 5101 159.40625\n", "11 www.nytimes.com 5005 156.40625\n", "12 www.commonchemistry.org 4604 143.87500\n", "13 www.accessdata.fda.gov 4352 136.00000\n", "14 www.youtube.com 4154 129.81250\n", "15 emedicine.medscape.com 3791 118.46875\n", "16 translate.googleusercontent.com 3553 111.03125\n", "17 pubchem.ncbi.nlm.nih.gov 3469 108.40625\n", "18 www.cancer.gov 3225 100.78125\n", "19 www.drugbank.ca 2648 82.75000" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames\n", "# limited to WP:M pages with external links\n", "# further limited to extClick events\n", "top_hosts_wpm_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND action = 'extClick'\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_wpm_events = spark.sql(\n", " top_hosts_wpm_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_wpm_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames by event type\n", "- limited to W pages with external links\n", "- further limited to extClick events" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>host</th>\n", " <th>total_events</th>\n", " <th>daily_average_events</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>www.imdb.com</td>\n", " <td>3050694</td>\n", " <td>95334.18750</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>tools.wmflabs.org</td>\n", " <td>1064153</td>\n", " <td>33254.78125</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>www.youtube.com</td>\n", " <td>601433</td>\n", " <td>18794.78125</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>books.google.com</td>\n", " <td>573062</td>\n", " <td>17908.18750</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>www.espncricinfo.com</td>\n", " <td>379919</td>\n", " <td>11872.46875</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>doi.org</td>\n", " <td>350714</td>\n", " <td>10959.81250</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>www.nytimes.com</td>\n", " <td>301568</td>\n", " <td>9424.00000</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>www.bbc.co.uk</td>\n", " <td>227540</td>\n", " <td>7110.62500</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>www.theguardian.com</td>\n", " <td>225815</td>\n", " <td>7056.71875</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>www.instagram.com</td>\n", " <td>182944</td>\n", " <td>5717.00000</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>twitter.com</td>\n", " <td>180994</td>\n", " <td>5656.06250</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>www.google.com</td>\n", " <td>166936</td>\n", " <td>5216.75000</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>archive.org</td>\n", " <td>162153</td>\n", " <td>5067.28125</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>www.ncbi.nlm.nih.gov</td>\n", " <td>157171</td>\n", " <td>4911.59375</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>www.allmusic.com</td>\n", " <td>155615</td>\n", " <td>4862.96875</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>www.rottentomatoes.com</td>\n", " <td>140717</td>\n", " <td>4397.40625</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>www.findagrave.com</td>\n", " <td>130021</td>\n", " <td>4063.15625</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>news.bbc.co.uk</td>\n", " <td>129189</td>\n", " <td>4037.15625</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>www.telegraph.co.uk</td>\n", " <td>115366</td>\n", " <td>3605.18750</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>www.facebook.com</td>\n", " <td>111813</td>\n", " <td>3494.15625</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " host total_events daily_average_events\n", "0 www.imdb.com 3050694 95334.18750\n", "1 tools.wmflabs.org 1064153 33254.78125\n", "2 www.youtube.com 601433 18794.78125\n", "3 books.google.com 573062 17908.18750\n", "4 www.espncricinfo.com 379919 11872.46875\n", "5 doi.org 350714 10959.81250\n", "6 www.nytimes.com 301568 9424.00000\n", "7 www.bbc.co.uk 227540 7110.62500\n", "8 www.theguardian.com 225815 7056.71875\n", "9 www.instagram.com 182944 5717.00000\n", "10 twitter.com 180994 5656.06250\n", "11 www.google.com 166936 5216.75000\n", "12 archive.org 162153 5067.28125\n", "13 www.ncbi.nlm.nih.gov 157171 4911.59375\n", "14 www.allmusic.com 155615 4862.96875\n", "15 www.rottentomatoes.com 140717 4397.40625\n", "16 www.findagrave.com 130021 4063.15625\n", "17 news.bbc.co.uk 129189 4037.15625\n", "18 www.telegraph.co.uk 115366 3605.18750\n", "19 www.facebook.com 111813 3494.15625" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames by event type\n", "# limited to W pages with external links\n", "# further limited to extClick events\n", "top_hosts_w_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND action = 'extClick'\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_w_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_w_events = spark.sql(\n", " top_hosts_w_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_w_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### limited to fnHover events" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames\n", "- limited to WP:M pages with external links\n", "- further limited to fnHover events" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>host</th>\n", " <th>total_events</th>\n", " <th>daily_average_events</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>en.wikipedia.org</td>\n", " <td>1465057</td>\n", " <td>45783.03125</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>en.m.wikipedia.org</td>\n", " <td>89821</td>\n", " <td>2806.90625</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>translate.googleusercontent.com</td>\n", " <td>4032</td>\n", " <td>126.00000</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>None</td>\n", " <td>698</td>\n", " <td>21.81250</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>www.translatoruser-int.com</td>\n", " <td>149</td>\n", " <td>4.65625</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>papago.naver.net</td>\n", " <td>99</td>\n", " <td>3.09375</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>www.biblegateway.com</td>\n", " <td>39</td>\n", " <td>1.21875</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>z5h64q92x9.net</td>\n", " <td>21</td>\n", " <td>0.65625</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>scholar.google.com</td>\n", " <td>17</td>\n", " <td>0.53125</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>emedien3.sub.uni-hamburg.de</td>\n", " <td>12</td>\n", " <td>0.37500</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>en.wikipedi0.org</td>\n", " <td>5</td>\n", " <td>0.15625</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>translate.baiducontent.com</td>\n", " <td>4</td>\n", " <td>0.12500</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>en.m.wikipedi0.org</td>\n", " <td>3</td>\n", " <td>0.09375</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>www.gpedia.com</td>\n", " <td>2</td>\n", " <td>0.06250</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>en.wikipedia.org.</td>\n", " <td>2</td>\n", " <td>0.06250</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>books.google.com</td>\n", " <td>1</td>\n", " <td>0.03125</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>speechpanel.readspeaker.com</td>\n", " <td>1</td>\n", " <td>0.03125</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>nl.wikiludia.com</td>\n", " <td>1</td>\n", " <td>0.03125</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>en.wk.jsproxy.tk</td>\n", " <td>1</td>\n", " <td>0.03125</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " host total_events daily_average_events\n", "0 en.wikipedia.org 1465057 45783.03125\n", "1 en.m.wikipedia.org 89821 2806.90625\n", "2 translate.googleusercontent.com 4032 126.00000\n", "3 None 698 21.81250\n", "4 www.translatoruser-int.com 149 4.65625\n", "5 papago.naver.net 99 3.09375\n", "6 www.biblegateway.com 39 1.21875\n", "7 z5h64q92x9.net 21 0.65625\n", "8 scholar.google.com 17 0.53125\n", "9 emedien3.sub.uni-hamburg.de 12 0.37500\n", "10 en.wikipedi0.org 5 0.15625\n", "11 translate.baiducontent.com 4 0.12500\n", "12 en.m.wikipedi0.org 3 0.09375\n", "13 www.gpedia.com 2 0.06250\n", "14 en.wikipedia.org. 2 0.06250\n", "15 books.google.com 1 0.03125\n", "16 speechpanel.readspeaker.com 1 0.03125\n", "17 nl.wikiludia.com 1 0.03125\n", "18 en.wk.jsproxy.tk 1 0.03125" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames\n", "# limited to WP:M pages with external links\n", "# further limited to fnHover events\n", "top_hosts_wpm_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND action = 'fnHover'\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_wpm_events = spark.sql(\n", " top_hosts_wpm_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_wpm_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames by event type\n", "- limited to W pages with external links\n", "- further limited to fnHover events" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>host</th>\n", " <th>total_events</th>\n", " <th>daily_average_events</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>en.wikipedia.org</td>\n", " <td>33809467</td>\n", " <td>1.056546e+06</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>en.m.wikipedia.org</td>\n", " <td>2015284</td>\n", " <td>6.297762e+04</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>translate.googleusercontent.com</td>\n", " <td>53714</td>\n", " <td>1.678562e+03</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>None</td>\n", " <td>27224</td>\n", " <td>8.507500e+02</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>www.biblegateway.com</td>\n", " <td>16145</td>\n", " <td>5.045312e+02</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>www.translatoruser-int.com</td>\n", " <td>2063</td>\n", " <td>6.446875e+01</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>papago.naver.net</td>\n", " <td>1250</td>\n", " <td>3.906250e+01</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>z5h64q92x9.net</td>\n", " <td>327</td>\n", " <td>1.021875e+01</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>en.wikipedi0.org</td>\n", " <td>246</td>\n", " <td>7.687500e+00</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>emedien3.sub.uni-hamburg.de</td>\n", " <td>169</td>\n", " <td>5.281250e+00</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>books.google.com</td>\n", " <td>104</td>\n", " <td>3.250000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>translate.baiducontent.com</td>\n", " <td>102</td>\n", " <td>3.187500e+00</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>www.gutenberg.org</td>\n", " <td>80</td>\n", " <td>2.500000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>fsfe.org</td>\n", " <td>32</td>\n", " <td>1.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>en.wikipedia.org.</td>\n", " <td>29</td>\n", " <td>9.062500e-01</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>wn.rsarchive.org</td>\n", " <td>28</td>\n", " <td>8.750000e-01</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>en.wk.jsproxy.tk</td>\n", " <td>28</td>\n", " <td>8.750000e-01</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>en.wikiludia.com</td>\n", " <td>26</td>\n", " <td>8.125000e-01</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>en.wikipedia.nom.rs</td>\n", " <td>13</td>\n", " <td>4.062500e-01</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>www.gpedia.com</td>\n", " <td>12</td>\n", " <td>3.750000e-01</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " host total_events daily_average_events\n", "0 en.wikipedia.org 33809467 1.056546e+06\n", "1 en.m.wikipedia.org 2015284 6.297762e+04\n", "2 translate.googleusercontent.com 53714 1.678562e+03\n", "3 None 27224 8.507500e+02\n", "4 www.biblegateway.com 16145 5.045312e+02\n", "5 www.translatoruser-int.com 2063 6.446875e+01\n", "6 papago.naver.net 1250 3.906250e+01\n", "7 z5h64q92x9.net 327 1.021875e+01\n", "8 en.wikipedi0.org 246 7.687500e+00\n", "9 emedien3.sub.uni-hamburg.de 169 5.281250e+00\n", "10 books.google.com 104 3.250000e+00\n", "11 translate.baiducontent.com 102 3.187500e+00\n", "12 www.gutenberg.org 80 2.500000e+00\n", "13 fsfe.org 32 1.000000e+00\n", "14 en.wikipedia.org. 29 9.062500e-01\n", "15 wn.rsarchive.org 28 8.750000e-01\n", "16 en.wk.jsproxy.tk 28 8.750000e-01\n", "17 en.wikiludia.com 26 8.125000e-01\n", "18 en.wikipedia.nom.rs 13 4.062500e-01\n", "19 www.gpedia.com 12 3.750000e-01" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames by event type\n", "# limited to W pages with external links\n", "# further limited to fnHover events\n", "top_hosts_w_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND action = 'fnHover'\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_w_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_w_events = spark.sql(\n", " top_hosts_w_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_w_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### limited to extClick, fnClick OR upClick events" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames\n", "- limited to WP:M pages with external links\n", "- further limited to extClick, fnClick OR upClick events" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>host</th>\n", " <th>total_events</th>\n", " <th>daily_average_events</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>en.m.wikipedia.org</td>\n", " <td>537488</td>\n", " <td>16796.50000</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>en.wikipedia.org</td>\n", " <td>395629</td>\n", " <td>12363.40625</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>www.ncbi.nlm.nih.gov</td>\n", " <td>86432</td>\n", " <td>2701.00000</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>doi.org</td>\n", " <td>74930</td>\n", " <td>2341.56250</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>books.google.com</td>\n", " <td>19728</td>\n", " <td>616.50000</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>www.drugs.com</td>\n", " <td>16639</td>\n", " <td>519.96875</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>www.who.int</td>\n", " <td>15105</td>\n", " <td>472.03125</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>theranos.com</td>\n", " <td>10660</td>\n", " <td>333.12500</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>www.cdc.gov</td>\n", " <td>10119</td>\n", " <td>316.21875</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>chemapps.stolaf.edu</td>\n", " <td>7613</td>\n", " <td>237.90625</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>translate.googleusercontent.com</td>\n", " <td>6743</td>\n", " <td>210.71875</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>apps.who.int</td>\n", " <td>6384</td>\n", " <td>199.50000</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>www.nlm.nih.gov</td>\n", " <td>6291</td>\n", " <td>196.59375</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>www.fda.gov</td>\n", " <td>5101</td>\n", " <td>159.40625</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>www.nytimes.com</td>\n", " <td>5005</td>\n", " <td>156.40625</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>www.commonchemistry.org</td>\n", " <td>4604</td>\n", " <td>143.87500</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>www.accessdata.fda.gov</td>\n", " <td>4352</td>\n", " <td>136.00000</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>www.youtube.com</td>\n", " <td>4154</td>\n", " <td>129.81250</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>emedicine.medscape.com</td>\n", " <td>3791</td>\n", " <td>118.46875</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>pubchem.ncbi.nlm.nih.gov</td>\n", " <td>3469</td>\n", " <td>108.40625</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " host total_events daily_average_events\n", "0 en.m.wikipedia.org 537488 16796.50000\n", "1 en.wikipedia.org 395629 12363.40625\n", "2 www.ncbi.nlm.nih.gov 86432 2701.00000\n", "3 doi.org 74930 2341.56250\n", "4 books.google.com 19728 616.50000\n", "5 www.drugs.com 16639 519.96875\n", "6 www.who.int 15105 472.03125\n", "7 theranos.com 10660 333.12500\n", "8 www.cdc.gov 10119 316.21875\n", "9 chemapps.stolaf.edu 7613 237.90625\n", "10 translate.googleusercontent.com 6743 210.71875\n", "11 apps.who.int 6384 199.50000\n", "12 www.nlm.nih.gov 6291 196.59375\n", "13 www.fda.gov 5101 159.40625\n", "14 www.nytimes.com 5005 156.40625\n", "15 www.commonchemistry.org 4604 143.87500\n", "16 www.accessdata.fda.gov 4352 136.00000\n", "17 www.youtube.com 4154 129.81250\n", "18 emedicine.medscape.com 3791 118.46875\n", "19 pubchem.ncbi.nlm.nih.gov 3469 108.40625" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames\n", "# limited to WP:M pages with external links\n", "# further limited to extClick, fnClick OR upClick events\n", "top_hosts_wpm_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND action in ('extClick', 'upClick','fnClick')\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_wpm_events = spark.sql(\n", " top_hosts_wpm_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_wpm_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total event count for top 20 hostnames by event type\n", "- limited to W pages with external links\n", "- further limited to extClick, fnClick OR upClick events" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>host</th>\n", " <th>total_events</th>\n", " <th>daily_average_events</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>en.m.wikipedia.org</td>\n", " <td>15621530</td>\n", " <td>488172.81250</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>en.wikipedia.org</td>\n", " <td>8524542</td>\n", " <td>266391.93750</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>www.imdb.com</td>\n", " <td>3050694</td>\n", " <td>95334.18750</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>tools.wmflabs.org</td>\n", " <td>1064153</td>\n", " <td>33254.78125</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>www.youtube.com</td>\n", " <td>601433</td>\n", " <td>18794.78125</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>books.google.com</td>\n", " <td>573142</td>\n", " <td>17910.68750</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>www.espncricinfo.com</td>\n", " <td>379919</td>\n", " <td>11872.46875</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>doi.org</td>\n", " <td>350714</td>\n", " <td>10959.81250</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>www.nytimes.com</td>\n", " <td>301568</td>\n", " <td>9424.00000</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>www.bbc.co.uk</td>\n", " <td>227540</td>\n", " <td>7110.62500</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>www.theguardian.com</td>\n", " <td>225815</td>\n", " <td>7056.71875</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>www.instagram.com</td>\n", " <td>182944</td>\n", " <td>5717.00000</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>twitter.com</td>\n", " <td>180994</td>\n", " <td>5656.06250</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>www.google.com</td>\n", " <td>166936</td>\n", " <td>5216.75000</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>archive.org</td>\n", " <td>162164</td>\n", " <td>5067.62500</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>www.ncbi.nlm.nih.gov</td>\n", " <td>157171</td>\n", " <td>4911.59375</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>www.allmusic.com</td>\n", " <td>155615</td>\n", " <td>4862.96875</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>www.rottentomatoes.com</td>\n", " <td>140717</td>\n", " <td>4397.40625</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>translate.googleusercontent.com</td>\n", " <td>131144</td>\n", " <td>4098.25000</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>www.findagrave.com</td>\n", " <td>130021</td>\n", " <td>4063.15625</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " host total_events daily_average_events\n", "0 en.m.wikipedia.org 15621530 488172.81250\n", "1 en.wikipedia.org 8524542 266391.93750\n", "2 www.imdb.com 3050694 95334.18750\n", "3 tools.wmflabs.org 1064153 33254.78125\n", "4 www.youtube.com 601433 18794.78125\n", "5 books.google.com 573142 17910.68750\n", "6 www.espncricinfo.com 379919 11872.46875\n", "7 doi.org 350714 10959.81250\n", "8 www.nytimes.com 301568 9424.00000\n", "9 www.bbc.co.uk 227540 7110.62500\n", "10 www.theguardian.com 225815 7056.71875\n", "11 www.instagram.com 182944 5717.00000\n", "12 twitter.com 180994 5656.06250\n", "13 www.google.com 166936 5216.75000\n", "14 archive.org 162164 5067.62500\n", "15 www.ncbi.nlm.nih.gov 157171 4911.59375\n", "16 www.allmusic.com 155615 4862.96875\n", "17 www.rottentomatoes.com 140717 4397.40625\n", "18 translate.googleusercontent.com 131144 4098.25000\n", "19 www.findagrave.com 130021 4063.15625" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for top 20 hostnames by event type\n", "# limited to W pages with external links\n", "# further limited to extClick, fnClick OR upClick events\n", "top_hosts_w_query = \"\"\"\n", "SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events\n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND action in ('extClick', 'upClick','fnClick')\n", "AND page_id IN \n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_w_pages_with_extlinks\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_hosts_w_events = spark.sql(\n", " top_hosts_w_query.format(\n", " parse_host.format('link_url'),\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_hosts_w_events.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top wikipedia.org links by event type" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th>action</th>\n", " <th>extClick</th>\n", " <th>fnClick</th>\n", " <th>fnHover</th>\n", " <th>upClick</th>\n", " </tr>\n", " <tr>\n", " <th>host</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>ar.wikipedia.org</th>\n", " <td>276.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>de.wikipedia.org</th>\n", " <td>1798.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>en.m.wikipedia.org</th>\n", " <td>NaN</td>\n", " <td>16044275.0</td>\n", " <td>2105104.0</td>\n", " <td>114695.0</td>\n", " </tr>\n", " <tr>\n", " <th>en.wikipedia.org</th>\n", " <td>262.0</td>\n", " <td>7875877.0</td>\n", " <td>35274475.0</td>\n", " <td>1043977.0</td>\n", " </tr>\n", " <tr>\n", " <th>eo.wikipedia.org</th>\n", " <td>280.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>es.wikipedia.org</th>\n", " <td>785.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>fr.wikipedia.org</th>\n", " <td>1804.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>it.wikipedia.org</th>\n", " <td>646.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>ja.wikipedia.org</th>\n", " <td>1973.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>nl.wikipedia.org</th>\n", " <td>1062.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>pt.wikipedia.org</th>\n", " <td>383.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>ru.wikipedia.org</th>\n", " <td>1092.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>simple.wikipedia.org</th>\n", " <td>563.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>www.wikipedia.org</th>\n", " <td>941.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>zh.wikipedia.org</th>\n", " <td>1171.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "action extClick fnClick fnHover upClick\n", "host \n", "ar.wikipedia.org 276.0 NaN NaN NaN\n", "de.wikipedia.org 1798.0 NaN NaN NaN\n", "en.m.wikipedia.org NaN 16044275.0 2105104.0 114695.0\n", "en.wikipedia.org 262.0 7875877.0 35274475.0 1043977.0\n", "eo.wikipedia.org 280.0 NaN NaN NaN\n", "es.wikipedia.org 785.0 NaN NaN NaN\n", "fr.wikipedia.org 1804.0 NaN NaN NaN\n", "it.wikipedia.org 646.0 NaN NaN NaN\n", "ja.wikipedia.org 1973.0 NaN NaN NaN\n", "nl.wikipedia.org 1062.0 NaN NaN NaN\n", "pt.wikipedia.org 383.0 NaN NaN NaN\n", "ru.wikipedia.org 1092.0 NaN NaN NaN\n", "simple.wikipedia.org 563.0 NaN NaN NaN\n", "www.wikipedia.org 941.0 NaN NaN NaN\n", "zh.wikipedia.org 1171.0 NaN NaN NaN" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# wikipedia.org links by event type\n", "top_wiki_hosts_query = \"\"\"\n", "SELECT {} AS host, action, COUNT(*) AS count \n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND lower(parse_url(link_url,'HOST')) like '%wikipedia.org'\n", "AND (\n", " page_id IN \n", " (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)\n", " OR page_id IN\n", " (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", " )\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host, action\n", "ORDER BY COUNT(*) DESC\n", "LIMIT 20\n", "\"\"\"\n", "top_wiki_hosts_events = spark.sql(\n", " top_wiki_hosts_query.format(\n", " parse_host.format('link_url'),\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "top_wiki_hosts_events_rdd = top_wiki_hosts_events.rdd\n", "top_wiki_hosts_events_df = sqlContext.createDataFrame(top_wiki_hosts_events_rdd)\n", "top_wiki_hosts_events_pandas = top_wiki_hosts_events_df.toPandas()\n", "\n", "top_wiki_hosts_events_pandas.pivot(index='host', columns='action', values='count')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### web.archive.org links by event type for W:PM pages withn external links" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th>action</th>\n", " <th>extClick</th>\n", " <th>fnClick</th>\n", " </tr>\n", " <tr>\n", " <th>host</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>web.archive.org</th>\n", " <td>51898</td>\n", " <td>3</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "action extClick fnClick\n", "host \n", "web.archive.org 51898 3" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# web.archive.org links by event type for W:PM pages withn external links\n", "archive_org_wpm_query = \"\"\"\n", "SELECT PARSE_URL(LOWER(link_url),'HOST') AS host, action, COUNT(*) AS count \n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND LOWER(link_url) RLIKE '^https?://web.archive.org/web/'\n", "AND PARSE_URL(LOWER(link_url),'HOST') = 'web.archive.org'\n", "AND page_id IN\n", " (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host, action\n", "\"\"\"\n", "archive_org_wpm_events = spark.sql(\n", " archive_org_wpm_query.format(\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "archive_org_wpm_events_rdd = archive_org_wpm_events.rdd\n", "archive_org_wpm_events_df = sqlContext.createDataFrame(archive_org_wpm_events_rdd)\n", "archive_org_wpm_events_pandas = archive_org_wpm_events_df.toPandas()\n", "\n", "archive_org_wpm_events_pandas.pivot(index='host', columns='action', values='count')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### web.archive.org links by event type for W pages withn external links" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th>action</th>\n", " <th>extClick</th>\n", " <th>fnClick</th>\n", " <th>fnHover</th>\n", " <th>upClick</th>\n", " </tr>\n", " <tr>\n", " <th>host</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>web.archive.org</th>\n", " <td>1963023</td>\n", " <td>31</td>\n", " <td>57</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "action extClick fnClick fnHover upClick\n", "host \n", "web.archive.org 1963023 31 57 1" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# web.archive.org links by event type for W pages withn external links\n", "archive_org_w_query = \"\"\"\n", "SELECT PARSE_URL(LOWER(link_url),'HOST') AS host, action, COUNT(*) AS count \n", "FROM citationusage \n", "WHERE wiki = 'enwiki'\n", "AND LOWER(link_url) RLIKE '^https?://web.archive.org/web/'\n", "AND PARSE_URL(LOWER(link_url),'HOST') = 'web.archive.org'\n", "AND page_id IN\n", " (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)\n", "{}\n", "AND to_date(event_time) >= '{}'\n", "AND to_date(event_time) <= '{}'\n", "AND useragent_is_bot = FALSE\n", "GROUP BY host, action\n", "\"\"\"\n", "archive_org_w_events = spark.sql(\n", " archive_org_w_query.format(\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "archive_org_w_events_rdd = archive_org_w_events.rdd\n", "archive_org_w_events_df = sqlContext.createDataFrame(archive_org_w_events_rdd)\n", "archive_org_w_events_pandas = archive_org_w_events_df.toPandas()\n", "\n", "archive_org_w_events_pandas.pivot(index='host', columns='action', values='count')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark - YARN", "language": "python", "name": "spark_yarn_pyspark" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }