{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pageviews for W:PM and W pages with external links\n", "- pageview data source: [`wmf.pageview_hourly`](https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Traffic/Pageview_hourly) table\n", "- counts exclude bots/spiders (agent_type = 'user')" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n", "%run -i 'data-defaults.py'\n", "# pandas float formatting\n", "pd.options.display.float_format = '{:.4f}'.format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Total pageviews for WP:M pages with external links" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
access_methodtotal_pageviewsdaily_average_pageviews
0desktop626502921957821.6250
1mobile web1242143763881699.2500
2mobile app115038335949.4688
\n", "
" ], "text/plain": [ " access_method total_pageviews daily_average_pageviews\n", "0 desktop 62650292 1957821.6250\n", "1 mobile web 124214376 3881699.2500\n", "2 mobile app 1150383 35949.4688" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wpm_query = \"\"\"\n", "SELECT access_method, SUM(view_count) AS total_pageviews, SUM(view_count)/{} AS daily_average_pageviews\n", "FROM wmf.pageview_hourly\n", "WHERE project = 'en.wikipedia'\n", "AND page_id IN\n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks)\n", "AND agent_type = 'user'\n", "AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'\n", "AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'\n", "GROUP BY access_method\n", "\"\"\"\n", "wpm_pageviews = spark.sql(wpm_query.format(days_in_study, start_date_string, end_date_string))\n", "wpm_pageviews_rdd = wpm_pageviews.rdd\n", "wpm_pageviews_df = sqlContext.createDataFrame(wpm_pageviews_rdd)\n", "wpm_pageviews_pandas = wpm_pageviews_df.toPandas()\n", "\n", "wpm_pageviews_pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Total pageviews for W pages with external links" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
access_methodtotal_pageviewsdaily_average_pageviews
0desktop313460073997956273.0938
1mobile web4120797860128774933.1250
2mobile app548455091713922.1562
\n", "
" ], "text/plain": [ " access_method total_pageviews daily_average_pageviews\n", "0 desktop 3134600739 97956273.0938\n", "1 mobile web 4120797860 128774933.1250\n", "2 mobile app 54845509 1713922.1562" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "w_query = \"\"\"\n", "SELECT access_method, SUM(view_count) AS total_pageviews, SUM(view_count)/{} AS daily_average_pageviews\n", "FROM wmf.pageview_hourly\n", "WHERE project = 'en.wikipedia'\n", "AND page_id IN\n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_w_pages_with_extlinks)\n", "AND agent_type = 'user'\n", "AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'\n", "AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'\n", "GROUP BY access_method\n", "\"\"\"\n", "w_pageviews = spark.sql(w_query.format(days_in_study,start_date_string, end_date_string))\n", "w_pageviews_rdd = w_pageviews.rdd\n", "w_pageviews_df = sqlContext.createDataFrame(w_pageviews_rdd)\n", "w_pageviews_pandas = w_pageviews_df.toPandas()\n", "\n", "w_pageviews_pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Top WP:M pages" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
page_idpage_titletotal_pageviewsdaily_average_pageviews
043573275Elizabeth Holmes134952742172.7188
141779862Theranos54696917092.7812
258911Measles38834012135.6250
318079Leonardo da Vinci38042111888.1562
427546Sexual intercourse37129011602.8125
537556Asperger syndrome33654710517.0938
6791546Ketogenic diet3193269978.9375
74488176Factitious disorder imposed on another3145429829.4375
84501Black Death3080579626.7812
956880920Ramesh Balwani3052809540.0000
1052135Pneumonia2921799130.5938
111232575Suicide methods2904889077.7500
1244990Ryan White2606008143.7500
1356483Tourette syndrome2450827658.8125
1450601Cystic fibrosis2410147531.6875
151232085Cannabidiol2400937502.9062
16149223Borderline personality disorder2394157481.7188
1726652964Intersex2298367182.3750
1863522Crohn's disease2263837074.4688
1930653Tuberculosis2229106965.9375
204531Bipolar disorder2217446929.5000
2110024MDMA2068096462.7812
2250603Multiple sclerosis2060736439.7812
2317537Lysergic acid diethylamide1987906212.1875
241481886Cannabis (drug)1976146175.4375
255069516HIV/AIDS1964956140.4688
26141915Fentanyl1958436120.0938
277701Cocaine1956796114.9688
28244113Lyme disease1920526001.6250
2927790Schizophrenia1894005918.7500
30158400Sepsis1860605814.3750
3122228064Parkinson's disease1839265747.6875
3226743Sigmund Freud1826815708.7812
33482534Gabapentin1817545679.8125
34318049Fibromyalgia1799805624.3750
3542043Project MKUltra1757535492.2812
36101965Psoriasis1756015487.5312
3740017873Diabetes mellitus1755585486.1875
3883406Paracetamol1749395466.8438
39491494Ankylosing spondylitis1736975428.0312
4037195273Trypophobia1728215400.6562
4165847Vitiligo1722705383.4375
4225Autism1691245285.1250
4311038318Methamphetamine1664355201.0938
44443800Shingles1647275147.7188
45649100Adderall1641805130.6250
46250858Tramadol1628585089.3125
4720556798Myocardial infarction1608865027.6875
4819375577Amyotrophic lateral sclerosis1608445026.3750
49234806Diazepam1580854940.1562
\n", "
" ], "text/plain": [ " page_id page_title total_pageviews \\\n", "0 43573275 Elizabeth Holmes 1349527 \n", "1 41779862 Theranos 546969 \n", "2 58911 Measles 388340 \n", "3 18079 Leonardo da Vinci 380421 \n", "4 27546 Sexual intercourse 371290 \n", "5 37556 Asperger syndrome 336547 \n", "6 791546 Ketogenic diet 319326 \n", "7 4488176 Factitious disorder imposed on another 314542 \n", "8 4501 Black Death 308057 \n", "9 56880920 Ramesh Balwani 305280 \n", "10 52135 Pneumonia 292179 \n", "11 1232575 Suicide methods 290488 \n", "12 44990 Ryan White 260600 \n", "13 56483 Tourette syndrome 245082 \n", "14 50601 Cystic fibrosis 241014 \n", "15 1232085 Cannabidiol 240093 \n", "16 149223 Borderline personality disorder 239415 \n", "17 26652964 Intersex 229836 \n", "18 63522 Crohn's disease 226383 \n", "19 30653 Tuberculosis 222910 \n", "20 4531 Bipolar disorder 221744 \n", "21 10024 MDMA 206809 \n", "22 50603 Multiple sclerosis 206073 \n", "23 17537 Lysergic acid diethylamide 198790 \n", "24 1481886 Cannabis (drug) 197614 \n", "25 5069516 HIV/AIDS 196495 \n", "26 141915 Fentanyl 195843 \n", "27 7701 Cocaine 195679 \n", "28 244113 Lyme disease 192052 \n", "29 27790 Schizophrenia 189400 \n", "30 158400 Sepsis 186060 \n", "31 22228064 Parkinson's disease 183926 \n", "32 26743 Sigmund Freud 182681 \n", "33 482534 Gabapentin 181754 \n", "34 318049 Fibromyalgia 179980 \n", "35 42043 Project MKUltra 175753 \n", "36 101965 Psoriasis 175601 \n", "37 40017873 Diabetes mellitus 175558 \n", "38 83406 Paracetamol 174939 \n", "39 491494 Ankylosing spondylitis 173697 \n", "40 37195273 Trypophobia 172821 \n", "41 65847 Vitiligo 172270 \n", "42 25 Autism 169124 \n", "43 11038318 Methamphetamine 166435 \n", "44 443800 Shingles 164727 \n", "45 649100 Adderall 164180 \n", "46 250858 Tramadol 162858 \n", "47 20556798 Myocardial infarction 160886 \n", "48 19375577 Amyotrophic lateral sclerosis 160844 \n", "49 234806 Diazepam 158085 \n", "\n", " daily_average_pageviews \n", "0 42172.7188 \n", "1 17092.7812 \n", "2 12135.6250 \n", "3 11888.1562 \n", "4 11602.8125 \n", "5 10517.0938 \n", "6 9978.9375 \n", "7 9829.4375 \n", "8 9626.7812 \n", "9 9540.0000 \n", "10 9130.5938 \n", "11 9077.7500 \n", "12 8143.7500 \n", "13 7658.8125 \n", "14 7531.6875 \n", "15 7502.9062 \n", "16 7481.7188 \n", "17 7182.3750 \n", "18 7074.4688 \n", "19 6965.9375 \n", "20 6929.5000 \n", "21 6462.7812 \n", "22 6439.7812 \n", "23 6212.1875 \n", "24 6175.4375 \n", "25 6140.4688 \n", "26 6120.0938 \n", "27 6114.9688 \n", "28 6001.6250 \n", "29 5918.7500 \n", "30 5814.3750 \n", "31 5747.6875 \n", "32 5708.7812 \n", "33 5679.8125 \n", "34 5624.3750 \n", "35 5492.2812 \n", "36 5487.5312 \n", "37 5486.1875 \n", "38 5466.8438 \n", "39 5428.0312 \n", "40 5400.6562 \n", "41 5383.4375 \n", "42 5285.1250 \n", "43 5201.0938 \n", "44 5147.7188 \n", "45 5130.6250 \n", "46 5089.3125 \n", "47 5027.6875 \n", "48 5026.3750 \n", "49 4940.1562 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# most visited WP:M pages with external links\n", "top_wpm_query = \"\"\"\n", "SELECT pageview_hourly.page_id, population_page_titles_20190420.page_title, SUM(pageview_hourly.view_count) AS total_pageviews, SUM(pageview_hourly.view_count)/{} AS daily_average_pageviews\n", "FROM wmf.pageview_hourly\n", "LEFT JOIN ryanmax.population_page_titles_20190420 ON pageview_hourly.page_id = population_page_titles_20190420.page_id\n", "WHERE project = 'en.wikipedia'\n", "AND pageview_hourly.page_id IN\n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks)\n", "AND agent_type = 'user'\n", "AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'\n", "AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'\n", "GROUP BY pageview_hourly.page_id, population_page_titles_20190420.page_title \n", "ORDER BY total_pageviews DESC\n", "LIMIT 50\n", "\"\"\"\n", "\n", "top_wpm = spark.sql(top_wpm_query.format(days_in_study,start_date_string, end_date_string))\n", "top_wpm.limit(50).toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark - YARN", "language": "python", "name": "spark_yarn_pyspark" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }