{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pageviews for W:PM and W pages with external links\n", "- pageview data source: [`wmf.pageview_hourly`](https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Traffic/Pageview_hourly) table\n", "- counts exclude bots/spiders (agent_type = 'user')" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n", "%run -i 'data-defaults.py'\n", "# pandas float formatting\n", "pd.options.display.float_format = '{:.4f}'.format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Total pageviews for WP:M pages with external links" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>access_method</th>\n", " <th>total_pageviews</th>\n", " <th>daily_average_pageviews</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>desktop</td>\n", " <td>62650292</td>\n", " <td>1957821.6250</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>mobile web</td>\n", " <td>124214376</td>\n", " <td>3881699.2500</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>mobile app</td>\n", " <td>1150383</td>\n", " <td>35949.4688</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " access_method total_pageviews daily_average_pageviews\n", "0 desktop 62650292 1957821.6250\n", "1 mobile web 124214376 3881699.2500\n", "2 mobile app 1150383 35949.4688" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wpm_query = \"\"\"\n", "SELECT access_method, SUM(view_count) AS total_pageviews, SUM(view_count)/{} AS daily_average_pageviews\n", "FROM wmf.pageview_hourly\n", "WHERE project = 'en.wikipedia'\n", "AND page_id IN\n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks)\n", "AND agent_type = 'user'\n", "AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'\n", "AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'\n", "GROUP BY access_method\n", "\"\"\"\n", "wpm_pageviews = spark.sql(wpm_query.format(days_in_study, start_date_string, end_date_string))\n", "wpm_pageviews_rdd = wpm_pageviews.rdd\n", "wpm_pageviews_df = sqlContext.createDataFrame(wpm_pageviews_rdd)\n", "wpm_pageviews_pandas = wpm_pageviews_df.toPandas()\n", "\n", "wpm_pageviews_pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Total pageviews for W pages with external links" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>access_method</th>\n", " <th>total_pageviews</th>\n", " <th>daily_average_pageviews</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>desktop</td>\n", " <td>3134600739</td>\n", " <td>97956273.0938</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>mobile web</td>\n", " <td>4120797860</td>\n", " <td>128774933.1250</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>mobile app</td>\n", " <td>54845509</td>\n", " <td>1713922.1562</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " access_method total_pageviews daily_average_pageviews\n", "0 desktop 3134600739 97956273.0938\n", "1 mobile web 4120797860 128774933.1250\n", "2 mobile app 54845509 1713922.1562" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "w_query = \"\"\"\n", "SELECT access_method, SUM(view_count) AS total_pageviews, SUM(view_count)/{} AS daily_average_pageviews\n", "FROM wmf.pageview_hourly\n", "WHERE project = 'en.wikipedia'\n", "AND page_id IN\n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_w_pages_with_extlinks)\n", "AND agent_type = 'user'\n", "AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'\n", "AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'\n", "GROUP BY access_method\n", "\"\"\"\n", "w_pageviews = spark.sql(w_query.format(days_in_study,start_date_string, end_date_string))\n", "w_pageviews_rdd = w_pageviews.rdd\n", "w_pageviews_df = sqlContext.createDataFrame(w_pageviews_rdd)\n", "w_pageviews_pandas = w_pageviews_df.toPandas()\n", "\n", "w_pageviews_pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Top WP:M pages" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>page_id</th>\n", " <th>page_title</th>\n", " <th>total_pageviews</th>\n", " <th>daily_average_pageviews</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>43573275</td>\n", " <td>Elizabeth Holmes</td>\n", " <td>1349527</td>\n", " <td>42172.7188</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>41779862</td>\n", " <td>Theranos</td>\n", " <td>546969</td>\n", " <td>17092.7812</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>58911</td>\n", " <td>Measles</td>\n", " <td>388340</td>\n", " <td>12135.6250</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>18079</td>\n", " <td>Leonardo da Vinci</td>\n", " <td>380421</td>\n", " <td>11888.1562</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>27546</td>\n", " <td>Sexual intercourse</td>\n", " <td>371290</td>\n", " <td>11602.8125</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>37556</td>\n", " <td>Asperger syndrome</td>\n", " <td>336547</td>\n", " <td>10517.0938</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>791546</td>\n", " <td>Ketogenic diet</td>\n", " <td>319326</td>\n", " <td>9978.9375</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>4488176</td>\n", " <td>Factitious disorder imposed on another</td>\n", " <td>314542</td>\n", " <td>9829.4375</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>4501</td>\n", " <td>Black Death</td>\n", " <td>308057</td>\n", " <td>9626.7812</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>56880920</td>\n", " <td>Ramesh Balwani</td>\n", " <td>305280</td>\n", " <td>9540.0000</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>52135</td>\n", " <td>Pneumonia</td>\n", " <td>292179</td>\n", " <td>9130.5938</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>1232575</td>\n", " <td>Suicide methods</td>\n", " <td>290488</td>\n", " <td>9077.7500</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>44990</td>\n", " <td>Ryan White</td>\n", " <td>260600</td>\n", " <td>8143.7500</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>56483</td>\n", " <td>Tourette syndrome</td>\n", " <td>245082</td>\n", " <td>7658.8125</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>50601</td>\n", " <td>Cystic fibrosis</td>\n", " <td>241014</td>\n", " <td>7531.6875</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>1232085</td>\n", " <td>Cannabidiol</td>\n", " <td>240093</td>\n", " <td>7502.9062</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>149223</td>\n", " <td>Borderline personality disorder</td>\n", " <td>239415</td>\n", " <td>7481.7188</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>26652964</td>\n", " <td>Intersex</td>\n", " <td>229836</td>\n", " <td>7182.3750</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>63522</td>\n", " <td>Crohn's disease</td>\n", " <td>226383</td>\n", " <td>7074.4688</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>30653</td>\n", " <td>Tuberculosis</td>\n", " <td>222910</td>\n", " <td>6965.9375</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>4531</td>\n", " <td>Bipolar disorder</td>\n", " <td>221744</td>\n", " <td>6929.5000</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>10024</td>\n", " <td>MDMA</td>\n", " <td>206809</td>\n", " <td>6462.7812</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>50603</td>\n", " <td>Multiple sclerosis</td>\n", " <td>206073</td>\n", " <td>6439.7812</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>17537</td>\n", " <td>Lysergic acid diethylamide</td>\n", " <td>198790</td>\n", " <td>6212.1875</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>1481886</td>\n", " <td>Cannabis (drug)</td>\n", " <td>197614</td>\n", " <td>6175.4375</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>5069516</td>\n", " <td>HIV/AIDS</td>\n", " <td>196495</td>\n", " <td>6140.4688</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>141915</td>\n", " <td>Fentanyl</td>\n", " <td>195843</td>\n", " <td>6120.0938</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>7701</td>\n", " <td>Cocaine</td>\n", " <td>195679</td>\n", " <td>6114.9688</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>244113</td>\n", " <td>Lyme disease</td>\n", " <td>192052</td>\n", " <td>6001.6250</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>27790</td>\n", " <td>Schizophrenia</td>\n", " <td>189400</td>\n", " <td>5918.7500</td>\n", " </tr>\n", " <tr>\n", " <th>30</th>\n", " <td>158400</td>\n", " <td>Sepsis</td>\n", " <td>186060</td>\n", " <td>5814.3750</td>\n", " </tr>\n", " <tr>\n", " <th>31</th>\n", " <td>22228064</td>\n", " <td>Parkinson's disease</td>\n", " <td>183926</td>\n", " <td>5747.6875</td>\n", " </tr>\n", " <tr>\n", " <th>32</th>\n", " <td>26743</td>\n", " <td>Sigmund Freud</td>\n", " <td>182681</td>\n", " <td>5708.7812</td>\n", " </tr>\n", " <tr>\n", " <th>33</th>\n", " <td>482534</td>\n", " <td>Gabapentin</td>\n", " <td>181754</td>\n", " <td>5679.8125</td>\n", " </tr>\n", " <tr>\n", " <th>34</th>\n", " <td>318049</td>\n", " <td>Fibromyalgia</td>\n", " <td>179980</td>\n", " <td>5624.3750</td>\n", " </tr>\n", " <tr>\n", " <th>35</th>\n", " <td>42043</td>\n", " <td>Project MKUltra</td>\n", " <td>175753</td>\n", " <td>5492.2812</td>\n", " </tr>\n", " <tr>\n", " <th>36</th>\n", " <td>101965</td>\n", " <td>Psoriasis</td>\n", " <td>175601</td>\n", " <td>5487.5312</td>\n", " </tr>\n", " <tr>\n", " <th>37</th>\n", " <td>40017873</td>\n", " <td>Diabetes mellitus</td>\n", " <td>175558</td>\n", " <td>5486.1875</td>\n", " </tr>\n", " <tr>\n", " <th>38</th>\n", " <td>83406</td>\n", " <td>Paracetamol</td>\n", " <td>174939</td>\n", " <td>5466.8438</td>\n", " </tr>\n", " <tr>\n", " <th>39</th>\n", " <td>491494</td>\n", " <td>Ankylosing spondylitis</td>\n", " <td>173697</td>\n", " <td>5428.0312</td>\n", " </tr>\n", " <tr>\n", " <th>40</th>\n", " <td>37195273</td>\n", " <td>Trypophobia</td>\n", " <td>172821</td>\n", " <td>5400.6562</td>\n", " </tr>\n", " <tr>\n", " <th>41</th>\n", " <td>65847</td>\n", " <td>Vitiligo</td>\n", " <td>172270</td>\n", " <td>5383.4375</td>\n", " </tr>\n", " <tr>\n", " <th>42</th>\n", " <td>25</td>\n", " <td>Autism</td>\n", " <td>169124</td>\n", " <td>5285.1250</td>\n", " </tr>\n", " <tr>\n", " <th>43</th>\n", " <td>11038318</td>\n", " <td>Methamphetamine</td>\n", " <td>166435</td>\n", " <td>5201.0938</td>\n", " </tr>\n", " <tr>\n", " <th>44</th>\n", " <td>443800</td>\n", " <td>Shingles</td>\n", " <td>164727</td>\n", " <td>5147.7188</td>\n", " </tr>\n", " <tr>\n", " <th>45</th>\n", " <td>649100</td>\n", " <td>Adderall</td>\n", " <td>164180</td>\n", " <td>5130.6250</td>\n", " </tr>\n", " <tr>\n", " <th>46</th>\n", " <td>250858</td>\n", " <td>Tramadol</td>\n", " <td>162858</td>\n", " <td>5089.3125</td>\n", " </tr>\n", " <tr>\n", " <th>47</th>\n", " <td>20556798</td>\n", " <td>Myocardial infarction</td>\n", " <td>160886</td>\n", " <td>5027.6875</td>\n", " </tr>\n", " <tr>\n", " <th>48</th>\n", " <td>19375577</td>\n", " <td>Amyotrophic lateral sclerosis</td>\n", " <td>160844</td>\n", " <td>5026.3750</td>\n", " </tr>\n", " <tr>\n", " <th>49</th>\n", " <td>234806</td>\n", " <td>Diazepam</td>\n", " <td>158085</td>\n", " <td>4940.1562</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " page_id page_title total_pageviews \\\n", "0 43573275 Elizabeth Holmes 1349527 \n", "1 41779862 Theranos 546969 \n", "2 58911 Measles 388340 \n", "3 18079 Leonardo da Vinci 380421 \n", "4 27546 Sexual intercourse 371290 \n", "5 37556 Asperger syndrome 336547 \n", "6 791546 Ketogenic diet 319326 \n", "7 4488176 Factitious disorder imposed on another 314542 \n", "8 4501 Black Death 308057 \n", "9 56880920 Ramesh Balwani 305280 \n", "10 52135 Pneumonia 292179 \n", "11 1232575 Suicide methods 290488 \n", "12 44990 Ryan White 260600 \n", "13 56483 Tourette syndrome 245082 \n", "14 50601 Cystic fibrosis 241014 \n", "15 1232085 Cannabidiol 240093 \n", "16 149223 Borderline personality disorder 239415 \n", "17 26652964 Intersex 229836 \n", "18 63522 Crohn's disease 226383 \n", "19 30653 Tuberculosis 222910 \n", "20 4531 Bipolar disorder 221744 \n", "21 10024 MDMA 206809 \n", "22 50603 Multiple sclerosis 206073 \n", "23 17537 Lysergic acid diethylamide 198790 \n", "24 1481886 Cannabis (drug) 197614 \n", "25 5069516 HIV/AIDS 196495 \n", "26 141915 Fentanyl 195843 \n", "27 7701 Cocaine 195679 \n", "28 244113 Lyme disease 192052 \n", "29 27790 Schizophrenia 189400 \n", "30 158400 Sepsis 186060 \n", "31 22228064 Parkinson's disease 183926 \n", "32 26743 Sigmund Freud 182681 \n", "33 482534 Gabapentin 181754 \n", "34 318049 Fibromyalgia 179980 \n", "35 42043 Project MKUltra 175753 \n", "36 101965 Psoriasis 175601 \n", "37 40017873 Diabetes mellitus 175558 \n", "38 83406 Paracetamol 174939 \n", "39 491494 Ankylosing spondylitis 173697 \n", "40 37195273 Trypophobia 172821 \n", "41 65847 Vitiligo 172270 \n", "42 25 Autism 169124 \n", "43 11038318 Methamphetamine 166435 \n", "44 443800 Shingles 164727 \n", "45 649100 Adderall 164180 \n", "46 250858 Tramadol 162858 \n", "47 20556798 Myocardial infarction 160886 \n", "48 19375577 Amyotrophic lateral sclerosis 160844 \n", "49 234806 Diazepam 158085 \n", "\n", " daily_average_pageviews \n", "0 42172.7188 \n", "1 17092.7812 \n", "2 12135.6250 \n", "3 11888.1562 \n", "4 11602.8125 \n", "5 10517.0938 \n", "6 9978.9375 \n", "7 9829.4375 \n", "8 9626.7812 \n", "9 9540.0000 \n", "10 9130.5938 \n", "11 9077.7500 \n", "12 8143.7500 \n", "13 7658.8125 \n", "14 7531.6875 \n", "15 7502.9062 \n", "16 7481.7188 \n", "17 7182.3750 \n", "18 7074.4688 \n", "19 6965.9375 \n", "20 6929.5000 \n", "21 6462.7812 \n", "22 6439.7812 \n", "23 6212.1875 \n", "24 6175.4375 \n", "25 6140.4688 \n", "26 6120.0938 \n", "27 6114.9688 \n", "28 6001.6250 \n", "29 5918.7500 \n", "30 5814.3750 \n", "31 5747.6875 \n", "32 5708.7812 \n", "33 5679.8125 \n", "34 5624.3750 \n", "35 5492.2812 \n", "36 5487.5312 \n", "37 5486.1875 \n", "38 5466.8438 \n", "39 5428.0312 \n", "40 5400.6562 \n", "41 5383.4375 \n", "42 5285.1250 \n", "43 5201.0938 \n", "44 5147.7188 \n", "45 5130.6250 \n", "46 5089.3125 \n", "47 5027.6875 \n", "48 5026.3750 \n", "49 4940.1562 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# most visited WP:M pages with external links\n", "top_wpm_query = \"\"\"\n", "SELECT pageview_hourly.page_id, population_page_titles_20190420.page_title, SUM(pageview_hourly.view_count) AS total_pageviews, SUM(pageview_hourly.view_count)/{} AS daily_average_pageviews\n", "FROM wmf.pageview_hourly\n", "LEFT JOIN ryanmax.population_page_titles_20190420 ON pageview_hourly.page_id = population_page_titles_20190420.page_id\n", "WHERE project = 'en.wikipedia'\n", "AND pageview_hourly.page_id IN\n", " (SELECT DISTINCT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks)\n", "AND agent_type = 'user'\n", "AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'\n", "AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'\n", "GROUP BY pageview_hourly.page_id, population_page_titles_20190420.page_title \n", "ORDER BY total_pageviews DESC\n", "LIMIT 50\n", "\"\"\"\n", "\n", "top_wpm = spark.sql(top_wpm_query.format(days_in_study,start_date_string, end_date_string))\n", "top_wpm.limit(50).toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark - YARN", "language": "python", "name": "spark_yarn_pyspark" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }