{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# WP:M Page Classes/Categories" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n", "%run -i 'data-defaults.py'\n", "# pandas float formatting\n", "pd.options.display.float_format = '{:.4f}'.format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Total event count for each WP:M class (FA, A, GA, B, C, Start, Stub) by event type" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
actionextClickfnClickfnHoverupClick
category
B-Class_medicine_articles197327.0000332716.0000608502.000019064.0000
C-Class_medicine_articles212225.0000238659.0000413065.000012359.0000
Category-Class_medicine_articles12.00006.000013.0000nan
Disambig-Class_medicine_articles164.000027.000029.00002.0000
FA-Class_medicine_articles14235.000029790.000070200.00002152.0000
FL-Class_medicine_articles425.0000911.00001260.000040.0000
GA-Class_medicine_articles26627.000065354.0000139347.00008036.0000
List-Class_medicine_articles12431.000010669.000018497.0000187.0000
Redirect-Class_medicine_articles148.000098.0000106.00002.0000
Start-Class_medicine_articles291457.0000190972.0000287123.00006710.0000
Stub-Class_medicine_articles70427.000017996.000021208.0000700.0000
\n", "
" ], "text/plain": [ "action extClick fnClick fnHover \\\n", "category \n", "B-Class_medicine_articles 197327.0000 332716.0000 608502.0000 \n", "C-Class_medicine_articles 212225.0000 238659.0000 413065.0000 \n", "Category-Class_medicine_articles 12.0000 6.0000 13.0000 \n", "Disambig-Class_medicine_articles 164.0000 27.0000 29.0000 \n", "FA-Class_medicine_articles 14235.0000 29790.0000 70200.0000 \n", "FL-Class_medicine_articles 425.0000 911.0000 1260.0000 \n", "GA-Class_medicine_articles 26627.0000 65354.0000 139347.0000 \n", "List-Class_medicine_articles 12431.0000 10669.0000 18497.0000 \n", "Redirect-Class_medicine_articles 148.0000 98.0000 106.0000 \n", "Start-Class_medicine_articles 291457.0000 190972.0000 287123.0000 \n", "Stub-Class_medicine_articles 70427.0000 17996.0000 21208.0000 \n", "\n", "action upClick \n", "category \n", "B-Class_medicine_articles 19064.0000 \n", "C-Class_medicine_articles 12359.0000 \n", "Category-Class_medicine_articles nan \n", "Disambig-Class_medicine_articles 2.0000 \n", "FA-Class_medicine_articles 2152.0000 \n", "FL-Class_medicine_articles 40.0000 \n", "GA-Class_medicine_articles 8036.0000 \n", "List-Class_medicine_articles 187.0000 \n", "Redirect-Class_medicine_articles 2.0000 \n", "Start-Class_medicine_articles 6710.0000 \n", "Stub-Class_medicine_articles 700.0000 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for each WP:M class (FA, A, GA, B, C, Start, Stub) by event type\n", "pm_category_events_query = \"\"\"\n", "SELECT projmed_categories.category, action, count(*) count\n", "FROM \n", " citationusage, \n", " (SELECT DISTINCT page_id, category \n", " FROM ryanmax.projmed_categories \n", " WHERE projmed_categories.category LIKE '%Class_medicine_articles%'\n", " AND to_date(projmed_categories.dt) = '2019-04-20'\n", " )\n", " AS projmed_categories\n", "WHERE citationusage.page_id = projmed_categories.page_id\n", " AND wiki = 'enwiki'\n", " AND citationusage.page_id IN \n", " (SELECT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", " {}\n", " AND to_date(event_time) >= '{}'\n", " AND to_date(event_time) <= '{}'\n", " AND useragent_is_bot = FALSE\n", "GROUP BY projmed_categories.category, action\n", "ORDER BY projmed_categories.category, action\n", "\"\"\"\n", "\n", "pm_category_events = spark.sql(\n", " pm_category_events_query.format(\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "pm_category_events_rdd = pm_category_events.rdd\n", "pm_category_events_df = sqlContext.createDataFrame(pm_category_events_rdd)\n", "pm_category_events_pandas = pm_category_events_df.toPandas()\n", "pm_category_events_pandas.pivot(index='category', columns='action', values='count')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Daily average event counts for each WP:M class (FA, A, GA, B, C, Start, Stub)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Totals" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
total_eventsdaily_average
category
B-Class_medicine_articles115760936175.2812
C-Class_medicine_articles87630827384.6250
Category-Class_medicine_articles310.9688
Disambig-Class_medicine_articles2226.9375
FA-Class_medicine_articles1163773636.7812
FL-Class_medicine_articles263682.3750
GA-Class_medicine_articles2393647480.1250
List-Class_medicine_articles417841305.7500
Redirect-Class_medicine_articles35411.0625
Start-Class_medicine_articles77626224258.1875
Stub-Class_medicine_articles1103313447.8438
\n", "
" ], "text/plain": [ " count \n", " total_events daily_average\n", "category \n", "B-Class_medicine_articles 1157609 36175.2812\n", "C-Class_medicine_articles 876308 27384.6250\n", "Category-Class_medicine_articles 31 0.9688\n", "Disambig-Class_medicine_articles 222 6.9375\n", "FA-Class_medicine_articles 116377 3636.7812\n", "FL-Class_medicine_articles 2636 82.3750\n", "GA-Class_medicine_articles 239364 7480.1250\n", "List-Class_medicine_articles 41784 1305.7500\n", "Redirect-Class_medicine_articles 354 11.0625\n", "Start-Class_medicine_articles 776262 24258.1875\n", "Stub-Class_medicine_articles 110331 3447.8438" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pm_category_events_pandas.groupby(['category']).agg(\n", " [\n", " ('total_events','sum'),\n", " ('daily_average', lambda x: sum(x)/days_in_study)\n", " ])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### By event type" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
actionextClickfnClickfnHoverupClick
category
B-Class_medicine_articles6166.468810397.375019015.6875595.7500
C-Class_medicine_articles6632.03127458.093812908.2812386.2188
Category-Class_medicine_articles0.37500.18750.4062nan
Disambig-Class_medicine_articles5.12500.84380.90620.0625
FA-Class_medicine_articles444.8438930.93752193.750067.2500
FL-Class_medicine_articles13.281228.468839.37501.2500
GA-Class_medicine_articles832.09382042.31254354.5938251.1250
List-Class_medicine_articles388.4688333.4062578.03125.8438
Redirect-Class_medicine_articles4.62503.06253.31250.0625
Start-Class_medicine_articles9108.03125967.87508972.5938209.6875
Stub-Class_medicine_articles2200.8438562.3750662.750021.8750
\n", "
" ], "text/plain": [ "action extClick fnClick fnHover upClick\n", "category \n", "B-Class_medicine_articles 6166.4688 10397.3750 19015.6875 595.7500\n", "C-Class_medicine_articles 6632.0312 7458.0938 12908.2812 386.2188\n", "Category-Class_medicine_articles 0.3750 0.1875 0.4062 nan\n", "Disambig-Class_medicine_articles 5.1250 0.8438 0.9062 0.0625\n", "FA-Class_medicine_articles 444.8438 930.9375 2193.7500 67.2500\n", "FL-Class_medicine_articles 13.2812 28.4688 39.3750 1.2500\n", "GA-Class_medicine_articles 832.0938 2042.3125 4354.5938 251.1250\n", "List-Class_medicine_articles 388.4688 333.4062 578.0312 5.8438\n", "Redirect-Class_medicine_articles 4.6250 3.0625 3.3125 0.0625\n", "Start-Class_medicine_articles 9108.0312 5967.8750 8972.5938 209.6875\n", "Stub-Class_medicine_articles 2200.8438 562.3750 662.7500 21.8750" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pm_category_events_pandas['average'] = pm_category_events_pandas['count'].map(lambda x: x/days_in_study)\n", "pm_category_events_pandas.pivot(index='category', columns='action', values='average')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Count of pages with external links for each WP:M class (FA, A, GA, B, C, Start, Stub)\n", "**numbers will not match [1] because we're limiting to namespace 0 pages with external links**\n", "\n", "[1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
categorypages_w_links
0Start-Class_medicine_articles14484
1Stub-Class_medicine_articles9765
2C-Class_medicine_articles5332
3B-Class_medicine_articles2173
4List-Class_medicine_articles452
5GA-Class_medicine_articles240
6FA-Class_medicine_articles62
7Disambig-Class_medicine_articles17
8Redirect-Class_medicine_articles13
9FL-Class_medicine_articles12
10Category-Class_medicine_articles1
\n", "
" ], "text/plain": [ " category pages_w_links\n", "0 Start-Class_medicine_articles 14484\n", "1 Stub-Class_medicine_articles 9765\n", "2 C-Class_medicine_articles 5332\n", "3 B-Class_medicine_articles 2173\n", "4 List-Class_medicine_articles 452\n", "5 GA-Class_medicine_articles 240\n", "6 FA-Class_medicine_articles 62\n", "7 Disambig-Class_medicine_articles 17\n", "8 Redirect-Class_medicine_articles 13\n", "9 FL-Class_medicine_articles 12\n", "10 Category-Class_medicine_articles 1" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of pages with external links for each WP:M class (FA, A, GA, B, C, Start, Stub)\n", "# numbers will not match [1] because we're limiting to namespace 0 pages with external links\n", "# [1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics\n", "pm_category_pages = \"\"\"\n", "SELECT category, COUNT(DISTINCT page_id) AS pages_w_links \n", "FROM ryanmax.projmed_categories \n", "WHERE category LIKE '%Class_medicine_articles%'\n", "AND to_date(projmed_categories.dt) = '2019-04-20'\n", "AND page_id IN \n", " (SELECT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "GROUP BY category\n", "ORDER BY COUNT(*) DESC\n", "\"\"\"\n", "pm_cat_counts = spark.sql(pm_category_pages)\n", "cats = sqlContext.createDataFrame(pm_cat_counts.rdd)\n", "cats.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pageloads for each WP:M class (FA, A, GA, B, C, Start, Stub)\n", "**numbers will be higher than overall WP:M pageloads since one page may have more than one category**" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
categorytotal_pageloadsdaily_average
0B-Class_medicine_articles578922151809131.7188
1C-Class_medicine_articles515074951609609.2188
2Category-Class_medicine_articles2598.0938
3Disambig-Class_medicine_articles369271153.9688
4FA-Class_medicine_articles4389844137182.6250
5FL-Class_medicine_articles1216913802.8438
6GA-Class_medicine_articles9350159292192.4688
7List-Class_medicine_articles180180656306.4375
8Redirect-Class_medicine_articles25308790.8750
9Start-Class_medicine_articles547416211710675.6562
10Stub-Class_medicine_articles8056357251761.1562
\n", "
" ], "text/plain": [ " category total_pageloads daily_average\n", "0 B-Class_medicine_articles 57892215 1809131.7188\n", "1 C-Class_medicine_articles 51507495 1609609.2188\n", "2 Category-Class_medicine_articles 259 8.0938\n", "3 Disambig-Class_medicine_articles 36927 1153.9688\n", "4 FA-Class_medicine_articles 4389844 137182.6250\n", "5 FL-Class_medicine_articles 121691 3802.8438\n", "6 GA-Class_medicine_articles 9350159 292192.4688\n", "7 List-Class_medicine_articles 1801806 56306.4375\n", "8 Redirect-Class_medicine_articles 25308 790.8750\n", "9 Start-Class_medicine_articles 54741621 1710675.6562\n", "10 Stub-Class_medicine_articles 8056357 251761.1562" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pageloads for each WP:M class (FA, A, GA, B, C, Start, Stub)\n", "# numbers will be higher than overall WP:M pageloads since one page may have more than one category\n", "pm_category_pageloads_query = \"\"\"\n", "SELECT projmed_categories.category, sum(view_count) AS total_pageloads, sum(view_count)/{} AS daily_average\n", "FROM \n", " wmf.pageview_hourly, \n", " (SELECT DISTINCT page_id, category \n", " FROM ryanmax.projmed_categories \n", " WHERE projmed_categories.category LIKE '%Class_medicine_articles%'\n", " AND to_date(projmed_categories.dt) = '2019-04-20'\n", " ) \n", " AS projmed_categories\n", "WHERE pageview_hourly.page_id = projmed_categories.page_id\n", " AND pageview_hourly.project = 'en.wikipedia'\n", " AND pageview_hourly.agent_type = 'user'\n", " AND pageview_hourly.page_id IN \n", " (SELECT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", " AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'\n", " AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'\n", " GROUP BY projmed_categories.category\n", " ORDER BY projmed_categories.category\n", "\"\"\"\n", "\n", "pm_category_pageloads = spark.sql(\n", " pm_category_pageloads_query.format(\n", " days_in_study, start_date_string, end_date_string\n", " ))\n", "pm_category_pageloads.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Count of external links for each WP:M class (FA, A, GA, B, C, Start, Stub)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
categorynum_extlinksavg_extlinks_per_page
0B-Class_medicine_articles21615999.4749
1C-Class_medicine_articles25666048.1358
2Category-Class_medicine_articles135135.0000
3Disambig-Class_medicine_articles855.0000
4FA-Class_medicine_articles13914224.4194
5FL-Class_medicine_articles2588215.6667
6GA-Class_medicine_articles32793136.6375
7List-Class_medicine_articles3409175.4226
8Redirect-Class_medicine_articles1199.1538
9Start-Class_medicine_articles31057421.4426
10Stub-Class_medicine_articles766687.8513
\n", "
" ], "text/plain": [ " category num_extlinks avg_extlinks_per_page\n", "0 B-Class_medicine_articles 216159 99.4749\n", "1 C-Class_medicine_articles 256660 48.1358\n", "2 Category-Class_medicine_articles 135 135.0000\n", "3 Disambig-Class_medicine_articles 85 5.0000\n", "4 FA-Class_medicine_articles 13914 224.4194\n", "5 FL-Class_medicine_articles 2588 215.6667\n", "6 GA-Class_medicine_articles 32793 136.6375\n", "7 List-Class_medicine_articles 34091 75.4226\n", "8 Redirect-Class_medicine_articles 119 9.1538\n", "9 Start-Class_medicine_articles 310574 21.4426\n", "10 Stub-Class_medicine_articles 76668 7.8513" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of external links for each WP:M class (FA, A, GA, B, C, Start, Stub)\n", "pm_category_links_query = \"\"\"\n", "SELECT projmed_categories.category, \n", " COUNT(DISTINCT el_from, el_to) AS num_extlinks, \n", " COUNT(DISTINCT el_from, el_to)/COUNT(DISTINCT el_from) as avg_extlinks_per_page\n", "FROM \n", " ryanmax.population_externallinks, \n", " (SELECT DISTINCT page_id, category \n", " FROM ryanmax.projmed_categories \n", " WHERE projmed_categories.category LIKE '%Class_medicine_articles%'\n", " AND to_date(projmed_categories.dt) = '2019-04-20'\n", " ) \n", " AS projmed_categories\n", "WHERE population_externallinks.el_from = projmed_categories.page_id\n", " AND population_externallinks.el_from IN \n", " (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", " GROUP BY projmed_categories.category\n", " ORDER BY projmed_categories.category\n", "\"\"\"\n", "\n", "pm_category_links = spark.sql(pm_category_links_query)\n", "pm_category_links.toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark large: spark-xml jar and local venv path ", "language": "python", "name": "spark-ryanmax" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }