{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# WP:M Page Classes/Categories" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n", "%run -i 'data-defaults.py'\n", "# pandas float formatting\n", "pd.options.display.float_format = '{:.4f}'.format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Total event count for each WP:M class (FA, A, GA, B, C, Start, Stub) by event type" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th>action</th>\n", " <th>extClick</th>\n", " <th>fnClick</th>\n", " <th>fnHover</th>\n", " <th>upClick</th>\n", " </tr>\n", " <tr>\n", " <th>category</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>B-Class_medicine_articles</th>\n", " <td>197327.0000</td>\n", " <td>332716.0000</td>\n", " <td>608502.0000</td>\n", " <td>19064.0000</td>\n", " </tr>\n", " <tr>\n", " <th>C-Class_medicine_articles</th>\n", " <td>212225.0000</td>\n", " <td>238659.0000</td>\n", " <td>413065.0000</td>\n", " <td>12359.0000</td>\n", " </tr>\n", " <tr>\n", " <th>Category-Class_medicine_articles</th>\n", " <td>12.0000</td>\n", " <td>6.0000</td>\n", " <td>13.0000</td>\n", " <td>nan</td>\n", " </tr>\n", " <tr>\n", " <th>Disambig-Class_medicine_articles</th>\n", " <td>164.0000</td>\n", " <td>27.0000</td>\n", " <td>29.0000</td>\n", " <td>2.0000</td>\n", " </tr>\n", " <tr>\n", " <th>FA-Class_medicine_articles</th>\n", " <td>14235.0000</td>\n", " <td>29790.0000</td>\n", " <td>70200.0000</td>\n", " <td>2152.0000</td>\n", " </tr>\n", " <tr>\n", " <th>FL-Class_medicine_articles</th>\n", " <td>425.0000</td>\n", " <td>911.0000</td>\n", " <td>1260.0000</td>\n", " <td>40.0000</td>\n", " </tr>\n", " <tr>\n", " <th>GA-Class_medicine_articles</th>\n", " <td>26627.0000</td>\n", " <td>65354.0000</td>\n", " <td>139347.0000</td>\n", " <td>8036.0000</td>\n", " </tr>\n", " <tr>\n", " <th>List-Class_medicine_articles</th>\n", " <td>12431.0000</td>\n", " <td>10669.0000</td>\n", " <td>18497.0000</td>\n", " <td>187.0000</td>\n", " </tr>\n", " <tr>\n", " <th>Redirect-Class_medicine_articles</th>\n", " <td>148.0000</td>\n", " <td>98.0000</td>\n", " <td>106.0000</td>\n", " <td>2.0000</td>\n", " </tr>\n", " <tr>\n", " <th>Start-Class_medicine_articles</th>\n", " <td>291457.0000</td>\n", " <td>190972.0000</td>\n", " <td>287123.0000</td>\n", " <td>6710.0000</td>\n", " </tr>\n", " <tr>\n", " <th>Stub-Class_medicine_articles</th>\n", " <td>70427.0000</td>\n", " <td>17996.0000</td>\n", " <td>21208.0000</td>\n", " <td>700.0000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "action extClick fnClick fnHover \\\n", "category \n", "B-Class_medicine_articles 197327.0000 332716.0000 608502.0000 \n", "C-Class_medicine_articles 212225.0000 238659.0000 413065.0000 \n", "Category-Class_medicine_articles 12.0000 6.0000 13.0000 \n", "Disambig-Class_medicine_articles 164.0000 27.0000 29.0000 \n", "FA-Class_medicine_articles 14235.0000 29790.0000 70200.0000 \n", "FL-Class_medicine_articles 425.0000 911.0000 1260.0000 \n", "GA-Class_medicine_articles 26627.0000 65354.0000 139347.0000 \n", "List-Class_medicine_articles 12431.0000 10669.0000 18497.0000 \n", "Redirect-Class_medicine_articles 148.0000 98.0000 106.0000 \n", "Start-Class_medicine_articles 291457.0000 190972.0000 287123.0000 \n", "Stub-Class_medicine_articles 70427.0000 17996.0000 21208.0000 \n", "\n", "action upClick \n", "category \n", "B-Class_medicine_articles 19064.0000 \n", "C-Class_medicine_articles 12359.0000 \n", "Category-Class_medicine_articles nan \n", "Disambig-Class_medicine_articles 2.0000 \n", "FA-Class_medicine_articles 2152.0000 \n", "FL-Class_medicine_articles 40.0000 \n", "GA-Class_medicine_articles 8036.0000 \n", "List-Class_medicine_articles 187.0000 \n", "Redirect-Class_medicine_articles 2.0000 \n", "Start-Class_medicine_articles 6710.0000 \n", "Stub-Class_medicine_articles 700.0000 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total event count for each WP:M class (FA, A, GA, B, C, Start, Stub) by event type\n", "pm_category_events_query = \"\"\"\n", "SELECT projmed_categories.category, action, count(*) count\n", "FROM \n", " citationusage, \n", " (SELECT DISTINCT page_id, category \n", " FROM ryanmax.projmed_categories \n", " WHERE projmed_categories.category LIKE '%Class_medicine_articles%'\n", " AND to_date(projmed_categories.dt) = '2019-04-20'\n", " )\n", " AS projmed_categories\n", "WHERE citationusage.page_id = projmed_categories.page_id\n", " AND wiki = 'enwiki'\n", " AND citationusage.page_id IN \n", " (SELECT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", " {}\n", " AND to_date(event_time) >= '{}'\n", " AND to_date(event_time) <= '{}'\n", " AND useragent_is_bot = FALSE\n", "GROUP BY projmed_categories.category, action\n", "ORDER BY projmed_categories.category, action\n", "\"\"\"\n", "\n", "pm_category_events = spark.sql(\n", " pm_category_events_query.format(\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "pm_category_events_rdd = pm_category_events.rdd\n", "pm_category_events_df = sqlContext.createDataFrame(pm_category_events_rdd)\n", "pm_category_events_pandas = pm_category_events_df.toPandas()\n", "pm_category_events_pandas.pivot(index='category', columns='action', values='count')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Daily average event counts for each WP:M class (FA, A, GA, B, C, Start, Stub)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Totals" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr>\n", " <th></th>\n", " <th colspan=\"2\" halign=\"left\">count</th>\n", " </tr>\n", " <tr>\n", " <th></th>\n", " <th>total_events</th>\n", " <th>daily_average</th>\n", " </tr>\n", " <tr>\n", " <th>category</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>B-Class_medicine_articles</th>\n", " <td>1157609</td>\n", " <td>36175.2812</td>\n", " </tr>\n", " <tr>\n", " <th>C-Class_medicine_articles</th>\n", " <td>876308</td>\n", " <td>27384.6250</td>\n", " </tr>\n", " <tr>\n", " <th>Category-Class_medicine_articles</th>\n", " <td>31</td>\n", " <td>0.9688</td>\n", " </tr>\n", " <tr>\n", " <th>Disambig-Class_medicine_articles</th>\n", " <td>222</td>\n", " <td>6.9375</td>\n", " </tr>\n", " <tr>\n", " <th>FA-Class_medicine_articles</th>\n", " <td>116377</td>\n", " <td>3636.7812</td>\n", " </tr>\n", " <tr>\n", " <th>FL-Class_medicine_articles</th>\n", " <td>2636</td>\n", " <td>82.3750</td>\n", " </tr>\n", " <tr>\n", " <th>GA-Class_medicine_articles</th>\n", " <td>239364</td>\n", " <td>7480.1250</td>\n", " </tr>\n", " <tr>\n", " <th>List-Class_medicine_articles</th>\n", " <td>41784</td>\n", " <td>1305.7500</td>\n", " </tr>\n", " <tr>\n", " <th>Redirect-Class_medicine_articles</th>\n", " <td>354</td>\n", " <td>11.0625</td>\n", " </tr>\n", " <tr>\n", " <th>Start-Class_medicine_articles</th>\n", " <td>776262</td>\n", " <td>24258.1875</td>\n", " </tr>\n", " <tr>\n", " <th>Stub-Class_medicine_articles</th>\n", " <td>110331</td>\n", " <td>3447.8438</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " count \n", " total_events daily_average\n", "category \n", "B-Class_medicine_articles 1157609 36175.2812\n", "C-Class_medicine_articles 876308 27384.6250\n", "Category-Class_medicine_articles 31 0.9688\n", "Disambig-Class_medicine_articles 222 6.9375\n", "FA-Class_medicine_articles 116377 3636.7812\n", "FL-Class_medicine_articles 2636 82.3750\n", "GA-Class_medicine_articles 239364 7480.1250\n", "List-Class_medicine_articles 41784 1305.7500\n", "Redirect-Class_medicine_articles 354 11.0625\n", "Start-Class_medicine_articles 776262 24258.1875\n", "Stub-Class_medicine_articles 110331 3447.8438" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pm_category_events_pandas.groupby(['category']).agg(\n", " [\n", " ('total_events','sum'),\n", " ('daily_average', lambda x: sum(x)/days_in_study)\n", " ])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### By event type" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th>action</th>\n", " <th>extClick</th>\n", " <th>fnClick</th>\n", " <th>fnHover</th>\n", " <th>upClick</th>\n", " </tr>\n", " <tr>\n", " <th>category</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>B-Class_medicine_articles</th>\n", " <td>6166.4688</td>\n", " <td>10397.3750</td>\n", " <td>19015.6875</td>\n", " <td>595.7500</td>\n", " </tr>\n", " <tr>\n", " <th>C-Class_medicine_articles</th>\n", " <td>6632.0312</td>\n", " <td>7458.0938</td>\n", " <td>12908.2812</td>\n", " <td>386.2188</td>\n", " </tr>\n", " <tr>\n", " <th>Category-Class_medicine_articles</th>\n", " <td>0.3750</td>\n", " <td>0.1875</td>\n", " <td>0.4062</td>\n", " <td>nan</td>\n", " </tr>\n", " <tr>\n", " <th>Disambig-Class_medicine_articles</th>\n", " <td>5.1250</td>\n", " <td>0.8438</td>\n", " <td>0.9062</td>\n", " <td>0.0625</td>\n", " </tr>\n", " <tr>\n", " <th>FA-Class_medicine_articles</th>\n", " <td>444.8438</td>\n", " <td>930.9375</td>\n", " <td>2193.7500</td>\n", " <td>67.2500</td>\n", " </tr>\n", " <tr>\n", " <th>FL-Class_medicine_articles</th>\n", " <td>13.2812</td>\n", " <td>28.4688</td>\n", " <td>39.3750</td>\n", " <td>1.2500</td>\n", " </tr>\n", " <tr>\n", " <th>GA-Class_medicine_articles</th>\n", " <td>832.0938</td>\n", " <td>2042.3125</td>\n", " <td>4354.5938</td>\n", " <td>251.1250</td>\n", " </tr>\n", " <tr>\n", " <th>List-Class_medicine_articles</th>\n", " <td>388.4688</td>\n", " <td>333.4062</td>\n", " <td>578.0312</td>\n", " <td>5.8438</td>\n", " </tr>\n", " <tr>\n", " <th>Redirect-Class_medicine_articles</th>\n", " <td>4.6250</td>\n", " <td>3.0625</td>\n", " <td>3.3125</td>\n", " <td>0.0625</td>\n", " </tr>\n", " <tr>\n", " <th>Start-Class_medicine_articles</th>\n", " <td>9108.0312</td>\n", " <td>5967.8750</td>\n", " <td>8972.5938</td>\n", " <td>209.6875</td>\n", " </tr>\n", " <tr>\n", " <th>Stub-Class_medicine_articles</th>\n", " <td>2200.8438</td>\n", " <td>562.3750</td>\n", " <td>662.7500</td>\n", " <td>21.8750</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "action extClick fnClick fnHover upClick\n", "category \n", "B-Class_medicine_articles 6166.4688 10397.3750 19015.6875 595.7500\n", "C-Class_medicine_articles 6632.0312 7458.0938 12908.2812 386.2188\n", "Category-Class_medicine_articles 0.3750 0.1875 0.4062 nan\n", "Disambig-Class_medicine_articles 5.1250 0.8438 0.9062 0.0625\n", "FA-Class_medicine_articles 444.8438 930.9375 2193.7500 67.2500\n", "FL-Class_medicine_articles 13.2812 28.4688 39.3750 1.2500\n", "GA-Class_medicine_articles 832.0938 2042.3125 4354.5938 251.1250\n", "List-Class_medicine_articles 388.4688 333.4062 578.0312 5.8438\n", "Redirect-Class_medicine_articles 4.6250 3.0625 3.3125 0.0625\n", "Start-Class_medicine_articles 9108.0312 5967.8750 8972.5938 209.6875\n", "Stub-Class_medicine_articles 2200.8438 562.3750 662.7500 21.8750" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pm_category_events_pandas['average'] = pm_category_events_pandas['count'].map(lambda x: x/days_in_study)\n", "pm_category_events_pandas.pivot(index='category', columns='action', values='average')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Count of pages with external links for each WP:M class (FA, A, GA, B, C, Start, Stub)\n", "**numbers will not match [1] because we're limiting to namespace 0 pages with external links**\n", "\n", "[1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>category</th>\n", " <th>pages_w_links</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Start-Class_medicine_articles</td>\n", " <td>14484</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Stub-Class_medicine_articles</td>\n", " <td>9765</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>C-Class_medicine_articles</td>\n", " <td>5332</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>B-Class_medicine_articles</td>\n", " <td>2173</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>List-Class_medicine_articles</td>\n", " <td>452</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>GA-Class_medicine_articles</td>\n", " <td>240</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>FA-Class_medicine_articles</td>\n", " <td>62</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>Disambig-Class_medicine_articles</td>\n", " <td>17</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>Redirect-Class_medicine_articles</td>\n", " <td>13</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>FL-Class_medicine_articles</td>\n", " <td>12</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>Category-Class_medicine_articles</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " category pages_w_links\n", "0 Start-Class_medicine_articles 14484\n", "1 Stub-Class_medicine_articles 9765\n", "2 C-Class_medicine_articles 5332\n", "3 B-Class_medicine_articles 2173\n", "4 List-Class_medicine_articles 452\n", "5 GA-Class_medicine_articles 240\n", "6 FA-Class_medicine_articles 62\n", "7 Disambig-Class_medicine_articles 17\n", "8 Redirect-Class_medicine_articles 13\n", "9 FL-Class_medicine_articles 12\n", "10 Category-Class_medicine_articles 1" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of pages with external links for each WP:M class (FA, A, GA, B, C, Start, Stub)\n", "# numbers will not match [1] because we're limiting to namespace 0 pages with external links\n", "# [1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics\n", "pm_category_pages = \"\"\"\n", "SELECT category, COUNT(DISTINCT page_id) AS pages_w_links \n", "FROM ryanmax.projmed_categories \n", "WHERE category LIKE '%Class_medicine_articles%'\n", "AND to_date(projmed_categories.dt) = '2019-04-20'\n", "AND page_id IN \n", " (SELECT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", "GROUP BY category\n", "ORDER BY COUNT(*) DESC\n", "\"\"\"\n", "pm_cat_counts = spark.sql(pm_category_pages)\n", "cats = sqlContext.createDataFrame(pm_cat_counts.rdd)\n", "cats.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pageloads for each WP:M class (FA, A, GA, B, C, Start, Stub)\n", "**numbers will be higher than overall WP:M pageloads since one page may have more than one category**" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>category</th>\n", " <th>total_pageloads</th>\n", " <th>daily_average</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>B-Class_medicine_articles</td>\n", " <td>57892215</td>\n", " <td>1809131.7188</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>C-Class_medicine_articles</td>\n", " <td>51507495</td>\n", " <td>1609609.2188</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Category-Class_medicine_articles</td>\n", " <td>259</td>\n", " <td>8.0938</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Disambig-Class_medicine_articles</td>\n", " <td>36927</td>\n", " <td>1153.9688</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>FA-Class_medicine_articles</td>\n", " <td>4389844</td>\n", " <td>137182.6250</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>FL-Class_medicine_articles</td>\n", " <td>121691</td>\n", " <td>3802.8438</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>GA-Class_medicine_articles</td>\n", " <td>9350159</td>\n", " <td>292192.4688</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>List-Class_medicine_articles</td>\n", " <td>1801806</td>\n", " <td>56306.4375</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>Redirect-Class_medicine_articles</td>\n", " <td>25308</td>\n", " <td>790.8750</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>Start-Class_medicine_articles</td>\n", " <td>54741621</td>\n", " <td>1710675.6562</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>Stub-Class_medicine_articles</td>\n", " <td>8056357</td>\n", " <td>251761.1562</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " category total_pageloads daily_average\n", "0 B-Class_medicine_articles 57892215 1809131.7188\n", "1 C-Class_medicine_articles 51507495 1609609.2188\n", "2 Category-Class_medicine_articles 259 8.0938\n", "3 Disambig-Class_medicine_articles 36927 1153.9688\n", "4 FA-Class_medicine_articles 4389844 137182.6250\n", "5 FL-Class_medicine_articles 121691 3802.8438\n", "6 GA-Class_medicine_articles 9350159 292192.4688\n", "7 List-Class_medicine_articles 1801806 56306.4375\n", "8 Redirect-Class_medicine_articles 25308 790.8750\n", "9 Start-Class_medicine_articles 54741621 1710675.6562\n", "10 Stub-Class_medicine_articles 8056357 251761.1562" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pageloads for each WP:M class (FA, A, GA, B, C, Start, Stub)\n", "# numbers will be higher than overall WP:M pageloads since one page may have more than one category\n", "pm_category_pageloads_query = \"\"\"\n", "SELECT projmed_categories.category, sum(view_count) AS total_pageloads, sum(view_count)/{} AS daily_average\n", "FROM \n", " wmf.pageview_hourly, \n", " (SELECT DISTINCT page_id, category \n", " FROM ryanmax.projmed_categories \n", " WHERE projmed_categories.category LIKE '%Class_medicine_articles%'\n", " AND to_date(projmed_categories.dt) = '2019-04-20'\n", " ) \n", " AS projmed_categories\n", "WHERE pageview_hourly.page_id = projmed_categories.page_id\n", " AND pageview_hourly.project = 'en.wikipedia'\n", " AND pageview_hourly.agent_type = 'user'\n", " AND pageview_hourly.page_id IN \n", " (SELECT page_id \n", " FROM ryanmax.population_wpm_pages_with_extlinks\n", " )\n", " AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'\n", " AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'\n", " GROUP BY projmed_categories.category\n", " ORDER BY projmed_categories.category\n", "\"\"\"\n", "\n", "pm_category_pageloads = spark.sql(\n", " pm_category_pageloads_query.format(\n", " days_in_study, start_date_string, end_date_string\n", " ))\n", "pm_category_pageloads.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Count of external links for each WP:M class (FA, A, GA, B, C, Start, Stub)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>category</th>\n", " <th>num_extlinks</th>\n", " <th>avg_extlinks_per_page</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>B-Class_medicine_articles</td>\n", " <td>216159</td>\n", " <td>99.4749</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>C-Class_medicine_articles</td>\n", " <td>256660</td>\n", " <td>48.1358</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Category-Class_medicine_articles</td>\n", " <td>135</td>\n", " <td>135.0000</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Disambig-Class_medicine_articles</td>\n", " <td>85</td>\n", " <td>5.0000</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>FA-Class_medicine_articles</td>\n", " <td>13914</td>\n", " <td>224.4194</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>FL-Class_medicine_articles</td>\n", " <td>2588</td>\n", " <td>215.6667</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>GA-Class_medicine_articles</td>\n", " <td>32793</td>\n", " <td>136.6375</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>List-Class_medicine_articles</td>\n", " <td>34091</td>\n", " <td>75.4226</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>Redirect-Class_medicine_articles</td>\n", " <td>119</td>\n", " <td>9.1538</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>Start-Class_medicine_articles</td>\n", " <td>310574</td>\n", " <td>21.4426</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>Stub-Class_medicine_articles</td>\n", " <td>76668</td>\n", " <td>7.8513</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " category num_extlinks avg_extlinks_per_page\n", "0 B-Class_medicine_articles 216159 99.4749\n", "1 C-Class_medicine_articles 256660 48.1358\n", "2 Category-Class_medicine_articles 135 135.0000\n", "3 Disambig-Class_medicine_articles 85 5.0000\n", "4 FA-Class_medicine_articles 13914 224.4194\n", "5 FL-Class_medicine_articles 2588 215.6667\n", "6 GA-Class_medicine_articles 32793 136.6375\n", "7 List-Class_medicine_articles 34091 75.4226\n", "8 Redirect-Class_medicine_articles 119 9.1538\n", "9 Start-Class_medicine_articles 310574 21.4426\n", "10 Stub-Class_medicine_articles 76668 7.8513" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of external links for each WP:M class (FA, A, GA, B, C, Start, Stub)\n", "pm_category_links_query = \"\"\"\n", "SELECT projmed_categories.category, \n", " COUNT(DISTINCT el_from, el_to) AS num_extlinks, \n", " COUNT(DISTINCT el_from, el_to)/COUNT(DISTINCT el_from) as avg_extlinks_per_page\n", "FROM \n", " ryanmax.population_externallinks, \n", " (SELECT DISTINCT page_id, category \n", " FROM ryanmax.projmed_categories \n", " WHERE projmed_categories.category LIKE '%Class_medicine_articles%'\n", " AND to_date(projmed_categories.dt) = '2019-04-20'\n", " ) \n", " AS projmed_categories\n", "WHERE population_externallinks.el_from = projmed_categories.page_id\n", " AND population_externallinks.el_from IN \n", " (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", " GROUP BY projmed_categories.category\n", " ORDER BY projmed_categories.category\n", "\"\"\"\n", "\n", "pm_category_links = spark.sql(pm_category_links_query)\n", "pm_category_links.toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark large: spark-xml jar and local venv path ", "language": "python", "name": "spark-ryanmax" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }