{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# English Wikipedia Page Views by Topics in September & August 2019\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "https://phabricator.wikimedia.org/T234839\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this analysis, we use Adam's topic dataset of articles with \"best\" topic prediction for pages accessed in September 2019 (see [example](https://dr0ptp4kt.github.io/topics-7.html) of the first 10K non-randomized rows for an HTML view). \n", "\n", "The outcome topics are from the \"predicted\" field, which is the post-enrichment best guess for the articles.\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "The raw code for this notebook is by default hidden for easier reading.\n", "
\n" ], "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from IPython.display import HTML\n", "\n", "HTML('''\n", "The raw code for this notebook is by default hidden for easier reading.\n", "
\n", "''')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata\n" ] } ], "source": [ "import requests\n", "import pandas as pd\n", "import json\n", "import matplotlib.pyplot as plt\n", "import gzip\n", "from wmfdata import hive\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Read the gzipped, tab-separated topic prediction file; the first row holds the column names.\n", "topic = pd.read_csv('topic_prediction.tsv.gz', sep='\\t',compression='gzip', header=0)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Hive query template: summed view_count per page_id for one month of one project.\n", "# Restricted to namespace 0 and agent_type user; IE 7 traffic from PK, IR and AF is excluded.\n", "# Placeholders filled with str.format: year, month, wiki.\n", "pageview_query = '''\n", " SELECT \n", " CONCAT(year,\"-\",month,\"-01\") AS date,\n", " page_id, \n", " SUM(view_count) AS pageviews\n", " FROM \n", " wmf.pageview_hourly\n", " WHERE year = \"{year}\"\n", " AND month = \"{month}\" \n", " AND project = \"{wiki}\"\n", " AND namespace_id = 0\n", " AND agent_type = \"user\"\n", " AND NOT (\n", " country_code IN (\"PK\", \"IR\", \"AF\") \n", " AND user_agent_map[\"browser_family\"] = \"IE\" AND user_agent_map[\"browser_major\"] = 7\n", " )\n", " GROUP BY CONCAT(year,\"-\",month,\"-01\"), page_id\n", "'''" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## September 2019 Topic Analysis " ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Run the monthly pageview query for en.wikipedia, September 2019.\n", "enwiki_pv_sept_all = hive.run([\n", " \"SET mapreduce.map.memory.mb=4096\", \n", " pageview_query.format(\n", " year = 2019,\n", " month = 9,\n", " wiki = \"en.wikipedia\")\n", "])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Share of the month's total pageviews contributed by each page.\n", "enwiki_pv_sept_all['proportion']= enwiki_pv_sept_all['pageviews']/enwiki_pv_sept_all['pageviews'].sum()\n", "# Sort most-viewed first. The result must be bound back to enwiki_pv_sept_all,\n", "# otherwise the sorted frame is discarded and the original stays unsorted.\n", "enwiki_pv_sept_all = enwiki_pv_sept_all.sort_values(by='pageviews', ascending=False)" ] }, { "cell_type": "code", "execution_count": 7, 
"metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepage_idpageviewsproportion
41569742019-9-01NaN480390.000007
\n", "
" ], "text/plain": [ " date page_id pageviews proportion\n", "4156974 2019-9-01 NaN 48039 0.000007" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows whose page_id is missing; the boolean mask must come from the same frame it indexes.\n", "enwiki_pv_sept_all[enwiki_pv_sept_all.page_id.isnull()]" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total page views: 7198970305\n" ] } ], "source": [ "print('Total page views in September: ' + str(enwiki_pv_sept_all.pageviews.sum()))" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of unqiue pages: 8043636\n" ] } ], "source": [ "# One row per page_id, so the row count is the number of distinct pages viewed.\n", "print('Number of unique pages in September: ' + str(enwiki_pv_sept_all.shape[0]))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Top 1M pages account for 91.8% of total page views.\n" ] } ], "source": [ "# nlargest selects the 1M highest shares regardless of the frame's current row order.\n", "print('Top 1M pages account for ' + str(round(enwiki_pv_sept_all.proportion.nlargest(1000000).sum() * 100,2)) + '% of total page views in September.')" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "# Hive query template: monthly pageviews per page_id (CTE v) left-joined to the titles of\n", "# non-redirect, namespace-0 enwiki pages from the given mediawiki_page snapshot (CTE p).\n", "# Placeholders filled with str.format: year, month, wiki, snapshot.\n", "# NOTE(review): v applies LIMIT 10000000 without an ORDER BY, so a month with more distinct\n", "# page_ids than that would lose an unspecified subset of pages - confirm this is intended.\n", "pageview_title_query = '''\n", "WITH v AS (\n", " SELECT page_id, SUM(view_count) AS pageviews\n", " FROM wmf.pageview_hourly\n", " WHERE year = \"{year}\"\n", " AND month = \"{month}\" \n", " AND project = \"{wiki}\"\n", " AND namespace_id = 0\n", " AND agent_type = \"user\"\n", " AND NOT (\n", " country_code IN (\"PK\", \"IR\", \"AF\") AND user_agent_map[\"browser_family\"] = \"IE\" AND user_agent_map[\"browser_major\"] = 7\n", " )\n", " GROUP BY page_id\n", " LIMIT 10000000\n", "), p AS (\n", " SELECT page_id, page_title, page_latest\n", " FROM wmf_raw.mediawiki_page\n", " WHERE wiki_db = \"enwiki\"\n", " AND snapshot = \"{snapshot}\"\n", " AND page_id IS NOT NULL\n", " AND page_namespace = 0\n", " AND NOT page_is_redirect\n", ")\n", "\n", 
"SELECT v.page_id, p.page_title, v.pageviews\n", "FROM v LEFT JOIN p ON v.page_id=p.page_id\n", "'''" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "# September 2019 pageviews with page titles taken from the 2019-09 page snapshot.\n", "enwiki_pv_sept = hive.run([\n", " \"SET mapreduce.map.memory.mb=4096\", \n", " pageview_title_query.format(\n", " year = 2019,\n", " month = 9,\n", " wiki = \"en.wikipedia\",\n", " snapshot = \"2019-09\")\n", "])" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "# Share of total pageviews per page, then order the frame most-viewed first.\n", "enwiki_pv_sept['proportion']= enwiki_pv_sept['pageviews']/enwiki_pv_sept['pageviews'].sum()\n", "enwiki_pv_sept = enwiki_pv_sept.sort_values(by='pageviews', ascending=False)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "# Left join keeps pages that have no row in the topic file; their topic columns are NaN.\n", "enwiki_pv_topic_sept = enwiki_pv_sept.merge(topic, how = 'left', on = 'page_id')" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "# Label pages without a prediction as Unknown, then recompute shares over the merged frame.\n", "enwiki_pv_topic_sept['predicted'] = enwiki_pv_topic_sept['predicted'].fillna(value='Unknown')\n", "enwiki_pv_topic_sept['proportion']= enwiki_pv_topic_sept['pageviews']/enwiki_pv_topic_sept['pageviews'].sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top 50 articles read in September 2019 on English Wikipedia" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The table below shows the top 50 articles viewed on English Wikipedia in September 2019, with the corresponding proportions of total pageviews and the best predicted topic." 
] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "# Top 50 pages by pageviews, re-indexed from 0, limited to the columns shown in the report.\n", "enwiki_page_sept_summary = (\n", "    enwiki_pv_topic_sept[['page_title','pageviews','proportion','predicted']]\n", "    .sort_values(by='pageviews', ascending=False)\n", "    .reset_index(drop=True)\n", "    .head(50)\n", ")\n" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Top 50 articles account for 7.84% of total page views in September.\n" ] } ], "source": [ "# Combined share of the 50 pages above, as a percentage rounded to two decimals.\n", "print(\n", "    'Top 50 articles account for '\n", "    + str(round(enwiki_page_sept_summary['proportion'].sum() * 100,2))\n", "    + '% of total page views in September.'\n", ")" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
page_titlepageviewsproportionpredicted
0Main_Page4733163590.065746Internet culture
1Wikipedia73768330.001025Language and literature
2List_of_Queen_of_the_South_episodes64262430.000893Broadcasting
3It_Chapter_Two37776630.000525Entertainment
4Deaths_in_201932253350.000448Time
5Greta_Thunberg31469050.000437History and society
6Saaho29272780.000407Entertainment
7Joker_(2019_film)28642580.000398Visual arts
8September_11_attacks24358660.000338Politics and government
9Antonio_Brown23592160.000328Sports
10Algorithms_for_calculating_variance22223850.000309Mathematics
11Chandrayaan-221647000.000301Space
12Hustlers_(2019_film)19098210.000265Entertainment
13The_Bahamas17241240.000239The_Bahamas
14Eli_Cohen17189060.000239Military and warfare
15Storm_Area_51,_They_Can't_Stop_All_of_Us17162450.000238Internet culture
16Ad_Astra_(film)17132370.000238Entertainment
176ix9ine15995510.000222Internet culture
18Bianca_Andreescu15940390.000221Sports
19Ric_Ocasek15386880.000214Performing arts
202019_FIBA_Basketball_World_Cup15368040.000213Sports
21Unbelievable_(miniseries)15302940.000213Broadcasting
22Line_shaft15263950.000212Technology
23Billie_Eilish14960200.000208Performing arts
24Mindhunter_(TV_series)14796120.000206Broadcasting
25Solar_System14558470.000202Space
26Freddie_Mercury14491460.000201Performing arts
27Rafael_Nadal14261860.000198Sports
28It_(2017_film)13821000.000192Entertainment
29United_States13700540.000190United_States
30List_of_Bollywood_films_of_201913599210.000189Entertainment
31John_Bercow13424780.000186Politics and government
32Hurricane_Dorian13022090.000181Politics and government
33Peaky_Blinders_(TV_series)12867200.000179Broadcasting
34Elton_John12534400.000174History and society
35Wayne_Williams12388870.000172History and society
36Once_Upon_a_Time_in_Hollywood12297060.000171Entertainment
37Donald_Trump12173320.000169Politics and government
38Joaquin_Phoenix12062880.000168Entertainment
39Moving_average11866360.000165Mathematics
40Apple_Network_Server11578550.000161Technology
41Eddie_Money11519550.000160Performing arts
42Sylvester_Stallone11415400.000159Entertainment
43Judy_Garland11289210.000157History and society
44Elizabeth_II11172370.000155Language and literature
45Atlanta_murders_of_1979–198111162240.000155History and society
46YouTube11142220.000155Entertainment
47Charles_Manson10994060.000153History and society
48Clash_of_Champions_(2019)10975020.000152Entertainment
49Clint_Eastwood10744020.000149Entertainment
\n", "
" ], "text/plain": [ " page_title pageviews proportion \\\n", "0 Main_Page 473316359 0.065746 \n", "1 Wikipedia 7376833 0.001025 \n", "2 List_of_Queen_of_the_South_episodes 6426243 0.000893 \n", "3 It_Chapter_Two 3777663 0.000525 \n", "4 Deaths_in_2019 3225335 0.000448 \n", "5 Greta_Thunberg 3146905 0.000437 \n", "6 Saaho 2927278 0.000407 \n", "7 Joker_(2019_film) 2864258 0.000398 \n", "8 September_11_attacks 2435866 0.000338 \n", "9 Antonio_Brown 2359216 0.000328 \n", "10 Algorithms_for_calculating_variance 2222385 0.000309 \n", "11 Chandrayaan-2 2164700 0.000301 \n", "12 Hustlers_(2019_film) 1909821 0.000265 \n", "13 The_Bahamas 1724124 0.000239 \n", "14 Eli_Cohen 1718906 0.000239 \n", "15 Storm_Area_51,_They_Can't_Stop_All_of_Us 1716245 0.000238 \n", "16 Ad_Astra_(film) 1713237 0.000238 \n", "17 6ix9ine 1599551 0.000222 \n", "18 Bianca_Andreescu 1594039 0.000221 \n", "19 Ric_Ocasek 1538688 0.000214 \n", "20 2019_FIBA_Basketball_World_Cup 1536804 0.000213 \n", "21 Unbelievable_(miniseries) 1530294 0.000213 \n", "22 Line_shaft 1526395 0.000212 \n", "23 Billie_Eilish 1496020 0.000208 \n", "24 Mindhunter_(TV_series) 1479612 0.000206 \n", "25 Solar_System 1455847 0.000202 \n", "26 Freddie_Mercury 1449146 0.000201 \n", "27 Rafael_Nadal 1426186 0.000198 \n", "28 It_(2017_film) 1382100 0.000192 \n", "29 United_States 1370054 0.000190 \n", "30 List_of_Bollywood_films_of_2019 1359921 0.000189 \n", "31 John_Bercow 1342478 0.000186 \n", "32 Hurricane_Dorian 1302209 0.000181 \n", "33 Peaky_Blinders_(TV_series) 1286720 0.000179 \n", "34 Elton_John 1253440 0.000174 \n", "35 Wayne_Williams 1238887 0.000172 \n", "36 Once_Upon_a_Time_in_Hollywood 1229706 0.000171 \n", "37 Donald_Trump 1217332 0.000169 \n", "38 Joaquin_Phoenix 1206288 0.000168 \n", "39 Moving_average 1186636 0.000165 \n", "40 Apple_Network_Server 1157855 0.000161 \n", "41 Eddie_Money 1151955 0.000160 \n", "42 Sylvester_Stallone 1141540 0.000159 \n", "43 Judy_Garland 1128921 0.000157 \n", "44 Elizabeth_II 1117237 
0.000155 \n", "45 Atlanta_murders_of_1979–1981 1116224 0.000155 \n", "46 YouTube 1114222 0.000155 \n", "47 Charles_Manson 1099406 0.000153 \n", "48 Clash_of_Champions_(2019) 1097502 0.000152 \n", "49 Clint_Eastwood 1074402 0.000149 \n", "\n", " predicted \n", "0 Internet culture \n", "1 Language and literature \n", "2 Broadcasting \n", "3 Entertainment \n", "4 Time \n", "5 History and society \n", "6 Entertainment \n", "7 Visual arts \n", "8 Politics and government \n", "9 Sports \n", "10 Mathematics \n", "11 Space \n", "12 Entertainment \n", "13 The_Bahamas \n", "14 Military and warfare \n", "15 Internet culture \n", "16 Entertainment \n", "17 Internet culture \n", "18 Sports \n", "19 Performing arts \n", "20 Sports \n", "21 Broadcasting \n", "22 Technology \n", "23 Performing arts \n", "24 Broadcasting \n", "25 Space \n", "26 Performing arts \n", "27 Sports \n", "28 Entertainment \n", "29 United_States \n", "30 Entertainment \n", "31 Politics and government \n", "32 Politics and government \n", "33 Broadcasting \n", "34 History and society \n", "35 History and society \n", "36 Entertainment \n", "37 Politics and government \n", "38 Entertainment \n", "39 Mathematics \n", "40 Technology \n", "41 Performing arts \n", "42 Entertainment \n", "43 History and society \n", "44 Language and literature \n", "45 History and society \n", "46 Entertainment \n", "47 History and society \n", "48 Entertainment \n", "49 Entertainment " ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "enwiki_page_sept_summary" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### September Top 50 Topics Viewed\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The table below shows the page views by top 50 topics in September 2019 on English Wikipedia. Main page is excluded in this table." 
] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "# Pageviews and share per predicted topic, Main_Page excluded, largest topic first.\n", "# Columns are selected with a list: tuple-style selection after groupby is deprecated\n", "# and rejected by current pandas releases.\n", "enwiki_topic_sept_summary = (enwiki_pv_topic_sept[enwiki_pv_topic_sept.page_title != 'Main_Page']\n", "    .groupby('predicted', as_index = False)[['pageviews', 'proportion']]\n", "    .sum()\n", "    .sort_values(by='pageviews', ascending=False))" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Top 10 topics account for 62.49% of total page views in September.\n", "Top 50 topics account for 92.03% of total page views in September.\n" ] } ], "source": [ "# The summary is sorted by pageviews, so the leading rows are the largest topics.\n", "print('Top 10 topics account for ' + str(round(enwiki_topic_sept_summary.proportion[:10].sum() * 100,2))+ '% of total page views in September.')\n", "print('Top 50 topics account for ' + str(round(enwiki_topic_sept_summary.proportion[:50].sum() * 100,2))+ '% of total page views in September.')" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
predictedpageviewsproportion
148Entertainment9510155010.132101
424Sports6511863720.090453
339Performing arts5821872400.080869
75Broadcasting5089635010.070698
202History and society4906496820.068154
347Politics and government3909087020.054299
245Language and literature2332162200.032395
446Technology2330079050.032366
83Business and economics2288018180.031782
342Philosophy and religion2287166060.031770
56Biology2200551140.030567
278Medicine1978908570.027488
459Transportation1893569670.026303
282Military and warfare1879454860.026107
503Visual arts1614116190.022421
355Regional geography1154835910.016041
357Regional society1102438310.015313
167Food and drink997784560.013860
215Internet culture995169020.013823
429Structures of note808830920.011235
142Education734489680.010202
478United States591228930.008212
130Disambiguation583673910.008108
343Physics561722690.007803
98Chemistry541657450.007524
422Space432686360.006010
272Mathematics411166500.005711
182Geosciences271721920.003774
279Meteorology226315460.003144
393Science221472220.003076
211India209432880.002909
58Bodies of water186314070.002588
455Time180664220.002510
146Engineering168698930.002343
294Music151930240.002110
244Landforms148194210.002058
277Media139130780.001933
115Crafts and hobbies134509230.001868
492Unknown125004000.001736
267Maps91564310.001272
90Canada77185400.001072
36Australia74134820.001030
33Arts62646610.000870
168France57501010.000799
214Information science54597990.000758
184Germany50009790.000695
223Italy43537860.000605
178Games and toys42688830.000593
371Russia36699420.000510
100China34256390.000476
\n", "
" ], "text/plain": [ " predicted pageviews proportion\n", "148 Entertainment 951015501 0.132101\n", "424 Sports 651186372 0.090453\n", "339 Performing arts 582187240 0.080869\n", "75 Broadcasting 508963501 0.070698\n", "202 History and society 490649682 0.068154\n", "347 Politics and government 390908702 0.054299\n", "245 Language and literature 233216220 0.032395\n", "446 Technology 233007905 0.032366\n", "83 Business and economics 228801818 0.031782\n", "342 Philosophy and religion 228716606 0.031770\n", "56 Biology 220055114 0.030567\n", "278 Medicine 197890857 0.027488\n", "459 Transportation 189356967 0.026303\n", "282 Military and warfare 187945486 0.026107\n", "503 Visual arts 161411619 0.022421\n", "355 Regional geography 115483591 0.016041\n", "357 Regional society 110243831 0.015313\n", "167 Food and drink 99778456 0.013860\n", "215 Internet culture 99516902 0.013823\n", "429 Structures of note 80883092 0.011235\n", "142 Education 73448968 0.010202\n", "478 United States 59122893 0.008212\n", "130 Disambiguation 58367391 0.008108\n", "343 Physics 56172269 0.007803\n", "98 Chemistry 54165745 0.007524\n", "422 Space 43268636 0.006010\n", "272 Mathematics 41116650 0.005711\n", "182 Geosciences 27172192 0.003774\n", "279 Meteorology 22631546 0.003144\n", "393 Science 22147222 0.003076\n", "211 India 20943288 0.002909\n", "58 Bodies of water 18631407 0.002588\n", "455 Time 18066422 0.002510\n", "146 Engineering 16869893 0.002343\n", "294 Music 15193024 0.002110\n", "244 Landforms 14819421 0.002058\n", "277 Media 13913078 0.001933\n", "115 Crafts and hobbies 13450923 0.001868\n", "492 Unknown 12500400 0.001736\n", "267 Maps 9156431 0.001272\n", "90 Canada 7718540 0.001072\n", "36 Australia 7413482 0.001030\n", "33 Arts 6264661 0.000870\n", "168 France 5750101 0.000799\n", "214 Information science 5459799 0.000758\n", "184 Germany 5000979 0.000695\n", "223 Italy 4353786 0.000605\n", "178 Games and toys 4268883 0.000593\n", "371 Russia 3669942 0.000510\n", "100 
China 3425639 0.000476" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "enwiki_topic_sept_summary.head(50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## September & August 2019 Topic Data Comparison" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "# Run the monthly pageview query for en.wikipedia, August 2019.\n", "enwiki_pv_aug_all = hive.run([\n", " \"SET mapreduce.map.memory.mb=4096\", \n", " pageview_query.format(\n", " year = 2019,\n", " month = 8,\n", " wiki = \"en.wikipedia\")\n", "])" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "# Share of the month's total pageviews per page, then order the frame most-viewed first.\n", "enwiki_pv_aug_all['proportion']= enwiki_pv_aug_all['pageviews']/enwiki_pv_aug_all['pageviews'].sum()\n", "enwiki_pv_aug_all = enwiki_pv_aug_all.sort_values(by='pageviews', ascending=False)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total page views in August: 7212202447\n" ] } ], "source": [ "print('Total page views in August: ' + str(enwiki_pv_aug_all.pageviews.sum()))" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of unqiue pages in August: 8813929\n" ] } ], "source": [ "# One row per page_id, so the row count is the number of distinct pages viewed.\n", "print('Number of unique pages in August: ' + str(enwiki_pv_aug_all.shape[0]))" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "# August 2019 pageviews with page titles taken from the 2019-09 page snapshot.\n", "enwiki_pv_aug = hive.run([\n", " \"SET mapreduce.map.memory.mb=4096\", \n", " pageview_title_query.format(\n", " year = 2019,\n", " month = 8,\n", " wiki = \"en.wikipedia\",\n", " snapshot = \"2019-09\")\n", "])" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "# Left join keeps pages that have no row in the topic file; their topic columns are NaN.\n", "enwiki_pv_topic_aug = enwiki_pv_aug.merge(topic, how = 'left', on = 'page_id')" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "# Label pages without a prediction as Unknown, then compute shares over the merged frame.\n", "enwiki_pv_topic_aug['predicted'] = 
enwiki_pv_topic_aug['predicted'].fillna(value='Unknown')\n", "enwiki_pv_topic_aug['proportion']= enwiki_pv_topic_aug['pageviews']/enwiki_pv_topic_aug['pageviews'].sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### August Top 50 Topics Viewed" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "# Pageviews and share per predicted topic, Main_Page excluded, largest topic first.\n", "# Columns are selected with a list: tuple-style selection after groupby is deprecated\n", "# and rejected by current pandas releases.\n", "enwiki_topic_aug_summary = (enwiki_pv_topic_aug[enwiki_pv_topic_aug.page_title != 'Main_Page']\n", "    .groupby('predicted', as_index = False)[['pageviews', 'proportion']]\n", "    .sum()\n", "    .sort_values(by='pageviews', ascending=False))" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Top 10 topics account for 62.48% of total page views in August\n", "Top 50 topics account for 91.99% of total page views in August\n" ] } ], "source": [ "# The summary is sorted by pageviews, so the leading rows are the largest topics.\n", "print('Top 10 topics account for ' + str(round(enwiki_topic_aug_summary.proportion[:10].sum() * 100,2))+ '% of total page views in August')\n", "print('Top 50 topics account for ' + str(round(enwiki_topic_aug_summary.proportion[:50].sum() * 100,2))+ '% of total page views in August')" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
predictedpageviewsproportion
148Entertainment9510155010.132101
424Sports6511863720.090453
339Performing arts5821872400.080869
75Broadcasting5089635010.070698
202History and society4906496820.068154
347Politics and government3909087020.054299
245Language and literature2332162200.032395
446Technology2330079050.032366
83Business and economics2288018180.031782
342Philosophy and religion2287166060.031770
56Biology2200551140.030567
278Medicine1978908570.027488
459Transportation1893569670.026303
282Military and warfare1879454860.026107
503Visual arts1614116190.022421
355Regional geography1154835910.016041
357Regional society1102438310.015313
167Food and drink997784560.013860
215Internet culture995169020.013823
429Structures of note808830920.011235
142Education734489680.010202
478United States591228930.008212
130Disambiguation583673910.008108
343Physics561722690.007803
98Chemistry541657450.007524
422Space432686360.006010
272Mathematics411166500.005711
182Geosciences271721920.003774
279Meteorology226315460.003144
393Science221472220.003076
211India209432880.002909
58Bodies of water186314070.002588
455Time180664220.002510
146Engineering168698930.002343
294Music151930240.002110
244Landforms148194210.002058
277Media139130780.001933
115Crafts and hobbies134509230.001868
492Unknown125004000.001736
267Maps91564310.001272
90Canada77185400.001072
36Australia74134820.001030
33Arts62646610.000870
168France57501010.000799
214Information science54597990.000758
184Germany50009790.000695
223Italy43537860.000605
178Games and toys42688830.000593
371Russia36699420.000510
100China34256390.000476
\n", "
" ], "text/plain": [ " predicted pageviews proportion\n", "148 Entertainment 951015501 0.132101\n", "424 Sports 651186372 0.090453\n", "339 Performing arts 582187240 0.080869\n", "75 Broadcasting 508963501 0.070698\n", "202 History and society 490649682 0.068154\n", "347 Politics and government 390908702 0.054299\n", "245 Language and literature 233216220 0.032395\n", "446 Technology 233007905 0.032366\n", "83 Business and economics 228801818 0.031782\n", "342 Philosophy and religion 228716606 0.031770\n", "56 Biology 220055114 0.030567\n", "278 Medicine 197890857 0.027488\n", "459 Transportation 189356967 0.026303\n", "282 Military and warfare 187945486 0.026107\n", "503 Visual arts 161411619 0.022421\n", "355 Regional geography 115483591 0.016041\n", "357 Regional society 110243831 0.015313\n", "167 Food and drink 99778456 0.013860\n", "215 Internet culture 99516902 0.013823\n", "429 Structures of note 80883092 0.011235\n", "142 Education 73448968 0.010202\n", "478 United States 59122893 0.008212\n", "130 Disambiguation 58367391 0.008108\n", "343 Physics 56172269 0.007803\n", "98 Chemistry 54165745 0.007524\n", "422 Space 43268636 0.006010\n", "272 Mathematics 41116650 0.005711\n", "182 Geosciences 27172192 0.003774\n", "279 Meteorology 22631546 0.003144\n", "393 Science 22147222 0.003076\n", "211 India 20943288 0.002909\n", "58 Bodies of water 18631407 0.002588\n", "455 Time 18066422 0.002510\n", "146 Engineering 16869893 0.002343\n", "294 Music 15193024 0.002110\n", "244 Landforms 14819421 0.002058\n", "277 Media 13913078 0.001933\n", "115 Crafts and hobbies 13450923 0.001868\n", "492 Unknown 12500400 0.001736\n", "267 Maps 9156431 0.001272\n", "90 Canada 7718540 0.001072\n", "36 Australia 7413482 0.001030\n", "33 Arts 6264661 0.000870\n", "168 France 5750101 0.000799\n", "214 Information science 5459799 0.000758\n", "184 Germany 5000979 0.000695\n", "223 Italy 4353786 0.000605\n", "178 Games and toys 4268883 0.000593\n", "371 Russia 3669942 0.000510\n", "100 
China 3425639 0.000476" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "enwiki_topic_sept_summary.head(50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top Topics Rank Comparison September vs. August" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "enwiki_topic_sept_summary[\"sept_rank\"] = enwiki_topic_sept_summary[\"proportion\"].rank(ascending=0) \n", "enwiki_topic_aug_summary[\"aug_rank\"] = enwiki_topic_aug_summary[\"proportion\"].rank(ascending=0) " ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "topic_rank = enwiki_topic_sept_summary.merge(enwiki_topic_aug_summary, how = 'left', on = 'predicted')\n", "topic_rank = topic_rank.rename(columns={'predicted': 'topic', 'proportion_x': 'proportion_sept','proportion_y': 'proportion_aug','pageviews_x':'pageviews_sept','pageviews_y':'pageviews_aug'})" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
topicproportion_septproportion_augsept_rankaug_rank
0Entertainment0.1321010.1389371.01.0
1Sports0.0904530.0871862.02.0
2Performing arts0.0808690.0811903.03.0
3Broadcasting0.0706980.0718534.04.0
4History and society0.0681540.0696595.05.0
5Politics and government0.0542990.0521396.06.0
6Language and literature0.0323950.0321787.07.0
7Technology0.0323660.0306788.09.0
8Business and economics0.0317820.0307939.08.0
9Philosophy and religion0.0317700.03015410.011.0
10Biology0.0305670.03022111.010.0
11Medicine0.0274880.02584512.014.0
12Transportation0.0263030.02717713.012.0
13Military and warfare0.0261070.02628214.013.0
14Visual arts0.0224210.02370715.015.0
15Regional geography0.0160410.01704616.016.0
16Regional society0.0153130.01636517.017.0
17Food and drink0.0138600.01443718.019.0
18Internet culture0.0138230.01465419.018.0
19Structures of note0.0112350.01120020.020.0
20Education0.0102020.00965521.021.0
21United States0.0082120.00873022.022.0
22Disambiguation0.0081080.00813423.023.0
23Physics0.0078030.00677924.024.0
24Chemistry0.0075240.00620925.025.0
25Space0.0060100.00522726.026.0
26Mathematics0.0057110.00436427.027.0
27Geosciences0.0037740.00347728.028.0
28Meteorology0.0031440.00226329.035.0
29Science0.0030760.00279530.032.0
30India0.0029090.00316531.029.0
31Bodies of water0.0025880.00297132.030.0
32Time0.0025100.00239533.033.0
33Engineering0.0023430.00281034.031.0
34Music0.0021100.00217535.037.0
35Landforms0.0020580.00220436.036.0
36Media0.0019330.00186637.039.0
37Crafts and hobbies0.0018680.00189438.038.0
38Unknown0.0017360.00236439.034.0
39Maps0.0012720.00117340.040.0
40Canada0.0010720.00117241.041.0
41Australia0.0010300.00102242.042.0
42Arts0.0008700.00085643.044.0
43France0.0007990.00086244.043.0
44Information science0.0007580.00065545.046.0
45Germany0.0006950.00074046.045.0
46Italy0.0006050.00062747.047.0
47Games and toys0.0005930.00059448.048.0
48Russia0.0005100.00051649.049.0
49China0.0004760.00051550.050.0
\n", "
" ], "text/plain": [ " topic proportion_sept proportion_aug sept_rank \\\n", "0 Entertainment 0.132101 0.138937 1.0 \n", "1 Sports 0.090453 0.087186 2.0 \n", "2 Performing arts 0.080869 0.081190 3.0 \n", "3 Broadcasting 0.070698 0.071853 4.0 \n", "4 History and society 0.068154 0.069659 5.0 \n", "5 Politics and government 0.054299 0.052139 6.0 \n", "6 Language and literature 0.032395 0.032178 7.0 \n", "7 Technology 0.032366 0.030678 8.0 \n", "8 Business and economics 0.031782 0.030793 9.0 \n", "9 Philosophy and religion 0.031770 0.030154 10.0 \n", "10 Biology 0.030567 0.030221 11.0 \n", "11 Medicine 0.027488 0.025845 12.0 \n", "12 Transportation 0.026303 0.027177 13.0 \n", "13 Military and warfare 0.026107 0.026282 14.0 \n", "14 Visual arts 0.022421 0.023707 15.0 \n", "15 Regional geography 0.016041 0.017046 16.0 \n", "16 Regional society 0.015313 0.016365 17.0 \n", "17 Food and drink 0.013860 0.014437 18.0 \n", "18 Internet culture 0.013823 0.014654 19.0 \n", "19 Structures of note 0.011235 0.011200 20.0 \n", "20 Education 0.010202 0.009655 21.0 \n", "21 United States 0.008212 0.008730 22.0 \n", "22 Disambiguation 0.008108 0.008134 23.0 \n", "23 Physics 0.007803 0.006779 24.0 \n", "24 Chemistry 0.007524 0.006209 25.0 \n", "25 Space 0.006010 0.005227 26.0 \n", "26 Mathematics 0.005711 0.004364 27.0 \n", "27 Geosciences 0.003774 0.003477 28.0 \n", "28 Meteorology 0.003144 0.002263 29.0 \n", "29 Science 0.003076 0.002795 30.0 \n", "30 India 0.002909 0.003165 31.0 \n", "31 Bodies of water 0.002588 0.002971 32.0 \n", "32 Time 0.002510 0.002395 33.0 \n", "33 Engineering 0.002343 0.002810 34.0 \n", "34 Music 0.002110 0.002175 35.0 \n", "35 Landforms 0.002058 0.002204 36.0 \n", "36 Media 0.001933 0.001866 37.0 \n", "37 Crafts and hobbies 0.001868 0.001894 38.0 \n", "38 Unknown 0.001736 0.002364 39.0 \n", "39 Maps 0.001272 0.001173 40.0 \n", "40 Canada 0.001072 0.001172 41.0 \n", "41 Australia 0.001030 0.001022 42.0 \n", "42 Arts 0.000870 0.000856 43.0 \n", "43 France 
0.000799 0.000862 44.0 \n", "44 Information science 0.000758 0.000655 45.0 \n", "45 Germany 0.000695 0.000740 46.0 \n", "46 Italy 0.000605 0.000627 47.0 \n", "47 Games and toys 0.000593 0.000594 48.0 \n", "48 Russia 0.000510 0.000516 49.0 \n", "49 China 0.000476 0.000515 50.0 \n", "\n", " aug_rank \n", "0 1.0 \n", "1 2.0 \n", "2 3.0 \n", "3 4.0 \n", "4 5.0 \n", "5 6.0 \n", "6 7.0 \n", "7 9.0 \n", "8 8.0 \n", "9 11.0 \n", "10 10.0 \n", "11 14.0 \n", "12 12.0 \n", "13 13.0 \n", "14 15.0 \n", "15 16.0 \n", "16 17.0 \n", "17 19.0 \n", "18 18.0 \n", "19 20.0 \n", "20 21.0 \n", "21 22.0 \n", "22 23.0 \n", "23 24.0 \n", "24 25.0 \n", "25 26.0 \n", "26 27.0 \n", "27 28.0 \n", "28 35.0 \n", "29 32.0 \n", "30 29.0 \n", "31 30.0 \n", "32 33.0 \n", "33 31.0 \n", "34 37.0 \n", "35 36.0 \n", "36 39.0 \n", "37 38.0 \n", "38 34.0 \n", "39 40.0 \n", "40 41.0 \n", "41 42.0 \n", "42 44.0 \n", "43 43.0 \n", "44 46.0 \n", "45 45.0 \n", "46 47.0 \n", "47 48.0 \n", "48 49.0 \n", "49 50.0 " ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "topic_rank[['topic','proportion_sept','proportion_aug','sept_rank','aug_rank']].head(50)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The changes in proportion and rank between September and August for top 50 topics are not very noticeable. " ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "topic_rank['rank_diff_abs'] = abs(topic_rank['sept_rank'] - topic_rank['aug_rank'])\n" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
topicproportion_septproportion_augsept_rankaug_rankrank_diff_abs
59The_Bahamas0.0002390.00002260.0196.0136.0
99Hong_Kong0.0000740.000156100.073.027.0
78predicted0.0001330.00010479.089.010.0
73Netherlands0.0001470.00012774.082.08.0
76Syria0.0001390.00012077.085.08.0
28Meteorology0.0031440.00226329.035.06.0
69Israel0.0001640.00014670.076.06.0
84Denmark0.0001050.00013885.079.06.0
94Saudi Arabia0.0000880.00007595.0101.06.0
86Argentina0.0001040.00010087.092.05.0
\n", "
" ], "text/plain": [ " topic proportion_sept proportion_aug sept_rank aug_rank \\\n", "59 The_Bahamas 0.000239 0.000022 60.0 196.0 \n", "99 Hong_Kong 0.000074 0.000156 100.0 73.0 \n", "78 predicted 0.000133 0.000104 79.0 89.0 \n", "73 Netherlands 0.000147 0.000127 74.0 82.0 \n", "76 Syria 0.000139 0.000120 77.0 85.0 \n", "28 Meteorology 0.003144 0.002263 29.0 35.0 \n", "69 Israel 0.000164 0.000146 70.0 76.0 \n", "84 Denmark 0.000105 0.000138 85.0 79.0 \n", "94 Saudi Arabia 0.000088 0.000075 95.0 101.0 \n", "86 Argentina 0.000104 0.000100 87.0 92.0 \n", "\n", " rank_diff_abs \n", "59 136.0 \n", "99 27.0 \n", "78 10.0 \n", "73 8.0 \n", "76 8.0 \n", "28 6.0 \n", "69 6.0 \n", "84 6.0 \n", "94 6.0 \n", "86 5.0 " ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "topic_rank[['topic','proportion_sept','proportion_aug','sept_rank','aug_rank','rank_diff_abs']].head(100).sort_values(by='rank_diff_abs', ascending=False).head(10)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looking at the top 10 changes in rank from August to September, the topics related to \"Country/Region\" change the most between the two months." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Top Topics Pageviews Comparison September vs. August" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compare changes in pageviews for the top 10 topics between September and August 2019." 
] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [], "source": [ "## Load the RPython library so we can use R for graphs\n", "\n", "%load_ext rpy2.ipython" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [], "source": [ "%%R\n", "library(ggplot2)\n", "library (tidyverse)\n", "library(data.table)" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "image/png": "\n" }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%R -i topic_rank\n", "\n", "data.table(topic_rank)[1:10] %>%\n", " melt(id.vars = c(\"topic\"), measure.vars = c(\"pageviews_sept\", \"pageviews_aug\"),variable.name = \"month\", value.name = 'count') %>%\n", " ggplot(aes(fill=month, y=count, x=reorder(topic,count))) + \n", " geom_bar(position=\"dodge\", stat=\"identity\",width = 0.6) + coord_flip() +\n", " scale_y_continuous(\"Pageviews per Topic\",\n", " labels = polloi::compress) +\n", " theme(axis.title.y=element_blank(),\n", " axis.text=element_text(size=11),\n", " legend.position = c(0.8, 0.15), legend.title = element_blank(),legend.text =element_text( hjust = 0,size = 10))+\n", " labs(color = \"type\",\n", " title = \"Top 10 Viewed Topics Pageviews (Aug vs. Sept)\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compare topics with top 10 changes in pageview percentage between September and August 2019." 
] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "topic_rank['pv_diff_pct'] = abs(topic_rank['pageviews_aug'] / topic_rank['pageviews_sept']-1)\n" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [], "source": [ "pv_diff = topic_rank[['topic','pageviews_sept','pageviews_aug','pv_diff_pct']].head(50).sort_values(by='pv_diff_pct', ascending=False).head(10)\n" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "data": { "image/png": "\n" }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%R -i pv_diff\n", "\n", "data.table(pv_diff) %>%\n", " filter(topic != 'Unknown') %>%\n", " melt(id.vars = c(\"topic\"), measure.vars = c(\"pageviews_sept\", \"pageviews_aug\"),variable.name = \"month\", value.name = 'count') %>%\n", " ggplot(aes(fill=month, y=count, x=reorder(topic,count))) + \n", " geom_bar(position=\"dodge\", stat=\"identity\",width = 0.6) + coord_flip() +\n", " scale_y_continuous(\"Pageviews per Topic\",\n", " labels = polloi::compress) +\n", " theme(axis.title.y=element_blank(),\n", " axis.text=element_text(size=11),\n", " legend.position = c(0.8, 0.15), legend.title = element_blank(),legend.text =element_text( hjust = 0,size = 10))+\n", " labs(color = \"type\",\n", " title = \"Top 10 Pageviews %Diff Topics (Aug vs. Sept)\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }