{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Section ID Count and Event Data\n", "- limited to WP:M pages\n", "\n", "Section ID data was not limited to top-level (H2) sections during data capture, requiring post-capture processing for section ID click data.\n", "\n", "Example of capture issue: https://en.wikipedia.org/wiki/Hepatitis#Signs_and_symptoms.\n", "Clicks on links under \"Acute hepatitis\" were captured with section_id Acute_hepatitis, not Signs_and_symptoms.\n", "\n", "Post-capture data augmentation: click event section_ids were mapped to parent H2 section headings. See [populate-section-table.ipynb](populate-section-table.ipynb) for extraction details.\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data\n", "%run -i 'data-defaults.py'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Raw section data from captured events\n", "- Raw total count of events (by all event types) for each section ID for WP:M pages only.\n", "- Limited to counts of more than 3000 events.\n", "- shows the extent of the capture issue described above\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# \"raw\" section data from captured events to show extent of the capture issue described above\n", "pm_section_events_raw_query = \"\"\"\n", "SELECT section_id, action, count(*) count\n", "FROM \n", " citationusage \n", "WHERE page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", " AND wiki = 'enwiki'\n", " {}\n", " AND to_date(event_time) >= '{}'\n", " AND to_date(event_time) <= '{}'\n", " AND useragent_is_bot = FALSE\n", "GROUP BY section_id, action\n", "ORDER BY count desc\n", "LIMIT 100\n", "\"\"\"\n", "pm_section_events_raw = spark.sql(\n", " pm_section_events_raw_query.format(\n", " event_exclusion_sql, start_date_string, 
end_date_string\n", " ))\n", "pm_section_events_raw_pandas = pm_section_events_raw.toPandas()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
actionextClickfnClickfnHoverupClick
section_id
-- missing --143490365209523476None
Adverse_effectsNone42526343None
BackgroundNoneNone3496None
CauseNone38217787None
CausesNone1093018686None
ClassificationNone30296620None
Criminal_chargesNone3025NoneNone
DiagnosisNone735314525None
DownfallNoneNone3523None
Early_lifeNoneNone3192None
EpidemiologyNone804916396None
External_links114496NoneNoneNone
FunctionNoneNone3082None
Further_reading12920NoneNoneNone
GeneticsNoneNone6477None
HistoryNone2355248210None
ManagementNoneNone4420None
MechanismNoneNone4187None
Mechanism_of_actionNone469810453None
Medical_usesNone56729901None
MedicationsNoneNone3138None
Notes14437NoneNoneNone
PathophysiologyNone388210161None
Personal_lifeNone60765398None
PharmacodynamicsNoneNone7132None
PharmacokineticsNoneNone3705None
PreventionNone32025437None
PrognosisNone63687950None
References488685NoneNone45536
ResearchNone39226242None
Risk_factorsNoneNone3273None
Side_effectsNone40085390None
Signs_and_symptomsNone1712030088None
Society_and_cultureNoneNone3369None
TransmissionNoneNone3536None
TreatmentNone1205315874None
TypesNoneNone3559None
United_StatesNoneNone4951None
\n", "
" ], "text/plain": [ "action extClick fnClick fnHover upClick\n", "section_id \n", "-- missing -- 143490 365209 523476 None\n", "Adverse_effects None 4252 6343 None\n", "Background None None 3496 None\n", "Cause None 3821 7787 None\n", "Causes None 10930 18686 None\n", "Classification None 3029 6620 None\n", "Criminal_charges None 3025 None None\n", "Diagnosis None 7353 14525 None\n", "Downfall None None 3523 None\n", "Early_life None None 3192 None\n", "Epidemiology None 8049 16396 None\n", "External_links 114496 None None None\n", "Function None None 3082 None\n", "Further_reading 12920 None None None\n", "Genetics None None 6477 None\n", "History None 23552 48210 None\n", "Management None None 4420 None\n", "Mechanism None None 4187 None\n", "Mechanism_of_action None 4698 10453 None\n", "Medical_uses None 5672 9901 None\n", "Medications None None 3138 None\n", "Notes 14437 None None None\n", "Pathophysiology None 3882 10161 None\n", "Personal_life None 6076 5398 None\n", "Pharmacodynamics None None 7132 None\n", "Pharmacokinetics None None 3705 None\n", "Prevention None 3202 5437 None\n", "Prognosis None 6368 7950 None\n", "References 488685 None None 45536\n", "Research None 3922 6242 None\n", "Risk_factors None None 3273 None\n", "Side_effects None 4008 5390 None\n", "Signs_and_symptoms None 17120 30088 None\n", "Society_and_culture None None 3369 None\n", "Transmission None None 3536 None\n", "Treatment None 12053 15874 None\n", "Types None None 3559 None\n", "United_States None None 4951 None" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "section_pda_raw = pm_section_events_raw_pandas.copy()\n", "# replace 'NaN' section_id with 'missing'\n", "section_pda_raw.section_id.fillna(value='-- missing --', inplace=True)\n", "# limit to counts of 1K or more\n", "df_filtered_raw = section_pda_raw.query('count>3000').copy()\n", "# set precision before pivot\n", "df_filtered_raw['count'] = 
df_filtered_raw['count'].map(lambda x: '{0:.0f}'.format(x))\n", "df_filtered_raw.pivot(index='section_id', columns='action', values='count')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Count of WP:M pages by top-level (H2) section ID" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
section_h2count
0References30862
1External_links18460
2See_also13327
3History5666
4Diagnosis4654
5Treatment4263
6Further_reading3165
7Signs_and_symptoms2839
8Causes2149
9Epidemiology2066
10Prognosis1300
11Career1253
12Notes1244
13Research1221
14Cause1209
15Pathophysiology1145
16Medical_uses1016
17Society_and_culture937
18Biography842
19Prevention827
20Management796
21Early_life729
22Symptoms624
23Side_effects607
24Genetics597
25Mechanism593
26Types588
27Personal_life587
28Pharmacology553
29Bibliography543
30Background536
31Early_life_and_education526
32Publications502
33Presentation493
34Classification486
35Education480
36Chemistry470
37Life427
38Mechanism_of_action396
39Adverse_effects388
40Sources387
41Contraindications351
42Works340
43Awards338
44Interactions311
45Structure310
46Uses291
47Selected_publications290
48Function279
49Overview279
\n", "
" ], "text/plain": [ " section_h2 count\n", "0 References 30862\n", "1 External_links 18460\n", "2 See_also 13327\n", "3 History 5666\n", "4 Diagnosis 4654\n", "5 Treatment 4263\n", "6 Further_reading 3165\n", "7 Signs_and_symptoms 2839\n", "8 Causes 2149\n", "9 Epidemiology 2066\n", "10 Prognosis 1300\n", "11 Career 1253\n", "12 Notes 1244\n", "13 Research 1221\n", "14 Cause 1209\n", "15 Pathophysiology 1145\n", "16 Medical_uses 1016\n", "17 Society_and_culture 937\n", "18 Biography 842\n", "19 Prevention 827\n", "20 Management 796\n", "21 Early_life 729\n", "22 Symptoms 624\n", "23 Side_effects 607\n", "24 Genetics 597\n", "25 Mechanism 593\n", "26 Types 588\n", "27 Personal_life 587\n", "28 Pharmacology 553\n", "29 Bibliography 543\n", "30 Background 536\n", "31 Early_life_and_education 526\n", "32 Publications 502\n", "33 Presentation 493\n", "34 Classification 486\n", "35 Education 480\n", "36 Chemistry 470\n", "37 Life 427\n", "38 Mechanism_of_action 396\n", "39 Adverse_effects 388\n", "40 Sources 387\n", "41 Contraindications 351\n", "42 Works 340\n", "43 Awards 338\n", "44 Interactions 311\n", "45 Structure 310\n", "46 Uses 291\n", "47 Selected_publications 290\n", "48 Function 279\n", "49 Overview 279" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count of top-level (H2) section IDs for WP:M pages only\n", "pm_sections_query = \"\"\"\n", "SELECT section_h2, count(distinct page_id) count\n", "FROM\n", " ryanmax.population_wpm_sections \n", "GROUP BY section_h2\n", "ORDER BY count desc, section_h2\n", "\"\"\"\n", "pm_sections = spark.sql(pm_sections_query)\n", "pm_sections.toPandas().head(50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Total count of events (all types) for each top-level (H2) section ID for WP:M pages only\n", "- Limited to sections with more than 1K fnClick events\n", "- **missing** values are largely because section IDs were not recorded \"if the section is the Main 
Section\" as per [Schema:CitationUsage](https://meta.wikimedia.org/wiki/Schema:CitationUsage)." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Total count of events (by all event types) for each top-level (H2) section ID for WP:M pages only\n", "pm_section_events_query = \"\"\"\n", "SELECT population_wpm_sections.section_h2, action, count(*) count, count(*)/{} AS daily_average\n", "FROM \n", " citationusage\n", " LEFT JOIN ryanmax.population_wpm_sections \n", " ON \n", " population_wpm_sections.page_id = citationusage.page_id \n", " AND population_wpm_sections.section_id = citationusage.section_id\n", "WHERE\n", " wiki = 'enwiki'\n", " AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", " {}\n", " AND to_date(citationusage.event_time) >= '{}'\n", " AND to_date(citationusage.event_time) <= '{}'\n", " AND useragent_is_bot = FALSE\n", "GROUP BY population_wpm_sections.section_h2, action\n", "ORDER BY count desc\n", "\"\"\"\n", "pm_section_events = spark.sql(\n", " pm_section_events_query.format(\n", " days_in_study,\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "pm_section_events_pandas = pm_section_events.toPandas()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
actionextClickfnClickfnHoverupClick
section_h2
-- missing --144124.0367446.0527633.06.0
Adverse_effects206.08017.014158.0NaN
Applications18.01187.02429.0NaN
Background53.01853.04216.01.0
Biography60.01700.03864.0NaN
Career124.01801.03876.0NaN
Cause174.08959.021897.0NaN
Causes386.020480.039966.01.0
Characteristics66.01624.03835.0NaN
Chemistry14.02062.04100.0NaN
Classification1602.03771.08233.02.0
Clinical_significance19.01210.02708.0NaN
Criticism3.01263.02204.0NaN
Definition42.01644.03126.0NaN
Definitions21.01299.02501.0NaN
Description173.01234.02597.0NaN
Diagnosis1836.015648.034727.0NaN
Early_life28.01431.03047.0NaN
Effects10.02519.04717.0NaN
Epidemiology47.09952.021259.0NaN
Examples243.01022.01544.0NaN
ExperimentsNaN1096.01573.0NaN
Function16.01538.04510.0NaN
Genetics239.01293.03632.0NaN
Health_effectsNaN2426.05195.0NaN
History536.034264.075457.0NaN
Interactions5.01042.02046.0NaN
Life212.01532.04646.0NaN
Management11.07595.018470.0NaN
Mechanism71.02875.07266.0NaN
Mechanism_of_action79.03827.08596.0NaN
Medical_use7.01859.03987.0NaN
Medical_uses27.014913.029189.0NaN
Notable_cases6.02097.01742.0NaN
Overview80.01726.03172.0NaN
Pathogenesis1.01024.02869.0NaN
Pathophysiology68.05911.015846.0NaN
Personal_life6.06333.05641.0NaN
Pharmacology22.07245.019622.0NaN
Physiology4.01220.02799.0NaN
Prevention90.05387.010765.0NaN
Prognosis74.07022.09220.0NaN
Research121.06413.011408.0NaN
Risk_factors7.03796.07167.0NaN
Safety16.01321.02336.0NaN
Science_and_technologyNaN1135.02123.0NaN
Side_effects21.05930.09299.0NaN
Signs_and_symptoms810.027628.056668.01.0
Society_and_culture345.010760.020100.0NaN
Structure195.01159.03023.0NaN
Symptoms78.01622.02472.0NaN
TheranosNaN5974.010060.0NaN
Treatment443.019942.029533.0NaN
Treatments9.01016.01394.0NaN
Types345.04721.09512.0NaN
Uses34.04604.09748.0NaN
\n", "
" ], "text/plain": [ "action extClick fnClick fnHover upClick\n", "section_h2 \n", "-- missing -- 144124.0 367446.0 527633.0 6.0\n", "Adverse_effects 206.0 8017.0 14158.0 NaN\n", "Applications 18.0 1187.0 2429.0 NaN\n", "Background 53.0 1853.0 4216.0 1.0\n", "Biography 60.0 1700.0 3864.0 NaN\n", "Career 124.0 1801.0 3876.0 NaN\n", "Cause 174.0 8959.0 21897.0 NaN\n", "Causes 386.0 20480.0 39966.0 1.0\n", "Characteristics 66.0 1624.0 3835.0 NaN\n", "Chemistry 14.0 2062.0 4100.0 NaN\n", "Classification 1602.0 3771.0 8233.0 2.0\n", "Clinical_significance 19.0 1210.0 2708.0 NaN\n", "Criticism 3.0 1263.0 2204.0 NaN\n", "Definition 42.0 1644.0 3126.0 NaN\n", "Definitions 21.0 1299.0 2501.0 NaN\n", "Description 173.0 1234.0 2597.0 NaN\n", "Diagnosis 1836.0 15648.0 34727.0 NaN\n", "Early_life 28.0 1431.0 3047.0 NaN\n", "Effects 10.0 2519.0 4717.0 NaN\n", "Epidemiology 47.0 9952.0 21259.0 NaN\n", "Examples 243.0 1022.0 1544.0 NaN\n", "Experiments NaN 1096.0 1573.0 NaN\n", "Function 16.0 1538.0 4510.0 NaN\n", "Genetics 239.0 1293.0 3632.0 NaN\n", "Health_effects NaN 2426.0 5195.0 NaN\n", "History 536.0 34264.0 75457.0 NaN\n", "Interactions 5.0 1042.0 2046.0 NaN\n", "Life 212.0 1532.0 4646.0 NaN\n", "Management 11.0 7595.0 18470.0 NaN\n", "Mechanism 71.0 2875.0 7266.0 NaN\n", "Mechanism_of_action 79.0 3827.0 8596.0 NaN\n", "Medical_use 7.0 1859.0 3987.0 NaN\n", "Medical_uses 27.0 14913.0 29189.0 NaN\n", "Notable_cases 6.0 2097.0 1742.0 NaN\n", "Overview 80.0 1726.0 3172.0 NaN\n", "Pathogenesis 1.0 1024.0 2869.0 NaN\n", "Pathophysiology 68.0 5911.0 15846.0 NaN\n", "Personal_life 6.0 6333.0 5641.0 NaN\n", "Pharmacology 22.0 7245.0 19622.0 NaN\n", "Physiology 4.0 1220.0 2799.0 NaN\n", "Prevention 90.0 5387.0 10765.0 NaN\n", "Prognosis 74.0 7022.0 9220.0 NaN\n", "Research 121.0 6413.0 11408.0 NaN\n", "Risk_factors 7.0 3796.0 7167.0 NaN\n", "Safety 16.0 1321.0 2336.0 NaN\n", "Science_and_technology NaN 1135.0 2123.0 NaN\n", "Side_effects 21.0 5930.0 9299.0 NaN\n", 
"Signs_and_symptoms 810.0 27628.0 56668.0 1.0\n", "Society_and_culture 345.0 10760.0 20100.0 NaN\n", "Structure 195.0 1159.0 3023.0 NaN\n", "Symptoms 78.0 1622.0 2472.0 NaN\n", "Theranos NaN 5974.0 10060.0 NaN\n", "Treatment 443.0 19942.0 29533.0 NaN\n", "Treatments 9.0 1016.0 1394.0 NaN\n", "Types 345.0 4721.0 9512.0 NaN\n", "Uses 34.0 4604.0 9748.0 NaN" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "section_pda = pm_section_events_pandas.copy()\n", "# replace 'NaN' section_h2 with 'missing'\n", "section_pda.section_h2.fillna(value='-- missing --', inplace=True)\n", "pivot=section_pda.pivot(index='section_h2', columns='action', values='count').query('fnClick > 1000')\n", "section_h2s=pivot.index\n", "pivot" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Daily averages of above" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
actionextClickfnClickfnHoverupClick
section_h2
-- missing --4503.8750011482.6875016488.531250.18750
Adverse_effects6.43750250.53125442.43750NaN
Applications0.5625037.0937575.90625NaN
Background1.6562557.90625131.750000.03125
Biography1.8750053.12500120.75000NaN
Career3.8750056.28125121.12500NaN
Cause5.43750279.96875684.28125NaN
Causes12.06250640.000001248.937500.03125
Characteristics2.0625050.75000119.84375NaN
Chemistry0.4375064.43750128.12500NaN
Classification50.06250117.84375257.281250.06250
Clinical_significance0.5937537.8125084.62500NaN
Criticism0.0937539.4687568.87500NaN
Definition1.3125051.3750097.68750NaN
Definitions0.6562540.5937578.15625NaN
Description5.4062538.5625081.15625NaN
Diagnosis57.37500489.000001085.21875NaN
Early_life0.8750044.7187595.21875NaN
Effects0.3125078.71875147.40625NaN
Epidemiology1.46875311.00000664.34375NaN
Examples7.5937531.9375048.25000NaN
ExperimentsNaN34.2500049.15625NaN
Function0.5000048.06250140.93750NaN
Genetics7.4687540.40625113.50000NaN
Health_effectsNaN75.81250162.34375NaN
History16.750001070.750002358.03125NaN
Interactions0.1562532.5625063.93750NaN
Life6.6250047.87500145.18750NaN
Management0.34375237.34375577.18750NaN
Mechanism2.2187589.84375227.06250NaN
Mechanism_of_action2.46875119.59375268.62500NaN
Medical_use0.2187558.09375124.59375NaN
Medical_uses0.84375466.03125912.15625NaN
Notable_cases0.1875065.5312554.43750NaN
Overview2.5000053.9375099.12500NaN
Pathogenesis0.0312532.0000089.65625NaN
Pathophysiology2.12500184.71875495.18750NaN
Personal_life0.18750197.90625176.28125NaN
Pharmacology0.68750226.40625613.18750NaN
Physiology0.1250038.1250087.46875NaN
Prevention2.81250168.34375336.40625NaN
Prognosis2.31250219.43750288.12500NaN
Research3.78125200.40625356.50000NaN
Risk_factors0.21875118.62500223.96875NaN
Safety0.5000041.2812573.00000NaN
Science_and_technologyNaN35.4687566.34375NaN
Side_effects0.65625185.31250290.59375NaN
Signs_and_symptoms25.31250863.375001770.875000.03125
Society_and_culture10.78125336.25000628.12500NaN
Structure6.0937536.2187594.46875NaN
Symptoms2.4375050.6875077.25000NaN
TheranosNaN186.68750314.37500NaN
Treatment13.84375623.18750922.90625NaN
Treatments0.2812531.7500043.56250NaN
Types10.78125147.53125297.25000NaN
Uses1.06250143.87500304.62500NaN
\n", "
" ], "text/plain": [ "action extClick fnClick fnHover upClick\n", "section_h2 \n", "-- missing -- 4503.87500 11482.68750 16488.53125 0.18750\n", "Adverse_effects 6.43750 250.53125 442.43750 NaN\n", "Applications 0.56250 37.09375 75.90625 NaN\n", "Background 1.65625 57.90625 131.75000 0.03125\n", "Biography 1.87500 53.12500 120.75000 NaN\n", "Career 3.87500 56.28125 121.12500 NaN\n", "Cause 5.43750 279.96875 684.28125 NaN\n", "Causes 12.06250 640.00000 1248.93750 0.03125\n", "Characteristics 2.06250 50.75000 119.84375 NaN\n", "Chemistry 0.43750 64.43750 128.12500 NaN\n", "Classification 50.06250 117.84375 257.28125 0.06250\n", "Clinical_significance 0.59375 37.81250 84.62500 NaN\n", "Criticism 0.09375 39.46875 68.87500 NaN\n", "Definition 1.31250 51.37500 97.68750 NaN\n", "Definitions 0.65625 40.59375 78.15625 NaN\n", "Description 5.40625 38.56250 81.15625 NaN\n", "Diagnosis 57.37500 489.00000 1085.21875 NaN\n", "Early_life 0.87500 44.71875 95.21875 NaN\n", "Effects 0.31250 78.71875 147.40625 NaN\n", "Epidemiology 1.46875 311.00000 664.34375 NaN\n", "Examples 7.59375 31.93750 48.25000 NaN\n", "Experiments NaN 34.25000 49.15625 NaN\n", "Function 0.50000 48.06250 140.93750 NaN\n", "Genetics 7.46875 40.40625 113.50000 NaN\n", "Health_effects NaN 75.81250 162.34375 NaN\n", "History 16.75000 1070.75000 2358.03125 NaN\n", "Interactions 0.15625 32.56250 63.93750 NaN\n", "Life 6.62500 47.87500 145.18750 NaN\n", "Management 0.34375 237.34375 577.18750 NaN\n", "Mechanism 2.21875 89.84375 227.06250 NaN\n", "Mechanism_of_action 2.46875 119.59375 268.62500 NaN\n", "Medical_use 0.21875 58.09375 124.59375 NaN\n", "Medical_uses 0.84375 466.03125 912.15625 NaN\n", "Notable_cases 0.18750 65.53125 54.43750 NaN\n", "Overview 2.50000 53.93750 99.12500 NaN\n", "Pathogenesis 0.03125 32.00000 89.65625 NaN\n", "Pathophysiology 2.12500 184.71875 495.18750 NaN\n", "Personal_life 0.18750 197.90625 176.28125 NaN\n", "Pharmacology 0.68750 226.40625 613.18750 NaN\n", "Physiology 0.12500 38.12500 
87.46875 NaN\n", "Prevention 2.81250 168.34375 336.40625 NaN\n", "Prognosis 2.31250 219.43750 288.12500 NaN\n", "Research 3.78125 200.40625 356.50000 NaN\n", "Risk_factors 0.21875 118.62500 223.96875 NaN\n", "Safety 0.50000 41.28125 73.00000 NaN\n", "Science_and_technology NaN 35.46875 66.34375 NaN\n", "Side_effects 0.65625 185.31250 290.59375 NaN\n", "Signs_and_symptoms 25.31250 863.37500 1770.87500 0.03125\n", "Society_and_culture 10.78125 336.25000 628.12500 NaN\n", "Structure 6.09375 36.21875 94.46875 NaN\n", "Symptoms 2.43750 50.68750 77.25000 NaN\n", "Theranos NaN 186.68750 314.37500 NaN\n", "Treatment 13.84375 623.18750 922.90625 NaN\n", "Treatments 0.28125 31.75000 43.56250 NaN\n", "Types 10.78125 147.53125 297.25000 NaN\n", "Uses 1.06250 143.87500 304.62500 NaN" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "section_pda.pivot(index='section_h2', columns='action', values='daily_average').loc[section_h2s]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# count of events by event type and access mode for each top-level (H2) section ID for WP:M pages only\n", "pm_section_events_mode_query = \"\"\"\n", "SELECT population_wpm_sections.section_h2, action, mode, count(*) count\n", "FROM \n", " citationusage\n", " LEFT JOIN ryanmax.population_wpm_sections \n", " ON \n", " population_wpm_sections.page_id = citationusage.page_id \n", " AND population_wpm_sections.section_id = citationusage.section_id\n", "WHERE\n", " wiki = 'enwiki'\n", " AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", " {}\n", " AND to_date(citationusage.event_time) >= '{}'\n", " AND to_date(citationusage.event_time) <= '{}'\n", " AND useragent_is_bot = FALSE\n", "GROUP BY population_wpm_sections.section_h2, action, mode\n", "ORDER BY count desc\n", "\"\"\"\n", "pm_section_events_mode = spark.sql(\n", " pm_section_events_mode_query.format(\n", " 
event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "pm_section_events_mode_pandas = pm_section_events_mode.toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Desktop event counts for each top-level (H2) section ID for WP:M pages only\n", "- limits above apply here as well" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
actionextClickfnClickfnHoverupClick
section_h2
-- missing --65483.0123684.0485050.06.0
Adverse_effects49.02487.013283.0NaN
Background25.01024.04057.0NaN
Cause96.03624.020839.0NaN
Causes207.07194.037401.0NaN
Chemistry9.01232.03941.0NaN
Classification377.01697.07789.01.0
Diagnosis722.06724.032950.0NaN
Epidemiology33.05265.020627.0NaN
Function9.01096.04392.0NaN
Health_effectsNaN1019.04856.0NaN
History330.018914.072711.0NaN
Management5.03046.017377.0NaN
Mechanism40.01564.06992.0NaN
Mechanism_of_action54.02371.08291.0NaN
Medical_uses14.05291.027188.0NaN
Overview45.01003.03055.0NaN
Pathophysiology38.03170.015170.0NaN
Personal_life3.01389.05229.0NaN
Pharmacology14.04039.018964.0NaN
Prevention38.02140.010100.0NaN
Prognosis48.02126.08709.0NaN
Research70.02977.010980.0NaN
Risk_factors4.01524.06698.0NaN
Side_effects15.01775.08654.0NaN
Signs_and_symptoms240.08395.052838.01.0
Society_and_culture167.03998.019013.0NaN
TheranosNaN1840.09671.0NaN
Treatment135.05844.027318.0NaN
Types170.02025.08863.0NaN
Uses16.01978.09257.0NaN
\n", "
" ], "text/plain": [ "action extClick fnClick fnHover upClick\n", "section_h2 \n", "-- missing -- 65483.0 123684.0 485050.0 6.0\n", "Adverse_effects 49.0 2487.0 13283.0 NaN\n", "Background 25.0 1024.0 4057.0 NaN\n", "Cause 96.0 3624.0 20839.0 NaN\n", "Causes 207.0 7194.0 37401.0 NaN\n", "Chemistry 9.0 1232.0 3941.0 NaN\n", "Classification 377.0 1697.0 7789.0 1.0\n", "Diagnosis 722.0 6724.0 32950.0 NaN\n", "Epidemiology 33.0 5265.0 20627.0 NaN\n", "Function 9.0 1096.0 4392.0 NaN\n", "Health_effects NaN 1019.0 4856.0 NaN\n", "History 330.0 18914.0 72711.0 NaN\n", "Management 5.0 3046.0 17377.0 NaN\n", "Mechanism 40.0 1564.0 6992.0 NaN\n", "Mechanism_of_action 54.0 2371.0 8291.0 NaN\n", "Medical_uses 14.0 5291.0 27188.0 NaN\n", "Overview 45.0 1003.0 3055.0 NaN\n", "Pathophysiology 38.0 3170.0 15170.0 NaN\n", "Personal_life 3.0 1389.0 5229.0 NaN\n", "Pharmacology 14.0 4039.0 18964.0 NaN\n", "Prevention 38.0 2140.0 10100.0 NaN\n", "Prognosis 48.0 2126.0 8709.0 NaN\n", "Research 70.0 2977.0 10980.0 NaN\n", "Risk_factors 4.0 1524.0 6698.0 NaN\n", "Side_effects 15.0 1775.0 8654.0 NaN\n", "Signs_and_symptoms 240.0 8395.0 52838.0 1.0\n", "Society_and_culture 167.0 3998.0 19013.0 NaN\n", "Theranos NaN 1840.0 9671.0 NaN\n", "Treatment 135.0 5844.0 27318.0 NaN\n", "Types 170.0 2025.0 8863.0 NaN\n", "Uses 16.0 1978.0 9257.0 NaN" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "desktop_pda = pm_section_events_mode_pandas.query('mode == \"desktop\"').copy()\n", "# replace 'NaN' section_h2 with 'missing'\n", "desktop_pda.section_h2.fillna(value='-- missing --', inplace=True)\n", "desktop_pda.pivot(index='section_h2', columns='action', values='count').query('fnClick > 1000')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Mobile event counts for each top-level (H2) section ID for WP:M pages only\n", "- limits above apply here as well" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": 
{ "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
actionextClickfnClickfnHoverupClick
section_h2
-- missing --78641.0243762.042583.0NaN
Adverse_effects157.05530.0875.0NaN
Career38.01044.0132.0NaN
Cause78.05335.01058.0NaN
Causes179.013286.02565.01.0
Characteristics4.01009.0204.0NaN
Classification1225.02074.0444.01.0
Diagnosis1114.08924.01777.0NaN
Early_life12.01090.0193.0NaN
Effects10.01641.0305.0NaN
Epidemiology14.04687.0632.0NaN
Health_effectsNaN1407.0339.0NaN
History206.015350.02746.0NaN
Management6.04549.01093.0NaN
Mechanism31.01311.0274.0NaN
Mechanism_of_action25.01456.0305.0NaN
Medical_use2.01095.0232.0NaN
Medical_uses13.09622.02001.0NaN
Notable_cases3.01633.0126.0NaN
Pathophysiology30.02741.0676.0NaN
Personal_life3.04944.0412.0NaN
Pharmacology8.03206.0658.0NaN
Prevention52.03247.0665.0NaN
Prognosis26.04896.0511.0NaN
Research51.03436.0428.0NaN
Risk_factors3.02272.0469.0NaN
Side_effects6.04155.0645.0NaN
Signs_and_symptoms570.019233.03830.0NaN
Society_and_culture178.06762.01087.0NaN
Symptoms55.01138.0168.0NaN
TheranosNaN4134.0389.0NaN
Treatment308.014098.02215.0NaN
Types175.02696.0649.0NaN
Uses18.02626.0491.0NaN
\n", "
" ], "text/plain": [ "action extClick fnClick fnHover upClick\n", "section_h2 \n", "-- missing -- 78641.0 243762.0 42583.0 NaN\n", "Adverse_effects 157.0 5530.0 875.0 NaN\n", "Career 38.0 1044.0 132.0 NaN\n", "Cause 78.0 5335.0 1058.0 NaN\n", "Causes 179.0 13286.0 2565.0 1.0\n", "Characteristics 4.0 1009.0 204.0 NaN\n", "Classification 1225.0 2074.0 444.0 1.0\n", "Diagnosis 1114.0 8924.0 1777.0 NaN\n", "Early_life 12.0 1090.0 193.0 NaN\n", "Effects 10.0 1641.0 305.0 NaN\n", "Epidemiology 14.0 4687.0 632.0 NaN\n", "Health_effects NaN 1407.0 339.0 NaN\n", "History 206.0 15350.0 2746.0 NaN\n", "Management 6.0 4549.0 1093.0 NaN\n", "Mechanism 31.0 1311.0 274.0 NaN\n", "Mechanism_of_action 25.0 1456.0 305.0 NaN\n", "Medical_use 2.0 1095.0 232.0 NaN\n", "Medical_uses 13.0 9622.0 2001.0 NaN\n", "Notable_cases 3.0 1633.0 126.0 NaN\n", "Pathophysiology 30.0 2741.0 676.0 NaN\n", "Personal_life 3.0 4944.0 412.0 NaN\n", "Pharmacology 8.0 3206.0 658.0 NaN\n", "Prevention 52.0 3247.0 665.0 NaN\n", "Prognosis 26.0 4896.0 511.0 NaN\n", "Research 51.0 3436.0 428.0 NaN\n", "Risk_factors 3.0 2272.0 469.0 NaN\n", "Side_effects 6.0 4155.0 645.0 NaN\n", "Signs_and_symptoms 570.0 19233.0 3830.0 NaN\n", "Society_and_culture 178.0 6762.0 1087.0 NaN\n", "Symptoms 55.0 1138.0 168.0 NaN\n", "Theranos NaN 4134.0 389.0 NaN\n", "Treatment 308.0 14098.0 2215.0 NaN\n", "Types 175.0 2696.0 649.0 NaN\n", "Uses 18.0 2626.0 491.0 NaN" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mobile_pda = pm_section_events_mode_pandas.query('mode == \"mobile\"').copy()\n", "# replace 'NaN' section_h2 with 'missing'\n", "mobile_pda.section_h2.fillna(value='-- missing --', inplace=True)\n", "mobile_pda.pivot(index='section_h2', columns='action', values='count').query('fnClick > 1000')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Total event counts for Adverse_effects section H2s for WP:M pages\n", " - 20 pages with most events" ] }, { "cell_type": 
"code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# event counts for pages with Adverse_effects section H2s\n", "adverse_events_query = \"\"\"\n", "SELECT citationusage.page_id, population_page_titles_20190420.page_title, count(*) AS event_count\n", "FROM \n", " citationusage\n", " LEFT JOIN ryanmax.population_wpm_sections \n", " ON \n", " population_wpm_sections.page_id = citationusage.page_id \n", " AND population_wpm_sections.section_id = citationusage.section_id\n", " LEFT JOIN ryanmax.population_page_titles_20190420\n", " ON\n", " population_page_titles_20190420.page_id = citationusage.page_id \n", "WHERE\n", " wiki = 'enwiki'\n", " AND population_wpm_sections.section_h2 = 'Adverse_effects'\n", " AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)\n", " {}\n", " AND to_date(citationusage.event_time) >= '{}'\n", " AND to_date(citationusage.event_time) <= '{}'\n", " AND useragent_is_bot = FALSE\n", "GROUP BY citationusage.page_id, population_page_titles_20190420.page_title\n", "ORDER BY event_count desc\n", "\"\"\"\n", "adverse_events = spark.sql(\n", " adverse_events_query.format(\n", " event_exclusion_sql, start_date_string, end_date_string\n", " ))\n", "adverse_events_pandas = adverse_events.toPandas()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total events for Adverse_effects: 22381\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
page_idpage_titleevent_count
01481886Cannabis (drug)1198
110024MDMA1099
21245311Finasteride511
317537Lysergic acid diethylamide464
483406Paracetamol433
5141915Fentanyl393
6724730Clonazepam375
78718425Circumcision339
8155627Ibuprofen332
9178197Statin315
10253720Metformin310
11229985Isotretinoin280
1222071Nonsteroidal anti-inflammatory drug263
131525Aspirin262
144781Benzodiazepine245
15185272Atypical antipsychotic237
16201310Citalopram229
17520574Venlafaxine220
1831690663Quinolone antibiotic219
19875202Mitragyna speciosa216
\n", "
" ], "text/plain": [ " page_id page_title event_count\n", "0 1481886 Cannabis (drug) 1198\n", "1 10024 MDMA 1099\n", "2 1245311 Finasteride 511\n", "3 17537 Lysergic acid diethylamide 464\n", "4 83406 Paracetamol 433\n", "5 141915 Fentanyl 393\n", "6 724730 Clonazepam 375\n", "7 8718425 Circumcision 339\n", "8 155627 Ibuprofen 332\n", "9 178197 Statin 315\n", "10 253720 Metformin 310\n", "11 229985 Isotretinoin 280\n", "12 22071 Nonsteroidal anti-inflammatory drug 263\n", "13 1525 Aspirin 262\n", "14 4781 Benzodiazepine 245\n", "15 185272 Atypical antipsychotic 237\n", "16 201310 Citalopram 229\n", "17 520574 Venlafaxine 220\n", "18 31690663 Quinolone antibiotic 219\n", "19 875202 Mitragyna speciosa 216" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print('total events for Adverse_effects: ',adverse_events_pandas['event_count'].sum())\n", "adverse_events_pandas.head(20)" ] } ], "metadata": { "kernelspec": { "display_name": "PySpark - YARN (large)", "language": "python", "name": "spark_yarn_pyspark_large" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }