{
"cells": [
{
"cell_type": "raw",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Multiple courses metric report"
]
},
{
"cell_type": "markdown",
"metadata": {
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Course structure"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%matplotlib inline\n",
"from __future__ import division\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from collections import OrderedDict\n",
"\n",
"import plotly\n",
"import plotly.offline as py\n",
"import plotly.graph_objs as go\n",
"import plotly.tools as tls\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"py.init_notebook_mode() # graphs charts inline (IPython)."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false
},
"outputs": [],
"source": [
"# only change course_list (for all courses on edX), mooc_list (Moocs) and pe_list (professional education) if needed\n",
"# list for all the courses\n",
"course_list = [\n",
" 'UBCx__Marketing1x__3T2015',\n",
" 'UBCx__Climate1x__2T2016', \n",
" 'UBCx__SPD1x__2T2016',\n",
" 'UBCx__SPD2x__2T2016',\n",
" 'UBCx__SPD3x__2T2016',\n",
" 'UBCx__UseGen_1x__1T2016',\n",
" 'UBCx__UseGen_2x__1T2016',\n",
" 'UBCx__China300_1x__1T2016',\n",
" 'UBCx__China300_2x__1T2016',\n",
" 'UBCx__Forest222x__1T2015',\n",
" 'UBCx__IndEdu200x__3T2015',\n",
" 'UBCx__Water201x_2__2T2015',\n",
" 'UBCx__CW1_1x__1T2016',\n",
" 'UBCx__CW1_2x__1T2016',\n",
" 'UBCx__Phot1x__1T2016',\n",
" 'UBCx__ITSx__2T2015'\n",
"]\n",
"# list for the mooc course\n",
"mooc_list = [\n",
" 'UBCx__Marketing1x__3T2015',\n",
" 'UBCx__Climate1x__2T2016', \n",
" 'UBCx__SPD1x__2T2016',\n",
" 'UBCx__SPD2x__2T2016',\n",
" 'UBCx__SPD3x__2T2016',\n",
" 'UBCx__UseGen_1x__1T2016',\n",
" 'UBCx__UseGen_2x__1T2016',\n",
" 'UBCx__China300_1x__1T2016',\n",
" 'UBCx__China300_2x__1T2016',\n",
" 'UBCx__Forest222x__1T2015',\n",
" 'UBCx__IndEdu200x__3T2015',\n",
" 'UBCx__Water201x_2__2T2015',\n",
"]\n",
"# list for professional education courses\n",
"pe_list = [\n",
" 'UBCx__CW1_1x__1T2016',\n",
" 'UBCx__CW1_2x__1T2016',\n",
" 'UBCx__Phot1x__1T2016',\n",
" 'UBCx__ITSx__2T2015'\n",
"]\n",
"# make sure courses are ordered in the visualization\n",
"indices = [course.replace('__', '/').replace('_', '.') for course in course_list]\n",
"indices[indices.index('UBCx/Water201x.2/2T2015')] = 'UBCx/Water201x_2/2T2015'\n",
"\n",
"mooc_indices = [course.replace('__', '/').replace('_', '.') for course in mooc_list]\n",
"mooc_indices[mooc_indices.index('UBCx/Water201x.2/2T2015')] = 'UBCx/Water201x_2/2T2015'\n",
"\n",
"pe_indices = [course.replace('__', '/').replace('_', '.') for course in pe_list]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false
},
"outputs": [],
"source": [
"def query_moduleActivity(course_id):\n",
" \"\"\"\n",
" Give the course_id, query # students with any activity, # students attempted any problem \n",
" and # students watched any video for each chapter of the course\n",
" \"\"\"\n",
" # query # students attempted any problem for each chapter of the course, exclude those with less than 20 attempts\n",
" query = \"\"\"\n",
" Select sub.course_id As course_id, sub.index As index, sub.module_id As module_id,\n",
" sub.chapter_name As chapter_name, Count(Distinct sub.user_id) As tried_problem\n",
" From\n",
" (SELECT p.course_id As course_id, p.user_id As user_id, c2.index As index, \n",
" c2.module_id As module_id, c2.name As chapter_name\n",
" FROM [{0}.problem_analysis] p\n",
" Left Join [{0}.course_axis] c1\n",
" on p.problem_url_name = c1.url_name\n",
" Left Join [{0}.course_axis] c2\n",
" On c1.chapter_mid = c2.module_id) sub\n",
" Group By course_id, index, module_id, chapter_name\n",
" Order By index\"\"\".format(course_id)\n",
" tried_problem = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" tried_problem = tried_problem[tried_problem.tried_problem > 20]\n",
"\n",
" # query # students watched any video for each chapter of the course, exclude those with less than 20 views\n",
" query = \"\"\"\n",
" Select course_id, index, module_id, chapter_name, exact_count_distinct(username) As watched_video\n",
" From\n",
" (SELECT c1.course_id As course_id, v.username As username, c2.index As index, \n",
" c2.module_id As module_id, c2.name As chapter_name\n",
" FROM [{0}.video_stats_day] v\n",
" Left Join [{0}.course_axis] c1\n",
" on v.video_id = c1.url_name\n",
" Left Join [{0}.course_axis] c2\n",
" On c1.chapter_mid = c2.module_id) sub\n",
" Group By course_id, index, module_id, chapter_name\n",
" Order By index\"\"\".format(course_id)\n",
" watched_video = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" watched_video = watched_video[watched_video.watched_video > 20]\n",
"\n",
" # query # students with any activity for each chapter of the course, excluding those with less than 20 active students\n",
" query = \"\"\"\n",
" Select sub.course_id As course_id, sub.module_id As module_id, \n",
" c.name As chapter_name, c.index As index, sub.nactive As nactive\n",
" From [{0}.course_axis] c\n",
" Join \n",
" (Select course_id As course_id, Regexp_replace(module_id,'i4x://', '') As module_id, \n",
" Count(Distinct student_id) As nactive\n",
" From [{0}.studentmodule]\n",
" Where module_type = 'chapter' \n",
" Group By course_id, module_id) sub\n",
" On sub.module_id = c.module_id\n",
" Order By index\"\"\".format(course_id)\n",
" nactive = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" nactive = nactive[nactive.nactive > 20]\n",
" \n",
" # merge watched_video, tried_problem, nactive \n",
" module_activity = watched_video.merge(tried_problem, how='outer').merge(nactive, how='outer').fillna(0)\n",
" return module_activity[module_activity.chapter_name != 0].sort_values('index').set_index('chapter_name')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"modules = {}\n",
"for course_id in course_list:\n",
" modules[course_id] = query_moduleActivity(course_id)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true,
"hideCode": false,
"hidePrompt": false
},
"outputs": [],
"source": [
"def query_cs(course_id): \n",
" \"\"\"\n",
" Given course_id (e.g. 'UBCx__Climate1x__1T2016'), \n",
" return a list of all the course items (graded_problem, self_test, video, assignment, chapter) \n",
" from course_axis table ordered by index.\n",
" IMPORTANT: Need to update course_axis in SPD1x first (delete items that belong to SPD2x and SPD3x)\n",
" \"\"\"\n",
" query = \"\"\"\n",
" SELECT c1.course_id As course_id,\n",
" Case \n",
" When c1.category='problem' And c1.graded='true' Then 'graded_problem'\n",
" When c1.category='problem' And c1.graded!='true' Then 'self_test' \n",
" Else c1.category\n",
" End As category, c1.index As index, c1.name As name,\n",
" c1.url_name As url_name, c2.name As chapter\n",
" FROM [[{0}.course_axis] c1\n",
" Left Join [{0}.course_axis] c2\n",
" On c1.chapter_mid = c2.module_id\n",
" Where c1.category in ('video', 'problem', 'openassessment', 'chapter')\n",
" Order By c1.index\"\"\".format(course_id)\n",
" indices = modules[course_id].index\n",
" structure = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" structure = structure[(structure.name.isin(indices)) | (structure.chapter.isin(indices))]\n",
" \n",
" query = \"\"\"\n",
" Select problem_url_name, Count(Distinct item.answer_id) As num\n",
" From [{0}.problem_analysis]\n",
" Group By problem_url_name\"\"\".format(course_id)\n",
" nQuestions = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" structure = structure.merge(nQuestions, left_on='url_name', right_on='problem_url_name', how='left')\\\n",
" .drop('problem_url_name', axis=1)\n",
" structure.num = structure.num.fillna(1)\n",
" return structure"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"dfs = []\n",
"for course_id in course_list:\n",
" df = query_cs(course_id)\n",
" dfs.append(df)\n",
" \n",
"course_structures = pd.concat(dfs)\n",
"css_agg = course_structures.groupby(['course_id', 'category']).num.sum().unstack('category')\\\n",
".reindex(indices)[['chapter', 'video', 'graded_problem', 'self_test', 'openassessment']].fillna(0)\n",
"\n",
"# UseGen => different implementation for graded problems, need to query from course_item table\n",
"# for course in course_list:\n",
"# if 'UseGen' in course:\n",
"# value = pd.io.gbq.read_gbq(\"SELECT Count(*) FROM [%s.course_item]\" % course, \n",
"# project_id='ubcxdata', verbose=False)\n",
"# css_agg.ix[course.replace('__', '/').replace('_', '.'), 'graded_problem'] = value.values[0][0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# given the course_structures table, ploting the bar graph\n",
"data = []\n",
"# assign colors to different types of items\n",
"colors = {'video': 'rgb(202,178,214)', 'graded_problem': 'rgb(66,146,198)', \n",
" 'self_test': 'rgb(166,206,227)', 'openassessment': 'rgb(116,196,118)', 'chapter': 'rgb(0, 0, 0)'}\n",
"for i in range(0, css_agg.shape[1]):\n",
" data.append(go.Bar(x=css_agg.ix[:, i], y=css_agg.index, \n",
" marker=dict(color=colors[css_agg.columns[i]]), \n",
" orientation='h', name=css_agg.columns[i]))\n",
"\n",
"layout = go.Layout(\n",
" xaxis=dict(showgrid=False),\n",
" yaxis=dict(autorange='reversed'),\n",
" # adjusting height by # of courses in the course_list\n",
" height=25+30*len(css_agg),\n",
" width=600,\n",
" margin=go.Margin(l=180, b=25, t=0),\n",
" legend=dict(x=100, y=0),\n",
" barmode='stack')\n",
"\n",
"fig = go.Figure(data=data, layout=layout)\n",
"py.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# def rolling_count(df):\n",
"# df['block'] = (df['category'] != df['category'].shift(1)).astype(int).cumsum()\n",
"# df['count'] = df.groupby('block').num.cumsum()\n",
"# return df\n",
"\n",
"\n",
"# # iterate over courses in the course_list\n",
"# for course_id in indices:\n",
"# # query course structure (list of items in course_axis)\n",
"# # count # of times an item (graded_problem, self_test, video) appears consecutively\n",
"# df = course_structures[course_structures.course_id == course_id].copy()\n",
"# df.fillna(method='bfill', inplace=True)\n",
"# df = df.groupby('chapter').apply(rolling_count)\n",
"# idx = df.groupby(['chapter', 'block'])['count'].transform(max) == df['count']\n",
"# df = df.ix[idx]\n",
" \n",
"# # plotting\n",
"# data = [go.Bar(x=df['count'], y=[course_id]*len(df), \n",
"# orientation='h', hoverinfo='y',\n",
"# marker=dict(color=df.category.apply(lambda x: colors[x]).values))]\n",
"# layout = go.Layout(\n",
"# xaxis=dict(tickfont=dict(size=8), showgrid=False),\n",
"# barmode='stack', \n",
"# width=750,\n",
"# height=45,\n",
"# margin=go.Margin(l=180, b=10, t=0)\n",
"# )\n",
"# fig = go.Figure(data=data, layout=layout)\n",
"# py.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def course_item(course_id=course_id):\n",
" \"\"\"\n",
" Given course_id, query students' event for video, graded_problem, \n",
" self_test, openassessment and chapter from the studentmodule table\n",
" => the numbers are slightly different from thosed queried from person_item and video_stats_day\n",
" \"\"\"\n",
" query = \"\"\"\n",
" SELECT sub.module_id As item_id, c.index As index, name, category, nstudents\n",
" FROM [ubcxdata:{0}.course_axis] c\n",
" Join \n",
" (Select Regexp_replace(module_id,'i4x://', '') As module_id, exact_count_distinct(student_id) As nstudents\n",
" From [ubcxdata:{0}.studentmodule]\n",
" Where module_type In ('openassessment', 'chapter')\n",
" Group By module_id) sub\n",
" On sub.module_id = c.module_id\n",
" Order By index\"\"\".format(course_id)\n",
" chapter_assign = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
"\n",
" query = \"\"\"\n",
" Select problem_url_name as item_id, index, name, \n",
" Case When graded='true' Then 'graded_problem' Else 'self_test' End As category,\n",
" exact_count_distinct(user_id) As nstudents\n",
" From [{0}.problem_analysis] p\n",
" Join [{0}.course_axis] c\n",
" On p.problem_url_name= c.url_name\n",
" Group By item_id, index, name, category\n",
" Order By index\"\"\".format(course_id)\n",
" nproblems = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
"\n",
" query = \"\"\"\n",
" Select video_id as item_id, index_video as index, name, 'video' As category, videos_viewed As nstudents\n",
" From [{0}.video_stats]\n",
" Where videos_viewed > 20\"\"\".format(course_id)\n",
" nvideos = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False).dropna()\n",
"\n",
" courseItem = pd.concat([chapter_assign, nproblems, nvideos]).sort_values('index')\n",
" courseItem = courseItem[courseItem.nstudents > 20].reset_index(drop=True)\n",
" return courseItem"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"source": [
"### Course structure vs. # students accessed"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"scrolled": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for i in range(len(course_list)):\n",
" courseItem = course_item(course_list[i])\n",
" # make it center in the middle\n",
" trace1 = go.Bar(x=courseItem.index+1, y=courseItem.nstudents, hoverinfo='text',\n",
" text=['{0}:
nstudents: {1}'.format(name.encode('utf-8'), value) \n",
" for name, value in zip(courseItem.name, courseItem.nstudents)],\n",
" marker=dict(color=courseItem.category.apply(lambda x: colors[x]).values))\n",
" trace2 = go.Bar(x=courseItem.index+1, y=-courseItem.nstudents, hoverinfo='none',\n",
" marker=dict(color=courseItem.category.apply(lambda x: colors[x]).values))\n",
" data = [trace1, trace2]\n",
" layout = go.Layout(barmode='relative', xaxis=dict(showticklabels=False), \n",
" yaxis=dict(showticklabels=False, showgrid=False, title=indices[i], zeroline=False), \n",
" height=200, width=850, margin=go.Margin(t=0, b=0), showlegend=False)\n",
" fig = go.Figure(data=data, layout=layout)\n",
" py.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true,
"hideCode": false,
"hidePrompt": false
},
"outputs": [],
"source": [
"def query_convs(mooc_list):\n",
" \"\"\"\n",
" Query nregistered, nviewed, nexplored, npassing, nverified for all the courses \n",
" in the mooc_list(e.g. ['UBCx__Climate101x__3T2015', 'UBCx__Climate1x__1T2016', 'UBCx__Marketing1x__3T2015'])\n",
" \"\"\"\n",
" pc_tables = ',\\n'.join(['[%s.person_course]' % x for x in course_list])\n",
" query = \\\n",
" \"\"\"SELECT course_id, Count(*) As Registered, \n",
" Sum(Case When sum_dt > 0 Then 1 Else 0 End) As Sampled,\n",
" Sum(Case When sum_dt > 900 Then 1 Else 0 End) As Involved, \n",
" Sum(Case When grade >= 0.5 Then 1 Else 0 End) As Passed, \n",
" Sum(Case When mode='verified' Then 1 Else 0 End) As Verified\n",
" FROM %s\n",
" Group By course_id\"\"\" % pc_tables\n",
" convs = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" convs.set_index('course_id', inplace=True)\n",
" \n",
" return convs"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true,
"hideCode": false,
"hidePrompt": false
},
"outputs": [],
"source": [
"def query_convs_pe(course_list):\n",
" \"\"\"\n",
" Query nregistered, nviewed, nexplored, npassing for all the courses \n",
" in the pe_list(e.g. ['UBCx__Phot1x__1T2016', 'UBCx__ITSx__2T2015'])\n",
" \"\"\"\n",
" pc_tables = ',\\n'.join(['[%s.person_course]' % x for x in course_list])\n",
" query = \\\n",
" \"\"\"SELECT course_id, Count(*) As Registered, \n",
" Sum(Case When sum_dt > 0 Then 1 Else 0 End) As Sampled,\n",
" Sum(Case When sum_dt > 900 Then 1 Else 0 End) As Involved, \n",
" Sum(Case When grade >= 0.5 Then 1 Else 0 End) As Passed\n",
" FROM %s\n",
" Group By course_id\"\"\" % pc_tables\n",
" convs = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" convs.set_index('course_id', inplace=True)\n",
" \n",
" return convs"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false
},
"outputs": [],
"source": [
"# mooc_list\n",
"convs = query_convs(mooc_list).reindex(mooc_indices)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false
},
"outputs": [],
"source": [
"# % involved, passed, verified out of sampled students\n",
"convs_pct = convs.drop('Registered', axis=1).divide(convs.drop('Registered', axis=1).max(axis=1), axis=0)#.round(3)\n",
"# hover text\n",
"convs_txt = convs_pct.copy()\n",
"# normalize along the column for plotting in heatmap (different scales across columns)\n",
"convs_df = convs_txt.divide(convs_txt.max(axis=0), axis=1).ix[:, 1:]\n",
"# convert to %\n",
"convs_pct = convs_pct.applymap(lambda x: \"{0:.2f}\".format(x * 100))\n",
"convs_txt = convs_txt.applymap(lambda x: \"{0:.2f}\".format(x * 100)).ix[:, 1:]"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Total learners\n",
"- ** Registered **: learners registered in the course\n",
"- ** Sampled **: learners who accessed the course at least once\n",
"- ** Involved **: learners with sum_dt > 15 min\n",
"- ** Passed **: learners whose grade is at least 50%\n",
"- ** Verified **: those purchased the verified certificate\n",
"\n",
"** sum_dt **: Total elapsed time spent by learner on this course, based on time difference between consecutive events, with a 5 min max cutoff, based on event data"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"names = convs.columns\n",
"colors = ['rgb(55, 126, 184)', 'rgb(255, 127, 0)', 'rgb(77, 175, 74)', 'rgb(228, 26, 28)', 'rgb(152, 78, 163)']\n",
"fig = tls.make_subplots(rows=1, cols=3, print_grid=False,\n",
" subplot_titles=('# students', '% of sampled', 'Compare % of sampled'))\n",
"# plotting # students\n",
"for i in range(0, convs.shape[1]):\n",
" fig.append_trace(go.Bar(x=convs.ix[:, i], y=convs.index, orientation='h', \n",
" marker=dict(color=colors[i]), name=names[i]), 1, 1)\n",
"# plotting # of sampled\n",
"for i in range(0, convs_pct.shape[1]):\n",
" fig.append_trace(go.Bar(x=convs_pct.ix[:, i], y=convs_pct.index, orientation='h',\n",
" marker=dict(color=colors[i+1]), name=names[i+1], showlegend=False), 1, 2)\n",
"# plotting heatmap \n",
"fig.append_trace(go.Heatmap(z=convs_df.values, x=convs_df.columns, y=convs_df.index, \n",
" text=convs_txt.values, hoverinfo='y+text', \n",
" colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(43,130,189)']], showscale=False), 1, 3)\n",
"\n",
"fig['layout']['xaxis1'].update(showgrid=False)\n",
"fig['layout']['yaxis1'].update(autorange='reversed')\n",
"fig['layout']['xaxis2'].update(showgrid=False)\n",
"fig['layout']['yaxis2'].update(showticklabels=False, autorange='reversed')\n",
"fig['layout']['yaxis3'].update(showticklabels=False, autorange='reversed')\n",
"fig['layout']['xaxis3'].update(autorange='reversed')\n",
"# fig['layout']['legend'].update(x=0.13, y=0.15)\n",
"\n",
"fig['layout'].update(barmode='overlay', height=100+30*len(convs), width=900, \n",
" title='MOOC', margin=go.Margin(l=180, t=60, b=40))\n",
"py.iplot(fig) "
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false
},
"outputs": [],
"source": [
"# pe_list\n",
"convs_pe = query_convs_pe(pe_list).reindex(pe_indices)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false
},
"outputs": [],
"source": [
"convs_pct = convs_pe.drop('Registered', axis=1).divide(convs_pe.drop('Registered', axis=1).max(axis=1), axis=0)#.round(3)\n",
"convs_txt = convs_pct.copy()\n",
"convs_df = convs_txt.divide(convs_txt.max(axis=0), axis=1).ix[:, 1:]\n",
"convs_pct = convs_pct.applymap(lambda x: \"{0:.2f}\".format(x * 100))\n",
"convs_txt = convs_txt.applymap(lambda x: \"{0:.2f}\".format(x * 100)).ix[:, 1:]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"names = convs.columns\n",
"colors = ['rgb(55, 126, 184)', 'rgb(255, 127, 0)', 'rgb(77, 175, 74)', 'rgb(228, 26, 28)', 'rgb(152, 78, 163)']\n",
"fig = tls.make_subplots(rows=1, cols=3, print_grid=False,\n",
" subplot_titles=('# students', '% of sampled', 'Compare % of sampled'))\n",
"for i in range(0, convs_pe.shape[1]):\n",
" fig.append_trace(go.Bar(x=convs_pe.ix[:, i], y=convs_pe.index, orientation='h', \n",
" marker=dict(color=colors[i]), name=names[i]), 1, 1)\n",
"\n",
"for i in range(0, convs_pct.shape[1]):\n",
" fig.append_trace(go.Bar(x=convs_pct.ix[:, i], y=convs_pct.index, orientation='h',\n",
" marker=dict(color=colors[i+1]), name=names[i+1], showlegend=False), 1, 2)\n",
" \n",
"fig.append_trace(go.Heatmap(z=convs_df.values, x=convs_df.columns, y=convs_df.index, \n",
" text=convs_txt.values, hoverinfo='y+text', \n",
" colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(43,130,189)']], showscale=False), 1, 3)\n",
"\n",
"fig['layout']['xaxis1'].update(showgrid=False)\n",
"fig['layout']['yaxis1'].update(autorange='reversed')\n",
"fig['layout']['xaxis2'].update(showgrid=False)\n",
"fig['layout']['yaxis2'].update(showticklabels=False, autorange='reversed')\n",
"fig['layout']['yaxis3'].update(showticklabels=False, autorange='reversed')\n",
"fig['layout']['xaxis3'].update(autorange='reversed')\n",
"# fig['layout']['legend'].update(x=0.13, y=0.15)\n",
"\n",
"fig['layout'].update(barmode='overlay', height=100+30*len(convs_pe), width=900, \n",
" title='PE', margin=go.Margin(l=180, t=60, b=40))\n",
"py.iplot(fig) "
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false
},
"outputs": [],
"source": [
"# dictionaries to store total # of videos/problems for each course in the course_list\n",
"total_videos = {}\n",
"total_problems = {}\n",
"def query_pcs_learned(course_list = course_list):\n",
" \"\"\"\n",
" Iterate over each course in the course_list, query and calculate ndays_act, sum_dt, nforum_posts, \n",
" nvideos_watched, nproblems_attempted, pct_video_watched, pct_problem_attempted for involved students.\n",
" Update total_videos and total_problems\n",
" \"\"\"\n",
" dfs = []\n",
" for i in range(len(course_list)):\n",
" query = \"\"\"\n",
" Select pc.user_id As user_id, pc.course_id As course_id, pc.mode As mode, pc.grade As grade, \n",
" pc.ndays_act As ndays_act, pc.sum_dt / 3600 As sum_dt, pc.nforum_posts As nforum_posts,\n",
" v.videos_watched As nvideos_watched, p.problems_attempted As nproblems_attempted\n",
" From [{0}.person_course] pc\n",
" Left Join\n",
" (SELECT username, Count(Distinct video_id) As videos_watched \n",
" FROM [{0}.video_stats_day]\n",
" Group By username) v\n",
" on pc.username = v.username\n",
" Left Join \n",
" (Select user_id, Count(Distinct item.answer_id) As problems_attempted\n",
" From [{0}.problem_analysis]\n",
" Group By user_id) p\n",
" On pc.user_id = p.user_id\n",
" Where pc.sum_dt > 900\"\"\".format(course_list[i])\n",
" df = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False).fillna(0)\n",
"\n",
" course_id = indices[i]\n",
" total_videos[course_id] = min(df.nvideos_watched.max(), css_agg.ix[course_id, 'video'])\n",
" df['pct_video_watched'] = df.nvideos_watched / total_videos[course_id]\n",
" total_problems[course_id] = min(df.nproblems_attempted.max(), \n",
" css_agg.ix[course_id, 'graded_problem'] + \n",
" css_agg.ix[course_id, 'self_test'])\n",
" df['pct_problem_attempted'] = df.nproblems_attempted / total_problems[course_id]\n",
" dfs.append(df)\n",
" pcs_learned = pd.concat(dfs)\n",
" return pcs_learned"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false
},
"outputs": [],
"source": [
"cols = ['ndays_act', 'sum_dt', 'pct_video_watched', 'pct_problem_attempted', 'nforum_posts']\n",
"# for the involved\n",
"pcs_learned = query_pcs_learned()\n",
"# aggregate nforum_posts => % students posted, others by median\n",
"pcs_learned_agg = pcs_learned.groupby('course_id').agg({'ndays_act': np.median, 'sum_dt': np.median, \n",
" 'pct_video_watched': np.median, 'pct_problem_attempted': np.median, \n",
" 'nforum_posts': lambda s: (s > 0).sum() / len(s)})\n",
"# order the courses as needed\n",
"pcs_learned_agg = pcs_learned_agg[cols].reindex(indices)\n",
"# for the passed\n",
"pcs_passed = pcs_learned[pcs_learned.grade >= 0.5]\n",
"pcs_passed_agg = pcs_passed.groupby('course_id').agg({'ndays_act': np.median, 'sum_dt': np.median, \n",
" 'pct_video_watched': np.median, 'pct_problem_attempted': np.median, \n",
" 'nforum_posts': lambda s: (s > 0).sum() / len(s)})\n",
"pcs_passed_agg = pcs_passed_agg[cols].reindex(indices)\n",
"\n",
"# normalize along the columns for plotting heatmap\n",
"pcs_passed_norm = pcs_passed_agg.divide(pcs_passed_agg.max(axis=0), axis=1)\n",
"pcs_learned_norm = pcs_learned_agg.divide(pcs_learned_agg.max(axis=0), axis=1)\n",
"# formatting\n",
"pcs_passed_agg.sum_dt = pcs_passed_agg.sum_dt.round(2)\n",
"pcs_passed_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']] = \\\n",
"pcs_passed_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']].applymap(lambda x: \"{0:.2f}\".format(x * 100))\n",
"pcs_learned_agg.sum_dt = pcs_learned_agg.sum_dt.round(2)\n",
"pcs_learned_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']] = \\\n",
"pcs_learned_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']].applymap(lambda x: \"{0:.2f}\".format(x * 100))"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Engagement\n",
"\n",
"### Students' engagement in the course: "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# normalized value for plotting heatmap, unnormalized for hoverinfo\n",
"names = ['median days active', 'median sum_dt (H)', \n",
" 'median % videos', 'median % problems', '% students posted']\n",
"trace1 = go.Heatmap(\n",
" z=pcs_learned_norm.values,\n",
" x=names,\n",
" y=pcs_learned_norm.index,\n",
" text = pcs_learned_agg.values,\n",
" hoverinfo='x+text',\n",
" showscale=False,\n",
" colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(51,160,44)']])\n",
"\n",
"\n",
"trace2 = go.Heatmap(\n",
" z=pcs_passed_norm.values,\n",
" x=names,\n",
" y=pcs_passed_norm.index,\n",
" text = pcs_passed_agg.values,\n",
" hoverinfo='x+text',\n",
" showscale=False,\n",
" colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(43,130,189)']])\n",
"\n",
"fig = tls.make_subplots(rows=1, cols=2, print_grid=False,\n",
" subplot_titles=('Engagement for the involved', \n",
" 'Engagement for the passed'))\n",
"fig.append_trace(trace1, 1, 1)\n",
"fig.append_trace(trace2, 1, 2)\n",
"fig['layout']['yaxis1'].update(autorange='reversed')\n",
"fig['layout']['yaxis2'].update(showticklabels=False, autorange='reversed')\n",
"\n",
"fig['layout'].update(\n",
" width=750, height=140+30*len(pcs_learned_agg),\n",
" margin=go.Margin(l=180, b=120, t=20)\n",
")\n",
"\n",
"py.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dfs = []\n",
"for course_id in course_list:\n",
" query = \"\"\"\n",
" Select pcd.course_id As course_id, pcd.username as username, \n",
" pcd.date As date, pc.grade As grade, pc.sum_dt As sum_dt\n",
" From [{0}.person_course_day] pcd\n",
" Join [{0}.person_course] pc\n",
" On pcd.username = pc.username\n",
" Where pcd.sum_dt > 0\"\"\".format(course_id)\n",
" \n",
" dfs.append(pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False))"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dates = pd.concat(dfs).reset_index(drop=True)\n",
"dates.date = pd.to_datetime(dates.date, format=\"%Y/%m/%d\")\n",
"dates = dates[dates.grade >= 0.5]"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def cal_days(df):\n",
" df['days'] = [int(i.days) for i in (df.date - df.date.min())]\n",
" return df\n",
"dates_days = dates.groupby(['course_id', 'username']).apply(cal_days)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"dates_days['days_bin'] = pd.cut(dates_days.days, bins=[1, 7, 30, 60, 90, 180, 365], \n",
" labels=['1W', '1M', '2M', '3M', '6M', '>6M'], include_lowest=True, right=False)\n",
"days_agg = dates_days.groupby(['course_id', 'days_bin']).username.nunique().unstack('days_bin').fillna(0)\\\n",
".divide(dates_days.groupby('course_id').username.nunique(), axis=0).reindex(indices).applymap(lambda x: \"{0:.2f}\".format(x * 100))"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"source": [
"### Returning pattern for students that passed the course"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"data = [\n",
" go.Heatmap(\n",
" z=days_agg.values,\n",
" colorscale=[[0, 'rgb(240,249,232)'], [0.25, 'rgb(186,228,188)'], [0.5, 'rgb(123,204,196)'], \n",
" [0.75, 'rgb(67,162,202)'], [1, 'rgb(8,104,172)']],\n",
" x=days_agg.columns,\n",
" y=days_agg.index\n",
" )\n",
"]\n",
"layout = go.Layout(width=600, height=60+30*len(days_agg), margin=go.Margin(l=180, b=40, t=20),\n",
" yaxis=dict(autorange='reversed'), xaxis=dict(title='Time elapsed'))\n",
"fig = go.Figure(data=data, layout=layout)\n",
"py.iplot(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Students' activity by chapter | Density map illustrating pattern of video and problem activity for the involved "
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"scrolled": false,
"slideshow": {
"slide_type": "fragment"
},
"widefigure": true
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# iterate over courses in the course_list\n",
"for i in range(len(indices)):\n",
" showlegend = True if i==0 else False\n",
" fig = tls.make_subplots(rows=1, cols=2, print_grid=False)\n",
" # query # students with any activity, # students attempted any problem and # students watched any video\n",
" module_activity = modules[course_list[i]]\n",
" # plotting\n",
" fig.append_trace(go.Scatter(x=module_activity.index, y=module_activity.tried_problem, \n",
" name='tried a problem', fill='tozeroy', mode='lines',\n",
" line = dict(color = ('rgb(255, 127, 0)')), showlegend=showlegend), 1, 1)\n",
" fig.append_trace(go.Scatter(x=module_activity.index, y=module_activity.watched_video, \n",
" name='watched a video', fill='tonexty', mode='lines',\n",
" line = dict(color = ('rgb(77, 175, 74)')), showlegend=showlegend), 1, 1)\n",
" fig.append_trace(go.Scatter(x=module_activity.index, y=module_activity.nactive, \n",
" name='with any activity', fill='tonexty', mode='lines', \n",
" line = dict(color = ('rgb(55, 126, 184)')), showlegend=showlegend), 1, 1)\n",
" # select corresponding person_course \n",
" pc_activity = pcs_learned[pcs_learned.course_id==indices[i]].copy()\n",
" # convert to %\n",
" pc_activity[['pct_video_watched', 'pct_problem_attempted']] = \\\n",
" pc_activity[['pct_video_watched', 'pct_problem_attempted']].applymap(lambda x: \"{0:.2f}\".format(x * 100))\n",
" # plotting density map\n",
" fig.append_trace(\n",
" go.Histogram2d(x=pc_activity.pct_video_watched, y=pc_activity.pct_problem_attempted,\n",
" histnorm='probability',\n",
" autobinx=False,\n",
" xbins=dict(start=0, end=100, size=10),\n",
" autobiny=False,\n",
" ybins=dict(start=0, end=100, size=10),\n",
" colorscale=[[0, 'rgb(12,51,131)'], [1/1000, 'rgb(12,51,131)'], [1/100, 'rgb(242,211,56)'], \n",
" [1/10, 'rgb(242,143,56)'], [1, 'rgb(217,30,30)']],\n",
" zmin=0, zmax=0.601,\n",
" zsmooth='fast',\n",
" colorbar=dict(thickness=20)), 1, 2)\n",
"\n",
" fig['layout']['xaxis1'].update(title='chapter', showticklabels=False, showgrid=False)\n",
" fig['layout']['yaxis1'].update(title='# students', showgrid=False)\n",
" fig['layout']['xaxis2'].update(title='% videos (total:{0})'.format(int(total_videos[indices[i]])))\n",
" fig['layout']['yaxis2'].update(title='% problems (total:{0})'.format(int(total_problems[indices[i]])))\n",
" fig['layout']['legend'].update(x=0.2, y=1)\n",
" fig['layout'].update(height=250, width=750, margin=go.Margin(l=50, t=40, b=35), title=indices[i])\n",
" py.iplot(fig)"
]
}
],
"metadata": {
"celltoolbar": "Slideshow",
"hide_input": false,
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
},
"toc": {
"toc_cell": false,
"toc_number_sections": false,
"toc_section_display": "block",
"toc_threshold": 6,
"toc_window_display": true
},
"toc_position": {
"height": "170px",
"left": "1398.66px",
"right": "20px",
"top": "120px",
"width": "290px"
}
},
"nbformat": 4,
"nbformat_minor": 0
}