{ "cells": [ { "cell_type": "raw", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Multiple courses metric report" ] }, { "cell_type": "markdown", "metadata": { "hideCode": false, "hidePrompt": false, "slideshow": { "slide_type": "slide" } }, "source": [ "## Course structure" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "from __future__ import division\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from collections import OrderedDict\n", "\n", "import plotly\n", "import plotly.offline as py\n", "import plotly.graph_objs as go\n", "import plotly.tools as tls\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "py.init_notebook_mode() # graphs charts inline (IPython)." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "# only change course_list (for all courses on edX), mooc_list (Moocs) and pe_list (professional education) if needed\n", "# list for all the courses\n", "course_list = [\n", " 'UBCx__Marketing1x__3T2015',\n", " 'UBCx__Climate1x__2T2016', \n", " 'UBCx__SPD1x__2T2016',\n", " 'UBCx__SPD2x__2T2016',\n", " 'UBCx__SPD3x__2T2016',\n", " 'UBCx__UseGen_1x__1T2016',\n", " 'UBCx__UseGen_2x__1T2016',\n", " 'UBCx__China300_1x__1T2016',\n", " 'UBCx__China300_2x__1T2016',\n", " 'UBCx__Forest222x__1T2015',\n", " 'UBCx__IndEdu200x__3T2015',\n", " 'UBCx__Water201x_2__2T2015',\n", " 'UBCx__CW1_1x__1T2016',\n", " 'UBCx__CW1_2x__1T2016',\n", " 'UBCx__Phot1x__1T2016',\n", " 'UBCx__ITSx__2T2015'\n", "]\n", "# list for the mooc course\n", "mooc_list = [\n", " 'UBCx__Marketing1x__3T2015',\n", " 'UBCx__Climate1x__2T2016', \n", " 'UBCx__SPD1x__2T2016',\n", " 'UBCx__SPD2x__2T2016',\n", " 'UBCx__SPD3x__2T2016',\n", " 'UBCx__UseGen_1x__1T2016',\n", " 'UBCx__UseGen_2x__1T2016',\n", " 'UBCx__China300_1x__1T2016',\n", " 'UBCx__China300_2x__1T2016',\n", " 'UBCx__Forest222x__1T2015',\n", " 'UBCx__IndEdu200x__3T2015',\n", " 'UBCx__Water201x_2__2T2015',\n", "]\n", "# list for professional education courses\n", "pe_list = [\n", " 'UBCx__CW1_1x__1T2016',\n", " 'UBCx__CW1_2x__1T2016',\n", " 'UBCx__Phot1x__1T2016',\n", " 'UBCx__ITSx__2T2015'\n", "]\n", "# make sure courses are ordered in the visualization\n", "indices = [course.replace('__', '/').replace('_', '.') for course in course_list]\n", "indices[indices.index('UBCx/Water201x.2/2T2015')] = 'UBCx/Water201x_2/2T2015'\n", "\n", "mooc_indices = [course.replace('__', '/').replace('_', '.') for course in mooc_list]\n", "mooc_indices[mooc_indices.index('UBCx/Water201x.2/2T2015')] = 'UBCx/Water201x_2/2T2015'\n", "\n", "pe_indices = [course.replace('__', '/').replace('_', '.') for course in pe_list]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "def query_moduleActivity(course_id):\n", " \"\"\"\n", " Give the course_id, query # students with any activity, # students attempted any problem \n", " and # students watched any video for each chapter of the course\n", " \"\"\"\n", " # query # students attempted any problem for each chapter of the course, exclude those with less than 20 attempts\n", " query = \"\"\"\n", " Select sub.course_id As course_id, sub.index As index, sub.module_id As module_id,\n", " sub.chapter_name As chapter_name, Count(Distinct sub.user_id) As tried_problem\n", " From\n", " (SELECT p.course_id As course_id, p.user_id As user_id, c2.index As index, \n", " c2.module_id As module_id, c2.name As chapter_name\n", " FROM [{0}.problem_analysis] p\n", " Left Join [{0}.course_axis] c1\n", " on p.problem_url_name = c1.url_name\n", " Left Join [{0}.course_axis] c2\n", " On c1.chapter_mid = c2.module_id) sub\n", " Group By course_id, index, module_id, chapter_name\n", " Order By index\"\"\".format(course_id)\n", " tried_problem = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", " tried_problem = tried_problem[tried_problem.tried_problem > 20]\n", "\n", " # query # students watched any video for each chapter of the course, exclude those with less than 20 views\n", " query = \"\"\"\n", " Select course_id, index, module_id, chapter_name, exact_count_distinct(username) As watched_video\n", " From\n", " (SELECT c1.course_id As course_id, v.username As username, c2.index As index, \n", " c2.module_id As module_id, c2.name As chapter_name\n", " FROM [{0}.video_stats_day] v\n", " Left Join [{0}.course_axis] c1\n", " on v.video_id = c1.url_name\n", " Left Join [{0}.course_axis] c2\n", " On c1.chapter_mid = c2.module_id) sub\n", " Group By course_id, index, module_id, chapter_name\n", " Order By index\"\"\".format(course_id)\n", " watched_video = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", " watched_video = watched_video[watched_video.watched_video > 20]\n", "\n", " # query # students with any activity for each chapter of the course, excluding those with less than 20 active students\n", " query = \"\"\"\n", " Select sub.course_id As course_id, sub.module_id As module_id, \n", " c.name As chapter_name, c.index As index, sub.nactive As nactive\n", " From [{0}.course_axis] c\n", " Join \n", " (Select course_id As course_id, Regexp_replace(module_id,'i4x://', '') As module_id, \n", " Count(Distinct student_id) As nactive\n", " From [{0}.studentmodule]\n", " Where module_type = 'chapter' \n", " Group By course_id, module_id) sub\n", " On sub.module_id = c.module_id\n", " Order By index\"\"\".format(course_id)\n", " nactive = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", " nactive = nactive[nactive.nactive > 20]\n", " \n", " # merge watched_video, tried_problem, nactive \n", " module_activity = watched_video.merge(tried_problem, how='outer').merge(nactive, how='outer').fillna(0)\n", " return module_activity[module_activity.chapter_name != 0].sort_values('index').set_index('chapter_name')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "modules = {}\n", "for course_id in course_list:\n", " modules[course_id] = query_moduleActivity(course_id)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "def query_cs(course_id): \n", " \"\"\"\n", " Given course_id (e.g. 'UBCx__Climate1x__1T2016'), \n", " return a list of all the course items (graded_problem, self_test, video, assignment, chapter) \n", " from course_axis table ordered by index.\n", " IMPORTANT: Need to update course_axis in SPD1x first (delete items that belong to SPD2x and SPD3x)\n", " \"\"\"\n", " query = \"\"\"\n", " SELECT c1.course_id As course_id,\n", " Case \n", " When c1.category='problem' And c1.graded='true' Then 'graded_problem'\n", " When c1.category='problem' And c1.graded!='true' Then 'self_test' \n", " Else c1.category\n", " End As category, c1.index As index, c1.name As name,\n", " c1.url_name As url_name, c2.name As chapter\n", " FROM [[{0}.course_axis] c1\n", " Left Join [{0}.course_axis] c2\n", " On c1.chapter_mid = c2.module_id\n", " Where c1.category in ('video', 'problem', 'openassessment', 'chapter')\n", " Order By c1.index\"\"\".format(course_id)\n", " indices = modules[course_id].index\n", " structure = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", " structure = structure[(structure.name.isin(indices)) | (structure.chapter.isin(indices))]\n", " \n", " query = \"\"\"\n", " Select problem_url_name, Count(Distinct item.answer_id) As num\n", " From [{0}.problem_analysis]\n", " Group By problem_url_name\"\"\".format(course_id)\n", " nQuestions = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", " structure = structure.merge(nQuestions, left_on='url_name', right_on='problem_url_name', how='left')\\\n", " .drop('problem_url_name', axis=1)\n", " structure.num = structure.num.fillna(1)\n", " return structure" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dfs = []\n", "for course_id in course_list:\n", " df = query_cs(course_id)\n", " dfs.append(df)\n", " \n", "course_structures = pd.concat(dfs)\n", "css_agg = course_structures.groupby(['course_id', 'category']).num.sum().unstack('category')\\\n", ".reindex(indices)[['chapter', 'video', 'graded_problem', 'self_test', 'openassessment']].fillna(0)\n", "\n", "# UseGen => different implementation for graded problems, need to query from course_item table\n", "# for course in course_list:\n", "# if 'UseGen' in course:\n", "# value = pd.io.gbq.read_gbq(\"SELECT Count(*) FROM [%s.course_item]\" % course, \n", "# project_id='ubcxdata', verbose=False)\n", "# css_agg.ix[course.replace('__', '/').replace('_', '.'), 'graded_problem'] = value.values[0][0]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# given the course_structures table, ploting the bar graph\n", "data = []\n", "# assign colors to different types of items\n", "colors = {'video': 'rgb(202,178,214)', 'graded_problem': 'rgb(66,146,198)', \n", " 'self_test': 'rgb(166,206,227)', 'openassessment': 'rgb(116,196,118)', 'chapter': 'rgb(0, 0, 0)'}\n", "for i in range(0, css_agg.shape[1]):\n", " data.append(go.Bar(x=css_agg.ix[:, i], y=css_agg.index, \n", " marker=dict(color=colors[css_agg.columns[i]]), \n", " orientation='h', name=css_agg.columns[i]))\n", "\n", "layout = go.Layout(\n", " xaxis=dict(showgrid=False),\n", " yaxis=dict(autorange='reversed'),\n", " # adjusting height by # of courses in the course_list\n", " height=25+30*len(css_agg),\n", " width=600,\n", " margin=go.Margin(l=180, b=25, t=0),\n", " legend=dict(x=100, y=0),\n", " barmode='stack')\n", "\n", "fig = go.Figure(data=data, layout=layout)\n", "py.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false, "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "# def rolling_count(df):\n", "# df['block'] = (df['category'] != df['category'].shift(1)).astype(int).cumsum()\n", "# df['count'] = df.groupby('block').num.cumsum()\n", "# return df\n", "\n", "\n", "# # iterate over courses in the course_list\n", "# for course_id in indices:\n", "# # query course structure (list of items in course_axis)\n", "# # count # of times an item (graded_problem, self_test, video) appears consecutively\n", "# df = course_structures[course_structures.course_id == course_id].copy()\n", "# df.fillna(method='bfill', inplace=True)\n", "# df = df.groupby('chapter').apply(rolling_count)\n", "# idx = df.groupby(['chapter', 'block'])['count'].transform(max) == df['count']\n", "# df = df.ix[idx]\n", " \n", "# # plotting\n", "# data = [go.Bar(x=df['count'], y=[course_id]*len(df), \n", "# orientation='h', hoverinfo='y',\n", "# marker=dict(color=df.category.apply(lambda x: colors[x]).values))]\n", "# layout = go.Layout(\n", "# xaxis=dict(tickfont=dict(size=8), showgrid=False),\n", "# barmode='stack', \n", "# width=750,\n", "# height=45,\n", "# margin=go.Margin(l=180, b=10, t=0)\n", "# )\n", "# fig = go.Figure(data=data, layout=layout)\n", "# py.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def course_item(course_id=course_id):\n", " \"\"\"\n", " Given course_id, query students' event for video, graded_problem, \n", " self_test, openassessment and chapter from the studentmodule table\n", " => the numbers are slightly different from thosed queried from person_item and video_stats_day\n", " \"\"\"\n", " query = \"\"\"\n", " SELECT sub.module_id As item_id, c.index As index, name, category, nstudents\n", " FROM [ubcxdata:{0}.course_axis] c\n", " Join \n", " (Select Regexp_replace(module_id,'i4x://', '') As module_id, exact_count_distinct(student_id) As nstudents\n", " From [ubcxdata:{0}.studentmodule]\n", " Where module_type In ('openassessment', 'chapter')\n", " Group By module_id) sub\n", " On sub.module_id = c.module_id\n", " Order By index\"\"\".format(course_id)\n", " chapter_assign = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", "\n", " query = \"\"\"\n", " Select problem_url_name as item_id, index, name, \n", " Case When graded='true' Then 'graded_problem' Else 'self_test' End As category,\n", " exact_count_distinct(user_id) As nstudents\n", " From [{0}.problem_analysis] p\n", " Join [{0}.course_axis] c\n", " On p.problem_url_name= c.url_name\n", " Group By item_id, index, name, category\n", " Order By index\"\"\".format(course_id)\n", " nproblems = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", "\n", " query = \"\"\"\n", " Select video_id as item_id, index_video as index, name, 'video' As category, videos_viewed As nstudents\n", " From [{0}.video_stats]\n", " Where videos_viewed > 20\"\"\".format(course_id)\n", " nvideos = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False).dropna()\n", "\n", " courseItem = pd.concat([chapter_assign, nproblems, nvideos]).sort_values('index')\n", " courseItem = courseItem[courseItem.nstudents > 20].reset_index(drop=True)\n", " return courseItem" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "### Course structure vs. # students accessed" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false, "scrolled": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "for i in range(len(course_list)):\n", " courseItem = course_item(course_list[i])\n", " # make it center in the middle\n", " trace1 = go.Bar(x=courseItem.index+1, y=courseItem.nstudents, hoverinfo='text',\n", " text=['{0}:
nstudents: {1}'.format(name.encode('utf-8'), value) \n", " for name, value in zip(courseItem.name, courseItem.nstudents)],\n", " marker=dict(color=courseItem.category.apply(lambda x: colors[x]).values))\n", " trace2 = go.Bar(x=courseItem.index+1, y=-courseItem.nstudents, hoverinfo='none',\n", " marker=dict(color=courseItem.category.apply(lambda x: colors[x]).values))\n", " data = [trace1, trace2]\n", " layout = go.Layout(barmode='relative', xaxis=dict(showticklabels=False), \n", " yaxis=dict(showticklabels=False, showgrid=False, title=indices[i], zeroline=False), \n", " height=200, width=850, margin=go.Margin(t=0, b=0), showlegend=False)\n", " fig = go.Figure(data=data, layout=layout)\n", " py.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "def query_convs(mooc_list):\n", " \"\"\"\n", " Query nregistered, nviewed, nexplored, npassing, nverified for all the courses \n", " in the mooc_list(e.g. ['UBCx__Climate101x__3T2015', 'UBCx__Climate1x__1T2016', 'UBCx__Marketing1x__3T2015'])\n", " \"\"\"\n", " pc_tables = ',\\n'.join(['[%s.person_course]' % x for x in course_list])\n", " query = \\\n", " \"\"\"SELECT course_id, Count(*) As Registered, \n", " Sum(Case When sum_dt > 0 Then 1 Else 0 End) As Sampled,\n", " Sum(Case When sum_dt > 900 Then 1 Else 0 End) As Involved, \n", " Sum(Case When grade >= 0.5 Then 1 Else 0 End) As Passed, \n", " Sum(Case When mode='verified' Then 1 Else 0 End) As Verified\n", " FROM %s\n", " Group By course_id\"\"\" % pc_tables\n", " convs = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", " convs.set_index('course_id', inplace=True)\n", " \n", " return convs" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "def query_convs_pe(course_list):\n", " \"\"\"\n", " Query nregistered, nviewed, nexplored, npassing for all the courses \n", " in the pe_list(e.g. ['UBCx__Phot1x__1T2016', 'UBCx__ITSx__2T2015'])\n", " \"\"\"\n", " pc_tables = ',\\n'.join(['[%s.person_course]' % x for x in course_list])\n", " query = \\\n", " \"\"\"SELECT course_id, Count(*) As Registered, \n", " Sum(Case When sum_dt > 0 Then 1 Else 0 End) As Sampled,\n", " Sum(Case When sum_dt > 900 Then 1 Else 0 End) As Involved, \n", " Sum(Case When grade >= 0.5 Then 1 Else 0 End) As Passed\n", " FROM %s\n", " Group By course_id\"\"\" % pc_tables\n", " convs = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", " convs.set_index('course_id', inplace=True)\n", " \n", " return convs" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "# mooc_list\n", "convs = query_convs(mooc_list).reindex(mooc_indices)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "# % involved, passed, verified out of sampled students\n", "convs_pct = convs.drop('Registered', axis=1).divide(convs.drop('Registered', axis=1).max(axis=1), axis=0)#.round(3)\n", "# hover text\n", "convs_txt = convs_pct.copy()\n", "# normalize along the column for plotting in heatmap (different scales across columns)\n", "convs_df = convs_txt.divide(convs_txt.max(axis=0), axis=1).ix[:, 1:]\n", "# convert to %\n", "convs_pct = convs_pct.applymap(lambda x: \"{0:.2f}\".format(x * 100))\n", "convs_txt = convs_txt.applymap(lambda x: \"{0:.2f}\".format(x * 100)).ix[:, 1:]" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Total learners\n", "- ** Registered **: learners registered in the course\n", "- ** Sampled **: learners who accessed the course at least once\n", "- ** Involved **: learners with sum_dt > 15 min\n", "- ** Passed **: learners whose grade is at least 50%\n", "- ** Verified **: those purchased the verified certificate\n", "\n", "** sum_dt **: Total elapsed time spent by learner on this course, based on time difference between consecutive events, with a 5 min max cutoff, based on event data" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "names = convs.columns\n", "colors = ['rgb(55, 126, 184)', 'rgb(255, 127, 0)', 'rgb(77, 175, 74)', 'rgb(228, 26, 28)', 'rgb(152, 78, 163)']\n", "fig = tls.make_subplots(rows=1, cols=3, print_grid=False,\n", " subplot_titles=('# students', '% of sampled', 'Compare % of sampled'))\n", "# plotting # students\n", "for i in range(0, convs.shape[1]):\n", " fig.append_trace(go.Bar(x=convs.ix[:, i], y=convs.index, orientation='h', \n", " marker=dict(color=colors[i]), name=names[i]), 1, 1)\n", "# plotting # of sampled\n", "for i in range(0, convs_pct.shape[1]):\n", " fig.append_trace(go.Bar(x=convs_pct.ix[:, i], y=convs_pct.index, orientation='h',\n", " marker=dict(color=colors[i+1]), name=names[i+1], showlegend=False), 1, 2)\n", "# plotting heatmap \n", "fig.append_trace(go.Heatmap(z=convs_df.values, x=convs_df.columns, y=convs_df.index, \n", " text=convs_txt.values, hoverinfo='y+text', \n", " colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(43,130,189)']], showscale=False), 1, 3)\n", "\n", "fig['layout']['xaxis1'].update(showgrid=False)\n", "fig['layout']['yaxis1'].update(autorange='reversed')\n", "fig['layout']['xaxis2'].update(showgrid=False)\n", "fig['layout']['yaxis2'].update(showticklabels=False, autorange='reversed')\n", "fig['layout']['yaxis3'].update(showticklabels=False, autorange='reversed')\n", "fig['layout']['xaxis3'].update(autorange='reversed')\n", "# fig['layout']['legend'].update(x=0.13, y=0.15)\n", "\n", "fig['layout'].update(barmode='overlay', height=100+30*len(convs), width=900, \n", " title='MOOC', margin=go.Margin(l=180, t=60, b=40))\n", "py.iplot(fig) " ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "# pe_list\n", "convs_pe = query_convs_pe(pe_list).reindex(pe_indices)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "convs_pct = convs_pe.drop('Registered', axis=1).divide(convs_pe.drop('Registered', axis=1).max(axis=1), axis=0)#.round(3)\n", "convs_txt = convs_pct.copy()\n", "convs_df = convs_txt.divide(convs_txt.max(axis=0), axis=1).ix[:, 1:]\n", "convs_pct = convs_pct.applymap(lambda x: \"{0:.2f}\".format(x * 100))\n", "convs_txt = convs_txt.applymap(lambda x: \"{0:.2f}\".format(x * 100)).ix[:, 1:]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "names = convs.columns\n", "colors = ['rgb(55, 126, 184)', 'rgb(255, 127, 0)', 'rgb(77, 175, 74)', 'rgb(228, 26, 28)', 'rgb(152, 78, 163)']\n", "fig = tls.make_subplots(rows=1, cols=3, print_grid=False,\n", " subplot_titles=('# students', '% of sampled', 'Compare % of sampled'))\n", "for i in range(0, convs_pe.shape[1]):\n", " fig.append_trace(go.Bar(x=convs_pe.ix[:, i], y=convs_pe.index, orientation='h', \n", " marker=dict(color=colors[i]), name=names[i]), 1, 1)\n", "\n", "for i in range(0, convs_pct.shape[1]):\n", " fig.append_trace(go.Bar(x=convs_pct.ix[:, i], y=convs_pct.index, orientation='h',\n", " marker=dict(color=colors[i+1]), name=names[i+1], showlegend=False), 1, 2)\n", " \n", "fig.append_trace(go.Heatmap(z=convs_df.values, x=convs_df.columns, y=convs_df.index, \n", " text=convs_txt.values, hoverinfo='y+text', \n", " colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(43,130,189)']], showscale=False), 1, 3)\n", "\n", "fig['layout']['xaxis1'].update(showgrid=False)\n", "fig['layout']['yaxis1'].update(autorange='reversed')\n", "fig['layout']['xaxis2'].update(showgrid=False)\n", "fig['layout']['yaxis2'].update(showticklabels=False, autorange='reversed')\n", "fig['layout']['yaxis3'].update(showticklabels=False, autorange='reversed')\n", "fig['layout']['xaxis3'].update(autorange='reversed')\n", "# fig['layout']['legend'].update(x=0.13, y=0.15)\n", "\n", "fig['layout'].update(barmode='overlay', height=100+30*len(convs_pe), width=900, \n", " title='PE', margin=go.Margin(l=180, t=60, b=40))\n", "py.iplot(fig) " ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "# dictionaries to store total # of videos/problems for each course in the course_list\n", "total_videos = {}\n", "total_problems = {}\n", "def query_pcs_learned(course_list = course_list):\n", " \"\"\"\n", " Iterate over each course in the course_list, query and calculate ndays_act, sum_dt, nforum_posts, \n", " nvideos_watched, nproblems_attempted, pct_video_watched, pct_problem_attempted for involved students.\n", " Update total_videos and total_problems\n", " \"\"\"\n", " dfs = []\n", " for i in range(len(course_list)):\n", " query = \"\"\"\n", " Select pc.user_id As user_id, pc.course_id As course_id, pc.mode As mode, pc.grade As grade, \n", " pc.ndays_act As ndays_act, pc.sum_dt / 3600 As sum_dt, pc.nforum_posts As nforum_posts,\n", " v.videos_watched As nvideos_watched, p.problems_attempted As nproblems_attempted\n", " From [{0}.person_course] pc\n", " Left Join\n", " (SELECT username, Count(Distinct video_id) As videos_watched \n", " FROM [{0}.video_stats_day]\n", " Group By username) v\n", " on pc.username = v.username\n", " Left Join \n", " (Select user_id, Count(Distinct item.answer_id) As problems_attempted\n", " From [{0}.problem_analysis]\n", " Group By user_id) p\n", " On pc.user_id = p.user_id\n", " Where pc.sum_dt > 900\"\"\".format(course_list[i])\n", " df = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False).fillna(0)\n", "\n", " course_id = indices[i]\n", " total_videos[course_id] = min(df.nvideos_watched.max(), css_agg.ix[course_id, 'video'])\n", " df['pct_video_watched'] = df.nvideos_watched / total_videos[course_id]\n", " total_problems[course_id] = min(df.nproblems_attempted.max(), \n", " css_agg.ix[course_id, 'graded_problem'] + \n", " css_agg.ix[course_id, 'self_test'])\n", " df['pct_problem_attempted'] = df.nproblems_attempted / total_problems[course_id]\n", " dfs.append(df)\n", " pcs_learned = pd.concat(dfs)\n", " return pcs_learned" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "cols = ['ndays_act', 'sum_dt', 'pct_video_watched', 'pct_problem_attempted', 'nforum_posts']\n", "# for the involved\n", "pcs_learned = query_pcs_learned()\n", "# aggregate nforum_posts => % students posted, others by median\n", "pcs_learned_agg = pcs_learned.groupby('course_id').agg({'ndays_act': np.median, 'sum_dt': np.median, \n", " 'pct_video_watched': np.median, 'pct_problem_attempted': np.median, \n", " 'nforum_posts': lambda s: (s > 0).sum() / len(s)})\n", "# order the courses as needed\n", "pcs_learned_agg = pcs_learned_agg[cols].reindex(indices)\n", "# for the passed\n", "pcs_passed = pcs_learned[pcs_learned.grade >= 0.5]\n", "pcs_passed_agg = pcs_passed.groupby('course_id').agg({'ndays_act': np.median, 'sum_dt': np.median, \n", " 'pct_video_watched': np.median, 'pct_problem_attempted': np.median, \n", " 'nforum_posts': lambda s: (s > 0).sum() / len(s)})\n", "pcs_passed_agg = pcs_passed_agg[cols].reindex(indices)\n", "\n", "# normalize along the columns for plotting heatmap\n", "pcs_passed_norm = pcs_passed_agg.divide(pcs_passed_agg.max(axis=0), axis=1)\n", "pcs_learned_norm = pcs_learned_agg.divide(pcs_learned_agg.max(axis=0), axis=1)\n", "# formatting\n", "pcs_passed_agg.sum_dt = pcs_passed_agg.sum_dt.round(2)\n", "pcs_passed_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']] = \\\n", "pcs_passed_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']].applymap(lambda x: \"{0:.2f}\".format(x * 100))\n", "pcs_learned_agg.sum_dt = pcs_learned_agg.sum_dt.round(2)\n", "pcs_learned_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']] = \\\n", "pcs_learned_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']].applymap(lambda x: \"{0:.2f}\".format(x * 100))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Engagement\n", "\n", "### Students' engagement in the course: " ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# normalized value for plotting heatmap, unnormalized for hoverinfo\n", "names = ['median days active', 'median sum_dt (H)', \n", " 'median % videos', 'median % problems', '% students posted']\n", "trace1 = go.Heatmap(\n", " z=pcs_learned_norm.values,\n", " x=names,\n", " y=pcs_learned_norm.index,\n", " text = pcs_learned_agg.values,\n", " hoverinfo='x+text',\n", " showscale=False,\n", " colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(51,160,44)']])\n", "\n", "\n", "trace2 = go.Heatmap(\n", " z=pcs_passed_norm.values,\n", " x=names,\n", " y=pcs_passed_norm.index,\n", " text = pcs_passed_agg.values,\n", " hoverinfo='x+text',\n", " showscale=False,\n", " colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(43,130,189)']])\n", "\n", "fig = tls.make_subplots(rows=1, cols=2, print_grid=False,\n", " subplot_titles=('Engagement for the involved', \n", " 'Engagement for the passed'))\n", "fig.append_trace(trace1, 1, 1)\n", "fig.append_trace(trace2, 1, 2)\n", "fig['layout']['yaxis1'].update(autorange='reversed')\n", "fig['layout']['yaxis2'].update(showticklabels=False, autorange='reversed')\n", "\n", "fig['layout'].update(\n", " width=750, height=140+30*len(pcs_learned_agg),\n", " margin=go.Margin(l=180, b=120, t=20)\n", ")\n", "\n", "py.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dfs = []\n", "for course_id in course_list:\n", " query = \"\"\"\n", " Select pcd.course_id As course_id, pcd.username as username, \n", " pcd.date As date, pc.grade As grade, pc.sum_dt As sum_dt\n", " From [{0}.person_course_day] pcd\n", " Join [{0}.person_course] pc\n", " On pcd.username = pc.username\n", " Where pcd.sum_dt > 0\"\"\".format(course_id)\n", " \n", " dfs.append(pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False))" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dates = pd.concat(dfs).reset_index(drop=True)\n", "dates.date = pd.to_datetime(dates.date, format=\"%Y/%m/%d\")\n", "dates = dates[dates.grade >= 0.5]" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def cal_days(df):\n", " df['days'] = [int(i.days) for i in (df.date - df.date.min())]\n", " return df\n", "dates_days = dates.groupby(['course_id', 'username']).apply(cal_days)" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dates_days['days_bin'] = pd.cut(dates_days.days, bins=[1, 7, 30, 60, 90, 180, 365], \n", " labels=['1W', '1M', '2M', '3M', '6M', '>6M'], include_lowest=True, right=False)\n", "days_agg = dates_days.groupby(['course_id', 'days_bin']).username.nunique().unstack('days_bin').fillna(0)\\\n", ".divide(dates_days.groupby('course_id').username.nunique(), axis=0).reindex(indices).applymap(lambda x: \"{0:.2f}\".format(x * 100))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "### Returning pattern for students that passed the course" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "data = [\n", " go.Heatmap(\n", " z=days_agg.values,\n", " colorscale=[[0, 'rgb(240,249,232)'], [0.25, 'rgb(186,228,188)'], [0.5, 'rgb(123,204,196)'], \n", " [0.75, 'rgb(67,162,202)'], [1, 'rgb(8,104,172)']],\n", " x=days_agg.columns,\n", " y=days_agg.index\n", " )\n", "]\n", "layout = go.Layout(width=600, height=60+30*len(days_agg), margin=go.Margin(l=180, b=40, t=20),\n", " yaxis=dict(autorange='reversed'), xaxis=dict(title='Time elapsed'))\n", "fig = go.Figure(data=data, layout=layout)\n", "py.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Students' activity by chapter | Density map illustrating pattern of video and problem activity for the involved " ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false, "scrolled": false, "slideshow": { "slide_type": "fragment" }, "widefigure": true }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# iterate over courses in the course_list\n", "for i in range(len(indices)):\n", " showlegend = True if i==0 else False\n", " fig = tls.make_subplots(rows=1, cols=2, print_grid=False)\n", " # query # students with any activity, # students attempted any problem and # students watched any video\n", " module_activity = modules[course_list[i]]\n", " # plotting\n", " fig.append_trace(go.Scatter(x=module_activity.index, y=module_activity.tried_problem, \n", " name='tried a problem', fill='tozeroy', mode='lines',\n", " line = dict(color = ('rgb(255, 127, 0)')), showlegend=showlegend), 1, 1)\n", " fig.append_trace(go.Scatter(x=module_activity.index, y=module_activity.watched_video, \n", " name='watched a video', fill='tonexty', mode='lines',\n", " line = dict(color = ('rgb(77, 175, 74)')), showlegend=showlegend), 1, 1)\n", " fig.append_trace(go.Scatter(x=module_activity.index, y=module_activity.nactive, \n", " name='with any activity', fill='tonexty', mode='lines', \n", " line = dict(color = ('rgb(55, 126, 184)')), showlegend=showlegend), 1, 1)\n", " # select corresponding person_course \n", " pc_activity = pcs_learned[pcs_learned.course_id==indices[i]].copy()\n", " # convert to %\n", " pc_activity[['pct_video_watched', 'pct_problem_attempted']] = \\\n", " pc_activity[['pct_video_watched', 'pct_problem_attempted']].applymap(lambda x: \"{0:.2f}\".format(x * 100))\n", " # plotting density map\n", " fig.append_trace(\n", " go.Histogram2d(x=pc_activity.pct_video_watched, y=pc_activity.pct_problem_attempted,\n", " histnorm='probability',\n", " autobinx=False,\n", " xbins=dict(start=0, end=100, size=10),\n", " autobiny=False,\n", " ybins=dict(start=0, end=100, size=10),\n", " colorscale=[[0, 'rgb(12,51,131)'], [1/1000, 'rgb(12,51,131)'], [1/100, 'rgb(242,211,56)'], \n", " [1/10, 'rgb(242,143,56)'], [1, 'rgb(217,30,30)']],\n", " zmin=0, zmax=0.601,\n", " zsmooth='fast',\n", " colorbar=dict(thickness=20)), 1, 2)\n", "\n", " fig['layout']['xaxis1'].update(title='chapter', showticklabels=False, showgrid=False)\n", " fig['layout']['yaxis1'].update(title='# students', showgrid=False)\n", " fig['layout']['xaxis2'].update(title='% videos (total:{0})'.format(int(total_videos[indices[i]])))\n", " fig['layout']['yaxis2'].update(title='% problems (total:{0})'.format(int(total_problems[indices[i]])))\n", " fig['layout']['legend'].update(x=0.2, y=1)\n", " fig['layout'].update(height=250, width=750, margin=go.Margin(l=50, t=40, b=35), title=indices[i])\n", " py.iplot(fig)" ] } ], "metadata": { "celltoolbar": "Slideshow", "hide_input": false, "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" }, "toc": { "toc_cell": false, "toc_number_sections": false, "toc_section_display": "block", "toc_threshold": 6, "toc_window_display": true }, "toc_position": { "height": "170px", "left": "1398.66px", "right": "20px", "top": "120px", "width": "290px" } }, "nbformat": 4, "nbformat_minor": 0 }