{
"cells": [
{
"cell_type": "raw",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"hideCode": false,
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Climate1T2016 metric report"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%matplotlib inline\n",
"from __future__ import division\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from collections import OrderedDict\n",
"\n",
"import plotly\n",
"import plotly.offline as py\n",
"import plotly.graph_objs as go\n",
"import plotly.tools as tls\n",
"\n",
"py.init_notebook_mode() # graphs charts inline (IPython)."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# change course_id as needed\n",
"course_id = 'UBCx__Climate101x__3T2015'\n",
"# update courses with graded_problems other than multiple choices\n",
"not_mc_list = [\n",
" 'UBCx__CW1_1x__1T2016',\n",
" 'UBCx__CW1_2x__1T2016',\n",
" 'UBCx__Phot1x__1T2016',\n",
" 'UBCx__ITSx__2T2015',\n",
" 'UBCx__SPD1x__2T2015',\n",
" 'UBCx__SPD1x__2T2016'\n",
"]\n",
"# update professional education courses\n",
"pe_list = [\n",
" 'UBCx__CW1_1x__1T2016',\n",
"# 'UBCx__CW1_2x__1T2016',\n",
" 'UBCx__Phot1x__1T2016',\n",
" 'UBCx__ITSx__2T2015'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"def query_moduleActivity(course_id=course_id):\n",
" \"\"\"\n",
" Give the course_id, query # students with any activity, # students attempted any problem \n",
" and # students watched any video for each chapter of the course\n",
" \"\"\"\n",
" # query # students attempted any problem for each chapter of the course, exclude those with less than 20 attempts\n",
" query = \"\"\"\n",
" Select course_id, sub.index As index, module_id, chapter_name, exact_count_distinct(user_id) As tried_problem\n",
" From\n",
" (SELECT p.course_id As course_id, p.user_id As user_id, c2.index As index, \n",
" c2.module_id As module_id, c2.name As chapter_name\n",
" FROM [{0}.problem_analysis] p\n",
" Left Join [{0}.course_axis] c1\n",
" on p.problem_url_name = c1.url_name\n",
" Left Join [{0}.course_axis] c2\n",
" On c1.chapter_mid = c2.module_id) sub\n",
" Group By course_id, index, module_id, chapter_name\n",
" Order By index\"\"\".format(course_id)\n",
" tried_problem = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" tried_problem = tried_problem[tried_problem.tried_problem > 20]\n",
"\n",
" # query # students watched any video for each chapter of the course, exclude those with less than 20 views\n",
" query = \"\"\"\n",
" Select course_id, index, module_id, chapter_name, exact_count_distinct(username) As watched_video\n",
" From\n",
" (SELECT c1.course_id As course_id, v.username As username, c2.index As index, \n",
" c2.module_id As module_id, c2.name As chapter_name\n",
" FROM [{0}.video_stats_day] v\n",
" Left Join [{0}.course_axis] c1\n",
" on v.video_id = c1.url_name\n",
" Left Join [{0}.course_axis] c2\n",
" On c1.chapter_mid = c2.module_id) sub\n",
" Group By course_id, index, module_id, chapter_name\n",
" Order By index\"\"\".format(course_id)\n",
" watched_video = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" watched_video = watched_video[watched_video.watched_video > 20]\n",
"\n",
" # query # students with any activity for each chapter of the course, excluding those with less than 20 active students\n",
" query = \"\"\"\n",
" Select sub.course_id As course_id, sub.module_id As module_id, \n",
" c.name As chapter_name, c.index As index, sub.nactive As nactive\n",
" From [{0}.course_axis] c\n",
" Join \n",
" (Select course_id As course_id, Regexp_replace(module_id,'i4x://', '') As module_id, \n",
" exact_count_distinct(student_id) As nactive\n",
" From [{0}.studentmodule]\n",
" Where module_type = 'chapter' \n",
" Group By course_id, module_id) sub\n",
" On sub.module_id = c.module_id\n",
" Order By index\"\"\".format(course_id)\n",
" nactive = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" nactive = nactive[nactive.nactive > 20]\n",
" \n",
" # merge watched_video, tried_problem, nactive \n",
" module_activity = watched_video.merge(tried_problem, how='outer').merge(nactive, how='outer').fillna(0)\n",
" return module_activity[module_activity.chapter_name != 0].sort_values('index').set_index('chapter_name')\n",
" \n",
"module_activity = query_moduleActivity()\n",
"# create a list of course indices to make sure courses are ordered in the visualization\n",
"indices = module_activity.index"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"def query_cs(course_id = course_id): \n",
" \"\"\"\n",
" Given course_id (e.g. 'UBCx__Climate1x__1T2016'), \n",
" return a list of all the course items (graded_problem, self_test, video, assignment, chapter) \n",
" from course_axis table ordered by index.\n",
" IMPORTANT: Need to update course_axis in SPD1x first (delete items that belong to SPD2x and SPD3x)\n",
" \"\"\"\n",
" query = \"\"\"\n",
" SELECT\n",
" Case \n",
" When c1.category='problem' And c1.graded='true' Then 'graded_problem'\n",
" When c1.category='problem' And c1.graded!='true' Then 'self_test' \n",
" Else c1.category\n",
" End As category, c1.index As index, c1.name As name,\n",
" c1.url_name As url_name, c2.name As chapter\n",
" FROM [[{0}.course_axis] c1\n",
" Left Join [{0}.course_axis] c2\n",
" On c1.chapter_mid = c2.module_id\n",
" Where c1.category in ('video', 'problem', 'openassessment', 'chapter')\n",
" Order By c1.index\"\"\".format(course_id)\n",
"\n",
" structure = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" structure = structure[(structure.name.isin(indices)) | (structure.chapter.isin(indices))]\n",
" \n",
" query = \"\"\"\n",
" Select problem_url_name, exact_count_distinct(item.answer_id) As num\n",
" From [{0}.problem_analysis]\n",
" Group By problem_url_name\"\"\".format(course_id)\n",
" nQuestions = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" structure = structure.merge(nQuestions, left_on='url_name', right_on='problem_url_name', how='left')\\\n",
" .drop('problem_url_name', axis=1)\n",
" structure.num = structure.num.fillna(1)\n",
" return structure\n",
"\n",
"course_structure = query_cs()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# count # of videos, graded_problems, self_test and assigments for each chapter and order te courses\n",
"cs_chapter = course_structure.groupby(['chapter', 'category']).num.sum().unstack('category')\\\n",
".reindex(indices).dropna(how='all')\n",
"\n",
"cols = []\n",
"# some courses don't have all the items\n",
"for col in ['video', 'graded_problem', 'self_test', 'openassessment']:\n",
" if col in cs_chapter.columns.values:\n",
" cols.append(col)\n",
"cs_chapter = cs_chapter[cols]\n",
"cs_chapter['chapter'] = np.nan\n",
"cs_chapter.fillna(0, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Course structure and activity"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"module_activity = module_activity.reindex(cs_chapter.index)\n",
"fig = tls.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles=('Course structure', 'Module activity'))\n",
"\n",
"colors = {'video': 'rgb(202,178,214)', 'graded_problem': 'rgb(66,146,198)', \n",
" 'self_test': 'rgb(166,206,227)', 'openassessment': 'rgb(116,196,118)', \n",
" 'chapter': 'rgb(0, 0, 0)'}\n",
"\n",
"# traces for module activity\n",
"fig.append_trace(go.Scatter(y=module_activity.index, x=module_activity.watched_video, \n",
" name='watched a video', fill='tozerox', mode='lines', \n",
" line=dict(color='rgb(152,78,163)')), 1, 2)\n",
"fig.append_trace(go.Scatter(y=module_activity.index, x=module_activity.tried_problem, \n",
" name='tried a problem', fill='tonextx', mode='lines', \n",
" line=dict(color='rgb(66,146,198)')), 1, 2)\n",
"fig.append_trace(go.Scatter(y=module_activity.index, x=module_activity.nactive, \n",
" name='with any activity', fill='tonextx', mode='lines', \n",
" line=dict(color='rgb(255,127,0)')), 1, 2)\n",
"\n",
"# traces for course structure\n",
"for i in range(0, cs_chapter.shape[1]):\n",
" fig.append_trace(go.Bar(y=cs_chapter.index, x=cs_chapter.ix[:, i], orientation='h',\n",
" marker=dict(color=colors[cs_chapter.columns[i]]), name=cs_chapter.columns[i]), 1, 1)\n",
"\n",
"fig['layout']['yaxis1'].update(tickfont=dict(size=8), showgrid=False, autorange='reversed')\n",
"fig['layout']['yaxis2'].update(showticklabels=False, showgrid=False, autorange='reversed')\n",
"fig['layout']['xaxis1'].update(showgrid=False)\n",
"fig['layout']['xaxis2'].update(showgrid=False)\n",
"fig['layout']['legend'].update(x=1, y=0, traceorder='normal')\n",
"fig['layout'].update(height=50+30*len(cs_chapter), width=850, margin=go.Margin(l=185, t=25, b=20), barmode='stack')\n",
"py.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# def rolling_count(df):\n",
"# df['block'] = (df['category'] != df['category'].shift(1)).astype(int).cumsum()\n",
"# df['count'] = df.groupby('block').num.cumsum()\n",
"# return df\n",
"# # count # of times an item (graded_problem, self_test, video) appears consecutively\n",
"# df = course_structure.fillna(method='bfill')\n",
"# df = df.groupby('chapter').apply(rolling_count)\n",
"# idx = df.groupby(['chapter', 'block'])['count'].transform(max) == df['count']\n",
"# df = df.ix[idx]\n",
"\n",
"# # plotting \n",
"# data = [go.Bar(x=df['count'], y=[course_id.replace('__', '/').replace('_', '.')]*len(df), \n",
"# orientation='h', hoverinfo='y',\n",
"# marker=dict(color=df.category.apply(lambda x: colors[x]).values))]\n",
"# layout = go.Layout(\n",
"# xaxis=dict(tickfont=dict(size=8), showgrid=False),\n",
"# yaxis=dict(showticklabels=False),\n",
"# barmode='stack', \n",
"# width=850,\n",
"# height=50,\n",
"# margin=go.Margin(b=15, t=0, l=100)\n",
"# )\n",
"# fig = go.Figure(data=data, layout=layout)\n",
"# py.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def course_item(course_id=course_id):\n",
" \"\"\"\n",
" Given course_id, query students' event for video, graded_problem, \n",
" self_test, openassessment and chapter from the studentmodule table\n",
" => the numbers are slightly different from thosed queried from person_item and video_stats_day\n",
" \"\"\"\n",
" query = \"\"\"\n",
" SELECT sub.module_id As item_id, c.index As index, name, category, nstudents\n",
" FROM [ubcxdata:{0}.course_axis] c\n",
" Join \n",
" (Select Regexp_replace(module_id,'i4x://', '') As module_id, exact_count_distinct(student_id) As nstudents\n",
" From [ubcxdata:{0}.studentmodule]\n",
" Where module_type In ('openassessment', 'chapter')\n",
" Group By module_id) sub\n",
" On sub.module_id = c.module_id\n",
" Order By index\"\"\".format(course_id)\n",
" chapter_assign = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
"\n",
" query = \"\"\"\n",
" Select problem_url_name as item_id, index, name, \n",
" Case When graded='true' Then 'graded_problem' Else 'self_test' End As category,\n",
" exact_count_distinct(user_id) As nstudents\n",
" From [{0}.problem_analysis] p\n",
" Join [{0}.course_axis] c\n",
" On p.problem_url_name= c.url_name\n",
" Group By item_id, index, name, category\n",
" Order By index\"\"\".format(course_id)\n",
" nproblems = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
"\n",
" query = \"\"\"\n",
" Select video_id as item_id, index_video as index, name, 'video' As category, videos_viewed As nstudents\n",
" From [{0}.video_stats]\n",
" Where videos_viewed > 20\"\"\".format(course_id)\n",
" nvideos = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False).dropna()\n",
"\n",
" courseItem = pd.concat([chapter_assign, nproblems, nvideos]).sort_values('index')\n",
" courseItem = courseItem[courseItem.nstudents > 20].reset_index(drop=True)\n",
" return courseItem\n",
"\n",
"courseItem = course_item()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# make it center in the middle\n",
"trace1 = go.Bar(x=courseItem.index+1, y=courseItem.nstudents, hoverinfo='text',\n",
" text=['{0}:
nstudents: {1}'.format(name.encode('utf-8'), value) \n",
" for name, value in zip(courseItem.name, courseItem.nstudents)],\n",
" marker=dict(color=courseItem.category.apply(lambda x: colors[x]).values))\n",
"trace2 = go.Bar(x=courseItem.index+1, y=-courseItem.nstudents, hoverinfo='none',\n",
" marker=dict(color=courseItem.category.apply(lambda x: colors[x]).values))\n",
"data = [trace1, trace2]\n",
"layout = go.Layout(barmode='relative', title='course structure vs. students activity', \n",
" xaxis=dict(showticklabels=False, title='course_structure'), \n",
" yaxis=dict(showticklabels=False, showgrid=False, title='nstudents', zeroline=False), \n",
" height=300, width=850, margin=go.Margin(t=25, b=15), showlegend=False)\n",
"fig = go.Figure(data=data, layout=layout)\n",
"py.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"def query_nstudents(cs = course_structure, course_id = course_id):\n",
" \"\"\"\n",
" Query and calculate number of students viewed the video, attempted the problem\n",
" \"\"\"\n",
" query = \"\"\"\n",
" Select l.video_id As video_id, nstudents, position As length\n",
" From (Select video_id, position, Row_number() Over (Partition By video_id Order By position Desc) As rn\n",
" From [{0}.video_stats_day]) l\n",
" Join \n",
" (Select video_id, exact_count_distinct(username) As nstudents\n",
" From [{0}.video_stats_day]\n",
" Where position != 0\n",
" Group By video_id) n\n",
" On l.video_id = n.video_id\n",
" Where rn=5\"\"\".format(course_id)\n",
" videos = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" videos = cs[cs.category=='video'].merge(videos, left_on='url_name', right_on='video_id')\n",
" videos = videos[(videos.length<1800) & (videos.nstudents>10)].reset_index(drop=True)\n",
"\n",
" query = \"\"\"\n",
" SELECT c.chapter_name As chapter, c.section_name As section_name,\n",
" p.item_short_id As name, c.item_id As problem_id,\n",
" count(*) As nstudents, Sum(item_grade) As ncorrect\n",
" FROM [{0}.person_item] p\n",
" Join [{0}.course_item] c\n",
" On p.item_short_id = c.item_short_id\n",
" Group By chapter, section_name, name, c.item_nid, problem_id\n",
" Order By c.item_nid\"\"\".format(course_id)\n",
"\n",
" graded_problems = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" graded_problems = graded_problems[graded_problems.ncorrect != 0].reset_index(drop=True)\n",
"\n",
" return videos, graded_problems"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"videos, problems = query_nstudents(cs = course_structure)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Video activity"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"choices = ['rgba(166, 206, 227, 0.8)', 'rgba(31, 120, 180, 0.8)', 'rgba(178, 223, 138, 0.8)', 'rgba(51, 160, 44, 0.8)', \n",
" 'rgba(251, 154, 153, 0.8)', 'rgba(227, 26, 28, 0.8)', 'rgba(253, 191, 111, 0.8)', 'rgba(255, 127, 0, 0.8)', \n",
" 'rgba(202, 178, 214, 0.8)', 'rgba(106,61,154, 0.8)']\n",
"# create a dictionary to map colors to chapters\n",
"colors_chapter = dict(zip(cs_chapter.index, choices[:len(cs_chapter.index)]))"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# x-axis needs to start from 1, videos.length and videos.length are normalized so that maximum=100\n",
"# hoverinfo => question_name: actual value\n",
"trace1 = go.Bar(x = videos.index+1, y = videos.length/(videos.length.max()/100) , \n",
" text=['{0}: {1}s'.format(name.encode('utf-8'), value) \n",
" for name, value in zip(videos.name, videos.length.round(2))], hoverinfo='text',\n",
" marker=dict(color=videos.chapter.map(colors_chapter)), name='video_length')\n",
"trace2 = go.Bar(x = videos.index+1, y = -videos.nstudents/(videos.nstudents.max()/100), \n",
" text=['{0}: {1}'.format(name.encode('utf-8'), value) \n",
" for name, value in zip(videos.name, videos.nstudents)], hoverinfo='text',\n",
" marker=dict(color=videos.chapter.map(colors_chapter)), name='nstudents_watched')\n",
"\n",
"\n",
"data = [trace1, trace2]\n",
"layout = go.Layout(barmode='relative', xaxis=dict(showticklabels=False), \n",
" yaxis=dict(showticklabels=False, showgrid=False, title='nstudents_watched video_length (s)'), \n",
" height=360, width=850, margin=go.Margin(t=25, b=25), showlegend=False)\n",
"fig = go.Figure(data=data, layout=layout)\n",
"py.iplot(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Graded problem activity"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# x-axis needs to start from 1, problems.nstudents is normalized so that maximum=100 (the same as pct_correct)\n",
"# hoverinfo: question_name: actual value\n",
"pct_correct = (problems.ncorrect/problems.nstudents*100).round(2)\n",
"trace1 = go.Bar(x=problems.index+1, y = pct_correct, \n",
" text=['{0}
{1}: {2}'.format(section.encode('utf-8'), name.encode('utf-8'), value) \n",
" for section, name, value in zip(problems.section_name, problems.name, pct_correct)],\n",
" hoverinfo='text', marker=dict(color=problems.chapter.map(colors_chapter)), name='pct_correct')\n",
"trace2 = go.Bar(x=problems.index+1, y = -problems.nstudents/(problems.nstudents.max()/100), \n",
" text=['{0}
{1}: {2}'.format(section.encode('utf-8'), name.encode('utf-8'), value) \n",
" for section, name, value in zip(problems.section_name, problems.name, problems.nstudents)], \n",
" hoverinfo='text', marker=dict(color=problems.chapter.map(colors_chapter)), name='nstudents_attempted')\n",
"\n",
"data = [trace1, trace2]\n",
"layout = go.Layout(barmode='relative', xaxis=dict(showticklabels=False), \n",
" yaxis=dict(showticklabels=False, showgrid=False, title='nstudents_attempted pct_correct'), \n",
" height=360, width=850, margin=go.Margin(t=25, b=25), showlegend=False)\n",
"fig = go.Figure(data=data, layout=layout)\n",
"py.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"def query_least(course_id=course_id):\n",
" \"\"\"\n",
" Given the course_id, return the distribution of answers for the 10 least successful graded_problems\n",
" \"\"\"\n",
" least = \"', '\".join(problems.ix[pct_correct.argsort()[:10], 'problem_id'].values)\n",
" # query 10 least successful problems and corresponding responses by each student\n",
" query = \"\"\"\n",
" Select item.answer_id As problem_id, user_id, item.response As response, item.correctness\n",
" From [{0}.problem_analysis]\n",
" Where item.answer_id in ('{1}')\n",
" \"\"\".format(course_id, least)\n",
"\n",
" answers = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" answers = answers.merge(problems[['name', 'problem_id']])\n",
" # the correct answers: most common responses\n",
" correct = answers[answers.item_correctness=='correct'].groupby('name').response.first().to_dict()\n",
" # total # of responses\n",
" count = answers.groupby('name').response.count()\n",
" \n",
" answers.response = answers.response.apply(lambda x: x.replace('[', '').replace(']', '').split(', '))\n",
" rows = []\n",
" # explode items in a list to multiple rows <= those with more than 1 answer\n",
" _ = answers.apply(lambda row: [rows.append([row['name'], row['user_id'], choice]) \n",
" for choice in row.response], axis=1)\n",
" answers_new = pd.DataFrame(rows, columns=['name', 'user_id', 'response'])#.set_index(['name', 'opponent'])\n",
" # aggregate and calculate % distribution of answers for each problem\n",
" answers_pct = answers_new.groupby('name').response.value_counts().unstack('name').divide(count, axis=1)\n",
" return correct, answers_pct#, answers"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"if course_id in not_mc_list:\n",
" pass\n",
"else:\n",
" correct, answers_pct = query_least()\n",
" def correct_color(col):\n",
" # green for correct answers, blue otherwise\n",
" return ['rgb(44,162,95)' if x in correct[answers_pct.columns[col]] else 'rgb(49,130,189)' for x in answers_pct.index]\n",
"\n",
" fig = tls.make_subplots(rows=2, cols=5, print_grid=False, vertical_spacing=0.25)\n",
" # the first five problems\n",
" for i in range(5):\n",
" quesion = answers_pct.ix[:, i].dropna()\n",
" fig.append_trace(go.Bar(x=quesion.index, y=quesion, name=answers_pct.columns[i], \n",
" marker=dict(color=correct_color(i)), showlegend=False), 1, i+1)\n",
" # the next five problems\n",
" for i in range(5):\n",
" quesion = answers_pct.ix[:, i+5].dropna()\n",
" fig.append_trace(go.Bar(x=quesion.index, y=quesion, name=answers_pct.columns[i],\n",
" marker=dict(color=correct_color(i+5)), showlegend=False), 2, i+1) \n",
"\n",
" for i in range(1, answers_pct.shape[1]+1):\n",
" fig['layout']['xaxis%s' % i].update(tickangle=45, tickfont=dict(size=8),\n",
" title=answers_pct.columns[i-1], titlefont=dict(size=10))\n",
" fig['layout']['yaxis%s' % i].update(showgrid=False, tickfont=dict(size=8))\n",
"\n",
" fig['layout'].update(height=500, width=850, \n",
" title = 'Ten least successful graded problems') \n",
" py.iplot(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Overall engagement:\n",
"** Learner type **\n",
"- Registered: learners registered in the course\n",
"- Sampled: learners who accessed the course at least once\n",
"- Involved: learners with sum_dt > 15 min\n",
"- Passed: learners whose grade is at least 50%\n",
"- Verified: those purchased the verified certificate\n",
"\n",
"** sum_dt **: Total elapsed time spent by learner on this course, based on time difference between consecutive events, with a 5 min max cutoff, based on event data"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"def query_pc(course_id = course_id): \n",
" \"\"\"\n",
" Given course_id(e.g. 'UBCx__Marketing1x__3T2015'), query and calculate ndays_act, sum_dt, nforum_posts, \n",
" nvideos_watched, nproblems_attempted, pct_video_watched, pct_problem_attempted for sampled students,\n",
" also return total_videos and total_problems.\n",
" \"\"\"\n",
" query = \"\"\"\n",
" Select pc.user_id As user_id, pc.course_id As course_id, pc.mode As mode, pc.grade As grade, \n",
" pc.ndays_act As ndays_act, pc.sum_dt As sum_dt, pc.nforum_posts As nforum_posts,\n",
" v.videos_watched As nvideos_watched, p.problems_attempted As nproblems_attempted\n",
" From [{0}.person_course] pc\n",
" Left Join\n",
" (SELECT username, exact_count_distinct(video_id) As videos_watched \n",
" FROM [{0}.video_stats_day]\n",
" Group By username) v\n",
" on pc.username = v.username\n",
" Left Join \n",
" (Select user_id, exact_count_distinct(item.answer_id) As problems_attempted\n",
" From [{0}.problem_analysis]\n",
" Group By user_id) p\n",
" On pc.user_id = p.user_id\n",
" Where pc.sum_dt > 0\"\"\".format(course_id)\n",
" df = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False).fillna(0)\n",
"\n",
" # course_axis includes items not accessible to the students, \n",
" # => total_videos/total_problems are maximum number of videos/problems students accessed \n",
" # if smaller than the number from course_axis then use the latter one\n",
" total_videos = min(df.nvideos_watched.max(), cs_chapter.video.sum())\n",
" df['pct_video_watched'] = df.nvideos_watched / total_videos\n",
" \n",
" total_problems = min(df.nproblems_attempted.max(), \n",
" cs_chapter.graded_problem.sum() + cs_chapter.self_test.sum() if 'self_test' in cs_chapter.columns \n",
" else cs_chapter.graded_problem.sum())\n",
" df['pct_problem_attempted'] = df.nproblems_attempted / total_problems\n",
" \n",
" \n",
" return total_videos, total_problems, df"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"def compute_srp(pc):\n",
" # pc for the sampled\n",
" pc_sampled = pc.copy()\n",
" # pc for the involved\n",
" pc_learned = pc[pc.sum_dt>900].copy()\n",
" # pc for the passed\n",
" pc_passed = pc[pc.grade>=0.5].copy()\n",
" # added category column\n",
" pc_sampled['category'] = 'Sampled'\n",
" pc_learned['category'] = 'Involved'\n",
" pc_passed['category'] = 'Passed'\n",
" srp = pd.concat([pc_sampled, pc_learned, pc_passed])\n",
" # aggregate nforum_posts => # students posted, others by median, by learner type\n",
" srp_agg = srp.groupby('category').agg({'nvideos_watched': np.median, \n",
" 'nproblems_attempted': np.median, 'ndays_act': np.median, \n",
" 'sum_dt': np.median, 'nforum_posts': lambda x: (x > 0).sum()})\n",
" srp_agg = srp_agg.reindex(index = ['Sampled', 'Involved', 'Passed'])\n",
" \n",
" return srp_agg"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"total_videos, total_problems, pc = query_pc()\n",
"# convert to %\n",
"pc[['pct_video_watched', 'pct_problem_attempted']] = \\\n",
"pc[['pct_video_watched', 'pct_problem_attempted']].applymap(lambda x: \"{0:.2f}\".format(x * 100))\n",
"srp_agg = compute_srp(pc)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": true,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"def plot_pls(df, course_id=course_id, title=None):\n",
" \"\"\"\n",
" Plot students' activity: median sum_dt, median ndays_act, # students posted, \n",
" nproblems_attempted, nvideos_watched, grouped by passed vs. involved vs. sampled;\n",
" \"\"\"\n",
" if course_id in pe_list:\n",
" query = \\\n",
" \"\"\"SELECT Count(*) As Registered, \n",
" Sum(Case When sum_dt > 0 Then 1 Else 0 End) As Sampled,\n",
" Sum(Case When sum_dt > 900 Then 1 Else 0 End) As Involved, \n",
" Sum(Case When grade >= 0.5 Then 1 Else 0 End) As Passed\n",
" FROM [%s.person_course]\"\"\" % course_id\n",
" else:\n",
" query = \\\n",
" \"\"\"SELECT Count(*) As Registered, \n",
" Sum(Case When sum_dt > 0 Then 1 Else 0 End) As Sampled,\n",
" Sum(Case When sum_dt > 900 Then 1 Else 0 End) As Involved, \n",
" Sum(Case When grade >= 0.5 Then 1 Else 0 End) As Passed, \n",
" Sum(Case When mode='verified' Then 1 Else 0 End) As Verified\n",
" FROM [%s.person_course]\"\"\" % course_id\n",
" stats = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n",
" # print stats\n",
"\n",
"\n",
" trace1 = go.Bar(x=stats.values[0], y=stats.columns, orientation='h', showlegend=False, name='# of conversion')\n",
" trace2 = go.Bar(x=df.index, y=df.nvideos_watched, showlegend=True, name='nvideos watched')\n",
" trace3 = go.Bar(x=df.index, y=df.nproblems_attempted, showlegend=True, name='nproblems attempted')\n",
" trace4 = go.Bar(x=df.index, y=df.sum_dt/3600, showlegend=False, name='median sum_dt (H)')\n",
" trace5 = go.Bar(x=df.index, y=df.ndays_act, showlegend=False, name='median days active')\n",
" trace6 = go.Bar(x=df.index, y=df.nforum_posts, showlegend=False, name='# students posted')\n",
"\n",
" fig = tls.make_subplots(rows=1, cols=5, shared_xaxes=True, print_grid=False)\n",
" fig.append_trace(trace1, 1, 1)\n",
" fig.append_trace(trace2, 1, 5)\n",
" fig.append_trace(trace3, 1, 5)\n",
" fig.append_trace(trace4, 1, 2)\n",
" fig.append_trace(trace5, 1, 3)\n",
" fig.append_trace(trace6, 1, 4)\n",
"\n",
"\n",
" fig['layout'].update(barmode='stack', height=300, width=900, margin=go.Margin(t=40), title=title)\n",
" fig['layout']['legend'].update(font=dict(size=10))\n",
" fig['layout']['xaxis1'].update(title='# students', showgrid=False,\n",
" titlefont=dict(size=12), tickfont=dict(size=10))\n",
" fig['layout']['xaxis2'].update(title='median sum_dt (H)', showgrid=False,\n",
" titlefont=dict(size=12), tickfont=dict(size=10))\n",
" fig['layout']['xaxis3'].update(title='median days active', showgrid=False,\n",
" titlefont=dict(size=12), tickfont=dict(size=10))\n",
" fig['layout']['xaxis4'].update(title='# students posted', showgrid=False,\n",
" titlefont=dict(size=12), tickfont=dict(size=10))\n",
" fig['layout']['xaxis5'].update(title='median events', showgrid=False, \n",
" titlefont=dict(size=12), tickfont=dict(size=10))\n",
" fig['layout']['yaxis1'].update(autorange='reversed', showgrid=False, tickfont=dict(size=10))\n",
" fig['layout']['yaxis2'].update(showgrid=False, tickfont=dict(size=10))\n",
" fig['layout']['yaxis3'].update(showgrid=False, tickfont=dict(size=10))\n",
" fig['layout']['yaxis4'].update(showgrid=False, tickfont=dict(size=10))\n",
" fig['layout']['yaxis5'].update(showgrid=False, tickfont=dict(size=10))\n",
"\n",
" py.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"scrolled": true,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plot_pls(srp_agg, title=\"Students' engagement: Sampled vs. Involved vs. Passed\")"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# pc for the involved\n",
"pc_activity = pc[pc.sum_dt>900].copy()\n",
"# density map\n",
"trace2 = go.Histogram2d(x=pc_activity.pct_video_watched, y=pc_activity.pct_problem_attempted,\n",
" histnorm='probability',\n",
" autobinx=False,\n",
" xbins=dict(start=0, end=100, size=10),\n",
" autobiny=False,\n",
" ybins=dict(start=0, end=100, size=10),\n",
" colorscale=[[0, 'rgb(8,81,156)'], [1/1000, 'rgb(8,81,156)'], [1/100, 'rgb(242,211,56)'], \n",
" [1/10, 'rgb(242,143,56)'], [1, 'rgb(217,30,30)']],\n",
" zsmooth='fast')\n",
"# illustration on how to read the density map\n",
"z = [[1, 0.5, 0.5, 0.5], [0.5, 0.5, 0, 0], [0.5, 0, 0.5, 0], [0.5, 0, 0, 1]]\n",
"z_text = [['Early dropout', '', 'Videos only', ''], ['', 'Progress', '', ''],\n",
" ['Problems only', '', 'Progress', ''], ['', '', '', 'Completed']]\n",
"annotations = []\n",
"for n, row in enumerate(z):\n",
" for m, val in enumerate(row):\n",
" text = z_text[n][m]\n",
" annotations.append(\n",
" dict(\n",
" text=str(text),\n",
" x=m, y=n,\n",
" font=dict(color='black'),\n",
" showarrow=False)\n",
" )\n",
"colorscale=[[0, 'rgb(82,82,82)'], [0.5, 'rgb(150,150,150)'], [1, 'rgb(204,204,204)']]\n",
"trace1 = go.Heatmap(z=z, colorscale=colorscale, showscale=False, hoverinfo='none')\n",
"\n",
"fig = tls.make_subplots(rows=1, cols=2, print_grid=False)\n",
"fig.append_trace(trace1, 1, 1)\n",
"fig.append_trace(trace2, 1, 2)\n",
"\n",
"fig['layout']['xaxis1'].update(ticks='', showticklabels=False, showgrid=False, title='% videos')\n",
"fig['layout']['xaxis2'].update(title='% videos (total:{0})'.format(int(total_videos)))\n",
"fig['layout']['yaxis1'].update(ticks='', showticklabels=False, showgrid=False, zeroline=False, title='% problems')\n",
"fig['layout']['yaxis2'].update(title='% problems (total:{0})'.format(int(total_problems)))\n",
"fig['layout'].update(\n",
" width=850, height=400, annotations=annotations,\n",
" title='Density map illustrating pattern of video and problem activity for the involved',\n",
" margin=go.Margin(l=40, t=40)\n",
")\n",
"\n",
"py.iplot(fig)"
]
}
],
"metadata": {
"celltoolbar": "Slideshow",
"hide_input": false,
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
},
"toc": {
"toc_cell": false,
"toc_number_sections": false,
"toc_section_display": "none",
"toc_threshold": 6,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 0
}