{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": "true" }, "source": [ "# Table of Contents\n", "

1  Course structure: Climate vs. China300.1x
1.1  By chapter
1.2  By order of course items
1.3  Course structure vs. students' activity
2  Density map illustrating pattern of video and problem activity for the involved
3  Students engagement: multi_courses
3.1  Bar graph
3.2  Heatmap
" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "from __future__ import division\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from collections import OrderedDict\n", "\n", "import plotly\n", "import plotly.offline as py\n", "import plotly.graph_objs as go\n", "import plotly.tools as tls\n", "\n", "py.init_notebook_mode() # graphs charts inline (IPython)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Course structure: Climate vs. China300.1x\n", "### By chapter" ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def query_moduleActivity(course_id):\n", " \"\"\"\n", " Give the course_id, query # students with any activity, # students attempted any problem \n", " and # students watched any video for each chapter of the course\n", " \"\"\"\n", " # query # students attempted any problem for each chapter of the course, exclude those with less than 20 attempts\n", " query = \"\"\"\n", " Select course_id, sub.index As index, module_id, chapter_name, exact_count_distinct(user_id) As tried_problem\n", " From\n", " (SELECT p.course_id As course_id, p.user_id As user_id, c2.index As index, \n", " c2.module_id As module_id, c2.name As chapter_name\n", " FROM [{0}.problem_analysis] p\n", " Left Join [{0}.course_axis] c1\n", " on p.problem_url_name = c1.url_name\n", " Left Join [{0}.course_axis] c2\n", " On c1.chapter_mid = c2.module_id) sub\n", " Group By course_id, index, module_id, chapter_name\n", " Order By index\"\"\".format(course_id)\n", " tried_problem = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", " tried_problem = tried_problem[tried_problem.tried_problem > 20]\n", 
"\n", " # query # students watched any video for each chapter of the course, exclude those with less than 20 views\n", " query = \"\"\"\n", " Select course_id, index, module_id, chapter_name, exact_count_distinct(username) As watched_video\n", " From\n", " (SELECT c1.course_id As course_id, v.username As username, c2.index As index, \n", " c2.module_id As module_id, c2.name As chapter_name\n", " FROM [{0}.video_stats_day] v\n", " Left Join [{0}.course_axis] c1\n", " on v.video_id = c1.url_name\n", " Left Join [{0}.course_axis] c2\n", " On c1.chapter_mid = c2.module_id) sub\n", " Group By course_id, index, module_id, chapter_name\n", " Order By index\"\"\".format(course_id)\n", " watched_video = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", " watched_video = watched_video[watched_video.watched_video > 20]\n", "\n", " # query # students with any activity for each chapter of the course, excluding those with less than 20 active students\n", " query = \"\"\"\n", " Select sub.course_id As course_id, sub.module_id As module_id, \n", " c.name As chapter_name, c.index As index, sub.nactive As nactive\n", " From [{0}.course_axis] c\n", " Join \n", " (Select course_id As course_id, Regexp_replace(module_id,'i4x://', '') As module_id, \n", " exact_count_distinct(student_id) As nactive\n", " From [{0}.studentmodule]\n", " Where module_type = 'chapter' \n", " Group By course_id, module_id) sub\n", " On sub.module_id = c.module_id\n", " Order By index\"\"\".format(course_id)\n", " nactive = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", " nactive = nactive[nactive.nactive > 20]\n", " \n", " # merge watched_video, tried_problem, nactive \n", " module_activity = watched_video.merge(tried_problem, how='outer').merge(nactive, how='outer').fillna(0)\n", " return module_activity[module_activity.chapter_name != 0].sort_values('index').set_index('chapter_name')\n", " \n", "module_activity = query_moduleActivity('UBCx__Climate101x__3T2015')\n", 
"# create a list of course indices to make sure courses are ordered in the visualization\n", "indices = module_activity.index" ] }, { "cell_type": "code", "execution_count": 92, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def query_cs(course_id, indices): \n", " \"\"\"\n", " Given course_id (e.g. 'UBCx__Climate1x__1T2016'), \n", " return a list of all the course items (graded_problem, self_test, video, assignment, chapter) \n", " from course_axis table ordered by index.\n", " IMPORTANT: Need to update course_axis in SPD1x first (delete items that belong to SPD2x and SPD3x)\n", " \"\"\"\n", " query = \"\"\"\n", " SELECT\n", " Case \n", " When c1.category='problem' And c1.graded='true' Then 'graded_problem'\n", " When c1.category='problem' And c1.graded!='true' Then 'self_test' \n", " Else c1.category\n", " End As category, c1.index As index, c1.name As name,\n", " c1.url_name As url_name, c2.name As chapter\n", " FROM [{0}.course_axis] c1\n", " Left Join [{0}.course_axis] c2\n", " On c1.chapter_mid = c2.module_id\n", " Where c1.category in ('video', 'problem', 'openassessment', 'chapter')\n", " Order By c1.index\"\"\".format(course_id)\n", "\n", " structure = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", " structure = structure[(structure.name.isin(indices)) | (structure.chapter.isin(indices))]\n", " \n", " query = \"\"\"\n", " Select problem_url_name, exact_count_distinct(item.answer_id) As num\n", " From [{0}.problem_analysis]\n", " Group By problem_url_name\"\"\".format(course_id)\n", " nQuestions = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", " structure = structure.merge(nQuestions, left_on='url_name', right_on='problem_url_name', how='left')\\\n", " .drop('problem_url_name', axis=1)\n", " structure.num = structure.num.fillna(1)\n", " return structure\n", "\n", "course_structure = query_cs('UBCx__Climate101x__3T2015', indices)" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {
"collapsed": false }, "outputs": [], "source": [ "# count # of videos, graded_problems, self_test and assigments for each chapter and order te courses\n", "cs_chapter = course_structure.groupby(['chapter', 'category']).num.sum().unstack('category')\\\n", ".reindex(indices).dropna(how='all')\n", "\n", "cols = []\n", "# some courses don't have all the items\n", "for col in ['video', 'graded_problem', 'self_test', 'openassessment']:\n", " if col in cs_chapter.columns.values:\n", " cols.append(col)\n", "cs_chapter = cs_chapter[cols]\n", "cs_chapter['chapter'] = np.nan\n", "cs_chapter.fillna(0, inplace=True)" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": false }, "outputs": [], "source": [ "module_activity2 = query_moduleActivity('UBCx__China300_1x__1T2016')\n", "# create a list of course indices to make sure courses are ordered in the visualization\n", "indices2 = module_activity2.index\n", "course_structure2 = query_cs('UBCx__China300_1x__1T2016', indices2)" ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# count # of videos, graded_problems, self_test and assigments for each chapter and order te courses\n", "cs_chapter2 = course_structure2.groupby(['chapter', 'category']).num.sum().unstack('category')\\\n", ".reindex(indices2).dropna(how='all')\n", "\n", "cols = []\n", "# some courses don't have all the items\n", "for col in ['video', 'graded_problem', 'self_test', 'openassessment']:\n", " if col in cs_chapter2.columns.values:\n", " cols.append(col)\n", "cs_chapter2 = cs_chapter2[cols]\n", "cs_chapter2['chapter'] = np.nan\n", "cs_chapter2.fillna(0, inplace=True)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "module_activity = module_activity.reindex(cs_chapter.index)\n", "fig = tls.make_subplots(rows=1, cols=2, print_grid=False, \n", " subplot_titles=('UBCx/Climate101x/3T2015', 'UBCx/China300.1x/1T2016'))\n", "\n", "colors = {'video': 'rgb(202,178,214)', 'graded_problem': 'rgb(66,146,198)', \n", " 'self_test': 'rgb(166,206,227)', 'openassessment': 'rgb(116,196,118)', 'chapter': 'rgb(0, 0, 0)'}\n", "\n", "for i in range(0, cs_chapter.shape[1]):\n", " fig.append_trace(go.Bar(x=cs_chapter.index, y=cs_chapter.ix[:, i],\n", " marker=dict(color=colors[cs_chapter.columns[i]]), name=cs_chapter.columns[i]), 1, 1)\n", " \n", "# NOTE: index into cs_chapter2's own columns here -- the two courses may not share\n", "# the same set/order of item categories, so reusing cs_chapter's columns would\n", "# mislabel or miscolor the China300.1x bars\n", "for i in range(0, cs_chapter2.shape[1]):\n", " fig.append_trace(go.Bar(x=cs_chapter2.index, y=cs_chapter2.ix[:, i], \n", " marker=dict(color=colors[cs_chapter2.columns[i]]), name=cs_chapter2.columns[i], showlegend=False), 1, 2)\n", "\n", "fig['layout']['yaxis1'].update(tickfont=dict(size=8), showgrid=False)\n", "fig['layout']['yaxis2'].update(showticklabels=False, showgrid=False)\n", "fig['layout']['xaxis1'].update(showgrid=False)\n", "fig['layout']['xaxis2'].update(showgrid=False)\n", "fig['layout']['legend'].update(x=1, y=0, traceorder='normal')\n", "fig['layout'].update(height=380, width=850, margin=go.Margin(b=150, l=20, r=20, t=25), barmode='stack')\n", "py.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### By order of course items" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def rolling_count(df):\n", " df['block'] = (df['category'] != df['category'].shift(1)).astype(int).cumsum()\n", " df['count'] = df.groupby('block').num.cumsum()\n", " return df\n", "# count # of times an item (graded_problem, self_test, video) appears consecutively\n", "df = course_structure.fillna(method='bfill')\n", "df = df.groupby('chapter').apply(rolling_count)\n", "idx = 
df.groupby(['chapter', 'block'])['count'].transform(max) == df['count']\n", "df = df.ix[idx]" ] }, { "cell_type": "code", "execution_count": 98, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# plotting \n", "data = [go.Bar(x=df['count'], y=['UBCx/Climate101x/3T2015']*len(df), \n", " orientation='h', hoverinfo='y',\n", " marker=dict(color=df.category.apply(lambda x: colors[x]).values))]\n", "layout = go.Layout(\n", " xaxis=dict(tickfont=dict(size=8), showgrid=False),\n", "# yaxis=dict(showticklabels=False),\n", " barmode='stack', \n", " width=850,\n", " height=50,\n", " margin=go.Margin(b=15, t=0, l=180)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "py.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df2 = course_structure2.fillna(method='bfill')\n", "df2 = df2.groupby('chapter').apply(rolling_count)\n", "idx2 = df2.groupby(['chapter', 'block'])['count'].transform(max) == df2['count']\n", "df2 = df2.ix[idx2]" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# plotting \n", "data = [go.Bar(x=df2['count'], y=['UBCx/China300.1x/1T2016']*len(df2), \n", " orientation='h', hoverinfo='y',\n", " marker=dict(color=df2.category.apply(lambda x: colors[x]).values))]\n", "layout = go.Layout(\n", " xaxis=dict(tickfont=dict(size=8), showgrid=False),\n", "# yaxis=dict(showticklabels=False),\n", " barmode='stack', \n", " width=850,\n", " height=50,\n", " margin=go.Margin(b=15, t=0, l=180)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "py.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Course structure vs. students' activity" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def course_item(course_id):\n", " \"\"\"\n", " Given course_id, query students' event for video, graded_problem, \n", " self_test, openassessment and chapter from the studentmodule table\n", " => the numbers are slightly different from thosed queried from person_item and video_stats_day\n", " \"\"\"\n", " query = \"\"\"\n", " SELECT sub.module_id As item_id, c.index As index, name, category, nstudents\n", " FROM [ubcxdata:{0}.course_axis] c\n", " Join \n", " (Select Regexp_replace(module_id,'i4x://', '') As module_id, exact_count_distinct(student_id) As nstudents\n", " From [ubcxdata:{0}.studentmodule]\n", " Where module_type In ('openassessment', 'chapter')\n", " Group By module_id) sub\n", " On sub.module_id = c.module_id\n", " Order By index\"\"\".format(course_id)\n", " chapter_assign = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", "\n", " query = \"\"\"\n", " Select problem_url_name as item_id, index, name, \n", " Case When graded='true' Then 'graded_problem' Else 'self_test' End As category,\n", " exact_count_distinct(user_id) As nstudents\n", " From [{0}.problem_analysis] p\n", " Join [{0}.course_axis] c\n", " On p.problem_url_name= 
c.url_name\n", " Group By item_id, index, name, category\n", " Order By index\"\"\".format(course_id)\n", " nproblems = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)\n", "\n", " query = \"\"\"\n", " Select video_id as item_id, index_video as index, name, 'video' As category, videos_viewed As nstudents\n", " From [{0}.video_stats]\n", " Where videos_viewed > 20\"\"\".format(course_id)\n", " nvideos = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False).dropna()\n", "\n", " courseItem = pd.concat([chapter_assign, nproblems, nvideos]).sort_values('index')\n", " courseItem = courseItem[courseItem.nstudents > 20].reset_index(drop=True)\n", " return courseItem\n", "\n", "courseItem = course_item('UBCx__Climate101x__3T2015')" ] }, { "cell_type": "code", "execution_count": 102, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# make it center in the middle\n", "trace1 = go.Bar(x=courseItem.index+1, y=courseItem.nstudents, hoverinfo='text',\n", " text=['{0}:
nstudents: {1}'.format(name.encode('utf-8'), value) \n", " for name, value in zip(courseItem.name, courseItem.nstudents)],\n", " marker=dict(color=courseItem.category.apply(lambda x: colors[x]).values))\n", "trace2 = go.Bar(x=courseItem.index+1, y=-courseItem.nstudents, hoverinfo='none',\n", " marker=dict(color=courseItem.category.apply(lambda x: colors[x]).values))\n", "data = [trace1, trace2]\n", "layout = go.Layout(barmode='relative', title='UBCx/Climate101x/3T2015', \n", " xaxis=dict(showticklabels=False, title='course_structure'), \n", " yaxis=dict(showticklabels=False, showgrid=False, title='nstudents', zeroline=False), \n", " height=300, width=850, margin=go.Margin(t=25, b=15), showlegend=False)\n", "fig = go.Figure(data=data, layout=layout)\n", "py.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "courseItem2 = course_item('UBCx__China300_1x__1T2016')\n", "# make it center in the middle\n", "trace1 = go.Bar(x=courseItem2.index+1, y=courseItem2.nstudents, hoverinfo='text',\n", " text=['{0}:
nstudents: {1}'.format(name.encode('utf-8'), value) \n", " for name, value in zip(courseItem2.name, courseItem2.nstudents)],\n", " marker=dict(color=courseItem2.category.apply(lambda x: colors[x]).values))\n", "trace2 = go.Bar(x=courseItem2.index+1, y=-courseItem2.nstudents, hoverinfo='none',\n", " marker=dict(color=courseItem2.category.apply(lambda x: colors[x]).values))\n", "data = [trace1, trace2]\n", "layout = go.Layout(barmode='relative', title='UBCx/China300.1x/1T2016', \n", " xaxis=dict(showticklabels=False, title='course_structure'), \n", " yaxis=dict(showticklabels=False, showgrid=False, title='nstudents', zeroline=False), \n", " height=300, width=850, margin=go.Margin(t=25, b=15), showlegend=False)\n", "fig = go.Figure(data=data, layout=layout)\n", "py.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Density map illustrating pattern of video and problem activity for the involved" ] }, { "cell_type": "code", "execution_count": 104, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def query_pc(course_id): \n", " \"\"\"\n", " Given course_id(e.g. 
'UBCx__Marketing1x__3T2015'), query and calculate ndays_act, sum_dt, nforum_posts, \n", " nvideos_watched, nproblems_attempted, pct_video_watched, pct_problem_attempted for sampled students,\n", " also return total_videos and total_problems.\n", " \"\"\"\n", " query = \"\"\"\n", " Select pc.user_id As user_id, pc.course_id As course_id, pc.mode As mode, pc.grade As grade, \n", " pc.ndays_act As ndays_act, pc.sum_dt As sum_dt, pc.nforum_posts As nforum_posts,\n", " v.videos_watched As nvideos_watched, p.problems_attempted As nproblems_attempted\n", " From [{0}.person_course] pc\n", " Left Join\n", " (SELECT username, exact_count_distinct(video_id) As videos_watched \n", " FROM [{0}.video_stats_day]\n", " Group By username) v\n", " on pc.username = v.username\n", " Left Join \n", " (Select user_id, exact_count_distinct(item.answer_id) As problems_attempted\n", " From [{0}.problem_analysis]\n", " Group By user_id) p\n", " On pc.user_id = p.user_id\n", " Where pc.sum_dt > 0\"\"\".format(course_id)\n", " df = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False).fillna(0)\n", "\n", " # course_axis includes items not accessible to the students, \n", " # => total_videos/total_problems are maximum number of videos/problems students accessed \n", " # if smaller than the number from course_axis then use the latter one\n", " total_videos = min(df.nvideos_watched.max(), cs_chapter.video.sum())\n", " df['pct_video_watched'] = df.nvideos_watched / total_videos\n", " \n", " total_problems = min(df.nproblems_attempted.max(), \n", " cs_chapter.graded_problem.sum() + cs_chapter.self_test.sum() if 'self_test' in cs_chapter.columns \n", " else cs_chapter.graded_problem.sum())\n", " df['pct_problem_attempted'] = df.nproblems_attempted / total_problems\n", " \n", " \n", " return total_videos, total_problems, df" ] }, { "cell_type": "code", "execution_count": 105, "metadata": { "collapsed": true }, "outputs": [], "source": [ "total_videos, total_problems, pc = 
query_pc('UBCx__Climate101x__3T2015')\n", "pc[['pct_video_watched', 'pct_problem_attempted']] = \\\n", "pc[['pct_video_watched', 'pct_problem_attempted']].applymap(lambda x: \"{0:.2f}\".format(x * 100))" ] }, { "cell_type": "code", "execution_count": 106, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# pc for the involved\n", "pc_activity = pc[pc.sum_dt>900].copy()\n", "# density map\n", "trace1 = go.Histogram2d(x=pc_activity.pct_video_watched, y=pc_activity.pct_problem_attempted,\n", " histnorm='probability',\n", " autobinx=False,\n", " xbins=dict(start=0, end=100, size=10),\n", " autobiny=False,\n", " ybins=dict(start=0, end=100, size=10),\n", " colorscale=[[0, 'rgb(8,81,156)'], [1/1000, 'rgb(8,81,156)'], [1/100, 'rgb(242,211,56)'], \n", " [1/10, 'rgb(242,143,56)'], [1, 'rgb(217,30,30)']],\n", " showscale=False)\n", "trace2 = go.Histogram2d(x=pc_activity.pct_video_watched, y=pc_activity.pct_problem_attempted,\n", " histnorm='probability',\n", " autobinx=False,\n", " xbins=dict(start=0, end=100, size=25),\n", " autobiny=False,\n", " ybins=dict(start=0, end=100, size=25),\n", " colorscale=[[0, 'rgb(8,81,156)'], [1/1000, 'rgb(8,81,156)'], [1/100, 'rgb(242,211,56)'], \n", " [1/10, 'rgb(242,143,56)'], [1, 'rgb(217,30,30)']],\n", " showscale=False)\n", "trace3 = go.Histogram2d(x=pc_activity.pct_video_watched, y=pc_activity.pct_problem_attempted,\n", " histnorm='probability',\n", " autobinx=False,\n", " xbins=dict(start=0, end=100, size=10),\n", " autobiny=False,\n", " ybins=dict(start=0, end=100, size=10),\n", " colorscale=[[0, 'rgb(8,81,156)'], [1/1000, 'rgb(8,81,156)'], [1/100, 'rgb(242,211,56)'], \n", " [1/10, 'rgb(242,143,56)'], [1, 'rgb(217,30,30)']],\n", " colorbar=dict(thickness=20), zsmooth='fast')\n", "\n", "fig = tls.make_subplots(rows=1, cols=3, print_grid=False)\n", "fig.append_trace(trace1, 1, 1)\n", "fig.append_trace(trace2, 1, 2)\n", "fig.append_trace(trace3, 1, 3)\n", "\n", "fig['layout']['xaxis1'].update(title='% videos (total:{0})'.format(int(total_videos)))\n", "fig['layout']['xaxis2'].update(title='% videos (total:{0})'.format(int(total_videos)))\n", "fig['layout']['xaxis3'].update(title='% videos (total:{0})'.format(int(total_videos)))\n", 
"fig['layout']['yaxis1'].update(title='% problems (total:{0})'.format(int(total_problems)))\n", "fig['layout']['yaxis2'].update(showticklabels=False)\n", "fig['layout']['yaxis3'].update(showticklabels=False)\n", "fig['layout'].update(\n", " width=850, height=350, \n", " title='UBCx/Climate101x/3T2015',\n", " margin=go.Margin(l=40, t=40)\n", ")\n", "py.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Students engagement: multi_courses\n", "### Bar graph" ] }, { "cell_type": "code", "execution_count": 107, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# only change course_list (for all courses on edX), mooc_list (Moocs) and pe_list (professional education) if needed\n", "# list for all the courses\n", "course_list = [\n", " 'UBCx__Marketing1x__3T2015',\n", " 'UBCx__Climate1x__2T2016', \n", " 'UBCx__SPD1x__2T2016',\n", " 'UBCx__SPD2x__2T2016',\n", " 'UBCx__SPD3x__2T2016',\n", " 'UBCx__UseGen_1x__1T2016',\n", " 'UBCx__UseGen_2x__1T2016',\n", " 'UBCx__China300_1x__1T2016',\n", " 'UBCx__China300_2x__1T2016',\n", " 'UBCx__Forest222x__1T2015',\n", " 'UBCx__IndEdu200x__3T2015',\n", " 'UBCx__Water201x_2__2T2015',\n", " 'UBCx__CW1_1x__1T2016',\n", " 'UBCx__CW1_2x__1T2016',\n", " 'UBCx__Phot1x__1T2016',\n", " 'UBCx__ITSx__2T2015'\n", "]\n", "indices = [course.replace('__', '/').replace('_', '.') for course in course_list]\n", "indices[indices.index('UBCx/Water201x.2/2T2015')] = 'UBCx/Water201x_2/2T2015'" ] }, { "cell_type": "code", "execution_count": 108, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "# dictionaries to store total # of videos/problems for each course in the course_list\n", "total_videos = {}\n", "total_problems = {}\n", "def query_pcs_learned(course_list = course_list):\n", " \"\"\"\n", " Iterate over each course in the course_list, query and calculate ndays_act, sum_dt, nforum_posts, \n", " nvideos_watched, nproblems_attempted, 
pct_video_watched, pct_problem_attempted for involved students.\n", " Update total_videos and total_problems\n", " \"\"\"\n", " dfs = []\n", " for i in range(len(course_list)):\n", " query = \"\"\"\n", " Select pc.user_id As user_id, pc.course_id As course_id, pc.mode As mode, pc.grade As grade, \n", " pc.ndays_act As ndays_act, pc.sum_dt / 3600 As sum_dt, pc.nforum_posts As nforum_posts,\n", " v.videos_watched As nvideos_watched, p.problems_attempted As nproblems_attempted\n", " From [{0}.person_course] pc\n", " Left Join\n", " (SELECT username, Count(Distinct video_id) As videos_watched \n", " FROM [{0}.video_stats_day]\n", " Group By username) v\n", " on pc.username = v.username\n", " Left Join \n", " (Select user_id, Count(Distinct item.answer_id) As problems_attempted\n", " From [{0}.problem_analysis]\n", " Group By user_id) p\n", " On pc.user_id = p.user_id\n", " Where pc.sum_dt > 900\"\"\".format(course_list[i])\n", " df = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False).fillna(0)\n", "\n", " course_id = indices[i]\n", " total_videos[course_id] = df.nvideos_watched.quantile(0.975)\n", " df['pct_video_watched'] = df.nvideos_watched / total_videos[course_id]\n", " total_problems[course_id] = df.nproblems_attempted.quantile(0.975)\n", " df['pct_problem_attempted'] = df.nproblems_attempted / total_problems[course_id]\n", " dfs.append(df)\n", " pcs_learned = pd.concat(dfs)\n", " return pcs_learned" ] }, { "cell_type": "code", "execution_count": 109, "metadata": { "collapsed": false, "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "cols = ['ndays_act', 'sum_dt', 'pct_video_watched', 'pct_problem_attempted', 'nforum_posts']\n", "# for the involved\n", "pcs_learned = query_pcs_learned()\n", "# aggregate nforum_posts => % students posted, others by median\n", "pcs_learned_agg = pcs_learned.groupby('course_id').agg({'ndays_act': np.median, 'sum_dt': np.median, \n", " 'pct_video_watched': np.median, 'pct_problem_attempted': 
np.median, \n", " 'nforum_posts': lambda s: (s > 0).sum() / len(s)})\n", "# order the courses as needed\n", "pcs_learned_agg = pcs_learned_agg[cols].reindex(indices)\n", "# for the passed\n", "pcs_passed = pcs_learned[pcs_learned.grade >= 0.5]\n", "pcs_passed_agg = pcs_passed.groupby('course_id').agg({'ndays_act': np.median, 'sum_dt': np.median, \n", " 'pct_video_watched': np.median, 'pct_problem_attempted': np.median, \n", " 'nforum_posts': lambda s: (s > 0).sum() / len(s)})\n", "pcs_passed_agg = pcs_passed_agg[cols].reindex(indices)\n", "\n", "# normalize along the columns for plotting heatmap\n", "pcs_passed_norm = pcs_passed_agg.divide(pcs_passed_agg.max(axis=0), axis=1)\n", "pcs_learned_norm = pcs_learned_agg.divide(pcs_learned_agg.max(axis=0), axis=1)\n", "# formatting\n", "pcs_passed_agg.sum_dt = pcs_passed_agg.sum_dt.round(2)\n", "pcs_passed_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']] = \\\n", "pcs_passed_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']].applymap(lambda x: \"{0:.2f}\".format(x * 100))\n", "pcs_learned_agg.sum_dt = pcs_learned_agg.sum_dt.round(2)\n", "pcs_learned_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']] = \\\n", "pcs_learned_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']].applymap(lambda x: \"{0:.2f}\".format(x * 100))" ] }, { "cell_type": "code", "execution_count": 115, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# axis titles: defined here (not only in the later heatmap cell) so this cell\n", "# runs under Restart Kernel -> Run All without a NameError on `names`\n", "names = ['median days active', 'median sum_dt (H)', \n", " 'median % videos', 'median % problems', '% students posted']\n", "color_passed = ['rgb(166,206,227)', 'rgb(253,191,111)', 'rgb(178,223,138)', 'rgb(251,154,153)', 'rgb(202,178,214)']\n", "color_learned = ['rgb(31,120,180)', 'rgb(255,127,0)', 'rgb(51,160,44)', 'rgb(227,26,28)', 'rgb(106,61,154)']\n", "fig = tls.make_subplots(rows=1, cols=pcs_passed_agg.shape[1], horizontal_spacing=0.02, print_grid=False)\n", "for i in range(0, pcs_passed_agg.shape[1]):\n", " fig.append_trace(go.Bar(x=pcs_passed_agg.ix[:, i], y=pcs_passed_agg.index, marker=dict(color=color_passed[i]),\n", " name='passed', orientation='h', showlegend=False), 1, i+1)\n", " fig.append_trace(go.Bar(x=pcs_learned_agg.ix[:, i], y=pcs_learned_agg.index, marker=dict(color=color_learned[i]),\n", " name='learned', orientation='h', showlegend=False), 1, i+1)\n", "\n", "showticklabels = True\n", "# fig['layout']['yaxis1'].update(autorange='reversed')\n", "for i in range(1, pcs_passed_agg.shape[1]+1):\n", " if i >= 2:\n", " showticklabels = False\n", " fig['layout']['xaxis%s' % i].update(title=names[i-1], titlefont=dict(size=10), \n", " tickfont=dict(size=8), showgrid=False)\n", " fig['layout']['yaxis%s' % i].update(showticklabels=showticklabels, showgrid=False, autorange='reversed')\n", "\n", "fig['layout'].update(barmode='overlay', height=80+25*len(pcs_learned_agg),\n", " width=800, margin=go.Margin(t=40, b=40, l=180, r=0), title=\"Engagement for the learned and the passed\")\n", "py.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Heatmap" ] }, { "cell_type": "code", "execution_count": 111, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# normalized value for plotting heatmap, unnormalized for hoverinfo\n", "names = ['median days active', 'median sum_dt (H)', \n", " 'median % videos', 'median % problems', '% students posted']\n", "trace1 = go.Heatmap(\n", " z=pcs_learned_norm.values,\n", " x=names,\n", " y=pcs_learned_norm.index,\n", " text = pcs_learned_agg.values,\n", " hoverinfo='x+text',\n", " showscale=False,\n", " colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(51,160,44)']])\n", "\n", "\n", "trace2 = go.Heatmap(\n", " z=pcs_passed_norm.values,\n", " x=names,\n", " y=pcs_passed_norm.index,\n", " text = pcs_passed_agg.values,\n", " hoverinfo='x+text',\n", " showscale=False,\n", " colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(43,130,189)']])\n", "\n", "fig = tls.make_subplots(rows=1, cols=2, print_grid=False,\n", " subplot_titles=('Engagement for the involved', \n", " 'Engagement for the passed'))\n", "fig.append_trace(trace1, 1, 1)\n", "fig.append_trace(trace2, 1, 2)\n", "fig['layout']['yaxis1'].update(autorange='reversed')\n", "fig['layout']['yaxis2'].update(showticklabels=False, autorange='reversed')\n", "\n", "fig['layout'].update(\n", " width=750, height=140+30*len(pcs_learned_agg),\n", " margin=go.Margin(l=180, b=120, t=20)\n", ")\n", "\n", "py.iplot(fig)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "hide_input": true, "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" }, "toc": { "toc_cell": true, "toc_number_sections": true, "toc_section_display": "none", "toc_threshold": 6, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 
0 }