{
 "metadata": {
  "name": "",
  "signature": "sha256:7bdf7d9dc9c4a3d0c208d248b308c7add37baaf1d6d89525f41717d2abcd0009"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Education Data Mining Testing System\n",
      "\n",
      "We need an in-house testing system to validate our machine learning algorithm. We need this in order to iterate towards better solutions. I am basing this in-house testing system on the Yu et al. JMLR Workshop and Conference Proceedings paper that the winning team submitted. The [leaderboard](https://pslcdatashop.web.cmu.edu/KDDCup/results_full.jsp) contains the full list of submissions and links to papers.\n",
      "\n",
      "In the Yu et al. paper, the main reason why they built their own testing system instead of just submitting to their answers and having the KDD Cup server score it was to avoid overfitting the solution."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%matplotlib inline\n",
      "import pandas as pd\n",
      "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
      "import sklearn"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Feature engineering"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Get the data: Algebra 2005-2006 (A56) and/or Algebra 2008-2009 (A89)\n",
      "a56_train_filepath = 'data/algebra0506/algebra_2005_2006_train.txt'\n",
      "#a89_train_filepath = 'data/algebra0809/algebra_2008_2009_train.txt'\n",
      "\n",
      "a56data = pd.read_table(a56_train_filepath)\n",
      "#a89data = pd.read_table(a89_train_filepath)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 73
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "hierarchy = a56data['Problem Hierarchy']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 74
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Split the problem hierarchy into 'Units' and 'Sections'"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "units, sections = [], []\n",
      "for i in range(len(hierarchy)):\n",
      "    units.append(hierarchy[i].split(',')[0].strip())\n",
      "    sections.append(hierarchy[i].split(',')[1].strip())"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 75
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Now add 'Units' and 'Sections' as columns within the dataframe\n",
      "a56data['Problem Unit'] = pd.Series(units, index=a56data.index)\n",
      "a56data['Problem Section'] = pd.Series(sections, index=a56data.index)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 76
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Rearrange order of columns\n",
      "cols = a56data.columns.tolist()\n",
      "cols = cols[0:3]+cols[-2::]+cols[3:-2]\n",
      "a56data = a56data[cols]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 77
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Create a temporary dataframe for the addition of new binary features"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df = a56data"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 79
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Split students into array of binary features"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "students = set(a56data['Anon Student Id'])\n",
      "print 'There are {0} students, so we will be adding as many columns to this dataframe.'.format(len(students))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "There are 574 students, so we will be adding as many columns to this dataframe.\n"
       ]
      }
     ],
     "prompt_number": 80
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "numrows = len(df)\n",
      "\n",
      "# Add a column for every student and fill them with zeros\n",
      "for stud in students:\n",
      "    df[stud] = pd.Series(np.zeros(numrows), index=df.index)\n",
      "    \n",
      "# For each student's problem entries in the dataframe, mark their column as 1\n",
      "for stud in students:\n",
      "    df.loc[df['Anon Student Id'] == stud,stud] = 1"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 81
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "np.shape(df)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 82,
       "text": [
        "(809694, 595)"
       ]
      }
     ],
     "prompt_number": 82
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Split problem units into array of binary features"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "units = set(a56data['Problem Unit'])\n",
      "print 'There are {0} unique problem units, so we will be adding as many columns to this dataframe.'.format(len(units))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "There are 32 unique problem units, so we will be adding as many columns to this dataframe.\n"
       ]
      }
     ],
     "prompt_number": 83
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "numrows = len(df)\n",
      "\n",
      "# Add a column for every problem unit and fill them with zeros\n",
      "for u in units:\n",
      "    df[u] = pd.Series(np.zeros(numrows), index=df.index)\n",
      "    \n",
      "# For each student's attempt at a problem, mark the appropriate problem unit as 1\n",
      "for u in units:\n",
      "    df.loc[df['Problem Unit'] == u,u] = 1"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 84
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "np.shape(df)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 85,
       "text": [
        "(809694, 627)"
       ]
      }
     ],
     "prompt_number": 85
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Split problem section into array of binary features"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sections = set(a56data['Problem Section'])\n",
      "print 'There are {0} unique problem sections, so we will be adding as many columns to this dataframe.'.format(len(sections))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "There are 138 unique problem sections, so we will be adding as many columns to this dataframe.\n"
       ]
      }
     ],
     "prompt_number": 86
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "numrows = len(df)\n",
      "\n",
      "# Add a column for every problem unit and fill them with zeros\n",
      "for s in sections:\n",
      "    df[s] = pd.Series(np.zeros(numrows), index=df.index)\n",
      "    \n",
      "# For each student's attempt at a problem, mark the appropriate problem unit as 1\n",
      "for s in sections:\n",
      "    df.loc[df['Problem Section'] == s,s] = 1"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 87
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "np.shape(df)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 88,
       "text": [
        "(809694, 765)"
       ]
      }
     ],
     "prompt_number": 88
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Split problem name into array of binary features"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "pnames = set(a56data['Problem Name'])\n",
      "print 'There are {0} unique problem names, so we will be adding as many columns to this dataframe.'.format(len(pnames))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "There are 1084 unique problem names, so we will be adding as many columns to this dataframe.\n"
       ]
      }
     ],
     "prompt_number": 105
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "numrows = len(df)\n",
      "\n",
      "# Add a column for every problem unit and fill them with zeros\n",
      "for n in pnames:\n",
      "    df[n] = pd.Series(np.zeros(numrows), index=df.index)\n",
      "    \n",
      "# For each student's attempt at a problem, mark the appropriate problem unit as 1\n",
      "for n in pnames:\n",
      "    df.loc[df['Problem Name'] == n,n] = 1"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 90
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "np.shape(df)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 91,
       "text": [
        "(809694, 1849)"
       ]
      }
     ],
     "prompt_number": 91
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Split step name into array of binary features"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "snames = set(a56data['Step Name'])\n",
      "print 'There are {0} unique step names, so we will be adding as many columns to this dataframe.'.format(len(snames))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "There are 187539 unique step names, so we will be adding as many columns to this dataframe.\n"
       ]
      }
     ],
     "prompt_number": 106
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "numrows = len(df)\n",
      "\n",
      "# Add a column for every problem unit and fill them with zeros\n",
      "for n in snames:\n",
      "    df[n] = pd.Series(np.zeros(numrows), index=df.index)\n",
      "    \n",
      "# For each student's attempt at a problem, mark the appropriate problem unit as 1\n",
      "for n in snames:\n",
      "    df.loc[df['Step Name'] == n,n] = 1"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 54
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Create the testing dataframe"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create an empty testing dataframe\n",
      "testdf = pd.DataFrame(columns=df.columns)\n",
      "\n",
      "# Create the testing set\n",
      "for i in range(len(unique_units)):\n",
      "    # Get the last problem of the current problem unit\n",
      "    lastProb = list(df[df['Problem Unit'] == unique_units[i]]['Problem Name'])[-1]\n",
      "    \n",
      "    # Get all the rows corresponding to the last problem for the given problem unit\n",
      "    lastProbRows = a56data[(df['Problem Unit'] == unique_units[i]) & (df['Problem Name']==lastProb)]\n",
      "    \n",
      "    # Concatenate test dataframe with the rows just found\n",
      "    testdf = pd.concat([testdf,lastProbRows])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 92
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create a training dataframe that is equal to original dataframe with all the test cases removed\n",
      "trainIndex = df.index - testdf.index\n",
      "traindf = df.loc[trainIndex]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 93
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Get the target feature within the test set: the Correct First Attmpt\n",
      "CFAs = np.array(testdf['Correct First Attempt'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 94
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Test functions"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Define a helper function for calculating the root-mean-square error\n",
      "def RMSE(p,y):\n",
      "    ''' The Root-Mean-Square Error takes the predicted values p for the target\n",
      "    variable y and takes the square root of the mean of the square of their\n",
      "    differences. '''\n",
      "    return np.sqrt(np.sum(np.square(p-y))/len(y))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 95
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Test the RMSE for an array of all zeros\n",
      "p = np.zeros(len(CFAs))\n",
      "print 'An array of all zeros gives an RMSE of:',RMSE(p,CFAs)\n",
      "\n",
      "# Test the RMSE for an array of all ones\n",
      "p = np.ones(len(CFAs))\n",
      "print 'An array of all ones gives an RMSE of:',RMSE(p,CFAs)\n",
      "\n",
      "# Test the RMSE for an array of random 0s and 1s\n",
      "p = np.random.randint(0,2,len(CFAs)).astype(float)\n",
      "print 'An array of random ones and zeros gives an RMSE of:',RMSE(p,CFAs)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "An array of all zeros gives an RMSE of: 0.863841709437\n",
        "An array of all ones gives an RMSE of: 0.503763338322\n",
        "An array of random ones and zeros gives an RMSE of: 0.710079831105\n"
       ]
      }
     ],
     "prompt_number": 96
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Define the logistic function\n",
      "def logisfunc(x):\n",
      "    return 1.0 / (1.0 + np.exp(-x))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 97
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def logit_plots(X,y,model,feat_names):\n",
      "    num_samples  = np.shape(X)[0]\n",
      "    num_features = np.shape(X)[1]\n",
      "    print num_samples, 'number of samples'\n",
      "    print num_features, 'number of features'\n",
      "    \n",
      "    # The coefficients and bias of the decision function\n",
      "    coefs = model.coef_.ravel()\n",
      "    bias  = model.intercept_\n",
      "    \n",
      "    # Plot the decision function separately for each feature\n",
      "    x_plt = np.linspace(-1.0,1.0,300)\n",
      "    \n",
      "    fig = plt.figure(figsize=(3*num_features,6))\n",
      "    \n",
      "    for feat in range(num_features):\n",
      "        x     = x_plt * coefs[feat] + bias\n",
      "        decs  = logisfunc(x)\n",
      "        \n",
      "        fig.add_subplot(num_features,1,feat+1)\n",
      "        plt.plot(X[:,feat],y,'x')\n",
      "        plt.plot(x_plt,decs)\n",
      "        plt.axis((0,1,0,1))\n",
      "        plt.xlabel(feat_names[feat])\n",
      "    plt.tight_layout()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 98
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def error_metrics(p,yy):\n",
      "    '''Calculates the error metrics, i.e. the precision and recall.\n",
      "    Precision = True positives / Predicted positives\n",
      "    Recall    = True positives / Actual positives'''\n",
      "    predicted_positives = len(p[p==1])\n",
      "    actual_positives    = len(yy[yy==1])\n",
      "    # The predicted values for when actual values are 1\n",
      "    pp = p[yy==1]\n",
      "    # True positives are when these predicted values are also 1\n",
      "    true_positives      = len(pp[pp==1])\n",
      "    false_positives     = len(yy) - true_positives\n",
      "    \n",
      "    precision = float(true_positives) / float(predicted_positives)\n",
      "    recall    = float(true_positives) / float(actual_positives)\n",
      "    \n",
      "    F_1score  = 2.0 * precision * recall / (precision + recall)\n",
      "    \n",
      "    print 'Root-mean-square error: ', RMSE(p,yy)\n",
      "    \n",
      "    print '\\nPrecision: Of all predicted CFAs, what fraction actually succeeded?'\n",
      "    print precision\n",
      "    \n",
      "    print '\\nRecall: Of all actual CFAs, what fraction did we predict correctly?'\n",
      "    print recall\n",
      "    \n",
      "    print '\\nF_1 Score: ', F_1score"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 99
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Machine learning"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "traindf.columns"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 100,
       "text": [
        "Index([u'Row', u'Anon Student Id', u'Problem Hierarchy', u'Problem Unit', u'Problem Section', u'Problem Name', u'Problem View', u'Step Name', u'Step Start Time', u'First Transaction Time', u'Correct Transaction Time', u'Step End Time', u'Step Duration (sec)', u'Correct Step Duration (sec)', u'Error Step Duration (sec)', u'Correct First Attempt', u'Incorrects', u'Hints', u'Corrects', u'KC(Default)', u'Opportunity(Default)', u'OCmTg4ha6w', u'M7hLaVJGvX', u'6bJ4auIa8L', u'D08sSQFrq3', u'J76B3lyQG9', u'77y7iIIXuv', u'k37lfl6ENf', u'olyVui8lHe', u'verm1Wp12u', u'S7E3jvZbGG', u'cua2tdf6zb', u'3Whv9UbPsR', u'3EWyQLUo83', u'XqBaV46VbC', u'qN6WN7C097', u'm9a501e1MM', u'8FcKH1d6A2', u'8eqtD31y66', u'7y5NJZ7S6u', u'rA9d62o2bb', u'5gtqSDwEt1', u'6d2wZ1x370', u'oeTCjHG37z', u'HvghFVBS5v', u'1to8tgu4IT', u'UO70mRwd30', u'XrH5AAPdtj', u'TAw598sUia', u'pI9Tpj13ty', u'v52MhQAVT9', u'nybp7zY98z', u'yuB6nxh3FX', u'bt0ZuHaCGY', u'dD5k5322J5', u'2AMmebFl86', u'45euP7C062', u'QS4cvQ8w0o', u'Dmq6441349', u'NX8N2fJ630', u'9hs21hUG5O', u'a47O44klh3', u'45TTYcotWk', u'U50h3ZFmGt', u'Vu26QoCms4', u'WmIXxzmmD3', u'6jCygPdswz', u'SH1cSNDIA8', u'eD0u9WOep7', u'X5eS5kvC8h', u'X223hIsDU4', u'Vc0J7iVot4', u'c8Gl35DPn4', u'EfN583275t', u'MPqJdqnPtc', u'hwF4tyWU50', u'0U9x5pNv8t', u'5x5fHvFFLv', u'RwM69ocq8e', u'DZXDgO0B3u', u'5ddRYL0LBA', u'4ajdr1a4Kc', u'st945Ucdi6', u'D431zXkvuC', u'ajPRq0Q08b', u'b2W4ZO3uLV', u'NZJd9Aq3De', u'3hSu07XfGd', u'80nlN05JQ6', u'z2zuhnARi6', u'w0FMzJORlK', u'ey9rvMnU57', u'XNEgfvXx50', u'On0ILT6rt5', u'XGFd357i6u', u'XHm4u4Y8OU', u'jx1ndu85oq', u'dc35t0IQHq', u'8r7mHkuKX9', u'c6LS9kDJe1', ...], dtype='object')"
       ]
      }
     ],
     "prompt_number": 100
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Define a helper function to normalize the feature matrix X\n",
      "import numba\n",
      "def autonorm(X):\n",
      "    ''' Calculates the mean and range of values of each column\n",
      "    in the matrix (features) subtracts the mean from each value\n",
      "    and divides by the range, thereby normalizing all values to\n",
      "    fall between -1 and 1.'''\n",
      "    x_means = np.mean(X,axis=0)\n",
      "    x_means = np.ones(np.shape(X))*x_means\n",
      "    x_maxs  = np.max(X,axis=0)\n",
      "    x_mins  = np.min(X,axis=0)\n",
      "    x_range = x_maxs - x_mins\n",
      "    X_normd = (X - x_means) / x_range\n",
      "    return X_normd\n",
      "\n",
      "autonorm_jit = numba.jit(autonorm)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 101
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "features_to_norm = ['Step Duration (sec)','Hints','Problem View']\n",
      "binary_features = list(students)+list(units)+list(sections)+list(pnames)#+list(snames)\n",
      "target_feature = ['Correct First Attempt']\n",
      "all_features = features_to_norm + binary_features + target_feature"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 113
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X = traindf[all_features].dropna()\n",
      "y = np.array(X[target_feature]).astype(int).ravel()\n",
      "X_to_norm = np.array(X[features_to_norm])\n",
      "X_nonnorm = np.array(X[binary_features])\n",
      "X_to_norm = autonorm(X_to_norm)\n",
      "X = np.concatenate((X_to_norm,X_nonnorm), axis=1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "XX = testdf[all_features].dropna()\n",
      "yy = np.array(XX[target_feature]).astype(int).ravel()\n",
      "XX_to_norm = np.array(XX[features_to_norm])\n",
      "XX_nonnorm = np.array(XX[binary_features])\n",
      "XX_to_norm = autonorm(XX_to_norm)\n",
      "XX = np.concatenate((XX_to_norm,XX_nonnorm), axis=1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 104
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn import linear_model\n",
      "model = linear_model.LogisticRegression()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 25
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "np.shape(X)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 26,
       "text": [
        "(760650, 577)"
       ]
      }
     ],
     "prompt_number": 26
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "model.fit(X,y)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 27,
       "text": [
        "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
        "          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)"
       ]
      }
     ],
     "prompt_number": 27
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "p = model.predict(XX).astype(float)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 28
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "error_metrics(p,yy)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Root-mean-square error:  0.401917482001\n",
        "\n",
        "Precision: Of all predicted CFAs, what fraction actually succeeded?\n",
        "0.831047542873\n",
        "\n",
        "Recall: Of all actual CFAs, what fraction did we predict correctly?\n",
        "0.983518472118\n",
        "\n",
        "F_1 Score:  0.900877237721\n"
       ]
      }
     ],
     "prompt_number": 30
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "np.shape(X)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 32,
       "text": [
        "(760650, 577)"
       ]
      }
     ],
     "prompt_number": 32
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "len(students)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 33,
       "text": [
        "574"
       ]
      }
     ],
     "prompt_number": 33
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "traindf.columns"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 34,
       "text": [
        "Index([u'Row', u'Anon Student Id', u'Problem Hierarchy', u'Problem Unit', u'Problem Section', u'Problem Name', u'Problem View', u'Step Name', u'Step Start Time', u'First Transaction Time', u'Correct Transaction Time', u'Step End Time', u'Step Duration (sec)', u'Correct Step Duration (sec)', u'Error Step Duration (sec)', u'Correct First Attempt', u'Incorrects', u'Hints', u'Corrects', u'KC(Default)', u'Opportunity(Default)', u'OCmTg4ha6w', u'M7hLaVJGvX', u'6bJ4auIa8L', u'D08sSQFrq3', u'J76B3lyQG9', u'77y7iIIXuv', u'k37lfl6ENf', u'olyVui8lHe', u'verm1Wp12u', u'S7E3jvZbGG', u'cua2tdf6zb', u'3Whv9UbPsR', u'3EWyQLUo83', u'XqBaV46VbC', u'qN6WN7C097', u'm9a501e1MM', u'8FcKH1d6A2', u'8eqtD31y66', u'7y5NJZ7S6u', u'rA9d62o2bb', u'5gtqSDwEt1', u'6d2wZ1x370', u'oeTCjHG37z', u'HvghFVBS5v', u'1to8tgu4IT', u'UO70mRwd30', u'XrH5AAPdtj', u'TAw598sUia', u'pI9Tpj13ty', u'v52MhQAVT9', u'nybp7zY98z', u'yuB6nxh3FX', u'bt0ZuHaCGY', u'dD5k5322J5', u'2AMmebFl86', u'45euP7C062', u'QS4cvQ8w0o', u'Dmq6441349', u'NX8N2fJ630', u'9hs21hUG5O', u'a47O44klh3', u'45TTYcotWk', u'U50h3ZFmGt', u'Vu26QoCms4', u'WmIXxzmmD3', u'6jCygPdswz', u'SH1cSNDIA8', u'eD0u9WOep7', u'X5eS5kvC8h', u'X223hIsDU4', u'Vc0J7iVot4', u'c8Gl35DPn4', u'EfN583275t', u'MPqJdqnPtc', u'hwF4tyWU50', u'0U9x5pNv8t', u'5x5fHvFFLv', u'RwM69ocq8e', u'DZXDgO0B3u', u'5ddRYL0LBA', u'4ajdr1a4Kc', u'st945Ucdi6', u'D431zXkvuC', u'ajPRq0Q08b', u'b2W4ZO3uLV', u'NZJd9Aq3De', u'3hSu07XfGd', u'80nlN05JQ6', u'z2zuhnARi6', u'w0FMzJORlK', u'ey9rvMnU57', u'XNEgfvXx50', u'On0ILT6rt5', u'XGFd357i6u', u'XHm4u4Y8OU', u'jx1ndu85oq', u'dc35t0IQHq', u'8r7mHkuKX9', u'c6LS9kDJe1', ...], dtype='object')"
       ]
      }
     ],
     "prompt_number": 34
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "len(set(traindf['Step Name']))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 37,
       "text": [
        "171886"
       ]
      }
     ],
     "prompt_number": 37
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "len(traindf.columns)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 38,
       "text": [
        "595"
       ]
      }
     ],
     "prompt_number": 38
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "params = model.get_params(deep=True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 45
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "params"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 46,
       "text": [
        "{'C': 1.0,\n",
        " 'class_weight': None,\n",
        " 'dual': False,\n",
        " 'fit_intercept': True,\n",
        " 'intercept_scaling': 1,\n",
        " 'penalty': 'l2',\n",
        " 'random_state': None,\n",
        " 'tol': 0.0001}"
       ]
      }
     ],
     "prompt_number": 46
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "T = model.predict_proba(X)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 48
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "T"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 51,
       "text": [
        "array([[ 0.70539268,  0.29460732],\n",
        "       [ 0.18470565,  0.81529435],\n",
        "       [ 0.45739163,  0.54260837],\n",
        "       ..., \n",
        "       [ 0.17350595,  0.82649405],\n",
        "       [ 0.70812947,  0.29187053],\n",
        "       [ 0.14044815,  0.85955185]])"
       ]
      }
     ],
     "prompt_number": 51
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}