{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import math\n",
      "from math import log\n",
      "from sklearn import metrics,preprocessing,cross_validation\n",
      "from sklearn.feature_extraction.text import TfidfVectorizer\n",
      "import sklearn.linear_model as lm\n",
      "import pandas as p\n",
      "from time import gmtime, strftime\n",
      "import scipy\n",
      "import sys\n",
      "import sklearn.decomposition\n",
      "from sklearn.metrics import mean_squared_error\n",
      "from string import punctuation\n",
      "from sklearn.neighbors import RadiusNeighborsRegressor, KNeighborsRegressor\n",
      "import time\n",
      "from scipy import sparse\n",
      "from matplotlib import *\n",
      "from itertools import combinations\n",
      "from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier\n",
      "import operator\n",
      "from sklearn import svm"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def tied_rank(x):\n",
      "    \"\"\"\n",
      "    This function is by Ben Hamner and taken from https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py\n",
      "\n",
      "    Computes the tied rank of elements in x.\n",
      "\n",
      "    This function computes the tied rank of elements in x.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    x : list of numbers, numpy array\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    score : list of numbers\n",
      "            The tied rank f each element in x\n",
      "\n",
      "    \"\"\"\n",
      "    sorted_x = sorted(zip(x,range(len(x))))\n",
      "    r = [0 for k in x]\n",
      "    cur_val = sorted_x[0][0]\n",
      "    last_rank = 0\n",
      "    for i in range(len(sorted_x)):\n",
      "        if cur_val != sorted_x[i][0]:\n",
      "            cur_val = sorted_x[i][0]\n",
      "            for j in range(last_rank, i): \n",
      "                r[sorted_x[j][1]] = float(last_rank+1+i)/2.0\n",
      "            last_rank = i\n",
      "        if i==len(sorted_x)-1:\n",
      "            for j in range(last_rank, i+1): \n",
      "                r[sorted_x[j][1]] = float(last_rank+i+2)/2.0\n",
      "    return r\n",
      "\n",
      "def auc(actual, posterior):\n",
      "    \"\"\"\n",
      "    This function is by Ben Hamner and taken from https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py\n",
      "    \n",
      "    Computes the area under the receiver-operater characteristic (AUC)\n",
      "\n",
      "    This function computes the AUC error metric for binary classification.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    actual : list of binary numbers, numpy array\n",
      "             The ground truth value\n",
      "    posterior : same type as actual\n",
      "                Defines a ranking on the binary numbers, from most likely to\n",
      "                be positive to least likely to be positive.\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    score : double\n",
      "            The mean squared error between actual and posterior\n",
      "\n",
      "    \"\"\"\n",
      "    r = tied_rank(posterior)\n",
      "    num_positive = len([0 for x in actual if x==1])\n",
      "    num_negative = len(actual)-num_positive\n",
      "    sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1])\n",
      "    auc = ((sum_positive - num_positive*(num_positive+1)/2.0) /\n",
      "           (num_negative*num_positive))\n",
      "    sys.stdout.write('.')\n",
      "    return auc\n",
      "\n",
      "def auc_scorer(estimator, X, y):\n",
      "    predicted = estimator.predict_proba(X)[:,1]\n",
      "    return auc(y, predicted)\n",
      "       \n",
      "def normalize10day(stocks):\n",
      "    def process_column(i):\n",
      "        if operator.mod(i, 5) == 1:\n",
      "            return stocks[:,i] * 0\n",
      "        if operator.mod(i, 5) == 2:\n",
      "            return stocks[:,i] * 0\n",
      "        if operator.mod(i, 5) == 4:\n",
      "            return stocks[:,i] * 0\n",
      "            #return np.log(stocks[:,i] + 1)\n",
      "        else:\n",
      "            return stocks[:,i] / stocks[:,0]\n",
      "    n = stocks.shape[0]\n",
      "    stocks_dat =  np.array([ process_column(i) for i in range(46)]).transpose()\n",
      "    #stocks_movingavgO9O10 = np.array([int(i > j) for i,j in zip(stocks_dat[:,45], stocks_dat[:,40])]).reshape((n, 1))\n",
      "    #stocks_movingavgC9O10 = np.array([int(i > j) for i,j in zip(stocks_dat[:,45], stocks_dat[:,43])]).reshape((n, 1))\n",
      "    #return np.hstack((stocks_dat, stocks_movingavgO9O10, stocks_movingavgC9O10))\n",
      "    return stocks_dat\n",
      "    "
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print \"loading data..\"\n",
      "train = np.array(p.read_table('./training.csv', sep = \",\"))\n",
      "test = np.array(p.read_table('./test.csv', sep = \",\"))\n",
      "\n",
      "################################################################################\n",
      "# READ IN THE TEST DATA\n",
      "################################################################################\n",
      "# all data from opening 1 to straight to opening 10\n",
      "X_test_stockdata = normalize10day(test[:,range(2, 48)]) # load in test data\n",
      "X_test_stockindicators = np.vstack((np.identity(94)[:,range(93)] for i in range(25)))\n",
      "\n",
      "#X_test = np.hstack((X_test_stockindicators, X_test_stockdata))\n",
      "X_test = X_test_stockdata\n",
      "\n",
      "#np.identity(94)[:,range(93)]\n",
      "\n",
      "################################################################################\n",
      "# READ IN THE TRAIN DATA\n",
      "################################################################################\n",
      "n_windows = 490\n",
      "windows = range(n_windows)\n",
      "\n",
      "X_windows = [train[:,range(1 + 5*w, 47 + 5*w)] for w in windows]\n",
      "X_windows_normalized = [normalize10day(w) for w in X_windows]\n",
      "X_stockdata = np.vstack(X_windows_normalized)\n",
      "X_stockindicators = np.vstack((np.identity(94)[:,range(93)] for i in range(n_windows)))\n",
      "\n",
      "#X = np.hstack((X_stockindicators, X_stockdata))\n",
      "X = X_stockdata\n",
      "\n",
      "# read in the response variable\n",
      "y_stockdata = np.vstack([train[:, [46 + 5*w, 49 + 5*w]] for w in windows])\n",
      "y = (y_stockdata[:,1] - y_stockdata[:,0] > 0) + 0\n",
      "\n",
      "print \"this step done\""
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "loading data..\n",
        "this step done"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print \"preparing models\"\n",
      "\n",
      "modelname = \"lasso\"\n",
      "\n",
      "if modelname == \"ridge\": \n",
      "    C = np.linspace(300, 5000, num = 10)[::-1]\n",
      "    models = [lm.LogisticRegression(penalty = \"l2\", C = c) for c in C]\n",
      "\n",
      "if modelname == \"lasso\": \n",
      "    C = np.linspace(300, 5000, num = 10)[::-1]\n",
      "    models = [lm.LogisticRegression(penalty = \"l1\", C = c) for c in C]\n",
      "\n",
      "if modelname == \"sgd\": \n",
      "    C = np.linspace(0.00005, .01, num = 5)\n",
      "    models = [lm.SGDClassifier(loss = \"log\", penalty = \"l2\", alpha = c, warm_start = False) for c in C]\n",
      "    \n",
      "if modelname == \"randomforest\":\n",
      "    C = np.linspace(50, 300, num = 10)\n",
      "    models = [RandomForestClassifier(n_estimators = int(c)) for c in C]\n",
      "\n",
      "print \"calculating cv scores\"\n",
      "cv_scores = [0] * len(models)\n",
      "for i, model in enumerate(models):\n",
      "    # for all of the models, save the cross-validation scores into the array cv_scores\n",
      "    cv_scores[i] = np.mean(cross_validation.cross_val_score(model, X, y, cv=5, scoring = auc_scorer))\n",
      "    #cv_scores[i] = np.mean(cross_validation.cross_val_score(model, X, y, cv=5, score_func = auc))\n",
      "    print \" (%d/%d) C = %f: CV = %f\" % (i + 1, len(C), C[i], cv_scores[i])\n",
      "\n",
      "# find which model and C is the best\n",
      "best = cv_scores.index(max(cv_scores))\n",
      "best_model = models[best]\n",
      "best_cv = cv_scores[best]\n",
      "best_C = C[best]\n",
      "print \"BEST %f: %f\" % (best_C, best_cv)\n",
      "\n",
      "print \"training on full data\"\n",
      "# fit the best model on the full data\n",
      "best_model.fit(X, y)\n",
      "\n",
      "print \"prediction\"\n",
      "# do a prediction and save it\n",
      "pred = best_model.predict_proba(X_test)[:,1]\n",
      "testfile = p.read_csv('./test.csv', sep=\",\", na_values=['?'], index_col=[0,1])\n",
      "\n",
      "# submit as D multiplied by 100 + stock id\n",
      "testindices = [100 * D + StId for (D, StId) in testfile.index]\n",
      "\n",
      "pred_df = p.DataFrame(np.vstack((testindices, pred)).transpose(), columns=[\"Id\", \"Prediction\"])\n",
      "pred_df.to_csv('./predictions/' + modelname + '/' + modelname + ' ' + strftime(\"%m-%d %X\") + \" C-\" + str(round(best_C,4)) + \" CV-\" + str(round(best_cv, 4)) + \".csv\", index = False)\n",
      "\n",
      "print \"submission file created\"\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "preparing models\n",
        "training on full data\n",
        "prediction"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "submission file created\n"
       ]
      }
     ],
     "prompt_number": 6
    }
   ],
   "metadata": {}
  }
 ]
}