{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Stock direction classifier: model selection and submission file\n",
      "\n",
      "Builds sliding 46-column windows from `training.csv`, labels each row of a window by whether a later column exceeds the window's last column, selects the best model setting by 5-fold cross-validated AUC, and writes predictions for `test.csv` to `./predictions/<modelname>/`.\n",
      "\n",
      "**Note:** the code targets the old scikit-learn API it was written against (`sklearn.cross_validation`). `print` is always called with a single parenthesised argument, so those lines behave identically on Python 2 and Python 3."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import math\n",
      "from math import log\n",
      "from sklearn import metrics,preprocessing,cross_validation\n",
      "from sklearn.feature_extraction.text import TfidfVectorizer\n",
      "import sklearn.linear_model as lm\n",
      "import pandas as p\n",
      "from time import gmtime, strftime\n",
      "import scipy\n",
      "import sys\n",
      "import sklearn.decomposition\n",
      "from sklearn.metrics import mean_squared_error\n",
      "from string import punctuation\n",
      "from sklearn.neighbors import RadiusNeighborsRegressor, KNeighborsRegressor\n",
      "import time\n",
      "from scipy import sparse\n",
      "from matplotlib import *\n",
      "from itertools import combinations\n",
      "from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier\n",
      "import operator\n",
      "# imported after `from matplotlib import *` so the star import cannot shadow it\n",
      "import os\n",
      "from sklearn import svm"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Helper functions\n",
      "\n",
      "AUC metric (tied-rank implementation by Ben Hamner), a scorer wrapper for `cross_val_score`, and the per-window feature normalisation."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def tied_rank(x):\n",
      "    \"\"\"\n",
      "    This function is by Ben Hamner and taken from https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py\n",
      "\n",
      "    Computes the tied rank of elements in x.\n",
      "\n",
      "    Ranks run from 1 (smallest element) to len(x) (largest element);\n",
      "    elements that compare equal all receive the mean of the ranks they span.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    x : list of numbers, numpy array\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    score : list of numbers\n",
      "            The tied rank of each element in x (an empty list if x is empty)\n",
      "\n",
      "    \"\"\"\n",
      "    if len(x) == 0:\n",
      "        # nothing to rank; indexing sorted_x[0] below would raise IndexError\n",
      "        return []\n",
      "    sorted_x = sorted(zip(x,range(len(x))))\n",
      "    r = [0 for k in x]\n",
      "    cur_val = sorted_x[0][0]\n",
      "    last_rank = 0\n",
      "    for i in range(len(sorted_x)):\n",
      "        if cur_val != sorted_x[i][0]:\n",
      "            cur_val = sorted_x[i][0]\n",
      "            # positions last_rank..i-1 hold equal values: give each the mean 1-based rank\n",
      "            for j in range(last_rank, i):\n",
      "                r[sorted_x[j][1]] = float(last_rank+1+i)/2.0\n",
      "            last_rank = i\n",
      "        if i==len(sorted_x)-1:\n",
      "            # close the final run, which includes position i itself\n",
      "            for j in range(last_rank, i+1):\n",
      "                r[sorted_x[j][1]] = float(last_rank+i+2)/2.0\n",
      "    return r\n",
      "\n",
      "def auc(actual, posterior):\n",
      "    \"\"\"\n",
      "    This function is by Ben Hamner and taken from https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py\n",
      "\n",
      "    Computes the area under the receiver-operator characteristic (AUC)\n",
      "\n",
      "    This function computes the AUC error metric for binary classification.\n",
      "    As a side effect it writes one '.' to stdout per call, which shows up as\n",
      "    a progress marker when it is used as a cross-validation scorer.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    actual : list of binary numbers, numpy array\n",
      "             The ground truth value\n",
      "    posterior : same type as actual\n",
      "                Defines a ranking on the binary numbers, from most likely to\n",
      "                be positive to least likely to be positive.\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    score : double\n",
      "            The AUC of the ranking defined by posterior with respect to actual\n",
      "\n",
      "    Raises\n",
      "    ------\n",
      "    ZeroDivisionError\n",
      "            If actual contains no positive (== 1) or no negative entries\n",
      "\n",
      "    \"\"\"\n",
      "    r = tied_rank(posterior)\n",
      "    num_positive = len([0 for x in actual if x==1])\n",
      "    num_negative = len(actual)-num_positive\n",
      "    sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1])\n",
      "    auc = ((sum_positive - num_positive*(num_positive+1)/2.0) /\n",
      "           (num_negative*num_positive))\n",
      "    sys.stdout.write('.')\n",
      "    return auc\n",
      "\n",
      "def auc_scorer(estimator, X, y):\n",
      "    \"\"\"\n",
      "    Scorer with the (estimator, X, y) signature that this notebook passes\n",
      "    as the `scoring` argument of cross_val_score.\n",
      "\n",
      "    Scores the fitted estimator by the AUC of its predicted probability for\n",
      "    the second class (column 1 of predict_proba) against the labels y.\n",
      "    \"\"\"\n",
      "    predicted = estimator.predict_proba(X)[:,1]\n",
      "    return auc(y, predicted)\n",
      "\n",
      "def normalize10day(stocks):\n",
      "    \"\"\"\n",
      "    Normalise one window of columns relative to the window's first column.\n",
      "\n",
      "    Columns are handled by position i: those with i % 5 in (1, 2, 4) are\n",
      "    replaced by zeros, the remaining ones (i % 5 in (0, 3)) are divided\n",
      "    element-wise by column 0.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    stocks : 2-d numpy array\n",
      "             One window of raw columns; every call in this notebook passes\n",
      "             46 columns, but any positive column count is accepted.\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    stocks_dat : numpy array\n",
      "                 The transformed columns, same shape as stocks\n",
      "\n",
      "    \"\"\"\n",
      "    def process_column(i):\n",
      "        # (a log(x + 1) transform was tried for i % 5 == 4 and is disabled)\n",
      "        if i % 5 in (1, 2, 4):\n",
      "            return stocks[:,i] * 0\n",
      "        return stocks[:,i] / stocks[:,0]\n",
      "    # use the actual width of the window instead of a hard-coded 46\n",
      "    stocks_dat = np.array([process_column(i) for i in range(stocks.shape[1])]).transpose()\n",
      "    # two extra 0/1 columns comparing column 45 with columns 40 and 43 were\n",
      "    # tried as additional features and are currently disabled\n",
      "    return stocks_dat\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Load data and build features\n",
      "\n",
      "Reads `./training.csv` and `./test.csv`, normalises each 46-column window with `normalize10day`, and derives the binary response `y`."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(\"loading data..\")\n",
      "train = np.array(p.read_csv('./training.csv', sep = \",\"))\n",
      "test = np.array(p.read_csv('./test.csv', sep = \",\"))\n",
      "\n",
      "# NOTE(review): 94 and 25 are the original literals; they look like the number of\n",
      "# stocks (rows of training.csv) and the number of days in test.csv -- confirm.\n",
      "n_stocks = 94\n",
      "n_test_days = 25\n",
      "# identity matrix with its last column dropped\n",
      "stock_indicators = np.identity(n_stocks)[:,range(n_stocks - 1)]\n",
      "\n",
      "################################################################################\n",
      "# READ IN THE TEST DATA\n",
      "################################################################################\n",
      "# all data from opening 1 to straight to opening 10\n",
      "X_test_stockdata = normalize10day(test[:,range(2, 48)]) # load in test data\n",
      "# np.vstack needs a real sequence: passing a bare generator is deprecated since\n",
      "# NumPy 1.16 and rejected by later releases\n",
      "X_test_stockindicators = np.vstack([stock_indicators for i in range(n_test_days)])\n",
      "\n",
      "# the indicator columns are currently not used as features\n",
      "#X_test = np.hstack((X_test_stockindicators, X_test_stockdata))\n",
      "X_test = X_test_stockdata\n",
      "\n",
      "################################################################################\n",
      "# READ IN THE TRAIN DATA\n",
      "################################################################################\n",
      "n_windows = 490\n",
      "windows = range(n_windows)\n",
      "\n",
      "# window w covers 46 consecutive columns, shifted by 5 columns per window\n",
      "X_windows = [train[:,range(1 + 5*w, 47 + 5*w)] for w in windows]\n",
      "X_windows_normalized = [normalize10day(w) for w in X_windows]\n",
      "X_stockdata = np.vstack(X_windows_normalized)\n",
      "X_stockindicators = np.vstack([stock_indicators for i in range(n_windows)])\n",
      "\n",
      "#X = np.hstack((X_stockindicators, X_stockdata))\n",
      "X = X_stockdata\n",
      "\n",
      "# read in the response variable\n",
      "# label is 1 when column 49 + 5*w is larger than column 46 + 5*w (the window's last column)\n",
      "y_stockdata = np.vstack([train[:, [46 + 5*w, 49 + 5*w]] for w in windows])\n",
      "y = (y_stockdata[:,1] - y_stockdata[:,0] > 0) + 0\n",
      "\n",
      "# every feature row needs exactly one label\n",
      "assert X.shape[0] == y.shape[0]\n",
      "\n",
      "print(\"this step done\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "loading data..\n",
        "this step done"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Model selection, final fit and submission file\n",
      "\n",
      "Set `modelname` to one of `ridge`, `lasso`, `sgd` or `randomforest`. Each candidate setting is scored by 5-fold cross-validated AUC; the best one is refit on all rows and used to predict `test.csv`."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(\"preparing models\")\n",
      "\n",
      "modelname = \"lasso\"\n",
      "SEED = 0  # seeds the randomised models (sgd, randomforest)\n",
      "\n",
      "# each branch defines the grid C and one candidate model per grid value\n",
      "if modelname == \"ridge\":\n",
      "    C = np.linspace(300, 5000, num = 10)[::-1]\n",
      "    models = [lm.LogisticRegression(penalty = \"l2\", C = c) for c in C]\n",
      "elif modelname == \"lasso\":\n",
      "    C = np.linspace(300, 5000, num = 10)[::-1]\n",
      "    models = [lm.LogisticRegression(penalty = \"l1\", C = c) for c in C]\n",
      "elif modelname == \"sgd\":\n",
      "    C = np.linspace(0.00005, .01, num = 5)\n",
      "    models = [lm.SGDClassifier(loss = \"log\", penalty = \"l2\", alpha = c, warm_start = False, random_state = SEED) for c in C]\n",
      "elif modelname == \"randomforest\":\n",
      "    C = np.linspace(50, 300, num = 10)\n",
      "    models = [RandomForestClassifier(n_estimators = int(c), random_state = SEED) for c in C]\n",
      "else:\n",
      "    # an unknown name used to fail only later, with a NameError on `models`\n",
      "    raise ValueError(\"unknown modelname: %r\" % (modelname,))\n",
      "\n",
      "print(\"calculating cv scores\")\n",
      "cv_scores = [0] * len(models)\n",
      "for i, model in enumerate(models):\n",
      "    # for all of the models, save the cross-validation scores into the array cv_scores\n",
      "    cv_scores[i] = np.mean(cross_validation.cross_val_score(model, X, y, cv=5, scoring = auc_scorer))\n",
      "    print(\" (%d/%d) C = %f: CV = %f\" % (i + 1, len(C), C[i], cv_scores[i]))\n",
      "\n",
      "# find which model and C is the best\n",
      "best = cv_scores.index(max(cv_scores))\n",
      "best_model = models[best]\n",
      "best_cv = cv_scores[best]\n",
      "best_C = C[best]\n",
      "print(\"BEST %f: %f\" % (best_C, best_cv))\n",
      "\n",
      "print(\"training on full data\")\n",
      "# fit the best model on the full data\n",
      "best_model.fit(X, y)\n",
      "\n",
      "print(\"prediction\")\n",
      "# do a prediction and save it\n",
      "pred = best_model.predict_proba(X_test)[:,1]\n",
      "testfile = p.read_csv('./test.csv', sep=\",\", na_values=['?'], index_col=[0,1])\n",
      "\n",
      "# submit as D multiplied by 100 + stock id\n",
      "testindices = [100 * D + StId for (D, StId) in testfile.index]\n",
      "\n",
      "pred_df = p.DataFrame(np.vstack((testindices, pred)).transpose(), columns=[\"Id\", \"Prediction\"])\n",
      "\n",
      "# to_csv does not create missing directories, so make sure the target exists\n",
      "out_dir = './predictions/' + modelname\n",
      "if not os.path.isdir(out_dir):\n",
      "    os.makedirs(out_dir)\n",
      "pred_df.to_csv(out_dir + '/' + modelname + ' ' + strftime(\"%m-%d %X\") + \" C-\" + str(round(best_C,4)) + \" CV-\" + str(round(best_cv, 4)) + \".csv\", index = False)\n",
      "\n",
      "print(\"submission file created\")\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "preparing models\n",
        "training on full data\n",
        "prediction"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "submission file created\n"
       ]
      }
     ],
     "prompt_number": 6
    }
   ],
   "metadata": {}
  }
 ]
}