{ "metadata": { "name": "TEST_greedy_ensemble_III" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import ensemble\n", "import features\n", "reload(ensemble)\n", "reload(features)\n", "from itertools import cycle\n", "import numpy as np\n", "import random\n", "from IPython.parallel import Client\n", "## seed used for the kernel-side RNGs and for the train/validation split\n", "SEED = 0\n", "## NOTE(review): assumes the random draws made later (e.g. by features.bootstrap_seqs)\n", "## come from random / np.random in this process - confirm; the IPython.parallel\n", "## engines are separate processes and are not seeded here\n", "random.seed(SEED)\n", "np.random.seed(SEED)\n", "client = Client()\n", "print len(client)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "4\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "import cPickle\n", "## open the pickle in a with-block so the file handle is closed after loading\n", "## NOTE(review): unpickling can execute arbitrary code - only load trusted files\n", "with open('data/blackbox.pkl', 'rb') as data_file:\n", "    X, y = cPickle.load(data_file)\n", "print X.shape, y.shape\n", "print np.unique(y)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "(1000, 1875) (1000,)\n", "[1 2 3 4 5 6 7 8 9]\n" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "## clear and create ensemble\n", "!rm -fR tmp/blackbox_ensemble_iii/\n", "ensemble_path = ensemble.new_ensemble('blackbox_ensemble_iii', 'tmp/')\n", "print ensemble_path" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Users/lima/workspace/tutorials/ml-tutorials/tmp/blackbox_ensemble_iii" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 3 }, { "cell_type": "markdown", "metadata": {}, "source": [ "***FEATURE and DATA ENGINEERING***" ] }, { "cell_type": "code", "collapsed": false, "input": [ "## split data into train and validation\n", "## the split is done on row indices; random_state pins it so that every\n", "## dataset and score derived from it can be reproduced\n", "from sklearn.cross_validation import train_test_split\n", "n_samples, n_features = X.shape\n", "train_index, test_index = train_test_split(range(n_samples), test_size = 0.2, random_state = SEED)\n", "## filled by the feature cells below: [name, (X, y), meta] records and (train_name, test_name) pairs\n", "data_records = []\n", "data_names = []" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 295 }, { "cell_type": "raw", "metadata": {}, "source": [ "## strided 
subpatches of original features - NOT REALLY USEFUL\n", "feature_patches = features.strided_seqs(range(n_features), stride = 25, subsize = 25)\n", "data_name_prefix = 'stride_%d_%d'\n", "for feats in feature_patches:\n", " train_X = features.patch(X, train_index, feats)\n", " train_y = features.patch(y, train_index)\n", " test_X = features.patch(X, test_index, feats)\n", " test_y = features.patch(y, test_index)\n", " train_name = 'train_' + data_name_prefix % (feats[0], feats[-1])\n", " test_name = 'test_' + data_name_prefix % (feats[0], feats[-1])\n", " data_records.append([train_name, (train_X, train_y), {}])\n", " data_records.append([test_name, (test_X, test_y), {}])\n", " data_names.append((train_name, test_name))" ] }, { "cell_type": "code", "collapsed": false, "input": [ "## kernel approximation subpatches of original features\n", "feature_patches = features.bootstrap_seqs(range(n_features), n_iter = 100, subsize = 100)\n", "data_name_prefix = 'ka_%d_%d'\n", "Xs = [features.patch(X, None, feats) for feats in feature_patches]\n", "ka_Xs = features.kernel_approximation(Xs, client, kernel = 'polynomial', degree = 2, n_components=100)\n", "for feats, ka_X in zip(feature_patches, ka_Xs):\n", " train_X = features.patch(ka_X, train_index, None)\n", " train_y = features.patch(y, train_index)\n", " test_X = features.patch(ka_X, test_index, None)\n", " test_y = features.patch(y, test_index)\n", " train_name = 'train_' + data_name_prefix % (feats[0], feats[-1])\n", " test_name = 'test_' + data_name_prefix % (feats[0], feats[-1])\n", " data_records.append([train_name, (train_X, train_y), {}])\n", " data_records.append([test_name, (test_X, test_y), {}])\n", " data_names.append((train_name, test_name))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 296 }, { "cell_type": "code", "collapsed": false, "input": [ "## bootstrapped subpatches of original features\n", "feature_patches = features.bootstrap_seqs(range(n_features), n_iter = 500, subsize = 
15)\n", "data_name_prefix = 'bs_%d_%d'\n", "for feats in feature_patches:\n", " train_X = features.patch(X, train_index, feats)\n", " train_y = features.patch(y, train_index)\n", " test_X = features.patch(X, test_index, feats)\n", " test_y = features.patch(y, test_index)\n", " train_name = 'train_' + data_name_prefix % (feats[0], feats[-1])\n", " test_name = 'test_' + data_name_prefix % (feats[0], feats[-1])\n", " data_records.append([train_name, (train_X, train_y), {}])\n", " data_records.append([test_name, (test_X, test_y), {}])\n", " data_names.append((train_name, test_name))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 297 }, { "cell_type": "code", "collapsed": false, "input": [ "## tri-kmeans features \n", "reload(features)\n", "tri_kmeans = features.TriKmeansFeatures(n_clusters = 20, \n", " feat_patches = features.bootstrap_seqs(range(n_features), \n", " n_iter = 500, subsize=20), \n", " client = client)\n", "tri_X = tri_kmeans.fit_transform(X)\n", "feature_patches = features.strided_seqs(range(tri_X.shape[1]), stride = 15, subsize=30)\n", "data_name_prefix = 'tri_%d_%d'\n", "for feats in feature_patches:\n", " train_X = features.patch(tri_X, train_index, feats)\n", " train_y = features.patch(y, train_index)\n", " test_X = features.patch(tri_X, test_index, feats)\n", " test_y = features.patch(y, test_index)\n", " train_name = 'train_' + data_name_prefix % (feats[0], feats[-1])\n", " test_name = 'test_' + data_name_prefix % (feats[0], feats[-1])\n", " data_records.append([train_name, (train_X, train_y), {}])\n", " data_records.append([test_name, (test_X, test_y), {}])\n", " data_names.append((train_name, test_name))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 24/24 tasks finished after 14 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n" ] } ], "prompt_number": 298 }, { "cell_type": "code", "collapsed": false, "input": [ "## 2nd 
order features\n", "## TODO" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 299 }, { "cell_type": "code", "collapsed": false, "input": [ "## Write data in batch\n", "ensemble.batch_write_data(ensemble_path, data_records)\n", "print len(data_names)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1266\n" ] } ], "prompt_number": 300 }, { "cell_type": "markdown", "metadata": {}, "source": [ "***FARM MODELS***" ] }, { "cell_type": "code", "collapsed": false, "input": [ "## different model configurations \n", "from sklearn import svm\n", "from sklearn import linear_model\n", "from sklearn import tree\n", "from sklearn.grid_search import IterGrid\n", "models = []\n", "## tree models\n", "tree_params = IterGrid({'criterion': ['gini', 'entropy'], 'max_depth': range(5, 16)})\n", "for param in tree_params:\n", " model_name = 'tree_%s_%d' % (param['criterion'], param['max_depth'])\n", " models.append((model_name, tree.DecisionTreeClassifier(**param)))\n", "\"\"\"\n", "## svc models - slow to train and evaluate, no obvious accuracy improvement\n", "svc_params = IterGrid({'C': np.logspace(1, 5, 5), \n", " 'gamma': np.logspace(-5, 2, 8), \n", " 'probability': [True]})\n", "for param in svc_params:\n", " model_name = 'svc_%g_%g' % (param['C'], param['gamma'])\n", " models.append((model_name, svm.SVC(**param)))\n", "\"\"\"\n", "## linear models\n", "\"\"\"\n", "sgd_params = IterGrid({'penalty': ['l1', 'l2', 'elasticnet'], \n", " 'alpha': np.logspace(-5, 2, 8)})\n", "for param in sgd_params:\n", " model_name = 'sgd_%s_%g' % (param['penalty'], param['alpha'])\n", " models.append((model_name, linear_model.SGDClassifier(**param)))\n", "\"\"\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 301, "text": [ "\"\\nsgd_params = IterGrid({'penalty': ['l1', 'l2', 'elasticnet'], \\n 'alpha': np.logspace(-5, 2, 8)})\\nfor param in sgd_params:\\n model_name 
= 'sgd_%s_%g' % (param['penalty'], param['alpha'])\\n models.append((model_name, linear_model.SGDClassifier(**param)))\\n\"" ] } ], "prompt_number": 301 }, { "cell_type": "code", "collapsed": false, "input": [ "## link models with data\n", "## every model configuration is paired with every (train, validation) dataset;\n", "## the record name joins model name and dataset names with a double underscore\n", "model_names = []\n", "model_records = []\n", "for (model_name, model) in models:\n", "    for (train_data, validation_data) in data_names:\n", "        model_meta = {\n", "            'is_probabilistic': False,\n", "            'train_data': train_data,\n", "            'validation_data': validation_data,\n", "            'test_data': None,\n", "        }\n", "        model_data_name = '__'.join([model_name, train_data, validation_data])\n", "        model_names.append(model_data_name)\n", "        model_records.append([model_data_name, model, model_meta])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 302 }, { "cell_type": "code", "collapsed": false, "input": [ "## write models into ensemble\n", "## keep at most MAX_MODELS randomly chosen model/data combinations\n", "MAX_MODELS = 10000\n", "print len(model_names)\n", "selected_models = range(len(model_names))\n", "## dedicated seeded generator: the chosen subset is the same on every run\n", "## and the global random state is left untouched\n", "random.Random(0).shuffle(selected_models)\n", "selected_models = selected_models[:MAX_MODELS]\n", "## names and records are indexed with the same positions so they stay aligned\n", "model_names = np.array(model_names)[selected_models]\n", "model_records = np.array(model_records)[selected_models]\n", "print len(model_names)\n", "ensemble.batch_write_model(ensemble_path, model_records)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "27852\n", "10000" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 303 }, { "cell_type": "code", "collapsed": false, "input": [ "## train models in parallel\n", "## each selected model name is paired with the 'train_data' key\n", "model_data_pairs = zip(model_names, cycle(['train_data']))\n", "ensemble.parallel_train_models(ensemble_path, model_data_pairs, client)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 24/24 tasks finished after 186 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n" ] } ], "prompt_number": 304 }, { 
"cell_type": "code", "collapsed": false, "input": [ "## construct the greedy ensemble\n", "reload(ensemble)\n", "\n", "## DISCRETE CLASSIFICATION\n", "from sklearn.metrics import accuracy_score\n", "ge = ensemble.GreedyEnsemble(ensemble_path, \n", " scorefn = ensemble.GreedyEnsemble.score_label_classification,\n", " votefn = ensemble.GreedyEnsemble.vote_major_class,\n", " client = client)\n", "\"\"\"\n", "## PROBALISTIC CLASSIFICATION\n", "ge = ensemble.GreedyEnsemble(ensemble_path, \n", " scorefn = ensemble.GreedyEnsemble.score_label_prob_classification,\n", " votefn = ensemble.GreedyEnsemble.vote_average,\n", " client = client)\n", "\"\"\"\n", "ge.fit(model_names, verbose = True)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 24/24 tasks finished after 266 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_gini_12__train_ka_1404_956__test_ka_1404_956 improvement from 0.0 to 0.28\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_gini_12__train_bs_254_412__test_bs_254_412 improvement from 0.28 to 0.32\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_entropy_10__train_tri_4875_4904__test_tri_4875_4904 improvement from 0.32 to 0.34\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_entropy_10__train_bs_676_867__test_bs_676_867 improvement from 0.34 to 0.355\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_entropy_14__train_tri_5415_5444__test_tri_5415_5444 improvement from 0.355 to 0.375\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_gini_10__train_bs_1597_1597__test_bs_1597_1597 improvement from 0.375 to 0.38\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", 
"text": [ " tree_gini_13__train_ka_829_336__test_ka_829_336 improvement from 0.38 to 0.395\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_gini_11__train_bs_328_285__test_bs_328_285 improvement from 0.395 to 0.405\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_entropy_8__train_bs_1161_617__test_bs_1161_617 improvement from 0.405 to 0.42\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_gini_9__train_bs_1766_1870__test_bs_1766_1870 improvement from 0.42 to 0.44\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_entropy_14__train_ka_80_324__test_ka_80_324 improvement from 0.44 to 0.445\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_entropy_11__train_bs_335_560__test_bs_335_560 improvement from 0.445 to 0.445\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_gini_15__train_ka_1542_1170__test_ka_1542_1170 improvement from 0.445 to 0.45\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_gini_12__train_bs_780_780__test_bs_780_780 improvement from 0.45 to 0.45\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_entropy_11__train_bs_1099_1186__test_bs_1099_1186 improvement from 0.45 to 0.455\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_gini_13__train_ka_1762_1451__test_ka_1762_1451 improvement from 0.455 to 0.455\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_entropy_12__train_bs_371_599__test_bs_371_599 improvement from 0.455 to 0.47\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_entropy_7__train_bs_549_321__test_bs_549_321 improvement from 0.47 to 0.475\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", 
"text": [ " tree_entropy_13__train_tri_4080_4109__test_tri_4080_4109 improvement from 0.475 to 0.475\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_gini_13__train_ka_72_1838__test_ka_72_1838 improvement from 0.475 to 0.48\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_gini_11__train_ka_355_110__test_ka_355_110 improvement from 0.48 to 0.495\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_gini_15__train_bs_994_1245__test_bs_994_1245 NO improvement from 0.495 to 0.49\n" ] }, { "output_type": "pyout", "prompt_number": 305, "text": [ "GreedyEnsemble(client=,\n", " ensemble_path='/home/ce/mali/tutorials/ml-tutorials/tmp/blackbox_ensemble_iii',\n", " random_seed=0,\n", " scorefn=,\n", " votefn=)" ] } ], "prompt_number": 305 }, { "cell_type": "code", "collapsed": false, "input": [ "## performance checking on training data\n", "print ge.score(data_type = 'train_data')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 21/21 tasks finished after 0 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n", "0.9925\n" ] } ], "prompt_number": 307 }, { "cell_type": "code", "collapsed": false, "input": [ "## performance checking on validation_data\n", "print ge.score(data_type = 'validation_data')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 21/21 tasks finished after 0 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n", "0.495\n" ] } ], "prompt_number": 308 }, { "cell_type": "code", "collapsed": false, "input": [ "## write out ensemble\n", "## OVERWRITE the pervious solution - the results showed that\n", "## merging sevearl ensembles in a naive way is NOT really useful practice\n", "reload(ensemble)\n", "optimal_ensemble_path = 'tmp/blackbox_ensemble_iii_optimal'\n", 
"ensemble.copy_ensemble(ensemble_path, optimal_ensemble_path, ge.ensemble_, overwrite=True)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 310 }, { "cell_type": "code", "collapsed": false, "input": [ "!ls tmp/blackbox_ensemble_iii_optimal/" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "data data.json models models.json\r\n" ] } ], "prompt_number": 311 }, { "cell_type": "code", "collapsed": false, "input": [ "print 'current number of models in the optimal ensemble',len(ensemble.all_model_names(optimal_ensemble_path))\n", "print ensemble.all_model_names(optimal_ensemble_path)\n", "for model_name in ensemble.all_model_names(optimal_ensemble_path):\n", " ensemble.update_model_record(optimal_ensemble_path, model_name, {'is_probabilistic': False})" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "current number of models in the optimal ensemble" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 21\n", "[u'tree_gini_13__train_ka_1762_1451__test_ka_1762_1451', u'tree_gini_12__train_ka_1404_956__test_ka_1404_956', u'tree_gini_15__train_ka_1542_1170__test_ka_1542_1170', u'tree_entropy_11__train_bs_1099_1186__test_bs_1099_1186', u'tree_entropy_10__train_tri_4875_4904__test_tri_4875_4904', u'tree_entropy_13__train_tri_4080_4109__test_tri_4080_4109', u'tree_gini_11__train_ka_355_110__test_ka_355_110', u'tree_gini_10__train_bs_1597_1597__test_bs_1597_1597', u'tree_gini_13__train_ka_72_1838__test_ka_72_1838', u'tree_gini_13__train_ka_829_336__test_ka_829_336', u'tree_gini_12__train_bs_780_780__test_bs_780_780', u'tree_gini_12__train_bs_254_412__test_bs_254_412', u'tree_entropy_8__train_bs_1161_617__test_bs_1161_617', u'tree_entropy_14__train_tri_5415_5444__test_tri_5415_5444', u'tree_entropy_11__train_bs_335_560__test_bs_335_560', u'tree_gini_9__train_bs_1766_1870__test_bs_1766_1870', 
u'tree_entropy_12__train_bs_371_599__test_bs_371_599', u'tree_entropy_10__train_bs_676_867__test_bs_676_867', u'tree_entropy_14__train_ka_80_324__test_ka_80_324', u'tree_entropy_7__train_bs_549_321__test_bs_549_321', u'tree_gini_11__train_bs_328_285__test_bs_328_285']\n" ] } ], "prompt_number": 312 }, { "cell_type": "code", "collapsed": false, "input": [ "## reconstruct the ensemble model\n", "## the number of models in the optimal ensemble keeps growing,\n", "## so it becomes an aggressive model\n", "gee = ensemble.GreedyEnsemble(optimal_ensemble_path, \n", " scorefn=accuracy_score, \n", " votefn=ensemble.GreedyEnsemble.vote_major_class, \n", " client = client)\n", "#gee.fit(ensemble.all_model_names(optimal_ensemble_path), verbose = True)\n", "gee.ensemble_ = ensemble.all_model_names(optimal_ensemble_path)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 313 }, { "cell_type": "code", "collapsed": false, "input": [ "print gee.score('train_data')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 21/21 tasks finished after 0 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n", "0.9925\n" ] } ], "prompt_number": 314 }, { "cell_type": "code", "collapsed": false, "input": [ "print gee.score('validation_data')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 21/21 tasks finished after 0 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n", "0.495\n" ] } ], "prompt_number": 315 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }