{ "metadata": { "name": "TEST_greedy_ensemble_II" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "TEST greedy ensemble on blackbox" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import ensemble\n", "import features\n", "from itertools import cycle\n", "from IPython.parallel import Client\n", "client = Client()\n", "print len(client)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "4\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "import cPickle\n", "## load the (X, y) tuple; the context manager closes the file handle after loading\n", "## NOTE: unpickling executes arbitrary code - only load pickles from a trusted source\n", "with open('data/blackbox.pkl', 'rb') as data_file:\n", "    X, y = cPickle.load(data_file)\n", "print X.shape, y.shape" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "(1000, 1875) (1000,)\n" ] } ], "prompt_number": 2 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "1. Different models on the same original data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "reload(ensemble)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 3, "text": [ "" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "## split data into train and validation sets (80% / 20%)\n", "from sklearn.cross_validation import train_test_split\n", "train_X, validation_X, train_y, validation_y = train_test_split(X, y, test_size = 0.2)\n", "print train_X.shape, validation_X.shape\n", "print train_y.shape, validation_y.shape" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "(800, 1875) (200, 1875)\n", "(800,) (200,)\n" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "## make new ensemble\n", "!rm -fR tmp/blackbox_raw_ensemble\n", "ensemble_path = ensemble.new_ensemble('blackbox_raw_ensemble', 'tmp/')\n", "print ensemble_path" ], "language": "python",
"metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Users/lima/workspace/tutorials/ml-tutorials/tmp/blackbox_raw_ensemble" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "## persist data\n", "ensemble.write_data(ensemble_path, 'train_blackbox', (train_X, train_y), {'description': 'blackbox data training'})\n", "ensemble.write_data(ensemble_path, 'validation_blackbox', (validation_X, validation_y), {'description': 'blackbox data validation'})" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "## confirm data writing\n", "!ls tmp/blackbox_raw_ensemble/data" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "train_blackbox.pkl validation_blackbox.pkl\r\n", "train_blackbox.pkl_01.npy validation_blackbox.pkl_01.npy\r\n", "train_blackbox.pkl_02.npy validation_blackbox.pkl_02.npy\r\n" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "## configure candidate models; each key names the estimator family and its main hyper-parameter value\n", "from sklearn import svm\n", "from sklearn import linear_model\n", "from sklearn import tree\n", "models = {\n", " 'sgd_0.1': linear_model.SGDClassifier(alpha = 0.1)\n", " , 'svc_0.001': svm.SVC(probability=True, gamma=0.001)\n", " , 'svc_0.01': svm.SVC(probability=True, gamma = 0.01)\n", " , 'svc_0.1': svm.SVC(probability=True, gamma=0.1)\n", " , 'sgd_0.0001': linear_model.SGDClassifier(alpha = 0.0001)\n", " , 'sgd_0.001': linear_model.SGDClassifier(alpha = 0.001)\n", " , 'sgd_0.01': linear_model.SGDClassifier(alpha = 0.01) # SUPER model for digits\n", " , 'sgd_0.05': linear_model.SGDClassifier(alpha = 0.05)\n", " , 'sgd_0.15': linear_model.SGDClassifier(alpha = 0.15)\n", " , 'pac_1.0': linear_model.PassiveAggressiveClassifier(C=1.0)\n", " , 'pac_0.1':
linear_model.PassiveAggressiveClassifier(C=0.1)\n", " , 'pac_0.01': linear_model.PassiveAggressiveClassifier(C=0.01)\n", " , 'pac_0.001': linear_model.PassiveAggressiveClassifier(C=0.001)\n", " , 'tree_5': tree.DecisionTreeClassifier(max_depth=5)\n", " , 'tree_3': tree.DecisionTreeClassifier(max_depth=3)\n", " , 'tree_1': tree.DecisionTreeClassifier(max_depth=1)\n", "}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "## write models\n", "common_model_meta = {\n", " 'is_probabilistic': False\n", " , 'train_data': 'train_blackbox'\n", " , 'validation_data': 'validation_blackbox'\n", "}\n", "for (model_name, model) in models.items():\n", " ensemble.write_model(ensemble_path, model_name, model, model_meta = common_model_meta)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "## confirm model writing\n", "!ls tmp/blackbox_raw_ensemble/models" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "pac_0.001.pkl sgd_0.0001.pkl sgd_0.1.pkl svc_0.1.pkl\r\n", "pac_0.01.pkl sgd_0.001.pkl sgd_0.15.pkl tree_1.pkl\r\n", "pac_0.1.pkl sgd_0.01.pkl svc_0.001.pkl tree_3.pkl\r\n", "pac_1.0.pkl sgd_0.05.pkl svc_0.01.pkl tree_5.pkl\r\n" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "## train models in parallel\n", "model_data_pairs = zip(models.keys(), cycle(['train_data']))\n", "ensemble.parallel_train_models(ensemble_path, model_data_pairs, client)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 4/4 tasks finished after 16 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "## construct ensemble model\n", "from sklearn import metrics\n", "ge = 
ensemble.GreedyEnsemble(ensemble_path, \n", " scorefn = metrics.accuracy_score,\n", " votefn = ensemble.GreedyEnsemble.vote_major_class,\n", " client = client)\n", "ge.fit(models.keys(), verbose = True)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 4/4 tasks finished after 0 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n", "checking model svc_0.01 improvement from 0.0 to 0.25\n", "checking model tree_3 improvement from 0.25 to 0.25\n", "checking model sgd_0.1 improvement from 0.25 to 0.255\n", "checking model tree_5 improvement from 0.255 to 0.255\n", "checking model sgd_0.01 improvement from 0.255 to 0.255\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " pac_1.0 NO improvement from 0.255 to 0.25\n" ] }, { "output_type": "pyout", "prompt_number": 12, "text": [ "GreedyEnsemble(client=,\n", " ensemble_path='/Users/lima/workspace/tutorials/ml-tutorials/tmp/blackbox_raw_ensemble',\n", " random_seed=0, scorefn=,\n", " votefn=)" ] } ], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "## model performance\n", "print ge.score(data_type = 'train_data')\n", "print ge.score(data_type = 'validation_data')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 4/4 tasks finished after 0 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n", "0.255\n" ] } ], "prompt_number": 13 }, { "cell_type": "raw", "metadata": {}, "source": [ "## try adding all models to the ensemble and see the results\n", "ge.ensemble_ = models.keys()\n", "print ge.score(data_type='train_data')\n", "print ge.score(data_type='validation_data')\n", "print ge.ensemble_ " ] }, { "cell_type": "raw", "metadata": {}, "source": [ "## try individual models and see the results\n", "for model_name in models.keys():\n", " print 'for model', model_name\n", " ge.ensemble_ = [model_name]\n", "
print ge.score(data_type='train_data')\n", " print ge.score(data_type='validation_data')\n", " print '--------------------------'\n", " print ''" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "2. Train different models on different data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "reload(ensemble)\n", "reload(features)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 169, "text": [ "" ] } ], "prompt_number": 169 }, { "cell_type": "code", "collapsed": false, "input": [ "## create new ensemble folder\n", "!rm -fR tmp/blackbox_mixed_ensemble/\n", "ensemble_path = ensemble.new_ensemble('blackbox_mixed_ensemble', 'tmp/')\n", "print ensemble_path" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/home/ce/mali/tutorials/ml-tutorials/tmp/blackbox_mixed_ensemble" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 170 }, { "cell_type": "code", "collapsed": false, "input": [ "## from sklearn.cross_validation import train_test_split\n", "n_samples, n_features = X.shape\n", "## same train and test index for each sub dataset\n", "train_index, test_index = train_test_split(range(n_samples), test_size = 0.2)\n", "data_names = []\n", "data_infor = []" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 171 }, { "cell_type": "code", "collapsed": false, "input": [ "## HEAVY IO ENGAGEMENT\n", "## create different data from raw features\n", "## data set of all rows with different columns (strided)\n", "sub_features = features.strided_seqs(range(n_features), stride = 25, subsize = 25)\n", "sub_features += features.strided_seqs(range(n_features), stride = 10, subsize = 50)\n", "sub_features += features.bootstrap_seqs(range(n_features), n_iter = 500, subsize = 30)\n", "data_name_index = [('subdata_%d_%d' % (sub_feature[0], sub_feature[-1]), sub_feature) \n", " for sub_feature
in sub_features]\n", "for data_name, feat_index in data_name_index:\n", " train_X = features.patch(X, train_index, feat_index)\n", " train_y = features.patch(y, train_index)\n", " test_X = features.patch(X, test_index, feat_index)\n", " test_y = features.patch(y, test_index)\n", " train_name = 'train_' + data_name\n", " test_name = 'test_' + data_name\n", " data_names.append((train_name, test_name))\n", " data_infor.append([train_name, (train_X, train_y), {}])\n", " data_infor.append([test_name, (test_X, test_y), {}])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 172 }, { "cell_type": "code", "collapsed": false, "input": [ "## create MORE features from the tri-kmeans result\n", "reload(features)\n", "feat_patches = features.bootstrap_seqs(range(n_features), n_iter = 500, subsize = 30)\n", "tri_kmeans = features.TriKmeansFeatures(n_clusters = 10, feat_patches = feat_patches, client = client)\n", "tri_X = tri_kmeans.fit_transform(X)\n", "sub_features = features.strided_seqs(range(tri_X.shape[1]), stride = 15, subsize=30)\n", "data_name_index = [('tridata_%d_%d' % (sub_feature[0], sub_feature[-1]), sub_feature) \n", " for sub_feature in sub_features]\n", "for data_name, feat_index in data_name_index:\n", " train_X = features.patch(tri_X, train_index, feat_index)\n", " train_y = features.patch(y, train_index)\n", " test_X = features.patch(tri_X, test_index, feat_index)\n", " test_y = features.patch(y, test_index)\n", " train_name = 'train_' + data_name\n", " test_name = 'test_' + data_name\n", " data_names.append((train_name, test_name))\n", " data_infor.append([train_name, (train_X, train_y), {}])\n", " data_infor.append([test_name, (test_X, test_y), {}])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 24/24 tasks finished after 12 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n" ] } ], "prompt_number": 173 }, { "cell_type": "code", "collapsed":
false, "input": [ "ensemble.batch_write_data(ensemble_path, data_infor)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 174 }, { "cell_type": "code", "collapsed": false, "input": [ "print len(data_names)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1095\n" ] } ], "prompt_number": 175 }, { "cell_type": "code", "collapsed": false, "input": [ "## configure different models\n", "from sklearn import svm\n", "from sklearn import linear_model\n", "from sklearn import tree\n", "models = {\n", " 'sgd_0.1': linear_model.SGDClassifier(alpha = 0.1)\n", " #, 'svc_0.001': svm.SVC(probability=True, gamma=0.001)\n", " #, 'svc_0.01': svm.SVC(probability=True, gamma = 0.01)\n", " #, 'svc_0.1': svm.SVC(probability=True, gamma=0.1)\n", " #, 'sgd_0.0001': linear_model.SGDClassifier(alpha = 0.0001)\n", " #, 'sgd_0.001': linear_model.SGDClassifier(alpha = 0.001)\n", " #, 'sgd_0.01': linear_model.SGDClassifier(alpha = 0.01) # SUPER model for digits\n", " #, 'sgd_0.05': linear_model.SGDClassifier(alpha = 0.05)\n", " #, 'sgd_0.15': linear_model.SGDClassifier(alpha = 0.05)\n", " #, 'pac_1.0': linear_model.PassiveAggressiveClassifier(C=1.0)\n", " #, 'pac_0.1': linear_model.PassiveAggressiveClassifier(C=0.1)\n", " #, 'pac_0.01': linear_model.PassiveAggressiveClassifier(C=0.01)\n", " #, 'pac_0.001': linear_model.PassiveAggressiveClassifier(C=0.001)\n", " , 'tree_11': tree.DecisionTreeClassifier(max_depth=11)\n", " , 'tree_10': tree.DecisionTreeClassifier(max_depth=10)\n", " , 'tree_9': tree.DecisionTreeClassifier(max_depth=9)\n", " , 'tree_8': tree.DecisionTreeClassifier(max_depth=8)\n", " , 'tree_7': tree.DecisionTreeClassifier(max_depth=7)\n", " , 'tree_6': tree.DecisionTreeClassifier(max_depth=6)\n", " , 'tree_5': tree.DecisionTreeClassifier(max_depth=5)\n", " , 'tree_4': tree.DecisionTreeClassifier(max_depth=4)\n", " , 'tree_3': tree.DecisionTreeClassifier(max_depth=3)\n", " , 'tree_2': 
tree.DecisionTreeClassifier(max_depth=2)\n", " , 'tree_1': tree.DecisionTreeClassifier(max_depth=1)\n", "}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 176 }, { "cell_type": "code", "collapsed": false, "input": [ "model_metas = [{\n", " 'is_probabilistic': False\n", " , 'train_data': train_name\n", " , 'validation_data': validation_name\n", " , 'test_data': None\n", "} for (train_name, validation_name) in data_names]\n", "print len(model_metas)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1095\n" ] } ], "prompt_number": 177 }, { "cell_type": "code", "collapsed": false, "input": [ "n_candidates = len(models)\n", "from random import shuffle\n", "## write model configurations\n", "model_names = []\n", "model_infor = []\n", "for model_meta in model_metas:\n", " candidates = models.items()\n", " shuffle(candidates)\n", " candidates = candidates[:n_candidates]\n", " ## randomly select \n", " for (template_name, model) in candidates:\n", " model_name = template_name + \"_\" + model_meta[\"train_data\"] + \"_\" + model_meta['validation_data']\n", " model_names.append(model_name)\n", " model_infor.append([model_name, model, model_meta])\n", "ensemble.batch_write_model(ensemble_path, model_infor)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 178 }, { "cell_type": "code", "collapsed": false, "input": [ "## train models in parallel\n", "model_data_pairs = zip(model_names, cycle(['train_data']))\n", "ensemble.parallel_train_models(ensemble_path, model_data_pairs, client)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 24/24 tasks finished after 238 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n" ] } ], "prompt_number": 179 }, { "cell_type": "code", "collapsed": false, "input": [ "## construct ensemble model\n", "ensemble_path = 
'tmp/blackbox_mixed_ensemble/'\n", "from sklearn.metrics import accuracy_score\n", "ge = ensemble.GreedyEnsemble(ensemble_path, \n", " scorefn = accuracy_score, \n", " votefn = ensemble.GreedyEnsemble.vote_major_class, \n", " client = client)\n", "ge.fit(model_names, verbose=True)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 24/24 tasks finished after 444 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_6_train_subdata_632_1701_test_subdata_632_1701 improvement from 0.0 to 0.27\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_10_train_subdata_860_909_test_subdata_860_909 improvement from 0.27 to 0.295\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_9_train_subdata_1180_1229_test_subdata_1180_1229 improvement from 0.295 to 0.335\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_10_train_subdata_1690_1739_test_subdata_1690_1739 improvement from 0.335 to 0.34\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_9_train_subdata_1840_14_test_subdata_1840_14 improvement from 0.34 to 0.37\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_8_train_subdata_1720_1769_test_subdata_1720_1769 improvement from 0.37 to 0.39\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_10_train_subdata_1790_1839_test_subdata_1790_1839 improvement from 0.39 to 0.41\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tree_11_train_subdata_347_1620_test_subdata_347_1620 improvement from 0.41 to 0.415\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " sgd_0.1_train_tridata_3630_3659_test_tridata_3630_3659 
improvement from 0.415 to 0.42\n", "checking model" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " sgd_0.1_train_tridata_3390_3419_test_tridata_3390_3419 NO improvement from 0.42 to 0.415\n" ] }, { "output_type": "pyout", "prompt_number": 180, "text": [ "GreedyEnsemble(client=,\n", " ensemble_path='tmp/blackbox_mixed_ensemble/', random_seed=0,\n", " scorefn=,\n", " votefn=)" ] } ], "prompt_number": 180 }, { "cell_type": "code", "collapsed": false, "input": [ "## performance\n", "print ge.score(data_type = 'train_data')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 9/9 tasks finished after 0 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n", "0.84875\n" ] } ], "prompt_number": 181 }, { "cell_type": "code", "collapsed": false, "input": [ "print ge.score(data_type = 'validation_data')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 9/9 tasks finished after 0 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "done\n", "0.42\n" ] } ], "prompt_number": 182 }, { "cell_type": "code", "collapsed": false, "input": [ "reload(ensemble)\n", "data_names = ensemble.all_data_names(ensemble_path)\n", "print len(data_names)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1656\n" ] } ], "prompt_number": 153 }, { "cell_type": "code", "collapsed": false, "input": [ "reload(ensemble)\n", "model_names = ensemble.all_model_names(ensemble_path)\n", "print len(model_names)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "9936\n" ] } ], "prompt_number": 154 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }