{
 "metadata": {
  "name": "",
  "signature": "sha256:3bb3760ed193ed588be5a0a1bac7e773c32e03a40d80091272e7ba668f53f630"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Self-training with instance-weighted ridge regression\n",
      "\n",
      "Work-in-progress skeleton of an incremental self-training loop: a ridge model is fit on the labeled documents plus down-weighted guessed labels. The confidence-selection and label-switching steps are still stubs (marked `TODO`)."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import sklearn\n",
      "from sklearn.linear_model import Ridge\n",
      "\n",
      "from scipy import linalg\n",
      "from scipy import sparse\n",
      "from scipy.sparse import linalg as sp_linalg\n",
      "\n",
      "\n",
      "from sklearn.utils.extmath import safe_sparse_dot"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Original note: the ridge regression library did not accept rescaled\n",
      "# (sample-weighted) data for the sparse_cg solver, reason unknown, so the\n",
      "# rescaling is done by hand before fitting.\n",
      "# TODO: test this helper.\n",
      "def _rescale_data(X, y, sample_weight):\n",
      "    \"\"\"Rescale data so as to support sample_weight.\n",
      "\n",
      "    Each row of X and the matching entry of y are multiplied by the\n",
      "    square root of that sample's weight.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    X : array or sparse matrix whose first dimension is n_samples\n",
      "    y : targets whose first dimension is n_samples\n",
      "    sample_weight : scalar or array of length n_samples\n",
      "        A scalar is broadcast to every sample.\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    (X, y) : the rescaled data and targets.\n",
      "    \"\"\"\n",
      "    n_samples = X.shape[0]\n",
      "    # broadcast a scalar weight to one weight per sample\n",
      "    sample_weight = sample_weight * np.ones(n_samples)\n",
      "    # scaling a row by sqrt(w) scales its squared residual by w\n",
      "    sample_weight = np.sqrt(sample_weight)\n",
      "    sw_matrix = sparse.dia_matrix((sample_weight, 0),\n",
      "                                  shape=(n_samples, n_samples))\n",
      "    X = safe_sparse_dot(sw_matrix, X)\n",
      "    y = safe_sparse_dot(sw_matrix, y)\n",
      "    return X, y"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# classifier with instance weights\n",
      "# current: Ridge Regression w/ rescaled data and real labels\n",
      "def classify(X, y, sample_weight, alpha):\n",
      "    \"\"\"Fit a ridge model on sample-weighted data and return it.\n",
      "\n",
      "    The weights are applied by rescaling X and y with _rescale_data;\n",
      "    the model is then fit without an intercept using the sparse_cg\n",
      "    solver.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    X, y : training data and targets\n",
      "    sample_weight : scalar or per-sample weights\n",
      "    alpha : regularisation strength passed to Ridge\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    The fitted Ridge estimator.\n",
      "    \"\"\"\n",
      "    X, y = _rescale_data(X, y, sample_weight)\n",
      "    classifier = Ridge(alpha=alpha, fit_intercept=False,\n",
      "                       solver='sparse_cg')\n",
      "    classifier.fit(X, y)\n",
      "    return classifier"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# select the highest confidence documents using our model\n",
      "# X = X[unlabelled_ids]\n",
      "# classify all docs\n",
      "# select based on score R(+) , (1-R) (-)\n",
      "def select_high_confidence_results(X, R, classifier):\n",
      "    \"\"\"Select the highest-confidence documents (not implemented yet).\n",
      "\n",
      "    Planned: score all documents in X with `classifier`, then keep the\n",
      "    top R positive and bottom (1 - R) negative scores.\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    A list of selected ids; currently always empty.\n",
      "    \"\"\"\n",
      "    high_c_ids = []\n",
      "\n",
      "    # TODO: apply to all data\n",
      "\n",
      "    # TODO: select the top R positive, bottom (1-R) negative scores\n",
      "\n",
      "    return high_c_ids"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# self training step\n",
      "# apply classifier to documents w/labels + guessed_labels\n",
      "# add R fraction of [+] documents (and 1-R [-]) to the guessed_labels set\n",
      "# retrain, with guessed_labels weighted down\n",
      "def self_train_step(X, y, W, U, R, alpha, labeled_ids, guessed_ids, unlabeled_ids):\n",
      "    \"\"\"Run one self-training step (partially implemented).\n",
      "\n",
      "    Fits a weighted ridge model on the labeled documents plus the\n",
      "    currently guessed ones. Adding the R / (1 - R) high-confidence\n",
      "    positive / negative documents to the guessed set and switching\n",
      "    labels are still TODO.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    X, y : data and labels, indexable by integer id arrays\n",
      "    W : total weight shared equally by the labeled documents\n",
      "    U : total weight shared equally by the guessed documents\n",
      "    R : fraction of positive documents to add (not used yet)\n",
      "    alpha : regularisation strength, passed to classify\n",
      "    labeled_ids : integer ids of the labeled documents (non-empty)\n",
      "    guessed_ids : integer ids of documents with guessed labels; may\n",
      "        be empty (list or array)\n",
      "    unlabeled_ids : ids of unlabeled documents (not used yet)\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    0, a placeholder until the step is finished.\n",
      "\n",
      "    Raises\n",
      "    ------\n",
      "    ValueError if labeled_ids is empty.\n",
      "    \"\"\"\n",
      "    labeled_ids = np.asarray(labeled_ids)\n",
      "    # force an integer dtype: an empty list would otherwise become a\n",
      "    # float array, which cannot be used as an index\n",
      "    guessed_ids = np.asarray(guessed_ids, dtype=int)\n",
      "\n",
      "    num_labeled = labeled_ids.shape[0]\n",
      "    num_guessed = guessed_ids.shape[0]\n",
      "    if num_labeled == 0:\n",
      "        raise ValueError('self_train_step needs at least one labeled id')\n",
      "\n",
      "    # apply classifier with sample weights W and U\n",
      "    # to labeled docs and current guess labels\n",
      "    current_labels = labeled_ids\n",
      "\n",
      "    if num_guessed > 0:\n",
      "        current_labels = np.union1d(labeled_ids, guessed_ids)\n",
      "\n",
      "    X_current = X[current_labels]\n",
      "    y_current = y[current_labels]\n",
      "\n",
      "    # how do we set the sample_weights ?\n",
      "    # create weights for all documents, and then select?\n",
      "    # wasteful but simple\n",
      "\n",
      "    # does the guess get sample weight W or U ... I think U\n",
      "    # notice: we normalize by num_guessed, not num_unlabelled\n",
      "    # explicit float dtype: np.empty_like(y) would inherit y's dtype and\n",
      "    # truncate fractional weights to 0 when the labels are integers\n",
      "    instance_weights = np.zeros(y.shape[0], dtype=float)\n",
      "    instance_weights[labeled_ids] = W / float(num_labeled)\n",
      "    if num_guessed > 0:\n",
      "        # guarded: with no guesses yet this would divide by zero\n",
      "        instance_weights[guessed_ids] = U / float(num_guessed)\n",
      "\n",
      "    current_weights = instance_weights[current_labels]\n",
      "\n",
      "    current_model = classify(X_current, y_current, current_weights, alpha)\n",
      "\n",
      "    # TODO: add the R/(1-R) high confidence (+)/(-) documents to the\n",
      "    # guessed set\n",
      "\n",
      "    # TODO: switch based on the current set of guesses total\n",
      "    # so can switch out labels that were not present earlier\n",
      "\n",
      "    # TODO: switch\n",
      "\n",
      "    return 0"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# multi-switch algo\n",
      "# switch [R (+)]/[1-R (-)] labels\n",
      "# if they make the current fit better\n",
      "#\n",
      "def switch_labels(X, y, labeled_ids, guessed_ids, unlabeled_ids, alpha, W, U, R):\n",
      "    \"\"\"Switch guessed labels when that improves the fit (not implemented).\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    The number of labels switched; currently always 0.\n",
      "    \"\"\"\n",
      "    num_switched = 0\n",
      "\n",
      "    # TODO: implement the multi-switch step\n",
      "\n",
      "    return num_switched"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# metric to decide if we switch the labels or not\n",
      "# can we use the margin even for Regularized Least Squares?\n",
      "# the regularizer is the same\n",
      "#\n",
      "# if Xw=y, then w=(X^-1)y\n",
      "# => we need the current version of the classifier, with weights set\n",
      "# some function of the classifier\n",
      "def switch_metric(classifier):\n",
      "    \"\"\"Metric for deciding whether to switch labels (not implemented).\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    0, a placeholder value.\n",
      "    \"\"\"\n",
      "    return 0"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# run the incremental self training algo\n",
      "#\n",
      "def self_train(X, y, labeled_ids, unlabeled_ids, R=0.5, U=1, W=0.001, alpha=1.0):\n",
      "    \"\"\"Run the incremental self-training algorithm (work in progress).\n",
      "\n",
      "    Only a single step (istep = 1) is executed so far; the loop over\n",
      "    the U steps and the stopping rule are still TODO.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    X, y : data and labels, indexable by integer id arrays\n",
      "    labeled_ids : integer ids of the labeled documents\n",
      "    unlabeled_ids : ids of the unlabeled documents\n",
      "    R : fraction of positive documents to add per step\n",
      "    U : final total weight of the guessed documents\n",
      "    W : total weight of the labeled documents\n",
      "    alpha : regularisation strength, passed on to the classifier\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    None.\n",
      "    \"\"\"\n",
      "    U_step_size = 0.001\n",
      "    istep = 1\n",
      "    # or, equivalently, num_steps = 1000\n",
      "    # U_step_size = 1/num_steps\n",
      "\n",
      "    # run initial classifier\n",
      "    X_labeled = X[labeled_ids]\n",
      "    y_labeled = y[labeled_ids]\n",
      "\n",
      "    # no guesses yet; an integer array so it can be used as an index\n",
      "    guessed_ids = np.empty(0, dtype=int)\n",
      "\n",
      "    # TODO: loop over istep = start to finish\n",
      "    # U_step_size*U to U in increments U_step_size\n",
      "\n",
      "    # or: break at maximum U steps\n",
      "    # or: break at some convergence criteria?\n",
      "\n",
      "    # set guessed_sample_weights\n",
      "    U_step = (U_step_size*istep)*U\n",
      "\n",
      "    # apply current classifier to remaining unlabeled data\n",
      "    # note: U = U_step\n",
      "    self_train_step(X, y, W, U_step, R, alpha, labeled_ids, guessed_ids, unlabeled_ids)\n",
      "\n",
      "    # NOTE(review): this passes the full U, not U_step -- confirm intended\n",
      "    switch_labels(X, y, labeled_ids, guessed_ids, unlabeled_ids, alpha, W, U, R)\n",
      "\n",
      "    # stop or keep switching / stop?\n",
      "    # just run all the way to the end?"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}