{
 "metadata": {
  "name": "",
  "signature": "sha256:3bb3760ed193ed588be5a0a1bac7e773c32e03a40d80091272e7ba668f53f630"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import sklearn\n",
      "from sklearn.linear_model import Ridge\n",
      "\n",
      "from scipy import linalg\n",
      "from scipy import sparse\n",
      "from scipy.sparse import linalg as sp_linalg\n",
      "\n",
      "\n",
      "from sklearn.utils.extmath import safe_sparse_dot"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import sklearn\n",
      "from sklearn.linear_model import Ridge\n",
      "\n",
      "from scipy import linalg\n",
      "from scipy import sparse\n",
      "from scipy.sparse import linalg as sp_linalg\n",
      "\n",
      "\n",
      "from sklearn.utils.extmath import safe_sparse_dot"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# for some reason, the python ridge regression library does not\n",
      "# allow rescaled data for the sparse_cg solver..no idea why \n",
      "#  probably should test the \n",
      "def _rescale_data(X, y, sample_weight):\n",
      "    \"\"\"Rescale data so as to support sample_weight\"\"\"\n",
      "    n_samples = X.shape[0]\n",
      "    sample_weight = sample_weight * np.ones(n_samples)\n",
      "    sample_weight = np.sqrt(sample_weight)\n",
      "    sw_matrix = sparse.dia_matrix((sample_weight, 0),\n",
      "                                  shape=(n_samples, n_samples))\n",
      "    X = safe_sparse_dot(sw_matrix, X)\n",
      "    y = safe_sparse_dot(sw_matrix, y)\n",
      "    return X, y\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# classifier with/ instance weights\n",
      "# current:  Ridge Regression w/rescaled data and real labels\n",
      "def classify(X, y, sample_weight, alpha):\n",
      "    X, y = _rescale_data(X, y, sample_weight)\n",
      "    classifier = Ridge(alpha=alpha, fit_intercept=False, solver='sparse_cg')\n",
      "    classifier.fit(X, y)\n",
      "    return classifier"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# select the highest confidence documents using our model\n",
      "# X = X[unlabelled_ids]\n",
      "#  classify all docs\n",
      "#  select based on score  R(+) , (1-R) (-)\n",
      "del select_high_confidence_results(X, R, classifier):\n",
      "    high_c_ids = []\n",
      "  \n",
      "    # apply to all data\n",
      "    \n",
      "    # select the top R positive, bottom (1-R) negative scores\n",
      "    \n",
      "    return high_c_ids"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# self training step\n",
      "#  apply classifier to documents w/labels + guessed_labels \n",
      "#  add R fraction of [+] documents (and 1-R [-]) guessed_labels set\n",
      "#  retrain, with guessed_label ss weighted down\n",
      "def self_train_step(X, y, W, U, R, alpha, labeled_ids, guessed_ids, unlabeled_ids):\n",
      "    \n",
      "    # apply classifier with sample weights W and U\n",
      "    # to labeled docs and current guess labels\n",
      "    current_labels = labeled_ids   \n",
      "    \n",
      "    if guessed_ids.shape[0] > 0 then\n",
      "        current_labels = np.union1d(labeled_ids,guessed_ids)\n",
      "        \n",
      "    X_current = X[current_labels]\n",
      "    y_current = y[current_labels]\n",
      "    \n",
      "    # how do we set the sample_weights ?\n",
      "    # create weights for all unlabeled, and then select?\n",
      "    # wasteful but simple\n",
      "    \n",
      "    # does the guess get sample weight W or U ... I think U\n",
      "    # notice: we normalize by num_guessed, not num_unlabelled\n",
      "    instance_weights = np.empty_like(y)\n",
      "    instance_weights[labeled_ids] = W / float(labeled_ids.shape[0])\n",
      "    instance_weights[guessed_ids] = U / float(guessed_ids.shape[0])  \n",
      "    \n",
      "    current_weights = instance_weights[current_labels]\n",
      " \n",
      "    current_model = classify(X_current, y_current, current_weights, alpha)\n",
      "\n",
      "    # added the R/(1-R) high confidence (+)/(-) documents to the guessed set\n",
      "    \n",
      "    #\n",
      "    #\n",
      "    #\n",
      "    \n",
      "    # switch based on the current set of guesses total\n",
      "    # so can switch out labels that were not present earlier\n",
      "    \n",
      "    #\n",
      "    #\n",
      "    #\n",
      "    \n",
      "    # switch \n",
      "    \n",
      "    return 0"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# multi-switch algo \n",
      "#  switch [R (+)]/[1-R (-)] labels \n",
      "#  if they make the current fit better\n",
      "#  \n",
      "def switch_labels(X, y,labeled_ids, guessed_ids, unlabeled_ids, alpha, W, U, R) :\n",
      "    num_switched = 0\n",
      "   \n",
      "    \n",
      "    return num_switched"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# metric to decide if we switch the labels or not\n",
      "# can we use the margin even for Regularized Least Squares?\n",
      "# the regularizer is the same \n",
      "#\n",
      "#  if Xw=y, then w=(X^-1)y\n",
      "#  #=> we need the current version of the classifier, with weights set\n",
      "# some function of the classifier\n",
      "def switch_metric(classifier )\n",
      "    return 0"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# run the incremental self training algo\n",
      "#\n",
      "def self_train(X, y, labeled_ids, unlabeled_ids, R=0.5, U=1, W=0.001, alpha = 1.0:\n",
      "\n",
      "    U_step_size = 0.001\n",
      "    istep = 1\n",
      "    # or, equivalently, num_steps = 1000\n",
      "    #   U_step_size = 1/num_steps\n",
      "\n",
      "    \n",
      "    # run initial classifier\n",
      "    X_labeled = X[labeled_ids]\n",
      "    y_labeled = y[labeled_ids]\n",
      "        \n",
      "    guessed_ids = [] # or np.empty\n",
      "    \n",
      "    # loop over istep = start to finish \n",
      "    #  U_step_size*U to U in increments U_step_size\n",
      "    \n",
      "    #  or: break at maximum U steps\n",
      "    #  or: break at some convergence criteria?\n",
      "    \n",
      "    # set guessed_sample_weights \n",
      "    #  U_step = (U_step_size*istep)*U\n",
      "    U_step = (U_step_size*istep)*U\n",
      "    \n",
      "    # apply current classifier to remaining unlabeled data\n",
      "    #  note:  U = U_step\n",
      "    self_train_step(X, y, W, U_step, R, alpha, labeled_ids, guessed_ids, unlabeled_ids):\n",
      "    \n",
      "    \n",
      "    switch_labels(X, y, labeled_ids, guessed_ids, unlabeled_ids, alpha, W, U, R)\n",
      "    \n",
      "    # stop or keep switching / stop?  \n",
      "    #  just run all the way to the end?"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}