{ "metadata": { "name": "", "signature": "sha256:a127b5e035d0b64fb162511b97690025c0f8e4b4b53f5597b8d5fe17d1f91dfe" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "from __future__ import print_function\n", "import numpy as np\n", "import googleprediction" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Google's API library is set up to work well with command-line applications. Mimic some of that behavior here." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Initialize Model\n", "==============" ] }, { "cell_type": "code", "collapsed": false, "input": [ "model = googleprediction.GooglePredictor(\n", " \"myproject\",\n", " \"mybucket/X_train_spectra_ave_goog.csv\",\n", " \"tswift_fft_ave\",\n", " \"client_secrets.json\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": {}, "source": [ "List Available Models\n", "============" ] }, { "cell_type": "code", "collapsed": false, "input": [ "model.list()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fit Model to Training Data in the Storage Bucket\n", "=============" ] }, { "cell_type": "code", "collapsed": false, "input": [ "model.fit('CLASSIFICATION')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "List Model Features\n", "======" ] }, { "cell_type": "code", "collapsed": false, "input": [ "model.get_params()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load Data Files for Testing\n", "===========" ] }, { "cell_type": "code", "collapsed": false, "input": [ "with np.load(\"data_files.npz\") as data:\n", " X_train = data['X_train']\n", " Y_train = data['Y_train']\n", " X_test = 
data['X_test']\n", " Y_test = data['Y_test']\n", " X_comp = data['X_comp']\n", "del data" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "X_train = np.float64(X_train)\n", "X_test = np.float64(X_test)\n", "X_comp = np.float64(X_comp)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute Frequency Spectra for Input Features\n", "=============" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def convert_to_spectra(X):\n", "    \"\"\"Return the one-sided FFT amplitude spectrum of each row of X.\n", "\n", "    Every row is transformed with np.fft.fft; bin 0 is dropped, bins\n", "    1 .. ceil(n/2) - 1 are kept, scaled by 2/n and reduced to their\n", "    magnitudes. The per-row results are stacked with np.array.\n", "    \"\"\"\n", "    spectra = []\n", "    for row in X:\n", "        xfft = np.fft.fft(row)\n", "        n = len(xfft)\n", "        # Integer ceil(n / 2): np.ceil returns a float, which newer NumPy\n", "        # releases reject as a slice bound.\n", "        half_n = (n + 1) // 2\n", "        xfft = (2.0 / n) * xfft[1:half_n]\n", "        spectra.append(np.abs(xfft))\n", "    return np.array(spectra)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "X_train_spectra = convert_to_spectra(X_train)\n", "X_test_spectra = convert_to_spectra(X_test)\n", "X_comp_spectra = convert_to_spectra(X_comp)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Smooth the Spectra\n", "========" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def moving_average(X, n=3):\n", "    \"\"\"Smooth each row of X with an n-point moving average.\n", "\n", "    Each output row holds the means of consecutive length-n windows, so a\n", "    row of length L >= n yields L - n + 1 values. The input rows are not\n", "    modified.\n", "\n", "    Raises ValueError if n is smaller than 1.\n", "    \"\"\"\n", "    if n < 1:\n", "        raise ValueError(\"n must be a positive integer\")\n", "    smoothed = []\n", "    for row in X:\n", "        # np.cumsum allocates a new array, so the caller's row is untouched.\n", "        csum = np.cumsum(row)\n", "        # Cumulative sums n apart differ by exactly one window's sum.\n", "        csum[n:] = csum[n:] - csum[:-n]\n", "        smoothed.append(csum[n - 1:] / n)\n", "    return np.array(smoothed)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "X_train_spectra = moving_average(X_train_spectra, n=5)\n", "X_test_spectra = moving_average(X_test_spectra, n=5)\n", "X_comp_spectra = moving_average(X_comp_spectra, n=5)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": 
[ "# Cast all three smoothed spectra sets to int16 in a single pass.\n", "X_train_spectra, X_test_spectra, X_comp_spectra = map(\n", "    np.int16, (X_train_spectra, X_test_spectra, X_comp_spectra))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "Y_test.shape" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 15, "text": [ "(6720,)" ] } ], "prompt_number": 15 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Predict on the Test Set\n", "================" ] }, { "cell_type": "code", "collapsed": false, "input": [ "out = model.predict(X_test_spectra)\n", "print(out)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", "=======================\n", "Making some predictions\n", "=======================\n", "[u'1' u'0' u'1' ..., u'1' u'1' u'1']" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "out = np.int16(out)\n", "out.shape" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 17, "text": [ "(6720,)" ] } ], "prompt_number": 17 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute Performance on the Test Set\n", "===============" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "print(classification_report(np.int16(Y_test), out))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " precision recall f1-score support\n", "\n", " 0 0.96 0.93 0.94 3381\n", " 1 0.93 0.96 0.94 3339\n", "\n", "avg / total 0.94 0.94 0.94 6720\n", "\n" ] } ], 
"prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "confusion_matrix(Y_test, out, labels=[0, 1])" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 20, "text": [ "array([[3141, 240],\n", " [ 145, 3194]])" ] } ], "prompt_number": 20 }, { "cell_type": "code", "collapsed": false, "input": [ "print(accuracy_score(Y_test, out))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.942708333333\n" ] } ], "prompt_number": 21 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Save Predicitons\n", "==========" ] }, { "cell_type": "code", "collapsed": false, "input": [ "np.savetxt(\"gpapi_test_pred_fft.csv\", np.array(out,dtype=int), delimiter=',', fmt='%i')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }