{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-17T21:16:10.892885",
     "start_time": "2017-11-17T21:15:59.539013"
    },
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div class=\"bk-root\">\n",
       "        <a href=\"http://bokeh.pydata.org\" target=\"_blank\" class=\"bk-logo bk-logo-small bk-logo-notebook\"></a>\n",
       "        <span id=\"794a8e61-52c3-4b10-9104-cb680877dba2\">Loading BokehJS ...</span>\n",
       "    </div>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/javascript": [
       "\n",
       "(function(global) {\n",
       "  function now() {\n",
       "    return new Date();\n",
       "  }\n",
       "\n",
       "  var force = \"1\";\n",
       "\n",
       "  if (typeof (window._bokeh_onload_callbacks) === \"undefined\" || force !== \"\") {\n",
       "    window._bokeh_onload_callbacks = [];\n",
       "    window._bokeh_is_loading = undefined;\n",
       "  }\n",
       "\n",
       "\n",
       "  \n",
       "  if (typeof (window._bokeh_timeout) === \"undefined\" || force !== \"\") {\n",
       "    window._bokeh_timeout = Date.now() + 5000;\n",
       "    window._bokeh_failed_load = false;\n",
       "  }\n",
       "\n",
       "  var NB_LOAD_WARNING = {'data': {'text/html':\n",
       "     \"<div style='background-color: #fdd'>\\n\"+\n",
       "     \"<p>\\n\"+\n",
       "     \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n",
       "     \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n",
       "     \"</p>\\n\"+\n",
       "     \"<ul>\\n\"+\n",
       "     \"<li>re-rerun `output_notebook()` to attempt to load from CDN again, or</li>\\n\"+\n",
       "     \"<li>use INLINE resources instead, as so:</li>\\n\"+\n",
       "     \"</ul>\\n\"+\n",
       "     \"<code>\\n\"+\n",
       "     \"from bokeh.resources import INLINE\\n\"+\n",
       "     \"output_notebook(resources=INLINE)\\n\"+\n",
       "     \"</code>\\n\"+\n",
       "     \"</div>\"}};\n",
       "\n",
       "  function display_loaded() {\n",
       "    if (window.Bokeh !== undefined) {\n",
       "      Bokeh.$(\"#794a8e61-52c3-4b10-9104-cb680877dba2\").text(\"BokehJS successfully loaded.\");\n",
       "    } else if (Date.now() < window._bokeh_timeout) {\n",
       "      setTimeout(display_loaded, 100)\n",
       "    }\n",
       "  }\n",
       "\n",
       "  function run_callbacks() {\n",
       "    window._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n",
       "    delete window._bokeh_onload_callbacks\n",
       "    console.info(\"Bokeh: all callbacks have finished\");\n",
       "  }\n",
       "\n",
       "  function load_libs(js_urls, callback) {\n",
       "    window._bokeh_onload_callbacks.push(callback);\n",
       "    if (window._bokeh_is_loading > 0) {\n",
       "      console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n",
       "      return null;\n",
       "    }\n",
       "    if (js_urls == null || js_urls.length === 0) {\n",
       "      run_callbacks();\n",
       "      return null;\n",
       "    }\n",
       "    console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n",
       "    window._bokeh_is_loading = js_urls.length;\n",
       "    for (var i = 0; i < js_urls.length; i++) {\n",
       "      var url = js_urls[i];\n",
       "      var s = document.createElement('script');\n",
       "      s.src = url;\n",
       "      s.async = false;\n",
       "      s.onreadystatechange = s.onload = function() {\n",
       "        window._bokeh_is_loading--;\n",
       "        if (window._bokeh_is_loading === 0) {\n",
       "          console.log(\"Bokeh: all BokehJS libraries loaded\");\n",
       "          run_callbacks()\n",
       "        }\n",
       "      };\n",
       "      s.onerror = function() {\n",
       "        console.warn(\"failed to load library \" + url);\n",
       "      };\n",
       "      console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n",
       "      document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
       "    }\n",
       "  };var element = document.getElementById(\"794a8e61-52c3-4b10-9104-cb680877dba2\");\n",
       "  if (element == null) {\n",
       "    console.log(\"Bokeh: ERROR: autoload.js configured with elementid '794a8e61-52c3-4b10-9104-cb680877dba2' but no matching script tag was found. \")\n",
       "    return false;\n",
       "  }\n",
       "\n",
       "  var js_urls = ['https://cdn.pydata.org/bokeh/release/bokeh-0.12.3.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.3.min.js'];\n",
       "\n",
       "  var inline_js = [\n",
       "    function(Bokeh) {\n",
       "      Bokeh.set_log_level(\"info\");\n",
       "    },\n",
       "    \n",
       "    function(Bokeh) {\n",
       "      \n",
       "      Bokeh.$(\"#794a8e61-52c3-4b10-9104-cb680877dba2\").text(\"BokehJS is loading...\");\n",
       "    },\n",
       "    function(Bokeh) {\n",
       "      console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.3.min.css\");\n",
       "      Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.3.min.css\");\n",
       "      console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.3.min.css\");\n",
       "      Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.3.min.css\");\n",
       "    }\n",
       "  ];\n",
       "\n",
       "  function run_inline_js() {\n",
       "    \n",
       "    if ((window.Bokeh !== undefined) || (force === \"1\")) {\n",
       "      for (var i = 0; i < inline_js.length; i++) {\n",
       "        inline_js[i](window.Bokeh);\n",
       "      }if (force === \"1\") {\n",
       "        display_loaded();\n",
       "      }} else if (Date.now() < window._bokeh_timeout) {\n",
       "      setTimeout(run_inline_js, 100);\n",
       "    } else if (!window._bokeh_failed_load) {\n",
       "      console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n",
       "      window._bokeh_failed_load = true;\n",
       "    } else if (!force) {\n",
       "      var cell = $(\"#794a8e61-52c3-4b10-9104-cb680877dba2\").parents('.cell').data().cell;\n",
       "      cell.output_area.append_execute_result(NB_LOAD_WARNING)\n",
       "    }\n",
       "\n",
       "  }\n",
       "\n",
       "  if (window._bokeh_is_loading === 0) {\n",
       "    console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n",
       "    run_inline_js();\n",
       "  } else {\n",
       "    load_libs(js_urls, function() {\n",
       "      console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n",
       "      run_inline_js();\n",
       "    });\n",
       "  }\n",
       "}(this));"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Custom libraries\n",
    "from datascienceutils import plotter\n",
    "from datascienceutils import analyze\n",
    "from datascienceutils import predictiveModels as pm\n",
    "\n",
    "# Standard libraries\n",
    "import json\n",
    "%matplotlib inline\n",
    "import datetime\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import random\n",
    "\n",
    "from sklearn import cross_validation\n",
    "from sklearn import metrics\n",
    "\n",
    "from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource\n",
    "from bokeh.charts import Histogram\n",
    "import bokeh\n",
    "output_notebook()\n",
    "\n",
    "from sqlalchemy import create_engine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-17T21:24:25.564440",
     "start_time": "2017-11-17T21:24:13.748673"
    },
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "twoSigmaDf = None\n",
    "with pd.HDFStore(\"/home/anand/DataScientist/data/train.h5\", \"r\") as train:\n",
    "    # Note that the \"train\" dataframe is the only dataframe in the file\n",
    "    twoSigmaDf = train.get(\"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.656220+05:30",
     "start_time": "2017-06-29T07:10:03.493Z"
    },
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "twoSigmaDf.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.656523+05:30",
     "start_time": "2017-06-29T07:10:03.495Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "twoSigmaDf.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.656802+05:30",
     "start_time": "2017-06-29T07:10:03.497Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "twoSigmaDf[twoSigmaDf.id==10]['y'].describe()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.657073+05:30",
     "start_time": "2017-06-29T07:10:03.499Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "grouped_ids = twoSigmaDf.groupby('id').count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.657341+05:30",
     "start_time": "2017-06-29T07:10:03.501Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "twoSigmaDf[twoSigmaDf.id==12]['y'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.657607+05:30",
     "start_time": "2017-06-29T07:10:03.502Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "twoSigmaDf['timestamp'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.657873+05:30",
     "start_time": "2017-06-29T07:10:03.504Z"
    },
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "twoSigmaDf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.658135+05:30",
     "start_time": "2017-06-29T07:10:03.506Z"
    },
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "twoSigmaDf.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.658428+05:30",
     "start_time": "2017-06-29T07:10:03.508Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "a = twoSigmaDf.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.658692+05:30",
     "start_time": "2017-06-29T07:10:03.509Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sampleDf = twoSigmaDf.sample(n=100000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.658954+05:30",
     "start_time": "2017-06-29T07:10:03.511Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "analyze.dist_analyze(twoSigmaDf, 'timestamp')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.659221+05:30",
     "start_time": "2017-06-29T07:10:03.513Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "analyze.dist_analyze(twoSigmaDf, 'y')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.659494+05:30",
     "start_time": "2017-06-29T07:10:03.514Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "neg_model = twoSigmaDf[twoSigmaDf.y< -0.08]\n",
    "pos_model = twoSigmaDf[twoSigmaDf.y> 0.09]\n",
    "central_model = twoSigmaDf[twoSigmaDf.y> -0.08][twoSigmaDf.y < 0.09]\n",
    "non_central_model = pd.concat([neg_model, pos_model])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.659777+05:30",
     "start_time": "2017-06-29T07:10:03.516Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "analyze.dist_analyze(non_central_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.660050+05:30",
     "start_time": "2017-06-29T07:10:03.518Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "analyze.dist_analyze(neg_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.660312+05:30",
     "start_time": "2017-06-29T07:10:03.519Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "analyze.dist_analyze(pos_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.660571+05:30",
     "start_time": "2017-06-29T07:10:03.521Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "analyze.dist_analyze(central_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.660834+05:30",
     "start_time": "2017-06-29T07:10:03.522Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "len(neg_model.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.661096+05:30",
     "start_time": "2017-06-29T07:10:03.524Z"
    },
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "neg_model.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.661355+05:30",
     "start_time": "2017-06-29T07:10:03.525Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pos_model.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.661635+05:30",
     "start_time": "2017-06-29T07:10:03.527Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "central_model.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.661999+05:30",
     "start_time": "2017-06-29T07:10:03.528Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "central_model.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.662327+05:30",
     "start_time": "2017-06-29T07:10:03.529Z"
    },
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "prediction = 2.217474e-04\n",
    "mean = twoSigmaDf['y'].mean()\n",
    "sampleDf['rmse_pred'] = sampleDf['y'].apply(lambda x: (x-prediction)**2)\n",
    "sampleDf['rmse_mean'] = sampleDf['y'].apply(lambda x: (x-mean)**2)\n",
    "\n",
    "r_square = 1 - sampleDf['rmse_pred'].sum()/sampleDf['rmse_mean'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.662637+05:30",
     "start_time": "2017-06-29T07:10:03.531Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "out = np.sqrt(abs(r_square))\n",
    "print(out* -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.663175+05:30",
     "start_time": "2017-06-29T07:10:03.532Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sample2Df = twoSigmaDf.sample(n=300000)\n",
    "prediction = 2.217474e-04\n",
    "mean = sample2Df['y'].mean()\n",
    "sample2Df['rmse_pred'] = sample2Df['y'].apply(lambda x: (x-prediction)**2)\n",
    "sample2Df['rmse_mean'] = sample2Df['y'].apply(lambda x: (x-mean)**2)\n",
    "\n",
    "r_square = 1 - sample2Df['rmse_pred'].sum()/sample2Df['rmse_mean'].sum()\n",
    "out = np.sqrt(abs(r_square))\n",
    "print(out, r_square)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.663488+05:30",
     "start_time": "2017-06-29T07:10:03.534Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sample6Df = twoSigmaDf.sample(n=600000)\n",
    "prediction = 0\n",
    "mean = sample6Df['y'].mean()\n",
    "sample6Df['rmse_pred'] = sample6Df['y'].apply(lambda x: (x-prediction)**2)\n",
    "sample6Df['rmse_mean'] = sample6Df['y'].apply(lambda x: (x-mean)**2)\n",
    "\n",
    "r_square = 1 - sample6Df['rmse_pred'].sum()/sample6Df['rmse_mean'].sum()\n",
    "out = np.sqrt(abs(r_square))\n",
    "print(out, r_square)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.663778+05:30",
     "start_time": "2017-06-29T07:10:03.536Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "prediction = 0\n",
    "mean = twoSigmaDf['y'].mean()\n",
    "central_mean = central_model['y'].mean()\n",
    "def get_prediction(x):\n",
    "    if x <= -0.08:\n",
    "        return -0.085\n",
    "    elif x >= 0.09:\n",
    "        return 0.095\n",
    "    else:\n",
    "        return central_mean\n",
    "\n",
    "def get_r_square(prediction, actual):\n",
    "    assert len(prediction) == len(actual), \"Lengths of actuals and predictions should be equal\"\n",
    "    rmse_pred = 0\n",
    "    rmse_mean = 0\n",
    "    mean = np.mean(actual)\n",
    "    for pred,ea in zip(prediction, actual):\n",
    "        rmse_pred += (ea - pred)**2\n",
    "        rmse_mean += (ea - mean)**2\n",
    "    return 1 - rmse_pred/rmse_mean\n",
    "    \n",
    "    \n",
    "    pass\n",
    "    \n",
    "#twoSigmaDf['prediction'] = twoSigmaDf['y'].apply(lambda x: get_prediction(x))\n",
    "twoSigmaDf['rmse_pred'] = twoSigmaDf['y'].apply(lambda x: (x-get_prediction(x))**2)\n",
    "twoSigmaDf['rmse_mean'] = twoSigmaDf['y'].apply(lambda x: (x-mean)**2)\n",
    "\n",
    "r_square = 1 - twoSigmaDf['rmse_pred'].sum()/twoSigmaDf['rmse_mean'].sum()\n",
    "out = np.sqrt(abs(r_square))\n",
    "\n",
    "print(out, r_square)\n",
    "print(get_r_square(twoSigmaDf['y'].apply(lambda x: get_prediction(x)), twoSigmaDf['y']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.664053+05:30",
     "start_time": "2017-06-29T07:10:03.537Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_category(x):\n",
    "    if x <= -0.08:\n",
    "        return 'Neg'\n",
    "    elif x >= 0.09:\n",
    "        return \"Pos\"\n",
    "    else:\n",
    "        return \"Mid\"\n",
    "twoSigmaDf['category'] = twoSigmaDf['y'].apply(lambda x: get_category(x))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.664317+05:30",
     "start_time": "2017-06-29T07:10:03.539Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "a = twoSigmaDf.groupby('category').describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.664581+05:30",
     "start_time": "2017-06-29T07:10:03.541Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "b = a.transpose().reset_index()\n",
    "b.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.664844+05:30",
     "start_time": "2017-06-29T07:10:03.542Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from datascienceutils import utils\n",
    "anova_dict = dict()\n",
    "for col in twoSigmaDf.columns:\n",
    "    anova_dict[col] = utils.calculate_anova(sample2Df, 'y', col)\n",
    "print(anova_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.665113+05:30",
     "start_time": "2017-06-29T07:10:03.544Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "filteredDf = twoSigmaDf[['id', 'timestamp', 'technical_34', 'technical_20']]\n",
    "filteredDf.fillna(filteredDf.mean(), inplace=True)\n",
    "target = twoSigmaDf['category']\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "gnb = GaussianNB()\n",
    "#gnb_model = pm.train(filteredDf, target, 'gaussianNB')\n",
    "gnb.fit(filteredDf, target)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-29T12:40:03.665378+05:30",
     "start_time": "2017-06-29T07:10:03.545Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_pred_by_category(x):\n",
    "    if x == 'Mid':\n",
    "        return 0\n",
    "    elif x == 'Neg':\n",
    "        return -0.085\n",
    "    else:\n",
    "        return 0.095\n",
    "num_predictions = list(map(get_pred_by_category, gnb.predict(filteredDf)))\n",
    "\n",
    "get_r_square(num_predictions, list(twoSigmaDf['y']))"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "latex_envs": {
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 0
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}