{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:16:10.892885", "start_time": "2017-11-17T21:15:59.539013" }, "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", " \"This module will be removed in 0.20.\", DeprecationWarning)\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " Loading BokehJS ...\n", "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", "(function(global) {\n", " function now() {\n", " return new Date();\n", " }\n", "\n", " var force = \"1\";\n", "\n", " if (typeof (window._bokeh_onload_callbacks) === \"undefined\" || force !== \"\") {\n", " window._bokeh_onload_callbacks = [];\n", " window._bokeh_is_loading = undefined;\n", " }\n", "\n", "\n", " \n", " if (typeof (window._bokeh_timeout) === \"undefined\" || force !== \"\") {\n", " window._bokeh_timeout = Date.now() + 5000;\n", " window._bokeh_failed_load = false;\n", " }\n", "\n", " var NB_LOAD_WARNING = {'data': {'text/html':\n", " \"
\\n\"+\n", " \"

\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"

\\n\"+\n", " \"\\n\"+\n", " \"\\n\"+\n", " \"from bokeh.resources import INLINE\\n\"+\n", " \"output_notebook(resources=INLINE)\\n\"+\n", " \"\\n\"+\n", " \"
\"}};\n", "\n", " function display_loaded() {\n", " if (window.Bokeh !== undefined) {\n", " Bokeh.$(\"#794a8e61-52c3-4b10-9104-cb680877dba2\").text(\"BokehJS successfully loaded.\");\n", " } else if (Date.now() < window._bokeh_timeout) {\n", " setTimeout(display_loaded, 100)\n", " }\n", " }\n", "\n", " function run_callbacks() {\n", " window._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n", " delete window._bokeh_onload_callbacks\n", " console.info(\"Bokeh: all callbacks have finished\");\n", " }\n", "\n", " function load_libs(js_urls, callback) {\n", " window._bokeh_onload_callbacks.push(callback);\n", " if (window._bokeh_is_loading > 0) {\n", " console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", " return null;\n", " }\n", " if (js_urls == null || js_urls.length === 0) {\n", " run_callbacks();\n", " return null;\n", " }\n", " console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", " window._bokeh_is_loading = js_urls.length;\n", " for (var i = 0; i < js_urls.length; i++) {\n", " var url = js_urls[i];\n", " var s = document.createElement('script');\n", " s.src = url;\n", " s.async = false;\n", " s.onreadystatechange = s.onload = function() {\n", " window._bokeh_is_loading--;\n", " if (window._bokeh_is_loading === 0) {\n", " console.log(\"Bokeh: all BokehJS libraries loaded\");\n", " run_callbacks()\n", " }\n", " };\n", " s.onerror = function() {\n", " console.warn(\"failed to load library \" + url);\n", " };\n", " console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", " document.getElementsByTagName(\"head\")[0].appendChild(s);\n", " }\n", " };var element = document.getElementById(\"794a8e61-52c3-4b10-9104-cb680877dba2\");\n", " if (element == null) {\n", " console.log(\"Bokeh: ERROR: autoload.js configured with elementid '794a8e61-52c3-4b10-9104-cb680877dba2' but no matching script tag was found. 
\")\n", " return false;\n", " }\n", "\n", " var js_urls = ['https://cdn.pydata.org/bokeh/release/bokeh-0.12.3.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.3.min.js'];\n", "\n", " var inline_js = [\n", " function(Bokeh) {\n", " Bokeh.set_log_level(\"info\");\n", " },\n", " \n", " function(Bokeh) {\n", " \n", " Bokeh.$(\"#794a8e61-52c3-4b10-9104-cb680877dba2\").text(\"BokehJS is loading...\");\n", " },\n", " function(Bokeh) {\n", " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.3.min.css\");\n", " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.3.min.css\");\n", " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.3.min.css\");\n", " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.3.min.css\");\n", " }\n", " ];\n", "\n", " function run_inline_js() {\n", " \n", " if ((window.Bokeh !== undefined) || (force === \"1\")) {\n", " for (var i = 0; i < inline_js.length; i++) {\n", " inline_js[i](window.Bokeh);\n", " }if (force === \"1\") {\n", " display_loaded();\n", " }} else if (Date.now() < window._bokeh_timeout) {\n", " setTimeout(run_inline_js, 100);\n", " } else if (!window._bokeh_failed_load) {\n", " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", " window._bokeh_failed_load = true;\n", " } else if (!force) {\n", " var cell = $(\"#794a8e61-52c3-4b10-9104-cb680877dba2\").parents('.cell').data().cell;\n", " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", " }\n", "\n", " }\n", "\n", " if (window._bokeh_is_loading === 0) {\n", " console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", " run_inline_js();\n", " } else {\n", " load_libs(js_urls, function() {\n", " console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n", " run_inline_js();\n", " });\n", " }\n", "}(this));" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Custom 
libraries\n", "from datascienceutils import plotter\n", "from datascienceutils import analyze\n", "from datascienceutils import predictiveModels as pm\n", "\n", "# Standard libraries\n", "import json\n", "%matplotlib inline\n", "import datetime\n", "import numpy as np\n", "import pandas as pd\n", "import random\n", "\n", "from sklearn import cross_validation\n", "from sklearn import metrics\n", "\n", "from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource\n", "from bokeh.charts import Histogram\n", "import bokeh\n", "output_notebook()\n", "\n", "from sqlalchemy import create_engine" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:24:25.564440", "start_time": "2017-11-17T21:24:13.748673" }, "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "twoSigmaDf = None\n", "with pd.HDFStore(\"/home/anand/DataScientist/data/train.h5\", \"r\") as train:\n", " # Note that the \"train\" dataframe is the only dataframe in the file\n", " twoSigmaDf = train.get(\"train\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2017-06-29T12:40:03.656220+05:30", "start_time": "2017-06-29T07:10:03.493Z" }, "collapsed": true, "scrolled": false }, "outputs": [], "source": [ "twoSigmaDf.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2017-06-29T12:40:03.656523+05:30", "start_time": "2017-06-29T07:10:03.495Z" }, "collapsed": true }, "outputs": [], "source": [ "twoSigmaDf.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2017-06-29T12:40:03.656802+05:30", "start_time": "2017-06-29T07:10:03.497Z" }, "collapsed": true }, "outputs": [], "source": [ "twoSigmaDf[twoSigmaDf.id==10]['y'].describe()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2017-06-29T12:40:03.657073+05:30", 
# %% Non-null counts of every column, grouped by `id`
grouped_ids = twoSigmaDf.groupby('id').count()

# %% Summary of the target `y` for a single id
twoSigmaDf.loc[twoSigmaDf['id'] == 12, 'y'].describe()

# %% Number of distinct timestamps
twoSigmaDf['timestamp'].nunique()

# %%
twoSigmaDf.head()

# %% Pairwise correlations
# Computed once and kept in `a`; the original ran the full correlation twice
# (once to display, once to assign).
a = twoSigmaDf.corr()
a

# %% Fixed-size random sample used by the later R^2 experiments
# Seeded so that Restart & Run All reproduces the same rows.
SAMPLE_SEED = 42
sampleDf = twoSigmaDf.sample(n=100000, random_state=SAMPLE_SEED)

# %% Distribution of the time index and of the target
analyze.dist_analyze(twoSigmaDf, 'timestamp')

# %%
analyze.dist_analyze(twoSigmaDf, 'y')

# %% Split the rows into negative-tail / central / positive-tail by `y`
# The cut-offs match the ones get_prediction()/get_category() use further
# down (<= NEG_CUTOFF is "Neg", >= POS_CUTOFF is "Pos"), so the three frames
# form an exhaustive partition of the non-NaN rows. The original strict
# comparisons silently dropped rows sitting exactly on a cut-off.
NEG_CUTOFF = -0.08
POS_CUTOFF = 0.09

y_values = twoSigmaDf['y']
neg_model = twoSigmaDf[y_values <= NEG_CUTOFF]
pos_model = twoSigmaDf[y_values >= POS_CUTOFF]
# One combined mask instead of df[mask1][mask2]: the chained form applies a
# full-length boolean Series to an already filtered frame, which pandas has
# to re-align (and warns about).
central_model = twoSigmaDf[(y_values > NEG_CUTOFF) & (y_values < POS_CUTOFF)]
non_central_model = pd.concat([neg_model, pos_model])

# %% Distribution analysis of each segment
# NOTE(review): dist_analyze is called without a column name here, unlike the
# calls above - confirm that the helper supports a whole-frame call.
analyze.dist_analyze(non_central_model)

# %%
analyze.dist_analyze(neg_model)

# %%
analyze.dist_analyze(pos_model)

# %%
analyze.dist_analyze(central_model)
# %% Per-segment summaries
len(neg_model.columns)

# %%
neg_model.describe()

# %%
pos_model.describe()

# %%
central_model.describe()

# %%
central_model.corr()

# %% R^2 of a constant prediction, evaluated on the 100k-row sample
# NOTE(review): 2.217474e-04 is presumably the mean of `y` read off an earlier
# describe() output - confirm where the constant comes from.
prediction = 2.217474e-04
# The reference mean must come from the rows being scored. The original used
# the full-frame mean against sample residuals, which mixes two populations
# in one ratio and disagrees with the later sample cells.
mean = sampleDf['y'].mean()
# Column names are kept for compatibility; they hold squared errors, not RMSE.
sampleDf['rmse_pred'] = (sampleDf['y'] - prediction) ** 2
sampleDf['rmse_mean'] = (sampleDf['y'] - mean) ** 2

r_square = 1 - sampleDf['rmse_pred'].sum() / sampleDf['rmse_mean'].sum()

# %% Signed root of R^2
# The sign follows r_square instead of being hard-coded to negative, so a
# positive R^2 is no longer reported as a negative score.
out = np.sign(r_square) * np.sqrt(abs(r_square))
print(out)
# %% Scoring helpers
def get_r_square(prediction, actual):
    """Return the coefficient of determination of `prediction` against `actual`.

    Computes 1 - SS_res / SS_tot, where SS_tot is taken around the mean of
    `actual`.

    Parameters
    ----------
    prediction, actual : equal-length sequences of numbers (list, ndarray,
        pandas Series).

    Returns
    -------
    float
        The R^2 value; NaN when `actual` has zero variance, because the ratio
        is undefined in that case.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    assert len(prediction) == len(actual), "Lengths of actuals and predictions should be equal"
    predicted = np.asarray(prediction, dtype=float)
    observed = np.asarray(actual, dtype=float)
    residual_ss = np.sum((observed - predicted) ** 2)
    total_ss = np.sum((observed - observed.mean()) ** 2)
    if total_ss == 0:
        return float('nan')
    return 1 - residual_ss / total_ss


def _score_constant_prediction(frame, prediction):
    """Score a constant `prediction` against frame['y'] and return its R^2.

    Also stores the per-row squared errors on `frame` as 'rmse_pred' and
    'rmse_mean', as the original cells did (the names are historical; the
    values are squared errors, not RMSE).
    """
    target = frame['y']
    frame['rmse_pred'] = (target - prediction) ** 2
    frame['rmse_mean'] = (target - target.mean()) ** 2
    return get_r_square(np.full(len(target), prediction), target)


# %% Constant prediction on a 300k-row sample
SAMPLE_SEED = 42  # seeded so a fresh run draws the same rows
sample2Df = twoSigmaDf.sample(n=300000, random_state=SAMPLE_SEED)
prediction = 2.217474e-04
r_square = _score_constant_prediction(sample2Df, prediction)
out = np.sqrt(abs(r_square))
print(out, r_square)

# %% Zero prediction on a 600k-row sample
sample6Df = twoSigmaDf.sample(n=600000, random_state=SAMPLE_SEED)
prediction = 0
r_square = _score_constant_prediction(sample6Df, prediction)
out = np.sqrt(abs(r_square))
print(out, r_square)

# %% Three-level prediction driven by the true target value
central_mean = central_model['y'].mean()


def get_prediction(x):
    """Map a target value `x` to one of three fixed predictions.

    Returns -0.085 for x <= -0.08, 0.095 for x >= 0.09, and the module-level
    `central_mean` otherwise.

    NOTE(review): the input is the true `y`, so the resulting score is an
    upper bound for a perfect three-way classifier, not a model result.
    """
    if x <= -0.08:
        return -0.085
    if x >= 0.09:
        return 0.095
    return central_mean


# %%
oracle_prediction = twoSigmaDf['y'].apply(get_prediction)
# Squared-error columns are kept on the frame for compatibility with the
# original notebook state.
twoSigmaDf['rmse_pred'] = (twoSigmaDf['y'] - oracle_prediction) ** 2
twoSigmaDf['rmse_mean'] = (twoSigmaDf['y'] - twoSigmaDf['y'].mean()) ** 2

# A single computation through get_r_square replaces the hand-written ratio
# and the separate cross-check print of the same quantity.
r_square = get_r_square(oracle_prediction, twoSigmaDf['y'])
out = np.sqrt(abs(r_square))
print(out, r_square)

# %% Label every row with its target segment
def get_category(x):
    """Return 'Neg' for x <= -0.08, 'Pos' for x >= 0.09, otherwise 'Mid'."""
    if x <= -0.08:
        return 'Neg'
    if x >= 0.09:
        return "Pos"
    return "Mid"


twoSigmaDf['category'] = twoSigmaDf['y'].apply(get_category)

# %% Per-category summary statistics
a = twoSigmaDf.groupby('category').describe()

# %%
b = a.transpose().reset_index()
b.columns

# %% ANOVA of the target against each column of the 300k-row sample
from datascienceutils import utils

# Iterate over the columns that exist in sample2Df. The sample was drawn
# before 'category' was added to twoSigmaDf, so looping over
# twoSigmaDf.columns asks the helper for a column the sample does not have
# (presumably a KeyError - calculate_anova is not visible here).
# The target itself and the squared-error helper columns derived from it are
# skipped because they are functions of `y`.
derived_columns = {'y', 'rmse_pred', 'rmse_mean'}
anova_dict = {
    col: utils.calculate_anova(sample2Df, 'y', col)
    for col in sample2Df.columns
    if col not in derived_columns
}
print(anova_dict)
# %% Gaussian naive Bayes on four columns, predicting the target segment
from sklearn.naive_bayes import GaussianNB

feature_columns = ['id', 'timestamp', 'technical_34', 'technical_20']
raw_features = twoSigmaDf[feature_columns]
# Build a new, mean-imputed frame. The original called
# fillna(..., inplace=True) on a column subset of twoSigmaDf, which mutates a
# derived object and triggers pandas' SettingWithCopyWarning.
filteredDf = raw_features.fillna(raw_features.mean())
target = twoSigmaDf['category']

gnb = GaussianNB()
gnb.fit(filteredDf, target)

# %% Convert predicted segments back to numeric predictions and score them
def get_pred_by_category(x):
    """Map a predicted category label to a numeric prediction.

    'Mid' -> 0, 'Neg' -> -0.085, any other label -> 0.095.

    NOTE(review): get_prediction() uses `central_mean` for the middle segment
    while this uses 0 - confirm which is intended.
    """
    if x == 'Mid':
        return 0
    if x == 'Neg':
        return -0.085
    return 0.095


# NOTE(review): the model is scored on the same rows it was fitted on, so
# this R^2 is an in-sample figure; a held-out split is needed for an honest
# estimate.
num_predictions = [get_pred_by_category(label) for label in gnb.predict(filteredDf)]

get_r_square(num_predictions, twoSigmaDf['y'].values)