{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:57:31.109861", "start_time": "2017-11-17T21:57:31.095000" }, "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " Loading BokehJS ...\n", "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", "(function(global) {\n", " function now() {\n", " return new Date();\n", " }\n", "\n", " var force = \"1\";\n", "\n", " if (typeof (window._bokeh_onload_callbacks) === \"undefined\" || force !== \"\") {\n", " window._bokeh_onload_callbacks = [];\n", " window._bokeh_is_loading = undefined;\n", " }\n", "\n", "\n", " \n", " if (typeof (window._bokeh_timeout) === \"undefined\" || force !== \"\") {\n", " window._bokeh_timeout = Date.now() + 5000;\n", " window._bokeh_failed_load = false;\n", " }\n", "\n", " var NB_LOAD_WARNING = {'data': {'text/html':\n", " \"
\\n\"+\n", " \"

\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"

\\n\"+\n", " \"\\n\"+\n", " \"\\n\"+\n", " \"from bokeh.resources import INLINE\\n\"+\n", " \"output_notebook(resources=INLINE)\\n\"+\n", " \"\\n\"+\n", " \"
\"}};\n", "\n", " function display_loaded() {\n", " if (window.Bokeh !== undefined) {\n", " Bokeh.$(\"#c6289e64-189d-4236-a670-421feb29b434\").text(\"BokehJS successfully loaded.\");\n", " } else if (Date.now() < window._bokeh_timeout) {\n", " setTimeout(display_loaded, 100)\n", " }\n", " }\n", "\n", " function run_callbacks() {\n", " window._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n", " delete window._bokeh_onload_callbacks\n", " console.info(\"Bokeh: all callbacks have finished\");\n", " }\n", "\n", " function load_libs(js_urls, callback) {\n", " window._bokeh_onload_callbacks.push(callback);\n", " if (window._bokeh_is_loading > 0) {\n", " console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", " return null;\n", " }\n", " if (js_urls == null || js_urls.length === 0) {\n", " run_callbacks();\n", " return null;\n", " }\n", " console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", " window._bokeh_is_loading = js_urls.length;\n", " for (var i = 0; i < js_urls.length; i++) {\n", " var url = js_urls[i];\n", " var s = document.createElement('script');\n", " s.src = url;\n", " s.async = false;\n", " s.onreadystatechange = s.onload = function() {\n", " window._bokeh_is_loading--;\n", " if (window._bokeh_is_loading === 0) {\n", " console.log(\"Bokeh: all BokehJS libraries loaded\");\n", " run_callbacks()\n", " }\n", " };\n", " s.onerror = function() {\n", " console.warn(\"failed to load library \" + url);\n", " };\n", " console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", " document.getElementsByTagName(\"head\")[0].appendChild(s);\n", " }\n", " };var element = document.getElementById(\"c6289e64-189d-4236-a670-421feb29b434\");\n", " if (element == null) {\n", " console.log(\"Bokeh: ERROR: autoload.js configured with elementid 'c6289e64-189d-4236-a670-421feb29b434' but no matching script tag was found. \")\n", " return false;\n", " }\n", "\n", " var js_urls = ['https://cdn.pydata.org/bokeh/release/bokeh-0.12.3.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.3.min.js'];\n", "\n", " var inline_js = [\n", " function(Bokeh) {\n", " Bokeh.set_log_level(\"info\");\n", " },\n", " \n", " function(Bokeh) {\n", " \n", " Bokeh.$(\"#c6289e64-189d-4236-a670-421feb29b434\").text(\"BokehJS is loading...\");\n", " },\n", " function(Bokeh) {\n", " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.3.min.css\");\n", " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.3.min.css\");\n", " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.3.min.css\");\n", " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.3.min.css\");\n", " }\n", " ];\n", "\n", " function run_inline_js() {\n", " \n", " if ((window.Bokeh !== undefined) || (force === \"1\")) {\n", " for (var i = 0; i < inline_js.length; i++) {\n", " inline_js[i](window.Bokeh);\n", " }if (force === \"1\") {\n", " display_loaded();\n", " }} else if (Date.now() < window._bokeh_timeout) {\n", " setTimeout(run_inline_js, 100);\n", " } else if (!window._bokeh_failed_load) {\n", " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", " window._bokeh_failed_load = true;\n", " } else if (!force) {\n", " var cell = $(\"#c6289e64-189d-4236-a670-421feb29b434\").parents('.cell').data().cell;\n", " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", " }\n", "\n", " }\n", "\n", " if (window._bokeh_is_loading === 0) {\n", " console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", " run_inline_js();\n", " } else {\n", " load_libs(js_urls, function() {\n", " console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n", " run_inline_js();\n", " });\n", " }\n", "}(this));" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Custom libraries\n", "from datascienceutils import plotter\n", "from datascienceutils import analyze\n", "\n", "# Standard libraries\n", "import json\n", "%matplotlib inline\n", "import datetime\n", "import numpy as np\n", "import pandas as pd\n", "import random\n", "\n", "\n", "from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource\n", "from bokeh.charts import Histogram\n", "import bokeh\n", "output_notebook()\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:57:48.071794", "start_time": "2017-11-17T21:57:48.022371" }, "collapsed": false, "scrolled": false }, "outputs": [ { "ename": "FileNotFoundError", "evalue": "File b'/home/anand/DataScientist/data/metadata.csv' does not exist", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mirisDf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'/home/anand/DataScientist/data/metadata.csv'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;32m/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)\u001b[0m\n\u001b[0;32m 653\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[0;32m 654\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 655\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 656\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 657\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 403\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 404\u001b[0m \u001b[1;31m# Create the parser.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 405\u001b[1;33m \u001b[0mparser\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 406\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 407\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 762\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 763\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 764\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 765\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 766\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[1;34m(self, engine)\u001b[0m\n\u001b[0;32m 983\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'c'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 984\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'c'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 985\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 986\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 987\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'python'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, src, **kwds)\u001b[0m\n\u001b[0;32m 1603\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'allow_leading_cols'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex_col\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1604\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1605\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1606\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1607\u001b[0m \u001b[1;31m# XXX\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:4209)\u001b[1;34m()\u001b[0m\n", "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source (pandas/_libs/parsers.c:8873)\u001b[1;34m()\u001b[0m\n", "\u001b[1;31mFileNotFoundError\u001b[0m: File b'/home/anand/DataScientist/data/metadata.csv' does not exist" ] } ], "source": [ "irisDf = pd.read_csv('/home/anand/DataScientist/data/metadata.csv')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2017-02-25T22:42:13.107760", "start_time": "2017-02-25T22:42:12.580525" }, "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/numpy/lib/function_base.py:4116: RuntimeWarning: Invalid value encountered in percentile\n", " interpolation=interpolation)\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
call_durationantenna_id
count342.0000000.0
mean228.997076NaN
std441.944354NaN
min1.000000NaN
25%NaNNaN
50%NaNNaN
75%NaNNaN
max3072.000000NaN
\n", "
" ], "text/plain": [ " call_duration antenna_id\n", "count 342.000000 0.0\n", "mean 228.997076 NaN\n", "std 441.944354 NaN\n", "min 1.000000 NaN\n", "25% NaN NaN\n", "50% NaN NaN\n", "75% NaN NaN\n", "max 3072.000000 NaN" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "irisDf.describe()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2017-02-25T22:42:13.241292", "start_time": "2017-02-25T22:42:13.110968" }, "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
interactiondirectioncorrespondent_iddatetimecall_durationantenna_id
0callout+1095358004332016-10-16 07:34:52NaNNaN
1callin+9194869578322016-10-16 12:49:4015.0NaN
2callin+9144308111002016-10-16 14:35:26101.0NaN
3callout+9189034218072016-10-16 14:37:47NaNNaN
4callout+9146342218072016-10-16 14:38:3077.0NaN
\n", "
" ], "text/plain": [ " interaction direction correspondent_id datetime call_duration \\\n", "0 call out +109535800433 2016-10-16 07:34:52 NaN \n", "1 call in +919486957832 2016-10-16 12:49:40 15.0 \n", "2 call in +914430811100 2016-10-16 14:35:26 101.0 \n", "3 call out +918903421807 2016-10-16 14:37:47 NaN \n", "4 call out +914634221807 2016-10-16 14:38:30 77.0 \n", "\n", " antenna_id \n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "irisDf.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2017-02-25T22:42:13.427606", "start_time": "2017-02-25T22:42:13.244186" }, "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "call_duration 195314.812308\n", "antenna_id NaN\n", "dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "irisDf.var()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2017-02-25T22:42:13.722517", "start_time": "2017-02-25T22:42:13.430932" }, "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "call_duration 3.96145\n", "antenna_id NaN\n", "dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "irisDf.skew()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2017-02-25T22:42:13.925090", "start_time": "2017-02-25T22:42:13.727277" }, "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
call_durationantenna_id
call_duration1.0NaN
antenna_idNaNNaN
\n", "
" ], "text/plain": [ " call_duration antenna_id\n", "call_duration 1.0 NaN\n", "antenna_id NaN NaN" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "irisDf.corr()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2017-02-25T22:42:14.133476", "start_time": "2017-02-25T22:42:13.926573" }, "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "Index(['call_duration', 'antenna_id'], dtype='object')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "irisDf.select_dtypes(include=[np.number]).columns" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2017-02-25T22:43:03.564120", "start_time": "2017-02-25T22:43:03.482091" }, "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "# Correlation btw Numerical Columns\n" ] }, { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "# Pandas correlation coefficients matrix\n", " call_duration antenna_id\n", "call_duration 1.0 NaN\n", "antenna_id NaN NaN\n", "# Pandas co-variance coefficients matrix\n", " call_duration antenna_id\n", "call_duration 195314.812308 NaN\n", "antenna_id NaN NaN\n" ] } ], "source": [ "analyze.correlation_analyze(irisDf)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2017-02-25T22:43:17.392063", "start_time": "2017-02-25T22:43:15.512536" }, "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Variance of call_duration\n", "195314.812308\n", "Skewness of call_duration\n", "3.96144995214\n", "Variance of antenna_id\n", "nan\n", "Skewness of antenna_id\n", "nan\n", "Too many categorise for col: correspondent_id can't plot pie-chart\n", "Too many categorise for col: datetime can't plot pie-chart\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:262: UserWarning: Path marker shapes currently not handled, defaulting to Circle\n", " warnings.warn(\"Path marker shapes currently not handled, defaulting to Circle\")\n", "/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/artist.py:224: MatplotlibDeprecationWarning: get_axes has been deprecated in mpl 1.5, please use the\n", "axes property. A removal date has not been set.\n", " stacklevel=1)\n", "/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:295: UserWarning: Path marker sizes support is limited and may not display as expected\n", " warnings.warn(\"Path marker sizes support is limited and may not display as expected\")\n" ] }, { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "analyze.dist_analyze(irisDf)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2017-02-25T22:46:00.139444", "start_time": "2017-02-25T22:45:59.910923" }, "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Variance of call_duration\n", "195314.812308\n", "Skewness of call_duration\n", "3.96144995214\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:262: UserWarning: Path marker shapes currently not handled, defaulting to Circle\n", " warnings.warn(\"Path marker shapes currently not handled, defaulting to Circle\")\n", "/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/artist.py:224: MatplotlibDeprecationWarning: get_axes has been deprecated in mpl 1.5, please use the\n", "axes property. A removal date has not been set.\n", " stacklevel=1)\n", "/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:295: UserWarning: Path marker sizes support is limited and may not display as expected\n", " warnings.warn(\"Path marker sizes support is limited and may not display as expected\")\n" ] }, { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "analyze.dist_analyze(irisDf, 'call_duration')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2017-02-25T22:46:00.674097", "start_time": "2017-02-25T22:46:00.141103" }, "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1752 1752\n" ] }, { "ename": "ValueError", "evalue": "Input contains NaN, infinity or a value too large for dtype('float64').", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0manalyze\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mregression_analyze\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mirisDf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'call_duration'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'datetime'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;32m/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.1.10-py3.5.egg/datascienceutils/analyze.py\u001b[0m in \u001b[0;36mregression_analyze\u001b[1;34m(df, col1, col2, trainsize, non_linear)\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[0mtarget\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnew_df\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcol2\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 160\u001b[0m models = [\n\u001b[1;32m--> 161\u001b[1;33m \u001b[0mpm\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnew_df\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumn\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcol1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodelType\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'LinearRegression'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 162\u001b[0m \u001b[0mpm\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnew_df\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumn\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcol1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodelType\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'RidgeRegression'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 163\u001b[0m \u001b[0mpm\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnew_df\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumn\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcol1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodelType\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'RidgeRegressionCV'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.1.10-py3.5.egg/datascienceutils/predictiveModels.py\u001b[0m in \u001b[0;36mtrain\u001b[1;34m(dataframe, target, modelType, column, **kwargs)\u001b[0m\n\u001b[0;32m 30\u001b[0m \u001b[0msource\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdataframe\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcolumn\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtarget\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 31\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msource\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtarget\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 32\u001b[1;33m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msource\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 33\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataframe\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/linear_model/base.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 510\u001b[0m \u001b[0mn_jobs_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 511\u001b[0m X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],\n\u001b[1;32m--> 512\u001b[1;33m y_numeric=True, multi_output=True)\n\u001b[0m\u001b[0;32m 513\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 514\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0msample_weight\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0matleast_1d\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msample_weight\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[1;34m(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)\u001b[0m\n\u001b[0;32m 519\u001b[0m X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,\n\u001b[0;32m 520\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mallow_nd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mensure_min_samples\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 521\u001b[1;33m ensure_min_features, warn_on_dtype, estimator)\n\u001b[0m\u001b[0;32m 522\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mmulti_output\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 523\u001b[0m y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,\n", "\u001b[1;32m/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[0;32m 405\u001b[0m % (array.ndim, estimator_name))\n\u001b[0;32m 406\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 407\u001b[1;33m \u001b[0m_assert_all_finite\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 408\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 409\u001b[0m \u001b[0mshape_repr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_shape_repr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[1;34m(X)\u001b[0m\n\u001b[0;32m 56\u001b[0m and not np.isfinite(X).all()):\n\u001b[0;32m 57\u001b[0m raise ValueError(\"Input contains NaN, infinity\"\n\u001b[1;32m---> 58\u001b[1;33m \" or a value too large for %r.\" % X.dtype)\n\u001b[0m\u001b[0;32m 59\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float64')." ] } ], "source": [ "analyze.regression_analyze(irisDf, 'call_duration', 'datetime')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2017-02-25T22:46:00.675254", "start_time": "2017-02-25T17:15:59.923Z" }, "collapsed": false }, "outputs": [], "source": [ "irisDf.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": false }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" }, "latex_envs": { "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 0 } }, "nbformat": 4, "nbformat_minor": 0 }