{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:26:38.391077", "start_time": "2017-11-17T21:26:35.782547" }, "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", " \"This module will be removed in 0.20.\", DeprecationWarning)\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " Loading BokehJS ...\n", "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", "(function(global) {\n", " function now() {\n", " return new Date();\n", " }\n", "\n", " var force = \"1\";\n", "\n", " if (typeof (window._bokeh_onload_callbacks) === \"undefined\" || force !== \"\") {\n", " window._bokeh_onload_callbacks = [];\n", " window._bokeh_is_loading = undefined;\n", " }\n", "\n", "\n", " \n", " if (typeof (window._bokeh_timeout) === \"undefined\" || force !== \"\") {\n", " window._bokeh_timeout = Date.now() + 5000;\n", " window._bokeh_failed_load = false;\n", " }\n", "\n", " var NB_LOAD_WARNING = {'data': {'text/html':\n", " \"
\\n\"+\n", " \"

\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"

\\n\"+\n", " \"\\n\"+\n", " \"\\n\"+\n", " \"from bokeh.resources import INLINE\\n\"+\n", " \"output_notebook(resources=INLINE)\\n\"+\n", " \"\\n\"+\n", " \"
\"}};\n", "\n", " function display_loaded() {\n", " if (window.Bokeh !== undefined) {\n", " Bokeh.$(\"#8ddb7ce4-4461-4d04-a332-7e6c859ff046\").text(\"BokehJS successfully loaded.\");\n", " } else if (Date.now() < window._bokeh_timeout) {\n", " setTimeout(display_loaded, 100)\n", " }\n", " }\n", "\n", " function run_callbacks() {\n", " window._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n", " delete window._bokeh_onload_callbacks\n", " console.info(\"Bokeh: all callbacks have finished\");\n", " }\n", "\n", " function load_libs(js_urls, callback) {\n", " window._bokeh_onload_callbacks.push(callback);\n", " if (window._bokeh_is_loading > 0) {\n", " console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", " return null;\n", " }\n", " if (js_urls == null || js_urls.length === 0) {\n", " run_callbacks();\n", " return null;\n", " }\n", " console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", " window._bokeh_is_loading = js_urls.length;\n", " for (var i = 0; i < js_urls.length; i++) {\n", " var url = js_urls[i];\n", " var s = document.createElement('script');\n", " s.src = url;\n", " s.async = false;\n", " s.onreadystatechange = s.onload = function() {\n", " window._bokeh_is_loading--;\n", " if (window._bokeh_is_loading === 0) {\n", " console.log(\"Bokeh: all BokehJS libraries loaded\");\n", " run_callbacks()\n", " }\n", " };\n", " s.onerror = function() {\n", " console.warn(\"failed to load library \" + url);\n", " };\n", " console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", " document.getElementsByTagName(\"head\")[0].appendChild(s);\n", " }\n", " };var element = document.getElementById(\"8ddb7ce4-4461-4d04-a332-7e6c859ff046\");\n", " if (element == null) {\n", " console.log(\"Bokeh: ERROR: autoload.js configured with elementid '8ddb7ce4-4461-4d04-a332-7e6c859ff046' but no matching script tag was found. 
\")\n", " return false;\n", " }\n", "\n", " var js_urls = ['https://cdn.pydata.org/bokeh/release/bokeh-0.12.3.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.3.min.js'];\n", "\n", " var inline_js = [\n", " function(Bokeh) {\n", " Bokeh.set_log_level(\"info\");\n", " },\n", " \n", " function(Bokeh) {\n", " \n", " Bokeh.$(\"#8ddb7ce4-4461-4d04-a332-7e6c859ff046\").text(\"BokehJS is loading...\");\n", " },\n", " function(Bokeh) {\n", " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.3.min.css\");\n", " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.3.min.css\");\n", " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.3.min.css\");\n", " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.3.min.css\");\n", " }\n", " ];\n", "\n", " function run_inline_js() {\n", " \n", " if ((window.Bokeh !== undefined) || (force === \"1\")) {\n", " for (var i = 0; i < inline_js.length; i++) {\n", " inline_js[i](window.Bokeh);\n", " }if (force === \"1\") {\n", " display_loaded();\n", " }} else if (Date.now() < window._bokeh_timeout) {\n", " setTimeout(run_inline_js, 100);\n", " } else if (!window._bokeh_failed_load) {\n", " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", " window._bokeh_failed_load = true;\n", " } else if (!force) {\n", " var cell = $(\"#8ddb7ce4-4461-4d04-a332-7e6c859ff046\").parents('.cell').data().cell;\n", " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", " }\n", "\n", " }\n", "\n", " if (window._bokeh_is_loading === 0) {\n", " console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", " run_inline_js();\n", " } else {\n", " load_libs(js_urls, function() {\n", " console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n", " run_inline_js();\n", " });\n", " }\n", "}(this));" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Custom 
libraries\n", "from datascienceutils import plotter\n", "from datascienceutils import analyze\n", "from datascienceutils import predictiveModels as pm\n", "from datascienceutils import sklearnUtils as sku\n", "\n", "from IPython.display import Image\n", "# Standard libraries\n", "import json\n", "%matplotlib inline\n", "import datetime\n", "import numpy as np\n", "import pandas as pd\n", "import random\n", "\n", "from sklearn import cross_validation\n", "from sklearn import metrics\n", "\n", "from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource\n", "from bokeh.charts import Histogram\n", "import bokeh\n", "output_notebook()\n", "\n", "# Set pandas display options\n", "#pd.set_option('display.width', pd.util.terminal.get_terminal_size()[0])\n", "pd.set_option('display.expand_frame_repr', False)\n", "pd.set_option('max_colwidth', 800)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:26:44.912135", "start_time": "2017-11-17T21:26:44.887664" }, "collapsed": true }, "outputs": [], "source": [ "# Raw case records from the UCI Machine Learning Repository audiology database:\n", "# https://archive.ics.uci.edu/ml/machine-learning-databases/audiology/\n", "\n", "with open('./data/audiology.data', 'r') as fd:\n", "    data = list(fd)  # one string per line of the file, line endings kept" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:26:51.568329", "start_time": "2017-11-17T21:26:51.532891" }, "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['WARNING: This database should be credited to the original owner whenever\\n',\n", " ' used for any publication whatsoever.\\n',\n", " '\\n',\n", " '1. Title: Audiology Database\\n',\n", " '\\n',\n", " '2. 
Sources:\\n',\n", " ' (a) Original Owner: Professor Jergen at Baylor College of Medicine\\n',\n", " ' (b) Donor: Bruce Porter (porter@fall.cs.utexas.EDU)\\n',\n", " ' (c) Date Received: 12/3/1987\\n',\n", " '\\n',\n", " '3. Past Usage: \\n',\n", " ' -- See: Bareiss, E. Ray, & Porter, Bruce (1987). Protos: An '\n", " 'Exemplar-Based\\n',\n", " ' Learning Apprentice. In the Proceedings of the 4th International\\n',\n", " ' Workshop on Machine Learning, 12-23, Irvine, CA: Morgan Kaufmann.\\n',\n", " '\\n',\n", " '4. Relevant Information:\\n',\n", " ' -- Contact Ray Bareiss (rbareiss@uunet.uucp ??), now at Vanderbilt \\n',\n", " ' University, for more information.\\n',\n", " ' -- Domain expert: Professor Craig Wier of the University of Texas, '\n", " 'Austin.\\n',\n", " '\\n',\n", " '5. Number of instances: 200 training cases, 26 test cases\\n',\n", " '\\n',\n", " '6. Number of attributes: ???\\n',\n", " '\\n',\n", " '7. Attribute information: (all attributes are nominally valued)\\n',\n", " ' 1. case identifier.\\n',\n", " ' 2. classification (24 classes)\\n',\n", " ' 3. List of case features\\n',\n", " ' -- format: form f(v) should be read as \"feature f has value v\"\\n',\n", " '\\n',\n", " '8. Missing attribute values:\\n',\n", " ' -- This database does NOT use a standard set of attributes per '\n", " 'instance.\\n',\n", " '\\n',\n", " '9. Class Distribution: (in the training set)\\n',\n", " ' 1. acoustic_neuroma: 1\\n',\n", " ' 2. bells_palsy: 1\\n',\n", " ' 3. cochlear_age: 46\\n',\n", " ' 4. cochlear_age_and_noise: 18\\n',\n", " ' 5. cochlear_age_plus_poss_menieres: 1\\n',\n", " ' 6. cochlear_noise_and_heredity: 2\\n',\n", " ' 7. cochlear_poss_noise: 16\\n',\n", " ' 8. cochlear_unknown: 48\\n',\n", " ' 9. conductive_discontinuity: 2\\n',\n", " ' 10. conductive_fixation: 6\\n',\n", " ' 11. mixed_cochlear_age_fixation: 1\\n',\n", " ' 12. mixed_cochlear_age_otitis_media: 4\\n',\n", " ' 13. mixed_cochlear_age_s_om: 2\\n',\n", " ' 14. 
mixed_cochlear_unk_discontinuity: 2\\n',\n", " ' 15. mixed_cochlear_unk_fixation: 5\\n',\n", " ' 16. mixed_cochlear_unk_ser_om: 3\\n',\n", " ' 17. mixed_poss_central_om: 1\\n',\n", " ' 18. mixed_poss_noise_om: 2\\n',\n", " ' 19. normal_ear: 20\\n',\n", " ' 20. otitis_media: 4\\n',\n", " ' 21. poss_central: 1\\n',\n", " ' 22. possible_brainstem_disorder: 4\\n',\n", " ' 23. possible_menieres: 8\\n',\n", " ' 24. retrocochlear_unknown: 2\\n',\n", " ' --------------------Total: 200\\n']\n" ] } ], "source": [ "from pprint import pprint\n", "with open('./data/audiology.names', 'r') as fd:\n", " pprint(fd.readlines())" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:26:59.041263", "start_time": "2017-11-17T21:26:59.026120" }, "collapsed": true }, "outputs": [], "source": [ "# Every distinct feature token seen by parse_line(); a later cell turns each\n", "# one into a boolean column.\n", "all_obs = set()\n", "\n", "def parse_line(line):\n", "    \"\"\"Split one raw record into [case_id, classification, case_features].\n", "\n", "    Expected layout (inferred from the bracket stripping done here):\n", "    ``<case_id>,<classification>,[<feature>,<feature>,...]``.\n", "    Every feature token is also added to the module-level ``all_obs`` set.\n", "\n", "    Returns a three-item list whose last item is the feature tokens\n", "    re-joined with ',' and without the surrounding brackets.\n", "    Raises ValueError when the record has no classification field.\n", "    \"\"\"\n", "    # strip() rather than strip('\\n') so a trailing '\\r' or space cannot\n", "    # shield the closing bracket from being removed.\n", "    fields = line.strip().split(',')\n", "    if len(fields) < 2:\n", "        raise ValueError('malformed audiology record: %r' % line)\n", "    caseid, classif = fields[0], fields[1]\n", "    # Remove the list brackets around the feature section and skip empty tokens.\n", "    descs = [tok.strip().strip('[]') for tok in fields[2:]]\n", "    descs = [tok for tok in descs if tok]\n", "    all_obs.update(descs)\n", "    return [caseid, classif, ','.join(descs)]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:26:59.894302", "start_time": "2017-11-17T21:26:59.046235" }, "collapsed": false }, "outputs": [], "source": [ "# Lines read from a file always carry their newline, so test the stripped\n", "# text to skip blank lines. Build the frame once from all parsed records;\n", "# assigning rows one at a time with .loc re-allocates on every iteration.\n", "records = [parse_line(each) for each in data if each.strip()]\n", "audiology_df = pd.DataFrame(records, columns=['case_id', 'classification', 'case_features'])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:26:59.943481", "start_time": "2017-11-17T21:26:59.896842" }, "collapsed": false }, "outputs": [ { "data": { 
"text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
case_idclassificationcase_features
0p1cochlear_unknownboneAbnormal,air(mild),ar_c(normal),ar_u(normal),o_ar_c(normal),o_ar_u(normal),speech(normal),static(normal),tymp(a)
1p2cochlear_unknownboneAbnormal,air(moderate),ar_c(normal),ar_u(normal),o_ar_c(normal),o_ar_u(normal),speech(normal),static(normal),tymp(a)
2p3mixed_cochlear_age_fixationage_gt_60,airBoneGap,boneAbnormal,air(mild),ar_u(absent),bone(mild),o_ar_u(absent),speech(normal),static(normal),tymp(as)
3p4mixed_cochlear_age_otitis_mediaage_gt_60,airBoneGap,air(mild),ar_u(absent),bone(mild),o_ar_u(absent),speech(normal),static(normal),tymp(b)
4p5cochlear_ageage_gt_60,boneAbnormal,air(mild),ar_c(normal),ar_u(normal),bone(mild),o_ar_c(normal),o_ar_u(normal),speech(good),static(normal),tymp(a)
\n", "
" ], "text/plain": [ " case_id classification case_features\n", "0 p1 cochlear_unknown boneAbnormal,air(mild),ar_c(normal),ar_u(normal),o_ar_c(normal),o_ar_u(normal),speech(normal),static(normal),tymp(a)\n", "1 p2 cochlear_unknown boneAbnormal,air(moderate),ar_c(normal),ar_u(normal),o_ar_c(normal),o_ar_u(normal),speech(normal),static(normal),tymp(a)\n", "2 p3 mixed_cochlear_age_fixation age_gt_60,airBoneGap,boneAbnormal,air(mild),ar_u(absent),bone(mild),o_ar_u(absent),speech(normal),static(normal),tymp(as)\n", "3 p4 mixed_cochlear_age_otitis_media age_gt_60,airBoneGap,air(mild),ar_u(absent),bone(mild),o_ar_u(absent),speech(normal),static(normal),tymp(b)\n", "4 p5 cochlear_age age_gt_60,boneAbnormal,air(mild),ar_c(normal),ar_u(normal),bone(mild),o_ar_c(normal),o_ar_u(normal),speech(good),static(normal),tymp(a)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "audiology_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Looks like the case_features are all text labels/observations by doctors. Let's split them into features and make them boolean." 
] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:27:00.252014", "start_time": "2017-11-17T21:26:59.946003" }, "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " case_id case_features\n", "classification \n", "acoustic_neuroma 1 1\n", "bells_palsy 1 1\n", "cochlear_age 46 46\n", "cochlear_age_and_noise 18 18\n", "cochlear_age_plus_poss_menieres 1 1\n", "cochlear_noise_and_heredity 2 2\n", "cochlear_poss_noise 16 16\n", "cochlear_unknown 48 48\n", "conductive_discontinuity 2 2\n", "conductive_fixation 6 6\n", "mixed_cochlear_age_fixation 1 1\n", "mixed_cochlear_age_otitis_media 4 4\n", "mixed_cochlear_age_s_om 2 2\n", "mixed_cochlear_unk_discontinuity 2 2\n", "mixed_cochlear_unk_fixation 5 5\n", "mixed_cochlear_unk_ser_om 3 3\n", "mixed_poss_central_om 1 1\n", "mixed_poss_noise_om 2 2\n", "normal_ear 20 20\n", "otitis_media 4 4\n", "poss_central 1 1\n", "possible_brainstem_disorder 4 4\n", "possible_menieres 8 8\n", "retrocochlear_unknown 2 2\n" ] } ], "source": [ "print(audiology_df.groupby('classification').count())" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:27:00.649407", "start_time": "2017-11-17T21:27:00.253804" }, "collapsed": true }, "outputs": [], "source": [ "# One boolean column per observed feature. Compare whole comma-separated\n", "# tokens: a substring test would also flag 'ar_c(normal)' on rows that only\n", "# contain 'o_ar_c(normal)'.\n", "feature_sets = audiology_df['case_features'].apply(lambda feats: set(feats.split(',')))\n", "# sorted() keeps the column order the same on every run; set order is not.\n", "for ea in sorted(all_obs):\n", "    audiology_df[ea] = feature_sets.apply(lambda present: ea in present)\n", "audiology_df.drop('case_features', axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:27:00.907440", "start_time": "2017-11-17T21:27:00.650898" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
case_idclassificationbone(normal)history(fullness)o_ar_c(elevated)mod_s_sn_gt_500mod_sn_gt_4knotch_4klate_wave(poor)s_sn_gt_2k...air(normal)tymp(b)m_s_sn_gt_2ko_ar_u(elevated)age_gt_60tymp(ad)history(recruitment)m_m_snm_sn_gt_1ko_ar_u(normal)
0p1cochlear_unknownFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
1p2cochlear_unknownFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
2p3mixed_cochlear_age_fixationFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
3p4mixed_cochlear_age_otitis_mediaFalseFalseFalseFalseFalseFalseFalseFalse...FalseTrueFalseFalseTrueFalseFalseFalseFalseFalse
4p5cochlear_ageFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseTrue
\n", "

5 rows × 89 columns

\n", "
" ], "text/plain": [ " case_id classification bone(normal) history(fullness) o_ar_c(elevated) mod_s_sn_gt_500 mod_sn_gt_4k notch_4k late_wave(poor) s_sn_gt_2k ... air(normal) tymp(b) m_s_sn_gt_2k o_ar_u(elevated) age_gt_60 tymp(ad) history(recruitment) m_m_sn m_sn_gt_1k o_ar_u(normal)\n", "0 p1 cochlear_unknown False False False False False False False False ... False False False False False False False False False True\n", "1 p2 cochlear_unknown False False False False False False False False ... False False False False False False False False False True\n", "2 p3 mixed_cochlear_age_fixation False False False False False False False False ... False False False False True False False False False False\n", "3 p4 mixed_cochlear_age_otitis_media False False False False False False False False ... False True False False True False False False False False\n", "4 p5 cochlear_age False False False False False False False False ... False False False False True False False False False True\n", "\n", "[5 rows x 89 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "audiology_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Okay, based on the above data set sample, the only meaningful thing we can try is to see if we can predict the case classification from the observed features.\n", "\n", "## We have 87 features (I'm assuming these are labels that came out of human judgment), and most of them are False, i.e. this is a sparsely populated dataset in these dimensions, and most likely the dimensions are not orthogonal (i.e. not independent of each other).
\n", "\n", "## For these reasons:\n", "* a tree-based prediction is best (since all the features are boolean)\n", "* XGBoost, since the features are mostly False/empty (i.e. sparse features)\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:27:01.129284", "start_time": "2017-11-17T21:27:00.908902" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
case_idclassificationbone(normal)history(fullness)o_ar_c(elevated)mod_s_sn_gt_500mod_sn_gt_4knotch_4klate_wave(poor)s_sn_gt_2k...air(normal)tymp(b)m_s_sn_gt_2ko_ar_u(elevated)age_gt_60tymp(ad)history(recruitment)m_m_snm_sn_gt_1ko_ar_u(normal)
0p1cochlear_unknownFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
1p2cochlear_unknownFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
2p3mixed_cochlear_age_fixationFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
3p4mixed_cochlear_age_otitis_mediaFalseFalseFalseFalseFalseFalseFalseFalse...FalseTrueFalseFalseTrueFalseFalseFalseFalseFalse
4p5cochlear_ageFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseTrue
\n", "

5 rows × 89 columns

\n", "
" ], "text/plain": [ " case_id classification bone(normal) history(fullness) o_ar_c(elevated) mod_s_sn_gt_500 mod_sn_gt_4k notch_4k late_wave(poor) s_sn_gt_2k ... air(normal) tymp(b) m_s_sn_gt_2k o_ar_u(elevated) age_gt_60 tymp(ad) history(recruitment) m_m_sn m_sn_gt_1k o_ar_u(normal)\n", "0 p1 cochlear_unknown False False False False False False False False ... False False False False False False False False False True\n", "1 p2 cochlear_unknown False False False False False False False False ... False False False False False False False False False True\n", "2 p3 mixed_cochlear_age_fixation False False False False False False False False ... False False False False True False False False False False\n", "3 p4 mixed_cochlear_age_otitis_media False False False False False False False False ... False True False False True False False False False False\n", "4 p5 cochlear_age False False False False False False False False ... False False False False True False False False False True\n", "\n", "[5 rows x 89 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "audiology_df.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:27:01.341072", "start_time": "2017-11-17T21:27:01.130712" }, "collapsed": true }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", "# Replace the class names with integer ids; le.inverse_transform() maps\n", "# ids back to the original names.\n", "le = LabelEncoder()\n", "audiology_df['classification'] = le.fit_transform(audiology_df['classification'])\n", "target = audiology_df['classification']\n", "\n", "# Drop the identifier and the label from the frame that is passed on as features.\n", "# axis is given by keyword: the positional form is no longer accepted by pandas 2.x.\n", "audiology_df.drop(['case_id', 'classification'], axis=1, inplace=True)\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:27:01.684735", "start_time": "2017-11-17T21:27:01.342669" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bone(normal)history(fullness)o_ar_c(elevated)mod_s_sn_gt_500mod_sn_gt_4knotch_4klate_wave(poor)s_sn_gt_2kbone(moderate)mod_gt_4k...air(normal)tymp(b)m_s_sn_gt_2ko_ar_u(elevated)age_gt_60tymp(ad)history(recruitment)m_m_snm_sn_gt_1ko_ar_u(normal)
0FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
1FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
2FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
3FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseTrueFalseFalseTrueFalseFalseFalseFalseFalse
4FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseTrue
\n", "

5 rows × 87 columns

\n", "
" ], "text/plain": [ " bone(normal) history(fullness) o_ar_c(elevated) mod_s_sn_gt_500 mod_sn_gt_4k notch_4k late_wave(poor) s_sn_gt_2k bone(moderate) mod_gt_4k ... air(normal) tymp(b) m_s_sn_gt_2k o_ar_u(elevated) age_gt_60 tymp(ad) history(recruitment) m_m_sn m_sn_gt_1k o_ar_u(normal)\n", "0 False False False False False False False False False False ... False False False False False False False False False True\n", "1 False False False False False False False False False False ... False False False False False False False False False True\n", "2 False False False False False False False False False False ... False False False False True False False False False False\n", "3 False False False False False False False False False False ... False True False False True False False False False False\n", "4 False False False False False False False False False False ... False False False False True False False False False True\n", "\n", "[5 rows x 87 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "audiology_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "start_time": "2017-11-17T15:56:59.135Z" }, "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean squared error: 24.52\n", "Variance score: 0.72\n" ] } ], "source": [ "# Fixed seed so the split, and therefore the reported scores, is reproducible.\n", "X_train, X_test, y_train, y_test = train_test_split(\n", "    audiology_df, target, test_size=0.3, random_state=42)\n", "tree_model = pm.train(X_train, y_train, 'tree')\n", "tree_model.fit(X_train, y_train)\n", "tree_preds = tree_model.predict(X_test)\n", "# NOTE(review): the target holds label-encoded class ids, so the squared\n", "# difference between ids has no natural meaning; kept for comparison only.\n", "print(\"Mean squared error: %.2f\" % np.mean((tree_preds - y_test) ** 2))\n", "# NOTE(review): what .score() measures depends on the estimator pm.train\n", "# returns - confirm that 'Variance score' is the right label.\n", "print('Variance score: %.2f' % tree_model.score(X_test, y_test))\n", "# Fraction of test cases whose predicted class id matches the true one exactly.\n", "print('Exact-match accuracy: %.2f' % np.mean(tree_preds == y_test))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "start_time": "2017-11-17T15:56:59.145Z" }, "collapsed": true }, "outputs": [], "source": [ 
"plotter.show_tree_model(tree_model, model_type='tree')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "start_time": "2017-11-17T15:56:59.159Z" }, "collapsed": true }, "outputs": [], "source": [ "# Train the xgboost model on the same training split.\n", "xgb_model = pm.train(X_train, y_train, 'xgboost')\n", "xgb_model.fit(X_train, y_train)\n", "xgb_preds = xgb_model.predict(X_test)\n", "# NOTE(review): the target holds label-encoded class ids, so the squared\n", "# difference between ids has no natural meaning; kept for comparison only.\n", "print(\"Mean squared error: %.2f\" % np.mean((xgb_preds - y_test) ** 2))\n", "# Fraction of test cases whose predicted class id matches the true one exactly.\n", "print('Exact-match accuracy: %.2f' % np.mean(xgb_preds == y_test))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "start_time": "2017-11-17T15:56:59.168Z" }, "collapsed": true }, "outputs": [], "source": [ "plotter.show_tree_model(xgb_model, model_type='xgboost')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }