{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Connecting to remote Spark through DSX-HI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "success configuring sparkmagic livy.\n"
     ]
    }
   ],
   "source": [
    "%load_ext sparkmagic.magics\n",
    "from dsx_core_utils import dsxhi_util, proxy_util\n",
    "\n",
    "# Configure sparkmagic's Livy connection (prints a success message when done)\n",
    "proxy_util.configure_proxy_livy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['https://becks1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://becks1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1', 'https://cdh513edge11.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://cdh514edge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://cdh515edge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://cdh515edge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1', 'https://centos74edge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://centos74edge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1', 'https://rated3.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1', 'https://yccdh5.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://yccdh5.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1', 'https://ycedge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://ycedge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1', 'https://zinc1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://zinc1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1']\n"
     ]
    }
   ],
   "source": [
    "dsxhi_util.list_livy_endpoints()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Pushing the Python virtual environment to the cluster using DSX-HI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{ \"imageId\": \"968c2101554e0d1e0d4fdd3720aaa565a2910cb46f4d7ed61188b6ceeec22930\",\r\n",
      "  \"scriptCommand\": \"anaconda2/bin/python2.7\",\r\n",
      "  \"libPaths\": [\"usr/local/spark-2.0.2-bin-hadoop2.7/python\",\"user-home/.scripts/common-helpers/batch/pmml\",\"user-home/.scripts/common-helpers/saas\",\"user-home/_global_/python-2.7\"] }\r\n"
     ]
    }
   ],
   "source": [
    "!cat /user-home/_global_/.remote-images/dsx-hi/dsx-scripted-ml-python2.json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Create Session Properties\n",
    "Using the values from `dsx-scripted-ml-python2.json`, we need to:\n",
    "\n",
    "1. Pull the archive from HDFS into the YARN distributed cache using the Spark **--archives** option (the `archives` session property).\n",
    "2. Override the default `PYSPARK_PYTHON` with the interpreter inside the archive, using the relative path given by `scriptCommand`.\n",
    "\n",
    "---\n",
    "\n",
    "Example DSX-HI properties for using the `dsx-scripted-ml-python2.tar.gz` virtual environment:\n",
    "```\n",
    "{\"proxyUser\": \"user1\", \"archives\": [\"/user/dsxhi/environments/26611bf7fe595f786139d6d2132de070fc813f6a0ef7a4e25857b79c8cd4b565/dsx-scripted-ml-python2.tar.gz\"],\"conf\":{\"spark.yarn.appMasterEnv.PYSPARK_PYTHON\":\"dsx-scripted-ml-python2.tar.gz/anaconda2/bin/python\"}}\n",
    "```\n",
    "### Files currently on HDFS:\n",
    "```\n",
    "/user/dsxhi/environments/26611bf7fe595f786139d6d2132de070fc813f6a0ef7a4e25857b79c8cd4b565/dsx-scripted-ml-python2.tar.gz\n",
    "/user/dsxhi/environments/pythonAddons/pythonAddons.tar.gz\n",
    "```\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a7cf4dfb29f849fc9558c1be1c822164",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "MagicsControllerWidget(children=(Tab(children=(ManageSessionWidget(children=(HTML(value=u'<br/>'), HTML(value=…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Added endpoint https://zinc1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1\n",
      "Starting Spark application\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<tr><th>ID</th><th>YARN Application ID</th><th>Kind</th><th>State</th><th>Spark UI</th><th>Driver log</th><th>Current session?</th></tr><tr><td>913</td><td>application_1533478912530_0775</td><td>pyspark</td><td>idle</td><td><a target=\"_blank\" href=\"http://ales1.fyre.ibm.com:8088/proxy/application_1533478912530_0775/\">Link</a></td><td><a target=\"_blank\" href=\"http://ales3.fyre.ibm.com:8042/node/containerlogs/container_e32_1533478912530_0775_01_000001/user1\">Link</a></td><td>✔</td></tr></table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SparkSession available as 'spark'.\n"
     ]
    }
   ],
   "source": [
    "%manage_spark"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Reading the dataset from HDFS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%spark\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Read the headerless, comma-delimited SMS file from HDFS into a Spark DataFrame\n",
    "csv_reader = spark.read.option(\"delimiter\",\",\").option(\"header\",\"false\")\n",
    "data = csv_reader.csv(\"hdfs:///user/user1/SMSSpamCollection.csv\")\n",
    "\n",
    "# Convert to pandas and keep only the first two columns\n",
    "dataset = data.toPandas().iloc[:,:2]\n",
    "\n",
    "# '_c1' is the second column; it is vectorized as text in the next step\n",
    "message = dataset['_c1']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Extracting the Bag of Words features (Text to Vector)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "%%spark\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "# Bag-of-words model with the vocabulary capped at 1500 terms\n",
    "cv = CountVectorizer(max_features=1500)\n",
    "word_counts = cv.fit_transform(message)\n",
    "X = word_counts.toarray()\n",
    "\n",
    "# Target values are taken from the first column of the dataset\n",
    "y = dataset.iloc[:, 0].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Performing machine learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 93.00%\n",
      "y_test  y_pred  count\n",
      "     1       1     94\n",
      "     0       1      4\n",
      "     1       0     10\n",
      "     0       0     92\n",
      "/hadoop/yarn/local/usercache/user1/appcache/application_1533478912530_0773/container_e32_1533478912530_0773_01_000001/dsx-scripted-ml-python2.tar.gz/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)"
     ]
    }
   ],
   "source": [
    "%%spark\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix\n",
    "# model_selection replaces the deprecated cross_validation module (removed in scikit-learn 0.20)\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)\n",
    "\n",
    "# Fit logistic regression on the training set\n",
    "classifier = LogisticRegression(random_state=0)\n",
    "classifier.fit(X_train, y_train)\n",
    "\n",
    "# Predict the held-out labels\n",
    "y_pred = classifier.predict(X_test)\n",
    "\n",
    "# Report accuracy as a percentage\n",
    "print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))\n",
    "\n",
    "# confusion_matrix puts actual classes on rows and predicted classes on columns,\n",
    "# ordered as in `labels`, so each count is paired with the labels it belongs to.\n",
    "labels = classifier.classes_\n",
    "cm = confusion_matrix(y_test, y_pred, labels=labels)\n",
    "\n",
    "# Fresh names are used here: rebinding `y` would clobber the label vector built\n",
    "# earlier and make this cell fail when it is re-run.\n",
    "cm_rows = [(actual, predicted, cm[i][j])\n",
    "           for i, actual in enumerate(labels)\n",
    "           for j, predicted in enumerate(labels)]\n",
    "cm_df = pd.DataFrame(cm_rows, columns=['y_test', 'y_pred', 'count'])\n",
    "print(cm_df.to_string(index=False))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python2.7 with DSX Spark 2.0.2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}