{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Stochastic gradient descent (SGD) \n",
    "SGD is an  incremental gradient descent algorithm which modifies its weights, in an effort to reach a local minimum. The cuML implementation can take array-like objects, either in host as NumPy arrays or in device (as Numba or _cuda_array_interface_compliant), as well  as cuDF DataFrames. In order to convert your dataset into a cuDF dataframe format please refer the documentation on https://rapidsai.github.io/projects/cudf/en/latest/. The SGD algorithm implemented in cuML can accept the following parameters:\n",
    "1. loss : 'hinge', 'log', 'squared_loss' (default = 'squared_loss')\n",
    "2. penalty: 'none', 'l1', 'l2', 'elasticnet' (default = 'none')\n",
    "3. alpha: float (default = 0.0001)\n",
    "4. fit_intercept : boolean (default = True)\n",
    "5. epochs : int (default = 1000)\n",
    "6. tol : float (default = 1e-3)\n",
    "7. shuffle : boolean (default = True)\n",
    "8. eta0 : float (default = 0.0)\n",
    "9. power_t : float (default = 0.5)\n",
    "10. learning_rate : 'optimal', 'constant', 'invscaling', 'adaptive' (default = 'constant')\n",
    "11. n_iter_no_change : int (default = 5)\n",
    "\n",
    "For additional information on the SGD model please refer to the documentation on https://rapidsai.github.io/projects/cuml/en/latest/index.html\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import cudf\n",
    "import os\n",
    "from cuml.solvers import SGD as cumlSGD\n",
    "from sklearn.linear_model import SGDRegressor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Helper Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# check if the mortgage dataset is present and then extract the data from it, else just create a random dataset for sgd \n",
    "import gzip\n",
    "# change the path of the mortgage dataset if you have saved it in a different directory\n",
    "def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):\n",
    "    if os.path.exists(cached):\n",
    "        print('use mortgage data')\n",
    "\n",
    "        with gzip.open(cached) as f:\n",
    "            X = np.load(f)\n",
    "        # the 4th column is 'adj_remaining_months_to_maturity'\n",
    "        # used as the label\n",
    "        X = X[:,[i for i in range(X.shape[1]) if i!=4]]\n",
    "        y = X[:,4:5]\n",
    "        rindices = np.random.randint(0,X.shape[0]-1,nrows)\n",
    "        X = X[rindices,:ncols]\n",
    "        y = y[rindices]\n",
    "\n",
    "    else:\n",
    "        # create a random dataset\n",
    "        print('use random data')\n",
    "        X = np.random.rand(nrows,ncols)\n",
    "        y = np.random.randint(0,10,size=(nrows,1))\n",
    "    train_rows = int(nrows*0.8)\n",
    "    df_X_train = pd.DataFrame({'fea%d'%i:X[0:train_rows,i] for i in range(X.shape[1])})\n",
    "    df_X_test = pd.DataFrame({'fea%d'%i:X[train_rows:,i] for i in range(X.shape[1])})\n",
    "    df_y_train = pd.DataFrame({'fea%d'%i:y[0:train_rows,i] for i in range(y.shape[1])})\n",
    "    df_y_test = pd.DataFrame({'fea%d'%i:y[train_rows:,i] for i in range(y.shape[1])})\n",
    "    return df_X_train, df_X_test, df_y_train, df_y_test\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# this function checks if the results obtained from two different methods (sklearn and cuml) are the same\n",
    "from sklearn.metrics import mean_squared_error\n",
    "def array_equal(a,b,threshold=2e-3,with_sign=True):\n",
    "    a = to_nparray(a).ravel()\n",
    "    b = to_nparray(b).ravel()\n",
    "    if with_sign == False:\n",
    "        a,b = np.abs(a),np.abs(b)\n",
    "    error = mean_squared_error(a,b)\n",
    "    res = error<threshold\n",
    "    return res\n",
    "\n",
    "# the function converts a variable from ndarray or dataframe format to numpy array\n",
    "def to_nparray(x):\n",
    "    if isinstance(x,np.ndarray) or isinstance(x,pd.DataFrame):\n",
    "        return np.array(x)\n",
    "    elif isinstance(x,np.float64):\n",
    "        return np.array([x])\n",
    "    elif isinstance(x,cudf.DataFrame) or isinstance(x,cudf.Series):\n",
    "        return x.to_pandas().values\n",
    "    return x\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Run tests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# nrows = number of samples\n",
    "# ncols = number of features of each sample\n",
    "nrows = 2**20\n",
    "ncols = 399\n",
    "\n",
    "# dataset is split into a ratio of 80:20, \n",
    "# 80% is used as the training data and the remaining 20% is used as the test data\n",
    "X_train, X_test, y_train, y_test = load_data(nrows,ncols)\n",
    "y_train_ser = y_train['fea0']\n",
    "print('training data',X_train.shape)\n",
    "print('training label',y_train.shape)\n",
    "print('testing data',X_test.shape)\n",
    "print('testing label',y_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we set the parameters usedby both libraries. You can change the number of iterations used by changing the `iterations` variable.  Please note that making this too high can cause the functions to take a long time to complete."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#set parameters \n",
    "learning_rate = 'adaptive'\n",
    "datatype = np.float32\n",
    "penalty = 'elasticnet'\n",
    "loss = 'squared_loss'\n",
    "iterations = 10 "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `max_iter` parameter controls the maxixmum number of iterations the model can run for but it doesn’t guarantee that the model will definitely run for all those epochs, therefore the sklearn might run for less number of epochs than the cuML model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# use the sklearn SGD Regressor model to fit the dataset \n",
    "sk_sgd = SGDRegressor(learning_rate=learning_rate, eta0=0.07,\n",
    "                       max_iter=iterations, tol=0.0, fit_intercept=True,\n",
    "                       penalty=penalty, loss=loss)\n",
    "sk_sgd.fit(X_train, y_train_ser)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# test the model by predicting its results for the unseen test set\n",
    "y_sk = sk_sgd.predict(X_test)\n",
    "\n",
    "# calculate the Mean Squared Error for the model's predictions\n",
    "error_sk = mean_squared_error(y_test,y_sk)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# convert the pandas dataframe to cuDF dataframe and series\n",
    "X_cudf = cudf.DataFrame.from_pandas(X_train)\n",
    "X_cudf_test = cudf.DataFrame.from_pandas(X_test)\n",
    "y_cudf = cudf.Series(y_train_ser)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# fit the training data on cuML's implementation of SGD\n",
    "cu_sgd = cumlSGD(learning_rate=learning_rate, eta0=0.07, epochs=iterations, #epochs == n_iter\n",
    "                 batch_size=512,\n",
    "                 tol=0.0, penalty=penalty, loss=loss)\n",
    "cu_sgd.fit(X_cudf, y_cudf)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# test the model by predicting its values for the test set\n",
    "y_pred = cu_sgd.predict(X_cudf_test)\n",
    "y_pred = to_nparray(y_pred).ravel()\n",
    "# calculate the Mean Squared Error for the model's predictions\n",
    "error_cu = mean_squared_error(y_test,y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print the MSE of the sklearn and cuML models to compare them\n",
    "print(\"SKL MSE(y):\")\n",
    "print(error_sk)\n",
    "print(\"CUML MSE(y):\")\n",
    "print(error_cu)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}