# Stochastic gradient descent (SGD) 
SGD is an incremental gradient descent algorithm that iteratively updates its weights in an effort to reach a local minimum. The cuML implementation accepts array-like objects, either on the host as NumPy arrays or on the device (as Numba arrays or any `__cuda_array_interface__`-compliant object), as well as cuDF DataFrames. To convert your dataset into the cuDF DataFrame format, please refer to the documentation at https://rapidsai.github.io/projects/cudf/en/latest/. The SGD algorithm implemented in cuML accepts the following parameters:
1. loss : 'hinge', 'log', 'squared_loss' (default = 'squared_loss')
2. penalty: 'none', 'l1', 'l2', 'elasticnet' (default = 'none')
3. alpha: float (default = 0.0001)
4. fit_intercept : boolean (default = True)
5. epochs : int (default = 1000)
6. tol : float (default = 1e-3)
7. shuffle : boolean (default = True)
8. eta0 : float (default = 0.0)
9. power_t : float (default = 0.5)
10. learning_rate : 'optimal', 'constant', 'invscaling', 'adaptive' (default = 'constant')
11. n_iter_no_change : int (default = 5)

For additional information on the SGD model please refer to the documentation on https://rapidsai.github.io/projects/cuml/en/latest/index.html


In [None]:
import numpy as np
import pandas as pd
import cudf
import os
from cuml.solvers import SGD as cumlSGD
from sklearn.linear_model import SGDRegressor

# Helper Functions

In [None]:
# check if the mortgage dataset is present and then extract the data from it, else just create a random dataset for sgd 
import gzip
# change the path of the mortgage dataset if you have saved it in a different directory
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    """Build an 80/20 train/test split for the SGD comparison.

    When ``cached`` exists it is read as a gzip-compressed ``.npy`` array.
    Column index 4 is used as the label, the remaining columns are the
    features, and ``nrows`` rows are sampled at random (with replacement).
    When the file is missing, a random dataset is generated instead.

    Parameters
    ----------
    nrows : int
        Number of samples to return in total (train + test).
    ncols : int
        Number of feature columns to keep (the cached data is truncated to
        its first ``ncols`` feature columns).
    cached : str
        Path of the gzip-compressed NumPy file holding the mortgage data.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_X_train, df_X_test, df_y_train, df_y_test)`` with columns named
        ``fea0``, ``fea1``, ...; the first ``int(nrows*0.8)`` rows form the
        training part and the rest the test part.
    """
    def _as_frame(block):
        # One DataFrame column per array column, named fea<i>.
        return pd.DataFrame({'fea%d'%i:block[:,i] for i in range(block.shape[1])})

    if os.path.exists(cached):
        print('use mortgage data')

        with gzip.open(cached) as f:
            X = np.load(f)
        # Column index 4 is 'adj_remaining_months_to_maturity', used as the
        # label. It must be extracted BEFORE it is removed from the features;
        # otherwise the slice would pick up the following column instead.
        y = X[:,4:5]
        X = np.delete(X, 4, axis=1)
        # randint's upper bound is exclusive, so X.shape[0] lets every row
        # (including the last one) be sampled.
        rindices = np.random.randint(0,X.shape[0],nrows)
        X = X[rindices,:ncols]
        y = y[rindices]

    else:
        # create a random dataset
        print('use random data')
        X = np.random.rand(nrows,ncols)
        y = np.random.randint(0,10,size=(nrows,1))

    train_rows = int(nrows*0.8)
    df_X_train = _as_frame(X[:train_rows])
    df_X_test = _as_frame(X[train_rows:])
    df_y_train = _as_frame(y[:train_rows])
    df_y_test = _as_frame(y[train_rows:])
    return df_X_train, df_X_test, df_y_train, df_y_test


In [None]:
# this function checks if the results obtained from two different methods (sklearn and cuml) are the same
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=2e-3,with_sign=True):
    """Tell whether two result sets agree within a mean-squared-error bound.

    Both inputs are converted with ``to_nparray`` and flattened before the
    comparison.

    Parameters
    ----------
    a, b : array-like
        Values to compare; anything ``to_nparray`` can convert.
    threshold : float
        Upper bound (exclusive) on the mean squared error for the inputs to
        count as equal.
    with_sign : bool
        When false, the absolute values are compared, so sign differences
        are ignored.

    Returns
    -------
    bool
        Result of ``mean_squared_error(a, b) < threshold``.
    """
    a = to_nparray(a).ravel()
    b = to_nparray(b).ravel()
    if not with_sign:
        a, b = np.abs(a), np.abs(b)
    return mean_squared_error(a, b) < threshold

# the function converts a variable from ndarray or dataframe format to numpy array
def to_nparray(x):
    """Convert the supported container types to a NumPy array.

    * ``numpy.ndarray`` / ``pandas.DataFrame`` -> ``np.array(x)``
    * ``numpy.float64`` scalar -> one-element array
    * ``cudf.DataFrame`` / ``cudf.Series`` -> values of ``x.to_pandas()``

    Any other object is returned unchanged.
    """
    # Host containers: a plain NumPy conversion is enough.
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    # A NumPy double scalar is wrapped into a length-1 array.
    if isinstance(x, np.float64):
        return np.array([x])
    # cuDF objects go through pandas to obtain their values.
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        return x.to_pandas().values
    return x


# Run tests

In [None]:
%%time
# nrows = total number of samples requested from load_data (train + test)
# ncols = number of features of each sample
nrows = 2**20
ncols = 399

# load_data splits the dataset 80:20:
# the first 80% of the rows are the training data, the remaining 20% the test data
X_train, X_test, y_train, y_test = load_data(nrows,ncols)
# the label frames hold a single column named 'fea0'; keep the training label as a Series for fitting
y_train_ser = y_train['fea0']
# report the shapes of the four pieces as a quick sanity check
print('training data',X_train.shape)
print('training label',y_train.shape)
print('testing data',X_test.shape)
print('testing label',y_test.shape)

Here we set the parameters used by both libraries. You can change the number of iterations by changing the `iterations` variable. Please note that setting it too high can cause the functions to take a long time to complete.

In [None]:
# set the parameters shared by the scikit-learn and cuML fits below
# learning-rate schedule, passed as `learning_rate` to both estimators
learning_rate = 'adaptive'
# NOTE(review): `datatype` is not referenced by any of the visible cells - confirm whether it is still needed
datatype = np.float32
# regularisation term, passed as `penalty` to both estimators
penalty = 'elasticnet'
# loss function, passed as `loss` to both estimators
loss = 'squared_loss'
# iteration budget: `max_iter` for SGDRegressor and `epochs` for cuML SGD
iterations = 10 

The `max_iter` parameter controls the maximum number of iterations the model can run for, but it doesn't guarantee that the model will actually run for all of those epochs; therefore the sklearn model might run for fewer epochs than the cuML model.

In [None]:
%%time
# Fit scikit-learn's SGDRegressor on the pandas training data, using the
# shared loss / penalty / learning-rate settings and the same eta0 and tol
# values as the cuML fit.
# NOTE(review): recent scikit-learn releases renamed the 'squared_loss' loss
# to 'squared_error' - confirm against the installed version.
sk_sgd = SGDRegressor(
    loss=loss,
    penalty=penalty,
    fit_intercept=True,
    max_iter=iterations,
    tol=0.0,
    learning_rate=learning_rate,
    eta0=0.07,
)
sk_sgd.fit(X_train, y_train_ser)


In [None]:
%%time
# evaluate the fitted scikit-learn model: predict the labels of the held-out test features
y_sk = sk_sgd.predict(X_test)

# Mean Squared Error between the true test labels and the scikit-learn predictions
error_sk = mean_squared_error(y_test,y_sk)


In [None]:
%%time
# convert the pandas training/test features to cuDF DataFrames and the
# training label Series to a cuDF Series; these are the inputs of the cuML fit/predict below
X_cudf = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
y_cudf = cudf.Series(y_train_ser)

In [None]:
%%time
# Train cuML's SGD on the cuDF training data with the shared settings.
cu_sgd = cumlSGD(
    loss=loss,
    penalty=penalty,
    epochs=iterations,  # epochs == n_iter
    batch_size=512,
    tol=0.0,
    learning_rate=learning_rate,
    eta0=0.07,
)
cu_sgd.fit(X_cudf, y_cudf)


In [None]:
%%time
# Predict the test set with the cuML model and flatten the result into a
# 1-D NumPy array so it can be scored with scikit-learn's metric.
y_pred = to_nparray(cu_sgd.predict(X_cudf_test)).ravel()
# Mean Squared Error between the true test labels and the cuML predictions.
error_cu = mean_squared_error(y_test, y_pred)

In [None]:
# Show the test-set MSE of both models for comparison: each label is
# printed on its own line, followed by the value on the next line.
print("SKL MSE(y):", error_sk, sep="\n")
print("CUML MSE(y):", error_cu, sep="\n")