# Linear regression model

In [None]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

## 1. Generating a training set

In [None]:
# Fix the seed so the synthetic training set is reproducible across runs.
np.random.seed(71)
# 100 single-feature instances drawn uniformly from [0, 2).
X = np.random.rand(100, 1) * 2
# Targets follow the line y = 4 + 3x, perturbed by standard Gaussian noise.
y = 3 * X + 4 + np.random.randn(100, 1)

## 2. Exploring the training set

In [None]:
# Scatter plot of the raw training data: feature x_1 against target y.
plt.figure(figsize=(6, 4))
plt.xlabel("$x_1$")
plt.ylabel("$y$", fontsize=14, rotation=0)
plt.plot(X, y, "b.")
# Fix the viewport, given as [xmin, xmax, ymin, ymax].
plt.axis([0, 2, 0, 15])

## 3. Computing model's parameters

In [None]:
# Prepend the bias feature x0 = 1 to every instance so that theta[0] acts as
# the intercept term. The row count is taken from X instead of being
# hard-coded.
X_b = np.c_[np.ones((len(X), 1)), X]
# Normal equation: solve (X_b^T X_b) theta = X_b^T y for theta. Solving the
# linear system directly avoids forming the explicit inverse of X_b^T X_b,
# which is both slower and numerically less accurate.
theta = np.linalg.solve(X_b.T @ X_b, X_b.T @ y)

In [None]:
# Show the parameters found by the normal equation: bias term first, then the
# weight of x_1.
theta

In [None]:
# Predict at the two ends of the plotted feature range: x = 0 and x = 2.
X_new = np.array([[0], [2]])
# Prepend the bias column (x0 = 1) to every new instance.
X_new_b = np.hstack((np.ones((2, 1)), X_new))
# Linear model prediction: one row of X_new_b times theta per instance.
y_predict = X_new_b @ theta
y_predict

In [None]:
# Overlay the fitted line (red) on the training points (blue dots).
plt.xlabel("$x_1$")
plt.ylabel("$y$", rotation=0)
plt.plot(X_new, y_predict, "r-", label="Predictions")
plt.plot(X, y, "b.")
plt.axis([0, 2, 0, 15])
# Only the prediction line carries a label, so it is the sole legend entry.
plt.legend(loc="upper left")

In [None]:
import re

import sklearn

# Compare the leading numeric components as integers. Comparing the raw
# version strings is lexicographic and would wrongly accept e.g. "0.3".
_sklearn_version = tuple(
    int(part) for part in re.findall(r"\d+", sklearn.__version__)[:3]
)
assert _sklearn_version >= (0, 21, 3), "scikit-learn >= 0.21.3 is required"

from sklearn.linear_model import LinearRegression

In [None]:
# Fit ordinary least squares with scikit-learn; fit() returns the estimator
# itself, so construction and fitting can be chained.
lm = LinearRegression().fit(X, y)
# Learned bias term and feature weight(s).
lm.intercept_, lm.coef_

In [None]:
# Predict targets for the X_new instances with the fitted scikit-learn model.
lm.predict(X_new)

## 4. Linear regression using batch gradient descent

In [None]:
# Learning rate (step size) applied to every gradient descent update.
alpha = 0.1

# Cap the number of iterations so the search stops after a bounded amount of
# work; the result only needs to be close enough to the optimum.
n_iterations = 1000

# m = number of training instances, n = number of features.
m, n = X.shape
# Randomly initialize theta: one weight per feature plus the bias term.
theta = np.random.randn(n + 1, 1)

for iteration in range(n_iterations):
    # Gradient of the MSE cost with respect to theta over the full training set.
    gradients = 2 / m * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - alpha * gradients

In [None]:
# Show the parameters reached by batch gradient descent.
theta

In [None]:
# Predictions for the X_new instances using the current theta.
X_new_b @ theta

### 4.1. Visualizing the gradient descent

In [None]:
# Collects the parameter vectors visited by batch gradient descent; it is
# filled when passed as the theta_path argument of plot_gradient_descent.
theta_path_bgd = []

def plot_gradient_descent(theta, alpha, theta_path=None, n_iterations=1000):
    """Run batch gradient descent and plot the first few fitted lines.

    Draws the training points, then performs ``n_iterations`` steps of batch
    gradient descent on the module-level ``X_b`` / ``y`` data, plotting the
    model's predictions on ``X_new`` for the first 10 steps only.

    Args:
        theta: Initial parameter vector. The caller's array is not modified;
            every step rebinds the local name to a new array.
        alpha: Learning rate applied to every step (also shown in the title).
        theta_path: Optional list. When given, the parameter vector obtained
            after each step is appended to it.
        n_iterations: Number of gradient descent steps to run.
    """
    m = len(X_b)
    plt.plot(X, y, "b.")
    for iteration in range(n_iterations):
        if iteration < 10:
            y_predict = X_new_b.dot(theta)
            # The starting line is dashed red; subsequent ones are solid blue.
            style = "b-" if iteration > 0 else "r--"
            plt.plot(X_new, y_predict, style)
        # Gradient of the MSE cost over the whole training set.
        gradients = 2 / m * X_b.T.dot(X_b.dot(theta) - y)
        theta = theta - alpha * gradients
        if theta_path is not None:
            theta_path.append(theta)
    plt.xlabel("$x_1$", fontsize=18)
    plt.axis([0, 2, 0, 15])
    plt.title(r"$\eta = {}$".format(alpha), fontsize=16)

In [None]:
# Same random starting point for all three runs, so that only the learning
# rate differs between panels.
np.random.seed(42)
theta = np.random.randn(2, 1)

plt.figure(figsize=(10, 4))

# Left panel: small learning rate; it also carries the shared y label.
plt.subplot(1, 3, 1)
plot_gradient_descent(theta, alpha=0.02)
plt.ylabel("$y$", rotation=0, fontsize=18)

# Middle panel: the only run whose parameter path is recorded.
plt.subplot(1, 3, 2)
plot_gradient_descent(theta, alpha=0.1, theta_path=theta_path_bgd)

# Right panel: large learning rate.
plt.subplot(1, 3, 3)
plot_gradient_descent(theta, alpha=0.5)


## 5. Stochastic gradient descent

In [None]:
# Parameter vectors visited by stochastic gradient descent, one per update.
theta_path_sgd = []
# Number of training instances.
m = X_b.shape[0]
np.random.seed(42)

# Number of passes over the training set.
n_epochs = 50

# Learning schedule hyperparameters.
t0 = 5
t1 = 50

In [None]:
def learning_schedule(t):
    """Return the learning rate for update step ``t``.

    The rate decays as ``t0 / (t + t1)``, where ``t0`` and ``t1`` are read
    from the module-level schedule hyperparameters at call time.
    """
    decay = t + t1
    return t0 / decay

In [None]:
# Start from random parameters.
theta = np.random.randn(2, 1)

for epoch in range(n_epochs):
    for i in range(m):
        # Sketch the model for the first 20 updates only: dashed red for the
        # initial guess, solid blue afterwards.
        if epoch == 0 and i < 20:
            y_predict = X_new_b.dot(theta)
            style = "r--" if i == 0 else "b-"
            plt.plot(X_new, y_predict, style)
        # Draw one training instance at random; slicing keeps both pieces 2-D.
        random_index = np.random.randint(m)
        xi = X_b[random_index:random_index + 1]
        yi = y[random_index:random_index + 1]
        # Gradient of the squared error on that single instance.
        gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
        # The step size shrinks as the global update count grows.
        eta = learning_schedule(epoch * m + i)
        # Rebind instead of updating in place: theta_path_sgd keeps a
        # reference to every intermediate array.
        theta = theta - eta * gradients
        theta_path_sgd.append(theta)

plt.plot(X, y, "b.")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([0, 2, 0, 15])

In [None]:
# Show the parameters reached by stochastic gradient descent.
theta

In [None]:
from sklearn.linear_model import SGDRegressor

# SGD-based linear regression without a regularization penalty; the seed is
# fixed so that repeated runs give the same fit.
sgd_reg = SGDRegressor(
    max_iter=1000,
    tol=1e-3,
    penalty=None,
    eta0=0.1,
    random_state=42,
)
# The target is passed as a flattened 1-D array.
sgd_reg.fit(X, y.ravel())

sgd_reg.intercept_, sgd_reg.coef_

## 6. Mini-batch gradient descent

In [None]:
# Parameter vectors visited by mini-batch gradient descent, one per update.
theta_path_mgd = []

# Instances per mini-batch, and number of passes over the training set.
minibatch_size = 20
n_iterations = 50

# Learning schedule hyperparameters.
t0 = 200
t1 = 1000

# Reproducible random starting point for theta.
np.random.seed(42)
theta = np.random.randn(2, 1)

In [None]:
def learning_schedule(t):
    """Return the learning rate for update step ``t``.

    Computes ``t0 / (t + t1)`` from the module-level hyperparameters ``t0``
    and ``t1``, so the rate shrinks as ``t`` grows.
    """
    shifted_step = t + t1
    return t0 / shifted_step

In [None]:
# Global update counter that drives the learning schedule.
t = 0
for epoch in range(n_iterations):
    # Reshuffle on every pass so the batches are composed differently each time.
    shuffled_indices = np.random.permutation(m)
    X_b_shuffled = X_b[shuffled_indices]
    y_shuffled = y[shuffled_indices]
    for i in range(0, m, minibatch_size):
        t += 1
        xi = X_b_shuffled[i:i + minibatch_size]
        yi = y_shuffled[i:i + minibatch_size]
        # Average over the instances actually in this batch: the final batch
        # is shorter whenever m is not a multiple of minibatch_size.
        gradients = 2 / len(xi) * xi.T.dot(xi.dot(theta) - yi)
        eta = learning_schedule(t)
        # Rebind instead of updating in place: theta_path_mgd keeps a
        # reference to every intermediate array.
        theta = theta - eta * gradients
        theta_path_mgd.append(theta)

In [None]:
# Show the parameters reached by mini-batch gradient descent.
theta

In [None]:
# Turn each recorded path into a single array so that it can be sliced per
# parameter when plotting.
theta_path_bgd, theta_path_sgd, theta_path_mgd = (
    np.array(path) for path in (theta_path_bgd, theta_path_sgd, theta_path_mgd)
)

In [None]:
plt.figure(figsize=(12, 5))
# One trajectory per algorithm in (theta_0, theta_1) space, drawn in the
# order stochastic, mini-batch, batch.
for path, fmt, width, name in (
    (theta_path_sgd, "r-s", 1, "Stochastic"),
    (theta_path_mgd, "g-+", 2, "Mini-batch"),
    (theta_path_bgd, "b-o", 3, "Batch"),
):
    plt.plot(path[:, 0], path[:, 1], fmt, linewidth=width, label=name)
plt.legend(loc="upper right", fontsize=16)
plt.xlabel(r"$\theta_0$", fontsize=20)
plt.ylabel(r"$\theta_1$ ", fontsize=20, rotation=0)
# Restrict the view to the plotted window, given as [xmin, xmax, ymin, ymax].
plt.axis([2.5, 4.5, 2.3, 3.9])