Steps in Data Preprocessing. These are the steps:
Import libraries
Import dataset
Checking for missing values
Encoding categorical data
Data splitting
Feature Scaling
import pandas as pd
import numpy as np
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Documentation
import handcalcs.render
# Plot
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import cm # color map
import seaborn as sns
from sympy import Sum, symbols, Indexed, lambdify, diff
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from mpl_toolkits.mplot3d.axes3d import Axes3D
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Path
# Folder holding the CSV inputs, relative to the working directory.
data_path = './Data/'
# Read the ice-cream sales table, then renumber the index from zero.
csv_file = data_path + "IceCreamData.csv"
raw_frame = pd.read_csv(csv_file)
data = raw_frame.reset_index(drop=True)
# Notebook display: (rows, columns) of the loaded table.
data.shape
(500, 2)
# Notebook display: preview the loaded DataFrame.
data
Temperature | Revenue | |
---|---|---|
0 | 24.566884 | 534.799028 |
1 | 26.005191 | 625.190122 |
2 | 27.790554 | 660.632289 |
3 | 20.595335 | 487.706960 |
4 | 11.503498 | 316.240194 |
... | ... | ... |
495 | 22.274899 | 524.746364 |
496 | 32.893092 | 755.818399 |
497 | 12.588157 | 306.090719 |
498 | 22.362402 | 566.217304 |
499 | 28.957736 | 655.660388 |
500 rows × 2 columns
# Print the column names, non-null counts and dtypes of the DataFrame.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 500 entries, 0 to 499 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Temperature 500 non-null float64 1 Revenue 500 non-null float64 dtypes: float64(2) memory usage: 7.9 KB
# Attach the measurement unit to each column name (renamed in place).
column_labels = {
    'Temperature': 'Temperature(Celsius)',
    'Revenue': 'Revenue(Dollar)',
}
data.rename(columns=column_labels, inplace=True)
# Notebook display: confirm the new column names.
data.columns
Index(['Temperature(Celsius)', 'Revenue(Dollar)'], dtype='object')
# Scatter plot of the raw observations: temperature on x, revenue on y.
fig = plt.figure(figsize=[26, 8])
plt.scatter(
    data['Temperature(Celsius)'],
    data['Revenue(Dollar)'],
    s=150,
    alpha=0.4,
    color='#E274CF',
    label='Sales',
)
plt.title('Temperature VS Revenue', fontsize=22)
# Label spelling corrected ("Celesius" -> "Celsius") so it matches the other plots.
plt.xlabel('Temperature (Celsius)', fontsize=18)
plt.ylabel('Revenue (Dollar)', fontsize=18)
plt.legend()
plt.show()
# Feature and target as column vectors of shape (n_samples, 1).
x = np.array(data['Temperature(Celsius)']).reshape(-1,1)
y = np.array(data['Revenue(Dollar)']).reshape(-1,1)
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)
print("X_train Size :",len(X_train))
print("Y_train Size :",len(y_train))
print("X_test Size :",len(X_test))
print("Y_test Size :",len(y_test))
# Share of all rows in each split, in percent.
print("Train Size :", (len(X_train)/len(x))*100)
# This line reports the test share; it was previously mislabelled "Train Size".
print("Test Size :", (len(X_test)/len(x))*100)
X_train Size : 400 Y_train Size : 400 X_test Size : 100 Y_test Size : 100 Train Size : 80.0 Train Size : 20.0
The mathematical background for simple linear regression is given by:
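$$ \large y = mx + b $$
Where:
- $y$ is the dependent variable (here, revenue),
- $x$ is the independent variable (here, temperature),
- $m$ is the slope of the line,
- $b$ is the y-intercept.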
To find the values of (m) and (b) that minimize the error, we can use the Ordinary Least Squares (OLS) method. The formulas for (m) and (b) in terms of the data points are:
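$$ \large m = \frac{\sum_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^{n}(x_i - \bar{x})^2} $$
$$ \large b = \bar{y} - m\bar{x} $$
Where:
- $n$ is the number of data points,
- $x_i$ and $y_i$ are the individual observations,
- $\bar{x}$ and $\bar{y}$ are the means of the $x$ and $y$ values.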
These formulas give the slope (m) and y-intercept (b) obtained through the Ordinary Least Squares method for simple linear regression.
# Fit scikit-learn's linear regression to the training split.
# The trailing fit() call is also the cell's displayed value.
regr = LinearRegression()
regr.fit(X_train, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Fitted intercept; a one-element array because y_train is a column vector.
c = regr.intercept_
c
array([46.80464128])
# Fitted slope; stored by scikit-learn as a 1x1 array here.
m = regr.coef_
m
array([[21.38197386]])
# Coefficient of determination (R^2) of the fitted model on the training split.
regr.score(X_train,y_train)
0.9802510145652878
# Rebuild the training predictions by hand from the fitted slope and intercept.
slope_term = m * X_train
y_pred = slope_term + c
# R^2 of the hand-built predictions, rounded for display.
train_r2 = r2_score(y_train, y_pred)
round(train_r2, 3)
0.98
# Training observations with the fitted regression line drawn on top.
fig, ax = plt.subplots(figsize=[16, 8])
ax.scatter(
    X_train,
    y_train,
    s=150,
    alpha=0.4,
    color='#E274CF',
    label='Sales',
)
fitted_line = regr.predict(X_train)
ax.plot(X_train, fitted_line, color='blue', linewidth=2, label='Predict')
ax.set_title('Temperature (Celsius) VS Revenue (Dollar)', fontsize=22)
ax.set_xlabel('Temperature (Celsius)', fontsize=18)
ax.set_ylabel('Revenue (Dollar)', fontsize=18)
ax.legend()
plt.show()
def mse(y_train, y_pred):
    """Return the mean squared error between targets and predictions.

    The sample count is taken from ``y_train`` itself rather than from a
    notebook-level variable, so the function works for any pair of inputs.

    Args:
        y_train: Observed target values, 1-D or a column vector (n, 1).
        y_pred: Predicted values with the same shape as ``y_train``.

    Returns:
        The squared error averaged over the first axis: an array of shape
        (1,) for column-vector inputs, a scalar for 1-D inputs.

    Raises:
        ValueError: If ``y_train`` is empty.
    """
    y_true = np.asarray(y_train, dtype=float)
    y_hat = np.asarray(y_pred, dtype=float)
    size = len(y_true)
    if size == 0:
        raise ValueError("mse() requires at least one sample")
    # Reduce along axis 0 only, so column-vector inputs keep their (1,) result shape.
    mse_calc = np.sum((y_true - y_hat) ** 2, axis=0) / size
    return mse_calc
# Cross-check: MSE of the current y_pred against y_train, computed with the mse() helper.
print('The directly calculated value of MSE is ', mse(y_train , y_pred))
The directly calculated value of MSE is [616.50535423]
# Evaluate the MSE cost on a grid of candidate (intercept, slope) pairs.
nr_thetas = 50
# The ranges bracket the least-squares fit reported earlier (intercept ~46.8,
# slope ~21.4). The previous ranges [0, 1] x [0, 4] excluded it, so the grid
# minimum landed on the corner of the grid instead of at the true optimum.
th_0 = np.linspace(start=0, stop=100, num=nr_thetas)
th_1 = np.linspace(start=0, stop=45, num=nr_thetas)
plot_t0, plot_t1 = np.meshgrid(th_0, th_1)

# Broadcast every grid point against all training samples at once:
# (nr_thetas, nr_thetas, 1) with (n_samples,) -> (nr_thetas, nr_thetas, n_samples).
# This replaces the nested Python loop and no longer overwrites y_pred.
x_flat = X_train.ravel()
y_flat = y_train.ravel()
grid_pred = plot_t0[..., np.newaxis] + plot_t1[..., np.newaxis] * x_flat
plot_cost = np.mean((y_flat - grid_pred) ** 2, axis=-1)
print('value of plot_cost', plot_cost.min())
value of plot_cost 205704.7929950078
# 3D surface of the cost over the (intercept, slope) grid.
fig = plt.figure(figsize=[16, 12])
ax = fig.add_subplot(projection='3d')
# One (setter, text, colour) entry per axis; all labels share the font size.
axis_labels = (
    (ax.set_xlabel, 'Theta_0 The Intercept', "black"),
    (ax.set_ylabel, 'Theta_1 The Slope', 'green'),
    (ax.set_zlabel, 'Cost function - The Mean Square Error', 'brown'),
)
for set_label, text, colour in axis_labels:
    set_label(text, fontsize=20, color=colour)
# Rotate the camera around the vertical axis.
ax.azim = -15
ax.plot_surface(plot_t0, plot_t1, plot_cost, cmap=cm.cool, alpha=0.4)
plt.show()
# Lowest mean squared error found anywhere on the evaluated grid.
print('Min value of plot_cost', plot_cost.min())
# argmin() indexes the flattened array; unravel_index maps it back to (row, col).
ij_min = np.unravel_index(indices=plot_cost.argmin(), shape=plot_cost.shape)
print('Min occurs at (i,j):', ij_min)  # row, col
# The meshgrid arrays share plot_cost's layout, so the same index recovers
# the theta_0 and theta_1 values associated with that cost.
print('Theta_0 (intercept) at the minimum:', plot_t0[ij_min])
print('Theta_1 (slope) at the minimum:', plot_t1[ij_min])
Min value of plot_cost 205704.7929950078 Min occurs at (i,j): (49, 49)
def grad(x, y, thetas):
    """Return the gradient of the MSE cost for the line thetas[0] + thetas[1]*x.

    Args:
        x: Feature values, 1-D or a column vector (n, 1).
        y: Target values with the same number of elements as ``x``.
        thetas: Sequence ``[intercept, slope]`` at which to evaluate the gradient.

    Returns:
        A NumPy array ``[d(cost)/d(intercept), d(cost)/d(slope)]`` of shape (2,).

    Raises:
        ValueError: If ``x`` and ``y`` hold a different number of elements.
    """
    # Flatten so 1-D inputs and column vectors are handled the same way.
    x_flat = np.asarray(x, dtype=float).ravel()
    y_flat = np.asarray(y, dtype=float).ravel()
    if x_flat.size != y_flat.size:
        raise ValueError("x and y must contain the same number of samples")
    n = y_flat.size
    # Residuals are shared by both partial derivatives, so compute them once.
    residual = y_flat - thetas[0] - thetas[1] * x_flat
    theta0_slope = (-2 / n) * residual.sum()
    theta1_slope = (-2 / n) * (residual * x_flat).sum()
    return np.array([theta0_slope, theta1_slope])
# Batch gradient descent on the MSE cost, starting from an arbitrary guess.
multiplier = 0.00001  # learning rate (step size)
thetas = np.array([6.9,6.9])  # [intercept, slope]

# Record every visited parameter pair and its cost for the plots that follow.
# Python lists are used because growing NumPy arrays with concatenate/append
# on every step copies the whole history each time.
theta_history = [thetas.copy()]
mse_history = [mse(y_train, thetas[0] + thetas[1]*X_train)]

for _ in range(1000):
    thetas = thetas - multiplier * grad(X_train, y_train, thetas)
    theta_history.append(thetas.copy())
    mse_history.append(mse(y_train, thetas[0] + thetas[1]*X_train))

# Same layouts as before: plot_vals is (steps + 1, 2), mse_vals is 1-D.
plot_vals = np.array(theta_history)
mse_vals = np.array(mse_history, dtype=float).ravel()

# NOTE(review): with this step size and 1000 iterations the recorded intercept
# only moves from 6.9 to about 7.63, while LinearRegression reports about 46.8.
# The run has not converged in the intercept direction; more iterations or
# feature scaling would be needed.
intercept = thetas[0]
slope = thetas[1]
intercept, slope
(7.63300274750823, 22.940892699493137)
# Plot the gradient-descent path: one marker per step at (theta_0, theta_1, MSE).
fig = plt.figure(figsize=[16, 12])
ax = fig.add_subplot(projection='3d')
ax.set_xlabel('Theta_0 The Intercept', fontsize=20, color ="black")
ax.set_ylabel('Theta_1 The Slope', fontsize=20, color ='black')
# Label corrected: the plotted cost is the MSE of the revenue model; the
# previous text mentioned "Salary", which is not part of this dataset.
ax.set_zlabel('Cost - MSE', fontsize=20, color ='red')
ax.azim = -75
ax.scatter(plot_vals[:, 0], plot_vals[:, 1], mse_vals, s=50, color='black')
# Optional overlay of the cost surface, left disabled:
# ax.plot_surface(plot_t0, plot_t1, plot_cost, cmap=cm.rainbow, alpha=0.4)
plt.show()
# Reset nr_thetas to zero so that no error is shown when the notebook is re-run
# (original author's note).
# NOTE(review): nothing in this cell reads nr_thetas -- confirm the reset is still needed.
nr_thetas = 0
# Requires plot_t0, plot_t1 and plot_cost to be defined already.
# Create a 3D plot
fig = plt.figure(figsize=[16, 12])
ax = fig.add_subplot(projection='3d')
# Set labels
ax.set_xlabel('Theta_0 The Intercept', fontsize=20, color="black")
ax.set_ylabel('Theta_1 The Slope', fontsize=20, color='black')
# Label corrected: the plotted cost is the MSE of the revenue model; the
# previous text mentioned "Salary", which is not part of this dataset.
ax.set_zlabel('Cost - MSE', fontsize=20, color='red')
ax.azim = -75
# Optional overlay of the gradient-descent path, left disabled:
# ax.scatter(plot_vals[:, 0], plot_vals[:, 1], mse_vals, s=50, color='black')
# Plot the surface
surf = ax.plot_surface(plot_t0, plot_t1, plot_cost, cmap=cm.rainbow, alpha=0.4)
# Add a colorbar which maps values to colors
fig.colorbar(surf, ax=ax, shrink=0.5, aspect=10)
plt.show()
def f(x):
    """Evaluate the quadratic x**2 + x + 1 at ``x`` (scalar or array)."""
    squared = x**2
    return squared + x + 1
def df(x):
    """Evaluate 2*x + 1, the derivative of x**2 + x + 1, at ``x``."""
    doubled = 2*x
    return doubled + 1
# Plotting MSE
# Three side-by-side panels: (1) f over the recorded MSE values,
# (2) df over the same values, (3) df again with heavier lines and markers.
panel_fig = plt.figure(figsize=[20, 5])
values = np.array(mse_vals)

# 1
cost_ax = panel_fig.add_subplot(1, 3, 1)
cost_ax.grid()
cost_ax.set_title('Cost function', fontsize=17)
cost_ax.set_xlabel('X', fontsize=16)
cost_ax.set_ylabel('f(x)', fontsize=16)
cost_ax.plot(mse_vals, f(mse_vals), color='skyblue', linewidth=3)
cost_ax.scatter(mse_vals, f(values), color='red', s=100, alpha=0.6)

# 2
slope_ax = panel_fig.add_subplot(1, 3, 2)
slope_ax.set_title('Slope of the cost function', fontsize=17)
slope_ax.set_xlabel('X', fontsize=16)
slope_ax.set_ylabel('df(x)', fontsize=16)
slope_ax.grid()
slope_ax.plot(mse_vals, df(mse_vals), color='skyblue', linewidth=5, alpha=0.6)
slope_ax.scatter(mse_vals, df(mse_vals), color='red', s=100, alpha=0.5)

# 3
zoom_ax = panel_fig.add_subplot(1, 3, 3)
zoom_ax.set_title('Gradient Descent (close up)', fontsize=17)
zoom_ax.set_xlabel('X', fontsize=16)
zoom_ax.grid()
zoom_ax.plot(mse_vals, df(mse_vals), color='skyblue', linewidth=6, alpha=0.8)
zoom_ax.scatter(mse_vals, df(mse_vals), color='red', s=300, alpha=0.6)

plt.show()
# Training predictions from the gradient-descent parameters.
y_train_opt = slope * X_train + intercept
# R^2 of the gradient-descent line against the observed targets (rounded).
gd_r2 = round(r2_score(y_train, y_train_opt), 3)
# R^2 of the scikit-learn model against the observed targets.
ols_r2 = regr.score(X_train, y_train)
# Score of the scikit-learn model with the gradient-descent predictions passed as targets.
ols_vs_gd_r2 = regr.score(X_train, y_train_opt)
gd_r2, ols_r2, ols_vs_gd_r2
(0.974, 0.9802510145652878, 0.9947503023305504)
# Training observations with both fitted lines: scikit-learn and gradient descent.
fig, ax = plt.subplots(figsize=[16, 8])
ax.scatter(
    X_train,
    y_train,
    s=150,
    alpha=0.4,
    color='#E274CF',
    label='Sales',
)
sklearn_line = regr.predict(X_train)
ax.plot(X_train, sklearn_line, color='blue', linewidth=3, label='Predict')
ax.plot(X_train, y_train_opt, color='green', linewidth=3, label='Optimized')
ax.set_title('Temperature (Celsius) & Revenue (Dollar)', fontsize=22)
ax.set_xlabel('Temperature (Celsius)', fontsize=18)
ax.set_ylabel('Revenue (Dollar)', fontsize=18)
ax.legend()
plt.show()