import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib
import scipy.optimize as opt
import seaborn as sns
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
data = sio.loadmat('/content/drive/My Drive/AndrewNg-ML/ex5data1.mat')
X = data['X']
y = data['y']
Xval = data['Xval']
yval = data['yval']
Xtest = data['Xtest']
ytest = data['ytest']
X.shape, y.shape
((12, 1), (12, 1))
X[:, 0]
array([-15.93675813, -29.15297922, 36.18954863, 37.49218733, -48.05882945, -8.94145794, 15.30779289, -34.70626581, 1.38915437, -44.38375985, 7.01350208, 22.76274892])
y
array([[ 2.13431051],
       [ 1.17325668],
       [34.35910918],
       [36.83795516],
       [ 2.80896507],
       [ 2.12107248],
       [14.71026831],
       [ 2.61418439],
       [ 3.74017167],
       [ 3.73169131],
       [ 7.62765885],
       [22.7524283 ]])
plt.figure(figsize=(8, 6))
plt.scatter(X.ravel(), y.ravel(), s=50, c='r', marker='x', linewidths=1)
# X and y are (12, 1) column vectors; matplotlib (and pandas, if we later build a DataFrame)
# handles 1-D arrays more smoothly, so we ravel the data before plotting
plt.xlabel('Change in water level (x)')
plt.ylabel('Water flowing out of the dam (y)')
plt.ylim(bottom=0);  # 'ymin' was removed in newer matplotlib; 'bottom' is the current keyword
def ravel_data(data):
    return map(np.ravel, [data['X'], data['y'], data['Xval'], data['yval'], data['Xtest'], data['ytest']])
X, y, Xval, yval, Xtest, ytest = ravel_data(data)
X
array([-15.93675813, -29.15297922, 36.18954863, 37.49218733, -48.05882945, -8.94145794, 15.30779289, -34.70626581, 1.38915437, -44.38375985, 7.01350208, 22.76274892])
# add bias term
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]
def linearRegCostFunction(theta, X, y, lamb=1):
    m = y.size
    h = X.dot(theta)
    J = (1 / (2 * m)) * np.sum(np.square(h - y))           # unregularized cost
    reg = (lamb / (2 * m)) * np.sum(np.square(theta[1:]))  # regularization term, skipping the intercept
    return J + reg
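For reference, this is the regularized linear regression cost the function implements; the intercept term $\theta_0$ is excluded from the regularization sum:

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2 + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2, \qquad h_\theta(x) = \theta^T x$$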
theta = np.ones(X.shape[1])
linearRegCostFunction(theta, X, y)
303.9931922202643
def gradient(theta, X, y):
    m = X.shape[0]
    h_minus_y = X.dot(theta) - y
    sum_term = X.T.dot(h_minus_y)
    return sum_term / m
def regularized_gradient(theta, X, y, l=1):
    m = X.shape[0]
    regularized_term = theta.copy()  # same shape as theta
    regularized_term[0] = 0          # don't regularize intercept theta
    regularized_term = (l / m) * regularized_term
    return gradient(theta, X, y) + regularized_term
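The matching regularized gradient, which regularized_gradient computes in vectorized form (the $\lambda$ term is zeroed out for the intercept):

$$\frac{\partial J(\theta)}{\partial \theta_0} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_0^{(i)}$$

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)} + \frac{\lambda}{m}\theta_j \quad (j \geq 1)$$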
regularized_gradient(theta, X, y)
array([-15.30301567, 598.25074417])
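As a sanity check, the analytic gradient can be compared against a central finite-difference approximation of the regularized cost. This is only a quick sketch using the theta, X, y and functions defined above; the step size eps is an assumption.
# sanity check (sketch): numerical gradient of the regularized cost vs. the analytic one
eps = 1e-4
numerical_grad = np.zeros_like(theta)
for j in range(theta.size):
    step = np.zeros_like(theta)
    step[j] = eps
    numerical_grad[j] = (linearRegCostFunction(theta + step, X, y)
                         - linearRegCostFunction(theta - step, X, y)) / (2 * eps)
print(numerical_grad)                     # should closely match...
print(regularized_gradient(theta, X, y))  # ...array([-15.30301567, 598.25074417])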
def trainLinearReg(X, y, l=1):
    """linear regression
    args:
        X: feature matrix, (m, n+1)  # with intercept x0=1
        y: target vector, (m, )
        l: lambda constant for regularization
    return: trained parameters
    """
    # init theta
    theta = np.ones(X.shape[1])
    # train it
    res = opt.minimize(fun=linearRegCostFunction,
                       x0=theta,
                       args=(X, y, l),
                       method='TNC',
                       jac=regularized_gradient,  # method for computing the gradient vector
                       options={'disp': True})
    return res
res = trainLinearReg(X, y, l=0)
res
     fun: 22.37390649510892
     jac: array([ 1.52549682e-07, -7.76071267e-09])
 message: 'Converged (|f_n-f_(n-1)| ~= 0)'
    nfev: 9
     nit: 4
  status: 1
 success: True
       x: array([13.08790367, 0.36777923])
final_theta = res.x
final_theta
array([13.08790367, 0.36777923])
b = final_theta[0] # intercept
m = final_theta[1] # slope
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 1], y, label="Training data", c='r', marker='x')
plt.plot(X[:, 1], X[:, 1]*m + b, label="Prediction")
plt.legend(loc=2)
plt.show()
def costFunction(theta, X, y):
    m = y.size
    h = X.dot(theta)
    J = (1 / (2 * m)) * np.sum(np.square(h - y))
    return J
def learningCurve(X, y, Xval, yval, Lambda):
    error_train = []
    error_cross_val = []
    m = y.size
    for i in range(1, m + 1):
        subsetX = X[0:i, :]
        subsetY = y[0:i]
        fit_theta = trainLinearReg(subsetX, subsetY, Lambda).x
        error_i_train = costFunction(fit_theta, subsetX, subsetY)
        error_i_cross_val = costFunction(fit_theta, Xval, yval)
        error_train.append(error_i_train)
        error_cross_val.append(error_i_cross_val)
    plt.figure(figsize=(10, 8))
    plt.plot(np.arange(1, m + 1), error_train, label='training cost')
    plt.plot(np.arange(1, m + 1), error_cross_val, label='cross validation cost')
    plt.legend(loc=1)
    plt.show()
learningCurve(X, y, Xval, yval, Lambda=0)
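With a single linear feature and Lambda=0, both curves flatten out at a relatively high error, the classic high-bias (underfitting) picture. If the numbers themselves are wanted rather than the plot, here is a minimal sketch using the helpers above (the function name is hypothetical):
# sketch: final train/validation errors for a given lambda, using trainLinearReg and costFunction from above
def final_errors(X, y, Xval, yval, Lambda=0):
    fit_theta = trainLinearReg(X, y, Lambda).x
    return costFunction(fit_theta, X, y), costFunction(fit_theta, Xval, yval)
final_errors(X, y, Xval, yval, Lambda=0)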
def prepare_poly_data(*args, power):
    """
    args: keep feeding in X, Xval, or Xtest
          will return in the same order
    """
    def prepare_for_each_dataset(x):
        # expand feature
        df = poly_features(x, power=power)
        # normalization
        ndarr = normalize_feature(df).values
        # add bias term
        return np.insert(ndarr, 0, np.ones(ndarr.shape[0]), axis=1)
    return [prepare_for_each_dataset(x) for x in args]
def poly_features(x, power, as_ndarray=False):
    data = {'f{}'.format(i): np.power(x, i) for i in range(1, power + 1)}
    df = pd.DataFrame(data)
    return df.values if as_ndarray else df
def normalize_feature(df):
    """Standardize each column of the DataFrame: subtract its mean and divide by its standard deviation."""
    return df.apply(lambda column: (column - column.mean()) / column.std())
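Note that prepare_poly_data normalizes each of X, Xval and Xtest with its own column statistics. A common alternative is to compute the mean and standard deviation on the training set only and reuse them for the validation and test sets; the following is a minimal sketch of that variant (the function name is hypothetical, built on poly_features above):
# sketch: normalize every dataset with the *training* set's polynomial-feature statistics
def prepare_poly_data_shared_stats(X_train, *others, power):
    df_train = poly_features(X_train, power=power)
    mu, sigma = df_train.mean(), df_train.std()
    def finalize(df):
        ndarr = ((df - mu) / sigma).values
        return np.insert(ndarr, 0, np.ones(ndarr.shape[0]), axis=1)
    return [finalize(df_train)] + [finalize(poly_features(x, power=power)) for x in others]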
X, y, Xval, yval, Xtest, ytest = ravel_data(data)
poly_features(X, power=3)
|    | f1         | f2          | f3             |
|----|------------|-------------|----------------|
| 0  | -15.936758 | 253.980260  | -4047.621971   |
| 1  | -29.152979 | 849.896197  | -24777.006175  |
| 2  | 36.189549  | 1309.683430 | 47396.852168   |
| 3  | 37.492187  | 1405.664111 | 52701.422173   |
| 4  | -48.058829 | 2309.651088 | -110999.127750 |
| 5  | -8.941458  | 79.949670   | -714.866612    |
| 6  | 15.307793  | 234.328523  | 3587.052500    |
| 7  | -34.706266 | 1204.524887 | -41804.560890  |
| 8  | 1.389154   | 1.929750    | 2.680720       |
| 9  | -44.383760 | 1969.918139 | -87432.373590  |
| 10 | 7.013502   | 49.189211   | 344.988637     |
| 11 | 22.762749  | 518.142738  | 11794.353058   |
X_poly, Xval_poly, Xtest_poly = prepare_poly_data(X, Xval, Xtest, power=8)
X_poly[:3, :]
array([[ 1.00000000e+00, -3.62140776e-01, -7.55086688e-01,  1.82225876e-01,
        -7.06189908e-01,  3.06617917e-01, -5.90877673e-01,  3.44515797e-01,
        -5.08481165e-01],
       [ 1.00000000e+00, -8.03204845e-01,  1.25825266e-03, -2.47936991e-01,
        -3.27023420e-01,  9.33963187e-02, -4.35817606e-01,  2.55416116e-01,
        -4.48912493e-01],
       [ 1.00000000e+00,  1.37746700e+00,  5.84826715e-01,  1.24976856e+00,
         2.45311974e-01,  9.78359696e-01, -1.21556976e-02,  7.56568484e-01,
        -1.70352114e-01]])
# lambda = 0
learningCurve(X_poly, y, Xval_poly, yval, Lambda=0)
# lambda = 1
learningCurve(X_poly, y, Xval_poly, yval, Lambda=1)
# lambda = 100
learningCurve(X_poly, y, Xval_poly, yval, Lambda=100)
lambda_candidate = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
training_cost, cross_validation_cost = [], []
for l in lambda_candidate:
    res = trainLinearReg(X_poly, y, l)
    fit_theta = res.x
    each_training_cost = linearRegCostFunction(fit_theta, X_poly, y, 0)
    each_cross_validation_cost = linearRegCostFunction(fit_theta, Xval_poly, yval, 0)
    training_cost.append(each_training_cost)
    cross_validation_cost.append(each_cross_validation_cost)
plt.figure(figsize=(10, 8))
plt.plot(lambda_candidate, training_cost, label='training')
plt.plot(lambda_candidate, cross_validation_cost, label='cross validation')
plt.legend(loc=2)
plt.xlabel('lambda')
plt.ylabel('cost')
plt.show()
best_lambda_index = np.argmin(cross_validation_cost)
best_lambda = lambda_candidate[best_lambda_index]
best_lambda # for cross validation dataset
1
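As a visual check, the degree-8 fit trained with the selected lambda can be plotted over the training data. This is a sketch built on the helpers above; the dense x_grid and its normalization with the training-set statistics are assumptions, not part of the original exercise code.
# sketch: plot the polynomial fit for the lambda selected on the cross-validation set
fit_theta = trainLinearReg(X_poly, y, best_lambda).x
train_df = poly_features(X, power=8)
mu, sigma = train_df.mean(), train_df.std()   # training-set statistics (same ones used to build X_poly)
x_grid = np.linspace(X.min() - 10, X.max() + 10, 100)
grid_ndarr = ((poly_features(x_grid, power=8) - mu) / sigma).values
grid_poly = np.insert(grid_ndarr, 0, np.ones(grid_ndarr.shape[0]), axis=1)
plt.figure(figsize=(8, 6))
plt.scatter(X, y, c='r', marker='x', label='Training data')
plt.plot(x_grid, grid_poly.dot(fit_theta), label='Polynomial fit (lambda={})'.format(best_lambda))
plt.xlabel('Change in water level (x)')
plt.ylabel('Water flowing out of the dam (y)')
plt.legend(loc=2)
plt.show()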
# use test data
lambda_candidate2 = [0, 0.01, 0.03, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 2]
for l in lambda_candidate2:
    theta = trainLinearReg(X_poly, y, l).x
    print('test cost(l={}) = {}'.format(l, linearRegCostFunction(theta, Xtest_poly, ytest, 0)))
test cost(l=0) = 10.551425432669346
test cost(l=0.01) = 10.88035182200567
test cost(l=0.03) = 10.022274720437942
test cost(l=0.1) = 8.631939930646022
test cost(l=0.2) = 7.753395943326273
test cost(l=0.3) = 7.336639301523873
test cost(l=0.4) = 7.139677748128903
test cost(l=0.5) = 7.066548903774858
test cost(l=1) = 7.4662640770936175
test cost(l=2) = 9.358707950127894
We can see that the best lambda is 0.5 when evaluated on the test dataset. In a strict protocol, though, lambda is selected on the cross-validation set (lambda = 1 above) and the test set is used only once to report the final error, which here is about 7.47.