# https://github.com/fengdu78/Coursera-ML-AndrewNg-Notes/tree/master/code/ex4-NN%20back%20propagation
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import matplotlib
import scipy.optimize as opt
from sklearn.metrics import classification_report
from google.colab import files
import io
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
data = sio.loadmat('/content/drive/My Drive/AndrewNg-ML/ex4data1.mat')
weights = sio.loadmat('/content/drive/My Drive/AndrewNg-ML/ex4weights.mat')
data
{'X': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 '__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011',
 '__version__': '1.0',
 'y': array([[10],
        [10],
        [10],
        ...,
        [ 9],
        [ 9],
        [ 9]], dtype=uint8)}
X_raw = data['X']
y = data['y']
X_raw.shape, y.shape
((5000, 400), (5000, 1))
X = np.insert(X_raw, 0, np.ones(X_raw.shape[0]), axis=1) # insert bias term
X.shape
(5000, 401)
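A quick look at a few of the raw digits in X_raw (a sketch of my own, not part of the original notebook): each row holds a 20x20 grayscale image stored in MATLAB column-major order, hence the reshape followed by a transpose.
fig, axes = plt.subplots(2, 5, figsize=(8, 4), sharex=True, sharey=True)
sample_idx = np.random.choice(X_raw.shape[0], 10, replace=False)
for ax, idx in zip(axes.flat, sample_idx):
    ax.matshow(X_raw[idx].reshape(20, 20).T, cmap='gray')  # 400 pixels -> 20x20 image
    ax.set_xticks([]); ax.set_yticks([])
plt.show()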
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)  # note: scikit-learn >= 1.2 renames this parameter to sparse_output
y_onehot = encoder.fit_transform(y)
y_onehot.shape
(5000, 10)
y[0], y_onehot[0,:]
(array([10], dtype=uint8), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]))
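So label 10 (used for the digit '0') lands in the last one-hot column, because OneHotEncoder sorts the categories 1..10. For reference, a hand-rolled NumPy equivalent (my sketch, not part of the exercise):
y_onehot_manual = np.zeros((y.shape[0], 10))
y_onehot_manual[np.arange(y.shape[0]), y.flatten() - 1] = 1  # label k -> column k-1
assert np.array_equal(y_onehot_manual, y_onehot)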
theta1, theta2 = weights['Theta1'], weights['Theta2']
theta1.shape, theta2.shape
((25, 401), (10, 26))
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def feed_forward(theta, X):
    """forward pass for the (400+1) -> (25+1) -> 10 architecture
    X: 5000 x 401 (bias column already added)
    """
    theta1 = theta['Theta1']  # (25, 401)
    theta2 = theta['Theta2']  # (10, 26)
    m = X.shape[0]
    a1 = X                                              # 5000 x 401
    z2 = a1 @ theta1.T                                  # 5000 x 25
    a2 = np.insert(sigmoid(z2), 0, np.ones(m), axis=1)  # 5000 x 26
    z3 = a2 @ theta2.T                                  # 5000 x 10
    h = sigmoid(z3)                                     # 5000 x 10, this is h_theta(X)
    return a1, z2, a2, z3, h  # all of these are needed for backprop
_, _, _, _, h = feed_forward(weights, X)
h
array([[1.12661530e-04, 1.74127856e-03, 2.52696959e-03, ...,
        4.01468105e-04, 6.48072305e-03, 9.95734012e-01],
       [4.79026796e-04, 2.41495958e-03, 3.44755685e-03, ...,
        2.39107046e-03, 1.97025086e-03, 9.95696931e-01],
       [8.85702310e-05, 3.24266731e-03, 2.55419797e-02, ...,
        6.22892325e-02, 5.49803551e-03, 9.28008397e-01],
       ...,
       [5.17641791e-02, 3.81715020e-03, 2.96297510e-02, ...,
        2.15667361e-03, 6.49826950e-01, 2.42384687e-05],
       [8.30631310e-04, 6.22003774e-04, 3.14518512e-04, ...,
        1.19366192e-02, 9.71410499e-01, 2.06173648e-04],
       [4.81465717e-05, 4.58821829e-04, 2.15146201e-05, ...,
        5.73434571e-03, 6.96288990e-01, 8.18576980e-02]])
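As a quick sanity check on the forward pass (my addition): classify with the pre-trained weights by taking the argmax over the 10 output units; the previous exercise reports roughly 97.5% training accuracy for these weights.
y_pred_pretrained = np.argmax(h, axis=1) + 1  # output units 0..9 correspond to labels 1..10
print('training accuracy with the given weights:', np.mean(y_pred_pretrained == y.flatten()))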
def cost(theta, X, y):
    m = X.shape[0]  # number of training examples
    _, _, _, _, h = feed_forward(theta, X)
    # np.multiply is element-wise
    pair_computation = -np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))
    return pair_computation.sum() / m
cost(weights, X, y_onehot)
0.2876291651613189
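For reference, this is the unregularized cost from the exercise, summed over all m = 5000 examples and K = 10 output units:
J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \sum_{k=1}^{K} \left[ -y_k^{(i)} \log\big(h_\theta(x^{(i)})_k\big) - \big(1 - y_k^{(i)}\big) \log\big(1 - h_\theta(x^{(i)})_k\big) \right]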
def regularized_cost(theta, X, y, l=1):
    """the first column of theta1 and theta2 holds the bias weights; it is left out of the regularization"""
    theta1 = theta['Theta1']
    theta2 = theta['Theta2']
    m = X.shape[0]
    reg_t1 = (l / (2 * m)) * np.power(theta1[:, 1:], 2).sum()  # skip the first column
    reg_t2 = (l / (2 * m)) * np.power(theta2[:, 1:], 2).sum()
    return cost(theta, X, y) + reg_t1 + reg_t2
regularized_cost(weights, X, y_onehot)
0.38376985909092365
def sigmoid_gradient(z):
    return np.multiply(sigmoid(z), 1 - sigmoid(z))
sigmoid_gradient(0)
0.25
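A cheap way to double-check the derivative (my addition, not in the original): compare it against a central finite difference at a few points.
eps = 1e-6
for z in (-2.0, 0.0, 3.0):
    numeric = (sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps)
    print(z, sigmoid_gradient(z), numeric)  # the two derivative estimates should match closely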
def gradient(theta, X, y):
    # initialize
    m = X.shape[0]
    theta1 = theta['Theta1']
    theta2 = theta['Theta2']
    delta1 = np.zeros(theta1.shape)  # (25, 401)
    delta2 = np.zeros(theta2.shape)  # (10, 26)
    a1, z2, a2, z3, h = feed_forward(theta, X)
    for i in range(m):
        a1i = a1[i, :]  # (401,)
        z2i = z2[i, :]  # (25,)
        a2i = a2[i, :]  # (26,)
        hi = h[i, :]    # (10,)
        yi = y[i, :]    # (10,)
        # delta3 = a3 - y (a3 is h)
        d3i = hi - yi   # (10,)
        z2i = np.insert(z2i, 0, np.ones(1))  # prepend the bias unit so the shape becomes (26,)
        d2i = np.multiply(theta2.T @ d3i, sigmoid_gradient(z2i))  # (26,)
        # np.outer forms the rank-1 outer product (replaces the deprecated np.matrix transpose trick)
        delta2 += np.outer(d3i, a2i)      # (10,) x (26,)  -> (10, 26)
        delta1 += np.outer(d2i[1:], a1i)  # drop the bias error: (25,) x (401,) -> (25, 401)
    delta1 = delta1 / m
    delta2 = delta2 / m
    return delta1, delta2
delta1, delta2 = gradient(weights, X, y_onehot)
delta1.shape, delta2.shape
((25, 401), (10, 26))
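The per-example loop above is easy to follow but slow. For reference, an equivalent fully vectorized version (my sketch, reusing feed_forward and sigmoid_gradient from above) produces the same Delta matrices with three matrix products:
def gradient_vectorized(theta, X, y):
    t1, t2 = theta['Theta1'], theta['Theta2']     # (25, 401), (10, 26)
    m = X.shape[0]
    a1, z2, a2, z3, h = feed_forward(theta, X)    # a1: (m, 401), a2: (m, 26), h: (m, 10)
    d3 = h - y                                    # (m, 10)
    d2 = (d3 @ t2)[:, 1:] * sigmoid_gradient(z2)  # drop the bias column -> (m, 25)
    return d2.T @ a1 / m, d3.T @ a2 / m           # (25, 401), (10, 26)

d1_vec, d2_vec = gradient_vectorized(weights, X, y_onehot)
print(np.allclose(d1_vec, delta1), np.allclose(d2_vec, delta2))  # both should be True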
def regularized_gradient(theta, X, y, lamb=1):
    """don't regularize the theta of the bias terms"""
    m = X.shape[0]
    delta1, delta2 = gradient(theta, X, y)
    theta1 = theta['Theta1'].copy()  # copy so the loaded weights are not modified in place
    theta2 = theta['Theta2'].copy()
    theta1[:, 0] = 0
    reg_term_d1 = (lamb / m) * theta1
    delta1 = delta1 + reg_term_d1
    theta2[:, 0] = 0  # was `t2[:, 0] = 0`, which referenced an undefined name
    reg_term_d2 = (lamb / m) * theta2
    delta2 = delta2 + reg_term_d2
    return delta1, delta2
2.6 Learning parameters using fmincg (the original exercise uses fmincg in Octave/MATLAB; here scipy.optimize.minimize with the TNC method plays that role)
def roll_param(a, b):
    """flatten the two weight matrices into one parameter vector"""
    return np.concatenate((np.ravel(a), np.ravel(b)))

def unroll_param(weights_vec):
    """turn the flat parameter vector back into ndarrays of shape (25, 401) and (10, 26)"""
    return weights_vec[:25 * 401].reshape(25, 401), weights_vec[25 * 401:].reshape(10, 26)

def random_init(size):
    return np.random.uniform(-0.12, 0.12, size)
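The 0.12 above is the epsilon_init the exercise suggests for breaking symmetry; it comes from the sqrt(6) / sqrt(L_in + L_out) heuristic applied to the 400 -> 25 layer (a small sketch of my own):
def init_epsilon(l_in, l_out):
    # heuristic from the exercise text for choosing the random-initialization range
    return np.sqrt(6) / np.sqrt(l_in + l_out)
print(init_epsilon(400, 25))  # ~0.119, rounded to 0.12 above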
def feed_forward_for_training(theta, X):
    """forward pass for the (400+1) -> (25+1) -> 10 architecture, with theta as a flat vector
    X: 5000 x 401
    """
    t1, t2 = unroll_param(theta)  # t1: (25, 401), t2: (10, 26)
    m = X.shape[0]
    a1 = X                                              # 5000 x 401
    z2 = a1 @ t1.T                                      # 5000 x 25
    a2 = np.insert(sigmoid(z2), 0, np.ones(m), axis=1)  # 5000 x 26
    z3 = a2 @ t2.T                                      # 5000 x 10
    h = sigmoid(z3)                                     # 5000 x 10, this is h_theta(X)
    return a1, z2, a2, z3, h  # all of these are needed for backprop
def cost_for_training(theta, X, y):
    m = X.shape[0]  # number of training examples
    _, _, _, _, h = feed_forward_for_training(theta, X)
    # np.multiply is element-wise
    pair_computation = -np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))
    return pair_computation.sum() / m
def regularized_cost_for_training(theta, X, y, l=1):
    t1, t2 = unroll_param(theta)  # t1: (25, 401), t2: (10, 26)
    m = X.shape[0]
    reg_t1 = (l / (2 * m)) * np.power(t1[:, 1:], 2).sum()
    reg_t2 = (l / (2 * m)) * np.power(t2[:, 1:], 2).sum()
    return cost_for_training(theta, X, y) + reg_t1 + reg_t2
def gradient_for_training(theta, X, y):
    # initialize
    t1, t2 = unroll_param(theta)  # t1: (25, 401), t2: (10, 26)
    m = X.shape[0]
    delta1 = np.zeros(t1.shape)  # (25, 401)
    delta2 = np.zeros(t2.shape)  # (10, 26)
    a1, z2, a2, z3, h = feed_forward_for_training(theta, X)
    for i in range(m):
        a1i = a1[i, :]  # (401,)
        z2i = z2[i, :]  # (25,)
        a2i = a2[i, :]  # (26,)
        hi = h[i, :]    # (10,)
        yi = y[i, :]    # (10,)
        d3i = hi - yi   # (10,)
        z2i = np.insert(z2i, 0, np.ones(1))  # prepend the bias unit so the shape becomes (26,)
        d2i = np.multiply(t2.T @ d3i, sigmoid_gradient(z2i))  # (26,)
        # np.outer forms the rank-1 outer product (replaces the deprecated np.matrix transpose trick)
        delta2 += np.outer(d3i, a2i)      # (10,) x (26,)  -> (10, 26)
        delta1 += np.outer(d2i[1:], a1i)  # drop the bias error: (25,) x (401,) -> (25, 401)
    delta1 = delta1 / m
    delta2 = delta2 / m
    return roll_param(delta1, delta2)
def regularized_gradient_for_training(theta, X, y, l=1):
    """don't regularize the theta of the bias terms"""
    m = X.shape[0]
    delta1, delta2 = unroll_param(gradient_for_training(theta, X, y))
    t1, t2 = unroll_param(theta.copy())  # copy so the optimizer's parameter vector is not modified in place
    t1[:, 0] = 0
    reg_term_d1 = (l / m) * t1
    delta1 = delta1 + reg_term_d1
    t2[:, 0] = 0
    reg_term_d2 = (l / m) * t2
    delta2 = delta2 + reg_term_d2
    return roll_param(delta1, delta2)
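Before training, the gradient check described in the exercise is worth running. The sketch below is my own, assuming the functions defined above: perturb a handful of parameters by eps = 1e-4 and compare the two-sided numerical derivative with the backprop gradient (the exercise says the relative difference should be below 1e-9).
def gradient_check(theta, X, y, eps=1e-4, n_checks=10):
    analytic = regularized_gradient_for_training(theta, X, y)
    for idx in np.random.choice(theta.size, n_checks, replace=False):
        plus, minus = theta.copy(), theta.copy()
        plus[idx] += eps
        minus[idx] -= eps
        numeric = (regularized_cost_for_training(plus, X, y)
                   - regularized_cost_for_training(minus, X, y)) / (2 * eps)
        print(idx, analytic[idx], numeric)  # the two values should agree to many decimal places

gradient_check(random_init(10285), X, y_onehot)  # cheap enough: two cost evaluations per checked parameter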
def nn_training(X, y):
    init_theta = random_init(10285)  # 25*401 + 10*26
    res = opt.minimize(fun=regularized_cost_for_training,
                       x0=init_theta,
                       args=(X, y, 1),
                       method='TNC',
                       jac=regularized_gradient_for_training,
                       options={'maxiter': 400})
    return res
res = nn_training(X, y_onehot)
res
     fun: 0.3169437668486655
     jac: array([-1.79591936e-04,  2.41673169e-08,  2.67292904e-08, ...,
       -5.31782596e-05, -6.82145999e-05, -1.94636218e-04])
 message: 'Max. number of function evaluations reached'
    nfev: 400
     nit: 28
  status: 3
 success: False
       x: array([ 0.00000000e+00,  1.20836585e-04,  1.33646452e-04, ...,
       -3.61389440e-01, -2.84696480e+00,  4.45006612e-02])
y
array([[10], [10], [10], ..., [ 9], [ 9], [ 9]], dtype=uint8)
final_theta = res.x
def show_accuracy(theta, X, y):
    _, _, _, _, h = feed_forward_for_training(theta, X)
    y_pred = np.argmax(h, axis=1) + 1  # output units 0..9 correspond to labels 1..10
    print(classification_report(y, y_pred))
show_accuracy(final_theta, X, y)
              precision    recall  f1-score   support

           1       0.99      0.95      0.97       500
           2       0.99      0.94      0.97       500
           3       0.92      0.99      0.95       500
           4       1.00      0.87      0.93       500
           5       1.00      0.76      0.86       500
           6       0.92      0.99      0.95       500
           7       0.99      0.96      0.97       500
           8       0.73      1.00      0.85       500
           9       0.98      0.91      0.94       500
          10       0.96      1.00      0.98       500

    accuracy                           0.94      5000
   macro avg       0.95      0.94      0.94      5000
weighted avg       0.95      0.94      0.94      5000
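The exercise also suggests visualizing what the hidden layer has learned. A rough sketch of my own (the 5x5 grid layout and grayscale colormap are arbitrary choices): each hidden unit's 400 input weights, bias excluded, drawn as a 20x20 image.
t1_final, _ = unroll_param(final_theta)
hidden = t1_final[:, 1:]  # (25, 400): drop the bias weight of each hidden unit
fig, axes = plt.subplots(5, 5, figsize=(6, 6), sharex=True, sharey=True)
for ax, unit in zip(axes.flat, hidden):
    ax.matshow(unit.reshape(20, 20).T, cmap='gray')
    ax.set_xticks([]); ax.set_yticks([])
plt.show()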