# https://github.com/fengdu78/Coursera-ML-AndrewNg-Notes/tree/master/code/ex4-NN%20back%20propagation
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import matplotlib
import scipy.optimize as opt
from sklearn.metrics import classification_report
from google.colab import files
import io
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
data = sio.loadmat('/content/drive/My Drive/AndrewNg-ML/ex4data1.mat')
weights = sio.loadmat('/content/drive/My Drive/AndrewNg-ML/ex4weights.mat')
data
{'X': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 '__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011',
 '__version__': '1.0',
 'y': array([[10],
        [10],
        [10],
        ...,
        [ 9],
        [ 9],
        [ 9]], dtype=uint8)}
X_raw = data['X']
y = data['y']
X_raw.shape, y.shape
((5000, 400), (5000, 1))
X = np.insert(X_raw, 0, np.ones(X_raw.shape[0]), axis=1) # insert bias term
X.shape
(5000, 401)
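A quick look at a few of the raw digits in X_raw (a sketch of my own, not part of the original notebook): each row holds a 20x20 grayscale image stored in MATLAB column-major order, hence the reshape followed by a transpose.
fig, axes = plt.subplots(2, 5, figsize=(8, 4), sharex=True, sharey=True)
sample_idx = np.random.choice(X_raw.shape[0], 10, replace=False)
for ax, idx in zip(axes.flat, sample_idx):
    ax.matshow(X_raw[idx].reshape(20, 20).T, cmap='gray')  # 400 pixels -> 20x20 image
    ax.set_xticks([]); ax.set_yticks([])
plt.show()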
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)  # note: scikit-learn >= 1.2 renames this parameter to sparse_output
y_onehot = encoder.fit_transform(y)
y_onehot.shape
(5000, 10)
y[0], y_onehot[0,:]
(array([10], dtype=uint8), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]))
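So label 10 (used for the digit '0') lands in the last one-hot column, because OneHotEncoder sorts the categories 1..10. For reference, a hand-rolled NumPy equivalent (my sketch, not part of the exercise):
y_onehot_manual = np.zeros((y.shape[0], 10))
y_onehot_manual[np.arange(y.shape[0]), y.flatten() - 1] = 1  # label k -> column k-1
assert np.array_equal(y_onehot_manual, y_onehot)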
theta1, theta2 = weights['Theta1'], weights['Theta2']
theta1.shape, theta2.shape
((25, 401), (10, 26))
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def feed_forward(theta, X):
    """forward pass for the (400+1) -> (25+1) -> 10 architecture
    X: 5000 x 401 (bias column already added)
    """
    theta1 = theta['Theta1']  # (25, 401)
    theta2 = theta['Theta2']  # (10, 26)
    m = X.shape[0]
    a1 = X                                              # 5000 x 401
    z2 = a1 @ theta1.T                                  # 5000 x 25
    a2 = np.insert(sigmoid(z2), 0, np.ones(m), axis=1)  # 5000 x 26
    z3 = a2 @ theta2.T                                  # 5000 x 10
    h = sigmoid(z3)                                     # 5000 x 10, this is h_theta(X)
    return a1, z2, a2, z3, h  # all of these are needed for backprop
_, _, _, _, h = feed_forward(weights, X)
h
array([[1.12661530e-04, 1.74127856e-03, 2.52696959e-03, ...,
        4.01468105e-04, 6.48072305e-03, 9.95734012e-01],
       [4.79026796e-04, 2.41495958e-03, 3.44755685e-03, ...,
        2.39107046e-03, 1.97025086e-03, 9.95696931e-01],
       [8.85702310e-05, 3.24266731e-03, 2.55419797e-02, ...,
        6.22892325e-02, 5.49803551e-03, 9.28008397e-01],
       ...,
       [5.17641791e-02, 3.81715020e-03, 2.96297510e-02, ...,
        2.15667361e-03, 6.49826950e-01, 2.42384687e-05],
       [8.30631310e-04, 6.22003774e-04, 3.14518512e-04, ...,
        1.19366192e-02, 9.71410499e-01, 2.06173648e-04],
       [4.81465717e-05, 4.58821829e-04, 2.15146201e-05, ...,
        5.73434571e-03, 6.96288990e-01, 8.18576980e-02]])
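As a quick sanity check on the forward pass (my addition): classify with the pre-trained weights by taking the argmax over the 10 output units; the previous exercise reports roughly 97.5% training accuracy for these weights.
y_pred_pretrained = np.argmax(h, axis=1) + 1  # output units 0..9 correspond to labels 1..10
print('training accuracy with the given weights:', np.mean(y_pred_pretrained == y.flatten()))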
def cost(theta, X, y):
    m = X.shape[0]  # number of training examples
    _, _, _, _, h = feed_forward(theta, X)
    # np.multiply is element-wise
    pair_computation = -np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))
    return pair_computation.sum() / m
cost(weights, X, y_onehot)
0.2876291651613189
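For reference, this is the unregularized cost from the exercise, summed over all m = 5000 examples and K = 10 output units:
J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \sum_{k=1}^{K} \left[ -y_k^{(i)} \log\big(h_\theta(x^{(i)})_k\big) - \big(1 - y_k^{(i)}\big) \log\big(1 - h_\theta(x^{(i)})_k\big) \right]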
def regularized_cost(theta, X, y, l=1):
    """the first column of theta1 and theta2 holds the bias weights; it is left out of the regularization"""
    theta1 = theta['Theta1']
    theta2 = theta['Theta2']
    m = X.shape[0]
    reg_t1 = (l / (2 * m)) * np.power(theta1[:, 1:], 2).sum()  # skip the first column
    reg_t2 = (l / (2 * m)) * np.power(theta2[:, 1:], 2).sum()
    return cost(theta, X, y) + reg_t1 + reg_t2
regularized_cost(weights, X, y_onehot)
0.38376985909092365
def sigmoid_gradient(z):
    return np.multiply(sigmoid(z), 1 - sigmoid(z))
sigmoid_gradient(0)
0.25
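A cheap way to double-check the derivative (my addition, not in the original): compare it against a central finite difference at a few points.
eps = 1e-6
for z in (-2.0, 0.0, 3.0):
    numeric = (sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps)
    print(z, sigmoid_gradient(z), numeric)  # the two derivative estimates should match closely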
def gradient(theta, X, y):
    # initialize
    m = X.shape[0]
    theta1 = theta['Theta1']
    theta2 = theta['Theta2']
    delta1 = np.zeros(theta1.shape)  # (25, 401)
    delta2 = np.zeros(theta2.shape)  # (10, 26)
    a1, z2, a2, z3, h = feed_forward(theta, X)
    for i in range(m):
        a1i = a1[i, :]  # (401,)
        z2i = z2[i, :]  # (25,)
        a2i = a2[i, :]  # (26,)
        hi = h[i, :]    # (10,)
        yi = y[i, :]    # (10,)
        # delta3 = a3 - y (a3 is h)
        d3i = hi - yi   # (10,)
        z2i = np.insert(z2i, 0, np.ones(1))  # prepend the bias unit so the shape becomes (26,)
        d2i = np.multiply(theta2.T @ d3i, sigmoid_gradient(z2i))  # (26,)
        # np.outer forms the rank-1 outer product (replaces the deprecated np.matrix transpose trick)
        delta2 += np.outer(d3i, a2i)      # (10,) x (26,)  -> (10, 26)
        delta1 += np.outer(d2i[1:], a1i)  # drop the bias error: (25,) x (401,) -> (25, 401)
    delta1 = delta1 / m
    delta2 = delta2 / m
    return delta1, delta2
delta1, delta2 = gradient(weights, X, y_onehot)
delta1.shape, delta2.shape
((25, 401), (10, 26))
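The per-example loop above is easy to follow but slow. For reference, an equivalent fully vectorized version (my sketch, reusing feed_forward and sigmoid_gradient from above) produces the same Delta matrices with three matrix products:
def gradient_vectorized(theta, X, y):
    t1, t2 = theta['Theta1'], theta['Theta2']     # (25, 401), (10, 26)
    m = X.shape[0]
    a1, z2, a2, z3, h = feed_forward(theta, X)    # a1: (m, 401), a2: (m, 26), h: (m, 10)
    d3 = h - y                                    # (m, 10)
    d2 = (d3 @ t2)[:, 1:] * sigmoid_gradient(z2)  # drop the bias column -> (m, 25)
    return d2.T @ a1 / m, d3.T @ a2 / m           # (25, 401), (10, 26)

d1_vec, d2_vec = gradient_vectorized(weights, X, y_onehot)
print(np.allclose(d1_vec, delta1), np.allclose(d2_vec, delta2))  # both should be True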
def regularized_gradient(theta, X, y, lamb=1):
    """don't regularize the theta of the bias terms"""
    m = X.shape[0]
    delta1, delta2 = gradient(theta, X, y)
    theta1 = theta['Theta1'].copy()  # copy so the loaded weights are not modified in place
    theta2 = theta['Theta2'].copy()
    theta1[:, 0] = 0
    reg_term_d1 = (lamb / m) * theta1
    delta1 = delta1 + reg_term_d1
    theta2[:, 0] = 0  # was `t2[:, 0] = 0`, which referenced an undefined name
    reg_term_d2 = (lamb / m) * theta2
    delta2 = delta2 + reg_term_d2
    return delta1, delta2
2.6 Learning parameters using fmincg (the original exercise uses fmincg in Octave/MATLAB; here scipy.optimize.minimize with the TNC method plays that role)
def roll_param(a, b):
    """flatten the two weight matrices into one parameter vector"""
    return np.concatenate((np.ravel(a), np.ravel(b)))

def unroll_param(weights_vec):
    """turn the flat parameter vector back into ndarrays of shape (25, 401) and (10, 26)"""
    return weights_vec[:25 * 401].reshape(25, 401), weights_vec[25 * 401:].reshape(10, 26)

def random_init(size):
    return np.random.uniform(-0.12, 0.12, size)
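The 0.12 above is the epsilon_init the exercise suggests for breaking symmetry; it comes from the sqrt(6) / sqrt(L_in + L_out) heuristic applied to the 400 -> 25 layer (a small sketch of my own):
def init_epsilon(l_in, l_out):
    # heuristic from the exercise text for choosing the random-initialization range
    return np.sqrt(6) / np.sqrt(l_in + l_out)
print(init_epsilon(400, 25))  # ~0.119, rounded to 0.12 above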
def feed_forward_for_training(theta, X):
    """forward pass for the (400+1) -> (25+1) -> 10 architecture, with theta as a flat vector
    X: 5000 x 401
    """
    t1, t2 = unroll_param(theta)  # t1: (25, 401), t2: (10, 26)
    m = X.shape[0]
    a1 = X                                              # 5000 x 401
    z2 = a1 @ t1.T                                      # 5000 x 25
    a2 = np.insert(sigmoid(z2), 0, np.ones(m), axis=1)  # 5000 x 26
    z3 = a2 @ t2.T                                      # 5000 x 10
    h = sigmoid(z3)                                     # 5000 x 10, this is h_theta(X)
    return a1, z2, a2, z3, h  # all of these are needed for backprop
def cost_for_training(theta, X, y):
    m = X.shape[0]  # number of training examples
    _, _, _, _, h = feed_forward_for_training(theta, X)
    # np.multiply is element-wise
    pair_computation = -np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))
    return pair_computation.sum() / m
def regularized_cost_for_training(theta, X, y, l=1):
    t1, t2 = unroll_param(theta)  # t1: (25, 401), t2: (10, 26)
    m = X.shape[0]
    reg_t1 = (l / (2 * m)) * np.power(t1[:, 1:], 2).sum()
    reg_t2 = (l / (2 * m)) * np.power(t2[:, 1:], 2).sum()
    return cost_for_training(theta, X, y) + reg_t1 + reg_t2
def gradient_for_training(theta, X, y):
    # initialize
    t1, t2 = unroll_param(theta)  # t1: (25, 401), t2: (10, 26)
    m = X.shape[0]
    delta1 = np.zeros(t1.shape)  # (25, 401)
    delta2 = np.zeros(t2.shape)  # (10, 26)
    a1, z2, a2, z3, h = feed_forward_for_training(theta, X)
    for i in range(m):
        a1i = a1[i, :]  # (401,)
        z2i = z2[i, :]  # (25,)
        a2i = a2[i, :]  # (26,)
        hi = h[i, :]    # (10,)
        yi = y[i, :]    # (10,)
        d3i = hi - yi   # (10,)
        z2i = np.insert(z2i, 0, np.ones(1))  # prepend the bias unit so the shape becomes (26,)
        d2i = np.multiply(t2.T @ d3i, sigmoid_gradient(z2i))  # (26,)
        # np.outer forms the rank-1 outer product (replaces the deprecated np.matrix transpose trick)
        delta2 += np.outer(d3i, a2i)      # (10,) x (26,)  -> (10, 26)
        delta1 += np.outer(d2i[1:], a1i)  # drop the bias error: (25,) x (401,) -> (25, 401)
    delta1 = delta1 / m
    delta2 = delta2 / m
    return roll_param(delta1, delta2)
def regularized_gradient_for_training(theta, X, y, l=1):
    """don't regularize the theta of the bias terms"""
    m = X.shape[0]
    delta1, delta2 = unroll_param(gradient_for_training(theta, X, y))
    t1, t2 = unroll_param(theta.copy())  # copy so the optimizer's parameter vector is not modified in place
    t1[:, 0] = 0
    reg_term_d1 = (l / m) * t1
    delta1 = delta1 + reg_term_d1
    t2[:, 0] = 0
    reg_term_d2 = (l / m) * t2
    delta2 = delta2 + reg_term_d2
    return roll_param(delta1, delta2)
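Before training, the gradient check described in the exercise is worth running. The sketch below is my own, assuming the functions defined above: perturb a handful of parameters by eps = 1e-4 and compare the two-sided numerical derivative with the backprop gradient (the exercise says the relative difference should be below 1e-9).
def gradient_check(theta, X, y, eps=1e-4, n_checks=10):
    analytic = regularized_gradient_for_training(theta, X, y)
    for idx in np.random.choice(theta.size, n_checks, replace=False):
        plus, minus = theta.copy(), theta.copy()
        plus[idx] += eps
        minus[idx] -= eps
        numeric = (regularized_cost_for_training(plus, X, y)
                   - regularized_cost_for_training(minus, X, y)) / (2 * eps)
        print(idx, analytic[idx], numeric)  # the two values should agree to many decimal places

gradient_check(random_init(10285), X, y_onehot)  # cheap enough: two cost evaluations per checked parameter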
def nn_training(X, y):
    init_theta = random_init(10285)  # 25*401 + 10*26
    res = opt.minimize(fun=regularized_cost_for_training,
                       x0=init_theta,
                       args=(X, y, 1),
                       method='TNC',
                       jac=regularized_gradient_for_training,
                       options={'maxiter': 400})
    return res
res = nn_training(X, y_onehot)
res
     fun: 0.3169437668486655
     jac: array([-1.79591936e-04,  2.41673169e-08,  2.67292904e-08, ...,
       -5.31782596e-05, -6.82145999e-05, -1.94636218e-04])
 message: 'Max. number of function evaluations reached'
    nfev: 400
     nit: 28
  status: 3
 success: False
       x: array([ 0.00000000e+00,  1.20836585e-04,  1.33646452e-04, ...,
       -3.61389440e-01, -2.84696480e+00,  4.45006612e-02])
y
array([[10], [10], [10], ..., [ 9], [ 9], [ 9]], dtype=uint8)
final_theta = res.x
def show_accuracy(theta, X, y):
    _, _, _, _, h = feed_forward_for_training(theta, X)
    y_pred = np.argmax(h, axis=1) + 1  # output units 0..9 correspond to labels 1..10
    print(classification_report(y, y_pred))
show_accuracy(final_theta, X, y)
              precision    recall  f1-score   support

           1       0.99      0.95      0.97       500
           2       0.99      0.94      0.97       500
           3       0.92      0.99      0.95       500
           4       1.00      0.87      0.93       500
           5       1.00      0.76      0.86       500
           6       0.92      0.99      0.95       500
           7       0.99      0.96      0.97       500
           8       0.73      1.00      0.85       500
           9       0.98      0.91      0.94       500
          10       0.96      1.00      0.98       500

    accuracy                           0.94      5000
   macro avg       0.95      0.94      0.94      5000
weighted avg       0.95      0.94      0.94      5000
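The exercise also suggests visualizing what the hidden layer has learned. A rough sketch of my own (the 5x5 grid layout and grayscale colormap are arbitrary choices): each hidden unit's 400 input weights, bias excluded, drawn as a 20x20 image.
t1_final, _ = unroll_param(final_theta)
hidden = t1_final[:, 1:]  # (25, 400): drop the bias weight of each hidden unit
fig, axes = plt.subplots(5, 5, figsize=(6, 6), sharex=True, sharey=True)
for ax, unit in zip(axes.flat, hidden):
    ax.matshow(unit.reshape(20, 20).T, cmap='gray')
    ax.set_xticks([]); ax.set_yticks([])
plt.show()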