# Reference: https://github.com/kaleko/CourseraML/tree/master/ex8


import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import matplotlib
import scipy.optimize as opt


from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


ex8data1 = sio.loadmat('/content/drive/My Drive/AndrewNg-ML/ex8data1.mat')


X = ex8data1['X']
ycv = ex8data1['yval']
Xcv = ex8data1['Xval']


# Visualize the data
def plotData(X, newFig=False):
    if newFig:
        plt.figure(figsize=(8,6))
    plt.plot(X[:,0], X[:,1],'b+')
    plt.xlabel('Latency [ms]',fontsize=16)
    plt.ylabel('Throughput [mb/s]',fontsize=16)
    plt.grid(True)


plotData(X)


# multivariate Gaussian
def gaussian_distribution(X, mu, sig2):   # sig2 = sigma^2
    """
    Function to compute the gaussian return values for a feature
    matrix, myX, given the already computed mu vector and sigma matrix.
    If sigma is a vector, it is turned into a diagonal matrix
    Uses a loop over rows; I didn't quite figure out a vectorized implementation.
    """
    m = X.shape[0]
    n = X.shape[1]
    if np.ndim(sig2) == 1:
        sig2 = np.diag(sig2)

    norm = 1./(np.power((2*np.pi), n/2) * np.sqrt(np.linalg.det(sig2)))
    inv = np.linalg.inv(sig2)
    exp = np.zeros((m,1))
    for irow in range(m):
        xrow = X[irow]
        exp[irow] = np.exp(-0.5*((xrow-mu).T).dot(inv).dot(xrow-mu))
    return norm*exp


def getGaussianParams(X, useMultivariate = True):
    """
    Function that given a feature matrix X that is (m x n)
    returns a mean vector and a sigmasquared vector that are
    both (n x 1) in shape.
    This can do it either as a 1D gaussian for each feature,
    or as a multivariate gaussian.
    """
    m = X.shape[0]
    mu = np.mean(X, axis=0)  
    if not useMultivariate:
        sigma2 = np.sum(np.square(X-mu),axis=0)/float(m)
        return mu, sigma2
    else:
        sigma2 = ((X-mu).T.dot(X-mu))/float(m)
        return mu, sigma2


mu, sig2 = getGaussianParams(X, useMultivariate = True)


def plotContours(mu, sigma2, newFig=False, useMultivariate = True):
    delta = .5
    x = np.arange(0,30,delta)
    y = np.arange(0,30,delta)
    meshx, meshy = np.meshgrid(x, y)
    coord_list = [ entry.ravel() for entry in (meshx, meshy) ]
    points = np.vstack(coord_list).T
    z = gaussian_distribution(points, mu, sigma2)
    #if not useMultivariate:
    #    z = gausOrthog(points, mu, sigma2)
    #else: z = gausMV(points, mu, sigma2)
    z = z.reshape((x.shape[0],x.shape[0]))

    if newFig: plt.figure(figsize=(8,6))
    
    cont_levels = [10**exp for exp in range(-20,0,3)]
    contour = plt.contour(meshx, meshy, z, levels=cont_levels)

    plt.title('Gaussian Contours',fontsize=16)


# First contours without using multivariate gaussian:
plotData(X, newFig=True)
useMV = False
plotContours(*getGaussianParams(X, useMV), newFig=False, useMultivariate = useMV)

# Then contours with multivariate gaussian:
plotData(X, newFig=True)
useMV = True
plotContours(*getGaussianParams(X, useMV), newFig=False, useMultivariate = useMV)


def computeF1(predVec, trueVec):
    """
    F1 = 2 * (P*R)/(P+R)
    where P is precision, R is recall
    Precision = "of all predicted y=1, what fraction had true y=1"
    Recall = "of all true y=1, what fraction predicted y=1?
    Note predictionVec and trueLabelVec should be boolean vectors.
    """
    #print predVec.shape
    #print trueVec.shape
    #assert predVec.shape == trueVec.shape
    
    P, R = 0., 0.
    if float(np.sum(predVec)):
        P = np.sum([int(trueVec[x]) for x in range(predVec.shape[0]) \
                    if predVec[x]]) / float(np.sum(predVec))
    if float(np.sum(trueVec)):
        R = np.sum([int(predVec[x]) for x in range(trueVec.shape[0]) \
                    if trueVec[x]]) / float(np.sum(trueVec))
        
    return 2*P*R/(P+R) if (P+R) else 0


def selectThreshold(ycv, pCVs):
    """
    Function to select the best epsilon value from the CV set
    by looping over possible epsilon values and computing the F1
    score for each.
    """
    # Make a list of possible epsilon values
    nsteps = 1000
    epses = np.linspace(np.min(pCVs),np.max(pCVs),nsteps)
    
    # Compute the F1 score for each epsilon value, and store the best 
    # F1 score (and corresponding best epsilon)
    bestF1, bestEps = 0, 0
    trueVec = (ycv == 1).flatten()
    for eps in epses:
        predVec = pCVs < eps
        thisF1 = computeF1(predVec, trueVec)
        if thisF1 > bestF1:
            bestF1 = thisF1
            bestEps = eps
            
    print ("Best F1 is , best eps is: ")
    print (bestF1,bestEps)
    return bestF1, bestEps


# Using the gaussian parameters from the full training set,
# figure out the p-value for each point in the CV set
pCVs = gaussian_distribution(Xcv, mu, sig2)

#You should see a value for epsilon of about 8.99e-05.
bestF1, bestEps = selectThreshold(ycv,pCVs)

Best F1 is , best eps is: 
0.8750000000000001 9.074844572965702e-05


def plotAnomalies(X, bestEps, newFig = False, useMultivariate = True):
    ps = gaussian_distribution(X, *getGaussianParams(X, useMultivariate))
    anoms = np.array([X[x] for x in range(X.shape[0]) if ps[x] < bestEps])
    if newFig: plt.figure(figsize=(8,6))
    plt.scatter(anoms[:,0],anoms[:,1], s=80, facecolors='none', edgecolors='r')


plotData(X, newFig=True)
plotContours(mu, sig2, newFig=False, useMultivariate=True)
plotAnomalies(X, bestEps, newFig=False, useMultivariate=True)


ex8data2 = sio.loadmat('/content/drive/My Drive/AndrewNg-ML/ex8data2.mat')


Xpart2 = ex8data2['X']
ycvpart2 = ex8data2['yval']
Xcvpart2 = ex8data2['Xval']


mu, sig2 = getGaussianParams(Xpart2, useMultivariate=False)
ps = gaussian_distribution(Xpart2, mu, sig2)
psCV = gaussian_distribution(Xcvpart2, mu, sig2)

# Using the gaussian parameters from the full training set,
# figure out the p-value for each point in the CV set
pCVs = gaussian_distribution(Xcvpart2, mu, sig2)

# You should see a value epsilon of about 1.38e-18, and 117 anomalies found.
bestF1, bestEps = selectThreshold(ycvpart2, pCVs)
anoms = [Xpart2[x] for x in range(Xpart2.shape[0]) if ps[x] < bestEps]
print ('# of anomalies found: ')
print (len(anoms))

Best F1 is , best eps is: 
0.6153846153846154 1.3786074982000245e-18
# of anomalies found: 
117