# Reference: https://github.com/fengdu78/Coursera-ML-AndrewNg-Notes
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
import io
ex2data1_upload = files.upload()
Saving ex2data1.txt to ex2data1.txt
ex2data1 = pd.read_csv(io.BytesIO(ex2data1_upload['ex2data1.txt']), header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
ex2data2_upload = files.upload()
Saving ex2data2.txt to ex2data2 (1).txt
ex2data2 = pd.read_csv(io.BytesIO(ex2data2_upload['ex2data2.txt']), header=None, names=['test1', 'test2', 'accepted'])
ex2data1.head()
| | Ones | Exam 1 | Exam 2 | Admitted |
|---|---|---|---|---|
| 0 | 1 | 34.623660 | 78.024693 | 0 |
| 1 | 1 | 30.286711 | 43.894998 | 0 |
| 2 | 1 | 35.847409 | 72.902198 | 0 |
| 3 | 1 | 60.182599 | 86.308552 | 1 |
| 4 | 1 | 79.032736 | 75.344376 | 1 |
ex2data2.head()
| | test1 | test2 | accepted |
|---|---|---|---|
| 0 | 0.051267 | 0.69956 | 1 |
| 1 | -0.092742 | 0.68494 | 1 |
| 2 | -0.213710 | 0.69225 | 1 |
| 3 | -0.375000 | 0.50219 | 1 |
| 4 | -0.513250 | 0.46564 | 1 |
admitted = ex2data1[ex2data1['Admitted'].isin([1])]
notAdmitted = ex2data1[ex2data1['Admitted'].isin([0])]
plt.figure(figsize=(10,6))
plt.scatter(admitted['Exam 1'], admitted['Exam 2'], s=50, c='b', marker='o', label='Admitted')
plt.scatter(notAdmitted['Exam 1'], notAdmitted['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
plt.xlabel('Exam 1 Score')
plt.ylabel('Exam 2 Score')
plt.legend()
plt.show()
def sigmoid(z):
return 1 / (1 + np.exp(-z))
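This is the standard logistic (sigmoid) function used as the squashing function throughout:

$$g(z) = \frac{1}{1 + e^{-z}}$$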
def cost(theta, X, y):
    m = y.size
    h = sigmoid(X.dot(theta))              # (m,) predicted probabilities
    log_h = np.log(h)
    log_one_minus_h = np.log(1 - h)
    total = np.dot(-y.T, log_h) - np.dot((1 - y).T, log_one_minus_h)  # summed cross-entropy
    return total / m
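For reference, `cost` computes the standard cross-entropy cost for logistic regression, with $h_\theta(x) = g(\theta^T x)$ and $g$ the sigmoid defined above:

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\Big[-y^{(i)}\log\big(h_\theta(x^{(i)})\big) - \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big]$$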
# add a ones column - this makes the matrix multiplication work out easier
ex2data1.insert(0, 'Ones', 1)
# set X (training data) and y (target variable)
cols = ex2data1.shape[1]
X1 = ex2data1.iloc[:, 0:cols-1]
y1 = ex2data1.iloc[:, cols-1:cols]
X1_mat = np.matrix(X1)
y1_mat = np.matrix(y1)
X1.iloc[0:5]
| | Ones | Exam 1 | Exam 2 |
|---|---|---|---|
| 0 | 1 | 34.623660 | 78.024693 |
| 1 | 1 | 30.286711 | 43.894998 |
| 2 | 1 | 35.847409 | 72.902198 |
| 3 | 1 | 60.182599 | 86.308552 |
| 4 | 1 | 79.032736 | 75.344376 |
X1_mat[0:5]
matrix([[ 1. , 34.62365962, 78.02469282],
[ 1. , 30.28671077, 43.89499752],
[ 1. , 35.84740877, 72.90219803],
[ 1. , 60.18259939, 86.3085521 ],
[ 1. , 79.03273605, 75.34437644]])
theta = np.zeros(3)
X1_mat.shape, y1_mat.shape, theta.shape
((100, 3), (100, 1), (3,))
X1.dot(theta).shape
(100,)
X1_mat.dot(theta).shape
(1, 100)
cost(theta, X1, y1)
array([0.69314718])
def gradient(theta, X, y):
    m = y.size
    h = sigmoid(X.dot(theta))          # (m,) predicted probabilities
    return (1 / m) * X.T @ (h - y)     # (n, m) @ (m,) -> (n,) gradient vector
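`gradient` evaluates the partial derivatives of the cost above; for logistic regression each component is:

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\,x_j^{(i)}$$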
X1 = X1.to_numpy()
y1 = np.squeeze(np.asarray(y1_mat))
gradient(theta, X1, y1)
array([ -0.1 , -12.00921659, -11.26284221])
import scipy.optimize as opt
theta = np.zeros(3)
res = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X1, y1))
print(res)  # fmin_tnc returns (optimal theta, number of function evaluations, return code)
(array([-25.16131855, 0.20623159, 0.20147149]), 36, 0)
res = opt.minimize(fun=cost, x0=theta, args=(X1, y1), method='Newton-CG', jac=gradient)
print(res)
fun: 0.20349770195754097
jac: array([-2.22493188e-05, -1.45448488e-03, -1.36924251e-03])
message: 'Optimization terminated successfully.'
nfev: 72
nhev: 0
nit: 28
njev: 241
status: 0
success: True
x: array([-25.15977355, 0.20621882, 0.20145941])
def predict(x, theta):
prob = sigmoid(x @ theta)
return (prob >= 0.5).astype(int)
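Because the sigmoid satisfies $g(z) \ge 0.5$ exactly when $z \ge 0$, thresholding the predicted probability at 0.5 is equivalent to checking the sign of the linear score:

$$h_\theta(x) \ge 0.5 \iff \theta^T x \ge 0$$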
final_theta = res.x
y_pred = predict(X1, final_theta)
from sklearn.metrics import classification_report
print(classification_report(y1, y_pred))
              precision    recall  f1-score   support

           0       0.87      0.85      0.86        40
           1       0.90      0.92      0.91        60

    accuracy                           0.89       100
   macro avg       0.89      0.88      0.88       100
weighted avg       0.89      0.89      0.89       100
# plot Decision Boundary
print(res.x) # this is final theta
[-25.15977355 0.20621882 0.20145941]
coef = -(res.x / res.x[2])  # boundary: theta0 + theta1*x1 + theta2*x2 = 0  =>  x2 = coef[0] + coef[1]*x1
print(coef)
[124.88755497  -1.02362462  -1.        ]
x = np.arange(130, step=0.1)
y = coef[0] + coef[1]*x
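The plotted line comes from setting the linear score to zero on the decision boundary and solving for the second feature, which is what the rescaled `coef` encodes:

$$\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0 \quad\Rightarrow\quad x_2 = -\frac{\theta_0 + \theta_1 x_1}{\theta_2}$$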
ex2data1.describe()
| | Ones | Exam 1 | Exam 2 | Admitted |
|---|---|---|---|---|
| count | 100.0 | 100.000000 | 100.000000 | 100.000000 |
| mean | 1.0 | 65.644274 | 66.221998 | 0.600000 |
| std | 0.0 | 19.458222 | 18.582783 | 0.492366 |
| min | 1.0 | 30.058822 | 30.603263 | 0.000000 |
| 25% | 1.0 | 50.919511 | 48.179205 | 0.000000 |
| 50% | 1.0 | 67.032988 | 67.682381 | 1.000000 |
| 75% | 1.0 | 80.212529 | 79.360605 | 1.000000 |
| max | 1.0 | 99.827858 | 98.869436 | 1.000000 |
import seaborn as sns
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='Exam 1', y='Exam 2', hue='Admitted', data=ex2data1,
           height=6,
           fit_reg=False,
           scatter_kws={"s": 25}
           )
plt.plot(x, y, 'grey')
plt.xlim(0, 130)
plt.ylim(0, 130)
plt.title('Decision Boundary')
plt.show()
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
df.head()
| | test1 | test2 | accepted |
|---|---|---|---|
| 0 | 0.051267 | 0.69956 | 1 |
| 1 | -0.092742 | 0.68494 | 1 |
| 2 | -0.213710 | 0.69225 | 1 |
| 3 | -0.375000 | 0.50219 | 1 |
| 4 | -0.513250 | 0.46564 | 1 |
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='test1', y='test2', hue='accepted', data=df,
           height=6,
           fit_reg=False,
           scatter_kws={"s": 50}
           )
plt.title('Regularized Logistic Regression')
plt.show()
def feature_mapping(x, y, power, as_ndarray=False):
    """Return all polynomial terms x^(i-p) * y^p with total degree <= power,
    as an ndarray or a DataFrame."""
    data = {"f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in np.arange(power + 1)      # total degree, inclusive
            for p in np.arange(i + 1)
            }
    if as_ndarray:
        return pd.DataFrame(data).values
    else:
        return pd.DataFrame(data)
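Mapping up to total degree `power` produces $(\text{power}+1)(\text{power}+2)/2$ terms, so `power=6` should yield the 28 columns reported below. A minimal sanity check, assuming only the `feature_mapping` defined above:

```python
# minimal sanity check: the degree-6 mapping of a single point gives 28 columns
assert feature_mapping(np.array([1.0]), np.array([2.0]), power=6).shape[1] == 28
```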
x1 = np.array(df.test1)
x2 = np.array(df.test2)
data = feature_mapping(x1, x2, power=6)
print(data.shape)
data.head()
(118, 28)
| | f00 | f10 | f01 | f20 | f11 | f02 | f30 | f21 | f12 | f03 | f40 | f31 | f22 | f13 | f04 | f50 | f41 | f32 | f23 | f14 | f05 | f60 | f51 | f42 | f33 | f24 | f15 | f06 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.051267 | 0.69956 | 0.002628 | 0.035864 | 0.489384 | 0.000135 | 0.001839 | 0.025089 | 0.342354 | 0.000007 | 0.000094 | 0.001286 | 0.017551 | 0.239497 | 3.541519e-07 | 0.000005 | 0.000066 | 0.000900 | 0.012278 | 0.167542 | 1.815630e-08 | 2.477505e-07 | 0.000003 | 0.000046 | 0.000629 | 0.008589 | 0.117206 |
| 1 | 1.0 | -0.092742 | 0.68494 | 0.008601 | -0.063523 | 0.469143 | -0.000798 | 0.005891 | -0.043509 | 0.321335 | 0.000074 | -0.000546 | 0.004035 | -0.029801 | 0.220095 | -6.860919e-06 | 0.000051 | -0.000374 | 0.002764 | -0.020412 | 0.150752 | 6.362953e-07 | -4.699318e-06 | 0.000035 | -0.000256 | 0.001893 | -0.013981 | 0.103256 |
| 2 | 1.0 | -0.213710 | 0.69225 | 0.045672 | -0.147941 | 0.479210 | -0.009761 | 0.031616 | -0.102412 | 0.331733 | 0.002086 | -0.006757 | 0.021886 | -0.070895 | 0.229642 | -4.457837e-04 | 0.001444 | -0.004677 | 0.015151 | -0.049077 | 0.158970 | 9.526844e-05 | -3.085938e-04 | 0.001000 | -0.003238 | 0.010488 | -0.033973 | 0.110047 |
| 3 | 1.0 | -0.375000 | 0.50219 | 0.140625 | -0.188321 | 0.252195 | -0.052734 | 0.070620 | -0.094573 | 0.126650 | 0.019775 | -0.026483 | 0.035465 | -0.047494 | 0.063602 | -7.415771e-03 | 0.009931 | -0.013299 | 0.017810 | -0.023851 | 0.031940 | 2.780914e-03 | -3.724126e-03 | 0.004987 | -0.006679 | 0.008944 | -0.011978 | 0.016040 |
| 4 | 1.0 | -0.513250 | 0.46564 | 0.263426 | -0.238990 | 0.216821 | -0.135203 | 0.122661 | -0.111283 | 0.100960 | 0.069393 | -0.062956 | 0.057116 | -0.051818 | 0.047011 | -3.561597e-02 | 0.032312 | -0.029315 | 0.026596 | -0.024128 | 0.021890 | 1.827990e-02 | -1.658422e-02 | 0.015046 | -0.013650 | 0.012384 | -0.011235 | 0.010193 |
data.describe()
| | f00 | f10 | f01 | f20 | f11 | f02 | f30 | f21 | f12 | f03 | f40 | f31 | f22 | f13 | f04 | f50 | f41 | f32 | f23 | f14 | f05 | f60 | f51 | f42 | f33 | f24 | f15 | f06 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 118.0 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 1.180000e+02 | 118.000000 | 118.000000 | 118.000000 | 1.180000e+02 | 118.000000 | 118.000000 | 118.000000 | 1.180000e+02 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 |
| mean | 1.0 | 0.054779 | 0.183102 | 0.247575 | -0.025472 | 0.301370 | 5.983333e-02 | 0.030682 | 0.015483 | 0.142350 | 1.225384e-01 | -0.005251 | 0.050433 | -0.011048 | 1.710985e-01 | 5.196507e-02 | 0.011812 | 9.432094e-03 | 0.018278 | 4.089084e-03 | 0.115710 | 7.837118e-02 | -0.000703 | 1.893340e-02 | -0.001705 | 2.259170e-02 | -0.006302 | 1.257256e-01 |
| std | 0.0 | 0.496654 | 0.519743 | 0.248532 | 0.224075 | 0.284536 | 2.746459e-01 | 0.134706 | 0.150143 | 0.326134 | 2.092709e-01 | 0.096738 | 0.068211 | 0.116735 | 2.815658e-01 | 2.148098e-01 | 0.072274 | 5.455787e-02 | 0.058513 | 9.993907e-02 | 0.299092 | 1.938621e-01 | 0.058271 | 3.430092e-02 | 0.037443 | 4.346935e-02 | 0.090621 | 2.964416e-01 |
| min | 1.0 | -0.830070 | -0.769740 | 0.000040 | -0.484096 | 0.000026 | -5.719317e-01 | -0.358121 | -0.483743 | -0.456071 | 1.612020e-09 | -0.296854 | 0.000006 | -0.483390 | 6.855856e-10 | -3.940702e-01 | -0.246068 | -1.592528e-01 | -0.142660 | -4.830370e-01 | -0.270222 | 6.472253e-14 | -0.203971 | 2.577297e-10 | -0.113448 | 2.418097e-10 | -0.482684 | 1.795116e-14 |
| 25% | 1.0 | -0.372120 | -0.254385 | 0.043243 | -0.178209 | 0.061086 | -5.155632e-02 | -0.023672 | -0.042980 | -0.016492 | 1.869975e-03 | -0.029360 | 0.004076 | -0.046392 | 3.741593e-03 | -7.147973e-03 | -0.001926 | -3.659760e-03 | -0.001400 | -7.449462e-03 | -0.001072 | 8.086369e-05 | -0.006381 | 1.258285e-04 | -0.005749 | 3.528590e-04 | -0.016662 | 2.298277e-04 |
| 50% | 1.0 | -0.006336 | 0.213455 | 0.165397 | -0.016521 | 0.252195 | -2.544062e-07 | 0.006603 | -0.000039 | 0.009734 | 2.736163e-02 | -0.000512 | 0.018229 | -0.000446 | 6.360222e-02 | -1.021440e-11 | 0.000205 | -1.473547e-07 | 0.001026 | -8.972096e-09 | 0.000444 | 4.527344e-03 | -0.000004 | 3.387050e-03 | -0.000005 | 3.921378e-03 | -0.000020 | 1.604015e-02 |
| 75% | 1.0 | 0.478970 | 0.646562 | 0.389925 | 0.100795 | 0.464189 | 1.099616e-01 | 0.086392 | 0.079510 | 0.270310 | 1.520801e-01 | 0.015050 | 0.070063 | 0.012576 | 2.155453e-01 | 2.526861e-02 | 0.019183 | 1.370560e-02 | 0.021148 | 2.751341e-02 | 0.113020 | 5.932959e-02 | 0.002104 | 2.090875e-02 | 0.001024 | 2.103622e-02 | 0.001289 | 1.001215e-01 |
| max | 1.0 | 1.070900 | 1.108900 | 1.146827 | 0.568307 | 1.229659 | 1.228137e+00 | 0.449251 | 0.505577 | 1.363569 | 1.315212e+00 | 0.369805 | 0.322973 | 0.449772 | 1.512062e+00 | 1.408460e+00 | 0.304409 | 2.451845e-01 | 0.287323 | 4.012965e-01 | 1.676725 | 1.508320e+00 | 0.250577 | 2.018260e-01 | 0.183548 | 2.556084e-01 | 0.436209 | 1.859321e+00 |
def get_y(df):
return np.array(df.iloc[:, -1])
theta = np.zeros(data.shape[1])
X = feature_mapping(x1, x2, power=6, as_ndarray=True)
print(X.shape)
y = get_y(df)
print(y.shape)
(118, 28)
(118,)
def regularized_cost(theta, X, y, l=1):
theta_j1_to_n = theta[1:]
regularized_term = (l / (2 * len(X))) * np.power(theta_j1_to_n, 2).sum()
return cost(theta, X, y) + regularized_term
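`regularized_cost` adds an L2 penalty over $\theta_1,\dots,\theta_n$ (the intercept $\theta_0$ is not penalized):

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\Big[-y^{(i)}\log\big(h_\theta(x^{(i)})\big) - \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$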
regularized_cost(theta, X, y, l=1)
0.6931471805599454
def regularized_gradient(theta, X, y, l=1):
theta_j1_to_n = theta[1:]
regularized_theta = (l / len(X)) * theta_j1_to_n
# theta_0 is not regularized, so prepend a zero for it
regularized_term = np.concatenate([np.array([0]), regularized_theta])
return gradient(theta, X, y) + regularized_term
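The matching gradient leaves $\theta_0$ unpenalized, which is why a zero is prepended to the regularization term:

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\,x_j^{(i)} + \frac{\lambda}{m}\theta_j \quad (j \ge 1), \qquad \text{with no penalty term for } j = 0$$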
regularized_gradient(theta, X, y)
array([8.47457627e-03, 1.87880932e-02, 7.77711864e-05, 5.03446395e-02,
1.15013308e-02, 3.76648474e-02, 1.83559872e-02, 7.32393391e-03,
8.19244468e-03, 2.34764889e-02, 3.93486234e-02, 2.23923907e-03,
1.28600503e-02, 3.09593720e-03, 3.93028171e-02, 1.99707467e-02,
4.32983232e-03, 3.38643902e-03, 5.83822078e-03, 4.47629067e-03,
3.10079849e-02, 3.10312442e-02, 1.09740238e-03, 6.31570797e-03,
4.08503006e-04, 7.26504316e-03, 1.37646175e-03, 3.87936363e-02])
import scipy.optimize as opt
print('init cost = {}'.format(regularized_cost(theta, X, y)))
res = opt.minimize(fun=regularized_cost, x0=theta, args=(X, y), method='Newton-CG', jac=regularized_gradient)
res
init cost = 0.6931471805599454
fun: 0.5290027297130675
jac: array([ 1.96155545e-08, 2.26570813e-08, 2.45080357e-08, 3.79471079e-08,
-4.65072849e-09, 2.89854099e-08, 3.30172156e-08, 1.72884135e-08,
2.50749741e-08, -5.81604980e-09, -1.20055820e-08, -8.01873353e-09,
-2.67385626e-08, 6.76384879e-09, -3.06403040e-08, 1.98999264e-08,
1.43220093e-08, -1.05199705e-08, -6.39393788e-09, 1.61989560e-08,
-2.82679044e-08, -1.44827983e-08, -1.28685488e-09, -1.20644688e-08,
-7.50395402e-09, -1.15134582e-08, 1.64650590e-09, -3.15937164e-08])
message: 'Optimization terminated successfully.'
nfev: 7
nhev: 0
nit: 6
njev: 62
status: 0
success: True
x: array([ 1.27273886, 0.62527109, 1.18108962, -2.01995798, -0.91742463,
-1.431661 , 0.12400861, -0.36553297, -0.35723759, -0.17513036,
-1.45815878, -0.05099011, -0.61555855, -0.27470598, -1.19281891,
-0.24218685, -0.20600467, -0.04473229, -0.27778532, -0.29537678,
-0.45635924, -1.04320478, 0.0277715 , -0.29243293, 0.01556582,
-0.32738097, -0.14388717, -0.92465458])
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))
              precision    recall  f1-score   support

           0       0.90      0.75      0.82        60
           1       0.78      0.91      0.84        58

    accuracy                           0.83       118
   macro avg       0.84      0.83      0.83       118
weighted avg       0.84      0.83      0.83       118
from sklearn import linear_model
model = linear_model.LogisticRegression(penalty='l2', C=1.0)
model.fit(X, y.ravel())
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
model.score(X, y)
0.8305084745762712
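As a hedged cross-check, the training accuracy of our own regularized fit can be computed the same way and compared with sklearn's score above:

```python
# hedged cross-check: training accuracy of the Newton-CG fit obtained earlier
print((predict(X, final_theta) == y).mean())
```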
# Plotting the decision boundary
def draw_boundary(power, l):
    """
    power: polynomial power for the mapped features
    l: lambda, the regularization constant
    """
    density = 1000
    threshold = 2 * 10**-3
    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshold)
    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6, fit_reg=False, scatter_kws={"s": 100})
    plt.scatter(x, y, c='r', s=10)
    plt.title('Decision boundary')
    plt.show()
def feature_mapped_logistic_regression(power, l):
    """For drawing purposes only; not a well-generalized logistic regression.
    power: int
        raise x1, x2 to polynomial power
    l: int
        lambda constant for the regularization term
    """
    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    x1 = np.array(df.test1)
    x2 = np.array(df.test2)
    y = get_y(df)
    X = feature_mapping(x1, x2, power, as_ndarray=True)
    theta = np.zeros(X.shape[1])
    res = opt.minimize(fun=regularized_cost,
                       x0=theta,
                       args=(X, y, l),
                       method='TNC',
                       jac=regularized_gradient)
    final_theta = res.x
    return final_theta
def find_decision_boundary(density, power, theta, threshold):
    t1 = np.linspace(-1, 1.5, density)
    t2 = np.linspace(-1, 1.5, density)
    coordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*coordinates)
    mapped_cord = feature_mapping(x_cord, y_cord, power)  # this is a DataFrame
    inner_product = mapped_cord.values @ theta
    decision = mapped_cord[np.abs(inner_product) < threshold]
    return decision.f10, decision.f01   # the raw x1, x2 coordinates of near-boundary points
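`find_decision_boundary` approximates the boundary by scoring a dense grid and keeping the points where $|\theta^T \,\mathrm{mapFeature}(x_1, x_2)|$ falls below a small threshold. A hedged alternative sketch draws the exact zero level set with `plt.contour` instead; it assumes the `feature_mapping` above and a fitted 28-element `final_theta` for `power=6` (e.g. from `feature_mapped_logistic_regression(6, 1)`) are available:

```python
# alternative sketch: score every point of a mesh grid and draw the zero contour
t = np.linspace(-1, 1.5, 250)
xx, yy = np.meshgrid(t, t)
scores = feature_mapping(xx.ravel(), yy.ravel(), power=6, as_ndarray=True) @ final_theta
plt.contour(xx, yy, scores.reshape(xx.shape), levels=[0], colors='grey')
```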
draw_boundary(power=6, l=1) #lambda=1
draw_boundary(power=6, l=0) # no regularization, over fitting
draw_boundary(power=6, l=100) # underfitting,#lambda=100