# Reference: https://github.com/fengdu78/Coursera-ML-AndrewNg-Notes
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
import io
ex2data1_upload = files.upload()
Saving ex2data1.txt to ex2data1.txt
ex2data1 = pd.read_csv(io.BytesIO(ex2data1_upload['ex2data1.txt']), header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
ex2data2_upload = files.upload()
Saving ex2data2.txt to ex2data2 (1).txt
ex2data2 = pd.read_csv(io.BytesIO(ex2data2_upload['ex2data2.txt']), header=None, names=['test1', 'test2', 'accepted'])
ex2data1.head()
| | Ones | Exam 1 | Exam 2 | Admitted |
|---|---|---|---|---|
| 0 | 1 | 34.623660 | 78.024693 | 0 |
| 1 | 1 | 30.286711 | 43.894998 | 0 |
| 2 | 1 | 35.847409 | 72.902198 | 0 |
| 3 | 1 | 60.182599 | 86.308552 | 1 |
| 4 | 1 | 79.032736 | 75.344376 | 1 |
ex2data2.head()
| | test1 | test2 | accepted |
|---|---|---|---|
| 0 | 0.051267 | 0.69956 | 1 |
| 1 | -0.092742 | 0.68494 | 1 |
| 2 | -0.213710 | 0.69225 | 1 |
| 3 | -0.375000 | 0.50219 | 1 |
| 4 | -0.513250 | 0.46564 | 1 |
admitted = ex2data1[ex2data1['Admitted'].isin([1])]
notAdmitted = ex2data1[ex2data1['Admitted'].isin([0])]
plt.figure(figsize=(10,6))
plt.scatter(admitted['Exam 1'], admitted['Exam 2'], s=50, c='b', marker='o', label='Admitted')
plt.scatter(notAdmitted['Exam 1'], notAdmitted['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
plt.xlabel('Exam 1 Score')
plt.ylabel('Exam 2 Score')
plt.legend()
plt.show()
def sigmoid(z):
return 1 / (1 + np.exp(-z))
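This is the standard logistic (sigmoid) function used as the squashing function throughout:

$$g(z) = \frac{1}{1 + e^{-z}}$$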
def cost(theta, X, y):
    m = y.size
    h = sigmoid(X.dot(theta))              # (m,) predicted probabilities
    log_h = np.log(h)
    log_one_minus_h = np.log(1 - h)
    total = np.dot(-y.T, log_h) - np.dot((1 - y).T, log_one_minus_h)  # summed cross-entropy
    return total / m
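For reference, `cost` computes the standard cross-entropy cost for logistic regression, with $h_\theta(x) = g(\theta^T x)$ and $g$ the sigmoid defined above:

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\Big[-y^{(i)}\log\big(h_\theta(x^{(i)})\big) - \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big]$$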
# add a ones column - this makes the matrix multiplication work out easier
ex2data1.insert(0, 'Ones', 1)
# set X (training data) and y (target variable)
cols = ex2data1.shape[1]
X1 = ex2data1.iloc[:, 0:cols-1]
y1 = ex2data1.iloc[:, cols-1:cols]
X1_mat = np.matrix(X1)
y1_mat = np.matrix(y1)
X1.iloc[0:5]
| | Ones | Exam 1 | Exam 2 |
|---|---|---|---|
| 0 | 1 | 34.623660 | 78.024693 |
| 1 | 1 | 30.286711 | 43.894998 |
| 2 | 1 | 35.847409 | 72.902198 |
| 3 | 1 | 60.182599 | 86.308552 |
| 4 | 1 | 79.032736 | 75.344376 |
X1_mat[0:5]
matrix([[ 1. , 34.62365962, 78.02469282],
[ 1. , 30.28671077, 43.89499752],
[ 1. , 35.84740877, 72.90219803],
[ 1. , 60.18259939, 86.3085521 ],
[ 1. , 79.03273605, 75.34437644]])
theta = np.zeros(3)
X1_mat.shape, y1_mat.shape, theta.shape
((100, 3), (100, 1), (3,))
X1.dot(theta).shape
(100,)
X1_mat.dot(theta).shape
(1, 100)
cost(theta, X1, y1)
array([0.69314718])
def gradient(theta, X, y):
    m = y.size
    h = sigmoid(X.dot(theta))          # (m,) predicted probabilities
    return (1 / m) * X.T @ (h - y)     # (n, m) @ (m,) -> (n,) gradient vector
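`gradient` evaluates the partial derivatives of the cost above; for logistic regression each component is:

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\,x_j^{(i)}$$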
X1 = X1.to_numpy()
y1 = np.squeeze(np.asarray(y1_mat))
gradient(theta, X1, y1)
array([ -0.1 , -12.00921659, -11.26284221])
import scipy.optimize as opt
theta = np.zeros(3)
res = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X1, y1))
print(res)  # fmin_tnc returns (optimal theta, number of function evaluations, return code)
(array([-25.16131855, 0.20623159, 0.20147149]), 36, 0)
res = opt.minimize(fun=cost, x0=theta, args=(X1, y1), method='Newton-CG', jac=gradient)
print(res)
fun: 0.20349770195754097
jac: array([-2.22493188e-05, -1.45448488e-03, -1.36924251e-03])
message: 'Optimization terminated successfully.'
nfev: 72
nhev: 0
nit: 28
njev: 241
status: 0
success: True
x: array([-25.15977355, 0.20621882, 0.20145941])
def predict(x, theta):
prob = sigmoid(x @ theta)
return (prob >= 0.5).astype(int)
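Because the sigmoid satisfies $g(z) \ge 0.5$ exactly when $z \ge 0$, thresholding the predicted probability at 0.5 is equivalent to checking the sign of the linear score:

$$h_\theta(x) \ge 0.5 \iff \theta^T x \ge 0$$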
final_theta = res.x
y_pred = predict(X1, final_theta)
from sklearn.metrics import classification_report
print(classification_report(y1, y_pred))
              precision    recall  f1-score   support

           0       0.87      0.85      0.86        40
           1       0.90      0.92      0.91        60

    accuracy                           0.89       100
   macro avg       0.89      0.88      0.88       100
weighted avg       0.89      0.89      0.89       100
# plot Decision Boundary
print(res.x) # this is final theta
[-25.15977355 0.20621882 0.20145941]
coef = -(res.x / res.x[2])  # boundary: theta0 + theta1*x1 + theta2*x2 = 0  =>  x2 = coef[0] + coef[1]*x1
print(coef)
[124.88755497  -1.02362462  -1.        ]
x = np.arange(130, step=0.1)
y = coef[0] + coef[1]*x
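The plotted line comes from setting the linear score to zero on the decision boundary and solving for the second feature, which is what the rescaled `coef` encodes:

$$\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0 \quad\Rightarrow\quad x_2 = -\frac{\theta_0 + \theta_1 x_1}{\theta_2}$$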
ex2data1.describe()
| | Ones | Exam 1 | Exam 2 | Admitted |
|---|---|---|---|---|
| count | 100.0 | 100.000000 | 100.000000 | 100.000000 |
| mean | 1.0 | 65.644274 | 66.221998 | 0.600000 |
| std | 0.0 | 19.458222 | 18.582783 | 0.492366 |
| min | 1.0 | 30.058822 | 30.603263 | 0.000000 |
| 25% | 1.0 | 50.919511 | 48.179205 | 0.000000 |
| 50% | 1.0 | 67.032988 | 67.682381 | 1.000000 |
| 75% | 1.0 | 80.212529 | 79.360605 | 1.000000 |
| max | 1.0 | 99.827858 | 98.869436 | 1.000000 |
import seaborn as sns
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='Exam 1', y='Exam 2', hue='Admitted', data=ex2data1,
           height=6,
           fit_reg=False,
           scatter_kws={"s": 25}
           )
plt.plot(x, y, 'grey')
plt.xlim(0, 130)
plt.ylim(0, 130)
plt.title('Decision Boundary')
plt.show()
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
df.head()
| | test1 | test2 | accepted |
|---|---|---|---|
| 0 | 0.051267 | 0.69956 | 1 |
| 1 | -0.092742 | 0.68494 | 1 |
| 2 | -0.213710 | 0.69225 | 1 |
| 3 | -0.375000 | 0.50219 | 1 |
| 4 | -0.513250 | 0.46564 | 1 |
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='test1', y='test2', hue='accepted', data=df,
           height=6,
           fit_reg=False,
           scatter_kws={"s": 50}
           )
plt.title('Regularized Logistic Regression')
plt.show()
def feature_mapping(x, y, power, as_ndarray=False):
    """Return all polynomial terms x^(i-p) * y^p with total degree <= power,
    as an ndarray or a DataFrame."""
    data = {"f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in np.arange(power + 1)      # total degree, inclusive
            for p in np.arange(i + 1)
            }
    if as_ndarray:
        return pd.DataFrame(data).values
    else:
        return pd.DataFrame(data)
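Mapping up to total degree `power` produces $(\text{power}+1)(\text{power}+2)/2$ terms, so `power=6` should yield the 28 columns reported below. A minimal sanity check, assuming only the `feature_mapping` defined above:

```python
# minimal sanity check: the degree-6 mapping of a single point gives 28 columns
assert feature_mapping(np.array([1.0]), np.array([2.0]), power=6).shape[1] == 28
```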
x1 = np.array(df.test1)
x2 = np.array(df.test2)
data = feature_mapping(x1, x2, power=6)
print(data.shape)
data.head()
(118, 28)
| | f00 | f10 | f01 | f20 | f11 | f02 | f30 | f21 | f12 | f03 | f40 | f31 | f22 | f13 | f04 | f50 | f41 | f32 | f23 | f14 | f05 | f60 | f51 | f42 | f33 | f24 | f15 | f06 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.051267 | 0.69956 | 0.002628 | 0.035864 | 0.489384 | 0.000135 | 0.001839 | 0.025089 | 0.342354 | 0.000007 | 0.000094 | 0.001286 | 0.017551 | 0.239497 | 3.541519e-07 | 0.000005 | 0.000066 | 0.000900 | 0.012278 | 0.167542 | 1.815630e-08 | 2.477505e-07 | 0.000003 | 0.000046 | 0.000629 | 0.008589 | 0.117206 |
| 1 | 1.0 | -0.092742 | 0.68494 | 0.008601 | -0.063523 | 0.469143 | -0.000798 | 0.005891 | -0.043509 | 0.321335 | 0.000074 | -0.000546 | 0.004035 | -0.029801 | 0.220095 | -6.860919e-06 | 0.000051 | -0.000374 | 0.002764 | -0.020412 | 0.150752 | 6.362953e-07 | -4.699318e-06 | 0.000035 | -0.000256 | 0.001893 | -0.013981 | 0.103256 |
| 2 | 1.0 | -0.213710 | 0.69225 | 0.045672 | -0.147941 | 0.479210 | -0.009761 | 0.031616 | -0.102412 | 0.331733 | 0.002086 | -0.006757 | 0.021886 | -0.070895 | 0.229642 | -4.457837e-04 | 0.001444 | -0.004677 | 0.015151 | -0.049077 | 0.158970 | 9.526844e-05 | -3.085938e-04 | 0.001000 | -0.003238 | 0.010488 | -0.033973 | 0.110047 |
| 3 | 1.0 | -0.375000 | 0.50219 | 0.140625 | -0.188321 | 0.252195 | -0.052734 | 0.070620 | -0.094573 | 0.126650 | 0.019775 | -0.026483 | 0.035465 | -0.047494 | 0.063602 | -7.415771e-03 | 0.009931 | -0.013299 | 0.017810 | -0.023851 | 0.031940 | 2.780914e-03 | -3.724126e-03 | 0.004987 | -0.006679 | 0.008944 | -0.011978 | 0.016040 |
| 4 | 1.0 | -0.513250 | 0.46564 | 0.263426 | -0.238990 | 0.216821 | -0.135203 | 0.122661 | -0.111283 | 0.100960 | 0.069393 | -0.062956 | 0.057116 | -0.051818 | 0.047011 | -3.561597e-02 | 0.032312 | -0.029315 | 0.026596 | -0.024128 | 0.021890 | 1.827990e-02 | -1.658422e-02 | 0.015046 | -0.013650 | 0.012384 | -0.011235 | 0.010193 |
data.describe()
| | f00 | f10 | f01 | f20 | f11 | f02 | f30 | f21 | f12 | f03 | f40 | f31 | f22 | f13 | f04 | f50 | f41 | f32 | f23 | f14 | f05 | f60 | f51 | f42 | f33 | f24 | f15 | f06 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 118.0 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 1.180000e+02 | 118.000000 | 118.000000 | 118.000000 | 1.180000e+02 | 118.000000 | 118.000000 | 118.000000 | 1.180000e+02 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 |
| mean | 1.0 | 0.054779 | 0.183102 | 0.247575 | -0.025472 | 0.301370 | 5.983333e-02 | 0.030682 | 0.015483 | 0.142350 | 1.225384e-01 | -0.005251 | 0.050433 | -0.011048 | 1.710985e-01 | 5.196507e-02 | 0.011812 | 9.432094e-03 | 0.018278 | 4.089084e-03 | 0.115710 | 7.837118e-02 | -0.000703 | 1.893340e-02 | -0.001705 | 2.259170e-02 | -0.006302 | 1.257256e-01 |
| std | 0.0 | 0.496654 | 0.519743 | 0.248532 | 0.224075 | 0.284536 | 2.746459e-01 | 0.134706 | 0.150143 | 0.326134 | 2.092709e-01 | 0.096738 | 0.068211 | 0.116735 | 2.815658e-01 | 2.148098e-01 | 0.072274 | 5.455787e-02 | 0.058513 | 9.993907e-02 | 0.299092 | 1.938621e-01 | 0.058271 | 3.430092e-02 | 0.037443 | 4.346935e-02 | 0.090621 | 2.964416e-01 |
| min | 1.0 | -0.830070 | -0.769740 | 0.000040 | -0.484096 | 0.000026 | -5.719317e-01 | -0.358121 | -0.483743 | -0.456071 | 1.612020e-09 | -0.296854 | 0.000006 | -0.483390 | 6.855856e-10 | -3.940702e-01 | -0.246068 | -1.592528e-01 | -0.142660 | -4.830370e-01 | -0.270222 | 6.472253e-14 | -0.203971 | 2.577297e-10 | -0.113448 | 2.418097e-10 | -0.482684 | 1.795116e-14 |
| 25% | 1.0 | -0.372120 | -0.254385 | 0.043243 | -0.178209 | 0.061086 | -5.155632e-02 | -0.023672 | -0.042980 | -0.016492 | 1.869975e-03 | -0.029360 | 0.004076 | -0.046392 | 3.741593e-03 | -7.147973e-03 | -0.001926 | -3.659760e-03 | -0.001400 | -7.449462e-03 | -0.001072 | 8.086369e-05 | -0.006381 | 1.258285e-04 | -0.005749 | 3.528590e-04 | -0.016662 | 2.298277e-04 |
| 50% | 1.0 | -0.006336 | 0.213455 | 0.165397 | -0.016521 | 0.252195 | -2.544062e-07 | 0.006603 | -0.000039 | 0.009734 | 2.736163e-02 | -0.000512 | 0.018229 | -0.000446 | 6.360222e-02 | -1.021440e-11 | 0.000205 | -1.473547e-07 | 0.001026 | -8.972096e-09 | 0.000444 | 4.527344e-03 | -0.000004 | 3.387050e-03 | -0.000005 | 3.921378e-03 | -0.000020 | 1.604015e-02 |
| 75% | 1.0 | 0.478970 | 0.646562 | 0.389925 | 0.100795 | 0.464189 | 1.099616e-01 | 0.086392 | 0.079510 | 0.270310 | 1.520801e-01 | 0.015050 | 0.070063 | 0.012576 | 2.155453e-01 | 2.526861e-02 | 0.019183 | 1.370560e-02 | 0.021148 | 2.751341e-02 | 0.113020 | 5.932959e-02 | 0.002104 | 2.090875e-02 | 0.001024 | 2.103622e-02 | 0.001289 | 1.001215e-01 |
| max | 1.0 | 1.070900 | 1.108900 | 1.146827 | 0.568307 | 1.229659 | 1.228137e+00 | 0.449251 | 0.505577 | 1.363569 | 1.315212e+00 | 0.369805 | 0.322973 | 0.449772 | 1.512062e+00 | 1.408460e+00 | 0.304409 | 2.451845e-01 | 0.287323 | 4.012965e-01 | 1.676725 | 1.508320e+00 | 0.250577 | 2.018260e-01 | 0.183548 | 2.556084e-01 | 0.436209 | 1.859321e+00 |
def get_y(df):
return np.array(df.iloc[:, -1])
theta = np.zeros(data.shape[1])
X = feature_mapping(x1, x2, power=6, as_ndarray=True)
print(X.shape)
y = get_y(df)
print(y.shape)
(118, 28)
(118,)
def regularized_cost(theta, X, y, l=1):
theta_j1_to_n = theta[1:]
regularized_term = (l / (2 * len(X))) * np.power(theta_j1_to_n, 2).sum()
return cost(theta, X, y) + regularized_term
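`regularized_cost` adds an L2 penalty over $\theta_1,\dots,\theta_n$ (the intercept $\theta_0$ is not penalized):

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\Big[-y^{(i)}\log\big(h_\theta(x^{(i)})\big) - \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$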
regularized_cost(theta, X, y, l=1)
0.6931471805599454
def regularized_gradient(theta, X, y, l=1):
theta_j1_to_n = theta[1:]
regularized_theta = (l / len(X)) * theta_j1_to_n
# theta_0 is not regularized, so prepend a zero for it
regularized_term = np.concatenate([np.array([0]), regularized_theta])
return gradient(theta, X, y) + regularized_term
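The matching gradient leaves $\theta_0$ unpenalized, which is why a zero is prepended to the regularization term:

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\,x_j^{(i)} + \frac{\lambda}{m}\theta_j \quad (j \ge 1), \qquad \text{with no penalty term for } j = 0$$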
regularized_gradient(theta, X, y)
array([8.47457627e-03, 1.87880932e-02, 7.77711864e-05, 5.03446395e-02,
1.15013308e-02, 3.76648474e-02, 1.83559872e-02, 7.32393391e-03,
8.19244468e-03, 2.34764889e-02, 3.93486234e-02, 2.23923907e-03,
1.28600503e-02, 3.09593720e-03, 3.93028171e-02, 1.99707467e-02,
4.32983232e-03, 3.38643902e-03, 5.83822078e-03, 4.47629067e-03,
3.10079849e-02, 3.10312442e-02, 1.09740238e-03, 6.31570797e-03,
4.08503006e-04, 7.26504316e-03, 1.37646175e-03, 3.87936363e-02])
import scipy.optimize as opt
print('init cost = {}'.format(regularized_cost(theta, X, y)))
res = opt.minimize(fun=regularized_cost, x0=theta, args=(X, y), method='Newton-CG', jac=regularized_gradient)
res
init cost = 0.6931471805599454
fun: 0.5290027297130675
jac: array([ 1.96155545e-08, 2.26570813e-08, 2.45080357e-08, 3.79471079e-08,
-4.65072849e-09, 2.89854099e-08, 3.30172156e-08, 1.72884135e-08,
2.50749741e-08, -5.81604980e-09, -1.20055820e-08, -8.01873353e-09,
-2.67385626e-08, 6.76384879e-09, -3.06403040e-08, 1.98999264e-08,
1.43220093e-08, -1.05199705e-08, -6.39393788e-09, 1.61989560e-08,
-2.82679044e-08, -1.44827983e-08, -1.28685488e-09, -1.20644688e-08,
-7.50395402e-09, -1.15134582e-08, 1.64650590e-09, -3.15937164e-08])
message: 'Optimization terminated successfully.'
nfev: 7
nhev: 0
nit: 6
njev: 62
status: 0
success: True
x: array([ 1.27273886, 0.62527109, 1.18108962, -2.01995798, -0.91742463,
-1.431661 , 0.12400861, -0.36553297, -0.35723759, -0.17513036,
-1.45815878, -0.05099011, -0.61555855, -0.27470598, -1.19281891,
-0.24218685, -0.20600467, -0.04473229, -0.27778532, -0.29537678,
-0.45635924, -1.04320478, 0.0277715 , -0.29243293, 0.01556582,
-0.32738097, -0.14388717, -0.92465458])
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))
              precision    recall  f1-score   support

           0       0.90      0.75      0.82        60
           1       0.78      0.91      0.84        58

    accuracy                           0.83       118
   macro avg       0.84      0.83      0.83       118
weighted avg       0.84      0.83      0.83       118
from sklearn import linear_model
model = linear_model.LogisticRegression(penalty='l2', C=1.0)
model.fit(X, y.ravel())
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
model.score(X, y)
0.8305084745762712
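As a hedged cross-check, the training accuracy of our own regularized fit can be computed the same way and compared with sklearn's score above:

```python
# hedged cross-check: training accuracy of the Newton-CG fit obtained earlier
print((predict(X, final_theta) == y).mean())
```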
# Plotting the decision boundary
def draw_boundary(power, l):
    """
    power: polynomial power for the mapped features
    l: lambda, the regularization constant
    """
    density = 1000
    threshold = 2 * 10**-3
    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshold)
    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6, fit_reg=False, scatter_kws={"s": 100})
    plt.scatter(x, y, c='r', s=10)
    plt.title('Decision boundary')
    plt.show()
def feature_mapped_logistic_regression(power, l):
    """For drawing purposes only; not a well-generalized logistic regression.
    power: int
        raise x1, x2 to polynomial power
    l: int
        lambda constant for the regularization term
    """
    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    x1 = np.array(df.test1)
    x2 = np.array(df.test2)
    y = get_y(df)
    X = feature_mapping(x1, x2, power, as_ndarray=True)
    theta = np.zeros(X.shape[1])
    res = opt.minimize(fun=regularized_cost,
                       x0=theta,
                       args=(X, y, l),
                       method='TNC',
                       jac=regularized_gradient)
    final_theta = res.x
    return final_theta
def find_decision_boundary(density, power, theta, threshold):
    t1 = np.linspace(-1, 1.5, density)
    t2 = np.linspace(-1, 1.5, density)
    coordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*coordinates)
    mapped_cord = feature_mapping(x_cord, y_cord, power)  # this is a DataFrame
    inner_product = mapped_cord.values @ theta
    decision = mapped_cord[np.abs(inner_product) < threshold]
    return decision.f10, decision.f01   # the raw x1, x2 coordinates of near-boundary points
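`find_decision_boundary` approximates the boundary by scoring a dense grid and keeping the points where $|\theta^T \,\mathrm{mapFeature}(x_1, x_2)|$ falls below a small threshold. A hedged alternative sketch draws the exact zero level set with `plt.contour` instead; it assumes the `feature_mapping` above and a fitted 28-element `final_theta` for `power=6` (e.g. from `feature_mapped_logistic_regression(6, 1)`) are available:

```python
# alternative sketch: score every point of a mesh grid and draw the zero contour
t = np.linspace(-1, 1.5, 250)
xx, yy = np.meshgrid(t, t)
scores = feature_mapping(xx.ravel(), yy.ravel(), power=6, as_ndarray=True) @ final_theta
plt.contour(xx, yy, scores.reshape(xx.shape), levels=[0], colors='grey')
```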
draw_boundary(power=6, l=1) #lambda=1
draw_boundary(power=6, l=0) # no regularization, over fitting
draw_boundary(power=6, l=100) # underfitting,#lambda=100