Binary classification with a single feature¶
Classification using "raw" Python or libraries (scikit-learn, TensorFlow).
The classification is first performed on a single decision boundary defined by a continuous univariate function with added white noise.
import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as pltcolors
from sklearn import metrics as metrics
from sklearn.linear_model import LogisticRegression as SkLogReg
import scipy as sy
import seaborn as sns
import pandas
import tensorflow as tf
Model¶
We want to measure or predict whether a value y is above a threshold. E.g.: y is a temperature.
We observe a feature x; y is related to x through a polynomial function that we do not know a priori, plus some unknown perturbation.
This unknown perturbation is modeled by Gaussian noise.
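In formulas, the data-generating process implemented below is:
\begin{align}
y &= 2\left(x^4 + (x-0.3)^3 + b\right) + \epsilon, \quad \epsilon \sim \mathcal{N}(0, \sigma^2) \\
\text{label} &= \mathbb{1}\left[\, y > \text{threshold} \,\right]
\end{align}
with $b = 0.2$, $\sigma = 0.2$ and $\text{threshold} = 1$, matching the constants in the code below.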
# Single feature, Gaussian noise
nFeatures = 1

def generateBatch(N):
    xMin = 0
    xMax = 1
    b = 0.2
    std = 0.2
    # Decision threshold on the noisy y
    threshold = 1
    # x is drawn uniformly on [xMin, xMax]
    x = np.random.uniform(xMin, xMax, N)
    # 4th degree polynomial relation between y and x
    yClean = 2*(x**4 + (x-0.3)**3 + b)
    # The label is true when the noisy y exceeds the threshold
    labels = yClean + np.random.normal(0, std, N) > threshold
    return (x, yClean, labels)
The values of x are independent and uniformly distributed.
N = 2000
# x and y are scalars in R, the label is Boolean
xTrain, yCleanTrain, labelTrain = generateBatch(N)
colors = ['blue','red']
fig = plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plt.scatter(xTrain, yCleanTrain, c=labelTrain, cmap=pltcolors.ListedColormap(colors), marker=',', alpha=0.01)
plt.xlabel('x')
plt.ylabel('y')
plt.grid()
plt.subplot(1,2,2)
plt.scatter(xTrain, labelTrain, marker=',', alpha=0.01)
plt.xlabel('x')
plt.ylabel('label')
plt.grid()
count, bins, ignored = plt.hist(labelTrain*1.0, 10, density=True, alpha=0.5)
p = np.mean(labelTrain)
print('Bernoulli parameter of the distribution:', p)
Note: the two label values are not a priori equiprobable. In theory, resampling of the training values would be required to balance the a priori distribution.
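For illustration only (not used in the rest of this notebook), a minimal sketch of such a rebalancing by undersampling the majority class:
# Illustrative rebalancing sketch: undersample the majority class so that
# both labels are equally represented in the training set
idxPos = np.where(labelTrain)[0]
idxNeg = np.where(~labelTrain)[0]
nMin = min(len(idxPos), len(idxNeg))
idxBalanced = np.concatenate((np.random.choice(idxPos, nMin, replace=False),
                              np.random.choice(idxNeg, nMin, replace=False)))
xTrainBalanced, labelTrainBalanced = xTrain[idxBalanced], labelTrain[idxBalanced]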
xTest, yTest, labelTest = generateBatch(N)
Helpers¶
def plotHeatMap(X, classes, title=None, fmt='.2g', ax=None, xlabel=None, ylabel=None):
    """ Fix heatmap plot from Seaborn with pyplot 3.1.0, 3.1.1
        https://stackoverflow.com/questions/56942670/matplotlib-seaborn-first-and-last-row-cut-in-half-of-heatmap-plot
    """
    ax = sns.heatmap(X, xticklabels=classes, yticklabels=classes, annot=True, fmt=fmt, cmap=plt.cm.Blues, ax=ax)  # notation: "annot", not "annote"
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
    if title:
        ax.set_title(title)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)

def plotConfusionMatrix(yTrue, yEst, classes, title=None, fmt='.2g', ax=None):
    plotHeatMap(metrics.confusion_matrix(yTrue, yEst), classes, title, fmt, ax,
                xlabel='Estimations', ylabel='True values')
Logistic and log of Logistic functions¶
def logistic(X):
    return 1 / (1 + np.exp(-X))
xx = np.linspace(-10, 10)
xlogistic = logistic(xx)
plt.figure(figsize=(10,5))
plt.subplot(1, 2, 1)
plt.plot(xx, xlogistic)
plt.grid()
plt.subplot(1, 2, 2)
plt.plot(xx, np.log(xlogistic))
plt.grid()
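Side note on numerical stability (an aside, not needed on this range of x): computing np.log(logistic(x)) can underflow for large negative x; the equivalent expression $-\log(1 + e^{-x})$ is evaluated stably with np.logaddexp.
# log(logistic(x)) = -log(1 + exp(-x)); logaddexp evaluates this stably
print(np.allclose(np.log(logistic(xx)), -np.logaddexp(0.0, -xx)))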
Logistic regression¶
\begin{align}
y &\in \left\{ 0, 1 \right\}
\end{align}
\begin{align}
p(Y=1 \mid x) &= \frac{1}{1+e^{-f_\theta(x)}} \\
f_\theta(x) &= b + w x \\
\theta &= \{b, w\}
\end{align}
We are looking for the value of $\theta$ that maximizes the likelihood:
\begin{align}
\hat{\theta} &= \arg\max_{\theta}{\prod_{i=0}^N{p(y_i \mid x_i, \theta)}} \\
&= \arg\max_{\theta}{\sum_{i=0}^N{\log \left(p(y_i \mid x_i, \theta)\right)}} \\
&= \arg\max_{\theta}{\sum_{i=0}^N{\log \left(\left(\frac{1}{1+e^{-f_\theta(x_i)}}\right)^{y_i}\left(1-\frac{1}{1+e^{-f_\theta(x_i)}}\right)^{1-y_i}\right)}} \\
&= \arg\max_{\theta}{\sum_{i=0}^N{\log \left(y_i \left(\frac{1}{1+e^{-f_\theta(x_i)}}\right) + \left(1-y_i\right) \left(1-\frac{1}{1+e^{-f_\theta(x_i)}}\right)\right)}}
\end{align}
The last line uses the fact that $y_i$ is either 0 or 1: exactly one of the two terms inside each logarithm is zero, so this formulation avoids taking the logarithm of zero.
Since there are two classes, the negative log-likelihood is also called the binary cross-entropy.
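A quick numerical illustration of the equivalence used in the last line (pEx and yEx are made-up example values):
# For y in {0, 1}, log(y*p + (1-y)*(1-p)) equals y*log(p) + (1-y)*log(1-p)
pEx = np.array([0.9, 0.2, 0.7])
yEx = np.array([1.0, 0.0, 1.0])
print(np.allclose(np.log(yEx*pEx + (1-yEx)*(1-pEx)),
                  yEx*np.log(pEx) + (1-yEx)*np.log(1-pEx)))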
Fitting of $b$ and then $w$¶
This fitting is suboptimal:
- first take an assumption on $w$ and fit $b$, yielding the estimate $\hat{b}$
- then fit $w$ using the $\hat{b}$ estimate
b = np.linspace(-5, 5)
w = 1
px = np.zeros(len(b))
for i in range(len(b)):
    fx = logistic(b[i] + w*xTrain)
    px[i] = 1/N * np.sum(np.log(labelTrain*fx + (1-labelTrain)*(1-fx)))
plt.plot(b, px)
plt.xlabel('$b$')
plt.ylabel('$l(b, X)$')
plt.grid()
bHat = b[np.argmax(px)]
print('Estimate b =', bHat)
w = np.linspace(-20, 20)
px = np.zeros(len(w))
for i in range(len(w)):
    fx = logistic(bHat + w[i]*xTrain)
    px[i] = 1/N * np.sum(np.log(labelTrain*fx + (1-labelTrain)*(1-fx)))
plt.plot(w, px)
plt.xlabel('$w$')
plt.ylabel('$l(w, X)$')
plt.grid()
wHat = w[np.argmax(px)]
print('Estimate w =', wHat)
pXTest0 = logistic(bHat + wHat * xTest)
labelEst0 = pXTest0 > 0.5
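Since there is a single feature, thresholding the probability at 0.5 is equivalent to thresholding x at the point where $\hat{b} + \hat{w} x = 0$ (assuming $\hat{w} > 0$):
# p(x) > 0.5  <=>  bHat + wHat*x > 0  <=>  x > -bHat/wHat (for wHat > 0)
print('Decision boundary: x* =', -bHat / wHat)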
plt.scatter(xTest, pXTest0, c=labelEst0, cmap=pltcolors.ListedColormap(colors), marker=',', alpha=0.01);
plt.scatter(xTest, yTest/np.max(yTest), c = labelTest, cmap=pltcolors.ListedColormap(colors), marker='x', alpha=0.01);
plt.xlabel('x')
plt.legend(('Estimated probability', 'Normalized model'));
plt.hist(labelEst0*1.0, 10, density=True)
print('Bernoulli parameter =', np.mean(labelEst0))
accuracy0 = np.sum(labelTest == labelEst0)/N
print('Accuracy =', accuracy0)
Precision¶
$p(y = 1 \mid \hat{y} = 1)$
print('Precision =', np.sum(labelTest[labelEst0 == 1])/np.sum(labelEst0))
Recall¶
$p(\hat{y} = 1 \mid y = 1)$
print('Recall =', np.sum(labelTest[labelEst0 == 1])/np.sum(labelTest))
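These two values can be cross-checked against the scikit-learn implementations (using the metrics module already imported):
print('Precision (sklearn) =', metrics.precision_score(labelTest, labelEst0))
print('Recall (sklearn) =', metrics.recall_score(labelTest, labelEst0))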
Confusion matrix¶
plotConfusionMatrix(labelTest, labelEst0, np.array(['Blue', 'Red']));
print(metrics.classification_report(labelTest, labelEst0))
SciKit Learn¶
References:
- scikit-learn documentation
- https://www.geeksforgeeks.org/ml-logistic-regression-using-python/
model1 = SkLogReg(solver='lbfgs')
model1.fit(xTrain.reshape(-1,1), labelTrain)
model1.coef_
labelEst1 = model1.predict(xTest.reshape(-1,1))
print('Accuracy =',model1.score(xTest.reshape(-1,1), labelTest))
plt.hist(labelEst1*1.0, 10, density=True)
print('Bernoulli parameter =', np.mean(labelEst1))
Confusion matrix (plot)¶
plotConfusionMatrix(labelTest, labelEst1, np.array(['Blue', 'Red']))
Classification report¶
print(metrics.classification_report(labelTest, labelEst1))
ROC curve¶
# The ROC AUC must be computed on the predicted probabilities, not the hard labels
probaTest1 = model1.predict_proba(xTest.reshape(-1,1))[:,1]
logit_roc_auc = metrics.roc_auc_score(labelTest, probaTest1)
fpr, tpr, thresholds = metrics.roc_curve(labelTest, probaTest1)
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right");
Using TensorFlow 2.0¶
TensorFlow 2.0 offers many ways to design a sequential model, from the high-level Keras API down to functional code close to the syntax of TensorFlow 1.0.
The following design shows how to implement a custom layer within a Keras Sequential pipeline, and how to implement a custom metric. This is the favoured way to implement custom code in TensorFlow 2.0.
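For comparison only, a minimal sketch of the purely high-level route (a Dense layer with sigmoid activation and the built-in binary cross-entropy loss; this is an alternative to, not the method of, the custom implementation below):
# Alternative sketch (not used below): logistic regression with the
# high-level Keras API
modelHL = tf.keras.Sequential([
    tf.keras.layers.Dense(1, activation='sigmoid', input_shape=[nFeatures])
])
modelHL.compile(optimizer=tf.keras.optimizers.SGD(0.01),
                loss='binary_crossentropy',
                metrics=['accuracy'])
# modelHL.fit(xTrain.reshape(-1, 1), labelTrain * 1.0, epochs=10, batch_size=100)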
# Labels as floats {0., 1.}
labelTrainF = np.multiply(labelTrain, 1.0)
labelTrainF.dtype, labelTrainF.shape
# (Mini) Batch size
nBatch = 100
# Number of batches per Epoch
nBatchPerEpoch = 20
# Number of epochs
nEpochMax = 1000
# Simple custom layer exposing the logistic regression model
class MyLogisticRegressionLayer(tf.keras.layers.Layer):
    def __init__(self, *args, **kwargs):
        super(MyLogisticRegressionLayer, self).__init__(*args, **kwargs)

    def build(self, input_shape):
        self.w = self.add_weight(
            shape=input_shape[0],
            dtype=self.dtype,
            initializer=tf.keras.initializers.ones(),
            #regularizer=tf.keras.regularizers.l2(0.02),
            trainable=True)
        self.b = self.add_weight(
            shape=1,
            dtype=self.dtype,
            initializer=tf.keras.initializers.ones(),
            #regularizer=tf.keras.regularizers.l2(0.02),
            trainable=True)

    @tf.function
    def call(self, x, training=None):
        return tf.math.sigmoid(tf.math.add(tf.math.multiply(x, self.w), self.b))
# Using TensorFlow 2.0 style of metrics to implement accuracy
class MyBinaryAccuracy(tf.keras.metrics.Metric):
    def __init__(self, name='my_accuracy', **kwargs):
        super(MyBinaryAccuracy, self).__init__(name=name, **kwargs)
        self.accuracySum = self.add_weight(name='accuracySum', initializer='zeros')
        self.accuracyCount = self.add_weight(name='accuracyCount', initializer='zeros')

    def update_state(self, labels, yEst):
        labels = tf.cast(labels, tf.bool)
        labelEst = tf.greater(yEst, 0.5)
        # Accumulate the number of correct estimations and the number of samples seen
        values = tf.cast(tf.equal(labels, labelEst), self.dtype)
        self.accuracySum.assign_add(tf.reduce_sum(values))
        self.accuracyCount.assign_add(values.get_shape()[0])

    def result(self):
        return self.accuracySum / self.accuracyCount
# Model 1, instantiate the custom layer
model1 = tf.keras.Sequential([MyLogisticRegressionLayer(input_shape=[nFeatures], dtype="float64")])
# Stochastic Gradient Descent Optimizer
optim1 = tf.keras.optimizers.SGD(0.01)
# Perform a train step on a mini-batch
# @tf.function traces this function and compiles it into a graph on its first
# execution; subsequent calls reuse the compiled graph
@tf.function
def trainStep1(x, labels):
    with tf.GradientTape() as tape:
        predictions = model1(x, training=True)
        loss = -tf.reduce_sum(tf.math.log((labels * predictions) + ((1 - labels) * (1 - predictions))))
        #loss = tf.keras.losses.categorical_crossentropy(labels, predictions)
    gradients = tape.gradient(loss, model1.trainable_variables)
    optim1.apply_gradients(zip(gradients, model1.trainable_variables))
    return loss, predictions
# Initialize values and loop on epochs and mini-batches
histo = []
accuracy = MyBinaryAccuracy()
for epoch in range(nEpochMax):
    cost_cumul = 0
    accuracy.reset_states()
    for b in range(0, nBatchPerEpoch*nBatch, nBatch):
        cost, predictions = trainStep1(xTrain[b : b + nBatch], labelTrainF[b : b + nBatch])
        cost_cumul += cost
        accuracy.update_state(labelTrainF[b : b + nBatch], predictions)
    cost_epoch = cost_cumul / nBatchPerEpoch
    W = model1.get_weights()
    histo.append((cost_epoch.numpy(), accuracy.result().numpy(), W[1][0], W[0]))
print("Predicted model: {b:.3f} + {w:.3f} x, num epochs={c}".format(w=W[0], b=W[1][0], c=len(histo)))
# Save history as a pandas DataFrame
df = pandas.DataFrame(histo, columns = ('cost', 'accuracy', 'b', 'w0'))
SGD shows that there is not a single optimal pair (b, w) (intercept + slope) but a whole line of near-optimal values, as shown on the graph below. This is explained by the single feature: the decision boundary is a single point on the x axis, not a straight line, so many (b, w) pairs lead to the same classification.
plt.scatter(df['b'], df['w0'], marker='.', alpha=0.2);
plt.xlabel('intercept')
plt.ylabel('weight');
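A quick check of this interpretation (a sketch; it assumes each entry of the 'w0' column is a length-1 weight array, as stored above): the visited (b, w) pairs should share roughly the same decision point -b/w.
# Decision point -b/w along the SGD trajectory (last 100 epochs)
w0Hist = np.array([wi[0] for wi in df['w0']])
print('Mean decision point -b/w:', np.mean(-df['b'].values[-100:] / w0Hist[-100:]))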
fig, ax = plt.subplots(1,2, figsize=(16, 4))
ax[0].plot(df['cost'])
ax[0].grid()
ax[1].plot(df['accuracy'])
ax[1].grid()
Where to go from here?¶
More complex models with the two-feature binary classification (Notebook) or the K Nearest Neighbors classifier (Notebook)
Compare with the single-feature linear regression using simple algorithms (Notebook), or using TensorFlow (Notebook)