Linear classification, multi-class, based on a 2D (2 features) problem¶
Following the binary (2-class) classification (HTML / Jupyter), let's generalize to more classes.
Two model-generated datasets are used, based on the Czech Republic and Norway flags. Both have three colors corresponding to the three classes we want to separate. The Czech flag is linearly separable; the Norwegian one is not.
Learning goals:
- Multi-class classification with softmax activation
- Multilayer neural network to handle non-linear functions
- Use regularization to improve score and convergence
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as pltcolors
from sklearn import metrics as skMetrics
import pandas
import seaborn as sns
usingTensorBoard = False
Helpers¶
# Helper to plot the flag as a 2D parametric label
def flagPlot(x0, x1, y, title, colors):
ax = plt.gca()
ax.set_facecolor((0.8, 0.8, 0.8))
plt.scatter(x0, x1, c=y, cmap=pltcolors.ListedColormap(colors), marker='x', alpha=0.5);
plt.xlabel('x0')
plt.ylabel('x1')
plt.grid()
plt.title(title)
cb = plt.colorbar()
loc = np.arange(0,2.1,1)
cb.set_ticks(loc)
cb.set_ticklabels([0,1,2]);
def plotHeatMap(X, classes, title=None, fmt='.2g', ax=None, xlabel=None, ylabel=None):
""" Fix heatmap plot from Seaborn with pyplot 3.1.0, 3.1.1
https://stackoverflow.com/questions/56942670/matplotlib-seaborn-first-and-last-row-cut-in-half-of-heatmap-plot
"""
ax = sns.heatmap(X, xticklabels=classes, yticklabels=classes, annot=True, fmt=fmt, cmap=plt.cm.Blues, ax=ax) #notation: "annot" not "annote"
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
if title:
ax.set_title(title)
if xlabel:
ax.set_xlabel(xlabel)
if ylabel:
ax.set_ylabel(ylabel)
def plotConfusionMatrix(yTrue, yEst, classes, title=None, fmt='.2g', ax=None):
plotHeatMap(skMetrics.confusion_matrix(yTrue, yEst), classes, title, fmt, ax, xlabel='Estimations', ylabel='True values');
# Czech flag colors
czechColors = np.array(['blue', 'red', 'white'])
nFeatures = 2
nClasses = 3
# Generate a multi-class sample based on the Czech flag geometry. The tip of the triangle is at the origin
def generateCzechBatch(n, noiseSigma=0.1):
# xMax adjusted such that the 3 classes are quasi equiprobable
xMin = np.array([-1, -1])
xMax = np.array([0.5, 1])
#
X = np.random.uniform(xMin, xMax, (n, nFeatures))
noisyX = X + np.random.normal(0, noiseSigma, X.shape)
y = np.zeros(n)
y[noisyX[:,1] > 0] = 2 # White
y[noisyX[:,1] <= 0] = 1 # Red
y[(noisyX[:,0] <= 0) & (np.abs(noisyX[:,1]) < np.abs(noisyX[:,0]))] = 0 # Blue triangle
return X, y
N = 1000
xTrain, yTrain = generateCzechBatch(N)
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
flagPlot(xTrain[:,0], xTrain[:,1], yTrain, 'Generated', czechColors)
plt.subplot(1,2,2)
plt.hist(yTrain, 10, density=True, alpha=0.5)
plt.title('Generated histogram');
Test data¶
xTest, yTest = generateCzechBatch(N)
Softmax regression¶
Maximum a posteriori estimation with softmax is the generalization of logistic regression to the case where the number of classes is greater than 2.
The softmax normalizes the set of output values so that they sum to 1; the outputs can then be interpreted as probabilities.
With K the number of classes: \begin{align} y \in \left\{ 1 ... K \right\} \end{align}
The softmax probability law of Y is: \begin{align} p (Y=c \mid x, w) & = \frac{e^{w_{c} x }}{\sum_{j=1}^{K}e^{w_{j} x}} \end{align}
We are looking for the value of w that maximizes the likelihood: \begin{align} \hat{w} & = \arg \max_{w}{\prod_{i=1}^N{p(y_i \mid x_i, w)}} \\ & = \arg \max_{w}{\sum_{i=1}^N{\log \bigl(p(y_i \mid x_i, w)\bigr)} } \\ & = \arg \max_{w}{\sum_{c=1}^K\sum_{i=1}^N{\mathbb{1}_{y_i=c} \log \biggl(\frac{e^{w_{c} x_i }}{\sum_{j=1}^{K}e^{w_{j} x_i}} \biggr)} } \\ \end{align}
Where $\mathbb{1}_{y_i=c}$ is the indicator function which takes value 1 if $y_i=c$ and 0 otherwise.
When using an iterative gradient descent, the cost function is then defined as: \begin{align} J_w = - \sum_{c=1}^K\sum_{i=1}^N{\mathbb{1}_{y_i=c} \log \biggl(\frac{e^{w_{c} x_i }}{\sum_{j=1}^{K}e^{w_{j} x_i}}\biggr)} \end{align}
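As an illustrative sketch (not part of the original notebook), the softmax probabilities and the cost $J_w$ can be written directly in NumPy. The function and variable names below are arbitrary; nFeatures, nClasses, xTrain and yTrain are the values defined elsewhere in this notebook.
import numpy as np

def softmax(logits):
    # Subtract the row max for numerical stability, then normalize each row to sum to 1
    z = logits - logits.max(axis=1, keepdims=True)
    expZ = np.exp(z)
    return expZ / expZ.sum(axis=1, keepdims=True)

def crossEntropyCost(W, b, X, y):
    # W: (nFeatures, nClasses), b: (nClasses,), X: (n, nFeatures), y: integer labels
    probas = softmax(X @ W + b)                        # p(Y=c | x_i, w) for every sample and class
    logLik = np.log(probas[np.arange(len(y)), y.astype(int)])
    return -logLik.sum()                               # J_w = - sum_i log p(y_i | x_i, w)

# Example: cost of random parameters on the training data generated above
rng = np.random.default_rng(0)
print(crossEntropyCost(rng.normal(size=(nFeatures, nClasses)), np.zeros(nClasses), xTrain, yTrain))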
The simplest softmax regression model is based on a single layer with as many neurons as the number of classes (3). The activation of the layer is a softmax, and the reference values (training labels y) are integers in the range [0, 3).
# Number of epochs
nEpoch = 1024
nBatch = 128 # 32 is default
# Model
model0 = keras.models.Sequential([
keras.layers.Dense(nClasses, activation=keras.activations.softmax,
kernel_regularizer=keras.regularizers.l1(0.001),
input_shape=[nFeatures])
])
model0.compile(optimizer='adam',
loss=keras.losses.sparse_categorical_crossentropy,
metrics=['accuracy'])
# Tensor board display
callbacks = []
if usingTensorBoard:
ks = keras.callbacks.TensorBoard(log_dir="./logs/", histogram_freq=1, write_graph=True, write_grads=True, batch_size=1)
callbacks = [ks]
# Fit
hist0 = model0.fit(xTrain, yTrain, epochs=nEpoch, batch_size=nBatch, verbose=0, callbacks=callbacks)
weights0, biases0 = model0.get_weights()
print('Est W=', weights0)
print('b=', biases0)
# Parametric plot of the linear model
t = np.linspace(-1, 1)
markers = ('.', 'v', '4')
for i in range(0,3):
plt.scatter(weights0[0, i] * t + biases0[i], weights0[1, i]*t, marker=markers[i], c=t, cmap="RdBu_r")
plt.legend(['Blue (0)', 'Red (1)', 'White (2)']);
plt.title('Parametric plot of the linear model, red = high, blue = low');
plt.figure(figsize=(15,4))
plt.subplot(1,3,1)
plt.plot(hist0.history['loss'])
plt.grid()
plt.title('Loss')
plt.subplot(1,3,2)
plt.plot(hist0.history['accuracy'])
plt.grid()
plt.title('Accuracy');
Testing the model¶
yEst0 = model0.predict(xTest)
labelEst0 = np.argmax(yEst0, axis=1)
plt.figure(figsize=(16,4))
plt.subplot(1,3,1)
flagPlot(xTest[:,0], xTest[:,1], labelEst0, 'Estimated', czechColors)
plt.subplot(1,3,2)
plt.hist(labelEst0, 10, density=True, alpha=0.5)
plt.title('Estimated histo')
plt.subplot(1,3,3)
flagPlot(xTest[:,0], xTest[:,1], yTest, 'Generated', czechColors)
plotConfusionMatrix(yTest, labelEst0, czechColors);
print(skMetrics.classification_report(yTest, labelEst0))
Conclusions on initial model¶
The fit is excellent; the separation of the classes is very accurate.
This verifies that the Czech flag is linearly separable.
Norway flag regression¶
Norway's flag is more challenging than the Czech one, as it contains a double cross (navy over white) on a red background.
Evidently it is not linearly separable. Actually, this class of problems, the simplest being the XOR truth table, is one of the motivations for neural networks.
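As a side illustration (an added sketch, not in the original notebook), the XOR case shows the same limitation in miniature: a single sigmoid layer cannot reach 100% accuracy on the XOR truth table, while one hidden ReLU layer can. The layer width and epoch count below are arbitrary, and convergence to 100% is not guaranteed on every run.
# XOR truth table: the classic non linearly separable toy problem
xXor = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
yXor = np.array([0., 1., 1., 0.])

# Single layer: linear decision boundary, cannot fit XOR
xorLinear = keras.models.Sequential([
    keras.layers.Dense(1, activation='sigmoid', input_shape=[2])
])
xorLinear.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
xorLinear.fit(xXor, yXor, epochs=1000, verbose=0)

# One hidden ReLU layer: can represent XOR
xorMlp = keras.models.Sequential([
    keras.layers.Dense(4, activation='relu', input_shape=[2]),
    keras.layers.Dense(1, activation='sigmoid')
])
xorMlp.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
xorMlp.fit(xXor, yXor, epochs=1000, verbose=0)

print('Single layer accuracy:', xorLinear.evaluate(xXor, yXor, verbose=0)[1])
print('Two layer accuracy   :', xorMlp.evaluate(xXor, yXor, verbose=0)[1])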
Data model for the Norway flag¶
The geometry is adjusted so that the three categories are roughly equiprobable.
# Norway flag colors
norwayColors = ['red', 'white', 'navy']
# Generate a multi-class sample based on the Norway flag geometry.
def generateNorwayBatch(n, noiseSigma=0.1):
# xMax adjusted such that the 3 classes are quasi equiprobable
xMin = np.array([-1, -1])
xMax = np.array([1, 1])
#
X = np.random.uniform(xMin, xMax, (n, nFeatures))
a1 = 0.43
a2 = 0.18
noisyX = X + np.random.normal(0, noiseSigma, X.shape)
y = np.zeros(n) # Red = background
y[((noisyX[:,0] > -a1) & (noisyX[:,0] < a1)) | ((noisyX[:,1] > -a1) & (noisyX[:,1] < a1))] = 1 # White cross
y[((noisyX[:,0] > -a2) & (noisyX[:,0] < a2)) | ((noisyX[:,1] > -a2) & (noisyX[:,1] < a2))] = 2 # Navy cross over white
return X, y
N = 1000
xTrain1, yTrain1 = generateNorwayBatch(N)
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
flagPlot(xTrain1[:,0], xTrain1[:,1], yTrain1, 'Generated', norwayColors)
plt.subplot(1,2,2)
plt.hist(yTrain1, 10, density=True, alpha=0.5)
plt.title('Generated histogram');
Test data¶
xTest1, yTest1 = generateNorwayBatch(N)
Single layer model¶
nEpoch = 128
nBatch = 128
# Model
model1 = keras.models.Sequential([
keras.layers.Dense(nClasses, activation=keras.activations.softmax, input_shape=[nFeatures],
kernel_regularizer=keras.regularizers.l1(0.001))
])
model1.compile(optimizer='adam',
loss=keras.losses.sparse_categorical_crossentropy,
metrics=['accuracy'])
# Fit single layer model on Norway data
hist1 = model1.fit(xTrain1, yTrain1, epochs=nEpoch, batch_size=nBatch, verbose=0, callbacks=callbacks)
weights1, biases1 = model1.get_weights()
print('Est W=', weights1)
print('b=', biases1)
plt.figure(figsize=(15,4))
plt.subplot(1,3,1)
plt.plot(hist1.history['loss'])
plt.grid()
plt.title('Loss')
plt.subplot(1,3,2)
plt.plot(hist1.history['accuracy'])
plt.grid()
plt.title('Accuracy');
Testing the model¶
yEst1 = model1.predict(xTest1)
labelEst1 = np.argmax(yEst1, axis=1)
plt.figure(figsize=(16,4))
plt.subplot(1,3,1)
flagPlot(xTest1[:,0], xTest1[:,1], labelEst1, 'Estimated', norwayColors)
plt.subplot(1,3,2)
plt.hist(labelEst1, 10, density=True, alpha=0.5)
plt.title('Estimated histo')
plt.subplot(1,3,3)
flagPlot(xTest1[:,0], xTest1[:,1], yTest1, 'Generated', norwayColors)
As expected, the linear model is not able to separate the classes.
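To quantify this (a small addition, reusing the helpers defined above), the confusion matrix and classification report for the single-layer predictions can be computed exactly as for the other models:
plotConfusionMatrix(yTest1, labelEst1, norwayColors);
print(skMetrics.classification_report(yTest1, labelEst1))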
Two layer model on Norway¶
nEpoch = 512
nBatch = 128
# Model
model2 = keras.models.Sequential([
keras.layers.Dense(16, activation=keras.activations.relu, input_shape=[nFeatures],
kernel_regularizer=keras.regularizers.l1(0.001),
bias_regularizer=keras.regularizers.l1(0.001)),
keras.layers.Dense(nClasses, activation=keras.activations.softmax,
kernel_regularizer=keras.regularizers.l1(0.0001))
])
model2.compile(optimizer='adam',
loss=keras.losses.sparse_categorical_crossentropy,
metrics=['accuracy'])
# Fit two layer model on Norway data
hist2 = model2.fit(xTrain1, yTrain1, epochs=nEpoch, batch_size=nBatch, verbose=0, callbacks=callbacks)
plt.figure(figsize=(15,4))
plt.subplot(1,3,1)
plt.plot(hist2.history['loss'])
plt.grid()
plt.title('Loss')
plt.subplot(1,3,2)
plt.plot(hist2.history['accuracy'])
plt.grid()
plt.title('Accuracy');
Testing the model¶
yEst2 = model2.predict(xTest1)
labelEst2 = np.argmax(yEst2, axis=1)
plt.figure(figsize=(16,4))
plt.subplot(1,3,1)
flagPlot(xTest1[:,0], xTest1[:,1], labelEst2, 'Estimated', norwayColors)
plt.subplot(1,3,2)
plt.hist(labelEst2, 10, density=True, alpha=0.5)
plt.title('Estimated histo')
plt.subplot(1,3,3)
flagPlot(xTest1[:,0], xTest1[:,1], yTest1, 'Generated', norwayColors)
plotConfusionMatrix(yTest1, labelEst2, norwayColors);
The two-layer neural network improves on the single-layer (linear) model.
Accuracy is greatly improved, ranging from ~60% to 95% depending on the fit's convergence.
print(skMetrics.classification_report(yTest1, labelEst2))
Three layer neural network on Norway¶
nEpoch = 512
nBatch = 128
# Model
model3 = keras.models.Sequential([
keras.layers.Dense(8, activation=keras.activations.linear, input_shape=[nFeatures],
kernel_regularizer=keras.regularizers.l1(0.001),
bias_regularizer=keras.regularizers.l1(0.001)),
keras.layers.Activation(keras.activations.relu),
keras.layers.Dense(8, activation=keras.activations.linear,
kernel_regularizer=keras.regularizers.l1(0.001),
bias_regularizer=keras.regularizers.l1(0.001)),
keras.layers.Activation(keras.activations.relu),
keras.layers.Dense(nClasses, activation=keras.activations.softmax)
])
model3.compile(optimizer='adam',
loss=keras.losses.sparse_categorical_crossentropy,
metrics=['accuracy'])
# Fit on Norway data
hist3 = model3.fit(xTrain1, yTrain1, epochs=nEpoch, batch_size=nBatch, verbose=0, callbacks=callbacks)
plt.figure(figsize=(15,4))
plt.subplot(1,3,1)
plt.plot(hist3.history['loss'])
plt.grid()
plt.title('Loss')
plt.subplot(1,3,2)
plt.plot(hist3.history['accuracy'])
plt.grid()
plt.title('Accuracy');
Testing the model¶
yEst3 = model3.predict(xTest1)
labelEst3 = np.argmax(yEst3, axis=1)
plt.figure(figsize=(16,4))
plt.subplot(1,3,1)
flagPlot(xTest1[:,0], xTest1[:,1], labelEst3, 'Estimated', norwayColors)
plt.subplot(1,3,2)
plt.hist(labelEst3, 10, density=True, alpha=0.5)
plt.title('Estimated histo')
plt.subplot(1,3,3)
flagPlot(xTest1[:,0], xTest1[:,1], yTest1, 'Generated', norwayColors)
plotConfusionMatrix(yTest1, labelEst3, norwayColors);
print(skMetrics.classification_report(yTest1, labelEst3))
The three-layer neural network further improves the accuracy and F1-score.
The (deeper) neural net shows a large added value... at the expense of a much more complex model:
weights3 = model3.get_weights()
model3.summary()
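For a rough measure of that extra complexity (an added snippet, assuming model0, model2 and model3 are still in memory), count_params() gives the number of parameters of each model:
# Compare the number of parameters of the fitted models
for name, m in [('model0 (single layer)', model0), ('model2 (two layers)', model2), ('model3 (three layers)', model3)]:
    print('{:22s}: {:4d} parameters'.format(name, m.count_params()))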
t = np.linspace(-1, 1)
width3_1 = len(weights3[1])
for i in range(0,width3_1):
plt.scatter(weights3[0][0, i] * t + weights3[1][i], weights3[0][1, i] * t, marker='.', c=t, cmap="RdBu_r")
plt.title('First stage weights parametric view ({:d} components)'.format(width3_1));
The direction graph above shows that the two main directions (along the axes) are identified by the learnt features of the neural network, using pairs of neurons (one for each direction). Some other neurons are used within the top-left quadrant.
weights3[0]
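As a complementary check (an added sketch, not from the original notebook), the direction and magnitude of each first-stage neuron can be computed directly from the weight matrix; axis-aligned neurons show up as angles close to 0°, ±90° or 180°, while the L1 regularization pushes unused neurons toward a small norm.
W1 = weights3[0]                                   # first dense layer weights, shape (nFeatures, 8)
angles = np.degrees(np.arctan2(W1[1], W1[0]))      # direction of each neuron's weight vector
norms = np.linalg.norm(W1, axis=0)                 # magnitude of each weight vector
for i, (a, r) in enumerate(zip(angles, norms)):
    print('neuron {:d}: angle = {:6.1f} deg, norm = {:.3f}'.format(i, a, r))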