
Linear regression using TensorFlow 2.0

Reuse the same univariate polynomial model as in the univariate linear regression workbook (Jupyter Notebook), but with a TensorFlow implementation.

Since TensorFlow 2.0, the recommended API is clearly the Keras one. Building and fitting a model usually takes only a few lines of code, which hides many of the details of how the fit is actually performed.

This tutorial is an intermediate step between the handmade gradient descent of the above-mentioned tutorial and the fully wrapped Keras model of Bivariate linear regression with Keras (Notebook).

The model is mostly hand designed, even though it does the same job as the Keras Dense layer; only the optimizers are the powerful ones provided by Keras and TensorFlow.
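
For contrast, here is a minimal sketch of the fully wrapped Keras version that this tutorial deliberately avoids (the layer and the training loop are built by hand below); x and y stand for any training arrays:

In [ ]:
import tensorflow as tf

# Fully wrapped equivalent (sketch): a single Dense unit computes y = w*x + b
wrapped = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=[1])])
wrapped.compile(optimizer='sgd', loss='mse')
# wrapped.fit(x, y, epochs=10)   # batching, gradients and updates are all hidden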

Learning goals:

  • Design a model in TensorFlow 2.0
  • Use TensorFlow to perform gradient descent
  • Compare several optimizers

In [ ]:
import tensorflow as tf # TF 2.0 required
import numpy as np
import matplotlib.pyplot as plt
import pandas
from sklearn import metrics 

Data model

In [ ]:
# Univariate
numFeatures = 1

def generateBatch(N, stochastic = False):
    # Input range
    xMin = 0
    xMax = 0.5
    # Intercept and noise standard deviation
    b = 0.35
    std = 0.01
    # Stochastic: sample x uniformly at random; otherwise use a regular grid
    if stochastic:
        x = np.random.uniform(xMin, xMax, N)
    else:
        x = np.linspace(xMin, xMax, N)
    yClean = x**4 + (x-0.3)**3 + b
    y =  yClean + np.random.normal(0, std, N) 
    return (x, y, yClean)
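
To visualize the data model, this illustrative cell (not part of the original fit) plots one stochastic mini-batch against the clean curve $y = x^4 + (x - 0.3)^3 + b$:

In [ ]:
# Illustrative plot of the data model
xGrid, _, yGridClean = generateBatch(200)          # deterministic grid
xS, yS, _ = generateBatch(256, stochastic=True)    # random mini-batch
plt.scatter(xS, yS, s=4, alpha=0.5, label='noisy samples')
plt.plot(xGrid, yGridClean, 'r', label='clean curve')
plt.legend(); plt.grid();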

Reference values for linear regression

From LinearRegressionUnivariate.ipynb

In [ ]:
wRef, bRef = 0.145, 0.323

Test data

In [ ]:
Ntest = 100000
(xTest, yTest1, yTestClean1) = generateBatch(Ntest)

Shared parameters

In [ ]:
# (Mini) Batch size
nBatch = 128
# Number of batches per Epoch
nBatchPerEpoch = 10
# Stop threshold on the Epoch MSE
threshold = 1e-4
# Safeguard: stop on the number of epochs
nEpochMax = 2000

Helpers

In [ ]:
# Plot helper showing a learned value against its target reference
def plotToRef(label, nRows, nCols, index, ref, values):
    nIter = len(values)
    r = range(nIter)
    plt.subplot(nRows, nCols, index)
    plt.title(label)
    plt.plot(r, values, r, np.ones((nIter))*ref, alpha=0.5)
    plt.grid();

Training model

Since TensorFlow 2.0, symbolic variables are no longer explicit; they look like ordinary Python variables.

The model is generated at "compile" time, rewriting and checking the code so that it acts as a symbolic model.
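
A tiny illustration of this tracing behavior: the Python body runs while TensorFlow traces the function; subsequent calls with the same signature reuse the compiled graph.

In [ ]:
# Tracing demo (illustrative): the print executes only during tracing
@tf.function
def double(x):
    print('tracing')
    return x * 2.0

double(tf.constant(1.0))   # prints 'tracing', then runs the graph
double(tf.constant(3.0))   # same signature: graph reused, nothing printed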

In [ ]:
# Simple custom layer exposing the linear regression model
class MyDenseLayer(tf.keras.layers.Layer):
    def __init__(self, *args, **kwargs):
        super(MyDenseLayer, self).__init__(*args, **kwargs)
    
    def build(self, input_shape):
        self.w = self.add_weight(
            shape=(1,),
            dtype=self.dtype,
            initializer=tf.keras.initializers.ones(),
            #regularizer=tf.keras.regularizers.l2(0.02),
            trainable=True)
        self.b = self.add_weight(
            shape=(1,),
            dtype=self.dtype,
            initializer=tf.keras.initializers.ones(),
            #regularizer=tf.keras.regularizers.l2(0.02),
            trainable=True)

    @tf.function
    def call(self, x, training=None):
        return x * self.w + self.b
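
As a quick sanity check (illustrative), calling the freshly built layer with its all-ones initializer should return x * 1 + 1:

In [ ]:
# Sanity check of the custom layer
layer = MyDenseLayer(dtype="float64")
print(layer(np.array([0.0, 1.0, 2.0])))   # expected: [1. 2. 3.]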

Gradient descent optimizer

In [ ]:
# Model 1, instantiate the custom layer
model1 = tf.keras.Sequential([MyDenseLayer(input_shape=[numFeatures], dtype="float64")])

# Stochastic Gradient Descent Optimizer
optim1 = tf.keras.optimizers.SGD(0.01)

# Perform a train step on a mini-batch
#  TensorFlow traces this function and compiles it into a graph on the first call (AutoGraph)
@tf.function
def trainStep1(x, y):
    with tf.GradientTape() as tape:
        predictions = model1(x, training=True)
        loss = tf.keras.losses.mean_squared_error(y, predictions)
    # Compute gradients outside the tape context, then apply the update
    gradients = tape.gradient(loss, model1.trainable_variables)
    optim1.apply_gradients(zip(gradients, model1.trainable_variables))
    return loss
    
# Initialize values and loop over epochs and mini-batches
epoch = 0
mse_epoch = 1
hist = []
while mse_epoch > threshold and epoch < nEpochMax:
    mse_cumul = 0
    for _ in range(0, nBatchPerEpoch):
        (xTrain, yTrain, yTrainClean) = generateBatch(nBatch, True)
        mse_cumul += float(trainStep1(xTrain, yTrain))  # Python float for logging

    W = model1.get_weights()
    mse_epoch = mse_cumul / nBatchPerEpoch
    hist.append((W[1][0], W[0][0], mse_epoch))
    epoch += 1

w, b = W[0][0], W[1][0]
print("Predicted model: {a:.3f} x + {b:.3f}, num epochs={c}".format(a=w, b=b, c=len(hist)))
df1 = pandas.DataFrame(hist, columns = ('b', 'w', 'MSE'))
In [ ]:
fig = plt.figure(figsize=(16,10))
plotToRef('b', 2, 2, 1, bRef, df1['b'])
plotToRef('w', 2, 2, 2, wRef, df1['w'])
plt.subplot(2,2,3)
plt.semilogy(df1['MSE'])
plt.grid()
plt.title('Loss (MSE)');

Test model

In [ ]:
yEst1 = w * xTest + b
In [ ]:
plt.plot(xTest, yTestClean1, xTest, yEst1);
plt.legend(('Test data (clean)', 'SGD'), loc='upper left')
mse1 = metrics.mean_squared_error(yTest1, yEst1)
print('Gradient Optimizer MSE = {:.3e}'.format(mse1));

Momentum optimizer

In [ ]:
# Model 2, instantiate the custom layer
model2 = tf.keras.Sequential([MyDenseLayer(input_shape=[numFeatures], dtype="float64")])

# Stochastic Gradient Descent Optimizer with momentum
optim2 = tf.keras.optimizers.SGD(0.01, momentum=0.0001) # <---

# Perform a train step on a mini-batch
#  TensorFlow traces this function and compiles it into a graph on the first call (AutoGraph)
@tf.function
def trainStep2(x, y):
    with tf.GradientTape() as tape:
        predictions = model2(x, training=True)
        loss = tf.keras.losses.mean_squared_error(y, predictions)
    # Compute gradients outside the tape context, then apply the update
    gradients = tape.gradient(loss, model2.trainable_variables)
    optim2.apply_gradients(zip(gradients, model2.trainable_variables))
    return loss
    
# Initialize values and loop over epochs and mini-batches
epoch = 0
mse_epoch = 1
wLearn = []
while mse_epoch > threshold and epoch < nEpochMax:
    mse_cumul = 0
    for _ in range(0, nBatchPerEpoch):
        (xTrain, yTrain, yTrainClean) = generateBatch(nBatch, True)
        mse_cumul += float(trainStep2(xTrain, yTrain)) # <---

    W = model2.get_weights()
    mse_epoch = mse_cumul / nBatchPerEpoch
    wLearn.append((W[1][0], W[0][0], mse_epoch))
    epoch += 1

w, b = W[0][0], W[1][0]
print("Predicted model: {a:.3f} x + {b:.3f}, num epochs={c}".format(a=w, b=b, c=len(wLearn)))
df2 = pandas.DataFrame(wLearn, columns = ('b', 'w', 'MSE'))
In [ ]:
fig = plt.figure(figsize=(16,10))
plotToRef('b', 2, 2, 1, bRef, df2['b'])
plotToRef('w', 2, 2, 2, wRef, df2['w'])
plt.subplot(2,2,3)
plt.semilogy(df2['MSE'])
plt.grid()
plt.title('Loss (MSE)');
In [ ]:
yEst2 = w * xTest + b
In [ ]:
plt.plot(xTest, yTestClean1, xTest, yEst1, xTest, yEst2);
plt.legend(('Test data (clean)', 'SGD', 'Momentum'), loc='upper left')
mse2 = metrics.mean_squared_error(yTest1, yEst2)
print('Momentum Optimizer MSE = {:.3e}'.format(mse2));
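
For reference, the momentum variant (as implemented by the Keras SGD optimizer) keeps a velocity that accumulates past gradients, damping oscillations:

$$v_t = \mu\,v_{t-1} - \alpha\,g_t, \qquad \theta_t = \theta_{t-1} + v_t$$

With $\mu = 0.0001$ as above, the behavior stays very close to plain SGD.

Adam optimizer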
In [ ]:
# Model 3, instantiate the custom layer
model3 = tf.keras.Sequential([MyDenseLayer(input_shape=[numFeatures], dtype="float64")])

# Adam Optimizer
optim3 = tf.keras.optimizers.Adam(0.01) # <---

# Perform a train step on a mini-batch
#  TensorFlow traces this function and compiles it into a graph on the first call (AutoGraph)
@tf.function
def trainStep3(x, y):
    with tf.GradientTape() as tape:
        predictions = model3(x, training=True)
        loss = tf.keras.losses.mean_squared_error(y, predictions)
    # Compute gradients outside the tape context, then apply the update
    gradients = tape.gradient(loss, model3.trainable_variables)
    optim3.apply_gradients(zip(gradients, model3.trainable_variables))
    return loss
    
# Initialize values and loop over epochs and mini-batches
epoch = 0
mse_epoch = 1
wLearn = []
while mse_epoch > threshold and epoch < nEpochMax:
    mse_cumul = 0
    for _ in range(0, nBatchPerEpoch):
        (xTrain, yTrain, yTrainClean) = generateBatch(nBatch, True)
        mse_cumul += float(trainStep3(xTrain, yTrain))

    W = model3.get_weights()
    mse_epoch = mse_cumul / nBatchPerEpoch
    wLearn.append((W[1][0], W[0][0], mse_epoch))
    epoch += 1

w, b = W[0][0], W[1][0]
print("Predicted model: {a:.3f} x + {b:.3f}, num epochs={c}".format(a=w, b=b, c=len(wLearn)))
df3 = pandas.DataFrame(wLearn, columns = ('b', 'w', 'MSE'))
In [ ]:
fig = plt.figure(figsize=(16,10))
plotToRef('$b$', 2, 2, 1, bRef, df3['b'])
plotToRef('$w$', 2, 2, 2, wRef, df3['w'])
plt.subplot(2,2,3)
plt.semilogy(df3['MSE'])
plt.grid()
plt.title('Loss (MSE)');

There is a clear gain compared to standard and momentum gradient descent (see Adam's update rule below):

  • Fewer iterations
  • Less undershoot on $b$
  • Clear convergence of the MSE down to the noise floor
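
For reference, Adam's update rule (Kingma & Ba, 2015) explains this adaptation: each parameter $\theta$ keeps running estimates of the gradient mean and variance, which scale the effective step size:

$$m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t, \qquad v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2$$

$$\hat{m}_t = \frac{m_t}{1-\beta_1^t}, \qquad \hat{v}_t = \frac{v_t}{1-\beta_2^t}, \qquad \theta_t = \theta_{t-1} - \alpha\,\frac{\hat{m}_t}{\sqrt{\hat{v}_t}+\epsilon}$$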
In [ ]:
yEst3 = w * xTest + b
In [ ]:
plt.plot(xTest, yTestClean1, xTest, yEst1, xTest, yEst3);
plt.legend(('Test data (clean)', 'SGD', 'Adam'), loc='upper left')
mse3 = metrics.mean_squared_error(yTest1, yEst3)
print('Adam Optimizer MSE = {:.3e}'.format(mse3));

Where to go from here?

Other single-feature linear regression implementations using the closed form, SciPy, scikit-learn, or "raw" Python (Notebook)

More complex bivariate models using "raw" Python (Notebook), up to gradient descent with a regularizer, or using Keras (Notebook)

Compare with single-feature binary classification using logistic regression, in "raw" Python or with libraries (Notebook)