LR Finder
Using LR Finder to find an optimal learning rate for training the model, and using Grad-CAM to visualize heatmaps for misclassified images
We will use the CIFAR-10 dataset for this exercise and build the model using Keras.
from keras import backend as K
import time
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
np.random.seed(2017)
from keras import regularizers
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D, MaxPooling2D,AveragePooling2D
from keras.layers import Activation, Flatten, Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.preprocessing.image import ImageDataGenerator
from keras.datasets import cifar10
(train_features, train_labels), (test_features, test_labels) = cifar10.load_data()
num_train, img_rows, img_cols,img_channels = train_features.shape
num_test, _, _, _ = test_features.shape
num_classes = len(np.unique(train_labels))
print (num_classes)
print (num_train)
print (train_features.shape)
class_names = ['airplane','automobile','bird','cat','deer',
'dog','frog','horse','ship','truck']
fig = plt.figure(figsize=(8,3))
for i in range(num_classes):
    ax = fig.add_subplot(2, 5, 1 + i, xticks=[], yticks=[])
    idx = np.where(train_labels[:]==i)[0]
    features_idx = train_features[idx,::]
    img_num = np.random.randint(features_idx.shape[0])
    im = features_idx[img_num]
    ax.set_title(class_names[i])
    plt.imshow(im)
plt.show()
def plot_model_history(model_history):
    fig, axs = plt.subplots(1,2,figsize=(15,5))
    # summarize history for accuracy
    axs[0].plot(range(1,len(model_history.history['acc'])+1),model_history.history['acc'])
    axs[0].plot(range(1,len(model_history.history['val_acc'])+1),model_history.history['val_acc'])
    axs[0].set_title('Model Accuracy')
    axs[0].set_ylabel('Accuracy')
    axs[0].set_xlabel('Epoch')
    axs[0].set_xticks(np.arange(1,len(model_history.history['acc'])+1,len(model_history.history['acc'])/10))
    axs[0].legend(['train', 'val'], loc='best')
    # summarize history for loss
    axs[1].plot(range(1,len(model_history.history['loss'])+1),model_history.history['loss'])
    axs[1].plot(range(1,len(model_history.history['val_loss'])+1),model_history.history['val_loss'])
    axs[1].set_title('Model Loss')
    axs[1].set_ylabel('Loss')
    axs[1].set_xlabel('Epoch')
    axs[1].set_xticks(np.arange(1,len(model_history.history['loss'])+1,len(model_history.history['loss'])/10))
    axs[1].legend(['train', 'val'], loc='best')
    plt.show()
def accuracy(test_x, test_y, model):
    result = model.predict(test_x)
    predicted_class = np.argmax(result, axis=1)
    true_class = np.argmax(test_y, axis=1)
    num_correct = np.sum(predicted_class == true_class)
    accuracy = float(num_correct)/result.shape[0]
    return (accuracy * 100)

def get_max_train_accuracy(model_info):
    train_acc = model_info.history['acc']
    max_train_acc = max(train_acc)
    return (max_train_acc * 100)

def get_max_val_accuracy(model_info):
    val_acc = model_info.history['val_acc']
    max_val_acc = max(val_acc)
    return (max_val_acc * 100)
train_features = train_features.astype('float32')/255
test_features = test_features.astype('float32')/255
# convert class labels to binary class labels
train_labels = np_utils.to_categorical(train_labels, num_classes)
test_labels = np_utils.to_categorical(test_labels, num_classes)
train_labels
(trainX, trainy), (testX, testy) = cifar10.load_data()
print('Statistics train=%.3f (%.3f), test=%.3f (%.3f)' % (trainX.mean(), trainX.std(), testX.mean(), testX.std()))
# create generator that centers pixel values
datagen = ImageDataGenerator(featurewise_center=True, featurewise_std_normalization=True)
# calculate the mean on the training dataset
datagen.fit(trainX)
#print('Data Generator mean=%.3f, std=%.3f' % (datagen.mean, datagen.std))
# demonstrate effect on a single batch of samples
iterator = datagen.flow(trainX, trainy, batch_size=128)
# get a batch
batchX, batchy = iterator.next()
# pixel stats in the batch
print(batchX.shape, batchX.mean(), batchX.std())
# demonstrate effect on entire training dataset
iterator = datagen.flow(trainX, trainy, batch_size=len(trainX), shuffle=False)
# get a batch
batchX, batchy = iterator.next()
# pixel stats in the batch
print(batchX.shape, batchX.mean(), batchX.std())
iterator1 = datagen.flow(testX, testy, batch_size=len(testX), shuffle=False)
batch_testX, batch_testy = iterator1.next()
X_train = batchX
X_test = batch_testX
y_train=batchy
y_test=batch_testy
Y_train = np_utils.to_categorical(y_train, 10)
Y_test = np_utils.to_categorical(y_test, 10)
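As a quick hedged check (not part of the original notebook), featurewise_center and featurewise_std_normalization are equivalent to subtracting the per-channel mean and dividing by the per-channel std that datagen.fit computed on the training set, so the normalized arrays should have a mean close to 0 and a std close to 1:
# hedged check: manual featurewise normalization should match the generator output
manual = (trainX.astype('float32') - datagen.mean) / datagen.std
print(np.allclose(manual, X_train, atol=1e-3))   # expected: True (up to float tolerance)
print(X_train.mean(), X_train.std())             # expected: roughly 0 and 1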
model1 = Sequential()
model1.add(Convolution2D(32, 3, 3, border_mode='same',kernel_regularizer=regularizers.l2(0.0001), input_shape=(32, 32, 3)))
model1.add(Activation('relu'))
model1.add(BatchNormalization())
model1.add(Convolution2D(64, 3, 3,kernel_regularizer=regularizers.l2(0.0001),border_mode='same'))
model1.add(Activation('relu'))
model1.add(BatchNormalization())
model1.add(MaxPooling2D(pool_size=(2, 2)))
model1.add(Dropout(0.2))
model1.add(Convolution2D(32, 1, 1))
model1.add(Convolution2D(64, 3, 3,kernel_regularizer=regularizers.l2(0.0001),border_mode='same'))
model1.add(Activation('relu'))
model1.add(BatchNormalization())
model1.add(Convolution2D(128, 3, 3,kernel_regularizer=regularizers.l2(0.0001),border_mode='same'))
model1.add(Activation('relu'))
model1.add(BatchNormalization())
model1.add(MaxPooling2D(pool_size=(2, 2)))
model1.add(Dropout(0.3))
model1.add(Convolution2D(32, 1, 1))
model1.add(Convolution2D(128, 3, 3,kernel_regularizer=regularizers.l2(0.0001), border_mode='same'))
model1.add(Activation('relu'))
model1.add(BatchNormalization())
model1.add(Convolution2D(256, 3, 3,kernel_regularizer=regularizers.l2(0.0001), border_mode='same', name='LC1'))
model1.add(Activation('relu',name='R1'))
model1.add(BatchNormalization(name='BN1'))
model1.add(MaxPooling2D(pool_size=(2, 2)))
model1.add(Dropout(0.5))
model1.add(Convolution2D(10, 1, 1, name="red1"))
model1.add(AveragePooling2D(pool_size = (4,4)))
model1.add(Flatten())
model1.add(Activation('softmax'))
model1.summary()
How to use LR Finder
The page http://puzzlemusa.com/2018/05/14/learning-rate-finder-using-keras/ describes the LR Finder technique.
We will use LR Finder to choose the initial learning rate for training the model.
To run LR Finder, first compile the model with the SGD optimizer:
model1.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
The code for the LR Finder callback is based on a blog post maintained by Abu Saleh Md Musa.
The blog link http://puzzlemusa.com/2018/05/14/learning-rate-finder-using-keras/ doesn't seem to be active at the time of writing this post. I've retained it here in case the author brings it back online.
Functions have been added to print the LR at the minimum loss and at the minimum smoothed loss.
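The callback below sweeps the learning rate exponentially from start_lr to end_lr, multiplying it by a constant factor lr_mult after every batch, and records an exponentially smoothed loss with bias correction. A minimal sketch of the schedule (the numbers assume CIFAR-10 with batch_size=128, i.e. roughly 391 steps in one epoch):
# hedged sketch of the exponential LR sweep performed by the callback
start_lr, end_lr, step_size = 1e-5, 10, 391          # 391 = ceil(50000 / 128)
lr_mult = (end_lr / start_lr) ** (1 / step_size)     # about 1.036 per batch
lrs = [start_lr * lr_mult ** i for i in range(step_size)]
print(lrs[0], lrs[-1])                               # ramps from 1e-5 up to roughly 10 in one epoch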
from keras.callbacks import Callback
class LR_Finder(Callback):
    def __init__(self, start_lr=1e-5, end_lr=10, step_size=None, beta=.98):
        super().__init__()
        self.start_lr = start_lr
        self.end_lr = end_lr
        self.step_size = step_size
        self.beta = beta
        self.lr_mult = (end_lr / start_lr) ** (1 / step_size)
        #print("lr mult : "+str(self.lr_mult))

    def on_train_begin(self, logs=None):
        self.best_loss = 1e9
        self.avg_loss = 0
        self.losses, self.smoothed_losses, self.lrs, self.iterations = [], [], [], []
        self.iteration = 0
        logs = logs or {}
        K.set_value(self.model.optimizer.lr, self.start_lr)

    def on_batch_end(self, epoch, logs=None):
        logs = logs or {}
        loss = logs.get('loss')
        self.iteration += 1
        self.avg_loss = self.beta * self.avg_loss + (1 - self.beta) * loss
        smoothed_loss = self.avg_loss / (1 - self.beta ** self.iteration)
        # Check if the loss is not exploding
        if self.iteration > 1 and smoothed_loss > self.best_loss * 4:
            self.model.stop_training = True
            return
        if smoothed_loss < self.best_loss or self.iteration == 1:
            self.best_loss = smoothed_loss
        lr = self.start_lr * (self.lr_mult ** self.iteration)
        #print("lr = "+str(lr))
        self.losses.append(loss)
        self.smoothed_losses.append(smoothed_loss)
        self.lrs.append(lr)
        self.iterations.append(self.iteration)
        K.set_value(self.model.optimizer.lr, lr)

    def plot_lr(self):
        plt.figure(figsize=(18,12))
        plt.xlabel('Iterations')
        plt.ylabel('Learning rate')
        plt.plot(self.iterations, self.lrs)

    def plot(self, n_skip=10):
        plt.figure(figsize=(18,12))
        plt.ylabel('Loss')
        plt.xlabel('Learning rate (log scale)')
        plt.plot(self.lrs[n_skip:-5], self.losses[n_skip:-5])
        plt.xscale('log')

    def plot_smoothed_loss(self, n_skip=10):
        plt.figure(figsize=(18,12))
        plt.ylabel('Smoothed Losses')
        plt.xlabel('Learning rate (log scale)')
        plt.plot(self.lrs[n_skip:-5], self.smoothed_losses[n_skip:-5])
        plt.xscale('log')

    def plot_loss(self):
        plt.figure(figsize=(18,12))
        plt.ylabel('Losses')
        plt.xlabel('Iterations')
        plt.plot(self.iterations[10:], self.losses[10:])

    def get_best_loss(self):
        return self.best_loss

    def find_lr_at_best_loss(self):
        print("====================================================================")
        print("LR at min loss ")
        print("LR at min loss : "+str(self.lrs[np.argmin(self.losses)]))
        print("LR at min smoothed loss : "+str(self.lrs[np.argmin(self.smoothed_losses)]))
        print("====================================================================")
batch_size=128
lr_finder = LR_Finder(start_lr=1e-5, end_lr=10, step_size=np.ceil(X_train.shape[0]/batch_size))
model1.fit(X_train, Y_train, epochs=1, batch_size=batch_size,callbacks=[lr_finder] )
lr_finder.plot_lr()
lr_finder.plot()
lr_finder.plot_smoothed_loss()
lr_finder.find_lr_at_best_loss()
K.eval(lr_finder.model.optimizer.lr)
From the loss vs LR plots, the steepest drop in loss occurs between LR values of 0.01 and 0.1, and the minimum loss also falls within that range.
Printing the LR corresponding to the minimum loss and the minimum smoothed loss confirms this observation.
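As a hedged alternative to reading the plot by eye, one could also look up the learning rate at the steepest descent of the smoothed loss directly from the lists the callback recorded (a minimal sketch, not part of the original workflow):
# hedged sketch: LR where the smoothed loss falls fastest
skip = 10   # ignore the first few noisy iterations, as the plots above do
sl = np.array(lr_finder.smoothed_losses[skip:-5])
lr_vals = np.array(lr_finder.lrs[skip:-5])
steepest = np.argmin(np.gradient(sl))   # index of the most negative slope
print("LR at steepest smoothed-loss descent:", lr_vals[steepest])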
Let us pick an initial learning rate of about 0.045, since that is where the smoothed loss curve starts going up; the code below rounds this to 0.05.
We will use the SGD optimizer with this learning rate and momentum=0.9 to compile the model.
from keras import optimizers
opt= optimizers.SGD(lr=0.05,momentum=0.9)
model1.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
Cutout Augmentation
Cutout was first presented as an effective augmentation technique in two papers: Improved Regularization of Convolutional Neural Networks with Cutout, and Random Erasing Data Augmentation. The idea is to randomly cut away patches of the images the model is training on, forcing it to learn from more parts of each image. This helps the model learn more features of a class instead of depending on simple cues from small areas within the image, so it generalizes better and makes better predictions. We will use the Python code for random erasing found at https://github.com/yu4u/cutout-random-erasing
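For intuition, the core of random erasing is simply: pick a random rectangle inside the image and overwrite it with random pixel values. A minimal sketch of the idea is below (the helper name simple_random_eraser is hypothetical; the yu4u implementation used in this post adds more options such as area and aspect-ratio ranges and an application probability):
# hedged sketch of random erasing: blank out a random patch of the image
def simple_random_eraser(img, patch=8, v_l=0, v_h=1):
    # copy so the original image is untouched
    img = img.copy()
    h, w, c = img.shape
    top = np.random.randint(0, h - patch)
    left = np.random.randint(0, w - patch)
    # fill the patch with uniform random values in [v_l, v_h]
    img[top:top+patch, left:left+patch, :] = np.random.uniform(v_l, v_h, (patch, patch, c))
    return img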
#get code for random erasing from https://github.com/yu4u/cutout-random-erasing
!wget https://raw.githubusercontent.com/yu4u/cutout-random-erasing/master/random_eraser.py
from keras.preprocessing.image import ImageDataGenerator
from random_eraser import get_random_eraser
batch_size=128
train_datagen=ImageDataGenerator(
featurewise_center=True, # set input mean to 0 over the dataset
#samplewise_center=False, # set each sample mean to 0
featurewise_std_normalization=True, # divide inputs by std of the dataset
#samplewise_std_normalization=False, # divide each input by its std
preprocessing_function=get_random_eraser(v_l=0, v_h=1),
horizontal_flip=True
)
val_datagen= ImageDataGenerator(
featurewise_center=True, # set input mean to 0 over the dataset
featurewise_std_normalization=True, # divide inputs by std of the dataset
)
train_datagen.fit(X_train)
val_datagen.fit(X_test)
training_generator=train_datagen.flow(X_train, Y_train, batch_size=batch_size,shuffle=True,seed=42)
validation_generator=val_datagen.flow(X_test, Y_test, batch_size=batch_size,shuffle=True,seed=42)
# train the model
start = time.time()
model_info = model1.fit_generator(training_generator, epochs=100,
steps_per_epoch=np.ceil(X_train.shape[0]/batch_size),
validation_steps=np.ceil(X_test.shape[0]/batch_size),
validation_data=validation_generator,
shuffle=True,
verbose=0)
end = time.time()
print ("Model took %0.2f seconds to train"%(end - start))
# plot model history
plot_model_history(model_info)
# compute test accuracy
print ("Accuracy on test data is: %0.2f"%accuracy(X_test, Y_test, model1))
print ("Max training accuracy is: %0.2f"%get_max_train_accuracy(model_info))
print ("Max validation accuracy is: %0.2f"%get_max_val_accuracy(model_info))
The model was trained for 100 epochs and reached a max validation accuracy of 87.81. We also notice that this is almost the same as the max training accuracy. Training for more epochs would likely yield better accuracy values. We will stop at 100 epochs and prepare to print the Grad-CAM visualization for some misclassified images from this model's predictions on the test data.
Grad-CAM
Now let us define the function for the Grad-CAM visualization.
This function, named gradcam, takes as input the model, a set of images, the label for each image, and the layer to be used for calculating gradients. It returns a list of dictionaries, each containing the original image, the heatmap, and the titles to display during visualization.
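At its core, Grad-CAM weights each channel of the chosen convolutional layer's feature map by the spatially averaged gradient of the predicted class score with respect to that channel, averages the weighted channels, and applies a ReLU. A minimal numpy sketch of that combination step, using dummy arrays shaped like the 8x8x256 feature map used below:
# hedged sketch of the Grad-CAM combination step with dummy data
A = np.random.rand(8, 8, 256)       # conv feature maps of the chosen layer
alpha = np.random.rand(256)         # per-channel weights = spatial mean of the gradients
cam = np.maximum((A * alpha).mean(axis=-1), 0)   # weight channels, average, then ReLU
cam = cam / cam.max()                            # scale to [0, 1] before resizing to 32x32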
import cv2
from mpl_toolkits.axes_grid1 import ImageGrid
from google.colab.patches import cv2_imshow
from IPython.core.display import display, HTML
#select test images and corresponding labels to print heatmap
#x=np.array([test_features[41],test_features[410],test_features[222],test_features[950]])
#y=[test_labels[41],test_labels[410],test_labels[222],test_labels[950]]
def gradcam(model1, x, y, which_layer):
    results = []
    #make predictions for these images
    preds = model1.predict(x)
    for j in range(x.shape[0]):
        #get class id from the prediction values
        class_idx = np.argmax(preds[j])
        class_output = model1.output[:, class_idx]
        ## choose the layer nearest to prediction that has a size of about 7x7 or 8x8
        #in this case it is the layer being sent to the gradcam function
        last_conv_layer = model1.get_layer(which_layer)
        # compute gradients and form the heatmap
        grads = K.gradients(class_output, last_conv_layer.output)[0]
        pooled_grads = K.mean(grads, axis=(0, 1, 2))
        iterate = K.function([model1.input], [pooled_grads, last_conv_layer.output[0]])
        # feed only image j so the gradients and activations belong to this image
        pooled_grads_value, conv_layer_output_value = iterate([x[j:j+1]])
        #apply the pooled grad values to the conv layer channels
        for i in range(256):
            conv_layer_output_value[:, :, i] *= pooled_grads_value[i]
        #get the mean of the weighted values and assign to heatmap
        heatmap = np.mean(conv_layer_output_value, axis=-1)
        #retain only positive values (or 0) in heatmap
        heatmap = np.maximum(heatmap, 0)
        #scale values between 0 and 1 by dividing by the max value
        heatmap /= np.max(heatmap)
        #we now have a heatmap with size equal to the output size of the layer we chose
        #img is the image we are running gradcam on
        img = x[j]
        #resize heatmap 8x8 to image size of 32x32
        heatmap = cv2.resize(heatmap, (img.shape[1], img.shape[0]))
        #convert pixel values to be between 0 and 255
        heatmap = np.uint8(255 * heatmap)
        #apply a suitable cv2 colormap, in this case COLORMAP_JET
        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
        # convert from BGR to RGB if we want to display using matplotlib
        heatmap1 = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
        # create superimposed image if we want to print using cv2 (cv2_imshow supported in colab)
        superimposed_img = cv2.addWeighted(img, 0.5, heatmap1, 0.5, 0, dtype=5)
        #create a dictionary object with details of image, heatmap, its title
        title1 = str(j+1)+": "+class_names[np.argmax(y[j])]+" predicted as "+str(class_names[class_idx])
        title2 = 'superimposed heatmap'
        image1 = img
        image2 = heatmap1
        image3 = superimposed_img
        imageObj = {'image1':image1,'image2':image2,'image3':image3,'title1':title1,'title2':title2}
        #append the image dict object to the results list
        results.append(imageObj)
        #print(j)
    #return grad-cam results as a list of dictionary objects, each containing an image and its heatmap
    return results
def displayRow(images):
    # we will plot 2 images in a row
    # cv.imshow does not work in jupyter notebooks and colab
    # cv2_imshow patch works on colab but matplotlib gives us a little more flexibility in formatting the display
    # we will use matplotlib to print the image and its heatmap
    fig = plt.figure(1, (13,13))
    grid = ImageGrid(fig, 111,
                     nrows_ncols=(1,5),
                     axes_pad=1, label_mode="1"
                     )
    #horizontal spacer
    #grid[0].imshow(np.ones((32, 10)),alpha=0)
    #grid[0].axis('off')
    #first image
    grid[0].imshow(images[0]['image1'])
    grid[0].set_title(images[0]['title1'])
    grid[0].axis('off')
    #print the original image and on top of it place the heat map at 60% transparency
    grid[1].imshow(images[0]['image1'], alpha=0.9)
    grid[1].imshow(images[0]['image2'], alpha=0.6)
    grid[1].set_title(images[0]['title2'])
    grid[1].axis('off')
    #vertical separator
    grid[2].imshow(np.ones((32, 1)))
    grid[2].axis('off')
    #second image
    grid[3].imshow(images[1]['image1'])
    grid[3].set_title(images[1]['title1'])
    grid[3].axis('off')
    #print the original image and on top of it place the heat map at 60% transparency
    grid[4].imshow(images[1]['image1'], alpha=0.9)
    grid[4].imshow(images[1]['image2'], alpha=0.6)
    grid[4].set_title(images[1]['title2'])
    grid[4].axis('off')
    plt.show()
    display(HTML("<hr size='5' color='black' width='100%' align='center' />"))
pred=model1.predict(X_test)
pred2=np.argmax(pred,axis=1)
wrong_set=[]
correct_set=[]
wrong_labels=[]
true_labels=[]
wrong_indices=[]
for i in range(X_test.shape[0]):
    if (pred2[i] == np.argmax(test_labels[i])):
        correct_set.append(X_test[i])
    else:
        wrong_indices.append(i)
        wrong_labels.append(class_names[pred2[i]])
        true_labels.append(class_names[np.argmax(test_labels[i])])
        wrong_set.append(X_test[i])
w_list=wrong_indices[:26]
x=[]
y=[]
for i in range(len(w_list)):
    x.append(test_features[w_list[i]])
    y.append(test_labels[w_list[i]])
#convert the image list to numpy array
x=np.array(x)
results=gradcam(model1,x,y,'R1') # we choose this layer as the layer nearest to prediction having a size of 8x8
display(HTML("<h2 align='center'>First 26 misclassified images with Grad-CAM heatmap </h2><hr size='5' color='black' width='100%' align='center' />"))
for i in range(0, len(results), 2):
    images = []
    images.append(results[i])
    images.append(results[i+1])
    displayRow(images)