LR Finder
Using LR Finder to fix an optimum learning rate for training the model and using Grad-CAM to visualize heatmaps for mis-classified images
We will use CIFAR-10 dataset for this exercise and build the model using Keras ..
from keras import backend as K
import time
import matplotlib.pyplot as plt
import numpy as np
% matplotlib inline
np.random.seed(2017) 
from keras import regularizers
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D, MaxPooling2D,AveragePooling2D
from keras.layers import Activation, Flatten, Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.preprocessing.image import ImageDataGenerator
from keras.datasets import cifar10
(train_features, train_labels), (test_features, test_labels) = cifar10.load_data()
num_train, img_rows, img_cols,img_channels =  train_features.shape
num_test, _, _, _ =  test_features.shape
num_classes = len(np.unique(train_labels))
print (num_classes)
print (num_train)
print (train_features.shape)
class_names = ['airplane','automobile','bird','cat','deer',
               'dog','frog','horse','ship','truck']
fig = plt.figure(figsize=(8,3))
for i in range(num_classes):
    ax = fig.add_subplot(2, 5, 1 + i, xticks=[], yticks=[])
    idx = np.where(train_labels[:]==i)[0]
    features_idx = train_features[idx,::]
    img_num = np.random.randint(features_idx.shape[0])
    im = features_idx[img_num]
    ax.set_title(class_names[i])
    plt.imshow(im)
plt.show()
def plot_model_history(model_history):
    fig, axs = plt.subplots(1,2,figsize=(15,5))
    # summarize history for accuracy
    axs[0].plot(range(1,len(model_history.history['acc'])+1),model_history.history['acc'])
    axs[0].plot(range(1,len(model_history.history['val_acc'])+1),model_history.history['val_acc'])
    axs[0].set_title('Model Accuracy')
    axs[0].set_ylabel('Accuracy')
    axs[0].set_xlabel('Epoch')
    axs[0].set_xticks(np.arange(1,len(model_history.history['acc'])+1),len(model_history.history['acc'])/10)
    axs[0].legend(['train', 'val'], loc='best')
    # summarize history for loss
    axs[1].plot(range(1,len(model_history.history['loss'])+1),model_history.history['loss'])
    axs[1].plot(range(1,len(model_history.history['val_loss'])+1),model_history.history['val_loss'])
    axs[1].set_title('Model Loss')
    axs[1].set_ylabel('Loss')
    axs[1].set_xlabel('Epoch')
    axs[1].set_xticks(np.arange(1,len(model_history.history['loss'])+1),len(model_history.history['loss'])/10)
    axs[1].legend(['train', 'val'], loc='best')
    plt.show()
def accuracy(test_x, test_y, model):
    result = model.predict(test_x)
    predicted_class = np.argmax(result, axis=1)
    true_class = np.argmax(test_y, axis=1)
    num_correct = np.sum(predicted_class == true_class) 
    accuracy = float(num_correct)/result.shape[0]
    return (accuracy * 100)
def get_max_train_accuracy(model_info):
  train_acc=model_info.history['acc']
  max_train_acc=max(train_acc)
  return (max_train_acc * 100)
def get_max_val_accuracy(model_info):
  val_acc=model_info.history['val_acc']
  max_val_acc=max(val_acc)
  return (max_val_acc * 100)
train_features = train_features.astype('float32')/255
test_features = test_features.astype('float32')/255
# convert class labels to binary class labels
train_labels = np_utils.to_categorical(train_labels, num_classes)
test_labels = np_utils.to_categorical(test_labels, num_classes)
train_labels
(trainX, trainy), (testX, testy) = cifar10.load_data()
print('Statistics train=%.3f (%.3f), test=%.3f (%.3f)' % (trainX.mean(), trainX.std(), testX.mean(), testX.std()))
# create generator that centers pixel values
datagen = ImageDataGenerator(featurewise_center=True, featurewise_std_normalization=True)
# calculate the mean on the training dataset
datagen.fit(trainX)
#print('Data Generator mean=%.3f, std=%.3f' % (datagen.mean, datagen.std))
# demonstrate effect on a single batch of samples
iterator = datagen.flow(trainX, trainy, batch_size=128)
# get a batch
batchX, batchy = iterator.next()
# pixel stats in the batch
print(batchX.shape, batchX.mean(), batchX.std())
# demonstrate effect on entire training dataset
iterator = datagen.flow(trainX, trainy, batch_size=len(trainX), shuffle=False)
# get a batch
batchX, batchy = iterator.next()
# pixel stats in the batch
print(batchX.shape, batchX.mean(), batchX.std())
iterator1 = datagen.flow(testX, testy, batch_size=len(testX), shuffle=False)
batch_testX, batch_testy = iterator1.next()
X_train = batchX
X_test = batch_testX
y_train=batchy
y_test=batch_testy
                        
Y_train = np_utils.to_categorical(y_train, 10)
Y_test = np_utils.to_categorical(y_test, 10)
model1 = Sequential()
model1.add(Convolution2D(32, 3, 3, border_mode='same',kernel_regularizer=regularizers.l2(0.0001), input_shape=(32, 32, 3)))
model1.add(Activation('relu'))
model1.add(BatchNormalization())
model1.add(Convolution2D(64, 3, 3,kernel_regularizer=regularizers.l2(0.0001),border_mode='same'))
model1.add(Activation('relu'))
model1.add(BatchNormalization())
model1.add(MaxPooling2D(pool_size=(2, 2)))
model1.add(Dropout(0.2))
model1.add(Convolution2D(32, 1, 1))
model1.add(Convolution2D(64, 3, 3,kernel_regularizer=regularizers.l2(0.0001),border_mode='same'))
model1.add(Activation('relu'))
model1.add(BatchNormalization())
model1.add(Convolution2D(128, 3, 3,kernel_regularizer=regularizers.l2(0.0001),border_mode='same'))
model1.add(Activation('relu'))
model1.add(BatchNormalization())
model1.add(MaxPooling2D(pool_size=(2, 2)))
model1.add(Dropout(0.3))
model1.add(Convolution2D(32, 1, 1))
model1.add(Convolution2D(128, 3, 3,kernel_regularizer=regularizers.l2(0.0001), border_mode='same'))
model1.add(Activation('relu'))
model1.add(BatchNormalization())
model1.add(Convolution2D(256, 3, 3,kernel_regularizer=regularizers.l2(0.0001), border_mode='same', name='LC1'))
model1.add(Activation('relu',name='R1'))
model1.add(BatchNormalization(name='BN1'))
model1.add(MaxPooling2D(pool_size=(2, 2)))
model1.add(Dropout(0.5))
model1.add(Convolution2D(10, 1, 1, name="red1"))
model1.add(AveragePooling2D(pool_size = (4,4)))
model1.add(Flatten())
model1.add(Activation('softmax'))
model1.summary()
How to use LR Finder
This page http://puzzlemusa.com/2018/05/14/learning-rate-finder-using-keras/ describes how to use this technique of LR Finder
We will use LR Finder to fix our initial learning rate to train the model
In order to use the model with LR Finder compile the model with SGD optimizer
model1.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
How to use LR finder
The code for LR finder callback was done using a blog reference that Abu Saleh Md Musa maintained .
The blog link http://puzzlemusa.com/2018/05/14/learning-rate-finder-using-keras/ doesn't seem to be active at the time of writing this post. I've retained it in the post just in case the author activates it again.
Add functions to print LR at min loss and min smoothed loss
from keras.callbacks import Callback
class LR_Finder(Callback):
    def __init__(self, start_lr=1e-5, end_lr=10, step_size=None, beta=.98):
        super().__init__()
        self.start_lr = start_lr
        self.end_lr = end_lr
        self.step_size = step_size
        self.beta = beta
        self.lr_mult = (end_lr / start_lr) ** (1 / step_size)
        #print("lr mult : "+str(self.lr_mult))
    def on_train_begin(self, logs=None):
        self.best_loss = 1e9
        self.avg_loss = 0
        self.losses, self.smoothed_losses, self.lrs, self.iterations = [], [], [], []
        self.iteration = 0
        logs = logs or {}
        K.set_value(self.model.optimizer.lr, self.start_lr)
    def on_batch_end(self, epoch, logs=None):
        logs = logs or {}
        loss = logs.get('loss')
        self.iteration += 1
        self.avg_loss = self.beta * self.avg_loss + (1 - self.beta) * loss
        smoothed_loss = self.avg_loss / (1 - self.beta ** self.iteration)
        # Check if the loss is not exploding
        if self.iteration > 1 and smoothed_loss > self.best_loss * 4:
            self.model.stop_training = True
            return
        if smoothed_loss < self.best_loss or self.iteration == 1:
            self.best_loss = smoothed_loss
        lr = self.start_lr * (self.lr_mult ** self.iteration)
        #print("lr = "+str(lr))
        self.losses.append(loss)
        self.smoothed_losses.append(smoothed_loss)
        self.lrs.append(lr)
        self.iterations.append(self.iteration)
        K.set_value(self.model.optimizer.lr, lr)
    def plot_lr(self):
      plt.figure(figsize=(18,12))
      plt.xlabel('Iterations')
      plt.ylabel('Learning rate')
      plt.plot(self.iterations, self.lrs)
    def plot(self, n_skip=10):
      plt.figure(figsize=(18,12))
      plt.ylabel('Loss')
      plt.xlabel('Learning rate (log scale)')
      plt.plot(self.lrs[n_skip:-5], self.losses[n_skip:-5])
      plt.xscale('log')
    def plot_smoothed_loss(self, n_skip=10):
      plt.figure(figsize=(18,12))
      plt.ylabel('Smoothed Losses')
      plt.xlabel('Learning rate (log scale)')
      plt.plot(self.lrs[n_skip:-5], self.smoothed_losses[n_skip:-5])
      plt.xscale('log')
    def plot_loss(self):
      plt.figure(figsize=(18,12))
      plt.ylabel('Losses')
      plt.xlabel('Iterations')
      plt.plot(self.iterations[10:], self.losses[10:])
        
    def get_best_loss(self):
      return self.best_loss
        
    def find_lr_at_best_loss(self):
      print("====================================================================")
      print("LR at min loss ")
      print("LR at min loss : "+str(self.lrs[np.argmin(self.losses)]))
      print("LR at min smoothed loss : "+str(self.lrs[np.argmin(self.smoothed_losses)]))
      print("====================================================================")
     
      
batch_size=128
lr_finder = LR_Finder(start_lr=1e-5, end_lr=10, step_size=np.ceil(X_train.shape[0]/batch_size))
model1.fit(X_train, Y_train, epochs=1, batch_size=batch_size,callbacks=[lr_finder] )
lr_finder.plot_lr()
lr_finder.plot()
lr_finder.plot_smoothed_loss()
lr_finder.find_lr_at_best_loss()
K.eval(lr_finder.model.optimizer.lr)
From the loss vs lr plots ,The max rate of descent seems to be between 0.01 and 0.1 and it looks like min loss is between lr values of 0.1 and 0.01 .
printing the lr corresponding to the min loss and min smoothed loss confirms this observation .
Let us pick an initial learning rate of 0.045 since that is where the smoothened loss curve starts going up .
we will use SGD optimizer with lr=0.045 and momentum =0.9 to compile the model
from keras import optimizers
opt= optimizers.SGD(lr=0.05,momentum=0.9)
model1.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
Cutout Augmentation
Cutout was first presented as an effective augmentation technique in these two papers : Improved Regularization of Convolutional Neural Networks with Cutout and Random Erasing Data Augmentation The idea is to randomly cut away patches of information from images that a model is training on to force it to learn from more parts of the image. This would help the model learn more features about a class instead of depending on some simple assumptions using smaller areas within the image . This helps the model generalize better and make better predictions . We will use python code for random erasing found at https://github.com/yu4u/cutout-random-erasing
#get code for random erasing from https://github.com/yu4u/cutout-random-erasing
!wget https://raw.githubusercontent.com/yu4u/cutout-random-erasing/master/random_eraser.py
from keras.preprocessing.image import ImageDataGenerator
from random_eraser import get_random_eraser
batch_size=128
train_datagen=ImageDataGenerator(
        featurewise_center=True,  # set input mean to 0 over the dataset
        #samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=True,  # divide inputs by std of the dataset
        #samplewise_std_normalization=False,  # divide each input by its std
        preprocessing_function=get_random_eraser(v_l=0, v_h=1),
        horizontal_flip=True
    
)
val_datagen= ImageDataGenerator(
        featurewise_center=True,  # set input mean to 0 over the dataset
        
        featurewise_std_normalization=True,  # divide inputs by std of the dataset
        
)
train_datagen.fit(X_train)
val_datagen.fit(X_test)
training_generator=train_datagen.flow(X_train, Y_train, batch_size=batch_size,shuffle=True,seed=42)
validation_generator=val_datagen.flow(X_test, Y_test, batch_size=batch_size,shuffle=True,seed=42)
# train the model
start = time.time()
# Train the model
model_info = model1.fit_generator(training_generator, epochs=100, 
                        steps_per_epoch=np.ceil(X_train.shape[0]/batch_size), 
                    validation_steps=np.ceil(X_test.shape[0]/batch_size), 
                    validation_data=validation_generator,
                                 shuffle=True,
                                 verbose=0)
end = time.time()
print ("Model took %0.2f seconds to train"%(end - start))
# plot model history
plot_model_history(model_info)
# compute test accuracy
print ("Accuracy on test data is: %0.2f"%accuracy(X_test, Y_test, model1))
print ("Max training accuracy is: %0.2f"%get_max_train_accuracy(model_info))
print ("Max validation accuracy is: %0.2f"%get_max_val_accuracy(model_info))
the model was trained for 100 epochs and reached a max val accuracy of 87.81 .We also notice that it almost the same as the max training accuracy . Training more epochs would yield better accuracy values . We will stop at 100 epochs and prepare to print Grad-CAM visualization on some misclassified images from this model's prediction on the test data
Grad-CAM
Now let us define the function for Grad-CAM visualization .
This function named gradcam takes as input the model , the set of images , the labels for each image and the layer to be used for calculating gradients . It returns a list of dictionaries containing original image , the heatmap, the titles to display during visualization
import cv2
from mpl_toolkits.axes_grid1 import ImageGrid
from google.colab.patches import cv2_imshow
from IPython.core.display import display, HTML
#select test images and corresponding labels to print heatmap 
#x=np.array([test_features[41],test_features[410],test_features[222],test_features[950]])
#y=[test_labels[41],test_labels[410],test_labels[222],test_labels[950]]
def gradcam(model1,x,y,which_layer):
  #
  results=[]
  #make prediction for these 4 images 
  preds = model1.predict(x)
  for j in range(x.shape[0]):
    #get class id from the prediction values 
    class_idx = np.argmax(preds[j])
    class_output = model1.output[:, class_idx]
  
    ## choose the layer nearest to prediction that has a size of about 7x7 or 8x8 
    #in this case it is the layer being sent to the gradcam function 
    last_conv_layer = model1.get_layer(which_layer)
  
    # compute gradients and from heatmap 
    grads = K.gradients(class_output, last_conv_layer.output)[0]
    pooled_grads = K.mean(grads, axis=(0, 1, 2))
    iterate = K.function([model1.input], [pooled_grads, last_conv_layer.output[0]])
    pooled_grads_value, conv_layer_output_value = iterate([x])
    
    #apply the pooled grad value to the conv layer channels 
    for i in range(256):
      
      conv_layer_output_value[:, :, i] *= pooled_grads_value[i]
      
    #get the mean of the weighted values and assign to heatmap   
    heatmap = np.mean(conv_layer_output_value, axis=-1)
    #retain only positive values (or 0) in heatmap 
    heatmap = np.maximum(heatmap, 0)
    #convert values between 0 and 1 using divide by max value 
    heatmap /= np.max(heatmap) 
    #we now have a heatmap with size equal to the output size of the layer we chose 
    
    #img is the image we are running gradcam on 
    img = x[j]
    
    #resize heatmap 8x8 to image size of 32x32 
    heatmap = cv2.resize(heatmap, (img.shape[1], img.shape[0]))
    #convert pixel values to be between 0 and 255 
    heatmap = np.uint8(255 * heatmap)
    #apply suitable cv2 colormap . In this case colormap_JET 
    heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
  
    # convert from BGR to RGB if we want to display using matplotlib 
    heatmap1 = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
    
    # create superimposed image if we want to print using cv2 (cv2_imshow supported in colab)
    superimposed_img = cv2.addWeighted(img, 0.5, heatmap1, 0.5, 0,dtype=5)
    
    #create a dictionary object with details of image, heatmap, its title 
    title1=str(j+1)+": "+ class_names[np.argmax(y[j])]+" predicted as "+str(class_names[class_idx])
    title2='superimposed heatmap'
    image1=img
    image2=heatmap1
    image3=superimposed_img
    imageObj={'image1':image1,'image2':image2,'image3':image3,'title1':title1,'title2':title2}
    
    #append the image dict object to results list 
    results.append(imageObj)
    #print(j)
  #return grad-cam results as a list of dictionary objects , each containing an image and its heatmap  
  return results 
def displayRow(images):
  # we will plot 2 images in a row 
  # cv.imshow does not work in jupyter notebooks and colab 
  # cv2_imshow patch works on colab but matplotlib gives us a little more flexibility in formatting the display
  # we will use matplotlib to print the image and its heatmap 
  fig = plt.figure(1, (13,13))
  grid = ImageGrid(fig, 111,  
                 nrows_ncols=(1,5),  
                 axes_pad=1,label_mode="1"  
                 )
  
  #horizontal spacer
  
    
  #grid[0].imshow(np.ones((32, 10)),alpha=0)
  #grid[0].axis('off')
  
  
  
  #first image
  #print(" original class is :"+class_names[np.argmax(y[j])]+" and predicted class is :"+str(class_names[class_idx]))
  grid[0].imshow(images[0]['image1'])
  grid[0].set_title(images[0]['title1'])
  grid[0].axis('off')
  
  #print the original image and on top of it place the heat map at 60% transparency 
  grid[1].imshow(images[0]['image1'],alpha=0.9)
  grid[1].imshow(images[0]['image2'],alpha=0.6)
  grid[1].set_title(images[0]['title2'])
  grid[1].axis('off')
  
  #vertical separator 
    
  grid[2].imshow(np.ones((32, 1)))
  grid[2].axis('off')
  
  #second image 
    
  #print(" original class is :"+class_names[np.argmax(y[j])]+" and predicted class is :"+str(class_names[class_idx]))
  grid[3].imshow(images[1]['image1'])
  grid[3].set_title(images[1]['title1'])
  grid[3].axis('off')
  
  #print the original image and on top of it place the heat map at 60% transparency 
  grid[4].imshow(images[1]['image1'],alpha=0.9)
  grid[4].imshow(images[1]['image2'],alpha=0.6)
  grid[4].set_title(images[1]['title2'])
  grid[4].axis('off')
  
  
  
  plt.show()
  display(HTML("<hr size='5' color='black' width='100%' align='center' />"))
pred=model1.predict(X_test)
pred2=np.argmax(pred,axis=1)
wrong_set=[]
correct_set=[]
wrong_labels=[]
true_labels=[]
wrong_indices=[]
for i in range(X_test.shape[0]):
  
  if (pred2[i]==np.argmax(test_labels[i])):
    
    correct_set.append(X_test[i])
  else:
    wrong_indices.append(i)
    wrong_labels.append(class_names[pred2[i]])
    true_labels.append(class_names[np.argmax(test_labels[i])])
    wrong_set.append(X_test[i])
w_list=wrong_indices[:26]
x=[]
y=[]
for i in range(len(w_list)):
  x.append(test_features[w_list[i]])
  y.append(test_labels[w_list[i]])
  
#convert the image list to numpy array   
x=np.array(x)
results=gradcam(model1,x,y,'R1') # we choose this layer as the layer nearest to prediction having a size of 8x8 
display(HTML("<h2 align='center'>First 26 misclassified images with Grad-CAM heatmap </h2><hr size='5' color='black' width='100%' align='center' />"))
for i in range(0,len(results),2):
  images=[]
  images.append(results[i])
  images.append(results[i+1])
  displayRow(images)