We will use the CIFAR-10 dataset for this exercise and build the model using Keras.

import necessary modules

from keras import backend as K
import time
import matplotlib.pyplot as plt
import numpy as np
% matplotlib inline
np.random.seed(2017) 
from keras import regularizers
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D, MaxPooling2D,AveragePooling2D
from keras.layers import Activation, Flatten, Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.preprocessing.image import ImageDataGenerator

Using TensorFlow backend.

get CIFAR10 dataset and set the train and test data

from keras.datasets import cifar10
# Load the CIFAR-10 train/test split (fetched by Keras on first use).
(train_features, train_labels), (test_features, test_labels) = cifar10.load_data()

# The training array is laid out as (samples, rows, cols, channels).
num_train, img_rows, img_cols, img_channels = train_features.shape
num_test, _, _, _ = test_features.shape

# Number of distinct label values found in the training labels.
num_classes = np.unique(train_labels).size

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
170500096/170498071 [==============================] - 9s 0us/step

# Quick sanity check of the dataset dimensions, one value per line.
for value in (num_classes, num_train, train_features.shape):
    print(value)

10
50000
(50000, 32, 32, 3)

inspect some of the images from the dataset by printing

# Human-readable names, indexed by the integer class label.
class_names = ['airplane','automobile','bird','cat','deer',
               'dog','frog','horse','ship','truck']

# Show one randomly chosen training image per class in a 2 x 5 grid.
fig = plt.figure(figsize=(8, 3))
for label in range(num_classes):
    ax = fig.add_subplot(2, 5, label + 1, xticks=[], yticks=[])
    # Row indices of every training sample carrying this label.
    members = np.where(train_labels[:] == label)[0]
    pick = np.random.randint(members.shape[0])
    ax.set_title(class_names[label])
    ax.imshow(train_features[members[pick]])
plt.show()

function for plotting accuracy vs number of epochs

def _plot_history_curves(ax, history, metric, title, ylabel):
    """Draw the train and validation curves of one metric on ``ax``.

    ``history`` is the ``History.history`` dict; the validation series is
    read from the key ``'val_' + metric``.
    """
    train_values = history[metric]
    val_values = history['val_' + metric]
    ax.plot(range(1, len(train_values) + 1), train_values)
    ax.plot(range(1, len(val_values) + 1), val_values)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    # About ten evenly spaced ticks. The step has to be the third argument
    # of np.arange (not a second argument of set_xticks) and must be at
    # least 1 so that runs shorter than ten epochs still get ticks.
    n_epochs = len(train_values)
    ax.set_xticks(np.arange(1, n_epochs + 1, max(1, n_epochs // 10)))
    ax.legend(['train', 'val'], loc='best')


def plot_model_history(model_history):
    """Plot accuracy and loss against the epoch number, side by side.

    Args:
        model_history: object whose ``history`` attribute is a dict holding
            the per-epoch series ``'acc'``, ``'val_acc'``, ``'loss'`` and
            ``'val_loss'`` (as returned by ``fit`` / ``fit_generator`` here).
    """
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    history = model_history.history
    _plot_history_curves(axs[0], history, 'acc', 'Model Accuracy', 'Accuracy')
    _plot_history_curves(axs[1], history, 'loss', 'Model Loss', 'Loss')
    plt.show()

function to calculate accuracy on test data

def accuracy(test_x, test_y, model):
    """Return the percentage of samples in ``test_x`` that ``model`` classifies correctly.

    ``test_y`` holds one row of class scores per sample (one-hot here); the
    true class is the arg-max of each row, as is the predicted class.
    """
    scores = model.predict(test_x)
    hits = np.argmax(scores, axis=1) == np.argmax(test_y, axis=1)
    fraction_correct = float(np.sum(hits)) / scores.shape[0]
    return fraction_correct * 100

function to get max training accuracy from model history

def get_max_train_accuracy(model_info):
    """Return the best per-epoch training accuracy in ``model_info.history['acc']``, as a percentage."""
    best_fraction = max(model_info.history['acc'])
    return best_fraction * 100

function to get max validation accuracy from model history

def get_max_val_accuracy(model_info):
    """Return the best per-epoch validation accuracy in ``model_info.history['val_acc']``, as a percentage."""
    best_fraction = max(model_info.history['val_acc'])
    return best_fraction * 100

Scale the pixel values of the train and test images by dividing them by 255, and convert the train and test labels to one-hot encoded categorical vectors

# Cast the images to float32 and divide by 255.
train_features, test_features = (
    images.astype('float32') / 255 for images in (train_features, test_features)
)

# Turn the integer class labels into one-hot rows of width num_classes.
train_labels, test_labels = (
    np_utils.to_categorical(labels, num_classes)
    for labels in (train_labels, test_labels)
)

train_labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

Training dataset statistics: mean and standard deviation for the whole dataset, for a batch of 128 images, etc.

# Reload CIFAR-10 so the statistics are computed on the arrays exactly as
# load_data() returns them (the earlier copies were already divided by 255).
(trainX, trainy), (testX, testy) = cifar10.load_data()

# Global mean (and standard deviation) of the train and test pixels.
print('Statistics train=%.3f (%.3f), test=%.3f (%.3f)' % (trainX.mean(), trainX.std(), testX.mean(), testX.std()))

# Generator that centers and scales using feature-wise statistics, which
# fit() computes from the training images.
datagen = ImageDataGenerator(featurewise_center=True, featurewise_std_normalization=True)
datagen.fit(trainX)

# Effect on a single batch of 128 samples.
iterator = datagen.flow(trainX, trainy, batch_size=128)
batchX, batchy = next(iterator)
print(batchX.shape, batchX.mean(), batchX.std())

# Effect on the entire training set, pulled as one unshuffled batch.
iterator = datagen.flow(trainX, trainy, batch_size=len(trainX), shuffle=False)
batchX, batchy = next(iterator)
print(batchX.shape, batchX.mean(), batchX.std())

Statistics train=120.708 (64.150), test=121.529 (64.061)
(128, 32, 32, 3) 0.01989002 1.0052702
(50000, 32, 32, 3) -1.6605131e-06 1.0000001

# Push the whole test set through the same (train-fitted) generator as one
# unshuffled batch, so the sample order matches testX / testy.
iterator1 = datagen.flow(testX, testy, batch_size=len(testX), shuffle=False)
batch_testX, batch_testy = next(iterator1)

# Standardized images and their integer labels used for the rest of the notebook.
X_train, X_test = batchX, batch_testX
y_train, y_test = batchy, batch_testy

# One-hot encode the labels for the 10 classes.
Y_train, Y_test = (
    np_utils.to_categorical(labels, 10) for labels in (y_train, y_test)
)

Use the following standardization/regularization techniques for the model

Using Image Normalization
Making use of Batch Normalization
Making use of L2 Regularizer
Properly using Dropout

# Network definition, written against the Keras 2 layer signature
# (kernel size as a tuple, `padding=` instead of `border_mode=`). This is the
# form the Keras deprecation warnings ask for and it builds the same layers.
#
# Layout: three stages of two 3x3 conv -> ReLU -> BatchNorm groups, each stage
# closed by 2x2 max-pooling and dropout, with a 1x1 convolution between
# stages to reduce the channel count. The head is a 1x1 convolution to 10
# channels, 4x4 average pooling and a softmax.
model1 = Sequential()

# ---- stage 1: 32x32 feature maps ----
model1.add(Convolution2D(32, (3, 3), padding='same', kernel_regularizer=regularizers.l2(0.0001), input_shape=(32, 32, 3)))
model1.add(Activation('relu'))
model1.add(BatchNormalization())

model1.add(Convolution2D(64, (3, 3), padding='same', kernel_regularizer=regularizers.l2(0.0001)))
model1.add(Activation('relu'))
model1.add(BatchNormalization())

model1.add(MaxPooling2D(pool_size=(2, 2)))
model1.add(Dropout(0.2))

# 1x1 convolution: reduce to 32 channels before the next stage
model1.add(Convolution2D(32, (1, 1)))

# ---- stage 2: 16x16 feature maps ----
model1.add(Convolution2D(64, (3, 3), padding='same', kernel_regularizer=regularizers.l2(0.0001)))
model1.add(Activation('relu'))
model1.add(BatchNormalization())

model1.add(Convolution2D(128, (3, 3), padding='same', kernel_regularizer=regularizers.l2(0.0001)))
model1.add(Activation('relu'))
model1.add(BatchNormalization())

model1.add(MaxPooling2D(pool_size=(2, 2)))
model1.add(Dropout(0.3))

# 1x1 convolution: reduce to 32 channels before the next stage
model1.add(Convolution2D(32, (1, 1)))

# ---- stage 3: 8x8 feature maps ----
model1.add(Convolution2D(128, (3, 3), padding='same', kernel_regularizer=regularizers.l2(0.0001)))
model1.add(Activation('relu'))
model1.add(BatchNormalization())

# The last conv group carries explicit names so it can be looked up later
# with model1.get_layer(...) (the Grad-CAM code uses 'R1').
model1.add(Convolution2D(256, (3, 3), padding='same', kernel_regularizer=regularizers.l2(0.0001), name='LC1'))
model1.add(Activation('relu', name='R1'))
model1.add(BatchNormalization(name='BN1'))

model1.add(MaxPooling2D(pool_size=(2, 2)))
model1.add(Dropout(0.5))

# ---- classification head ----
# 1x1 convolution down to one channel per class, then average the 4x4 map.
model1.add(Convolution2D(10, (1, 1), name="red1"))

model1.add(AveragePooling2D(pool_size=(4, 4)))
model1.add(Flatten())

model1.add(Activation('softmax'))

WARNING: Logging before flag parsing goes to stderr.
W0720 16:04:14.356874 140478930995072 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: UserWarning: Update your `Conv2D` call to the Keras 2 API: `Conv2D(32, (3, 3), kernel_regularizer=<keras.reg..., input_shape=(32, 32, 3..., padding="same")`
  
W0720 16:04:14.388915 140478930995072 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0720 16:04:14.394831 140478930995072 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0720 16:04:14.438685 140478930995072 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0720 16:04:14.439589 140478930995072 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W0720 16:04:16.704932 140478930995072 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1834: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:6: UserWarning: Update your `Conv2D` call to the Keras 2 API: `Conv2D(64, (3, 3), kernel_regularizer=<keras.reg..., padding="same")`
  
W0720 16:04:16.960443 140478930995072 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0720 16:04:16.970278 140478930995072 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:13: UserWarning: Update your `Conv2D` call to the Keras 2 API: `Conv2D(32, (1, 1))`
  del sys.path[0]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:16: UserWarning: Update your `Conv2D` call to the Keras 2 API: `Conv2D(64, (3, 3), kernel_regularizer=<keras.reg..., padding="same")`
  app.launch_new_instance()
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:20: UserWarning: Update your `Conv2D` call to the Keras 2 API: `Conv2D(128, (3, 3), kernel_regularizer=<keras.reg..., padding="same")`
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:27: UserWarning: Update your `Conv2D` call to the Keras 2 API: `Conv2D(32, (1, 1))`
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:30: UserWarning: Update your `Conv2D` call to the Keras 2 API: `Conv2D(128, (3, 3), kernel_regularizer=<keras.reg..., padding="same")`
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:34: UserWarning: Update your `Conv2D` call to the Keras 2 API: `Conv2D(256, (3, 3), kernel_regularizer=<keras.reg..., name="LC1", padding="same")`
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:41: UserWarning: Update your `Conv2D` call to the Keras 2 API: `Conv2D(10, (1, 1), name="red1")`
W0720 16:04:17.392197 140478930995072 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3980: The name tf.nn.avg_pool is deprecated. Please use tf.nn.avg_pool2d instead.

print model summary

# Print the layer-by-layer table (output shapes and parameter counts) of model1.
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d_1 (Conv2D)            (None, 32, 32, 32)        896       
_________________________________________________________________
activation_1 (Activation)    (None, 32, 32, 32)        0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 32, 32, 32)        128       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 32, 32, 64)        18496     
_________________________________________________________________
activation_2 (Activation)    (None, 32, 32, 64)        0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 32, 32, 64)        256       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 16, 16, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16, 16, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 16, 16, 32)        2080      
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 16, 16, 64)        18496     
_________________________________________________________________
activation_3 (Activation)    (None, 16, 16, 64)        0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 16, 16, 64)        256       
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 16, 16, 128)       73856     
_________________________________________________________________
activation_4 (Activation)    (None, 16, 16, 128)       0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 16, 16, 128)       512       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 8, 8, 128)         0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 8, 8, 128)         0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 8, 8, 32)          4128      
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 8, 8, 128)         36992     
_________________________________________________________________
activation_5 (Activation)    (None, 8, 8, 128)         0         
_________________________________________________________________
batch_normalization_5 (Batch (None, 8, 8, 128)         512       
_________________________________________________________________
LC1 (Conv2D)                 (None, 8, 8, 256)         295168    
_________________________________________________________________
R1 (Activation)              (None, 8, 8, 256)         0         
_________________________________________________________________
BN1 (BatchNormalization)     (None, 8, 8, 256)         1024      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 4, 4, 256)         0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 4, 4, 256)         0         
_________________________________________________________________
red1 (Conv2D)                (None, 4, 4, 10)          2570      
_________________________________________________________________
average_pooling2d_1 (Average (None, 1, 1, 10)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 10)                0         
_________________________________________________________________
activation_6 (Activation)    (None, 10)                0         
=================================================================
Total params: 455,370
Trainable params: 454,026
Non-trainable params: 1,344
_________________________________________________________________

Total params: 455,370

How to use LR Finder

This page http://puzzlemusa.com/2018/05/14/learning-rate-finder-using-keras/ describes how to use this technique of LR Finder

We will use LR Finder to fix our initial learning rate to train the model

In order to use the model with LR Finder compile the model with SGD optimizer

# Compile with the default 'sgd' optimizer for the LR finder run; the finder
# overwrites the optimizer's learning rate itself on every batch.
model1.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

W0720 16:04:17.436660 140478930995072 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

How to use LR finder

The code for LR finder callback was done using a blog reference that Abu Saleh Md Musa maintained .

The blog link http://puzzlemusa.com/2018/05/14/learning-rate-finder-using-keras/ doesn't seem to be active at the time of writing this post. I've retained it in the post just in case the author activates it again.

Add functions to print LR at min loss and min smoothed loss

from keras.callbacks import Callback

class LR_Finder(Callback):
    """Keras callback implementing a learning-rate range test.

    The learning rate starts at ``start_lr`` and is multiplied by a constant
    factor after every batch so that it reaches ``end_lr`` after
    ``step_size`` batches. The raw loss, a bias-corrected exponentially
    smoothed loss and the learning rate are recorded per batch; training is
    stopped early once the smoothed loss exceeds four times its best value.

    Args:
        start_lr: learning rate set at the start of training.
        end_lr: learning rate reached after ``step_size`` batches.
        step_size: number of batches over which to sweep. Required; the
            ``None`` default only exists to keep the keyword-style signature.
        beta: smoothing factor of the exponential moving average of the loss.

    Raises:
        TypeError: if ``step_size`` is not given.
    """

    def __init__(self, start_lr=1e-5, end_lr=10, step_size=None, beta=.98):
        super().__init__()

        if step_size is None:
            # Without this check the division below fails with an opaque
            # "unsupported operand type(s)" TypeError.
            raise TypeError(
                "LR_Finder requires step_size (the number of batches in the "
                "learning-rate sweep)")

        self.start_lr = start_lr
        self.end_lr = end_lr
        self.step_size = step_size
        self.beta = beta
        # Per-batch multiplier: start_lr * lr_mult ** step_size == end_lr.
        self.lr_mult = (end_lr / start_lr) ** (1 / step_size)

    def on_train_begin(self, logs=None):
        """Reset all recorded state and set the optimizer to ``start_lr``."""
        self.best_loss = 1e9
        self.avg_loss = 0
        self.losses, self.smoothed_losses, self.lrs, self.iterations = [], [], [], []
        self.iteration = 0
        logs = logs or {}
        K.set_value(self.model.optimizer.lr, self.start_lr)

    def on_batch_end(self, epoch, logs=None):
        """Record the batch loss and raise the learning rate for the next batch.

        Note: this hook is invoked once per batch, so the first positional
        argument is the value Keras passes for the batch; the parameter keeps
        its original name ``epoch`` for compatibility and is not used.
        """
        logs = logs or {}
        loss = logs.get('loss')
        self.iteration += 1

        # Exponential moving average of the loss, divided by
        # (1 - beta ** t) to correct the bias towards the zero start value.
        self.avg_loss = self.beta * self.avg_loss + (1 - self.beta) * loss
        smoothed_loss = self.avg_loss / (1 - self.beta ** self.iteration)

        # Stop as soon as the loss is clearly exploding; this batch is then
        # not recorded.
        if self.iteration > 1 and smoothed_loss > self.best_loss * 4:
            self.model.stop_training = True
            return

        if smoothed_loss < self.best_loss or self.iteration == 1:
            self.best_loss = smoothed_loss

        # Learning rate that will be in effect for the next batch.
        lr = self.start_lr * (self.lr_mult ** self.iteration)

        self.losses.append(loss)
        self.smoothed_losses.append(smoothed_loss)
        self.lrs.append(lr)
        self.iterations.append(self.iteration)

        K.set_value(self.model.optimizer.lr, lr)

    def plot_lr(self):
        """Plot the recorded learning rate against the iteration number."""
        plt.figure(figsize=(18, 12))
        plt.xlabel('Iterations')
        plt.ylabel('Learning rate')
        plt.plot(self.iterations, self.lrs)

    def plot(self, n_skip=10):
        """Plot raw loss against learning rate (log x-axis).

        The first ``n_skip`` and the last 5 recorded points are left out.
        """
        plt.figure(figsize=(18, 12))
        plt.ylabel('Loss')
        plt.xlabel('Learning rate (log scale)')
        plt.plot(self.lrs[n_skip:-5], self.losses[n_skip:-5])
        plt.xscale('log')

    def plot_smoothed_loss(self, n_skip=10):
        """Plot smoothed loss against learning rate (log x-axis).

        The first ``n_skip`` and the last 5 recorded points are left out.
        """
        plt.figure(figsize=(18, 12))
        plt.ylabel('Smoothed Losses')
        plt.xlabel('Learning rate (log scale)')
        plt.plot(self.lrs[n_skip:-5], self.smoothed_losses[n_skip:-5])
        plt.xscale('log')

    def plot_loss(self):
        """Plot raw loss against iteration number, skipping the first 10 points."""
        plt.figure(figsize=(18, 12))
        plt.ylabel('Losses')
        plt.xlabel('Iterations')
        plt.plot(self.iterations[10:], self.losses[10:])

    def get_best_loss(self):
        """Return the lowest smoothed loss seen so far."""
        return self.best_loss

    def find_lr_at_best_loss(self):
        """Print the learning rates recorded at the minimum raw and smoothed loss."""
        print("====================================================================")
        print("LR at min loss ")
        print("LR at min loss : "+str(self.lrs[np.argmin(self.losses)]))
        print("LR at min smoothed loss : "+str(self.lrs[np.argmin(self.smoothed_losses)]))
        print("====================================================================")

Run LR_finder for 1 epoch with start_lr=1e-5 , end_lr=10

batch_size = 128

# Sweep the learning rate from 1e-5 to 10 over exactly one epoch of batches.
lr_finder = LR_Finder(start_lr=1e-5, end_lr=10, step_size=np.ceil(X_train.shape[0]/batch_size))

# The range test trains the network with learning rates all the way up to
# end_lr, which leaves the weights in a poor state. Snapshot them first and
# restore them afterwards so the real training run starts from the original
# initialization instead of from the finder's leftovers.
initial_weights = model1.get_weights()
model1.fit(X_train, Y_train, epochs=1, batch_size=batch_size, callbacks=[lr_finder])
model1.set_weights(initial_weights)

plot the following

1. LR vs iterations

2. Loss vs LR(log scale)

3. Smoothed Loss vs LR(log scale)

From the smoothed loss vs LR plot we can see that the max descent for loss is between lr of 0.01 and 0.1

# 1. learning rate vs. iteration
lr_finder.plot_lr()

# 2. raw loss vs. learning rate (log scale)
lr_finder.plot()

# 3. smoothed loss vs. learning rate (log scale)
lr_finder.plot_smoothed_loss()

# print the learning rates recorded at the minimum raw / smoothed loss
lr_finder.find_lr_at_best_loss()

====================================================================
LR at min loss 
LR at min loss : 0.04489251258218551
LR at min smoothed loss : 0.04489251258218551
====================================================================

# Learning rate left on the optimizer after the sweep (the last value the finder set).
K.eval(lr_finder.model.optimizer.lr)

10.0

From the loss vs. LR plots, the maximum rate of descent seems to be between 0.01 and 0.1, and the minimum loss also appears to lie between LR values of 0.01 and 0.1.

printing the lr corresponding to the min loss and min smoothed loss confirms this observation .

Let us pick an initial learning rate of 0.045, since that is where the smoothed loss curve starts going up.

we will use SGD optimizer with lr=0.045 and momentum =0.9 to compile the model

from keras import optimizers
# SGD with momentum, using the learning rate read off the LR finder
# (~0.0449 at the minimum smoothed loss, rounded to 0.045).
opt = optimizers.SGD(lr=0.045, momentum=0.9)
model1.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

Cutout Augmentation

Cutout was first presented as an effective augmentation technique in two papers: "Improved Regularization of Convolutional Neural Networks with Cutout" and "Random Erasing Data Augmentation". The idea is to randomly cut away patches of information from the images a model is training on, forcing it to learn from more parts of each image. This helps the model learn more features about a class instead of depending on simple assumptions drawn from smaller areas within the image, which in turn helps it generalize better and make better predictions. We will use the Python code for random erasing found at https://github.com/yu4u/cutout-random-erasing

# Download random_eraser.py (the random-erasing / cutout helper) from
# https://github.com/yu4u/cutout-random-erasing into the working directory so
# it can be imported below.
# NOTE(review): this fetches whatever is on the master branch at run time and
# the file is then imported and executed; consider pinning a commit.
!wget https://raw.githubusercontent.com/yu4u/cutout-random-erasing/master/random_eraser.py

train the model for 100 epochs . Use image augmentation of random cutout , horizontal flip

plot accuracy vs epochs , print accuracy and max val accuracy

from keras.preprocessing.image import ImageDataGenerator
from random_eraser import get_random_eraser

batch_size = 128

# Training-time pipeline: random erasing (cutout) and horizontal flips, plus
# feature-wise centering / scaling.
# NOTE(review): v_l / v_h are presumably the value range used to fill the
# erased patch -- confirm against random_eraser.py.
train_datagen = ImageDataGenerator(
    featurewise_center=True,              # set input mean to 0 over the dataset
    featurewise_std_normalization=True,   # divide inputs by std of the dataset
    preprocessing_function=get_random_eraser(v_l=0, v_h=1),
    horizontal_flip=True,
)

# Validation pipeline: the same normalization, no augmentation.
val_datagen = ImageDataGenerator(
    featurewise_center=True,              # set input mean to 0 over the dataset
    featurewise_std_normalization=True,   # divide inputs by std of the dataset
)

# Both generators take their normalization statistics from the TRAINING
# data. Fitting the validation generator on the test set would leak test
# statistics into the evaluation and normalize validation inputs differently
# from what the network saw during training.
train_datagen.fit(X_train)
val_datagen.fit(X_train)

training_generator = train_datagen.flow(X_train, Y_train, batch_size=batch_size, shuffle=True, seed=42)
validation_generator = val_datagen.flow(X_test, Y_test, batch_size=batch_size, shuffle=True, seed=42)

# Whole number of batches needed to cover each array once per epoch.
train_steps = int(np.ceil(X_train.shape[0] / batch_size))
val_steps = int(np.ceil(X_test.shape[0] / batch_size))

# Train the model and time the run.
start = time.time()
model_info = model1.fit_generator(training_generator, epochs=100,
                                  steps_per_epoch=train_steps,
                                  validation_steps=val_steps,
                                  validation_data=validation_generator,
                                  shuffle=True,
                                  verbose=0)
end = time.time()
print ("Model took %0.2f seconds to train"%(end - start))
# plot model history
plot_model_history(model_info)
# compute test accuracy
print ("Accuracy on test data is: %0.2f"%accuracy(X_test, Y_test, model1))
print ("Max training accuracy is: %0.2f"%get_max_train_accuracy(model_info))
print ("Max validation accuracy is: %0.2f"%get_max_val_accuracy(model_info))

Model took 1151.59 seconds to train

Accuracy on test data is: 86.24
Max training accuracy is: 87.53
Max validation accuracy is: 87.81

The model was trained for 100 epochs and reached a max validation accuracy of 87.81. We also notice that it is almost the same as the max training accuracy, which suggests that training for more epochs could yield better accuracy values. We will stop at 100 epochs and prepare to print Grad-CAM visualizations for some of the images this model misclassified in the test data.

Grad-CAM

Now let us define the function for Grad-CAM visualization .

This function named gradcam takes as input the model , the set of images , the labels for each image and the layer to be used for calculating gradients . It returns a list of dictionaries containing original image , the heatmap, the titles to display during visualization

import cv2
from mpl_toolkits.axes_grid1 import ImageGrid
from google.colab.patches import cv2_imshow
from IPython.core.display import display, HTML


#select test images and corresponding labels to print heatmap 
#x=np.array([test_features[41],test_features[410],test_features[222],test_features[950]])
#y=[test_labels[41],test_labels[410],test_labels[222],test_labels[950]]


def gradcam(model1,x,y,which_layer):
    """Compute a Grad-CAM heatmap for every image in ``x``.

    For each image the class with the highest predicted score is explained:
    the gradient of that class score with respect to the output of
    ``which_layer`` is averaged per channel, the channel activations are
    weighted by those averages, and the channel mean is rectified,
    normalized, resized to the image size and colour-mapped.

    Args:
        model1: the Keras model to explain.
        x: array of images, indexed along the first axis; it is fed to the
            model as-is and also used as the displayed image.
        y: per-image label vectors; ``np.argmax(y[j])`` is taken as the true
            class of image ``j``.
        which_layer: name of the layer whose output the heatmap is built on.

    Returns:
        A list with one dict per image holding ``'image1'`` (the input
        image), ``'image2'`` (the RGB heatmap), ``'image3'`` (image and
        heatmap blended) and the captions ``'title1'`` / ``'title2'``.
    """
    results = []
    preds = model1.predict(x)
    last_conv_layer = model1.get_layer(which_layer)

    # One backend function per predicted class, built on first use. This
    # avoids adding a fresh set of gradient ops to the graph for every image.
    grad_fns = {}

    for j in range(x.shape[0]):
        class_idx = int(np.argmax(preds[j]))

        if class_idx not in grad_fns:
            class_output = model1.output[:, class_idx]
            grads = K.gradients(class_output, last_conv_layer.output)[0]
            pooled_grads = K.mean(grads, axis=(0, 1, 2))
            grad_fns[class_idx] = K.function(
                [model1.input], [pooled_grads, last_conv_layer.output[0]])

        # Feed ONLY image j (kept as a batch of one). Feeding the whole batch
        # would average the gradients over every image and, because of the
        # `output[0]` above, always return the activations of the first one.
        pooled_grads_value, conv_layer_output_value = grad_fns[class_idx]([x[j:j + 1]])

        # Weight every channel by its pooled gradient (broadcast over the
        # last axis, so any channel count works) and average the channels.
        heatmap = np.mean(conv_layer_output_value * pooled_grads_value, axis=-1)
        # retain only positive values (or 0) in the heatmap
        heatmap = np.maximum(heatmap, 0)
        # Scale to [0, 1]; an all-zero map is left alone instead of producing NaNs.
        peak = np.max(heatmap)
        if peak > 0:
            heatmap = heatmap / peak

        # img is the image we are running Grad-CAM on
        img = x[j]

        # resize the heatmap to the image size
        heatmap = cv2.resize(heatmap, (img.shape[1], img.shape[0]))
        # convert values to the 0..255 range expected by the colormap
        heatmap = np.uint8(255 * heatmap)
        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)

        # OpenCV colormaps are BGR; convert to RGB for matplotlib
        heatmap1 = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)

        # 50/50 blend of image and heatmap as float32.
        # NOTE(review): the heatmap is 0..255 while img keeps whatever range
        # the caller passed in; if img is scaled to [0, 1] it barely shows in
        # this blend -- confirm before using 'image3'.
        superimposed_img = cv2.addWeighted(img, 0.5, heatmap1, 0.5, 0, dtype=cv2.CV_32F)

        title1 = str(j+1)+": "+ class_names[np.argmax(y[j])]+" predicted as "+str(class_names[class_idx])
        title2 = 'superimposed heatmap'
        results.append({'image1': img, 'image2': heatmap1, 'image3': superimposed_img,
                        'title1': title1, 'title2': title2})

    return results

Define the function to display the Grad-CAM visualizations .

This function displays a set of two images with heatmap visuals per row

def displayRow(images):
    """Show up to two Grad-CAM results side by side, followed by a horizontal rule.

    The row has five slots: image, image + heatmap overlay, a thin blank
    separator, then image and overlay for the second entry. When only one
    entry is supplied the right-hand slots stay empty instead of raising an
    IndexError.

    Args:
        images: sequence of result dicts as produced by ``gradcam`` (keys
            ``'image1'``, ``'image2'``, ``'title1'``, ``'title2'``); entries
            beyond the second are ignored.
    """
    # cv2.imshow does not work in Jupyter/Colab; matplotlib is used because
    # it gives more control over the layout.
    fig = plt.figure(1, (13,13))

    grid = ImageGrid(fig, 111,
                     nrows_ncols=(1,5),
                     axes_pad=1,label_mode="1"
                     )

    # (image slot, overlay slot) for the first and the second entry
    slots = ((0, 1), (3, 4))
    for (image_slot, overlay_slot), entry in zip(slots, images):
        grid[image_slot].imshow(entry['image1'])
        grid[image_slot].set_title(entry['title1'])

        # original image with the heatmap placed on top at 60% opacity
        grid[overlay_slot].imshow(entry['image1'],alpha=0.9)
        grid[overlay_slot].imshow(entry['image2'],alpha=0.6)
        grid[overlay_slot].set_title(entry['title2'])

    # thin blank strip separating the two image pairs
    grid[2].imshow(np.ones((32, 1)))

    # hide the axes of every slot, used or not
    for slot in range(5):
        grid[slot].axis('off')

    plt.show()
    display(HTML("<hr size='5' color='black' width='100%' align='center' />"))

Make predictions using the model and collect all the images that were classified wrongly

# Predicted class per test image (arg-max over the class scores).
pred = model1.predict(X_test)
pred2 = np.argmax(pred, axis=1)

# Split the test images into correctly and wrongly classified ones; for the
# wrong ones also keep the index plus predicted and true class names.
wrong_set, correct_set = [], []
wrong_labels, true_labels, wrong_indices = [], [], []
for i in range(X_test.shape[0]):
    actual = np.argmax(test_labels[i])
    if pred2[i] == actual:
        correct_set.append(X_test[i])
        continue
    wrong_indices.append(i)
    wrong_labels.append(class_names[pred2[i]])
    true_labels.append(class_names[actual])
    wrong_set.append(X_test[i])

Now take the first 26 images and the corresponding labels to create the data for Grad-CAM visualization

# Indices of the first 26 misclassified test images.
w_list = wrong_indices[:26]

# NOTE(review): the images are taken from test_features (pixels divided by
# 255), whereas the misclassifications above were determined on X_test (the
# generator-standardized copy); the model's prediction on these inputs may
# therefore differ -- confirm this is intended.
x = np.array([test_features[index] for index in w_list])
y = [test_labels[index] for index in w_list]

Obtain results from the gradcam function

# Build the heatmaps on layer 'R1': the activation after the last 3x3
# convolution, whose output is 8x8 (see the model summary).
results=gradcam(model1,x,y,'R1') # we choose this layer as the layer nearest to prediction having a size of 8x8

display the results from gradcam function with images and corresponding heatmap visuals

display(HTML("<h2 align='center'>First 26 misclassified images with Grad-CAM heatmap </h2><hr size='5' color='black' width='100%' align='center' />"))

# Walk through the results two at a time; each pair fills one display row.
for i in range(0, len(results), 2):
    images = [results[i], results[i + 1]]
    displayRow(images)

We used the LR finder to choose a learning rate of 0.045 for training the model on the CIFAR-10 dataset. The model reached a max validation accuracy of 87.81, which was almost the same as the max training accuracy, suggesting that it could reach even higher accuracy with more epochs. We then used Grad-CAM to visualize the heatmaps for a set of 26 images that this model misclassified.