Gradient-weighted Class Activation Mapping (Grad-CAM) :

Grad-CAM is a technique for visualizing where a model is looking and why it made a certain prediction. It was first presented in the paper "Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization".

We will classify images in CIFAR 10 dataset and integrate Grad-CAM visualization. We will also use Cutout Image Augmentation for training the model

Import necessary Modules

from keras import backend as K
import time
import matplotlib.pyplot as plt
import numpy as np
% matplotlib inline
np.random.seed(2017) 
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.layers import Activation, Flatten, Dropout
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils

create train and test data using cifar10 dataset in Keras

from keras.datasets import cifar10
# Load the CIFAR-10 train/test split shipped with Keras.
(train_features, train_labels), (test_features, test_labels) = cifar10.load_data()
# The image arrays are channels-last here (the model below is built with
# input_shape=(32, 32, 3)), so the axes are (samples, rows, cols, channels).
num_train, img_rows, img_cols, img_channels = train_features.shape
num_test = test_features.shape[0]
# Number of distinct label values present in the training labels.
num_classes = len(np.unique(train_labels))

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
170500096/170498071 [==============================] - 11s 0us/step

Plot some of the images in the dataset along with class label

class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']

# Show one randomly chosen training image per class on a 2x5 grid.
fig = plt.figure(figsize=(8, 3))
for class_id in range(num_classes):
    ax = fig.add_subplot(2, 5, class_id + 1, xticks=[], yticks=[])
    # Row indices of every training sample carrying this class label.
    rows = np.nonzero(train_labels == class_id)[0]
    candidates = train_features[rows]
    pick = np.random.randint(len(candidates))
    ax.set_title(class_names[class_id])
    ax.imshow(candidates[pick])
plt.show()

Scale the input features to be within 0 and 1

convert the train and test labels to 10 class category format

# Cast the pixel values to float32 and divide by 255.
train_features, test_features = (
    features.astype('float32') / 255
    for features in (train_features, test_features)
)
# Turn the integer class labels into one-hot rows with num_classes columns.
train_labels, test_labels = (
    np_utils.to_categorical(labels, num_classes)
    for labels in (train_labels, test_labels)
)

Define Model for image classification

from keras.layers import Conv2D,BatchNormalization,MaxPooling2D,Activation,Flatten
# Define the model.
# Trailing "RF n" notes are the author's running receptive-field annotations.
# `padding=` is the Keras 2 keyword; the Keras-1 spelling `border_mode=` is
# only accepted through a deprecation shim in some releases.
model = Sequential()

# Input convolution
model.add(Conv2D(32, 3, padding='same', name='layer1', input_shape=(32, 32, 3)))  # RF 3
model.add(BatchNormalization(name='BN1'))
model.add(Activation('relu', name='rl1'))

# Conv block 1
model.add(Conv2D(64, 3, name='layer2', padding='same'))  # RF 5
model.add(BatchNormalization(name='BN2'))
model.add(Activation('relu', name='rl2'))

# No padding argument here, so the layer's default padding applies.
model.add(Conv2D(128, 3, name='layer3'))  # RF 7
model.add(BatchNormalization(name='BN3'))
model.add(Activation('relu', name='rl3'))

# Dropout after conv block 1
model.add(Dropout(0.1, name='drp1'))

# Transition block 1: 1x1 convolution down to 32 channels, then 2x2 max pooling
model.add(Conv2D(32, 1, name='tb1'))
model.add(BatchNormalization(name='tb-BN1'))
model.add(Activation('relu', name='tb-rl1'))
model.add(MaxPooling2D(pool_size=(2, 2), name='mp1'))  # RF 14

# Conv block 2
model.add(Conv2D(64, 3, name='layer4', padding='same'))  # RF 16
model.add(BatchNormalization(name='BN4'))
model.add(Activation('relu', name='rl4'))
model.add(Conv2D(128, 3, name='layer5', padding='same'))  # RF 18
model.add(BatchNormalization(name='BN5'))
model.add(Activation('relu', name='rl5'))

# Dropout after conv block 2
model.add(Dropout(0.1, name='drp2'))

# Transition block 2: 1x1 convolution down to 32 channels, then 2x2 max pooling
model.add(Conv2D(32, 1, name='tb2'))
model.add(BatchNormalization(name='tb-BN2'))
model.add(Activation('relu', name='tb-rl2'))
model.add(MaxPooling2D(pool_size=(2, 2), name='mp2'))  # RF 36 - we have reached the image size here

# Final conv block
model.add(Conv2D(64, 3, name='layer6', padding='same'))  # RF 38
model.add(BatchNormalization(name='BN6'))
model.add(Activation('relu', name='rl6'))
model.add(Conv2D(128, 3, name='layer7', padding='same'))  # RF 40
model.add(BatchNormalization(name='BN7'))
model.add(Activation('relu', name='rl7'))

# Dropout after final conv block
model.add(Dropout(0.1, name='d3'))

# Pointwise convolution to squash 128 channels to 10 output channels.
# The 'rrl1' activation is the layer the Grad-CAM cells later look up by name.
model.add(Conv2D(10, 1, name='red1'))
model.add(BatchNormalization(name='red-BN1'))
model.add(Activation('relu', name='rrl1'))

# Last conv layer (7x7 kernel) - no ReLU activation, no batch normalization
model.add(Conv2D(10, 7, name='layer8'))  # RF 47

# Flatten the output
model.add(Flatten())

# Softmax activation to output likelihood values for classes
model.add(Activation('softmax'))

# Print model summary
model.summary()

Learning Rate Scheduler : We will add a custom learning rate scheduler that reduces the rate by 10% every 3rd epoch, subject to a minimum of 0.0005. We will also start with a slightly larger lr of 0.003 compared to the default of 0.001 for the Adam optimizer

from keras.optimizers import Adam
from keras.callbacks import LearningRateScheduler
def scheduler(epoch, lr):
  """Return the learning rate to use for `epoch`, given the current rate `lr`.

  On every third epoch (epoch 0 excluded) the rate becomes 0.9 * lr, but
  never less than 0.0005; on all other epochs it is left unchanged. The
  result is rounded to 10 decimal places.
  """
  is_decay_epoch = epoch and epoch % 3 == 0
  if not is_decay_epoch:
    return round(lr, 10)
  return round(max(0.9 * lr, 0.0005), 10)
  
# Wrap `scheduler` in the Keras callback that is passed to training below.
lr_scheduler = LearningRateScheduler(scheduler, verbose=1)

# Start with a higher lr of 0.003 for Adam.
adam_optimizer = Adam(lr=0.003)
model.compile(optimizer=adam_optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Mount google drive so that you can save the model with best validation accuracy and use it later for prediction tasks

from google.colab import drive

def mount_drive():
  """Mount Google Drive at /gdrive, forcing a remount."""
  mount_point = '/gdrive'
  drive.mount(mount_point, force_remount=True)

mount_drive()

Create a ModelCheckpoint callback to check validation accuracy at the end of each epoch and save the model with the best validation accuracy

from keras.callbacks import ModelCheckpoint
  
# Save the full model (not just weights) to Drive, keeping only the best
# checkpoint as judged by the maximum of the 'val_acc' metric.
chkpoint_model = ModelCheckpoint(
    "/gdrive/My Drive/EVA/Session9/model_customv1_cifar10_best.h5",
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
    verbose=1)

Data Augmentation : Define datagenerator with horizontal flip set to True ,zoom range of 0.15 .

Train the model for 100 epochs

from keras.preprocessing.image import ImageDataGenerator


# Augmentation: random zoom of up to 15% and random horizontal flips.
datagen = ImageDataGenerator(zoom_range=0.15,
                             horizontal_flip=True)

batch_size = 128

# Train the model and time the whole run.
# `steps_per_epoch` / `epochs` are the Keras 2 names for the Keras-1
# `samples_per_epoch` / `nb_epoch` arguments. Steps are counted in batches,
# so the sample count is floor-divided by the batch size.
start = time.time()
model_info = model.fit_generator(
    datagen.flow(train_features, train_labels, batch_size=batch_size),
    steps_per_epoch=train_features.shape[0] // batch_size,
    epochs=100,
    validation_data=(test_features, test_labels),
    callbacks=[chkpoint_model, lr_scheduler],
    verbose=1)
end = time.time()
print ("Model took %0.2f seconds to train\n"%(end - start))

Model trained for 100 epochs and reached a max validation accuracy of 88.05. Model with best validation accuracy was saved in google drive

Load the model with best validation accuracy

from keras.models import load_model
# Reload the checkpoint file written by the ModelCheckpoint callback.
best_model_path = '/gdrive/My Drive/EVA/Session9/model_customv1_cifar10_best.h5'
model1 = load_model(best_model_path)

Integrate Grad-CAM to visualize gradient heatmaps

We will integrate Grad-CAM to visualize where the network is looking at or which pixels in the image contribute most to the prediction being made .

Choose 4 images from the test dataset , predict their classes and print GradCam heatmap visualization for these 4 images

import cv2
from mpl_toolkits.axes_grid1 import ImageGrid
from google.colab.patches import cv2_imshow

#select test images and corresponding labels to print heatmap 
# Select test images and corresponding labels to print heatmaps for.
x = np.array([test_features[41], test_features[410], test_features[222], test_features[950]])
y = [test_labels[41], test_labels[410], test_labels[222], test_labels[950]]

# Make predictions for the selected images.
preds = model1.predict(x)

# Grad-CAM is taken at the activation just before the last 7x7 conv layer.
last_conv_layer = model1.get_layer("rrl1")

for j in range(len(x)):
  # Class id predicted for image j.
  class_idx = np.argmax(preds[j])
  class_output = model1.output[:, class_idx]

  # Gradient of the class score w.r.t. the chosen feature map, averaged over
  # the batch and spatial axes to give one weight per channel.
  grads = K.gradients(class_output, last_conv_layer.output)[0]
  pooled_grads = K.mean(grads, axis=(0, 1, 2))
  iterate = K.function([model1.input], [pooled_grads, last_conv_layer.output[0]])

  # Feed ONLY image j. `last_conv_layer.output[0]` is the first sample of
  # whatever batch is fed and the gradient mean runs over the batch axis, so
  # feeding all of `x` would reuse image 0's feature map for every heatmap and
  # mix gradients across images.
  pooled_grads_value, conv_layer_output_value = iterate([x[j:j + 1]])

  # Weight each channel by its pooled gradient (broadcast over the last axis).
  conv_layer_output_value = conv_layer_output_value * pooled_grads_value
  heatmap = np.mean(conv_layer_output_value, axis=-1)
  heatmap = np.maximum(heatmap, 0)
  # Normalise to [0, 1]; skip when the map is all zeros to avoid 0/0 -> NaN.
  peak = np.max(heatmap)
  if peak > 0:
    heatmap /= peak

  img = x[j]
  # Resize the heatmap to the image size.
  heatmap = cv2.resize(heatmap, (img.shape[1], img.shape[0]))
  heatmap = np.uint8(255 * heatmap)
  heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)

  # Convert from BGR to RGB for matplotlib.
  heatmap1 = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
  # Blended image for optional display with cv2 (cv2_imshow is supported in
  # colab); it is not shown by the matplotlib code below.
  superimposed_img = cv2.addWeighted(img, 0.8, heatmap, 0.2, 0, dtype=5)

  # Since cv.imshow does not work in jupyter notebooks and colab,
  # use matplotlib to show the image next to its heatmap overlay.
  fig = plt.figure(1, (5, 5))
  grid = ImageGrid(fig, 111,
                   nrows_ncols=(1, 2),
                   axes_pad=0.3,
                   )
  print(" original class is :"+class_names[np.argmax(y[j])]+" and predicted class is :"+str(class_names[class_idx]))
  grid[0].imshow(img)
  grid[0].set_title('Original')

  # Draw the original image and place the heatmap on top of it with alpha=0.7.
  grid[1].imshow(img, alpha=1)
  grid[1].imshow(heatmap1, alpha=0.7)
  grid[1].set_title('superimposed heatmap')

  plt.show()

 original class is :frog and predicted class is :frog

 original class is :horse and predicted class is :horse

 original class is :truck and predicted class is :truck

 original class is :cat and predicted class is :cat

How about Misclassified Images ?

Let us also choose 4 misclassified images to visualize their Grad-CAM heatmap

# Predict every test image, then split the test set into correctly and
# incorrectly classified samples.
pred = model1.predict(test_features)
pred2 = np.argmax(pred, axis=1)
# Decode the one-hot test labels once instead of once per sample.
actual = np.argmax(test_labels, axis=1)

wrong_set = []      # images the model got wrong
correct_set = []    # images the model got right
wrong_labels = []   # predicted class name for each wrong image
true_labels = []    # true class name for each wrong image
wrong_indices = []  # index into the test set for each wrong image

# Iterate over however many test samples there are rather than assuming 10000.
for i in range(len(test_features)):
  if pred2[i] == actual[i]:
    correct_set.append(test_features[i])
  else:
    wrong_indices.append(i)
    wrong_labels.append(class_names[pred2[i]])
    true_labels.append(class_names[actual[i]])
    wrong_set.append(test_features[i])

A selection of 4 misclassified images

print('            Selection of 4 misclassified images \n           _________________________________\n')
from mpl_toolkits.axes_grid1 import ImageGrid

# Show misclassified entries 5..8 side by side in a 1x4 image grid.
fig = plt.figure(1, (12, 12))
grid = ImageGrid(fig, 111, nrows_ncols=(1, 4), axes_pad=1)

for slot, i in enumerate(range(5, 9)):
    cell = grid[slot]
    cell.imshow(wrong_set[i].reshape(32, 32, 3))
    # Title format: "<test index>: <true class>, predicted: <predicted class>"
    caption = '{2}: {0}, predicted: {1}'.format(true_labels[i], wrong_labels[i], wrong_indices[i])
    cell.set_title(caption)

plt.show()

            Selection of 4 misclassified images 
           _________________________________

Print Grad-CAM heatmap for the misclassified images

# Test-set indices of the four misclassified images chosen above.
w_list = wrong_indices[5:9]
# Gather the images (as one numpy array) and their one-hot labels.
x = np.array([test_features[k] for k in w_list])
y = [test_labels[k] for k in w_list]

# Make predictions for the selected images.
preds = model1.predict(x)

# Grad-CAM is taken at the activation just before the last 7x7 conv layer.
last_conv_layer = model1.get_layer("rrl1")

for j in range(len(x)):
  # Class id predicted for image j.
  class_idx = np.argmax(preds[j])
  class_output = model1.output[:, class_idx]

  # Gradient of the class score w.r.t. the chosen feature map, averaged over
  # the batch and spatial axes to give one weight per channel.
  grads = K.gradients(class_output, last_conv_layer.output)[0]
  pooled_grads = K.mean(grads, axis=(0, 1, 2))
  iterate = K.function([model1.input], [pooled_grads, last_conv_layer.output[0]])

  # Feed ONLY image j. `last_conv_layer.output[0]` is the first sample of
  # whatever batch is fed and the gradient mean runs over the batch axis, so
  # feeding all of `x` would reuse image 0's feature map for every heatmap and
  # mix gradients across images.
  pooled_grads_value, conv_layer_output_value = iterate([x[j:j + 1]])

  # Weight each channel by its pooled gradient (broadcast over the last axis).
  conv_layer_output_value = conv_layer_output_value * pooled_grads_value
  heatmap = np.mean(conv_layer_output_value, axis=-1)
  heatmap = np.maximum(heatmap, 0)
  # Normalise to [0, 1]; skip when the map is all zeros to avoid 0/0 -> NaN.
  peak = np.max(heatmap)
  if peak > 0:
    heatmap /= peak

  img = x[j]
  # Resize the heatmap to the image size.
  heatmap = cv2.resize(heatmap, (img.shape[1], img.shape[0]))
  heatmap = np.uint8(255 * heatmap)
  heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)

  # Convert from BGR to RGB for matplotlib.
  heatmap1 = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
  # Blended image for optional display with cv2 (cv2_imshow is supported in
  # colab); it is not shown by the matplotlib code below.
  superimposed_img = cv2.addWeighted(img, 0.8, heatmap, 0.2, 0, dtype=5)

  # Since cv.imshow does not work in jupyter notebooks and colab,
  # use matplotlib to show the image next to its heatmap overlay.
  fig = plt.figure(1, (5, 5))
  grid = ImageGrid(fig, 111,
                   nrows_ncols=(1, 2),
                   axes_pad=0.3,
                   )
  print(" original class is :"+class_names[np.argmax(y[j])]+" and predicted class is :"+str(class_names[class_idx]))
  grid[0].imshow(img)
  grid[0].set_title('Original')

  # Draw the original image and place the heatmap on top of it with alpha=0.7.
  grid[1].imshow(img, alpha=1)
  grid[1].imshow(heatmap1, alpha=0.7)
  grid[1].set_title('superimposed heatmap')

  plt.show()

 original class is :cat and predicted class is :dog

 original class is :frog and predicted class is :dog

 original class is :frog and predicted class is :bird

 original class is :horse and predicted class is :dog

We trained the model with some basic data augmentation techniques available in Keras and visualized the Grad-CAM heatmaps for a selection of 4 correctly classified images and 4 misclassified images. Let us now use another augmentation technique called cutout to train the model, see if it improves the prediction of these misclassified images, and also visualize where the model looks when making the prediction

Model prediction improvement with Cutout augmentation

Now let us try and improve the model prediction accuracy by using an image augmentation technique called Cutout and see how the model performs for the misclassified images

Cutout Augmentation

Cutout was first presented as an effective augmentation technique in these two papers :

"Improved Regularization of Convolutional Neural Networks with Cutout" and "Random Erasing Data Augmentation"

The idea is to randomly cut away patches of information from images that a model is training on to force it to learn from more parts of the image. This would help the model learn more features about a class instead of depending on some simple assumptions using smaller areas within the image . This helps the model generalize better and make better predictions .

We will use python code for cutout /random erasing found at https://github.com/yu4u/cutout-random-erasing

!wget https://raw.githubusercontent.com/yu4u/cutout-random-erasing/master/random_eraser.py

--2019-06-27 20:06:44--  https://raw.githubusercontent.com/yu4u/cutout-random-erasing/master/random_eraser.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 888 [text/plain]
Saving to: ‘random_eraser.py.2’

random_eraser.py.2  100%[===================>]     888  --.-KB/s    in 0s      

2019-06-27 20:06:45 (170 MB/s) - ‘random_eraser.py.2’ saved [888/888]

Define model (It is the same as above)

from keras.layers import Conv2D,BatchNormalization,MaxPooling2D,Activation,Flatten
# Define the model (same architecture as the first one).
# Trailing "RF n" notes are the author's running receptive-field annotations.
# `padding=` is the Keras 2 keyword; the Keras-1 spelling `border_mode=` is
# only accepted through a deprecation shim in some releases.
model = Sequential()

# Input convolution
model.add(Conv2D(32, 3, padding='same', name='layer1', input_shape=(32, 32, 3)))  # RF 3
model.add(BatchNormalization(name='BN1'))
model.add(Activation('relu', name='rl1'))

# Conv block 1
model.add(Conv2D(64, 3, name='layer2', padding='same'))  # RF 5
model.add(BatchNormalization(name='BN2'))
model.add(Activation('relu', name='rl2'))

# No padding argument here, so the layer's default padding applies.
model.add(Conv2D(128, 3, name='layer3'))  # RF 7
model.add(BatchNormalization(name='BN3'))
model.add(Activation('relu', name='rl3'))

# Dropout after conv block 1
model.add(Dropout(0.1, name='drp1'))

# Transition block 1: 1x1 convolution down to 32 channels, then 2x2 max pooling
model.add(Conv2D(32, 1, name='tb1'))
model.add(BatchNormalization(name='tb-BN1'))
model.add(Activation('relu', name='tb-rl1'))
model.add(MaxPooling2D(pool_size=(2, 2), name='mp1'))  # RF 14

# Conv block 2
model.add(Conv2D(64, 3, name='layer4', padding='same'))  # RF 16
model.add(BatchNormalization(name='BN4'))
model.add(Activation('relu', name='rl4'))
model.add(Conv2D(128, 3, name='layer5', padding='same'))  # RF 18
model.add(BatchNormalization(name='BN5'))
model.add(Activation('relu', name='rl5'))

# Dropout after conv block 2
model.add(Dropout(0.1, name='drp2'))

# Transition block 2: 1x1 convolution down to 32 channels, then 2x2 max pooling
model.add(Conv2D(32, 1, name='tb2'))
model.add(BatchNormalization(name='tb-BN2'))
model.add(Activation('relu', name='tb-rl2'))
model.add(MaxPooling2D(pool_size=(2, 2), name='mp2'))  # RF 36 - we have reached the image size here

# Final conv block
model.add(Conv2D(64, 3, name='layer6', padding='same'))  # RF 38
model.add(BatchNormalization(name='BN6'))
model.add(Activation('relu', name='rl6'))
model.add(Conv2D(128, 3, name='layer7', padding='same'))  # RF 40
model.add(BatchNormalization(name='BN7'))
model.add(Activation('relu', name='rl7'))

# Dropout after final conv block
model.add(Dropout(0.1, name='d3'))

# Pointwise convolution to squash 128 channels to 10 output channels.
# The 'rrl1' activation is the layer the Grad-CAM cells later look up by name.
model.add(Conv2D(10, 1, name='red1'))
model.add(BatchNormalization(name='red-BN1'))
model.add(Activation('relu', name='rrl1'))

# Last conv layer (7x7 kernel) - no ReLU activation, no batch normalization
model.add(Conv2D(10, 7, name='layer8'))  # RF 47

# Flatten the output
model.add(Flatten())

# Softmax activation to output likelihood values for classes
model.add(Activation('softmax'))

# Print model summary
model.summary()

Compile the model with Adam optimizer and initial learning rate of 0.003

# Adam with an initial learning rate of 0.003, categorical cross-entropy loss,
# and accuracy reported as the metric.
adam_optimizer = Adam(lr=0.003)
model.compile(optimizer=adam_optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Define a new ModelCheckpoint to save this model trained with cutout augmentation in a separate path on drive

  
# Save the full model (not just weights) to a separate Drive path, keeping
# only the best checkpoint as judged by the maximum of the 'val_acc' metric.
chkpoint_model = ModelCheckpoint(
    "/gdrive/My Drive/EVA/Session9/model3_with_cutout_cifar10_best.h5",
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
    verbose=1)

Data Augmentation :

Define datagenerator with horizontal flip set to True ,zoom range of 0.15 .
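Add random erasing or cutout as a preprocessing step. Use the default parameters from the random eraser code, except for the erase-value range, which is set to v_l=0, v_h=1 to match the pixel values scaled to between 0 and 1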

Train the model for 100 epochs

from random_eraser import get_random_eraser
from keras.preprocessing.image import ImageDataGenerator


# Augmentation: random erasing (cutout) with fill values drawn between
# v_l=0 and v_h=1, plus random zoom of up to 15% and random horizontal flips.
datagen = ImageDataGenerator(preprocessing_function=get_random_eraser(v_l=0, v_h=1),
                             zoom_range=0.15,
                             horizontal_flip=True)

batch_size = 128

# Train the model and time the whole run.
# `steps_per_epoch` / `epochs` are the Keras 2 names for the Keras-1
# `samples_per_epoch` / `nb_epoch` arguments. Steps are counted in batches,
# so the sample count is floor-divided by the batch size.
start = time.time()
model_info = model.fit_generator(
    datagen.flow(train_features, train_labels, batch_size=batch_size),
    steps_per_epoch=train_features.shape[0] // batch_size,
    epochs=100,
    validation_data=(test_features, test_labels),
    callbacks=[chkpoint_model, lr_scheduler],
    verbose=1)
end = time.time()
print ("Model took %0.2f seconds to train\n"%(end - start))

Validation accuracy after 100 epochs is 88.28

Load the new model trained with cutout augmentation

# Reload the checkpoint file written while training with cutout augmentation.
cutout_model_path = '/gdrive/My Drive/EVA/Session9/model3_with_cutout_cifar10_best.h5'
model1 = load_model(cutout_model_path)

visualize the same 4 images using Grad-CAM heatmap

# Select the same four test images and labels as before.
x = np.array([test_features[41], test_features[410], test_features[222], test_features[950]])
y = [test_labels[41], test_labels[410], test_labels[222], test_labels[950]]

# Make predictions for the selected images.
preds = model1.predict(x)

# Grad-CAM is taken at the activation just before the last 7x7 conv layer.
last_conv_layer = model1.get_layer("rrl1")

for j in range(len(x)):
  # Class id predicted for image j.
  class_idx = np.argmax(preds[j])
  class_output = model1.output[:, class_idx]

  # Gradient of the class score w.r.t. the chosen feature map, averaged over
  # the batch and spatial axes to give one weight per channel.
  grads = K.gradients(class_output, last_conv_layer.output)[0]
  pooled_grads = K.mean(grads, axis=(0, 1, 2))
  iterate = K.function([model1.input], [pooled_grads, last_conv_layer.output[0]])

  # Feed ONLY image j. `last_conv_layer.output[0]` is the first sample of
  # whatever batch is fed and the gradient mean runs over the batch axis, so
  # feeding all of `x` would reuse image 0's feature map for every heatmap and
  # mix gradients across images.
  pooled_grads_value, conv_layer_output_value = iterate([x[j:j + 1]])

  # Weight each channel by its pooled gradient (broadcast over the last axis).
  conv_layer_output_value = conv_layer_output_value * pooled_grads_value
  heatmap = np.mean(conv_layer_output_value, axis=-1)
  heatmap = np.maximum(heatmap, 0)
  # Normalise to [0, 1]; skip when the map is all zeros to avoid 0/0 -> NaN.
  peak = np.max(heatmap)
  if peak > 0:
    heatmap /= peak

  img = x[j]
  # Resize the heatmap to the image size.
  heatmap = cv2.resize(heatmap, (img.shape[1], img.shape[0]))
  heatmap = np.uint8(255 * heatmap)
  heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)

  # Convert from BGR to RGB for matplotlib.
  heatmap1 = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
  # Blended image for optional display with cv2 (cv2_imshow is supported in
  # colab); it is not shown by the matplotlib code below.
  superimposed_img = cv2.addWeighted(img, 0.8, heatmap, 0.2, 0, dtype=5)

  # Since cv.imshow does not work in jupyter notebooks and colab,
  # use matplotlib to show the image next to its heatmap overlay.
  fig = plt.figure(1, (5, 5))
  grid = ImageGrid(fig, 111,
                   nrows_ncols=(1, 2),
                   axes_pad=0.3,
                   )
  print(" original class is :"+class_names[np.argmax(y[j])]+" and predicted class is :"+str(class_names[class_idx]))
  grid[0].imshow(img)
  grid[0].set_title('Original')

  # Draw the original image and place the heatmap on top of it with alpha=0.7.
  grid[1].imshow(img, alpha=1)
  grid[1].imshow(heatmap1, alpha=0.7)
  grid[1].set_title('superimposed heatmap')

  plt.show()

 original class is :frog and predicted class is :frog

 original class is :horse and predicted class is :horse

 original class is :truck and predicted class is :truck

 original class is :cat and predicted class is :cat

Let us see what happened to the 4 misclassified images after cutout augmentation - whether the prediction changed and whether the heatmap pattern changed too

# Test-set indices of the four images the first model misclassified.
w_list = wrong_indices[5:9]
# Gather the images (as one numpy array) and their one-hot labels.
x = np.array([test_features[k] for k in w_list])
y = [test_labels[k] for k in w_list]

# Make predictions for the selected images.
preds = model1.predict(x)

# Grad-CAM is taken at the activation just before the last 7x7 conv layer.
last_conv_layer = model1.get_layer("rrl1")

for j in range(len(x)):
  # Class id predicted for image j.
  class_idx = np.argmax(preds[j])
  class_output = model1.output[:, class_idx]

  # Gradient of the class score w.r.t. the chosen feature map, averaged over
  # the batch and spatial axes to give one weight per channel.
  grads = K.gradients(class_output, last_conv_layer.output)[0]
  pooled_grads = K.mean(grads, axis=(0, 1, 2))
  iterate = K.function([model1.input], [pooled_grads, last_conv_layer.output[0]])

  # Feed ONLY image j. `last_conv_layer.output[0]` is the first sample of
  # whatever batch is fed and the gradient mean runs over the batch axis, so
  # feeding all of `x` would reuse image 0's feature map for every heatmap and
  # mix gradients across images.
  pooled_grads_value, conv_layer_output_value = iterate([x[j:j + 1]])

  # Weight each channel by its pooled gradient (broadcast over the last axis).
  conv_layer_output_value = conv_layer_output_value * pooled_grads_value
  heatmap = np.mean(conv_layer_output_value, axis=-1)
  heatmap = np.maximum(heatmap, 0)
  # Normalise to [0, 1]; skip when the map is all zeros to avoid 0/0 -> NaN.
  peak = np.max(heatmap)
  if peak > 0:
    heatmap /= peak

  img = x[j]
  # Resize the heatmap to the image size.
  heatmap = cv2.resize(heatmap, (img.shape[1], img.shape[0]))
  heatmap = np.uint8(255 * heatmap)
  heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)

  # Convert from BGR to RGB for matplotlib.
  heatmap1 = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
  # Blended image for optional display with cv2 (cv2_imshow is supported in
  # colab); it is not shown by the matplotlib code below.
  superimposed_img = cv2.addWeighted(img, 0.8, heatmap, 0.2, 0, dtype=5)

  # Since cv.imshow does not work in jupyter notebooks and colab,
  # use matplotlib to show the image next to its heatmap overlay.
  fig = plt.figure(1, (5, 5))
  grid = ImageGrid(fig, 111,
                   nrows_ncols=(1, 2),
                   axes_pad=0.3,
                   )
  print(" original class is :"+class_names[np.argmax(y[j])]+" and predicted class is :"+str(class_names[class_idx]))
  grid[0].imshow(img)
  grid[0].set_title('Original')

  # Draw the original image and place the heatmap on top of it with alpha=0.7.
  grid[1].imshow(img, alpha=1)
  grid[1].imshow(heatmap1, alpha=0.7)
  grid[1].set_title('superimposed heatmap')

  plt.show()

 original class is :cat and predicted class is :cat

 original class is :frog and predicted class is :frog

 original class is :frog and predicted class is :frog

 original class is :horse and predicted class is :horse

We can see that cutout augmentation forced the model to look at different parts of the image than it was looking at earlier, and it helped in getting the classification of this set of 4 previously misclassified images right

It is also to be noted that the validation accuracy is still only at 88.28 even with cutout and we should train the network for more epochs and with different combinations of augmentations to get better results