Visualizing CNN decisions

Introduction

This post is about Grad-CAM.
github.com
I made a few small changes to the code from the site above.
Since it uses gluoncv, the code is a little shorter.
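
For reference, Grad-CAM (https://arxiv.org/abs/1610.02391) weights each feature map of a convolutional layer by the spatial mean of the class score's gradients, then keeps only the positive part of the weighted sum. A minimal sketch of the formulas from the paper:

\alpha_k^c = \frac{1}{Z} \sum_i \sum_j \frac{\partial y^c}{\partial A^k_{ij}}, \qquad
L^c_{\mathrm{Grad\text{-}CAM}} = \mathrm{ReLU}\Big(\sum_k \alpha_k^c A^k\Big)

Here y^c is the score for class c, A^k is the k-th feature map of the chosen layer, and Z is the number of spatial positions. Guided Grad-CAM, which the code below also produces, is the elementwise product of this map (resized to the input) with the guided-backpropagation gradients of the image.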

Environment

Windows10 Pro
Python 3.7.5
GPUなし
certifi==2019.9.11
chardet==3.0.4
cycler==0.10.0
gluoncv==0.5.0
graphviz==0.8.4
idna==2.6
kiwisolver==1.1.0
matplotlib==3.1.1
mxnet==1.6.0b20191004
numpy==1.16.5
opencv-python==4.1.1.26
Pillow==6.2.0
pyparsing==2.4.2
python-dateutil==2.8.0
requests==2.18.4
scipy==1.3.1
six==1.12.0
tqdm==4.36.1
urllib3==1.22

Code

  • gradcam.py
import mxnet as mx
import mxnet.ndarray as nd

from mxnet import gluon
from mxnet import autograd
from mxnet.gluon import nn

import numpy as np
import cv2

class ReluOp(mx.operator.CustomOp):
    """Modified ReLU as described in section 3.4 in https://arxiv.org/abs/1412.6806.
    This is used for guided backpropagation to get gradients of the image w.r.t activations.
    This Operator will do a regular backpropagation if `guided_backprop` is set to False
    and a guided backpropagation if `guided_backprop` is set to True. Check gradcam_demo.py
    for an example usage."""

    guided_backprop = False

    def forward(self, is_train, req, in_data, out_data, aux):
        x = in_data[0]
        y = nd.maximum(x, nd.zeros_like(x))
        self.assign(out_data[0], req[0], y)

    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        if ReluOp.guided_backprop:
            # Get output and gradients of output
            y = out_data[0]
            dy = out_grad[0]
            # Zero out the negatives in the gradients of the output
            dy_positives = nd.maximum(dy, nd.zeros_like(dy))
            # Which output values were greater than 0?
            y_ones = y > 0
            # Keep the gradient only where both dy and y are positive
            dx = dy_positives * y_ones
            self.assign(in_grad[0], req[0], dx)
        else:
            # Regular backward for ReLU
            x = in_data[0]
            x_gt_zero = x > 0
            dx = out_grad[0] * x_gt_zero
            self.assign(in_grad[0], req[0], dx)

def set_guided_backprop(mode=True):
    ReluOp.guided_backprop = mode

@mx.operator.register("relu")
class ReluProp(mx.operator.CustomOpProp):
    def __init__(self):
        super(ReluProp, self).__init__(True)

    def infer_shape(self, in_shapes):
        data_shape = in_shapes[0]
        output_shape = data_shape
        return (data_shape,), (output_shape,), ()

    def create_operator(self, ctx, in_shapes, in_dtypes):
        return ReluOp()  

class Activation(mx.gluon.HybridBlock):
    @staticmethod
    def set_guided_backprop(mode=False):
        ReluOp.guided_backprop = mode

    def __init__(self, act_type, **kwargs):
        assert act_type == 'relu'
        super(Activation, self).__init__(**kwargs)

    def hybrid_forward(self, F, x):
        return F.Custom(x, op_type='relu')

class Conv2D(mx.gluon.HybridBlock):
    """Wrapper on top of gluon.nn.Conv2D to capture the output and gradients of output of a Conv2D
    layer in a network. Use `set_capture_layer_name` to select the layer
    whose outputs and gradients of outputs need to be captured. After the backward pass,
    `conv_output` will contain the output and `conv_output.grad` will contain the
    output's gradients. Check gradcam_demo.py for example usage."""

    conv_output = None
    capture_layer_name = None

    def __init__(self, channels, kernel_size, strides=(1, 1), padding=(0, 0),
                 dilation=(1, 1), groups=1, layout='NCHW',
                 activation=None, use_bias=True, weight_initializer=None,
                 bias_initializer='zeros', in_channels=0, **kwargs):
        super(Conv2D, self).__init__(**kwargs)
        self.conv = nn.Conv2D(channels, kernel_size, strides=strides, padding=padding,
                             dilation=dilation, groups=groups, layout=layout,
                             activation=activation, use_bias=use_bias, weight_initializer=weight_initializer,
                             bias_initializer=bias_initializer, in_channels=in_channels)

    def hybrid_forward(self, F, x):
        out = self.conv(x)
        name = self._prefix[:-1]
        if name == Conv2D.capture_layer_name:
            out.attach_grad()
            Conv2D.conv_output = out
        return out

def set_capture_layer_name(name):
    Conv2D.capture_layer_name = name
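
# A minimal usage sketch of the capture pattern (the layer name is an example;
# actual names depend on the network's prefix, see gradcam_demo.py):
#
#   set_capture_layer_name('vgg0_conv2d12')
#   with autograd.record(train_mode=False):
#       out = net(image)
#   out.backward(one_hot_target, train_mode=False)
#   conv_out = Conv2D.conv_output        # captured layer output
#   conv_grad = Conv2D.conv_output.grad  # gradients w.r.t. that output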

def _get_grad(net, image, class_id=None, conv_layer_name=None, image_grad=False):
    """This is an internal helper function that can be used for either of these
    but not both at the same time:
    1. Record the output and gradient of output of an intermediate convolutional layer.
    2. Record the gradients of the image.

    Parameters
    ----------
    net : Block
        Network to use for visualization.
    image : NDArray
        Image to visualize. This is an NDArray with the preprocessed image.
    class_id : int
        Category ID this image belongs to. If not provided,
        the network's prediction will be used.
    conv_layer_name : str
        Name of the convolutional layer whose output and output's gradients need to be captured.
    image_grad : bool
        Whether to capture gradients of the image."""

    if image_grad:
        image.attach_grad()
        Conv2D.capture_layer_name = None
        Activation.set_guided_backprop(True)
    else:
        # Tell Conv2D which layer's output and gradient need to be recorded
        Conv2D.capture_layer_name = conv_layer_name
        Activation.set_guided_backprop(False)
    
    # Run the network
    with autograd.record(train_mode=False):
        out = net(image)
    
    # If user didn't provide a class id, we'll use the class that the network predicted
    if class_id is None:
        model_output = out.asnumpy()
        class_id = np.argmax(model_output)

    # Create a one-hot target with class_id and backprop with the created target
    one_hot_target = mx.nd.one_hot(mx.nd.array([class_id]), 1000)
    out.backward(one_hot_target, train_mode=False)

    if image_grad:
        return image.grad[0].asnumpy()
    else:
        # Return the recorded convolution output and gradient
        conv_out = Conv2D.conv_output
        return conv_out[0].asnumpy(), conv_out.grad[0].asnumpy()

def get_conv_out_grad(net, image, class_id=None, conv_layer_name=None):
    """Get the output and gradients of output of a convolutional layer.

    Parameters:
    ----------
    net: Block
        Network to use for visualization.
    image: NDArray
        Preprocessed image to use for visualization.
    class_id: int
        Category ID this image belongs to. If not provided,
        network's prediction will be used.
    conv_layer_name: str
        Name of the convolutional layer whose output and output's gradients need to be captured."""
    return _get_grad(net, image, class_id, conv_layer_name, image_grad=False)

def get_image_grad(net, image, class_id=None):
    """Get the gradients of the image.

    Parameters:
    ----------
    net: Block
        Network to use for visualization.
    image: NDArray
        Preprocessed image to use for visualization.
    class_id: int
        Category ID this image belongs to. If not provided,
        network's prediction will be used."""
    return _get_grad(net, image, class_id, image_grad=True)

def grad_to_image(gradient):
    """Convert gradients of image obtained using `get_image_grad`
    into image. This shows parts of the image that is most strongly activating
    the output neurons."""
    gradient = gradient - gradient.min()
    gradient /= gradient.max()
    gradient = np.uint8(gradient * 255).transpose(1, 2, 0)
    gradient = gradient[..., ::-1]
    return gradient

def get_cam(imggrad, conv_out):
    """Compute Grad-CAM. Refer to section 3 of https://arxiv.org/abs/1610.02391 for details.
    Note: `visualize` below passes the conv layer's output gradients as `imggrad`;
    their spatial mean gives the per-channel weights."""
    weights = np.mean(imggrad, axis=(1, 2))
    cam = np.ones(conv_out.shape[1:], dtype=np.float32)
    for i, w in enumerate(weights):
        cam += w * conv_out[i, :, :]
    cam = cv2.resize(cam, (imggrad.shape[1], imggrad.shape[2]))
    cam = np.maximum(cam, 0)
    cam = (cam - np.min(cam)) / (np.max(cam) - np.min(cam)) 
    cam = np.uint8(cam * 255)
    return cam

def get_guided_grad_cam(cam, imggrad):
    """Compute Guided Grad-CAM. Refer section 3 of https://arxiv.org/abs/1610.02391 for details"""
    return np.multiply(cam, imggrad)

def get_img_heatmap(orig_img, activation_map):
    """Draw a heatmap on top of the original image using intensities from activation_map"""
    heatmap = cv2.applyColorMap(activation_map, cv2.COLORMAP_COOL)
    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
    img_heatmap = np.float32(heatmap) + np.float32(orig_img)
    img_heatmap = img_heatmap / np.max(img_heatmap)
    img_heatmap *= 255
    return img_heatmap.astype(int)

def to_grayscale(cv2im):
    """Convert gradients to grayscale. This gives a saliency map."""
    # How strongly does each position activate the output
    grayscale_im = np.sum(np.abs(cv2im), axis=0)

    # Normalize between min and 99th percentile
    im_max = np.percentile(grayscale_im, 99)
    im_min = np.min(grayscale_im)
    grayscale_im = np.clip((grayscale_im - im_min) / (im_max - im_min), 0, 1)

    grayscale_im = np.expand_dims(grayscale_im, axis=0)
    return grayscale_im

def visualize(net, preprocessed_img, orig_img, conv_layer_name):
    """Return the Grad-CAM heatmap, guided Grad-CAM, and guided Grad-CAM saliency map."""
    imggrad = get_image_grad(net, preprocessed_img)
    conv_out, conv_out_grad = get_conv_out_grad(net, preprocessed_img, conv_layer_name=conv_layer_name)

    cam = get_cam(conv_out_grad, conv_out)
    cam = cv2.resize(cam, (imggrad.shape[1], imggrad.shape[2]))
    ggcam = get_guided_grad_cam(cam, imggrad)
    img_ggcam = grad_to_image(ggcam)
    
    img_heatmap = get_img_heatmap(orig_img, cam)
    
    ggcam_gray = to_grayscale(ggcam)
    img_ggcam_gray = np.squeeze(grad_to_image(ggcam_gray))
    
    return img_heatmap, img_ggcam, img_ggcam_gray

  • gradcam_demo.py
import mxnet as mx
from mxnet import gluon, image
from mxnet.gluon.data.vision import transforms

import argparse
import os
import numpy as np
import cv2

import vgg
import gradcam

transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Receive image path from command line
parser = argparse.ArgumentParser(description='Grad-CAM demo')
parser.add_argument('img_path', metavar='image_path', type=str, help='path to the image file')

args = parser.parse_args()

# We'll use VGG-16 for visualization
network, synset = vgg.vgg16(pretrained=True, ctx=mx.cpu())

def read_image_cv(path):
    return cv2.resize(cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB), (224, 224))

def get_class_name(cls_id):
    return "%s (%d)" % (synset[cls_id], cls_id)

def run_inference(net, data):
    """Run the input image through the network and return the predicted category as integer"""
    out = net(data)
    return out.argmax(axis=1).asnumpy()[0].astype(int)

def visualize(net, img_path, conv_layer_name):
    """Create Grad-CAM visualizations using the network 'net' and the image at 'img_path'
    conv_layer_name is the name of the top most layer of the feature extractor"""
    _image = image.imread(img_path)
    _image = transform(_image)
    _image = _image.expand_dims(axis=0)
    
    pred_str = get_class_name(run_inference(net, _image))
    
    orig_img = read_image_cv(img_path)
    vizs = gradcam.visualize(net, _image, orig_img, conv_layer_name)
    return (pred_str, (orig_img, *vizs))

# Create Grad-CAM visualization for the user provided image
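# VGG-16 has 13 convolutional layers; under the default 'vgg0_' prefix gluon
# names them vgg0_conv2d0 .. vgg0_conv2d12, so this is the last conv layer.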
last_conv_layer_name = 'vgg0_conv2d12'
cat, vizs = visualize(network, args.img_path, last_conv_layer_name)

print("{0:20}: {1:80}".format("Predicted category", cat))

# Write the visualizations to files
img_name = os.path.split(args.img_path)[1].split('.')[0]
suffixes = ['orig', 'gradcam', 'guided_gradcam', 'saliency']
image_desc = ['Original Image', 'Grad-CAM', 'Guided Grad-CAM', 'Saliency Map']

for i, img in enumerate(vizs):
    img = img.astype(np.float32)
    if len(img.shape) == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    out_file_name = "%s_%s.jpg" % (img_name, suffixes[i])
    cv2.imwrite(out_file_name, img)
    print("{0:20}: {1:80}".format(image_desc[i], out_file_name))

  • vgg.py
import mxnet as mx
from mxnet import gluon

import os

from mxnet.initializer import Xavier
from mxnet.gluon.nn import MaxPool2D, Flatten, Dense, Dropout, BatchNorm
from gradcam import Activation, Conv2D

class VGG(mx.gluon.HybridBlock):
    def __init__(self, layers, filters, classes=1000, batch_norm=False, **kwargs):
        super(VGG, self).__init__(**kwargs)
        assert len(layers) == len(filters)
        with self.name_scope():
            self.features = self._make_features(layers, filters, batch_norm)
            self.features.add(Dense(4096, activation='relu',
                                       weight_initializer='normal',
                                       bias_initializer='zeros'))
            self.features.add(Dropout(rate=0.5))
            self.features.add(Dense(4096, activation='relu',
                                       weight_initializer='normal',
                                       bias_initializer='zeros'))
            self.features.add(Dropout(rate=0.5))
            self.output = Dense(classes,
                                   weight_initializer='normal',
                                   bias_initializer='zeros')

    def _make_features(self, layers, filters, batch_norm):
        featurizer = mx.gluon.nn.HybridSequential(prefix='')
        for i, num in enumerate(layers):
            for _ in range(num):
                featurizer.add(Conv2D(filters[i], kernel_size=3, padding=1,
                                         weight_initializer=Xavier(rnd_type='gaussian',
                                                                   factor_type='out',
                                                                   magnitude=2),
                                         bias_initializer='zeros'))
                if batch_norm:
                    featurizer.add(BatchNorm())
                featurizer.add(Activation('relu'))
            featurizer.add(MaxPool2D(strides=2))
        return featurizer

    def hybrid_forward(self, F, x):
        x = self.features(x)
        x = self.output(x)
        return x

vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
            13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
            16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
            19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])}

def get_vgg(num_layers, pretrained=False, ctx=mx.cpu(),
            root=os.path.join('~', '.mxnet', 'models'), **kwargs):
    layers, filters = vgg_spec[num_layers]
    net = VGG(layers, filters, **kwargs)
    net.initialize(ctx=ctx)
    
    # Get the pretrained model from gluoncv's model zoo
    # (previously: mx.gluon.model_zoo.vision.get_vgg(num_layers, pretrained=True, ctx=ctx))
    from gluoncv.model_zoo import get_model
    vgg = get_model('vgg%d' % num_layers, pretrained=True, ctx=ctx)

    # Set the parameters in the new network
    params = vgg.collect_params()
    for key in params:
        param = params[key]
        net.collect_params()[net.prefix+key.replace(vgg.prefix, '')].set_data(param.data())
   
    return net, vgg.classes

def vgg16(**kwargs):
    return get_vgg(16, **kwargs)
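
Since gradcam.Conv2D matches the capture layer by its block prefix, the valid values for conv_layer_name can be listed from the wrapped network. A minimal sketch (assumed usage, not part of the original code):

import mxnet as mx
import vgg

net, classes = vgg.vgg16(pretrained=True, ctx=mx.cpu())
for blk in net.features:
    print(blk.name)  # e.g. vgg0_conv2d0 ... vgg0_conv2d12, plus pooling/dense blocks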