Introduction
This post is about Grad-CAM (Gradient-weighted Class Activation Mapping, https://arxiv.org/abs/1610.02391), a technique for visualizing which regions of an input image a CNN relies on for a given prediction.
github.com
I made a few small changes to the code from the site above.
Because it uses gluoncv, the code is a bit shorter.
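Part of what makes the gluoncv version shorter is that gluoncv's get_model downloads the pretrained weights and also exposes the 1000 ImageNet class names through the model's classes attribute (used as the synset in vgg.py below), so no separate synset file has to be fetched. A minimal sketch, not part of the scripts below:

import mxnet as mx
from gluoncv.model_zoo import get_model

# Downloads the pretrained VGG-16 weights on first use and loads them on CPU.
net = get_model('vgg16', pretrained=True, ctx=mx.cpu())
print(net.classes[:3])  # first few of the 1000 ImageNet class names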
Environment
Windows 10 Pro / Python 3.7.5 / no GPU
certifi==2019.9.11
chardet==3.0.4
cycler==0.10.0
gluoncv==0.5.0
graphviz==0.8.4
idna==2.6
kiwisolver==1.1.0
matplotlib==3.1.1
mxnet==1.6.0b20191004
numpy==1.16.5
opencv-python==4.1.1.26
Pillow==6.2.0
pyparsing==2.4.2
python-dateutil==2.8.0
requests==2.18.4
scipy==1.3.1
six==1.12.0
tqdm==4.36.1
urllib3==1.22
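To recreate the environment, something like the following should work. This is a hedged sketch: mxnet==1.6.0b20191004 is a nightly pre-release build that may no longer be installable from PyPI, and a stable mxnet 1.6 release should behave the same for this code; the remaining packages above are pulled in as dependencies.

pip install mxnet==1.6.0b20191004 gluoncv==0.5.0 opencv-python==4.1.1.26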
Code
- gradcam.py
import mxnet as mx
import mxnet.ndarray as nd
from mxnet import gluon
from mxnet import autograd
from mxnet.gluon import nn
import numpy as np
import cv2

class ReluOp(mx.operator.CustomOp):
    """Modified ReLU as described in section 3.4 in https://arxiv.org/abs/1412.6806.
    This is used for guided backpropagation to get gradients of the image w.r.t activations.
    This Operator will do a regular backpropagation if `guided_backprop` is set to False
    and a guided backpropagation if `guided_backprop` is set to True.
    Check gradcam_demo.py for an example usage."""

    guided_backprop = False

    def forward(self, is_train, req, in_data, out_data, aux):
        x = in_data[0]
        y = nd.maximum(x, nd.zeros_like(x))
        self.assign(out_data[0], req[0], y)

    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        if ReluOp.guided_backprop:
            # Get output and gradients of output
            y = out_data[0]
            dy = out_grad[0]
            # Zero out the negatives in the gradients of the output
            dy_positives = nd.maximum(dy, nd.zeros_like(dy))
            # What output values were greater than 0?
            y_ones = y.__gt__(0)
            # Mask out the values for which at least one of dy or y is negative
            dx = dy_positives * y_ones
            self.assign(in_grad[0], req[0], dx)
        else:
            # Regular backward for ReLU
            x = in_data[0]
            x_gt_zero = x.__gt__(0)
            dx = out_grad[0] * x_gt_zero
            self.assign(in_grad[0], req[0], dx)

def set_guided_backprop(mode=True):
    ReluOp.guided_backprop = mode

@mx.operator.register("relu")
class ReluProp(mx.operator.CustomOpProp):
    def __init__(self):
        super(ReluProp, self).__init__(True)

    def infer_shape(self, in_shapes):
        data_shape = in_shapes[0]
        output_shape = data_shape
        return (data_shape,), (output_shape,), ()

    def create_operator(self, ctx, in_shapes, in_dtypes):
        return ReluOp()

class Activation(mx.gluon.HybridBlock):
    @staticmethod
    def set_guided_backprop(mode=False):
        ReluOp.guided_backprop = mode

    def __init__(self, act_type, **kwargs):
        assert act_type == 'relu'
        super(Activation, self).__init__(**kwargs)

    def hybrid_forward(self, F, x):
        return F.Custom(x, op_type='relu')

class Conv2D(mx.gluon.HybridBlock):
    """Wrapper on top of gluon.nn.Conv2D to capture the output and gradients of output
    of a Conv2D layer in a network. Use `set_capture_layer_name` to select the layer
    whose outputs and gradients of outputs need to be captured. After the backward pass,
    `conv_output` will contain the output and `conv_output.grad` will contain the
    output's gradients. Check gradcam_demo.py for example usage."""

    conv_output = None
    capture_layer_name = None

    def __init__(self, channels, kernel_size, strides=(1, 1), padding=(0, 0),
                 dilation=(1, 1), groups=1, layout='NCHW',
                 activation=None, use_bias=True, weight_initializer=None,
                 bias_initializer='zeros', in_channels=0, **kwargs):
        super(Conv2D, self).__init__(**kwargs)
        self.conv = nn.Conv2D(channels, kernel_size, strides=strides, padding=padding,
                              dilation=dilation, groups=groups, layout=layout,
                              activation=activation, use_bias=use_bias,
                              weight_initializer=weight_initializer,
                              bias_initializer=bias_initializer, in_channels=in_channels)

    def hybrid_forward(self, F, x):
        out = self.conv(x)
        name = self._prefix[:-1]
        if name == Conv2D.capture_layer_name:
            out.attach_grad()
            Conv2D.conv_output = out
        return out

def set_capture_layer_name(name):
    Conv2D.capture_layer_name = name

def _get_grad(net, image, class_id=None, conv_layer_name=None, image_grad=False):
    """This is an internal helper function that can be used for either of these
    but not both at the same time:
    1. Record the output and gradient of output of an intermediate convolutional layer.
    2. Record the gradients of the image.

    Parameters
    ----------
    image : NDArray
        Image to visualize. This is an NDArray with the preprocessed image.
    class_id : int
        Category ID this image belongs to. If not provided,
        network's prediction will be used.
    conv_layer_name : str
        Name of the convolutional layer whose output and
        output's gradients need to be captured.
    image_grad : bool
        Whether to capture gradients of the image."""
    if image_grad:
        image.attach_grad()
        Conv2D.capture_layer_name = None
        Activation.set_guided_backprop(True)
    else:
        # Tell gradcam.Conv2D which layer's output and gradient needs to be recorded
        Conv2D.capture_layer_name = conv_layer_name
        Activation.set_guided_backprop(False)

    # Run the network
    with autograd.record(train_mode=False):
        out = net(image)

    # If user didn't provide a class id, we'll use the class that the network predicted
    if class_id is None:
        model_output = out.asnumpy()
        class_id = np.argmax(model_output)

    # Create a one-hot target with class_id and backprop with the created target
    one_hot_target = mx.nd.one_hot(mx.nd.array([class_id]), 1000)
    out.backward(one_hot_target, train_mode=False)

    if image_grad:
        return image.grad[0].asnumpy()
    else:
        # Return the recorded convolution output and gradient
        conv_out = Conv2D.conv_output
        return conv_out[0].asnumpy(), conv_out.grad[0].asnumpy()

def get_conv_out_grad(net, image, class_id=None, conv_layer_name=None):
    """Get the output and gradients of output of a convolutional layer.

    Parameters
    ----------
    net : Block
        Network to use for visualization.
    image : NDArray
        Preprocessed image to use for visualization.
    class_id : int
        Category ID this image belongs to. If not provided,
        network's prediction will be used.
    conv_layer_name : str
        Name of the convolutional layer whose output and
        output's gradients need to be captured."""
    return _get_grad(net, image, class_id, conv_layer_name, image_grad=False)

def get_image_grad(net, image, class_id=None):
    """Get the gradients of the image.

    Parameters
    ----------
    net : Block
        Network to use for visualization.
    image : NDArray
        Preprocessed image to use for visualization.
    class_id : int
        Category ID this image belongs to. If not provided,
        network's prediction will be used."""
    return _get_grad(net, image, class_id, image_grad=True)

def grad_to_image(gradient):
    """Convert gradients of image obtained using `get_image_grad` into an image.
    This shows the parts of the image that most strongly activate the output neurons."""
    gradient = gradient - gradient.min()
    gradient /= gradient.max()
    gradient = np.uint8(gradient * 255).transpose(1, 2, 0)
    gradient = gradient[..., ::-1]
    return gradient

def get_cam(imggrad, conv_out):
    """Compute CAM. Refer to section 3 of https://arxiv.org/abs/1610.02391 for details."""
    weights = np.mean(imggrad, axis=(1, 2))
    cam = np.ones(conv_out.shape[1:], dtype=np.float32)
    for i, w in enumerate(weights):
        cam += w * conv_out[i, :, :]
    cam = cv2.resize(cam, (imggrad.shape[1], imggrad.shape[2]))
    cam = np.maximum(cam, 0)
    cam = (cam - np.min(cam)) / (np.max(cam) - np.min(cam))
    cam = np.uint8(cam * 255)
    return cam

def get_guided_grad_cam(cam, imggrad):
    """Compute Guided Grad-CAM. Refer to section 3 of https://arxiv.org/abs/1610.02391 for details."""
    return np.multiply(cam, imggrad)

def get_img_heatmap(orig_img, activation_map):
    """Draw a heatmap on top of the original image using intensities from activation_map."""
    heatmap = cv2.applyColorMap(activation_map, cv2.COLORMAP_COOL)
    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
    img_heatmap = np.float32(heatmap) + np.float32(orig_img)
    img_heatmap = img_heatmap / np.max(img_heatmap)
    img_heatmap *= 255
    return img_heatmap.astype(int)

def to_grayscale(cv2im):
    """Convert gradients to grayscale. This gives a saliency map."""
    # How strongly does each position activate the output
    grayscale_im = np.sum(np.abs(cv2im), axis=0)
    # Normalize between min and 99th percentile
    im_max = np.percentile(grayscale_im, 99)
    im_min = np.min(grayscale_im)
    grayscale_im = np.clip((grayscale_im - im_min) / (im_max - im_min), 0, 1)
    grayscale_im = np.expand_dims(grayscale_im, axis=0)
    return grayscale_im

def visualize(net, preprocessed_img, orig_img, conv_layer_name):
    # Returns grad-cam heatmap, guided grad-cam, guided grad-cam saliency
    imggrad = get_image_grad(net, preprocessed_img)
    conv_out, conv_out_grad = get_conv_out_grad(net, preprocessed_img,
                                                conv_layer_name=conv_layer_name)
    cam = get_cam(conv_out_grad, conv_out)
    cam = cv2.resize(cam, (imggrad.shape[1], imggrad.shape[2]))
    ggcam = get_guided_grad_cam(cam, imggrad)
    img_ggcam = grad_to_image(ggcam)
    img_heatmap = get_img_heatmap(orig_img, cam)
    ggcam_gray = to_grayscale(ggcam)
    img_ggcam_gray = np.squeeze(grad_to_image(ggcam_gray))
    return img_heatmap, img_ggcam, img_ggcam_gray
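For reference, get_cam above implements the two Grad-CAM equations from section 3 of https://arxiv.org/abs/1610.02391. Writing A^k for the k-th feature map of the selected convolutional layer, y^c for the score of class c, and Z for the number of spatial positions:

\alpha_k^c = \frac{1}{Z} \sum_i \sum_j \frac{\partial y^c}{\partial A^k_{ij}},
\qquad
L^c_{\text{Grad-CAM}} = \mathrm{ReLU}\Big(\sum_k \alpha_k^c A^k\Big)

In the code, weights = np.mean(imggrad, axis=(1, 2)) computes the \alpha_k^c (despite its name, the imggrad argument receives the gradient of the conv layer's output when called from visualize), the loop accumulates \sum_k \alpha_k^c A^k, and np.maximum(cam, 0) is the ReLU; the result is then min-max normalized and scaled to [0, 255].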
- gradcam_demo.py
import mxnet as mx
from mxnet import gluon, image
from mxnet.gluon.data.vision import transforms
import argparse
import os
import numpy as np
import cv2
import vgg
import gradcam

transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Receive image path from command line
parser = argparse.ArgumentParser(description='Grad-CAM demo')
parser.add_argument('img_path', metavar='image_path', type=str,
                    help='path to the image file')
args = parser.parse_args()

# We'll use VGG-16 for visualization
network, synset = vgg.vgg16(pretrained=True, ctx=mx.cpu())

def read_image_cv(path):
    return cv2.resize(cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB), (224, 224))

def get_class_name(cls_id):
    return "%s (%d)" % (synset[cls_id], cls_id)

def run_inference(net, data):
    """Run the input image through the network and return the predicted category as integer."""
    out = net(data)
    return out.argmax(axis=1).asnumpy()[0].astype(int)

def visualize(net, img_path, conv_layer_name):
    """Create Grad-CAM visualizations using the network 'net' and the image at 'img_path'.
    conv_layer_name is the name of the topmost layer of the feature extractor."""
    _image = image.imread(img_path)
    _image = transform(_image)
    _image = _image.expand_dims(axis=0)
    pred_str = get_class_name(run_inference(net, _image))

    orig_img = read_image_cv(img_path)
    vizs = gradcam.visualize(net, _image, orig_img, conv_layer_name)
    return (pred_str, (orig_img, *vizs))

# Create Grad-CAM visualization for the user-provided image
last_conv_layer_name = 'vgg0_conv2d12'
cat, vizs = visualize(network, args.img_path, last_conv_layer_name)
print("{0:20}: {1:80}".format("Predicted category", cat))

# Write the visualizations to files
img_name = os.path.split(args.img_path)[1].split('.')[0]
suffixes = ['orig', 'gradcam', 'guided_gradcam', 'saliency']
image_desc = ['Original Image', 'Grad-CAM', 'Guided Grad-CAM', 'Saliency Map']
for i, img in enumerate(vizs):
    img = img.astype(np.float32)
    if len(img.shape) == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    out_file_name = "%s_%s.jpg" % (img_name, suffixes[i])
    cv2.imwrite(out_file_name, img)
    print("{0:20}: {1:80}".format(image_desc[i], out_file_name))
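For example, with an image file cat.jpg in the working directory (the filename here is just an illustration), the demo is run as:

python gradcam_demo.py cat.jpg

It prints the predicted ImageNet category and, following the suffix list in the code, writes four files next to the script: cat_orig.jpg, cat_gradcam.jpg, cat_guided_gradcam.jpg, and cat_saliency.jpg.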
- vgg.py
import mxnet as mx
from mxnet import gluon
import os
from mxnet.initializer import Xavier
from mxnet.gluon.nn import MaxPool2D, Flatten, Dense, Dropout, BatchNorm
from gradcam import Activation, Conv2D

class VGG(mx.gluon.HybridBlock):
    def __init__(self, layers, filters, classes=1000, batch_norm=False, **kwargs):
        super(VGG, self).__init__(**kwargs)
        assert len(layers) == len(filters)
        with self.name_scope():
            self.features = self._make_features(layers, filters, batch_norm)
            self.features.add(Dense(4096, activation='relu',
                                    weight_initializer='normal',
                                    bias_initializer='zeros'))
            self.features.add(Dropout(rate=0.5))
            self.features.add(Dense(4096, activation='relu',
                                    weight_initializer='normal',
                                    bias_initializer='zeros'))
            self.features.add(Dropout(rate=0.5))
            self.output = Dense(classes,
                                weight_initializer='normal',
                                bias_initializer='zeros')

    def _make_features(self, layers, filters, batch_norm):
        featurizer = mx.gluon.nn.HybridSequential(prefix='')
        for i, num in enumerate(layers):
            for _ in range(num):
                featurizer.add(Conv2D(filters[i], kernel_size=3, padding=1,
                                      weight_initializer=Xavier(rnd_type='gaussian',
                                                                factor_type='out',
                                                                magnitude=2),
                                      bias_initializer='zeros'))
                if batch_norm:
                    featurizer.add(BatchNorm())
                featurizer.add(Activation('relu'))
            featurizer.add(MaxPool2D(strides=2))
        return featurizer

    def hybrid_forward(self, F, x):
        x = self.features(x)
        x = self.output(x)
        return x

vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
            13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
            16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
            19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])}

def get_vgg(num_layers, pretrained=False, ctx=mx.cpu(),
            root=os.path.join('~', '.mxnet', 'models'), **kwargs):
    layers, filters = vgg_spec[num_layers]
    net = VGG(layers, filters, **kwargs)
    net.initialize(ctx=ctx)

    # Get the pretrained model
    #vgg = mx.gluon.model_zoo.vision.get_vgg(num_layers, pretrained=True, ctx=ctx)
    from gluoncv.model_zoo import get_model
    vgg = get_model('vgg%d' % num_layers, pretrained=True)  # 'vgg16' for num_layers=16

    # Set the parameters in the new network
    params = vgg.collect_params()
    for key in params:
        param = params[key]
        net.collect_params()[net.prefix + key.replace(vgg.prefix, '')].set_data(param.data())

    return net, vgg.classes

def vgg16(**kwargs):
    return get_vgg(16, **kwargs)
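The name 'vgg0_conv2d12' used in gradcam_demo.py is the auto-generated gluon prefix of the 13th (i.e. last) Conv2D block of VGG-16 (the counts in vgg_spec[16] sum to 13). If you adapt this to another depth, one way to find the right name is to print the conv block names; a small sketch, assuming the files above are importable from the current directory:

import mxnet as mx
import vgg

net, classes = vgg.vgg16(pretrained=True, ctx=mx.cpu())

# List the gradcam.Conv2D wrappers inside the feature extractor;
# the last name printed is what to pass as conv_layer_name.
for block in net.features:
    if 'conv2d' in block.name:
        print(block.name)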