Grad-CAM 的 PyTorch 应用

整理了一下 GradCAM 的代码实现，因为还没有阅读原文，所以并没有解释细节，只是记录一下对于一份现成的代码，如何整理为我所用。

1. 概述

Grad-CAM 概述：给定图像和感兴趣的类别作为输入，我们通过模型的 CNN 部分前向传播图像，然后通过特定于任务的计算获得该类别的原始分数。除了期望的类别（虎）的梯度设置为 1 之外，其余所有类别的梯度都设置为零。然后将该信号反向传播到卷积特征图，我们将其结合起来计算粗略的 Grad-CAM 定位（蓝色热图），它表示模型在做出特定决策时必须查看的位置。最后，我们将热图与引导反向传播（Guided Backpropagation）的结果逐点相乘，以获得高分辨率和特定于概念的引导式 Grad-CAM（Guided Grad-CAM）可视化 ¹。

¹. 参考 PyTorch 实现 GradCAM ↩

2. PyTorch 实现与应用

所参考和使用的是这个代码

这份代码针对常用的模型所写，如 VGG, ResNet 等等，并且输入是 $224 \times 224$ 的 3 通道彩色图片，对于自己的网络权重，以及对于输入大小并非 $224 \times 224$ 甚至不是彩色图像而是灰度图像的情况，需要自行改动。

该代码库中的核心文件是 interpretability/grad_cam.py 和 interpretability/guided_back_propagation.py，前者实现了 Grad-CAM 和 Grad-CAM++，后者实现了引导反向传播（Guided Back Propagation）。

interpretability/guided_back_propagation.py

这个文件的代码如下：

# -*- coding: utf-8 -*-
"""
Created on 2019/8/4 上午9:45

@author: mick.yi

"""
import torch
from torch import nn
import numpy as np


class GuidedBackPropagation(object):
    """Guided back-propagation: gradient of one class score w.r.t. the input.

    A backward hook is attached to every ``nn.ReLU`` module of ``net`` so
    that only non-negative gradients are passed back through it.  The hooks
    stay attached to ``net`` (and therefore affect every other backward pass
    through it) until :meth:`remove_handlers` is called.
    """

    def __init__(self, net):
        """Hook every ReLU module of ``net`` and switch ``net`` to eval mode.

        :param net: network to explain
        """
        self.net = net
        # Keep the handles: without them the hooks could never be detached.
        self.handlers = []
        for (name, module) in self.net.named_modules():
            if isinstance(module, nn.ReLU):
                self.handlers.append(
                    module.register_backward_hook(self.backward_hook))
        self.net.eval()

    def remove_handlers(self):
        """Detach every hook this instance registered on the network."""
        for handle in self.handlers:
            handle.remove()
        self.handlers = []

    @classmethod
    def backward_hook(cls, module, grad_in, grad_out):
        """Clamp the gradient leaving a ReLU module to be non-negative.

        :param module: the hooked module
        :param grad_in: tuple of length 1
        :param grad_out: tuple of length 1
        :return: tuple(new_grad_in,)
        """
        return torch.clamp(grad_in[0], min=0.0),

    def __call__(self, inputs, index=None):
        """Back-propagate one class score and return the input gradient.

        The gradient is read from ``inputs.grad``, which accumulates across
        backward passes; zero it beforehand if ``inputs`` was already used.

        :param inputs: [1,3,H,W], must require gradients
        :param index: class_id; ``None`` selects the highest-scoring class
        :return: gradient of the class score w.r.t. the input, [3,H,W]
        """
        self.net.zero_grad()
        output = self.net(inputs)  # [1,num_classes]
        if index is None:
            index = np.argmax(output.cpu().data.numpy())
        target = output[0][index]

        # No-op for a leaf tensor; for a non-leaf one (e.g. a tensor moved
        # to another device after creation) ``.grad`` would stay None.
        inputs.retain_grad()
        target.backward()

        return inputs.grad[0]  # [3,H,W]

对于这份代码，我没有进行任何更改（我的输入图像是 $800 \times 750$ 的单通道灰度图）。从引用来看，这份代码是最『原始』的，只引用了最基本的库，而没有引用这份代码库里的其他代码。

interpretability/grad_cam.py

这个文件的代码如下：

# -*- coding: utf-8 -*-
"""
Created on 2019/8/4 上午9:37

@author: mick.yi

"""
import numpy as np
import cv2


class GradCAM(object):
    """Grad-CAM: class-activation map built from one layer's gradients.

    1: the network weights are not updated; the input must carry gradients
    2: the score of the target class is what gets back-propagated
    """

    def __init__(self, net, layer_name, output_size=(800, 750)):
        """Hook ``layer_name`` of ``net`` and switch ``net`` to eval mode.

        :param net: network to explain
        :param layer_name: name of the layer to hook, as produced by
            ``net.named_modules()``
        :param output_size: ``(width, height)`` handed to ``cv2.resize`` for
            the returned map (channels are not part of it).  The default is
            the value that used to be hard-coded; pass ``None`` to use the
            spatial size of the input tensor instead.
        """
        self.net = net
        self.layer_name = layer_name
        self.output_size = output_size
        self.feature = None
        self.gradient = None
        self.net.eval()
        self.handlers = []
        self._register_hook()

    def _get_features_hook(self, module, input, output):
        """Forward hook: remember the output of the hooked layer."""
        self.feature = output
        print("feature shape:{}".format(output.size()))

    def _get_grads_hook(self, module, input_grad, output_grad):
        """Backward hook: remember the gradient w.r.t. the layer output.

        :param input_grad: tuple, input_grad[0]: None
                                   input_grad[1]: weight
                                   input_grad[2]: bias
        :param output_grad: tuple of length 1
        :return:
        """
        self.gradient = output_grad[0]

    def _register_hook(self):
        """Attach the forward and backward hooks to the named layer."""
        for (name, module) in self.net.named_modules():
            if name == self.layer_name:
                self.handlers.append(module.register_forward_hook(self._get_features_hook))
                self.handlers.append(module.register_backward_hook(self._get_grads_hook))

    def remove_handlers(self):
        """Detach every hook this instance registered on the network."""
        for handle in self.handlers:
            handle.remove()

    def _class_gradients(self, inputs, index):
        """Run forward + backward for one class score.

        :return: tuple(feature, gradient), numpy arrays of the hooked layer
            for the first sample, each [C,H,W]
        :raises ValueError: if the hooks captured nothing
        """
        # Forget values from an earlier call so stale data is never reused.
        self.feature = None
        self.gradient = None

        self.net.zero_grad()
        output = self.net(inputs)  # [1,num_classes]
        if index is None:
            index = np.argmax(output.cpu().data.numpy())
        target = output[0][index]

        inputs.retain_grad()
        target.backward()

        if self.feature is None or self.gradient is None:
            raise ValueError(
                "no feature/gradient captured for layer {!r}; check that the "
                "name exists in net.named_modules() and the hooks are still "
                "attached".format(self.layer_name))

        feature = self.feature[0].cpu().data.numpy()  # [C,H,W]
        gradient = self.gradient[0].cpu().data.numpy()  # [C,H,W]
        return feature, gradient

    @staticmethod
    def _normalize(cam):
        """Shift the map to start at 0 and scale its maximum to 1.

        A constant map is returned as all zeros instead of dividing by zero.
        """
        cam = cam - np.min(cam)
        peak = np.max(cam)
        if peak > 0:
            cam = cam / peak
        return cam

    def _resize(self, cam, inputs):
        """Resize the map to ``output_size`` (or to the input's H x W)."""
        size = self.output_size
        if size is None:
            height, width = inputs.shape[-2:]
            size = (int(width), int(height))  # OpenCV expects (w, h)
        return cv2.resize(cam, tuple(size))

    def __call__(self, inputs, index=None):
        """Compute the Grad-CAM mask for one class.

        :param inputs: [1,3,H,W], must require gradients
        :param index: class id; ``None`` selects the highest-scoring class
        :return: mask scaled to 0~1 and resized to ``output_size``
        """
        feature, gradient = self._class_gradients(inputs, index)

        weight = np.mean(gradient, axis=(1, 2))  # [C]
        cam = np.sum(feature * weight[:, np.newaxis, np.newaxis], axis=0)  # [H,W]
        cam = np.maximum(cam, 0)  # ReLU

        return self._resize(self._normalize(cam), inputs)


class GradCamPlusPlus(GradCAM):
    """Grad-CAM++ variant; construction and hooks are inherited from GradCAM."""

    def __call__(self, inputs, index=None):
        """Compute the Grad-CAM++ mask for one class.

        :param inputs: [1,3,H,W], must require gradients
        :param index: class id; ``None`` selects the highest-scoring class
        :return: mask scaled to 0~1 and resized to ``output_size``
        """
        feature, gradient = self._class_gradients(inputs, index)

        gradient = np.maximum(gradient, 0.)  # ReLU
        indicate = np.where(gradient > 0, 1., 0.)  # indicator function
        norm_factor = np.sum(gradient, axis=(1, 2))  # [C] normalisation
        # Reciprocal per channel; channels whose sum is 0 stay at 0 so that
        # nothing is divided by zero.
        inverse = np.zeros_like(norm_factor)
        np.divide(1., norm_factor, out=inverse, where=norm_factor > 0.)
        alpha = indicate * inverse[:, np.newaxis, np.newaxis]  # [C,H,W]

        weight = np.sum(gradient * alpha, axis=(1, 2))  # [C]  alpha*ReLU(gradient)

        # Unlike GradCAM, no ReLU is applied to the summed map here.
        cam = np.sum(feature * weight[:, np.newaxis, np.newaxis], axis=0)  # [H,W]

        return self._resize(self._normalize(cam), inputs)

这个文件包含两个类：GradCAM 和 GradCamPlusPlus，分别实现了 Grad-CAM 和 Grad-CAM++。这两个类是需要改动的，主要是每个类的 __call__ 方法最后 resize 的目标尺寸：原文件把它写死为 (224, 224)，自己用时应改成自己的输入图像尺寸，注意 OpenCV 的 resize 参数格式为 (w, h)。

最后实际的入口是 main.py，这部分也是做了改动的，改动的代码如下，具体细节写在了代码的注释里：

from torchvision import transforms
import numpy as np
from PIL import Image

import os

import cv2
import torch
from skimage import io
from torch import nn

from interpretability.grad_cam import GradCAM, GradCamPlusPlus
from interpretability.guided_back_propagation import GuidedBackPropagation


"""
原来的代码中最开始还有个 get_net 函数，用于根据函数名获取 torchvision.models 中的模型类，自己使用的时候
网络都是自己定义的，因此把这个函数删掉了，改为后面手动解决
"""

def get_last_conv_name(net):
    """
    Return the name of the last ``nn.Conv2d`` found in ``net``.

    Modules are visited in ``net.named_modules()`` order.
    :param net: network to search
    :return: module name, or None when the network has no Conv2d
    """
    conv_names = [
        module_name
        for module_name, module in net.named_modules()
        if isinstance(module, nn.Conv2d)
    ]
    return conv_names[-1] if conv_names else None


"""
prepare_input 在原文件的定义如下
它的输入 image 是一个图片的 numpy 数组除以 255，shape = (224, 224, 3)
def prepare_input(image):
    image = image.copy()

    # 归一化
    means = np.array([0.485, 0.456, 0.406])
    stds = np.array([0.229, 0.224, 0.225])
    image -= means
    image /= stds

    image = np.ascontiguousarray(np.transpose(image, (2, 0, 1)))  # channel first
    image = image[np.newaxis, ...]  # 增加batch维

    return torch.tensor(image, requires_grad=True)
"""

def prepare_input(image_path, device=None):
    """
    Load an image file as a ``[1,C,H,W]`` tensor that tracks gradients.

    The file is opened with PIL and converted with
    ``transforms.ToTensor()``; no mean/std normalisation is applied.
    :param image_path: path of the image file
    :param device: device for the returned tensor; ``None`` selects CUDA
        when it is available and the CPU otherwise
    :return: leaf tensor on ``device`` with ``requires_grad=True``
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The context manager closes the underlying file once it is decoded.
    with Image.open(image_path) as image:
        tensor = transforms.ToTensor()(image)

    # Move first, then enable gradients: the result is a leaf tensor, so
    # its ``.grad`` is populated by backward().
    return tensor.unsqueeze(0).to(device).requires_grad_(True)


def gen_cam(image, mask):
    """
    Blend a CAM mask over an image.

    :param image: [H,W,C], original image
    :param mask: [H,W], values in 0~1
    :return: tuple(cam, heatmap) -- the normalised overlay and the
        colour-mapped mask, both uint8
    """
    # Colour-map the mask (JET) and bring it back to the 0~1 range.
    colored = cv2.applyColorMap(np.uint8(255 * mask), cv2.COLORMAP_JET)
    heatmap = (np.float32(colored) / 255)[..., ::-1]  # reverse channels: BGR -> RGB

    # Add the heatmap onto the original image and rescale the sum.
    overlay = norm_image(heatmap + np.float32(image))
    heatmap_u8 = (heatmap * 255).astype(np.uint8)
    return overlay, heatmap_u8


def norm_image(image):
    """
    Min-max scale an image to 0~255 and convert it to uint8.

    The input array is left untouched (a copy is scaled).  An image whose
    values are all equal comes back as all zeros.
    :param image: [H,W,C], floating-point array
    :return: uint8 array of the same shape
    """
    image = image.copy()
    # The old ``np.max(np.min(image), 0)`` passed 0 as the *axis* of np.max,
    # so it was just the minimum; say so directly.
    image -= np.min(image)
    peak = np.max(image)
    if peak > 0:  # constant image: avoid 0/0 -> NaN
        image /= peak
    image *= 255.
    return np.uint8(image)


def gen_gb(grad):
    """
    Convert the guided back-propagation input gradient to channel-last numpy.

    :param grad: tensor, [3,H,W]
    :return: numpy array, [H,W,3]
    """
    channels_first = grad.data.cpu().numpy()
    return channels_first.transpose(1, 2, 0)


def save_image(image_dicts, input_image_name, network, output_dir):
    """
    Write every image of ``image_dicts`` to ``output_dir/<key>/``.

    Files are named ``<input name without extension>-<network>-<key>.jpg``.
    :param image_dicts: dict mapping a result key to its image array
    :param input_image_name: file name of the input image
    :param network: network label used in the file name
    :param output_dir: root directory; each ``<key>`` sub-directory is
        joined onto it
    :return:
    """
    stem, _ = os.path.splitext(input_image_name)
    for key in image_dicts:
        file_name = '{}-{}-{}.jpg'.format(stem, network, key)
        target_path = os.path.join(output_dir, key, file_name)
        io.imsave(target_path, image_dicts[key])


def main(image_path, output_dir):
    """
    Generate and save the Grad-CAM family of visualisations for one image.

    Produces the ``cam``, ``heatmap``, ``cam++``, ``heatmap++``, ``gb`` and
    ``cam_gb`` images and stores each in its own sub-directory of
    ``output_dir``.

    NOTE: the model is read from the module-level name ``net`` (assigned in
    the ``__main__`` section); it is not a parameter of this function.
    :param image_path: path of the input image
    :param output_dir: existing directory that receives the result folders
    :raises IOError: if OpenCV cannot read ``image_path``
    """
    import copy  # only needed for the guided back-propagation model copy

    class_id = None
    layer_name = None
    network = "resnet18"  # label used in the output file names

    # Input
    # The image is read with 3 channels whether the file is colour or
    # grayscale; this copy is only used for drawing the overlays.
    img = cv2.imread(image_path)
    if img is None:
        raise IOError("cannot read image: {}".format(image_path))
    # OpenCV decodes to BGR, while the heatmap from gen_cam is RGB; reverse
    # the channels so both use the same order.
    img = np.float32(img[..., ::-1]) / 255

    # This is what is actually fed to the network, shape = (1, c, h, w)
    inputs = prepare_input(image_path)  # tensor

    # Output images
    image_dict = {}

    # Grad-CAM
    layer_name = get_last_conv_name(net) if layer_name is None else layer_name
    grad_cam = GradCAM(net, layer_name)
    mask = grad_cam(inputs, class_id)  # cam mask
    image_dict['cam'], image_dict['heatmap'] = gen_cam(img, mask)
    grad_cam.remove_handlers()

    # Grad-CAM++
    grad_cam_plus_plus = GradCamPlusPlus(net, layer_name)
    mask_plus_plus = grad_cam_plus_plus(inputs, class_id)  # cam mask
    image_dict['cam++'], image_dict['heatmap++'] = gen_cam(img, mask_plus_plus)
    grad_cam_plus_plus.remove_handlers()

    # GuidedBackPropagation
    # It hooks every ReLU of the model it is given. Running it on a private
    # copy keeps those hooks off the shared ``net``; otherwise they would
    # clamp the gradients of the Grad-CAM passes for every later image.
    gbp = GuidedBackPropagation(copy.deepcopy(net))
    inputs.grad.zero_()  # reset the gradient accumulated above
    grad = gbp(inputs)

    gb = gen_gb(grad)
    image_dict['gb'] = norm_image(gb)
    # Guided Grad-CAM: guided gradient weighted by the Grad-CAM mask
    cam_gb = gb * mask[..., np.newaxis]
    image_dict['cam_gb'] = norm_image(cam_gb)

    for key in image_dict.keys():
        if not os.path.exists(os.path.join(output_dir, key)):
            os.mkdir(os.path.join(output_dir, key))
    save_image(image_dict, os.path.basename(image_path), network, output_dir)


if __name__ == '__main__':
    # Run every file of ``input_images/`` through main() with the model
    # stored in ``best_model.pth``; results go to ``cam_images/``.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    checkpoint = "best_model.pth"

    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    # map_location lets a checkpoint saved on a GPU be opened on a CPU-only
    # machine, matching the device fallback above.
    net = torch.load(checkpoint, map_location=device).to(device)

    output_dir = "cam_images/"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    image_dir = "input_images/"

    for fname in os.listdir(image_dir):
        image_path = os.path.join(image_dir, fname)
        main(image_path, output_dir)
        print(fname)

另外一个 star 更多的实现：jacobgil/pytorch-grad-cam