Source code for chainer.functions.pooling.spatial_pyramid_pooling_2d

import numpy
import six

from chainer import cuda
from chainer.functions.array import concat
from chainer.functions.pooling import max_pooling_2d
from chainer.functions.pooling import pooling_2d


class SpatialPyramidPooling2D(pooling_2d.Pooling2D):

    """Spatial pyramid pooling over a set of 2d planes."""

    def __init__(self, x_shape, pyramid_height, pooling_class, use_cudnn=True):
        bottom_c, bottom_h, bottom_w = x_shape
        self.pyramid_height = pyramid_height

        # create pooling functions for different pyramid levels
        out_dim = 0
        self.split_inds = []
        self.poolers = []
        for pyramid_level in six.moves.range(pyramid_height):
            num_bins = int(2 ** pyramid_level)

            ksize_h = int(numpy.ceil(bottom_h / (float(num_bins))))
            remainder_h = ksize_h * num_bins - bottom_h
            pad_h = remainder_h // 2

            ksize_w = int(numpy.ceil(bottom_w / (float(num_bins))))
            remainder_w = ksize_w * num_bins - bottom_w
            pad_w = remainder_w // 2

            ksize = (ksize_h, ksize_w)
            pad = (pad_h, pad_w)

            if pooling_class is max_pooling_2d.MaxPooling2D:
                pooler = pooling_class(ksize=ksize, stride=None, pad=pad,
                                       cover_all=True, use_cudnn=use_cudnn)
                self.poolers.append(pooler)
            else:
                raise NotImplementedError()

            out_dim += bottom_c * (num_bins ** 2)
            if pyramid_level < pyramid_height - 1:
                self.split_inds.append(out_dim)

    def forward(self, x):
        self.ys = []
        for pooler in self.poolers:
            y = pooler.forward(x)[0]
            n, c, h, w = pooler.out_shape = y.shape
            self.ys.append(y.reshape((n, c * h * w, 1, 1)))

        return concat.Concat(axis=1).forward(self.ys)

    def backward(self, x, gy):
        xp = cuda.get_array_module(*x)
        gx = xp.zeros_like(x[0])
        gys = xp.split(gy[0], self.split_inds, axis=1)
        for pooler, gy in zip(self.poolers, gys):
            gy = gy.reshape(pooler.out_shape)
            gx += pooler.backward(x, (gy,))[0]

        return gx,


[docs]def spatial_pyramid_pooling_2d(x, pyramid_height, pooling_class,
                               use_cudnn=True):
    """Spatial pyramid pooling function.

    It outputs a fixed-length vector regardless of input feature map size.

    It performs pooling operation to the input 4D-array ``x`` with different
    kernel sizes and padding sizes, and then flattens all dimensions except
    first dimension of all pooling results, and finally concatenates them along
    second dimension.

    At :math:`i`-th pyramid level, the kernel size
    :math:`(k_h^{(i)}, k_w^{(i)})` and padding size
    :math:`(p_h^{(i)}, p_w^{(i)})` of pooling operation are calculated as
    below:

    .. math::
        k_h^{(i)} &= \\lceil b_h / 2^i \\rceil, \\\\
        k_w^{(i)} &= \\lceil b_w / 2^i \\rceil, \\\\
        p_h^{(i)} &= (2^i k_h^{(i)} - b_h) / 2, \\\\
        p_w^{(i)} &= (2^i k_w^{(i)} - b_w) / 2,

    where :math:`\\lceil \\cdot \\rceil` denotes the ceiling function, and
    :math:`b_h, b_w` are height and width of input variable ``x``,
    respectively. Note that index of pyramid level :math:`i` is zero-based.

    See detail in paper: `Spatial Pyramid Pooling in Deep Convolutional \
    Networks for Visual Recognition \
    <https://arxiv.org/abs/1406.4729>`_.

    Args:
        x (~chainer.Variable): Input variable. The shape of ``x`` should be
            ``(batchsize, # of channels, height, width)``.
        pyramid_height (int): Number of pyramid levels
        pooling_class (MaxPooling2D or AveragePooling2D):
            Only MaxPooling2D class can be available for now.
        use_cudnn (bool): If ``True`` and cuDNN is enabled, then this function
            uses cuDNN as the core implementation.

    Returns:
        ~chainer.Variable: Output variable. The shape of the output variable
            will be :math:`(batchsize, c \\sum_{h=0}^{H-1} 2^{2h}, 1, 1)`,
            where :math:`c` is the number of channels of input variable ``x``
            and :math:`H` is the number of pyramid levels.

    .. note::

        This function uses some pooling classes as components to perform
        spatial pyramid pooling. Now it supports only
        :class:`~functions.MaxPooling2D` as elemental pooling operator so far.

    """

    return SpatialPyramidPooling2D(x.shape[1:], pyramid_height,
                                   pooling_class, use_cudnn=use_cudnn)(x)