# Source code for chainer.optimizer

import collections

import numpy
import six

from chainer import cuda
import chainer.link as link_module


def _sum_sqnorm(arr):
    """Computes the sum of squared elements over all arrays in ``arr``.

    Each array's squared norm is computed on the device that holds it
    (`x.dot(x)` runs under that array's device context), and the per-device
    partial sums are combined on the CPU at the end.

    Args:
        arr (iterable): Iterable of arrays (CPU or GPU resident).

    Returns:
        float: Sum of squares of all elements of all arrays.
    """
    sq_sum = collections.defaultdict(float)
    for x in arr:
        with cuda.get_device_from_array(x) as dev:
            x = x.ravel()
            s = x.dot(x)
            # Accumulate per device id (-1 means CPU) to avoid cross-device
            # transfers inside the loop.
            sq_sum[int(dev)] += s
    # Generator expression: no need to materialize an intermediate list.
    return sum(float(i) for i in six.itervalues(sq_sum))


def exponential_decay_noise(xp, shape, dtype, hook, opt):
    """Time-dependent annealed Gaussian noise function from the paper:

    `Adding Gradient Noise Improves Learning for Very Deep Networks
    <https://arxiv.org/pdf/1511.06807>`_.
    """
    # sigma_t = sqrt(eta / (1 + t)^0.55); the noise variance anneals as the
    # step count opt.t grows.
    scale = numpy.sqrt(hook.eta / numpy.power(1 + opt.t, 0.55))
    sample = xp.random.normal(0, scale, shape)
    return sample.astype(dtype)


class Optimizer(object):

    """Base class of all numerical optimizers.

    This class provides basic features for all optimization methods. It
    optimizes parameters of a *target link*. The target link is registered
    via the :meth:`setup` method, and then the :meth:`update` method updates
    its parameters based on a given loss function.

    Each optimizer implementation must be defined as a child class of
    Optimizer. It must override :meth:`update` method.

    An optimizer can use *internal states* each of which is tied to one of
    the parameters. State is a dictionary of serializable values (typically
    arrays of size same as the corresponding parameters). In order to use
    state dictionaries, the optimizer must override :meth:`init_state` method
    (or its CPU/GPU versions, :meth:`init_state_cpu` and
    :meth:`init_state_gpu`).

    If the optimizer is based on single gradient computation (like most
    first-order methods), then it should inherit :class:`GradientMethod`,
    which adds some features dedicated for the first order methods.

    Optimizer instance also supports *hook functions*. Hook function is
    registered by the :meth:`add_hook` method. Each hook function is called
    in registration order in advance of the actual parameter update.

    Attributes:
        target: Target link object. It is set by the :meth:`setup` method.
        t: Number of update steps. It must be incremented by the
            :meth:`update` method.
        epoch: Current epoch. It is incremented by the :meth:`new_epoch`
            method.

    """
[docs] def setup(self, link): """Sets a target link and initializes the optimizer states. Given link is set to the :attr:`target` attribute. It also prepares the optimizer state dictionaries corresponding to all parameters in the link hierarchy. The existing states are discarded. Args: link (~chainer.Link): Target link object. """ if not isinstance(link, link_module.Link): raise TypeError('optimization target must be a link') self.target = link self.t = 0 self.epoch = 0 self._states = {} self._hooks = collections.OrderedDict() self.prepare()
[docs] def prepare(self): """Prepares for an update. This method initializes missing optimizer states (e.g. for newly added parameters after the set up), and copies arrays in each state dictionary to CPU or GPU according to the corresponding parameter array. """ states = self._states for name, param in self.target.namedparams(): if name not in states: state = {} self.init_state(param, state) states[name] = state else: state = states[name] with cuda.get_device_from_array(param.data) as dev: if int(dev) == -1: # cpu for key, value in six.iteritems(state): if isinstance(value, cuda.ndarray): state[key] = value.get() else: # gpu cupy = cuda.cupy for key, value in six.iteritems(state): if isinstance(value, numpy.ndarray): state[key] = cuda.to_gpu(value) elif (isinstance(value, cupy.ndarray) and value.device != dev): state[key] = cupy.copy(value)
[docs] def init_state(self, param, state): """Initializes the optimizer state corresponding to the parameter. This method should add needed items to the ``state`` dictionary. Each optimizer implementation that uses its own states should override this method or CPU/GPU dedicated versions (:meth:`init_state_cpu` and :meth:`init_state_gpu`). Args: param (~chainer.Variable): Parameter variable. state (dict): State dictionary. .. seealso:: :meth:`init_state_cpu`, :meth:`init_state_gpu` """ with cuda.get_device_from_array(param.data) as dev: if int(dev) == -1: self.init_state_cpu(param, state) else: self.init_state_gpu(param, state)
[docs] def init_state_cpu(self, param, state): """Initializes the optimizer state on CPU. This method is called from :meth:`init_state` by default. Args: param (~chainer.Variable): Parameter variable. Its data array is of type :class:`numpy.ndarray`. state (dict): State dictionary. .. seealso:: :meth:`init_state` """ pass
[docs] def init_state_gpu(self, param, state): """Initializes the optimizer state on GPU. This method is called from :meth:`init_state` by default. Args: param (~chainer.Variable): Parameter variable. Its data array is of type :class:`cupy.ndarray`. state (dict): State dictionary. .. seealso:: :meth:`init_state` """ pass
[docs] def update(self, lossfun=None, *args, **kwds): """Updates the parameters and optimizer states. This method updates the parameters of the target link and corresponding optimizer states. The behavior of this method is different for the cases either ``lossfun`` is given or not. If ``lossfun`` is given, then this method initializes the gradients by zeros, calls it with given extra arguments, and calls the :meth:`~chainer.Variable.backward` method of its output to compute the gradients. The implementation might call ``lossfun`` more than once. If ``lossfun`` is not given, then this method assumes that the gradients of all parameters are already computed. An implementation that requires multiple gradient computations might raise an error on this case. In both cases, this method invokes the update procedure for all parameters. Args: lossfun (function): Loss function. It accepts arbitrary arguments and returns one :class:`~chainer.Variable` object that represents the loss (or objective) value. This argument can be omitted for single gradient-based methods. In this case, this method assumes gradient arrays computed. args, kwds: Arguments for the loss function. """ raise NotImplementedError
[docs] def new_epoch(self): """Starts a new epoch. This method increments the :attr:`epoch` count. Note that if the optimizer depends on the epoch count, then user should call this method appropriately at the beginning of each epoch. """ self.epoch += 1
[docs] def add_hook(self, hook, name=None): """Registers a hook function. Hook function is typically called right after the gradient computation, though the timing depends on the optimization method. Args: hook (function): Hook function. It accepts the optimizer object. name (str): Name of the registration. If omitted, ``hook.name`` is used by default. """ if not callable(hook): raise TypeError('hook function is not callable') if not hasattr(self, '_hooks'): raise RuntimeError('call `setup` method before `add_hook` method') if name is None: name = hook.name if name in self._hooks: raise KeyError('hook %s already exists' % name) self._hooks[name] = hook
[docs] def remove_hook(self, name): """Removes a hook function. Args: name (str): Registered name of the hook function to remove. """ del self._hooks[name]
[docs] def call_hooks(self): """Invokes hook functions in registration order.""" for hook in six.itervalues(self._hooks): hook(self)
[docs] def serialize(self, serializer): """Serializes or deserializes the optimizer. It only saves or loads the following things: - Optimizer states - Global states (:attr:`t` and :attr:`epoch`) **It does not saves nor loads the parameters of the target link.** They should be separately saved or loaded. Args: serializer (~chainer.AbstractSerializer): Serializer or deserializer object. """ self.t = serializer('t', self.t) self.epoch = serializer('epoch', self.epoch) for name, state in six.iteritems(self._states): s = serializer[name] for key, value in six.iteritems(state): state[key] = s(key, value)
[docs] def zero_grads(self): """Fills all gradient arrays by zeros. .. deprecated:: v1.5 Use the :meth:`chainer.Link.cleargrads` method for the target link instead. """ self.target.zerograds()
[docs] def compute_grads_norm(self): """Computes the norm of whole gradients. Returns: float: L2 norm of whole gradients, i.e. square root of sum of square of all gradient elements. .. warning:: This method returns a CPU-computed value, which means that this method synchronizes between CPU and GPU if at least one of the gradients reside on the GPU. .. deprecated:: v1.5 """ return numpy.sqrt(_sum_sqnorm([p.grad for p in self.target.params()]))
[docs] def clip_grads(self, maxnorm): """Clips the norm of whole gradients up to the threshold. Args: maxnorm (float): Threshold of gradient L2 norm. .. deprecated:: v1.5 Use the :class:`~chainer.optimizer.GradientClipping` hook function instead. """ GradientClipping(maxnorm)(self)
[docs] def weight_decay(self, decay): """Applies weight decay to the parameter/gradient pairs. Args: decay (float): Coefficient of weight decay. .. deprecated:: v1.5 Use the :class:`~chainer.optimizer.WeightDecay` hook function instead. """ WeightDecay(decay)(self)
[docs] def accumulate_grads(self, grads): """Accumulates gradients from other source. This method just adds given gradient arrays to gradients that this optimizer holds. It is typically used in data-parallel optimization, where gradients for different shards are computed in parallel and aggregated by this method. This method correctly treats multiple GPU devices. Args: grads (Iterable): Iterable of gradient arrays to be accumulated. .. deprecated:: v1.5 Use the :meth:`chainer.Link.addgrads` method of the target link instead. """ for param, g_src in zip(self.target.params(), grads): g_dst = param.grad if isinstance(g_dst, numpy.ndarray): g_dst += cuda.to_cpu(g_src) continue with cuda.get_device_from_array(g_dst): if (isinstance(g_src, cuda.ndarray) and g_dst.device != g_src.device): g_dst += cuda.copy(g_src, out_device=g_dst.device) else: g_dst += cuda.to_gpu(g_src)
class GradientMethod(Optimizer):

    """Base class of all single gradient-based optimizers.

    This is an extension of the :class:`Optimizer` class. Typical gradient
    methods that just require the gradient at the current parameter vector on
    an update can be implemented as its child class.

    An implementation of a gradient method must override the following
    methods:

    - :meth:`init_state` or both :meth:`init_state_cpu` and
      :meth:`init_state_gpu`
    - :meth:`update_one` or both :meth:`update_one_cpu` and
      :meth:`update_one_gpu`

    .. note::
       It is recommended to call :meth:`use_cleargrads` after creating a
       :class:`GradientMethod` object for efficiency.

    """
[docs] def reallocate_cleared_grads(self): """Reallocate gradients cleared by :meth:`~chainer.Variable.cleargrad`. This method allocates arrays for all gradients which have :obj:`None`. This method is called before and after every optimizer hook. If an inheriting optimizer does not require this allocation, the optimizer can override this method with a blank function. """ for name, param in self.target.namedparams(): if param.grad is None: with cuda.get_device_from_array(param.data): xp = cuda.get_array_module(param.data) param.grad = xp.zeros_like(param.data)
[docs] def call_hooks(self): """Invokes hook functions in registration order.""" for hook in six.itervalues(self._hooks): hook(self) self.reallocate_cleared_grads()
[docs] def update(self, lossfun=None, *args, **kwds): """Updates parameters based on a loss function or computed gradients. This method runs in two ways. - If ``lossfun`` is given, then use it as a loss function to compute gradients. - Otherwise, this method assumes that the gradients are already computed. In both cases, the computed gradients are used to update parameters. The actual update routines are defined by the :meth:`update_one` method (or its CPU/GPU versions, :meth:`update_one_cpu` and :meth:`update_one_gpu`). """ if lossfun is not None: use_cleargrads = getattr(self, '_use_cleargrads', False) loss = lossfun(*args, **kwds) if use_cleargrads: self.target.cleargrads() else: self.target.zerograds() loss.backward() del loss self.reallocate_cleared_grads() self.call_hooks() self.prepare() self.t += 1 states = self._states for name, param in self.target.namedparams(): with cuda.get_device_from_array(param.data): self.update_one(param, states[name])
[docs] def update_one(self, param, state): """Updates a parameter based on the corresponding gradient and state. This method calls appropriate one from :meth:`update_param_cpu` or :meth:`update_param_gpu`. Args: param (~chainer.Variable): Parameter variable. state (dict): State dictionary. """ if isinstance(param.data, numpy.ndarray): self.update_one_cpu(param, state) else: self.update_one_gpu(param, state)
[docs] def update_one_cpu(self, param, state): """Updates a parameter on CPU. Args: param (~chainer.Variable): Parameter variable. state (dict): State dictionary. """ raise NotImplementedError
[docs] def update_one_gpu(self, param, state): """Updates a parameter on GPU. Args: param (~chainer.Variable): Parameter variable. state (dict): State dictionary. """ raise NotImplementedError
[docs] def use_cleargrads(self, use=True): """Enables or disables use of :func:`~chainer.Link.cleargrads` in `update`. Args: use (bool): If ``True``, this function enables use of `cleargrads`. If ``False``, disables use of `cleargrads` (`zerograds` is used). .. note:: Note that :meth:`update` calls :meth:`~Link.zerograds` by default for backward compatibility. It is recommended to call this method before first call of `update` because `cleargrads` is more efficient than `zerograds`. """ self._use_cleargrads = use
class WeightDecay(object):

    """Optimizer hook function for weight decay regularization.

    This hook function adds a scaled parameter to the corresponding gradient.
    It can be used as a regularization.

    Args:
        rate (float): Coefficient for the weight decay.

    Attributes:
        rate (float): Coefficient for the weight decay.

    """
    name = 'WeightDecay'

    def __init__(self, rate):
        self.rate = rate

    def kernel(self):
        # Elementwise GPU kernel computing g += decay * p.
        return cuda.elementwise(
            'T p, T decay', 'T g', 'g += decay * p', 'weight_decay')

    def __call__(self, opt):
        coeff = self.rate
        for param in opt.target.params():
            p, g = param.data, param.grad
            with cuda.get_device_from_array(p) as dev:
                if int(dev) == -1:
                    # CPU path: plain in-place numpy arithmetic.
                    g += coeff * p
                else:
                    self.kernel()(p, coeff, g)
class Lasso(object):

    """Optimizer hook function for Lasso regularization.

    This hook function adds a scaled parameter to the sign of each weight.
    It can be used as a regularization.

    Args:
        rate (float): Coefficient for the weight decay.

    Attributes:
        rate (float): Coefficient for the weight decay.

    """
    name = 'Lasso'

    def __init__(self, rate):
        self.rate = rate

    def kernel(self):
        # Elementwise GPU kernel computing g += decay * sign(p).
        return cuda.elementwise(
            'T s, T decay', 'T g', 'g += decay * s', 'lasso')

    def __call__(self, opt):
        coeff = self.rate
        for param in opt.target.params():
            p, g = param.data, param.grad
            xp = cuda.get_array_module(p)
            sign = xp.sign(p)
            with cuda.get_device_from_array(p) as dev:
                if int(dev) == -1:
                    g += coeff * sign
                else:
                    self.kernel()(sign, coeff, g)
class GradientClipping(object):

    """Optimizer hook function for gradient clipping.

    This hook function scales all gradient arrays to fit to the defined L2
    norm threshold.

    Args:
        threshold (float): L2 norm threshold.

    Attributes:
        threshold (float): L2 norm threshold of gradient norm.

    """
    name = 'GradientClipping'

    def __init__(self, threshold):
        self.threshold = threshold

    def __call__(self, opt):
        # Global L2 norm across all gradient arrays (all devices).
        sqnorm = _sum_sqnorm([p.grad for p in opt.target.params()])
        norm = numpy.sqrt(sqnorm)
        rate = self.threshold / norm
        if rate < 1:
            # Norm exceeds the threshold: scale every gradient down.
            for param in opt.target.params():
                grad = param.grad
                with cuda.get_device_from_array(grad):
                    grad *= rate
class GradientNoise(object):

    """Optimizer hook function for adding gradient noise.

    This hook function simply adds noise generated by the ``noise_func``
    to the gradient. By default it adds time-dependent annealed Gaussian
    noise to the gradient at every training step:

    .. math::

        g_t \\leftarrow g_t + N(0, \\sigma_t^2)

    where

    .. math::

        \\sigma_t^2 = \\frac{\\eta}{(1+t)^\\gamma}

    with :math:`\\eta` selected from {0.01, 0.3, 1.0} and
    :math:`\\gamma = 0.55`.

    Args:
        eta (float): Parameter that defines the scale of the noise, which for
            the default noise function is recommended to be either 0.01, 0.3
            or 1.0.
        noise_func (function): Noise generating function which by default
            is given by `Adding Gradient Noise Improves Learning for Very
            Deep Networks <https://arxiv.org/pdf/1511.06807>`_.

    """
    name = 'GradientNoise'

    def __init__(self, eta, noise_func=exponential_decay_noise):
        self.eta = eta
        self.noise_func = noise_func

    def kernel(self):
        # Elementwise GPU kernel computing g += noise.
        return cuda.elementwise(
            'T noise', 'T g', 'g += noise', 'gradient_noise')

    def __call__(self, opt):
        for param in opt.target.params():
            g = param.grad
            xp = cuda.get_array_module(g)
            with cuda.get_device_from_array(g) as dev:
                noise = self.noise_func(xp, g.shape, g.dtype, self, opt)
                if int(dev) == -1:
                    g += noise
                else:
                    self.kernel()(noise, g)
class GradientHardClipping(object):

    """Optimizer hook function for gradient clipping.

    This hook function clips all gradient arrays to be within a lower and
    upper bound.

    Args:
        lower_bound (float): The lower bound of the gradient value.
        upper_bound (float): The upper bound of the gradient value.

    Attributes:
        lower_bound (float): The lower bound of the gradient value.
        upper_bound (float): The upper bound of the gradient value.

    """
    name = 'GradientHardClipping'

    def __init__(self, lower_bound, upper_bound):
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound

    def __call__(self, opt):
        xp = opt.target.xp
        for param in opt.target.params():
            grad = param.grad
            with cuda.get_device_from_array(grad):
                # In-place elementwise clip to [lower_bound, upper_bound].
                xp.clip(grad, self.lower_bound, self.upper_bound, out=grad)