Source code for chainer.functions.loss.negative_sampling

import numpy
import six

from chainer import cuda
from chainer import function
from chainer.utils import type_check


class NegativeSamplingFunction(function.Function):

    ignore_label = -1  # targets with this value contribute zero loss and no gradient

    def __init__(self, sampler, sample_size, reduce='sum'):
        if reduce not in ('sum', 'no'):
            raise ValueError(
                "only 'sum' and 'no' are valid for 'reduce', but '%s' is "
                'given' % reduce)

        self.sampler = sampler
        self.sample_size = sample_size
        self.reduce = reduce

    def _make_samples(self, t):
        if hasattr(self, 'samples'):
            return self.samples  # for testing

        size = int(t.shape[0])
        # The first column is the positive example; the others are sampled negatives.
        samples = self.sampler((size, self.sample_size + 1))
        samples[:, 0] = t
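        # Illustrative layout (made-up values): with t = [3, 7] and
        # sample_size = 2, ``samples`` could be
        #     [[3, 5, 1],
        #      [7, 0, 9]]
        # where column 0 holds the positive targets and the remaining
        # columns hold negatives drawn from ``sampler``.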
        self.samples = samples

    def check_type_forward(self, in_types):
        type_check.expect(in_types.size() == 3)
        x_type, t_type, w_type = in_types

        type_check.expect(
            x_type.dtype == numpy.float32,
            x_type.ndim == 2,
            t_type.dtype == numpy.int32,
            t_type.ndim == 1,
            x_type.shape[0] == t_type.shape[0],
            w_type.dtype == numpy.float32,
            w_type.ndim == 2,
        )

    def forward_cpu(self, inputs):
        x, t, W = inputs
        self.ignore_mask = (t != self.ignore_label)
        self._make_samples(t)

        loss = numpy.empty(len(x), numpy.float32)
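        # For each row i that is not ignored, f = W[samples[i]].dot(x[i]),
        # the sign of the positive score f[0] is flipped, and the loss is
        #     sum_j log(1 + exp(f_j))
        #   = -log(sigmoid(x_i . w_t)) - sum_n log(sigmoid(-x_i . w_n)),
        # i.e. the negated objective described in the docstring of
        # negative_sampling below.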
        for i, (it, ix, k) in enumerate(
                six.moves.zip(t, x, self.samples)):
            if it == self.ignore_label:
                loss[i] = 0
            else:
                w = W[k]
                f = w.dot(ix)
                f[0] *= -1  # positive sample
                loss[i] = numpy.sum(numpy.logaddexp(f, 0))

        if self.reduce == 'sum':
            loss = numpy.array(loss.sum(), 'f')
        return loss,

    def forward_gpu(self, inputs):
        x, t, W = inputs
        self.ignore_mask = (t != self.ignore_label)
        n_in = x.shape[1]
        self._make_samples(t)

        self.wx = cuda.elementwise(
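            # Computes wx[n, j] = dot(x[n], W[samples[n, j]]); rows whose
            # label is ignore_label yield 0.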
            'raw T W, raw T x, bool mask, S k, int32 c, int32 m', 'T wx',
            '''
            T f = 0;
            if (mask == 1) {
                for (int j = 0; j < c; ++j) {
                  int x_ind[] = {(i / m), j};
                  int w_ind[] = {k, j};
                  f += x[x_ind] * W[w_ind];
                }
            }
            wx = f;
            ''',
            'negative_sampling_wx'
        )(W, x, self.ignore_mask[:, None], self.samples, n_in,
          self.sample_size + 1)

        y = cuda.elementwise(
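            # Per-element loss log(1 + exp(f)), computed with the numerically
            # stable branch; the score of the positive column (i % m == 0) is
            # negated first, matching the CPU path.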
            'T wx, int32 c, int32 m', 'T y',
            '''
            T f = wx;
            if (i % m == 0) {
              f = -f;
            }
            T loss;
            if (f < 0) {
              loss = __logf(1 + __expf(f));
            } else {
              loss = f + __logf(1 + __expf(-f));
            }
            y = loss;
            ''',
            'negative_sampling_forward'
        )(self.wx, n_in, self.sample_size + 1)
        # TODO(okuta): merge elementwise
        loss = y * self.ignore_mask[:, None].astype('float32')
        if self.reduce == 'sum':
            loss = loss.sum()
        else:  # 'no':
            loss = loss.sum(axis=1)
        return loss,

    def backward_cpu(self, inputs, grads):
        x, t, W = inputs
        gloss, = grads

        gx = numpy.zeros_like(x)
        gW = numpy.zeros_like(W)
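        # Only rows whose label is not ignore_label are visited; ignored
        # rows contribute nothing to gx or gW.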
        for i in numpy.arange(len(self.ignore_mask))[self.ignore_mask]:
            ix = x[i]
            k = self.samples[i]
            if self.reduce == 'sum':
                igy = gloss
            else:
                igy = gloss[i]

            w = W[k]
            f = w.dot(ix)

            # g == -y * gloss / (1 + exp(yf))
            f[0] *= -1
            g = igy / (1 + numpy.exp(-f))
            g[0] *= -1

            gx[i] = g.dot(w)
            for ik, ig in six.moves.zip(k, g):
                gW[ik] += ig * ix
        return gx, None, gW

    def backward_gpu(self, inputs, grads):
        cupy = cuda.cupy
        x, t, W = inputs
        gy, = grads

        n_in = x.shape[1]
        if self.reduce == 'no':
            gy = gy[:, None]
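        # g = gy * d(loss)/d(wx) = -y * gy * sigmoid(-y * wx), where y is +1
        # for the positive column (i % m == 0) and -1 otherwise; this is the
        # same quantity as g in backward_cpu.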
        g = cuda.elementwise(
            'T wx, T gy, int32 m', 'T g',
            '''
            T y;
            if (i % m == 0) {
              y = 1;
            } else {
              y = -1;
            }

            g = -y * gy / (1.0f + __expf(wx * y));
            ''',
            'negative_sampling_calculate_g'
        )(self.wx, gy, self.sample_size + 1)
        gx = cupy.zeros_like(x)
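        # gx[n] = sum_j g[n, j] * W[samples[n, j]]; masked rows stay zero.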
        cuda.elementwise(
            'raw T g, raw T W, bool mask, raw S k, int32 c, int32 m', 'T gx',
            '''
            int d = i / c;
            T w = 0;
            if (mask == 1){
                for (int j = 0; j < m; ++j) {
                  w += g[d * m + j] * W[k[d * m + j] * c + i % c];
                }
            }
            gx = w;
            ''',
            'negative_sampling_calculate_gx'
        )(g, W, self.ignore_mask[:, None], self.samples, n_in,
          self.sample_size + 1, gx)
        gW = cupy.zeros_like(W)
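        # Scatter-add g[n, j] * x[n] into row samples[n, j] of gW; atomicAdd
        # is required because different (n, j) pairs can update the same row.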
        cuda.elementwise(
            'T g, raw T x, S k, bool mask, int32 c, int32 m',
            'raw T gW',
            '''
            T gi = g;
            if (mask == 1) {
                for (int j = 0; j < c; ++j) {
                  atomicAdd(&gW[k * c + j], gi * x[(i / m) * c + j]);
                }
            }
            ''',
            'negative_sampling_calculate_gw'
        )(g, x, self.samples, self.ignore_mask[:, None], n_in,
          self.sample_size + 1, gW)
        return gx, None, gW


def negative_sampling(x, t, W, sampler, sample_size, reduce='sum'):
    """Negative sampling loss function.

    In natural language processing, especially language modeling, the number
    of words in a vocabulary can be very large. Therefore, you need to spend
    a lot of time calculating the gradient of the embedding matrix.

    By using the negative sampling trick you only need to calculate the
    gradient for a few sampled negative examples.

    The objective function is below:

    .. math::

       f(x, p) = \\log \\sigma(x^\\top w_p) + \\
       k E_{i \\sim P(i)}[\\log \\sigma(- x^\\top w_i)],

    where :math:`\\sigma(\\cdot)` is a sigmoid function, :math:`w_i` is the
    weight vector for the word :math:`i`, and :math:`p` is a positive
    example. It is approximated with :math:`k` examples :math:`N` sampled
    from probability :math:`P(i)`, like this:

    .. math::

       f(x, p) \\approx \\log \\sigma(x^\\top w_p) + \\
       \\sum_{n \\in N} \\log \\sigma(-x^\\top w_n).

    Each sample of :math:`N` is drawn from the word distribution
    :math:`P(w)`. This is calculated as
    :math:`P(w) = \\frac{1}{Z} c(w)^\\alpha`, where :math:`c(w)` is the
    unigram count of the word :math:`w`, :math:`\\alpha` is a
    hyper-parameter, and :math:`Z` is the normalization constant.

    Args:
        x (~chainer.Variable): Batch of input vectors.
        t (~chainer.Variable): Vector of ground truth labels.
        W (~chainer.Variable): Weight matrix.
        sampler (~types.FunctionType): Sampling function. It takes a shape
            and returns an integer array of the shape. Each element of this
            array is a sample from the word distribution.
            A :class:`~chainer.utils.WalkerAlias` object built with the power
            distribution of word frequency is recommended.
        sample_size (int): Number of samples.
        reduce (str): Reduction option. Its value must be either ``'sum'``
            or ``'no'``. Otherwise, :class:`ValueError` is raised.

    Returns:
        ~chainer.Variable:
            A variable holding the loss value(s) calculated by the above
            equation. If ``reduce`` is ``'no'``, the output variable holds
            an array whose shape is the same as one of (hence both of) the
            input variables. If it is ``'sum'``, the output variable holds
            a scalar value.

    See: `Distributed Representations of Words and Phrases and their\
         Compositionality <https://arxiv.org/abs/1310.4546>`_

    .. seealso:: :class:`~chainer.links.NegativeSampling`.

    """
    return NegativeSamplingFunction(sampler, sample_size, reduce)(x, t, W)
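

# A minimal usage sketch (not part of the original module). It assumes a toy
# vocabulary of 10 words, 3-dimensional input vectors, and a
# ``chainer.utils.WalkerAlias`` sampler built from unigram counts raised to
# an example exponent of 0.75, as recommended in the docstring above. All
# shapes and values below are made up for illustration only.
if __name__ == '__main__':
    from chainer import utils

    counts = numpy.array([5, 4, 3, 2, 1, 1, 1, 1, 1, 1], numpy.float32)
    sampler = utils.WalkerAlias(counts ** 0.75).sample

    x = numpy.random.rand(2, 3).astype(numpy.float32)    # batch of inputs
    t = numpy.array([0, 2], numpy.int32)                  # positive word ids
    W = numpy.random.rand(10, 3).astype(numpy.float32)    # embedding matrix

    loss = negative_sampling(x, t, W, sampler, sample_size=5)
    print(loss.data)  # scalar, because ``reduce`` defaults to 'sum'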