Source code for chainer.optimizers.adam

import math

import numpy

from chainer import cuda
from chainer import optimizer


class Adam(optimizer.GradientMethod):

    """Adam optimization algorithm.

    See: https://arxiv.org/abs/1412.6980v8

    """

    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

    def init_state(self, param, state):
        # Allocate the first- and second-moment buffers on the same device
        # (CPU or GPU) as the parameter itself.
        xp = cuda.get_array_module(param.data)
        with cuda.get_device_from_array(param.data):
            state['m'] = xp.zeros_like(param.data)
            state['v'] = xp.zeros_like(param.data)

    def update_one_cpu(self, param, state):
        m, v = state['m'], state['v']
        grad = param.grad

        # Exponential moving averages of the gradient and its square,
        # followed by the bias-corrected parameter update.
        m += (1 - self.beta1) * (grad - m)
        v += (1 - self.beta2) * (grad * grad - v)
        param.data -= self.lr * m / (numpy.sqrt(v) + self.eps)

    def update_one_gpu(self, param, state):
        # Same update as update_one_cpu, fused into a single elementwise
        # CUDA kernel.
        cuda.elementwise(
            'T grad, T lr, T one_minus_beta1, T one_minus_beta2, T eps',
            'T param, T m, T v',
            '''m += one_minus_beta1 * (grad - m);
               v += one_minus_beta2 * (grad * grad - v);
               param -= lr * m / (sqrt(v) + eps);''',
            'adam')(param.grad, self.lr, 1 - self.beta1, 1 - self.beta2,
                    self.eps, param.data, state['m'], state['v'])

    @property
    def lr(self):
        # Effective step size with Adam's bias-correction terms folded in:
        # alpha * sqrt(1 - beta2**t) / (1 - beta1**t), where t is the
        # update counter maintained by the GradientMethod base class.
        fix1 = 1. - self.beta1 ** self.t
        fix2 = 1. - self.beta2 ** self.t
        return self.alpha * math.sqrt(fix2) / fix1
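

The update methods above are not called directly; Chainer's GradientMethod base class dispatches to update_one_cpu or update_one_gpu for every parameter after gradients have been computed. Below is a minimal usage sketch under the Chainer v1-style API this file belongs to; the model, data, and loss function are illustrative placeholders, not part of the source above.

import numpy

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizers

# Hypothetical one-layer model and random data, used only to show the flow.
model = L.Linear(3, 2)
adam = optimizers.Adam(alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8)
adam.setup(model)                   # register the model's parameters with the optimizer

x = chainer.Variable(numpy.random.rand(5, 3).astype(numpy.float32))
t = numpy.random.rand(5, 2).astype(numpy.float32)

model.zerograds()                   # clear accumulated gradients (cleargrads() on newer versions)
loss = F.mean_squared_error(model(x), t)
loss.backward()                     # fills param.grad for every parameter
adam.update()                       # runs update_one_cpu / update_one_gpu per parameter

Each call to update() also advances the internal counter t, so the lr property yields a bias-corrected step size that starts near alpha and settles toward it as training proceeds.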