def bs_adam()

in optimizer.py [0:0]


# Assumed context (not shown in this excerpt): the OpenAI blocksparse package,
# imported as `bs`, and an mpi_rank() helper returning this process's MPI rank.
import blocksparse as bs


def bs_adam(grads, variables, learning_rate, beta2=0.999,
            grad_scale=1.0, max_grad_norm=1.0,
            fp16_mean_var=True, static_loss_scaling=False, **kwargs):

    # Set clip_norm to a huge value to disable clipping while still computing the
    # global norm; that norm is also used for dynamic loss scaling.
    if not max_grad_norm:
        max_grad_norm = 9e9

    if static_loss_scaling:
        # Static loss scaling: clip, saturate to the fp16 max, and zero NaNs in one call.
        global_norm, norm_scale = bs.clip_by_global_norm(grads,
                                                         grad_scale=grad_scale,
                                                         clip_norm=max_grad_norm,
                                                         saturate=65504.0,
                                                         zero_nans=True)
    else:
        # First compute the raw global norm (which may be NaN/Inf) so overflow can be
        # detected for dynamic loss scaling; the clipping scale is recomputed below.
        global_norm, _ = bs.clip_by_global_norm(grads,
                                                grad_scale=grad_scale,
                                                clip_norm=max_grad_norm)
        # Zero out Infs and NaNs so the gradients (and the recomputed norm) stay finite.
        grads = [bs.filter_tensor(g, zero_infs=True, zero_nans=True) for g in grads]
        _, norm_scale = bs.clip_by_global_norm(grads,
                                               grad_scale=grad_scale,
                                               clip_norm=max_grad_norm)

    adam = bs.AdamOptimizer(
        learning_rate=learning_rate,
        beta2=beta2,
        norm_scale=norm_scale,                 # per-step scale from global-norm clipping
        grad_scale=grad_scale,                 # scale applied to incoming gradients (e.g. for loss scaling)
        fp16=fp16_mean_var,                    # keep the Adam mean/variance state in fp16
        zero_init_variables=mpi_rank() != 0,   # zero-init optimizer state on non-root MPI ranks
        saturate=65504.0, zero_nans=True)      # clamp at the fp16 max and flush NaNs to zero

    return adam.apply_gradients(zip(grads, variables)), global_norm
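
A minimal usage sketch, not part of the original file: it assumes TensorFlow 1.x graph
mode with blocksparse installed, and `build_model_loss` plus the hyperparameter values
are hypothetical placeholders.

# Usage sketch only; `build_model_loss` is a hypothetical helper returning a scalar loss.
import tensorflow as tf

loss = build_model_loss()            # hypothetical: scalar training loss
params = tf.trainable_variables()
grads = tf.gradients(loss, params)

# Returns the Adam update op and the pre-clip global gradient norm.
train_op, global_norm = bs_adam(grads, params, learning_rate=1e-4,
                                max_grad_norm=1.0)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, gnorm = sess.run([train_op, global_norm])

With the default static_loss_scaling=False, the returned global_norm is computed from
the unfiltered gradients, so a training loop can watch it for NaN/Inf to drive dynamic
loss scaling.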