def adagrad_optimizer()

in benchmarks/dlrm/ootb/dlrm_s_caffe2.py [0:0]


    def adagrad_optimizer(self, learning_rate,
                        T=None, _gradientMap=None, sync_dense_params=True,
                        epsilon=1e-10, decay_=0.0, weight_decay_=0.0):
        # create one, it and lr tags (or use them if already present)
        if T is not None:
            (tag_one, tag_it, tag_lr) = T
        else:
            (tag_one, tag_it, tag_lr) = ("const_one", "optim_it", "optim_lr")

            # approach 1: feed values directly
            # self.FeedBlobWrapper(tag_one, np.ones(1).astype(np.float32))
            # self.FeedBlobWrapper(tag_it, np.zeros(1).astype(np.int64))
            # it = self.AddLayerWrapper(self.model.Iter, tag_it, tag_it)
            # lr = self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr,
            #                           base_lr=-1 * learning_rate, policy="fixed")
            # approach 2: use brew
            self.AddLayerWrapper(self.model.param_init_net.ConstantFill,
                                 [], tag_one, shape=[1], value=1.0)
            self.AddLayerWrapper(brew.iter, self.model, tag_it)
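            # note: base_lr is negated because Caffe2's update ops add lr * grad
            # to the parameter, so a negative rate yields gradient descent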
            self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr,
                                 base_lr=-1 * learning_rate, policy="fixed")
            # save the blob shapes for later (only needed if onnx is requested)
            if self.save_onnx:
                self.onnx_tsd[tag_one] = (onnx.TensorProto.FLOAT, (1,))
                self.onnx_tsd[tag_it] = (onnx.TensorProto.INT64, (1,))

        # create gradient maps (or use them if already present)
        if _gradientMap is not None:
            self.gradientMap = _gradientMap
        else:
            if self.loss.__class__ == list:
                self.gradientMap = self.model.AddGradientOperators(self.loss)
            else:
                self.gradientMap = self.model.AddGradientOperators([self.loss])

        # update weights
        # approach 1: builtin function
        # optimizer.build_sgd(self.model, base_learning_rate=learning_rate)
        # approach 2: custom code
        # top MLP weight and bias
        for i, w in enumerate(self.top_w):
            # allreduce across devices if needed
            if sync_dense_params and self.ndevices > 1:
                grad_blobs = [
                    self.gradientMap["gpu_{}/".format(d) + w]
                    for d in range(self.ndevices)
                ]
                self.model.NCCLAllreduce(grad_blobs, grad_blobs)
            # update weights
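            # Caffe2's Adagrad op takes [param, moment, grad, lr] as inputs and
            # writes the updated [param, moment] back in place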
            self.model.Adagrad(
                [
                    w,
                    "momentum_mlp_top_{}".format(i + 1),
                    self.gradientMap[w],
                    tag_lr
                ],
                [w, "momentum_mlp_top_{}".format(i + 1)],
                epsilon=epsilon,
                decay_=decay_,
                weight_decay_=weight_decay_
            )

        # bottom MLP weight and bias
        for i, w in enumerate(self.bot_w):
            # allreduce across devices if needed
            if sync_dense_params and self.ndevices > 1:
                grad_blobs = [
                    self.gradientMap["gpu_{}/".format(d) + w]
                    for d in range(self.ndevices)
                ]
                self.model.NCCLAllreduce(grad_blobs, grad_blobs)
            # update weights
            self.model.Adagrad(
                [
                    w,
                    "momentum_mlp_bot_{}".format(i + 1),
                    self.gradientMap[w],
                    tag_lr
                ],
                [w, "momentum_mlp_bot_{}".format(i + 1)],
                epsilon=epsilon,
                decay_=decay_,
                weight_decay_=weight_decay_
            )

        # update embeddings
        for i, w in enumerate(self.emb_w):
            # select device
            if self.ndevices > 1:
                d = i % self.ndevices
            # create tags
            on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
            _tag_one = on_device + tag_one
            _tag_lr = on_device + tag_lr
            # pick up gradient
            w_grad = self.gradientMap[w]
            # update weights
            def add_optimizer():
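                # deduplicate the sparse gradient first: Unique yields the distinct
                # indices plus a remapping, and UnsortedSegmentSum folds gradient
                # rows that share an index into one row per unique index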
                self.model.Unique(
                    w_grad.indices,
                    ["unique_w_grad_indices", "remapping_w_grad_indices"]
                )
                self.model.UnsortedSegmentSum(
                    [w_grad.values, "remapping_w_grad_indices"],
                    "unique_w_grad_values"
                )

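                # SparseAdagrad keeps one moment entry per parameter element;
                # RowWiseSparseAdagrad keeps a single moment value per embedding
                # row, which shrinks the optimizer state considerably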
                if self.emb_optimizer == "adagrad":
                    self.model.SparseAdagrad(
                        [
                            w,
                            "momentum_emb_{}".format(i),
                            "unique_w_grad_indices",
                            "unique_w_grad_values",
                            _tag_lr
                        ],
                        [w, "momentum_emb_{}".format(i)],
                        epsilon=epsilon,
                        decay_=decay_,
                        weight_decay_=weight_decay_
                    )

                elif self.emb_optimizer == "rwsadagrad":
                    self.model.RowWiseSparseAdagrad(
                        [
                            w,
                            "momentum_emb_{}".format(i),
                            "unique_w_grad_indices",
                            "unique_w_grad_values",
                            _tag_lr
                        ],
                        [w, "momentum_emb_{}".format(i)],
                        epsilon=epsilon,
                        decay_=decay_,
                        weight_decay_=weight_decay_
                    )

            if self.ndevices > 1:
                with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)):
                    add_optimizer()
            else:
                add_optimizer()

        # update per sample weights
        if self.weighted_pooling == "learned":
            for i, w in enumerate(self.emb_vw):
                # select device
                if self.ndevices > 1:
                    d = i % self.ndevices
                # create tags
                on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
                _tag_one = on_device + tag_one
                _tag_lr = on_device + tag_lr
                # pick up gradient
                w_grad = self.gradientMap[w]
                # update weights
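                # ScatterWeightedSum computes w[idx] = 1.0 * w[idx] + lr * grad;
                # tag_lr already holds the negated learning rate, so this is a
                # plain sparse SGD step on the learned per-sample weights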
                if self.ndevices > 1:
                    with core.DeviceScope(
                        core.DeviceOption(workspace.GpuDeviceType, d)
                    ):
                        self.model.ScatterWeightedSum(
                            [w, _tag_one, w_grad.indices,
                            w_grad.values, _tag_lr], w
                        )
                else:
                    self.model.ScatterWeightedSum(
                        [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                    )
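
For context, a minimal call-site sketch; the `dlrm` instance name and the hyper-parameter values below are illustrative assumptions, not taken from the file:

    # hypothetical call site: once the network and loss have been built,
    # attach the Adagrad optimizer; with T=None the helper creates its own
    # const_one / optim_it / optim_lr blobs
    dlrm.adagrad_optimizer(
        0.01,                    # learning_rate (illustrative value)
        sync_dense_params=True,  # allreduce dense gradients across devices
        epsilon=1e-10,
    )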