in dlrm_s_caffe2.py [0:0]
def sgd_optimizer(self, learning_rate,
                  T=None, _gradientMap=None, sync_dense_params=True):
    # create the constant-one, iteration and learning-rate tags
    # (or use the ones passed in via T if already present)
    if T is not None:
        (tag_one, tag_it, tag_lr) = T
    else:
        (tag_one, tag_it, tag_lr) = ("const_one", "optim_it", "optim_lr")
    # approach 1: feed values directly
    # self.FeedBlobWrapper(tag_one, np.ones(1).astype(np.float32))
    # self.FeedBlobWrapper(tag_it, np.zeros(1).astype(np.int64))
    # it = self.AddLayerWrapper(self.model.Iter, tag_it, tag_it)
    # lr = self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr,
    #                           base_lr=-1 * learning_rate, policy="fixed")
    # approach 2: use brew
    self.AddLayerWrapper(self.model.param_init_net.ConstantFill,
                         [], tag_one, shape=[1], value=1.0)
    self.AddLayerWrapper(brew.iter, self.model, tag_it)
    self.AddLayerWrapper(self.model.LearningRate, tag_it, tag_lr,
                         base_lr=-1 * learning_rate, policy="fixed")
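    # note: base_lr is negated, so the learning-rate blob holds -learning_rate;
    # the (Scatter)WeightedSum updates below add lr * grad, which therefore
    # performs gradient descent rather than ascent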
    # save the blob shapes for later (only needed if onnx is requested)
    if self.save_onnx:
        self.onnx_tsd[tag_one] = (onnx.TensorProto.FLOAT, (1,))
        self.onnx_tsd[tag_it] = (onnx.TensorProto.INT64, (1,))
    # create gradient maps (or use them if already present)
    if _gradientMap is not None:
        self.gradientMap = _gradientMap
    else:
        if self.loss.__class__ == list:
            self.gradientMap = self.model.AddGradientOperators(self.loss)
        else:
            self.gradientMap = self.model.AddGradientOperators([self.loss])
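    # self.gradientMap maps each parameter blob to its gradient; for the sparse
    # embedding lookups the gradient is a GradientSlice with .indices and
    # .values, which the embedding updates below rely on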
    # update weights
    # approach 1: builtin function
    # optimizer.build_sgd(self.model, base_learning_rate=learning_rate)
    # approach 2: custom code
    # top MLP weight and bias
    for w in self.top_w:
        # allreduce across devices if needed
        if sync_dense_params and self.ndevices > 1:
            grad_blobs = [
                self.gradientMap["gpu_{}/".format(d) + w]
                for d in range(self.ndevices)
            ]
            self.model.NCCLAllreduce(grad_blobs, grad_blobs)
        # update weights
        self.AddLayerWrapper(self.model.WeightedSum,
                             [w, tag_one, "", tag_lr], w, reset_grad=True)
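    # WeightedSum computes w <- 1.0 * w + lr * grad element-wise; since lr is
    # negative this is a plain SGD step. The "" input is a placeholder that the
    # AddLayerWrapper helper appears to fill with this parameter's gradient
    # blob when reset_grad=True.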
    # bottom MLP weight and bias
    for w in self.bot_w:
        # allreduce across devices if needed
        if sync_dense_params and self.ndevices > 1:
            grad_blobs = [
                self.gradientMap["gpu_{}/".format(d) + w]
                for d in range(self.ndevices)
            ]
            self.model.NCCLAllreduce(grad_blobs, grad_blobs)
        # update weights
        self.AddLayerWrapper(self.model.WeightedSum,
                             [w, tag_one, "", tag_lr], w, reset_grad=True)
    # update embeddings
    for i, w in enumerate(self.emb_w):
        # select device
        if self.ndevices > 1:
            d = i % self.ndevices
        # create tags
        on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
        _tag_one = on_device + tag_one
        _tag_lr = on_device + tag_lr
        # pick up the gradient
        w_grad = self.gradientMap[w]
        # update weights
        if self.ndevices > 1:
            with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, d)):
                self.model.ScatterWeightedSum([w, _tag_one, w_grad.indices,
                                               w_grad.values, _tag_lr], w)
        else:
            self.model.ScatterWeightedSum([w, _tag_one, w_grad.indices,
                                           w_grad.values, _tag_lr], w)
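    # ScatterWeightedSum performs a sparse update: only the embedding rows
    # selected by w_grad.indices are updated with w_grad.values scaled by lr,
    # so untouched rows of the (potentially huge) tables are left as-is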
    # update per-sample weights
    if self.weighted_pooling == "learned":
        for i, w in enumerate(self.emb_vw):
            # select device
            if self.ndevices > 1:
                d = i % self.ndevices
            # create tags
            on_device = "" if self.ndevices <= 1 else "gpu_" + str(d) + "/"
            _tag_one = on_device + tag_one
            _tag_lr = on_device + tag_lr
            # pick up the gradient
            w_grad = self.gradientMap[w]
            # update weights
            if self.ndevices > 1:
                with core.DeviceScope(
                    core.DeviceOption(workspace.GpuDeviceType, d)
                ):
                    self.model.ScatterWeightedSum(
                        [w, _tag_one, w_grad.indices,
                         w_grad.values, _tag_lr], w
                    )
            else:
                self.model.ScatterWeightedSum(
                    [w, _tag_one, w_grad.indices, w_grad.values, _tag_lr], w
                )