in python/singa/autograd.py [0:0]
def backward(y, dy=None):
    """
    Run the backward propagation starting at y.
    Args:
        y: a Tensor instance, usually the loss
        dy: a number or a Tensor instance, for the gradient of the
            objective/loss w.r.t y, usually None, i.e., 1.0
    Yields:
        the parameter (tensor with stores_grad true) and the
        corresponding gradient tensor.
    """
    assert isinstance(y, Tensor), "wrong input type."
    op_dep, tensor_dep = infer_dependency(y.creator)
    assert y.size() == 1, ("y must be a Tensor with a single value; "
                           "size of y is %d" % y.size())
    # by default dy is 1.0, i.e. the gradient of y w.r.t. itself
    if dy is None:
        dy = float(1.0)
    elif isinstance(dy, Tensor):
        dy = dy.data
    else:
        dy = float(dy)
    # ready is a queue of (operation, dy list)
    ready = deque([(y.creator, (dy,))])
    not_ready = {}  # mapping: op -> [dy]
    if y.stores_grad:
        if isinstance(dy, float):
            # dy is a plain float; wrap it in a numpy array and place the
            # gradient tensor on y's device (a numpy array has no device())
            g = np.array(dy)
            tg = Tensor(device=y.device, data=g)
        else:
            # dy is the CTensor taken from the input Tensor
            g = dy
            tg = Tensor(device=g.device(), data=g)
        yield (y, tg)
    while len(ready) > 0:
        op, dys = ready.pop()
        if not op.requires_grad or isinstance(op, Dummy):
            continue
        dxs = op._do_backward(*dys)
        # TODO src and dx must match
        assert len(op.src) == len(dxs), (
            "the number of src ops (=%d) and dxs (=%d) do not match" %
            (len(op.src), len(dxs)))
        for (src_op, x_id, y, y_stores_grad), dx in zip(op.src, dxs):
            # prefix x is w.r.t op; prefix y is w.r.t src_op.
            # x_id is the python id of one input arg of src_op, denoted as x.
            # y_idx (below) is the index of x among the outputs of src_op.
            # not_ready[src_op][y_idx] records the intermediate gradient
            # of the y_idx'th output of src_op. 'intermediate gradient'
            # means that if this output is used by multiple child operations,
            # the gradients (dx) from all of them have to be accumulated.
            # When src_op is ready, the gradients of all its outputs are
            # available, i.e. all child operations have finished their
            # backward pass.
            # y is None if y.stores_grad is false; otherwise it is a Tensor.
            if isinstance(src_op, Dummy) and (not src_op.stores_grad):
                continue
            y_idx = src_op.y_id2idx[x_id]
            if src_op not in not_ready:
                # src_op may have multiple outputs
                not_ready[src_op] = [None for _ in src_op.y_id2idx]
                not_ready[src_op][y_idx] = dx
            else:
                dxs_ = not_ready[src_op]
                if dxs_[y_idx] is None:
                    dxs_[y_idx] = dx
                else:
                    # accumulate the gradient from another child operation
                    # that uses the y_idx'th output of src_op as an input
                    dxs_[y_idx] += dx
            op_dep[src_op] -= 1
            tensor_dep[x_id] -= 1
            if y_stores_grad and tensor_dep[x_id] == 0:
                # store the gradient for the final yield, e.g. for parameters.
                # the yield may be delayed: a gradient is output only after
                # all children of this tensor have contributed to it.
                g = not_ready[src_op][y_idx]
                tg = Tensor(device=g.device(),
                            data=g,
                            name=src_op.grad_name(y_idx))
                yield (y, tg)
            if op_dep[src_op] == 0:
                if src_op.requires_grad is True:
                    assert not isinstance(
                        src_op, Dummy), "Dummy op does not do backward()"
                    ready.append((src_op, not_ready[src_op]))
                del not_ready[src_op]
        del op  # delete the operation to free all tensors from this op
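
# Illustrative usage (not part of the original file): a minimal sketch of how
# the backward() generator above is typically consumed; iterate over the
# (parameter, gradient) pairs it yields and hand them to an optimizer.
# `model`, `x`, `target`, `loss_fn` and `sgd` are hypothetical placeholders;
# the only assumption about `sgd` is that it exposes update(param, grad).
def train_step(model, x, target, loss_fn, sgd):
    loss = loss_fn(model(x), target)  # forward pass builds the autograd graph
    for p, g in backward(loss):       # walk the graph backwards from the loss
        sgd.update(p, g)              # apply each yielded gradient to its param
    return loss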