def backward()

in python/singa/autograd.py [0:0]


def backward(y, dy=None):
    """
    Run the backward propagation starting at y.
    Args:
        y: a Tensor instance, usually the loss
        dy: a number or a Tensor instance, for the gradient of the
            objective/loss w.r.t y; usually None, in which case 1.0 is used
    Yields:
        tuples of (parameter tensor, gradient tensor), where a parameter is
            a tensor whose stores_grad attribute is True
    """
    assert isinstance(y, Tensor), "wrong input type."
    op_dep, tensor_dep = infer_dependency(y.creator)
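    # op_dep[op] counts how many gradient contributions op still expects
    # from its downstream (consumer) operations; op can run its backward
    # only when this count drops to 0. tensor_dep[tensor_id] is the
    # analogous per-tensor counter, used below to decide when a parameter
    # gradient is complete and can be yielded.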
    assert y.size() == 1, ("y must be a Tensor with a single value; "
                           "size of y is %d" % y.size())

    # by default dy is the scalar 1.0, i.e. the gradient of y w.r.t itself;
    if dy is None:
        dy = float(1.0)
    elif isinstance(dy, Tensor):
        dy = dy.data
    else:
        dy = float(dy)

    # ready holds (operation, dy list) pairs waiting for backward; entries
    # are appended and popped from the right, so it is processed LIFO
    ready = deque([(y.creator, (dy,))])
    not_ready = {}  # op -> list of partially accumulated output gradients

    if y.stores_grad:
        # if y itself is a parameter, yield its own gradient first
        if isinstance(dy, float):
            # dy was given as a plain number; wrap it and place it on y's device
            g = np.array(dy)
            tg = Tensor(device=y.device, data=g)
        else:
            # dy is already a CTensor; keep it on its own device
            g = dy
            tg = Tensor(device=g.device(), data=g)
        yield (y, tg)

    while len(ready) > 0:
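        # pop an operation whose output gradients are all available, run its
        # backward, and propagate each resulting dx to its source operation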
        op, dys = ready.pop()
        if not op.requires_grad or isinstance(op, Dummy):
            continue
        # dxs holds the gradient w.r.t each input of op, one entry per
        # element of op.src and in the same order
        dxs = op._do_backward(*dys)

        assert len(op.src) == len(dxs), (
            "the number of src ops (=%d) and of dxs (=%d) do not match" %
            (len(op.src), len(dxs)))
        for (src_op, x_id, y, y_stores_grad), dx in zip(op.src, dxs):
            # prefix x is w.r.t op; prefix y is w.r.t src_op (note that this
            # y shadows the outer loss tensor y).
            # x_id is the python id of one input arg of src_op, denoted as x.
            # y_idx (below) is the index of x among the outputs of src_op.
            # not_ready[src_op][y_idx] records the intermediate gradient
            # of the y_idx'th output of src_op. 'intermediate gradient'
            # means that if this output is consumed by multiple child
            # operations, the gradients (dx) from all of them must be added
            # up. When src_op is ready, the gradients of all its outputs are
            # available, i.e. all child operations have run their backward.
            # y is None if y_stores_grad is false; otherwise it is a Tensor.

            if isinstance(src_op, Dummy) and (not src_op.stores_grad):
                continue

            y_idx = src_op.y_id2idx[x_id]
            if src_op not in not_ready:
                # src_op may have multiple outputs
                not_ready[src_op] = [None for _ in src_op.y_id2idx]
                not_ready[src_op][y_idx] = dx
            else:
                dxs_ = not_ready[src_op]
                if dxs_[y_idx] is None:
                    dxs_[y_idx] = dx
                else:
                    # add the gradient from another child operation that
                    # uses the y_idx'th output of src_op as an input arg
                    dxs_[y_idx] += dx

            op_dep[src_op] -= 1
            tensor_dep[x_id] -= 1
            if y_stores_grad and tensor_dep[x_id] == 0:
                # store the gradient for the final yield, e.g. for parameters.
                # the yield may be delayed: it happens only after this output
                # tensor has received gradients from all of its consumer ops
                g = not_ready[src_op][y_idx]
                tg = Tensor(device=g.device(),
                            data=g,
                            name=src_op.grad_name(y_idx))
                yield (y, tg)

            if op_dep[src_op] == 0:
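                # all gradient contributions for src_op's outputs have
                # arrived; hand the accumulated list to src_op as its dys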
                if src_op.requires_grad is True:
                    assert not isinstance(
                        src_op, Dummy), "Dummy op does not do backward()"
                    ready.append((src_op, not_ready[src_op]))
                del not_ready[src_op]
        del op  # drop the reference so op and its saved tensors can be garbage collected
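
For context, a minimal sketch of how this generator is typically driven from a
training loop; the model, loss function and the optimizer's update(param, grad)
method are hypothetical names used for illustration and are not defined in this
file:

# usage sketch; `model`, `x`, `target`, `loss_fn` and `sgd` are hypothetical
from singa import autograd

out = model(x)                       # forward pass built from autograd operations
loss = loss_fn(out, target)          # a single-element Tensor
for p, g in autograd.backward(loss):
    # p is a tensor with stores_grad=True, g its gradient on the same device
    sgd.update(p, g)                 # assumed optimizer API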