in utils/model.py [0:0]
def forward(self, features, poses, homography, poses2, lengths):
    """Decode image feature vectors and generate pose sequences.

    Args:
        features: per-timestep image feature tensor, concatenated along dim 2.
        poses: pose token indices; embedded via ``self.embed``.
        homography: homography tensor (moved to GPU here).
        poses2: second pose stream tensor (moved to GPU here).
        lengths: true sequence lengths for packing.

    Returns:
        Scores over the pose vocabulary for every valid (packed) timestep.
    """
    poses = self.embed(poses)
    # Zero the embedding at t=0 (acts as a start-of-sequence token).
    poses[:, 0, :] = torch.zeros([poses.shape[0], self.embed_size]).cuda().float()
    # Build the LSTM input by concatenating all streams along the feature dim.
    # BUGFIX: the second cat previously restarted from `features`, discarding
    # the pose embeddings computed above; it must extend `embeddings` instead.
    embeddings = torch.cat((poses, features), 2)
    embeddings = torch.cat((embeddings, homography.cuda()), 2)
    embeddings = torch.cat((embeddings, poses2.cuda()), 2)
    # Pack so the LSTM skips padded timesteps (lengths must be sorted desc
    # unless enforce_sorted=False is used — assumed handled by the caller).
    packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
    hiddens, _ = self.lstm(packed)
    # Project packed hidden states to vocabulary scores (one score per word).
    outputs = self.linear(hiddens[0])
    return outputs