In iep/models/baselines.py (CnnLstmSaModel.__init__):
def __init__(self, vocab,
             rnn_wordvec_dim=300, rnn_dim=256, rnn_num_layers=2, rnn_dropout=0,
             cnn_feat_dim=(1024, 14, 14),
             stacked_attn_dim=512, num_stacked_attn=2,
             fc_use_batchnorm=False, fc_dropout=0, fc_dims=(1024,)):
  super(CnnLstmSaModel, self).__init__()

  # LSTM encoder that embeds the question tokens and summarizes the question
  # as a single vector of width rnn_dim.
  rnn_kwargs = {
    'token_to_idx': vocab['question_token_to_idx'],
    'wordvec_dim': rnn_wordvec_dim,
    'rnn_dim': rnn_dim,
    'rnn_num_layers': rnn_num_layers,
    'rnn_dropout': rnn_dropout,
  }
  self.rnn = LstmEncoder(**rnn_kwargs)

  # 1x1 convolution projecting the C x H x W CNN feature map down to
  # rnn_dim channels so it can be fused with the question encoding.
  C, H, W = cnn_feat_dim
  self.image_proj = nn.Conv2d(C, rnn_dim, kernel_size=1, padding=0)

  # Stacked attention layers; each is registered with add_module so its
  # parameters are tracked even though the layers live in a plain list.
  self.stacked_attns = []
  for i in range(num_stacked_attn):
    sa = StackedAttention(rnn_dim, stacked_attn_dim)
    self.stacked_attns.append(sa)
    self.add_module('stacked-attn-%d' % i, sa)

  # MLP head mapping the attended feature to one logit per answer token.
  classifier_args = {
    'input_dim': rnn_dim,
    'hidden_dims': fc_dims,
    'output_dim': len(vocab['answer_token_to_idx']),
    'use_batchnorm': fc_use_batchnorm,
    'dropout': fc_dropout,
  }
  self.classifier = build_mlp(**classifier_args)
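
For orientation, a minimal construction sketch follows. The toy vocab contents are hypothetical; the only assumptions carried over from the code above are that vocab provides 'question_token_to_idx' and 'answer_token_to_idx' mappings and that LstmEncoder, StackedAttention, and build_mlp are available when the module is imported.

# Hypothetical usage sketch (not part of baselines.py); assumes the iep
# package is importable and the helpers used by __init__ exist there.
from iep.models.baselines import CnnLstmSaModel

# Minimal toy vocab with the two mappings the constructor reads.
vocab = {
  'question_token_to_idx': {'<NULL>': 0, 'what': 1, 'color': 2, 'is': 3},
  'answer_token_to_idx': {'red': 0, 'green': 1, 'blue': 2},
}

model = CnnLstmSaModel(vocab)  # all other hyperparameters keep their defaults

# Because each StackedAttention was registered via add_module, its weights
# appear in model.parameters() and will be seen by an optimizer.
num_params = sum(p.numel() for p in model.parameters())
print(num_params)

Registering the attention layers with add_module gives the same parameter tracking that storing them in an nn.ModuleList would provide; only the storage container differs.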