def __init__()

in models_vd/decoder.py [0:0]


  def __init__(self, inputs, output_pool, params):
    """Initialize answer decoder.

    Args:
      inputs:
      output_pool:
      params:
    """

    self.params = params

    # keep track of inputs and outputs
    used_inputs = []
    outputs = {}

    # alias for criterion
    criterion = tf.nn.sparse_softmax_cross_entropy_with_logits
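    # (takes integer class labels and returns an elementwise loss with the
    # same shape as the labels)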

    # begin decoding
    with tf.variable_scope(self.params['embed_scope'], reuse=True):
      # reuse the shared word embedding matrix
      # (shape: [text_vocab_size, text_embed_size])
      embed_mat = tf.get_variable('embed_mat')

    output = tf.nn.embedding_lookup(embed_mat, inputs['ans_in'])
    used_inputs.extend(['ans_in', 'ans_out', 'ans_len'])
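    # shapes inferred from usage, not asserted here: 'ans_in' holds token ids
    # of shape [batch, max_ans_len], so the lookup produces the decoder input
    # of shape [batch, max_ans_len, text_embed_size]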

    # recurrent neural network cell
    cell = tf.contrib.rnn.BasicLSTMCell(params['lstm_size'])

    # decide the source based on train / evaluation
    source = output_pool if params['train_mode'] else inputs
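    # at train time the context and encoder states come from the rest of the
    # model (output_pool); at evaluation time they are fed in via inputs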

    # collect the vectors that initialize the decoder
    concat_list = []
    # add program context vector
    concat_list.append(source['context'])
    # add the hidden state of the last encoder-decoder layer
    concat_list.append(source['enc_dec_h'][-1])
    used_inputs.extend(['enc_dec_h', 'enc_dec_c'])

    if not params['train_mode']:
      used_inputs.append('context')
    #--------------------------------------------------------------------------

    # stack all the vectors
    stack_vec = tf.concat(concat_list, axis=1)
    stack_vec = FC(stack_vec, params['lstm_size'])
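    # the FC projection maps the concatenation down to lstm_size so that it
    # can stand in for the top layer's hidden state below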

    # construct the decoder's initial hidden states (H): keep the lower
    # layers' states and replace the top layer's with the fused stack_vec
    enc_dec_h = [source['enc_dec_h'][ii]
                 for ii in range(params['num_layers'] - 1)]
    enc_dec_h.append(stack_vec)
    # construct the decoder's initial cell states (C), copied as-is
    enc_dec_c = [source['enc_dec_c'][ii] for ii in range(params['num_layers'])]
    init_state = [tf.contrib.rnn.LSTMStateTuple(cc, hh)
                  for cc, hh in zip(enc_dec_c, enc_dec_h)]
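    # note: LSTMStateTuple stores (c, h) in that order; the top layer's h is
    # the fused stack_vec, so decoding starts from the program context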

    if params['decoder'] == 'gen':
      for ii in range(params['num_layers']):
        # dynamic rnn
        output, _ = tf.nn.dynamic_rnn(cell, output,
                                      sequence_length=inputs['ans_len'],
                                      initial_state=init_state[ii],
                                      dtype=tf.float32, scope='layer_%d' % ii)
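
      # each layer runs under its own variable scope ('layer_%d') and starts
      # from the matching initial state, so the top layer is seeded with the
      # fused stack_vec computed above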

      # predict the output words
      output = FC(output, params['text_vocab_size'], activation_fn=None)
      # create a mask
      mask = tf.not_equal(inputs['ans_out'], params['pad_id'])
      mask = tf.cast(mask, tf.float32)
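      # e.g. ans_out = [[4, 9, pad, pad]] gives mask = [[1., 1., 0., 0.]],
      # so losses at padded positions are zeroed out below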

      # multiply by mask for variable length sequences
      answer_loss = criterion(logits=output, labels=inputs['ans_out'])
      masked_answer_loss = tf.multiply(answer_loss, mask)
      total_token_loss = tf.reduce_sum(masked_answer_loss)
      num_tokens = tf.maximum(tf.reduce_sum(mask), 1)

      outputs['ans_token_loss'] = total_token_loss / num_tokens
      outputs['per_sample_loss'] = tf.reduce_sum(masked_answer_loss, 1)

      # extract the log-probabilities of the ground-truth tokens
      out_softmax = tf.nn.log_softmax(output)
      out_softmax_flat = tf.reshape(out_softmax, [-1, params['text_vocab_size']])
      orig_shape = tf.shape(inputs['ans_out'])
      ans_out_flat = tf.reshape(inputs['ans_out'], [-1])
      inds = [tf.range(0, tf.shape(ans_out_flat)[0]), ans_out_flat]
      inds = tf.stack(inds, axis=1)
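      # inds pairs each flattened position with its ground-truth token id,
      # e.g. [[0, w_0], [1, w_1], ...]; gather_nd then picks the
      # log-probability assigned to the correct token at every step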

      prob_tokens = tf.gather_nd(out_softmax_flat, inds)
      prob_tokens = tf.reshape(prob_tokens, orig_shape)
      prob_tokens = tf.multiply(prob_tokens, mask)
      # compute the loglikelihood
      outputs['llh'] = tf.reduce_sum(prob_tokens, 1)
      # also report the log-likelihood normalized by answer length
      num_tokens = tf.maximum(tf.reduce_sum(mask, 1), 1)
      outputs['llh_mean'] = outputs['llh'] / num_tokens

    elif params['decoder'] == 'disc':
      # embed options and encode via lstm
      with tf.variable_scope(self.params['embed_scope'], reuse=True):
        # reuse the same shared embedding matrix for the options
        embed_mat = tf.get_variable('embed_mat')
      opt_embed = tf.nn.embedding_lookup(embed_mat, inputs['opt'])

      # transpose, then merge the batch and option dimensions
      opt_embed = tf.transpose(opt_embed, [0, 2, 1, 3])
      shape = opt_embed.shape.as_list()
      opt_embed = tf.reshape(opt_embed, [-1, shape[2], shape[3]])

      opt_len = tf.reshape(inputs['opt_len'], [-1])
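      # inferred shapes (not asserted in this code): opt_embed goes from
      # [batch, opt_len, num_options, embed_size] to
      # [batch * num_options, opt_len, embed_size], so each candidate answer
      # is encoded as its own sequence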

      output, _ = tf.nn.dynamic_rnn(cell, opt_embed,
                                    sequence_length=opt_len,
                                    dtype=tf.float32, scope='opt_layer_0')
      for ii in range(1, params['num_layers']):
        # dynamic rnn
        output, _ = tf.nn.dynamic_rnn(cell, output,
                                      sequence_length=opt_len,
                                      dtype=tf.float32,
                                      scope='opt_layer_%d' % ii)

      # pick the last relevant LSTM output for each option sequence
      opt_encode = support.last_relevant(output, opt_len)
      # reshape back to [batch, num_options, lstm_size]
      opt_encode = tf.reshape(opt_encode, [-1, shape[1], params['lstm_size']])

      # score the options with context vector
      score_vec = tf.matmul(opt_encode, tf.expand_dims(stack_vec, -1))
      score_vec = tf.squeeze(score_vec, -1)
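      # each option is scored by a dot product between its LSTM encoding and
      # the fused context vector; cross-entropy against gt_ind trains the
      # model to rank the ground-truth option highest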
      option_loss = criterion(logits=score_vec, labels=inputs['gt_ind'])
      outputs['ans_token_loss'] = tf.reduce_mean(option_loss)
      outputs['scores'] = score_vec

      used_inputs.extend(['opt', 'opt_len', 'gt_ind'])

    # setup the inputs and outputs
    self.outputs = outputs
    self.inputs = {ii: inputs[ii] for ii in used_inputs}
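
Usage sketch (not from the repository): the class name AnswerDecoder, the
placeholder shapes, and the hyperparameter values below are assumptions
inferred from how this constructor reads its arguments, and the sketch
assumes the surrounding module (FC, support) in models_vd/decoder.py is
importable.

# hypothetical driver for the constructor above; evaluation mode, so all
# context tensors are fed through `inputs` and `output_pool` goes unused
import tensorflow as tf

params = {
    'text_vocab_size': 1000,   # assumed vocabulary size
    'text_embed_size': 300,    # assumed embedding width
    'lstm_size': 512,          # assumed LSTM width
    'num_layers': 2,           # assumed decoder depth
    'pad_id': 0,               # assumed padding token id
    'decoder': 'gen',          # use the generative branch
    'train_mode': False,       # evaluation: read context from `inputs`
    'embed_scope': 'embed',    # scope that already holds 'embed_mat'
}

# the shared embedding matrix must exist before the decoder can reuse it
with tf.variable_scope(params['embed_scope']):
  tf.get_variable('embed_mat',
                  [params['text_vocab_size'], params['text_embed_size']])

state_shape = [None, params['lstm_size']]
inputs = {
    'ans_in': tf.placeholder(tf.int32, [None, None]),
    'ans_out': tf.placeholder(tf.int32, [None, None]),
    'ans_len': tf.placeholder(tf.int32, [None]),
    'context': tf.placeholder(tf.float32, state_shape),
    'enc_dec_h': [tf.placeholder(tf.float32, state_shape)
                  for _ in range(params['num_layers'])],
    'enc_dec_c': [tf.placeholder(tf.float32, state_shape)
                  for _ in range(params['num_layers'])],
}

decoder = AnswerDecoder(inputs, output_pool={}, params=params)
loss = decoder.outputs['ans_token_loss']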