models_mnist/modules.py [487:660]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        norms = tf.einsum('ijkl->i', att_grid)
        norms = tf.reshape(norms, [-1, 1, 1, 1])
        #norms = tf.tile(tf.reshape(norms, [-1, 1, 1, 1]), [1, H, W, 1])
        # NOTE: clip the norm to avoid dividing by a (near-)zero attention sum
        # (see the sketch after this block)
        norms = tf.clip_by_value(norms, 1e-6, 1e6)
        att_grid = att_grid / norms

    return [att_grid]
#------------------------------------------------------------------------------
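
# NOTE (illustrative sketch, not part of the original module): the normalization
# above divides each example's attention grid by its total mass;
# tf.einsum('ijkl->i', att_grid) is just the per-example sum over the H, W and
# channel axes, and the clip guards against dividing by a near-zero sum. The
# hypothetical helper below shows the same computation with a plain reduce_sum,
# assuming att_grid has shape [N, H, W, 1].
def _normalize_attention_sketch(att_grid):
  # per-example attention mass, kept broadcastable against [N, H, W, 1]
  norms = tf.reduce_sum(att_grid, axis=[1, 2, 3], keep_dims=True)
  # clip tiny (or huge) sums before dividing, mirroring the module code above
  norms = tf.clip_by_value(norms, 1e-6, 1e6)
  return att_grid / norms
#------------------------------------------------------------------------------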

class InvalidLoomOp(loom.LoomOp):
  """
    Mapping: returns a context of zeros
    Output:
     context: [N, encode_size] of zeros

    Implementation:
     Take the elementwise-min

    Parameters typically contain:
      map_dim = 1024
      module_scope = find_module
      reuse = True
      scope
  """
  def __init__(self, in_types, out_types, params):
    self._params = params
    self._scope = params.get('scope', 'invalid_module')
    self._module_scope = params['module_scope']
    self._reuse = params.get('reuse', None)
    super(InvalidLoomOp, self).__init__(in_types, out_types)

  def instantiate_batch(self, inputs):
    """
      Inputs:
        visual attention outputs
        time id for current module
    """
    img_feat = inputs
    encode_size = self._params['encode_size']

    with tf.variable_scope(self._module_scope):
      with tf.variable_scope(self._scope, reuse=self._reuse):
        N = tf.shape(img_feat)[0]
        context = tf.zeros([N, encode_size], tf.float32)

    return [context]
#------------------------------------------------------------------------------

class DescribeLoomOp(loom.LoomOp):
  """
  Mapping: att_grid -> context vector
  Input:
    input_0: [N, H, W, 1]
  Output:
    answer_scores: [N, outputSize]

  Implementation:
  1. Extract visual features using the input attention map, and
  linear transform to map_dim
  2. linear transform language features to map_dim
  3. Element-wise multiplication of the two, l2_normalize, linear transform.
  """
  def __init__(self, in_types, out_types, params):
    self._params = params
    self._scope = params.get('scope', 'describe_module')
    self._module_scope = params['module_scope']
    self._reuse = params.get('reuse', None)
    super(DescribeLoomOp, self).__init__(in_types, out_types)

  def instantiate_batch(self, inputs):
    """
    Inputs:
      output from the previous modules
      image feature for the example
      text attention for all modules for the example
      time id for current module
    """
    vis_att, img_feat, text_att = inputs

    # text feature dimension, intermediate mapping dimension
    # batch size, image feature height and width
    text_dim = text_att.shape.as_list()[-1]
    map_dim = self._params['map_dim']
    encode_size = self._params['encode_size']
    N = tf.shape(img_feat)[0]
    H, W = img_feat.shape.as_list()[1:3]

    with tf.variable_scope(self._module_scope):
      with tf.variable_scope(self._scope, reuse=self._reuse):
        text_map = fc('fc_text', text_att, output_dim=map_dim)
        # nonlinearity
        text_map = tf.nn.relu(text_map)

        # att_feats has shape [N, D_vis] (attention-weighted pooling of img_feat)
        att_feats = tf.reduce_sum(img_feat * vis_att, axis=[1, 2])
        img_map = tf.reshape(fc('fc_att', att_feats, output_dim=map_dim),
                  [N, map_dim])
        # nonlinearity
        img_map = tf.nn.relu(img_map)

        eltwise_mult = tf.nn.l2_normalize(img_map * text_map, 1)
        context = fc('fc_eltwise', eltwise_mult, output_dim=encode_size)


    return [context]
#------------------------------------------------------------------------------
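
# NOTE (illustrative sketch, not part of the original module): DescribeLoomOp
# pools the image feature map with the incoming attention and fuses the result
# with the text feature. The hypothetical helper below spells out the same
# steps with tf.layers.dense standing in for this repo's fc() helper; map_dim
# and encode_size are assumed hyperparameters.
def _describe_sketch(vis_att, img_feat, text_att, map_dim, encode_size):
  # vis_att: [N, H, W, 1], img_feat: [N, H, W, D_img], text_att: [N, D_txt]
  # 1. attention-weighted pooling of the image feature map -> [N, D_img]
  att_feats = tf.reduce_sum(img_feat * vis_att, axis=[1, 2])
  img_map = tf.nn.relu(tf.layers.dense(att_feats, map_dim, name='fc_att'))
  # 2. map the text feature to the same space -> [N, map_dim]
  text_map = tf.nn.relu(tf.layers.dense(text_att, map_dim, name='fc_text'))
  # 3. multiplicative fusion, l2-normalize, project to the context size
  eltwise_mult = tf.nn.l2_normalize(img_map * text_map, 1)
  return tf.layers.dense(eltwise_mult, encode_size, name='fc_eltwise')
#------------------------------------------------------------------------------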

class TransformLoomOp(loom.LoomOp):
  """
  Mapping: att_grid x text_param -> att_grid
  Input:
   input_0: [N, H, W, 1]
   text_param: [N, D_txt]
  Output:
   att_grid: [N, H, W, 1]

  Implementation:
   1. Extract visual features using the input attention map, and
    linear transform to map_dim
   2. linear transform language features to map_dim
   3. Convolve image features to map_dim
   4. Element-wise multiplication of the three, l2_normalize, linear transform.
  """
  def __init__(self, in_types, out_types, params):
    self._params = params
    self._scope = params.get('scope', 'transform_module')
    self._module_scope = params['module_scope']
    self._reuse = params.get('reuse', None)
    super(TransformLoomOp, self).__init__(in_types, out_types)

  def instantiate_batch(self, inputs):
    """
    Inputs:
      output from the previous modules
      image feature for the example
      text attention for all modules for the example
      time id for current module
    """
    vis_att, img_feat, text_att = inputs

    # text feature dimension, intermediate mapping dimension
    # batch size, image feature height and width
    text_dim = text_att.shape.as_list()[-1]
    map_dim = self._params['map_dim']
    encode_size = self._params['encode_size']
    N = tf.shape(img_feat)[0]
    H, W = img_feat.shape.as_list()[1:3]

    with tf.variable_scope(self._module_scope):
      with tf.variable_scope(self._scope, reuse=self._reuse):
        # image_feat_mapped has shape [N, H, W, map_dim]
        img_map = _1x1_conv('conv_image', img_feat, output_dim=map_dim)
        # nonlinearity
        img_map = tf.nn.relu(img_map)

        text_map = fc('fc_text', text_att, output_dim=map_dim)
        text_map = tf.reshape(text_map, [-1, 1, 1, map_dim])
        # nonlinearity
        text_map = tf.nn.relu(text_map)

        att_feats = tf.reduce_sum(img_feat * vis_att, axis=[1, 2])
        att_map = tf.reshape(fc('fc_att', att_feats, output_dim=map_dim),
                  [N, 1, 1, map_dim])

        # interact via element wise map
        eltwise_mult = tf.nn.l2_normalize(img_map * text_map * att_map, 3)
        att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

        # softmax
        att_grid_soft = tf.nn.softmax(tf.reshape(att_grid, [-1, H*W]))
        att_grid = tf.reshape(att_grid_soft, [-1, H, W, 1])

    return [att_grid]
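#------------------------------------------------------------------------------

# NOTE (illustrative sketch, not part of the original module): the final step of
# TransformLoomOp turns the fused [N, H, W, map_dim] map back into an attention
# grid: a 1x1 convolution produces one logit per spatial location, and a
# softmax over the flattened H*W grid re-normalizes the attention. The
# hypothetical helper below shows that step with tf.layers.conv2d standing in
# for this repo's _1x1_conv helper.
def _reattend_sketch(eltwise_mult, H, W):
  # eltwise_mult: [N, H, W, map_dim] fused feature map
  logits = tf.layers.conv2d(eltwise_mult, filters=1, kernel_size=1,
                            name='conv_eltwise')  # -> [N, H, W, 1]
  att_soft = tf.nn.softmax(tf.reshape(logits, [-1, H * W]))
  # each example's attention grid now sums to 1
  return tf.reshape(att_soft, [-1, H, W, 1])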
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



models_vd/modules.py [338:511]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        norms = tf.einsum('ijkl->i', att_grid)
        norms = tf.reshape(norms, [-1, 1, 1, 1])
        #norms = tf.tile(tf.reshape(norms, [-1, 1, 1, 1]), [1, H, W, 1])
        # NOTE: clip the norm to avoid dividing by a (near-)zero attention sum
        norms = tf.clip_by_value(norms, 1e-6, 1e6)
        att_grid = att_grid / norms

    return [att_grid]
#------------------------------------------------------------------------------

class InvalidLoomOp(loom.LoomOp):
  """
  Mapping: returns a context of zeros
  Output:
   context: [N, encodeSize] of zeros

  Implementation:
   Take the elementwise-min

  Parameters typically contain:
    map_dim = 1024
    module_scope = findModule
    reuse = True
    scope
  """
  def __init__(self, in_types, out_types, params):
    self._params = params
    self._scope = params.get('scope', 'invalid_module')
    self._module_scope = params['module_scope']
    self._reuse = params.get('reuse', None)
    super(InvalidLoomOp, self).__init__(in_types, out_types)

  def instantiate_batch(self, inputs):
    """
      Inputs:
        visual attention outputs
        time id for current module
    """
    img_feat = inputs
    encode_size = self._params['encode_size']

    with tf.variable_scope(self._module_scope):
      with tf.variable_scope(self._scope, reuse=self._reuse):
        N = tf.shape(img_feat)[0]
        context = tf.zeros([N, encode_size], tf.float32)

    return [context]
#------------------------------------------------------------------------------

class DescribeLoomOp(loom.LoomOp):
  """
  Mapping: att_grid -> context vector
  Input:
    input_0: [N, H, W, 1]
  Output:
    answer_scores: [N, outputSize]

  Implementation:
  1. Extract visual features using the input attention map, and
  linear transform to map_dim
  2. linear transform language features to map_dim
  3. Element-wise multiplication of the two, l2_normalize, linear transform.
  """
  def __init__(self, in_types, out_types, params):
    self._params = params
    self._scope = params.get('scope', 'describe_module')
    self._module_scope = params['module_scope']
    self._reuse = params.get('reuse', None)
    super(DescribeLoomOp, self).__init__(in_types, out_types)

  def instantiate_batch(self, inputs):
    """
    Inputs:
      output from the previous modules
      image feature for the example
      text attention for all modules for the example
      time id for current module
    """
    vis_att, img_feat, text_att = inputs

    # text feature dimension, intermediate mapping dimension
    # batch size, image feature height and width
    text_dim = text_att.shape.as_list()[-1]
    map_dim = self._params['map_dim']
    encode_size = self._params['encode_size']
    N = tf.shape(img_feat)[0]
    H, W = img_feat.shape.as_list()[1:3]

    with tf.variable_scope(self._module_scope):
      with tf.variable_scope(self._scope, reuse=self._reuse):
        text_map = fc('fc_text', text_att, output_dim=map_dim)
        # nonlinearity
        text_map = tf.nn.relu(text_map)

        # att_feats has shape [N, D_vis] (attention-weighted pooling of img_feat)
        att_feats = tf.reduce_sum(img_feat * vis_att, axis=[1, 2])
        img_map = tf.reshape(fc('fc_att', att_feats, output_dim=map_dim),
                  [N, map_dim])
        # nonlinearity
        img_map = tf.nn.relu(img_map)

        eltwise_mult = tf.nn.l2_normalize(img_map * text_map, 1)
        context = fc('fc_eltwise', eltwise_mult, output_dim=encode_size)


    return [context]
#------------------------------------------------------------------------------

class TransformLoomOp(loom.LoomOp):
  """
  Mapping: att_grid x text_param -> att_grid
  Input:
   input_0: [N, H, W, 1]
   text_param: [N, D_txt]
  Output:
   att_grid: [N, H, W, 1]

  Implementation:
   1. Extract visual features using the input attention map, and
    linear transform to map_dim
   2. linear transform language features to map_dim
   3. Convolve image features to map_dim
   4. Element-wise multiplication of the three, l2_normalize, linear transform.
  """
  def __init__(self, in_types, out_types, params):
    self._params = params
    self._scope = params.get('scope', 'transform_module')
    self._module_scope = params['module_scope']
    self._reuse = params.get('reuse', None)
    super(TransformLoomOp, self).__init__(in_types, out_types)

  def instantiate_batch(self, inputs):
    """
    Inputs:
      output from the previous modules
      image feature for the example
      text attention for all modules for the example
      time id for current module
    """
    vis_att, img_feat, text_att = inputs

    # text feature dimension, intermediate mapping dimension
    # batch size, image feature height and width
    text_dim = text_att.shape.as_list()[-1]
    map_dim = self._params['map_dim']
    encode_size = self._params['encode_size']
    N = tf.shape(img_feat)[0]
    H, W = img_feat.shape.as_list()[1:3]

    with tf.variable_scope(self._module_scope):
      with tf.variable_scope(self._scope, reuse=self._reuse):
        # image_feat_mapped has shape [N, H, W, map_dim]
        img_map = _1x1_conv('conv_image', img_feat, output_dim=map_dim)
        # nonlinearity
        img_map = tf.nn.relu(img_map)

        text_map = fc('fc_text', text_att, output_dim=map_dim)
        text_map = tf.reshape(text_map, [-1, 1, 1, map_dim])
        # nonlinearity
        text_map = tf.nn.relu(text_map)

        att_feats = tf.reduce_sum(img_feat * vis_att, axis=[1, 2])
        att_map = tf.reshape(fc('fc_att', att_feats, output_dim=map_dim),
                  [N, 1, 1, map_dim])

        # interact via element wise map
        eltwise_mult = tf.nn.l2_normalize(img_map * text_map * att_map, 3)
        att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

        # softmax
        att_grid_soft = tf.nn.softmax(tf.reshape(att_grid, [-1, H*W]))
        att_grid = tf.reshape(att_grid_soft, [-1, H, W, 1])

    return [att_grid]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



