import json
import pickle
import math
import sys
import argparse
import warnings

from os import makedirs
from os.path import basename, join, exists, dirname, splitext, realpath

from wikidata_linker_utils.progressbar import get_progress_bar
from dataset import TSVDataset, CombinedDataset, H5Dataset, ClassificationHandler
from batchifier import (iter_batches_single_threaded,
                        requires_vocab,
                        requires_character_convolution,
                        get_feature_vocabs)
import tensorflow as tf
import numpy as np

try:
    RNNCell = tf.nn.rnn_cell.RNNCell
    TFLSTMCell = tf.nn.rnn_cell.LSTMCell
    MultiRNNCell = tf.nn.rnn_cell.MultiRNNCell
    LSTMStateTuple = tf.nn.rnn_cell.LSTMStateTuple
    from tensorflow.contrib.cudnn_rnn import CudnnLSTM
except AttributeError:
    RNNCell = tf.contrib.rnn.RNNCell
    TFLSTMCell = tf.contrib.rnn.LSTMCell
    MultiRNNCell = tf.contrib.rnn.MultiRNNCell
    LSTMStateTuple = tf.contrib.rnn.LSTMStateTuple
    from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnLSTM

from tensorflow.python.client import device_lib


class LazyAdamOptimizer(tf.train.AdamOptimizer):
    """Variant of the Adam optimizer that handles sparse updates more efficiently.

    The original Adam algorithm maintains two moving-average accumulators for
    each trainable variable; the accumulators are updated at every step.
    This class provides lazier handling of gradient updates for sparse variables.
    It only updates moving-average accumulators for sparse variable indices that
    appear in the current batch, rather than updating the accumulators for all
    indices. Compared with the original Adam optimizer, it can provide large
    improvements in model training throughput for some applications. However, it
    provides slightly different semantics than the original Adam algorithm, and
    may lead to different empirical results.
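    Example (a minimal sketch; `loss` stands for any scalar training loss):

        optimizer = LazyAdamOptimizer(learning_rate=1e-3)
        train_op = optimizer.minimize(loss)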
    """

    def _apply_sparse(self, grad, var):
        beta1_power = tf.cast(self._beta1_power, var.dtype.base_dtype)
        beta2_power = tf.cast(self._beta2_power, var.dtype.base_dtype)
        lr_t = tf.cast(self._lr_t, var.dtype.base_dtype)
        beta1_t = tf.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = tf.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = tf.cast(self._epsilon_t, var.dtype.base_dtype)
        lr = (lr_t * tf.sqrt(1 - beta2_power) / (1 - beta1_power))

        # m := beta1 * m + (1 - beta1) * g_t
        # We use a slightly different version of the moving-average update formula
        # that does a better job of handling concurrent lockless updates:
        # m -= (1 - beta1) * (m - g_t)
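        # (the two forms are algebraically identical:
        #  beta1 * m + (1 - beta1) * g_t = m - (1 - beta1) * (m - g_t))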
        m = self.get_slot(var, "m")
        m_t_delta = tf.gather(m, grad.indices) - grad.values
        m_t = tf.scatter_sub(m, grad.indices,
                             (1 - beta1_t) * m_t_delta,
                             use_locking=self._use_locking)

        # v := beta2 * v + (1 - beta2) * (g_t * g_t)
        # We reformulate the update as:
        # v -= (1 - beta2) * (v - g_t * g_t)
        v = self.get_slot(var, "v")
        v_t_delta = tf.gather(v, grad.indices) - tf.square(grad.values)
        v_t = tf.scatter_sub(v, grad.indices,
                             (1 - beta2_t) * v_t_delta,
                             use_locking=self._use_locking)

        # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))
        m_t_slice = tf.gather(m_t, grad.indices)
        v_t_slice = tf.gather(v_t, grad.indices)
        denominator_slice = tf.sqrt(v_t_slice) + epsilon_t
        var_update = tf.scatter_sub(var, grad.indices,
                                    lr * m_t_slice / denominator_slice,
                                    use_locking=self._use_locking)
        return tf.group(var_update, m_t, v_t)


def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

def split(values, axis, num_splits, name=None):
    return tf.split(values, num_splits, axis=axis, name=name)

def reverse(values, axis):
    return tf.reverse(values, [axis])


def sparse_softmax_cross_entropy_with_logits(logits, labels):
    return tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels)


def concat(values, axis, name=None):
    if len(values) == 1:
        return values[0]
    return tf.concat(values, axis, name=name)


def concat_tensor_array(values, name=None):
    return values.stack(name=name)


def batch_gather_3d(values, indices):
    """For each batch element b, select values[b, indices[b], :]."""
    return tf.gather(tf.reshape(values, [-1, tf.shape(values)[2]]),
                     tf.range(0, tf.shape(values)[0]) * tf.shape(values)[1] +
                     indices)


def batch_gather_2d(values, indices):
    """For each batch element b, select values[b, indices[b]]."""
    return tf.gather(tf.reshape(values, [-1]),
                     tf.range(0, tf.shape(values)[0]) * tf.shape(values)[1] +
                     indices)


def viterbi_decode(score, transition_params, sequence_lengths, back_prop=False,
                   parallel_iterations=1):
    """Decode the highest scoring sequence of tags inside of TensorFlow!!!
    This can be used anytime.
    Args:
        score: A [batch, seq_len, num_tags] matrix of unary potentials.
        transition_params: A [num_tags, num_tags] matrix of binary potentials.
        sequence_lengths: A [batch] int32 vector of the length of each score
            sequence.
    Returns:
        viterbi: A [batch, seq_len] list of integers containing the highest
            scoring tag indices.
        viterbi_score: A vector of float containing the score for the Viterbi
            sequence.
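    Example (sketch; the argument names below are illustrative stand-ins for
    the Tensors described above):
        tags, tag_scores = viterbi_decode(unary_scores, transitions, lengths)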
    """
    sequence_lengths = tf.convert_to_tensor(
        sequence_lengths, name="sequence_lengths")
    score = tf.convert_to_tensor(score, name="score")
    transition_params = tf.convert_to_tensor(
        transition_params, name="transition_params")

    if sequence_lengths.dtype != tf.int32:
        sequence_lengths = tf.cast(sequence_lengths, tf.int32)

    def condition(t, *args):
        """Stop when full score sequence has been read in."""
        return tf.less(t, tf.shape(score)[1])

    def body(t, trellis, backpointers, trellis_val):
        """Perform forward viterbi pass."""
        v = tf.expand_dims(trellis_val, 2) + tf.expand_dims(transition_params, 0)
        new_trellis_val = score[:, t, :] + tf.reduce_max(v, axis=1)
        new_trellis = trellis.write(t, new_trellis_val)
        new_backpointers = backpointers.write(
            t, tf.cast(tf.argmax(v, axis=1), tf.int32))
        return t + 1, new_trellis, new_backpointers, new_trellis_val

    trellis_arr = tf.TensorArray(score.dtype, size=0,
        dynamic_size=True, clear_after_read=False, infer_shape=False)
    first_trellis_val = score[:, 0, :]
    trellis_arr = trellis_arr.write(0, first_trellis_val)

    backpointers_arr = tf.TensorArray(tf.int32, size=0,
        dynamic_size=True, clear_after_read=False, infer_shape=False)
    backpointers_arr = backpointers_arr.write(0,
        tf.zeros_like(score[:, 0, :], dtype=tf.int32))

    _, trellis_out, backpointers_out, _ = tf.while_loop(
        condition, body,
        (tf.constant(1, name="t", dtype=tf.int32), trellis_arr, backpointers_arr, first_trellis_val),
        parallel_iterations=parallel_iterations,
        back_prop=back_prop)

    trellis_out = concat_tensor_array(trellis_out)
    backpointers_out = concat_tensor_array(backpointers_out)
    # make batch-major:
    trellis_out = tf.transpose(trellis_out, [1, 0, 2])
    backpointers_out = tf.transpose(backpointers_out, [1, 0, 2])

    def condition(t, *args):
        return tf.less(t, tf.shape(score)[1])

    def body(t, viterbi, last_decision):
        backpointers_timestep = batch_gather_3d(
            backpointers_out, tf.maximum(sequence_lengths - t, 0))
        new_last_decision = batch_gather_2d(
            backpointers_timestep, last_decision)
        new_viterbi = viterbi.write(t, new_last_decision)
        return t + 1, new_viterbi, new_last_decision

    last_timestep = batch_gather_3d(trellis_out, sequence_lengths - 1)
    # get scores for last timestep of each batch element inside
    # trellis:
    scores = tf.reduce_max(last_timestep, axis=1)
    # get choice index for last timestep:
    last_decision = tf.cast(tf.argmax(last_timestep, axis=1), tf.int32)

    # decode backwards using backpointers:
    viterbi = tf.TensorArray(tf.int32, size=0,
        dynamic_size=True, clear_after_read=False, infer_shape=False)
    viterbi = viterbi.write(0, last_decision)
    _, viterbi_out, _ = tf.while_loop(
        condition, body,
        (tf.constant(1, name="t", dtype=tf.int32), viterbi, last_decision),
        parallel_iterations=parallel_iterations,
        back_prop=back_prop)
    viterbi_out = concat_tensor_array(viterbi_out)
    # make batch-major:
    viterbi_out = tf.transpose(viterbi_out, [1, 0])
    viterbi_out_fwd = tf.reverse_sequence(
        viterbi_out, sequence_lengths, seq_dim=1)
    return viterbi_out_fwd, scores


def sum_list(elements):
    total = elements[0]
    for el in elements[1:]:
        total += el
    return total


def explicitly_set_fields():
    """Return the set of argument names explicitly passed on the command line
    (for "--noflag" style arguments the positive name "flag" is also added)."""
    received = set()
    for argument in sys.argv:
        if argument.startswith("--"):
            received.add(argument[2:])
            if argument[2:].startswith("no"):
                received.add(argument[4:])
    return received


def save_session(session, saver, path, verbose=False):
    """
    Call save on a tf.train.Saver to store all the variables of the current
    tensorflow session under the given path for later restoring.

    Arguments:
        session : tf.Session
        saver : tf.train.Saver
        path : str, directory in which to save the checkpoint
        verbose : bool, print where the session was saved
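    Example (sketch; the path is illustrative):
        saver = tf.train.Saver()
        save_session(session, saver, "/tmp/my_model", verbose=True)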
    """
    makedirs(path, exist_ok=True)
    if not path.endswith("/"):
        path = path + "/"

    path = join(path, "model.ckpt")
    if verbose:
        print("Saving session under %r" % (path,), flush=True)
    saver.save(session, path)
    print("Saved", flush=True)

### constants for saving & loading

# model config:
OBJECTIVE_NAMES = "OBJECTIVE_NAMES"
OBJECTIVE_TYPES = "OBJECTIVE_TYPES"

# inputs:
INPUT_PLACEHOLDERS = "INPUT_PLACEHOLDERS"
LABEL_PLACEHOLDERS = "LABEL_PLACEHOLDERS"
LABEL_MASK_PLACEHOLDERS = "LABEL_MASK_PLACEHOLDERS"
TRAIN_OP = "TRAIN_OP"
SEQUENCE_LENGTHS = "SEQUENCE_LENGTHS"
IS_TRAINING = "IS_TRAINING"

# outputs:
DECODED = "DECODED"
DECODED_SCORES = "DECODED_SCORES"
UNARY_SCORES = "UNARY_SCORES"

# per objective metrics:
TOKEN_CORRECT = "TOKEN_CORRECT"
TOKEN_CORRECT_TOTAL = "TOKEN_CORRECT_TOTAL"
SENTENCE_CORRECT = "SENTENCE_CORRECT"
SENTENCE_CORRECT_TOTAL = "SENTENCE_CORRECT_TOTAL"

# aggregate metrics over all objectives
NLL = "NLL"
NLL_TOTAL = "NLL_TOTAL"
TOKEN_CORRECT_ALL = "TOKEN_CORRECT_ALL"
TOKEN_CORRECT_ALL_TOTAL = "TOKEN_CORRECT_ALL_TOTAL"
SENTENCE_CORRECT_ALL = "SENTENCE_CORRECT_ALL"
SENTENCE_CORRECT_ALL_TOTAL = "SENTENCE_CORRECT_ALL_TOTAL"
CONFUSION_MATRIX = "CONFUSION_MATRIX"
GLOBAL_STEP = "global_step"
SUMMARIES_ASSIGNS = "SUMMARIES_ASSIGNS"
SUMMARIES_PLACEHOLDERS = "SUMMARIES_PLACEHOLDERS"
SUMMARIES_NAMES = "SUMMARIES_NAMES"
TRAIN_SUMMARIES = "TRAIN_SUMMARIES"

TRUE_POSITIVES = "TRUE_POSITIVES"
FALSE_POSITIVES = "FALSE_POSITIVES"
FALSE_NEGATIVES = "FALSE_NEGATIVES"

def maybe_dropout(inputs, keep_prob, is_training):
    return tf.cond(is_training,
        lambda : tf.nn.dropout(inputs, keep_prob),
        lambda : inputs
    ) if keep_prob < 1 else inputs


def compute_sentence_correct(correct, sequence_mask):
    """Count sentences whose labeled (masked-in) positions are all correct.

    Returns (sentence_correct, sentence_correct_total); the total only counts
    sentences that contain at least one labeled position.
    """
    any_label = tf.reduce_max(tf.cast(sequence_mask, tf.int32), 1)
    sentence_correct_total = tf.reduce_sum(any_label)
    # the product over time is 1 only when every labeled position is correct:
    sentence_correct = tf.reduce_sum(tf.reduce_prod(
        tf.cast(
            tf.logical_or(correct, tf.logical_not(sequence_mask)),
            tf.int32
        ),
        1
    ) * any_label)
    return sentence_correct, sentence_correct_total


def lstm_activation(inputs, input_h, input_c, W, b, activation,
                    forget_bias=0.0, keep_prob=1.0, is_training=False):
    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    cell_inputs = concat([inputs, input_h], axis=1)

    lstm_matrix = tf.nn.xw_plus_b(cell_inputs, W, b)
    preactiv = split(lstm_matrix, axis=1, num_splits=4)
    # from CUDNN docs:
    # Values 0 and 4 reference the input gate.
    # Values 1 and 5 reference the forget gate.
    # Values 2 and 6 reference the new memory gate.
    # Values 3 and 7 reference the output gate
    i, f, j, o = (
        preactiv[CUDNN_MAPPING["i"]],
        preactiv[CUDNN_MAPPING["f"]],
        preactiv[CUDNN_MAPPING["j"]],
        preactiv[CUDNN_MAPPING["o"]]
    )

    c = (tf.nn.sigmoid(f + forget_bias) * input_c +
         tf.nn.sigmoid(i) * activation(j))

    # dropout (when keep_prob < 1) on the emitted hidden state, mirroring the
    # dropout applied to the cudnn path's hidden outputs in build_recurrent:
    m = maybe_dropout(tf.nn.sigmoid(o) * activation(c), keep_prob, is_training)
    return (c, m)


class Logger(object):
    def __init__(self, session, writer):
        self.session = session
        self.writer = writer
        self._placeholders = {}
        summaries = tf.get_collection(SUMMARIES_ASSIGNS)
        summaries_pholders = tf.get_collection(SUMMARIES_PLACEHOLDERS)
        summaries_names = [name.decode("utf-8")
                           for name in tf.get_collection(SUMMARIES_NAMES)]

        for summary, pholder, name in zip(summaries, summaries_pholders, summaries_names):
            self._placeholders[name] = (pholder, summary)


    def log(self, name, value, step):
        if name not in self._placeholders:
            pholder = tf.placeholder(tf.float32, [], name=name)
            summary = tf.summary.scalar(name, pholder)
            tf.add_to_collection(SUMMARIES_ASSIGNS, summary)
            tf.add_to_collection(SUMMARIES_NAMES, name)
            tf.add_to_collection(SUMMARIES_PLACEHOLDERS, pholder)
            self._placeholders[name] = (pholder, summary)
        pholder, summary = self._placeholders[name]
        res = self.session.run(summary, {pholder:value})
        self.writer.add_summary(res, step)


class ParametrizedLSTMCell(RNNCell):
    def __init__(self, weights, biases, hidden_size):
        self._weights = weights
        self._biases = biases
        self.hidden_size = hidden_size

    @property
    def state_size(self):
        return (self.hidden_size, self.hidden_size)

    @property
    def output_size(self):
        return self.hidden_size

    def __call__(self, inputs, state, scope=None):
        input_h, input_c = state
        c, m = lstm_activation(inputs,
                               input_h=input_h,
                               input_c=input_c,
                               b=self._biases,
                               W=self._weights,
                               activation=tf.nn.tanh)
        return m, (m, c)


class LSTMCell(TFLSTMCell):
    def __init__(self,
                 num_units,
                 keep_prob=1.0,
                 is_training=False):
        self._is_training = is_training
        self._keep_prob = keep_prob
        TFLSTMCell.__init__(
            self,
            num_units=num_units,
            state_is_tuple=True
        )

    def __call__(self, inputs, state, scope=None):
        (c_prev, m_prev) = state

        dtype = inputs.dtype
        input_size = inputs.get_shape().with_rank(2)[1]
        if input_size.value is None:
            raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
        with tf.variable_scope(scope or type(self).__name__,
                               initializer=self._initializer):  # "LSTMCell"
            concat_w = _get_concat_variable(
                    "W", [input_size.value + self._num_units, 4 * self._num_units],
                    dtype, 1)

            b = tf.get_variable(
                    "B", shape=[4 * self._num_units],
                    initializer=tf.zeros_initializer(), dtype=dtype)

        c, m = lstm_activation(inputs,
                               input_c=c_prev,
                               input_h=m_prev,
                               W=concat_w,
                               b=b,
                               activation=self._activation,
                               keep_prob=self._keep_prob,
                               is_training=self._is_training,
                               forget_bias=self._forget_bias)
        return m, LSTMStateTuple(c, m)



def cudnn_lstm_parameter_size(input_size, hidden_size):
    """Number of parameters in a single CuDNN LSTM cell."""
    biases = 8 * hidden_size
    weights = 4 * (hidden_size * input_size) + 4 * (hidden_size * hidden_size)
    return biases + weights


def direction_to_num_directions(direction):
    if direction == "unidirectional":
        return 1
    elif direction == "bidirectional":
        return 2
    else:
        raise ValueError("Unknown direction: %r." % (direction,))


def estimate_cudnn_parameter_size(num_layers,
                                  input_size,
                                  hidden_size,
                                  input_mode,
                                  direction):
    """
    Compute the number of parameters needed to
    construct a stack of LSTMs. Assumes the hidden states
    of bidirectional LSTMs are concatenated before being
    sent to the next layer up.
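    For example, a single unidirectional layer with input_size=10 and
    hidden_size=20 needs 8 * 20 + 4 * (20 * 10) + 4 * (20 * 20) = 2560
    parameters (see cudnn_lstm_parameter_size above).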
    """
    num_directions = direction_to_num_directions(direction)
    params = 0
    isize = input_size
    for layer in range(num_layers):
        for direction in range(num_directions):
            params += cudnn_lstm_parameter_size(
                isize, hidden_size
            )
        isize = hidden_size * num_directions
    return params

# cudnn conversion to dynamic RNN:
CUDNN_LAYER_WEIGHT_ORDER = [
    "x", "x", "x", "x", "h", "h", "h", "h"
]
CUDNN_LAYER_BIAS_ORDER = [
    "bx", "bx", "bx", "bx", "bh", "bh", "bh", "bh"
]
CUDNN_TRANSPOSED = True
CUDNN_MAPPING = {"i": 0, "f": 1, "j": 2, "o": 3}


def consume_biases_direction(params, old_offset, hidden_size, isize):
    """Slice one direction's CuDNN bias parameters out of `params`, starting
    at `old_offset`; returns (fused bias vector, new offset)."""
    offset = old_offset
    layer_biases_x = []
    layer_biases_h = []

    for piece in CUDNN_LAYER_BIAS_ORDER:
        if piece == "bx":
            layer_biases_x.append(
                params[offset:offset + hidden_size]
            )
            offset += hidden_size
        elif piece == "bh":
            layer_biases_h.append(
                params[offset:offset + hidden_size]
            )
            offset += hidden_size
        else:
            raise ValueError("Unknown cudnn piece %r." % (piece,))
    b = concat(layer_biases_x, axis=0) + concat(layer_biases_h, axis=0)
    return b, offset


def consume_weights_direction(params, old_offset, hidden_size, isize):
    """Slice one direction's CuDNN weight parameters out of `params`, starting
    at `old_offset`; returns (fused weight matrix W, new offset)."""
    offset = old_offset
    layer_weights_x = []
    layer_weights_h = []
    for piece in CUDNN_LAYER_WEIGHT_ORDER:
        if piece == "x":
            layer_weights_x.append(
                tf.reshape(
                    params[offset:offset + hidden_size * isize],
                    [hidden_size, isize] if CUDNN_TRANSPOSED else [isize, hidden_size]
                )
            )
            offset += hidden_size * isize
        elif piece == "h":
            layer_weights_h.append(
                tf.reshape(
                    params[offset:offset + hidden_size * hidden_size],
                    [hidden_size, hidden_size]
                )
            )
            offset += hidden_size * hidden_size
        else:
            raise ValueError("Unknown cudnn piece %r." % (piece,))
    if CUDNN_TRANSPOSED:
        W_T = concat([concat(layer_weights_x, axis=0), concat(layer_weights_h, axis=0)], axis=1)
        W = tf.transpose(W_T)
    else:
        W = concat([concat(layer_weights_x, axis=1), concat(layer_weights_h, axis=1)], axis=0)
    return W, offset


def decompose_layer_params(params, num_layers,
                           hidden_size, cell_input_size,
                           input_mode, direction, create_fn):
    """
    This operation converts the opaque cudnn params into a set of
    usable weight matrices.
    Args:
        params : Tensor, opaque cudnn params tensor
        num_layers : int, number of stacked LSTMs.
        hidden_size : int, number of neurons in each LSTM.
        cell_input_size : int, input size for the LSTMs.
        input_mode: whether a pre-projection was used or not. Currently only
            'linear_input' is supported (i.e. CuDNN applies its own input
            projection internally).
        direction : str, 'unidirectional' or 'bidirectional'.
        create_fn: callback for weight creation. Receives parameter slice (op),
                   layer (int), direction (0 = fwd, 1 = bwd),
                   parameter_index (0 = W, 1 = b).
    Returns:
        weights : list of lists of Tensors, where the outer list is indexed
            by layer, the inner list is indexed by direction (fwd, bwd), and
            each inner entry is the pair [W, b] for that layer and direction.
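            For example, weights[0][1] holds [W, b] for the backward
            direction of the first layer when direction is 'bidirectional'.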
    """
    if input_mode != "linear_input":
        raise ValueError("Only input_mode == linear_input supported for now.")
    num_directions = direction_to_num_directions(direction)
    offset = 0
    all_weights = [[[] for j in range(num_directions)]
                   for i in range(num_layers)]
    isize = cell_input_size
    with tf.variable_scope("DecomposeCudnnParams"):
        for layer in range(num_layers):
            with tf.variable_scope("Layer{}".format(layer)):
                for direction in range(num_directions):
                    with tf.variable_scope("fwd" if direction == 0 else "bwd"):
                        with tf.variable_scope("weights"):
                            W, offset = consume_weights_direction(
                                params,
                                old_offset=offset,
                                hidden_size=hidden_size,
                                isize=isize)
                            all_weights[layer][direction].append(
                                create_fn(W, layer, direction, 0))
            isize = hidden_size * num_directions
        isize = cell_input_size
        for layer in range(num_layers):
            with tf.variable_scope("Layer{}".format(layer)):
                for direction in range(num_directions):
                    with tf.variable_scope("fwd" if direction == 0 else "bwd"):
                        with tf.variable_scope("biases"):
                            b, offset = consume_biases_direction(
                                params,
                                old_offset=offset,
                                hidden_size=hidden_size,
                                isize=isize)
                            all_weights[layer][direction].append(
                                create_fn(b, layer, direction, 1))
            isize = hidden_size * num_directions
    return all_weights


def create_decomposed_variable(param, lidx, didx, pidx):
    with tf.device("cpu"):
        return tf.get_variable("w" if pidx == 0 else "b",
                               shape=param.get_shape().as_list(),
                               dtype=param.dtype,
                               trainable=False,
                               collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                                            "excluded_variables"])


def cpu_cudnn_params(params, num_layers, hidden_size, cell_input_size, input_mode,
                     direction):
    """
    This operation converts the opaque cudnn params into a set of
    usable weight matrices, and caches the conversion.
    Args:
        params : Tensor, opaque cudnn params tensor
        num_layers : int, number of stacked LSTMs.
        hidden_size : int, number of neurons in each LSTM.
        cell_input_size : int, input size for the LSTMs.
        input_mode: whether a pre-projection was used or not. Currently only
            'linear_input' is supported (i.e. CuDNN applies its own input
            projection internally).
        direction : str, 'unidirectional' or 'bidirectional'.
    Returns:
        weights : list of lists of Tensors, where the outer list is indexed
            by layer, the inner list is indexed by direction (fwd, bwd), and
            each inner entry is the pair [W, b] for that layer and direction.
    """
    # create a boolean status variable that checks whether the
    # weights have been converted to cpu format:
    with tf.device("cpu"):
        cpu_conversion_status = tf.get_variable(
            name="CudnnConversionStatus", dtype=tf.float32,
            initializer=tf.zeros_initializer(), shape=[],
            trainable=False,
            collections=[tf.GraphKeys.GLOBAL_VARIABLES])
    # create a fresh copy of the weights (not trainable)
    reshaped = decompose_layer_params(
        params,
        num_layers=num_layers,
        hidden_size=hidden_size,
        cell_input_size=cell_input_size,
        input_mode=input_mode,
        direction=direction,
        create_fn=create_decomposed_variable)

    def cpu_convert():
        all_assigns = decompose_layer_params(
            params,
            num_layers=num_layers,
            hidden_size=hidden_size,
            cell_input_size=cell_input_size,
            input_mode=input_mode,
            direction=direction,
            create_fn=lambda p, lidx, didx, pidx: tf.assign(reshaped[lidx][didx][pidx], p))
        all_assigns = [assign for layer_assign in all_assigns
                       for dir_assign in layer_assign
                       for assign in dir_assign]
        all_assigns.append(tf.assign(cpu_conversion_status, tf.constant(1.0, dtype=tf.float32)))
        all_assigns.append(tf.Print(cpu_conversion_status, [0],
            message="Converted cudnn weights to CPU format. "))
        with tf.control_dependencies(all_assigns):
            ret = tf.identity(cpu_conversion_status)
            return ret
    # cache the reshaping/concatenating
    ensure_conversion = tf.cond(tf.greater(cpu_conversion_status, 0),
                                lambda: cpu_conversion_status,
                                cpu_convert)
    # if weights are already reshaped, go ahead:
    with tf.control_dependencies([ensure_conversion]):
        # wrap with identity to ensure there is a dependency between assignment
        # and using the weights:
        all_params = [[[tf.identity(p) for p in dir_param]
                       for dir_param in layer_param]
                      for layer_param in reshaped]
        return all_params


class CpuCudnnLSTM(object):
    """Device-agnostic stand-in for CudnnLSTM: decomposes the opaque CuDNN
    parameter buffer and runs the same computation with tf.nn.dynamic_rnn."""

    def __init__(self, num_layers, hidden_size,
                 cell_input_size, input_mode, direction):
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.cell_input_size = cell_input_size
        self.input_mode = input_mode
        self.direction = direction

    def __call__(self,
                 inputs,
                 input_h,
                 input_c,
                 params,
                 is_training=True):
        layer_params = cpu_cudnn_params(params,
            num_layers=self.num_layers,
            hidden_size=self.hidden_size,
            cell_input_size=self.cell_input_size,
            input_mode=self.input_mode,
            direction=self.direction)
        REVERSED = 1
        layer_inputs = inputs
        cell_idx = 0
        for layer_param in layer_params:
            hidden_fwd_bwd = []
            final_output_c = []
            final_output_h = []
            for direction, (W, b) in enumerate(layer_param):
                if direction == REVERSED:
                    layer_inputs = reverse(layer_inputs, axis=0)
                hiddens, (output_h, output_c) = tf.nn.dynamic_rnn(
                    cell=ParametrizedLSTMCell(W, b, self.hidden_size),
                    inputs=layer_inputs,
                    dtype=inputs.dtype,
                    time_major=True,
                    initial_state=(input_h[cell_idx], input_c[cell_idx]))
                if direction == REVERSED:
                    hiddens = reverse(hiddens, axis=0)
                hidden_fwd_bwd.append(hiddens)
                final_output_c.append(tf.expand_dims(output_c, 0))
                final_output_h.append(tf.expand_dims(output_h, 0))
                cell_idx += 1
            if len(hidden_fwd_bwd) > 1:
                layer_inputs = concat(hidden_fwd_bwd, axis=2)
                final_output_c = concat(final_output_c, axis=0)
                final_output_h = concat(final_output_h, axis=0)
            else:
                layer_inputs = hidden_fwd_bwd[0]
                final_output_c = final_output_c[0]
                final_output_h = final_output_h[0]
        return layer_inputs, final_output_h, final_output_c


def highway(x, activation_fn=tf.nn.relu, scope=None):
    """Highway layer: y = t * h + (1 - t) * x, where the candidate h and the
    transform gate t = sigmoid(.) both come from one fully-connected layer."""
    size = x.get_shape()[-1].value
    with tf.variable_scope(scope or "HighwayLayer"):
        activ = tf.contrib.layers.fully_connected(
            x, size * 2, activation_fn=None, scope="FC"
        )
        transform = tf.sigmoid(activ[..., :size], name="transform_gate")
        hidden = activation_fn(activ[..., size:])
        carry = 1.0 - transform
        return tf.add(hidden * transform, x * carry, "y")


def conv2d(inputs, output_dim, k_h, k_w,
           stddev=0.02, scope=None,
           weight_noise=0.0, is_training=True):
    with tf.variable_scope(scope or "Conv2D"):
        w = tf.get_variable('w', [k_h, k_w, inputs.get_shape()[-1], output_dim],
                            initializer=tf.truncated_normal_initializer(stddev=stddev))
        if weight_noise > 0 and not isinstance(is_training, bool):
            w = add_weight_noise(w, is_training=is_training, stddev=weight_noise)
        return tf.nn.conv2d(inputs, w, strides=[1, 1, 1, 1], padding="VALID")



def character_convolution(inputs, feature):
    """Embed each token's byte sequence and apply width-varying character
    convolutions, max-pooling over positions, and highway layers to produce
    one fixed-size vector per token."""
    inputs_2d = tf.reshape(inputs,
        [tf.shape(inputs)[0] * tf.shape(inputs)[1], tf.shape(inputs)[2]]
    )
    inputs_3d = embedding_lookup(
        inputs_2d,
        dim=feature["dimension"],
        # 255 different bytes (uint8)
        # & start and end symbol:
        size=257,
        dtype=tf.float32,
        mask_negative=True)
    inputs_4d = tf.expand_dims(inputs_3d, 1)
    feature_pools = []
    for idx, conv_filter in enumerate(feature["filters"]):
        width, channels = conv_filter["width"], conv_filter["channels"]
        # conv output is [batch * time, 1, word_length - width + 1, channels];
        # squeeze out the singleton height dimension:
        conv = tf.squeeze(conv2d(inputs_4d, channels, 1, width, scope="CharacterConvolution%d" % (idx,)), [1])
        # max-pool over character positions:
        pool = tf.reduce_max(conv, 1)
        feature_pools.append(pool)
    activations = concat(feature_pools, axis=1)
    channels_out = sum(conv_filter["channels"] for conv_filter in feature["filters"])
    activations = tf.reshape(
        tf.tanh(activations),
        [tf.shape(inputs)[0], tf.shape(inputs)[1], channels_out],
        name="CharacterConvolutionPooled")
    for idx in range(feature["highway_layers"]):
        activations = highway(activations, scope="HighwayLayer%d" % (idx,),
            activation_fn=tf.tanh)
    return activations


def feature_dtype(feat):
    if requires_vocab(feat):
        return tf.int32
    elif feat["type"] in {"digit", "punctuation_count", "uppercase"}:
        return tf.float32
    elif requires_character_convolution(feat):
        return tf.int32
    else:
        raise ValueError("unknown feature %r." % (feat,))


def feature_shape(feature):
    if requires_vocab(feature) or feature["type"] in {'digit', 'punctuation_count', 'uppercase'}:
        return [None, None]
    elif requires_character_convolution(feature):
        return [None, None, None]
    else:
        raise ValueError("unknown feature %r." % (feature,))


def build_inputs(features, objectives, fused, class_weights,
                 class_weights_clipval):
    """Create the placeholders for every input feature and objective label,
    plus label masks, optional class-weight variables, sequence lengths, and
    the is_training flag, and return them in that order."""
    input_placeholders = []
    labels = []
    labels_mask = []
    labels_class_weights = []
    max_output_vocab = max(len(obj["vocab"]) for obj in objectives)

    with tf.variable_scope("Inputs"):
        is_training = tf.placeholder(tf.bool, [], name="is_training")
        tf.add_to_collection(IS_TRAINING, is_training)
        for idx, feat in enumerate(features):
            input_placeholder = tf.placeholder(
                feature_dtype(feat), feature_shape(feat),
                name="input_placeholders_%d" % (idx,)
            )
            input_placeholders.append(input_placeholder)
            tf.add_to_collection(INPUT_PLACEHOLDERS, input_placeholder)

        if fused:
            label_placeholder = tf.placeholder(
                tf.int32, [None, None, len(objectives)]
            )
            labels_mask_placeholder = tf.placeholder(
                tf.bool, [None, None,  len(objectives)], name="labels_mask"
            )

            labels.append(label_placeholder)
            labels_mask.append(labels_mask_placeholder)
            tf.add_to_collection(LABEL_PLACEHOLDERS, label_placeholder)
            tf.add_to_collection(LABEL_MASK_PLACEHOLDERS, labels_mask_placeholder)

            if class_weights:
                with tf.variable_scope("FusedClassWeights"):
                    init_class_weights = tf.get_variable(
                        name="class_weights",
                        shape=[len(objectives) * max_output_vocab],
                        initializer=tf.constant_initializer(1),
                        dtype=tf.int64,
                        trainable=False)
                    init_class_count = tf.get_variable(
                        name="class_weights_denominator",
                        shape=[len(objectives)],
                        initializer=tf.constant_initializer(1),
                        dtype=tf.int64,
                        trainable=False)

                    def update_class_weights():
                        mask_as_ints = tf.cast(tf.reshape(labels_mask_placeholder, [-1, len(objectives)]), tf.int64)
                        updated_cls_weights = tf.scatter_add(
                            init_class_weights,
                            tf.reshape(label_placeholder + tf.reshape(tf.range(len(objectives)) * max_output_vocab, [1, 1, len(objectives)]), [-1]),
                            tf.reshape(mask_as_ints, [-1])
                        )
                        updated_class_count = tf.assign_add(init_class_count, tf.reduce_sum(mask_as_ints, 0))

                        # class weight: weight_i = total / class_i
                        weights = tf.clip_by_value(tf.expand_dims(updated_class_count, 1) /
                                                   tf.reshape(updated_cls_weights, [len(objectives), max_output_vocab]),
                                                   1e-6, class_weights_clipval)
                        return tf.cast(weights, tf.float32)

                    def return_class_weights():
                        # class weight: weight_i = total / class_i
                        return tf.cast(
                            tf.clip_by_value(tf.expand_dims(init_class_count, 1) /
                                             tf.reshape(init_class_weights, [len(objectives), max_output_vocab]),
                                             1e-6, class_weights_clipval), tf.float32)

                    labels_class_weights.append(
                        tf.cond(is_training,
                                update_class_weights,
                                return_class_weights))
            else:
                labels_class_weights.append(None)
        else:
            for objective in objectives:
                with tf.variable_scope(objective["name"]):
                    label_placeholder = tf.placeholder(
                        tf.int32, [None, None], name="labels"
                    )
                    labels.append(label_placeholder)
                    if objective["type"] == "crf":
                        labels_mask_placeholder = tf.placeholder(
                            tf.bool, [None], name="labels_mask"
                        )
                        labels_class_weights.append(None)
                    elif objective["type"] == "softmax":
                        labels_mask_placeholder = tf.placeholder(
                            tf.bool, [None, None], name="labels_mask"
                        )
                        if class_weights:
                            init_class_weights = tf.get_variable(
                                name="class_weights",
                                shape=[len(objective["vocab"])],
                                initializer=tf.constant_initializer(1),
                                dtype=tf.int64,
                                trainable=False)
                            init_class_count = tf.get_variable(
                                name="class_weights_denominator",
                                shape=[],
                                initializer=tf.constant_initializer(1),
                                dtype=tf.int64,
                                trainable=False)

                            def update_class_weights():
                                mask_as_ints = tf.cast(tf.reshape(labels_mask_placeholder, [-1]), tf.int64)
                                updated_cls_weights = tf.scatter_add(
                                    init_class_weights,
                                    tf.reshape(label_placeholder, [-1]),
                                    mask_as_ints
                                )
                                updated_class_count = tf.assign_add(init_class_count, tf.reduce_sum(mask_as_ints))

                                # class weight: weight_i = total / class_i
                                weights = tf.clip_by_value(updated_class_count / updated_cls_weights,
                                                           1e-6, class_weights_clipval)
                                return tf.cast(weights, tf.float32)

                            def return_class_weights():
                                # class weight: weight_i = total / class_i
                                return tf.cast(
                                    tf.clip_by_value(init_class_count / init_class_weights,
                                                     1e-6, class_weights_clipval), tf.float32)

                            labels_class_weights.append(
                                tf.cond(is_training, update_class_weights, return_class_weights)
                            )
                        else:
                            labels_class_weights.append(None)
                    else:
                        raise ValueError(
                            "unknown objective type %r." % (
                                objective["type"]
                            )
                        )
                    labels_mask.append(labels_mask_placeholder)
                    tf.add_to_collection(LABEL_PLACEHOLDERS, label_placeholder)
                    tf.add_to_collection(LABEL_MASK_PLACEHOLDERS, labels_mask_placeholder)
        sequence_lengths = tf.placeholder(tf.int32, [None],
                                          name="sequence_lengths")
        tf.add_to_collection(SEQUENCE_LENGTHS, sequence_lengths)
    return (input_placeholders,
            labels,
            labels_mask,
            labels_class_weights,
            sequence_lengths,
            is_training)


def add_weight_noise(x, is_training, stddev):
    return tf.cond(is_training,
                   lambda: x + tf.random_normal(
                       shape=tf.shape(x), stddev=stddev),
                   lambda: x)


def build_recurrent(inputs, cudnn, faux_cudnn, hidden_sizes, is_training,
                    keep_prob, weight_noise):
    """Run stacked bidirectional LSTMs over time-major inputs using CuDNN,
    a CPU emulation of CuDNN (faux_cudnn), or standard LSTM cells, and return
    the time-major hidden states."""
    dtype = tf.float32
    if cudnn:
        if len(hidden_sizes) == 0:
            raise ValueError("hidden_sizes must be a non-empty list.")
        hidden_size = hidden_sizes[0]
        if any(hidden_size != hsize for hsize in hidden_sizes):
            raise ValueError("cudnn RNN requires all hidden units "
                             "to be the same size (got %r)" % (
                hidden_sizes,
            ))
        num_layers = len(hidden_sizes)
        cell_input_size = inputs.get_shape()[-1].value

        est_size = estimate_cudnn_parameter_size(
            num_layers=num_layers,
            hidden_size=hidden_size,
            input_size=cell_input_size,
            input_mode="linear_input",
            direction="bidirectional"
        )
        # single flat parameter buffer holding all cudnn weights and biases:
        cudnn_params = tf.get_variable("RNNParams",
                                       shape=[est_size],
                                       dtype=tf.float32,
                                       initializer=tf.contrib.layers.variance_scaling_initializer())
        if weight_noise > 0:
            cudnn_params = add_weight_noise(cudnn_params,
                stddev=weight_noise, is_training=is_training)
        if faux_cudnn:
            cudnn_cell = CpuCudnnLSTM(num_layers,
                                      hidden_size,
                                      cell_input_size,
                                      input_mode="linear_input",
                                      direction="bidirectional")
        else:
            cpu_cudnn_params(cudnn_params,
                num_layers=num_layers,
                hidden_size=hidden_size,
                cell_input_size=cell_input_size,
                input_mode="linear_input",
                direction="bidirectional")
            cudnn_cell = CudnnLSTM(num_layers,
                                   hidden_size,
                                   cell_input_size,
                                   input_mode="linear_input",
                                   direction="bidirectional")
        init_state = tf.fill(
            (2 * num_layers, tf.shape(inputs)[1], hidden_size),
            tf.constant(np.float32(0.0)))
        hiddens, output_h, output_c = cudnn_cell(
            inputs,
            input_h=init_state,
            input_c=init_state,
            params=cudnn_params,
            is_training=True)
        hiddens = maybe_dropout(
            hiddens,
            keep_prob,
            is_training)
    else:
        cell = MultiRNNCell(
            [LSTMCell(hsize, is_training=is_training, keep_prob=keep_prob)
             for hsize in hidden_sizes]
        )
        hiddens, _ = bidirectional_dynamic_rnn(
            cell,
            inputs,
            time_major=True,
            dtype=dtype,
            swap_memory=True
        )
    return hiddens


def build_embed(inputs, features, index2words, keep_prob, is_training):
    embeddings = []
    for idx, (values, feature, index2word) in enumerate(zip(inputs, features, index2words)):
        if requires_vocab(feature):
            with tf.variable_scope("embedding_%d" % (idx,)):
                embedding = embedding_lookup(
                    values,
                    dim=feature["dimension"],
                    size=len(index2word),
                    dtype=tf.float32,
                    mask_negative=True
                )
                embeddings.append(embedding)
        elif requires_character_convolution(feature):
            embeddings.append(
                character_convolution(values, feature)
            )
        else:
            embeddings.append(tf.expand_dims(values, 2))
    return maybe_dropout(concat(embeddings, axis=2), keep_prob, is_training)


def crf_metrics(unary_scores, labels, transition_params, sequence_lengths,
                mask):
    """
    Computes CRF output metrics.
    Receives:
        unary_scores : batch-major order
        labels : batch-major order
        transition_params : nclasses x nclasses matrix.
        sequence_lengths : length of each time-sequence
        mask : batch-major example mask

    Returns:
        token_correct,
        token_correct_total,
        sentence_correct,
        sentence_correct_total
    """
    classes = unary_scores.get_shape()[-1].value
    decoded, scores = viterbi_decode(unary_scores,
                                     transition_params,
                                     sequence_lengths)

    tf.add_to_collection(UNARY_SCORES, unary_scores)
    tf.add_to_collection(DECODED, decoded)
    tf.add_to_collection(DECODED_SCORES, scores)

    equals_label = tf.equal(labels, decoded)
    token_correct = tf.reduce_sum(
        tf.cast(
            tf.logical_and(equals_label, mask),
            tf.int32
        )
    )
    token_correct_total = tf.reduce_sum(tf.cast(mask, tf.int32))
    tf.add_to_collection(TOKEN_CORRECT, token_correct)
    tf.add_to_collection(TOKEN_CORRECT_TOTAL, token_correct_total)
    sentence_correct, _ = compute_sentence_correct(equals_label, mask)
    sentence_correct_total = tf.reduce_sum(tf.cast(mask[:, 0], tf.int32))

    tf.add_to_collection(SENTENCE_CORRECT, sentence_correct)
    tf.add_to_collection(SENTENCE_CORRECT_TOTAL, sentence_correct_total)

    build_true_false_positives(decoded, mask, labels,
        classes, equals_label)

    return (token_correct, token_correct_total,
            sentence_correct, sentence_correct_total)


def build_true_false_positives(decoded, mask_batch_major, labels_batch_major,
                               classes, equals_label):
    masked_equals_label = tf.logical_and(equals_label, mask_batch_major)

    # now for each class compute tp, fp, fn
    # [nclasses x batch x time]
    masked_per_class = tf.logical_and(
        tf.equal(labels_batch_major[None, :, :], tf.range(classes)[:, None, None]),
        mask_batch_major)

    # correct, and on label
    correct = tf.reduce_sum(tf.cast(tf.logical_and(masked_per_class, equals_label[None, :, :]), tf.int32),
        axis=[1, 2])
    # predicted a particular class
    guessed = tf.reduce_sum(tf.cast(tf.logical_and(tf.equal(decoded[None, :, :], tf.range(classes)[:, None, None]), mask_batch_major), tf.int32),
        axis=[1, 2])
    total = tf.reduce_sum(tf.cast(masked_per_class, tf.int32), axis=[1, 2])
    tp, fp, fn = correct, guessed - correct, total - correct

    tf.add_to_collection(TRUE_POSITIVES, tp)
    tf.add_to_collection(FALSE_POSITIVES, fp)
    tf.add_to_collection(FALSE_NEGATIVES, fn)
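    # Note (illustrative): downstream code can turn these per-class counts
    # into precision = tp / max(tp + fp, 1) and recall = tp / max(tp + fn, 1).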


def softmax_metrics(unary_scores, labels, mask):
    """
    Compute correct-token and correct-sentence counts for a softmax objective.
    Receives:
        unary_scores : time-major
        labels : time-major
        mask : time-major
    Returns:
        token_correct,
        token_correct_total,
        sentence_correct,
        sentence_correct_total
    """
    classes = unary_scores.get_shape()[-1].value
    unary_scores_batch_major = tf.transpose(unary_scores, [1, 0, 2])
    labels_batch_major = tf.transpose(labels, [1, 0])
    mask_batch_major = tf.transpose(mask, [1, 0])
    decoded = tf.cast(tf.argmax(unary_scores_batch_major, 2), labels.dtype)
    unary_probs_batch_major = tf.nn.softmax(unary_scores_batch_major)
    scores = tf.reduce_max(unary_probs_batch_major, 2)

    tf.add_to_collection(UNARY_SCORES, unary_probs_batch_major)
    tf.add_to_collection(DECODED, decoded)
    tf.add_to_collection(DECODED_SCORES, scores)

    equals_label = tf.equal(decoded, labels_batch_major)

    token_correct = tf.reduce_sum(
        tf.cast(
            tf.logical_and(
                equals_label,
                mask_batch_major
            ),
            tf.int32
        )
    )
    token_correct_total = tf.reduce_sum(tf.cast(mask, tf.int32))
    tf.add_to_collection(TOKEN_CORRECT, token_correct)
    tf.add_to_collection(TOKEN_CORRECT_TOTAL, token_correct_total)

    sentence_correct, sentence_correct_total = compute_sentence_correct(
        equals_label, mask_batch_major
    )
    tf.add_to_collection(SENTENCE_CORRECT, sentence_correct)
    tf.add_to_collection(SENTENCE_CORRECT_TOTAL, sentence_correct_total)

    build_true_false_positives(decoded, mask_batch_major, labels_batch_major,
        classes, equals_label)
    return (token_correct, token_correct_total,
            sentence_correct, sentence_correct_total)


def add_objective_names_types(objectives):
    for objective in objectives:
        with tf.variable_scope(objective["name"]):
            # store objective names in graph:
            tf.add_to_collection(OBJECTIVE_NAMES,
                tf.constant(objective["name"], name="objective_name")
            )
            tf.add_to_collection(OBJECTIVE_TYPES,
                tf.constant(objective["type"], name="objective_type")
            )


def build_loss(inputs, objectives, labels, labels_mask,
               labels_class_weights, fused, sequence_lengths,
               class_weights_normalize):
    """
    Compute loss function given the objectives.
    Assumes inputs are of the form [time, batch, features].

    Arguments:
    ----------
        inputs : tf.Tensor
        objectives : list<dict>, objective specs
        labels : list<tf.Tensor>
        labels_mask : list<tf.Tensor>
        labels_class_weights : list<tf.Tensor>
        fused : bool, predict all softmax objectives with a single projection
        sequence_lengths : tf.Tensor
        class_weights_normalize : bool, normalize the weighted loss by the
            weighted (rather than unweighted) number of predictions

    Returns:
        loss : tf.Tensor (scalar)
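    Example objective spec (illustrative):
        objectives = [{"name": "pos", "type": "softmax", "vocab": [...]},
                      {"name": "ner", "type": "crf", "vocab": [...]}]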
    """
    losses = []
    negative_log_likelihoods = []
    sentence_corrects = []
    sentence_corrects_total = []
    token_corrects = []
    token_corrects_total = []
    max_output_vocab = max(len(obj["vocab"]) for obj in objectives)
    total_output_size = len(objectives) * max_output_vocab

    add_objective_names_types(objectives)

    if fused:
        with tf.variable_scope("FusedOutputs"):
            objective_labels = labels[0]
            mask = labels_mask[0]
            objective_class_weights = labels_class_weights[0]
            # perform all classifications at once:
            unary_scores = tf.contrib.layers.fully_connected(
                inputs, total_output_size,
                activation_fn=None
            )

            unary_scores = tf.reshape(unary_scores,
                                      [tf.shape(unary_scores)[0],
                                       tf.shape(unary_scores)[1],
                                       len(objectives),
                                       max_output_vocab])
            negative_log_likelihood = sparse_softmax_cross_entropy_with_logits(
                logits=unary_scores,
                labels=objective_labels
            )
            labels_mask_casted = tf.cast(mask, negative_log_likelihood.dtype)
            masked_negative_log_likelihood = negative_log_likelihood * labels_mask_casted
            if objective_class_weights is not None:
                class_weights_mask = tf.gather(
                        tf.reshape(objective_class_weights, [-1]),
                        objective_labels +
                        tf.reshape(tf.range(len(objectives)) * max_output_vocab, [1, 1, len(objectives)]))
                if class_weights_normalize:
                    masked_weighed_negative_log_likelihood_sum = masked_negative_log_likelihood * class_weights_mask
                    num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted * class_weights_mask), 1e-6)
                    normed_loss = masked_weighed_negative_log_likelihood_sum / (num_predictions / len(objectives))
                else:
                    masked_weighed_negative_log_likelihood_sum = masked_negative_log_likelihood * class_weights_mask
                    num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted), 1e-6)
                    normed_loss = masked_weighed_negative_log_likelihood_sum / (num_predictions / len(objectives))
            else:
                masked_weighed_negative_log_likelihood_sum = masked_negative_log_likelihood
                num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted), 1e-6)
                normed_loss = masked_weighed_negative_log_likelihood_sum / (num_predictions / len(objectives))

            masked_negative_log_likelihood_sum = tf.reduce_sum(masked_negative_log_likelihood)
            losses.append(normed_loss)
            negative_log_likelihoods.append(masked_negative_log_likelihood_sum)

            for idx, objective in enumerate(objectives):
                with tf.variable_scope(objective["name"]):
                    (token_correct,
                     token_correct_total,
                     sentence_correct,
                     sentence_correct_total) = softmax_metrics(unary_scores[:, :, idx, :len(objective["vocab"])],
                                                               labels=objective_labels[:, :, idx],
                                                               mask=mask[:, :, idx])
                    token_corrects.append(token_correct)
                    token_corrects_total.append(token_correct_total)
                    sentence_corrects.append(sentence_correct)
                    sentence_corrects_total.append(sentence_correct_total)

    else:
        for objective, objective_labels, mask, objective_class_weights in zip(objectives, labels, labels_mask, labels_class_weights):
            with tf.variable_scope(objective["name"]):
                if objective["type"] == "crf":
                    unary_scores = tf.contrib.layers.fully_connected(
                        inputs,
                        len(objective["vocab"]),
                        activation_fn=None
                    )
                    unary_scores_batch_major = tf.transpose(unary_scores, [1, 0, 2])
                    labels_batch_major = tf.transpose(objective_labels, [1, 0])


                    padded_unary_scores_batch_major = tf.cond(tf.greater(tf.shape(unary_scores_batch_major)[1], 1),
                        lambda: unary_scores_batch_major,
                        lambda: tf.pad(unary_scores_batch_major, [[0, 0], [0, 1], [0, 0]]))
                    padded_labels_batch_major = tf.cond(tf.greater(tf.shape(labels_batch_major)[1], 1),
                        lambda: labels_batch_major,
                        lambda: tf.pad(labels_batch_major, [[0, 0], [0, 1]]))

                    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
                        padded_unary_scores_batch_major, padded_labels_batch_major, sequence_lengths
                    )
                    labels_mask_casted = tf.cast(mask, log_likelihood.dtype)
                    masked_log_likelihood = (
                        log_likelihood * labels_mask_casted
                    )
                    masked_negative_log_likelihood_sum = -tf.reduce_sum(masked_log_likelihood)
                    num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted), 1e-6)
                    losses.append(masked_negative_log_likelihood_sum / num_predictions)
                    negative_log_likelihoods.append(masked_negative_log_likelihood_sum)
                    sequence_mask = tf.logical_and(
                        tf.sequence_mask(sequence_lengths),
                        # broadcast the per-example mask across the time dimension:
                        tf.expand_dims(mask, 1)
                    )

                    (token_correct,
                     token_correct_total,
                     sentence_correct,
                     sentence_correct_total) = crf_metrics(unary_scores_batch_major,
                                                           labels=labels_batch_major,
                                                           mask=sequence_mask,
                                                           transition_params=transition_params,
                                                           sequence_lengths=sequence_lengths)
                elif objective["type"] == 'softmax':
                    unary_scores = tf.contrib.layers.fully_connected(
                        inputs,
                        len(objective["vocab"]),
                        activation_fn=None
                    )
                    negative_log_likelihood = sparse_softmax_cross_entropy_with_logits(
                        logits=unary_scores,
                        labels=objective_labels
                    )
                    labels_mask_casted = tf.cast(mask, negative_log_likelihood.dtype)
                    masked_negative_log_likelihood = (
                        negative_log_likelihood * labels_mask_casted
                    )
                    if objective_class_weights is not None:
                        class_weights_mask = tf.gather(objective_class_weights, objective_labels)
                        masked_weighted_negative_log_likelihood = masked_negative_log_likelihood * class_weights_mask
                        masked_negative_log_likelihood_sum = tf.reduce_sum(masked_negative_log_likelihood)

                        if class_weights_normalize:
                            num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted * class_weights_mask), 1e-6)
                        else:
                            num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted), 1e-6)
                        normed_loss = masked_weighted_negative_log_likelihood / num_predictions
                    else:
                        masked_negative_log_likelihood_sum = tf.reduce_sum(masked_negative_log_likelihood)
                        num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted), 1e-6)
                        normed_loss = masked_negative_log_likelihood / num_predictions

                    losses.append(normed_loss)
                    negative_log_likelihoods.append(masked_negative_log_likelihood_sum)

                    (token_correct,
                     token_correct_total,
                     sentence_correct,
                     sentence_correct_total) = softmax_metrics(unary_scores,
                                                               labels=objective_labels,
                                                               mask=mask)
                else:
                    raise ValueError(
                        "unknown objective type %r" % (objective["type"],)
                    )
                token_corrects.append(token_correct)
                token_corrects_total.append(token_correct_total)
                sentence_corrects.append(sentence_correct)
                sentence_corrects_total.append(sentence_correct_total)
    # aggregate metrics for all objectives:
    total_loss = tf.reduce_sum(sum_list(losses))
    tf.summary.scalar("BatchLoss", total_loss)
    neg_log_likelihood_total = sum_list(negative_log_likelihoods)
    tf.summary.scalar("BatchNLL", neg_log_likelihood_total)
    tf.add_to_collection(NLL, neg_log_likelihood_total)
    tf.add_to_collection(NLL_TOTAL, tf.shape(inputs)[1])

    sentence_corrects_total = sum_list(sentence_corrects_total)
    sentence_corrects = sum_list(sentence_corrects)
    tf.add_to_collection(SENTENCE_CORRECT_ALL, sentence_corrects)
    tf.add_to_collection(SENTENCE_CORRECT_ALL_TOTAL, sentence_corrects_total)

    token_corrects_total = sum_list(token_corrects_total)
    token_corrects = sum_list(token_corrects)
    tf.add_to_collection(TOKEN_CORRECT_ALL, token_corrects)
    tf.add_to_collection(TOKEN_CORRECT_ALL_TOTAL, token_corrects_total)
    return total_loss


def build_model(name,
                trainable,
                features,
                feature_index2words,
                objectives,
                keep_prob,
                input_keep_prob,
                hidden_sizes,
                freeze_rate,
                freeze_rate_anneal,
                solver,
                cudnn,
                fused,
                faux_cudnn,
                class_weights,
                class_weights_normalize,
                class_weights_clipval,
                lr,
                weight_noise,
                anneal_rate,
                clip_norm):
    # mixed output fusing is currently unsupported
    if fused and any(obj["type"] != "softmax" for obj in objectives):
        raise ValueError("cannot fuse outputs and use non-softmax output.")
    # clear all existing collections to ensure every collection is
    # created fresh
    graph = tf.get_default_graph()
    for collection_name in graph.get_all_collection_keys():
        graph.clear_collection(collection_name)

    # build a model under the model's name to prevent collisions
    # when multiple models are restored simultaneously
    with tf.variable_scope(name):
        global_step = tf.Variable(0, trainable=False, name="global_step")
        tf.add_to_collection(GLOBAL_STEP, global_step)
        # model placeholders:
        (input_placeholders,
         labels,
         labels_mask,
         labels_class_weights,
         sequence_lengths,
         is_training) = build_inputs(features,
                                     objectives=objectives,
                                     fused=fused,
                                     class_weights=class_weights,
                                     class_weights_clipval=class_weights_clipval)
        embed = build_embed(input_placeholders,
                            features=features,
                            index2words=feature_index2words,
                            is_training=is_training,
                            keep_prob=input_keep_prob)
        hiddens = embed
        if len(hidden_sizes) > 0:
            hiddens = build_recurrent(hiddens,
                                      cudnn=cudnn,
                                      faux_cudnn=faux_cudnn,
                                      hidden_sizes=hidden_sizes,
                                      keep_prob=keep_prob,
                                      weight_noise=weight_noise,
                                      is_training=is_training)

        loss = build_loss(hiddens,
                          objectives=objectives,
                          fused=fused,
                          labels=labels,
                          labels_mask=labels_mask,
                          labels_class_weights=labels_class_weights,
                          class_weights_normalize=class_weights_normalize,
                          sequence_lengths=sequence_lengths)
        if trainable:
            learning_rate = tf.train.exponential_decay(lr, global_step,
                                                       33000, anneal_rate, staircase=True)

            if solver == "adam":
                optimizer = LazyAdamOptimizer(learning_rate)
            elif solver == "sgd":
                optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            else:
                raise ValueError("Unknown solver %r." % (solver))

            grad_vars = optimizer.compute_gradients(loss)
            if clip_norm > 0:
                grad_vars = [
                    (grad if isinstance(grad, tf.IndexedSlices)
                     else tf.clip_by_norm(grad, clip_norm), var)
                    for grad, var in grad_vars
                ]
            train_op = optimizer.apply_gradients(grad_vars, global_step=global_step)
        else:
            train_op = tf.no_op()
        tf.add_to_collection(TRAIN_OP, train_op)
        tf.add_to_collection(TRAIN_SUMMARIES, tf.summary.merge_all())


def restore_session(session,
                    path,
                    replace_to=None,
                    replace_from=None,
                    verbose=False,
                    use_metagraph=True,
                    only_features=False):
    """
    Call restore on tf.train.Saver on a specific path to store all the
    variables of the current tensorflow session to a file for later restoring.

    Arguments:
        session : tf.Session
        path : str, place containing the session data to restore
        verbose : bool, print status messages.
        use_metagraph : bool, restore by re-creating saved metagraph.

    Returns:
        bool : success or failure of the restoration
    """
    makedirs(path, exist_ok=True)
    if not path.endswith("/"):
        path = path + "/"
    checkpoint = tf.train.get_checkpoint_state(path)
    if verbose:
        print("Looking for saved session under %r" % (path,), flush=True)
    if checkpoint is None or checkpoint.model_checkpoint_path is None:
        if verbose:
            print("No saved session found", flush=True)
        return False
    fname = basename(checkpoint.model_checkpoint_path)
    if verbose:
        print("Restoring saved session from %r" % (join(path, fname),), flush=True)

    if use_metagraph:
        param_saver = tf.train.import_meta_graph(join(path, fname + ".meta"),
            clear_devices=True)
        missing_vars = []
    else:
        if only_features:
            to_restore = {}
            whitelist = ["embedding", "/RNN/", "/RNNParams", "CharacterConvolution", "HighwayLayer"]
            for var in tf.global_variables():
                if any(keyword in var.name for keyword in whitelist):
                    to_restore[var.name[:-2]] = var
            param_saver = tf.train.Saver(to_restore)
            missing_vars = []
        else:
            if replace_to is not None and replace_from is not None:
                to_restore = {}
                for var in tf.global_variables():
                    var_name = var.name[:var.name.rfind(":")]
                    old_name = var_name.replace(replace_to, replace_from)
                    to_restore[old_name] = var
                param_saver = tf.train.Saver(to_restore)
                missing_vars = []
            else:
                reader = tf.train.NewCheckpointReader(join(path, fname))
                saved_shapes = reader.get_variable_to_shape_map()
                found_vars = [var for var in tf.global_variables()
                              if var.name.split(':')[0] in saved_shapes]
                missing_vars = [var for var in tf.global_variables()
                                if var.name.split(':')[0] not in saved_shapes]
                param_saver = tf.train.Saver(found_vars)
    param_saver.restore(session, join(path, fname))
    session.run([var.initializer for var in missing_vars])
    return True
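
# Example usage (illustrative sketch; the checkpoint directory is hypothetical):
#
#     with tf.Session() as session:
#         restored = restore_session(session, "/tmp/tagger_checkpoints",
#                                    verbose=True, use_metagraph=True)
#         if not restored:
#             session.run(tf.global_variables_initializer())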


def bidirectional_dynamic_rnn(cell, inputs, dtype, time_major=True, swap_memory=False):
    with tf.variable_scope("forward"):
        out_fwd, final_fwd = tf.nn.dynamic_rnn(
            cell,
            inputs,
            time_major=time_major,
            dtype=dtype,
            swap_memory=swap_memory
        )

    if time_major:
        reverse_axis = 0
    else:
        reverse_axis = 1

    with tf.variable_scope("backward"):
        out_bwd, final_bwd = tf.nn.dynamic_rnn(
            cell,
            reverse(inputs, axis=reverse_axis),
            time_major=time_major,
            dtype=dtype,
            swap_memory=swap_memory
        )

    out_bwd = reverse(out_bwd, axis=reverse_axis)
    return concat([out_fwd, out_bwd], axis=2), (final_fwd, final_bwd)
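
# Example (illustrative sketch; shapes are hypothetical). With time-major
# inputs of shape [time, batch, features], the forward and backward outputs
# are concatenated along the feature axis:
#
#     inputs = tf.placeholder(tf.float32, [None, None, 100])  # [time, batch, features]
#     cell = TFLSTMCell(128)
#     outputs, (final_fwd, final_bwd) = bidirectional_dynamic_rnn(
#         cell, inputs, dtype=tf.float32, time_major=True)
#     # outputs has shape [time, batch, 2 * 128]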


def get_embedding_lookup(size, dim, dtype, reuse=None, trainable=True):
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
        W = tf.get_variable(
            name="embedding",
            shape=[size, dim],
            dtype=dtype,
            initializer=tf.random_uniform_initializer(
                -1.0 / math.sqrt(dim),
                1.0 / math.sqrt(dim)
            ),
            trainable=trainable
        )
        return W


def embedding_lookup(inputs,
                     size,
                     dim,
                     dtype,
                     reuse=None,
                     mask_negative=False,
                     trainable=True,
                     place_on_cpu_if_big=True):
    """
    Construct an Embedding layer that gathers
    elements from a matrix with `size` rows
    and `dim` columns using the indices stored in `inputs`.

    Arguments:
    ----------
        inputs : tf.Tensor, of integer type
        size : int, how many symbols in the lookup table
        dim : int, how many columns per symbol.
        dtype : data type for the lookup table (e.g. tf.float32)
        reuse : bool, (default None) whether the lookup table
            was already used before (thus this is weight sharing).
        mask_negative : bool, (default False) should -1s in the
            lookup input indicate padding (e.g. no lookup),
            and thus should those values be masked out post-lookup.
        trainable : bool (default True), whether the parameters of
            this lookup table can be backpropagated into (e.g.
            for Glove word vectors that are fixed pre-trained, this
            can be set to False).
        place_on_cpu_if_big : bool, if matrix is big, store it on cpu.
    Returns:
    --------
        tf.Tensor, result of tf.nn.embedding_lookup(LookupTable, inputs)
    """
    W = get_embedding_lookup(size, dim, dtype, reuse, trainable=trainable)
    if mask_negative:
        embedded = tf.nn.embedding_lookup(W, tf.maximum(inputs, 0))
        null_mask = tf.expand_dims(
            tf.cast(
                tf.not_equal(inputs, -1),
                dtype
            ),
            -1
        )
        return embedded * null_mask
    else:
        return tf.nn.embedding_lookup(W, inputs)
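
# Example (illustrative sketch; sizes are hypothetical):
#
#     token_ids = tf.placeholder(tf.int32, [None, None])  # -1 marks padding
#     embedded = embedding_lookup(token_ids, size=10000, dim=128,
#                                 dtype=tf.float32, mask_negative=True)
#     # rows looked up for -1 indices are zeroed out by the mask.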


def _get_sharded_variable(name, shape, dtype, num_shards):
    """Get a list of sharded variables with the given dtype."""
    if num_shards > shape[0]:
        raise ValueError("Too many shards: shape=%s, num_shards=%d" %
                         (shape, num_shards))
    unit_shard_size = int(math.floor(shape[0] / num_shards))
    remaining_rows = shape[0] - unit_shard_size * num_shards

    shards = []
    for i in range(num_shards):
        current_size = unit_shard_size
        if i < remaining_rows:
            current_size += 1
        shards.append(
            tf.get_variable(
                name + "_%d" % i,
                [current_size] + shape[1:],
                dtype=dtype
            )
        )
    return shards


def _get_concat_variable(name, shape, dtype, num_shards):
    """Get a sharded variable concatenated into one tensor."""
    sharded_variable = _get_sharded_variable(name, shape, dtype, num_shards)
    if len(sharded_variable) == 1:
        return sharded_variable[0]

    concat_name = name + "/concat"
    concat_full_name = tf.get_variable_scope().name + "/" + concat_name + ":0"
    for value in tf.get_collection(tf.GraphKeys.CONCATENATED_VARIABLES):
        if value.name == concat_full_name:
            return value

    concat_variable = tf.concat(sharded_variable, 0, name=concat_name)
    tf.add_to_collection(tf.GraphKeys.CONCATENATED_VARIABLES, concat_variable)
    return concat_variable
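
# Example (illustrative sketch): with shape=[10, 3] and num_shards=4,
# _get_sharded_variable creates shards of 3, 3, 2 and 2 rows, and
# _get_concat_variable stitches them back into a single [10, 3] tensor
# (caching the concatenation in the CONCATENATED_VARIABLES collection).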


class SequenceModel(object):
    def __init__(self,
                 objectives,
                 features,
                 feature_index2words,
                 hidden_sizes,
                 keep_prob,
                 lr,
                 solver,
                 seed=1234,
                 input_keep_prob=0.7,
                 clip_norm=-1,
                 name="SequenceTagger",
                 cudnn=False,
                 anneal_rate=0.99,
                 trainable=True,
                 weight_noise=0.0,
                 class_weights_normalize=False,
                 faux_cudnn=False,
                 class_weights=False,
                 class_weights_clipval=1000.0,
                 freeze_rate=1.0,
                 fused=False,
                 freeze_rate_anneal=0.8,
                 create_variables=True):
        if fused and objectives[0]["type"] == "crf":
            fused = False

        self.keep_prob = keep_prob
        self.input_keep_prob = input_keep_prob
        self.hidden_sizes = hidden_sizes
        self.name = name
        self.objectives = objectives
        self.features = features
        self.feature_index2words = feature_index2words
        self.seed = seed
        self.lr = lr
        self.fused = fused
        self.weight_noise = weight_noise
        self.anneal_rate = anneal_rate
        self.clip_norm = clip_norm
        self.solver = solver
        self.class_weights_normalize = class_weights_normalize
        self.class_weights = class_weights
        self.class_weights_clipval = class_weights_clipval
        self.rng = np.random.RandomState(seed)
        self.cudnn = cudnn
        self.feature_word2index = [
            {w: k for k, w in enumerate(index2word)} if index2word is not None else None
            for index2word in self.feature_index2words
        ]
        self.label2index = [
            {w: k for k, w in enumerate(objective["vocab"])}
            for objective in self.objectives
        ]

        if create_variables:
            # 1) build graph here (TF functional code pattern)
            build_model(name=self.name,
                        trainable=trainable,
                        objectives=self.objectives,
                        features=self.features,
                        feature_index2words=self.feature_index2words,
                        hidden_sizes=self.hidden_sizes,
                        keep_prob=self.keep_prob,
                        solver=self.solver,
                        freeze_rate=freeze_rate,
                        class_weights_normalize=self.class_weights_normalize,
                        class_weights=self.class_weights,
                        class_weights_clipval=self.class_weights_clipval,
                        freeze_rate_anneal=freeze_rate_anneal,
                        cudnn=self.cudnn,
                        lr=self.lr,
                        fused=self.fused,
                        weight_noise=self.weight_noise,
                        anneal_rate=self.anneal_rate,
                        input_keep_prob=self.input_keep_prob,
                        faux_cudnn=faux_cudnn,
                        clip_norm=self.clip_norm)

        # 2) and use meta graph to recover these fields:
        self.recover_graph_variables()


    def recover_graph_variables(self):
        """Use TF meta graph to obtain key metrics
        and outputs from model."""
        self.labels = tf.get_collection(LABEL_PLACEHOLDERS)
        self.labels_mask = tf.get_collection(LABEL_MASK_PLACEHOLDERS)
        self.input_placeholders = tf.get_collection(INPUT_PLACEHOLDERS)
        self.sequence_lengths = tf.get_collection(SEQUENCE_LENGTHS)[0]
        self.decoded = tf.get_collection(DECODED)
        self.decoded_scores = tf.get_collection(DECODED_SCORES)
        self.unary_scores = tf.get_collection(UNARY_SCORES)

        self.token_correct = tf.get_collection(TOKEN_CORRECT)
        self.token_correct_total = tf.get_collection(TOKEN_CORRECT_TOTAL)

        self.sentence_correct = tf.get_collection(SENTENCE_CORRECT)
        self.sentence_correct_total = tf.get_collection(SENTENCE_CORRECT_TOTAL)

        self.token_correct_all = tf.get_collection(TOKEN_CORRECT_ALL)[0]
        self.token_correct_all_total = tf.get_collection(TOKEN_CORRECT_ALL_TOTAL)[0]
        self.sentence_correct_all = tf.get_collection(SENTENCE_CORRECT_ALL)[0]
        self.sentence_correct_all_total = tf.get_collection(SENTENCE_CORRECT_ALL_TOTAL)[0]

        self.true_positives = tf.get_collection(TRUE_POSITIVES)
        self.false_positives = tf.get_collection(FALSE_POSITIVES)
        self.false_negatives = tf.get_collection(FALSE_NEGATIVES)

        if len(self.true_positives) == 0 and len(self.token_correct) != 0:
            self.true_positives = [None for _ in self.token_correct]
            self.false_positives = [None for _ in self.token_correct]
            self.false_negatives = [None for _ in self.token_correct]

        if len(tf.get_collection(GLOBAL_STEP)) > 0:
            self.global_step = tf.get_collection(GLOBAL_STEP)[0]
        else:
            try:
                self.global_step = tf.get_default_graph().get_tensor_by_name(
                    self.name + "/" + "global_step:0")
            except KeyError:
                self.global_step = tf.Variable(0, trainable=False, name="global_step")
            tf.add_to_collection(GLOBAL_STEP, self.global_step)

        self.is_training = tf.get_collection(IS_TRAINING)[0]
        self.noop = tf.no_op()
        self.train_op = tf.get_collection(TRAIN_OP)[0]
        train_summaries = tf.get_collection(TRAIN_SUMMARIES)
        self.train_summaries = train_summaries[0] if len(train_summaries) > 0 else None

        self.nll = tf.get_collection(NLL)[0]
        self.nll_total = tf.get_collection(NLL_TOTAL)[0]
        self.saver = tf.train.Saver()



    @classmethod
    def overrideable_fields(cls):
        return [
            "keep_prob",
            "name",
            "lr",
            "clip_norm",
            "class_weights_normalize",
            "class_weights_clipval",
            "cudnn",
            "anneal_rate",
            "weight_noise",
            "input_keep_prob"
        ]

    @classmethod
    def fields_to_save(cls):
        return [
            "hidden_sizes",
            "objectives",
            "name",
            "cudnn",
            "class_weights",
            "features",
            "fused",
            "class_weights_normalize",
            "weight_noise",
            "anneal_rate",
            "feature_index2words",
            "solver",
            "lr",
            "clip_norm",
            "keep_prob",
            "input_keep_prob",
            "class_weights_clipval"
        ]

    def predict(self, session, feed_dict):
        feed_dict[self.is_training] = False
        outputs, outputs_probs = session.run(
            (self.decoded, self.decoded_scores), feed_dict
        )
        predictions_out = {}
        for value, val_prob, objective in zip(outputs, outputs_probs, self.objectives):
            predictions_out[objective["name"]] = (value, val_prob)
        return predictions_out

    def predict_proba(self, session, feed_dict):
        feed_dict[self.is_training] = False
        outputs = session.run(
            self.unary_scores, feed_dict
        )
        predictions_out = {}
        for value, objective in zip(outputs, self.objectives):
            predictions_out[objective["name"]] = value
        return predictions_out

    def save(self, session, path):
        makedirs(path, exist_ok=True)
        with open(join(path, "model.json"), "wt") as fout:
            save_dict = {}
            for field in type(self).fields_to_save():
                save_dict[field] = getattr(self, field)
            json.dump(save_dict, fout)

        with open(join(path, "rng.pkl"), "wb") as fout:
            pickle.dump(self.rng, fout)

        save_session(session, self.saver, path, verbose=True)

    @classmethod
    def load(cls, session, path, args=None, verbose=True, trainable=True,
             rebuild_graph=False, faux_cudnn=False, replace_to=None, replace_from=None):
        """Convenience method for using a tensorflow session to reload
        a previously saved + serialized model from disk."""
        with open(join(path, "model.json"), "rt") as fin:
            model_props = json.load(fin)

        # update fields based on CLI:
        if args is not None:
            ex_fields = explicitly_set_fields()
            for field in cls.overrideable_fields():
                if field in ex_fields:
                    model_props[field] = getattr(args, field)

        # prune old fields based on changes to saveable fields:
        relevant_props = {}
        for field in cls.fields_to_save():
            if field in model_props:
                relevant_props[field] = model_props[field]

        relevant_props["trainable"] = trainable
        relevant_props["faux_cudnn"] = faux_cudnn

        if rebuild_graph:
            print("Using rebuild_graph mode: creating a new graph.", flush=True)
            relevant_props["create_variables"] = True
            model = cls(**relevant_props)
            restore_session(
                session, path,
                replace_to=replace_to,
                replace_from=replace_from,
                verbose=verbose,
                use_metagraph=False
            )
        else:
            if model_props.get("cudnn", False):
                import tensorflow.contrib.cudnn_rnn
            relevant_props["create_variables"] = False
            restore_session(
                session, path,
                verbose=verbose,
                use_metagraph=True
            )
            model = cls(**relevant_props)

        rng_path = join(path, "rng.pkl")
        if exists(rng_path):
            # apply the saved random number generator to this
            # model:
            with open(rng_path, "rb") as fin:
                model.rng = pickle.load(fin)
        return model
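
# Example usage (illustrative sketch; the model directory is hypothetical):
#
#     with tf.Session() as session:
#         model = SequenceModel.load(session, "/path/to/saved_model",
#                                    trainable=False)
#         # feed dicts are typically built by iter_batches_single_threaded,
#         # then passed to model.predict / model.predict_proba.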


def make_path_absolute(obj, basepath):
    copied = obj.copy()
    for key in ["path", "vocab"]:
        if key in copied:
            copied[key] = join(basepath, copied[key])
    return copied


class Config(object):
    def __init__(self, datasets, features, objectives,
                 wikidata_path, classification_path):
        assert(len(features) > 0)
        self.datasets = datasets
        self.features = features
        self.objectives = objectives
        self.classifications = None
        self.wikidata_path = wikidata_path
        self.classification_path = classification_path

        # build the objective names:
        self._named_objectives = [obj["name"] for obj in self.objectives]

    @classmethod
    def load(cls, path):
        with open(path, "rt") as fin:
            config = json.load(fin)
        config_dirname = dirname(path)
        return cls(
            datasets=[make_path_absolute(dataset, config_dirname) for dataset in config['datasets']],
            features=[make_path_absolute(feat, config_dirname) for feat in config['features']],
            objectives=[make_path_absolute(objective, config_dirname) for objective in config['objectives']],
            wikidata_path=config.get("wikidata_path", None),
            classification_path=(
                join(config_dirname, config.get("classification_path", None))
                if "classification_path" in config else None)
        )

    def load_dataset_separate(self, dataset_type):
        paths = [dataset for dataset in self.datasets if dataset["type"] == dataset_type]
        all_examples = {}
        for dataset in paths:
            _, extension = splitext(dataset["path"])
            if extension == ".h5" or extension == ".hdf5":
                if self.classifications is None:
                    if self.wikidata_path is None or self.classification_path is None:
                        raise ValueError("missing wikidata_path and "
                                         "classification_path, cannot "
                                         "construct H5Dataset.")
                    self.classifications = ClassificationHandler(
                        self.wikidata_path,
                        self.classification_path
                    )
                examples = H5Dataset(
                    dataset["path"],
                    dataset["x"],
                    dataset["y"],
                    self._named_objectives,
                    ignore_value=dataset.get('ignore', None),
                    classifications=self.classifications)
            else:
                examples = TSVDataset(
                    dataset["path"],
                    dataset["x"],
                    dataset["y"],
                    self._named_objectives,
                    comment=dataset.get('comment', '#'),
                    ignore_value=dataset.get('ignore', None),
                    retokenize=dataset.get('retokenize', False))
            title = dataset["path"].split('/')[-1].split(".")[0]
            name = title
            iteration = 1
            while name in all_examples:
                name = title + "-%d" % (iteration,)
                iteration += 1
            all_examples[name] = examples
        return all_examples

    def load_dataset(self, dataset_type, merge=True):
        datasets = self.load_dataset_separate(dataset_type)
        if merge:
            return CombinedDataset(list(datasets.values()))
        return datasets
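
# Example config file (illustrative sketch; only the keys read by Config.load
# and load_dataset_separate are shown, and the feature/objective fields are
# hypothetical -- their exact contents depend on build_embed and the data):
#
#     {
#         "datasets": [{"type": "train", "path": "train.tsv", "x": 0, "y": [1]},
#                      {"type": "dev", "path": "dev.tsv", "x": 0, "y": [1]}],
#         "features": [{"type": "word", "dimension": 200}],
#         "objectives": [{"name": "POS", "type": "crf", "vocab": "pos.vocab"}],
#         "wikidata_path": "wikidata",
#         "classification_path": "classifications"
#     }
#
# Relative "path" and "vocab" entries are made absolute with respect to the
# config file's directory (see make_path_absolute).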


def boolean_argument(parser, name, default):
    parser.add_argument("--" + name, action="store_true", default=default)
    parser.add_argument("--no" + name, action="store_false", dest=name)


def parse_args(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('config', type=str)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--anneal_rate', type=float, default=0.99)
    parser.add_argument('--clip_norm', type=float, default=-1)
    parser.add_argument('--weight_noise', type=float, default=0.0)
    parser.add_argument('--hidden_sizes', type=int, nargs="*", default=[200, 200])
    parser.add_argument('--load_dir', type=str, default=None)
    parser.add_argument('--restore_input_features', type=str, default=None)
    parser.add_argument('--improvement_key', type=str, default="token_correct")
    parser.add_argument('--freeze_rate', type=float, default=1.0)
    parser.add_argument('--freeze_rate_anneal', type=float, default=0.8)
    parser.add_argument('--save_dir', type=str, default=None)
    parser.add_argument('--max_epochs', type=int, default=1000)
    parser.add_argument('--test_every', type=int, default=10000,
        help="Number of training iterations after which testing should occur.")
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--max_patience', type=int, default=10)
    parser.add_argument('--class_weights_clipval', type=float, default=1000.0)
    parser.add_argument('--device', type=str, default="gpu:0")
    parser.add_argument('--keep_prob', type=float, default=0.5)
    parser.add_argument('--input_keep_prob', type=float, default=0.7)
    parser.add_argument('--solver', type=str, default="adam",
                        choices=["adam", "sgd"])
    parser.add_argument("--name", type=str, default="SequenceTagger")
    parser.add_argument("--old_name", type=str, default=None)
    boolean_argument(parser, "cudnn", True)
    boolean_argument(parser, "faux_cudnn", False)
    boolean_argument(parser, "class_weights", False)
    boolean_argument(parser, "rebuild_graph", False)
    boolean_argument(parser, "class_weights_normalize", False)
    boolean_argument(parser, "fused", True)
    boolean_argument(parser, "report_metrics_per_axis", True)
    boolean_argument(parser, "report_class_f1", False)
    return parser.parse_args(args=args)
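
# Example invocation (illustrative sketch; the script and file names are
# hypothetical):
#
#     python train.py config.json --save_dir /tmp/tagger \
#         --hidden_sizes 200 200 --batch_size 128 --max_epochs 100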


def get_vocab(dataset, max_vocab=-1, extra_words=None):
    index2word = []
    occurrence = {}
    for el in dataset:
        if el not in occurrence:
            index2word.append(el)
            occurrence[el] = 1
        else:
            occurrence[el] += 1
    index2word = sorted(index2word, key=lambda x: occurrence[x], reverse=True)
    if max_vocab > 0:
        index2word = index2word[:max_vocab]
    if extra_words is not None:
        index2word = extra_words + index2word
    return index2word
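
# Example (illustrative):
#
#     get_vocab(["a", "b", "a", "c", "a"], max_vocab=2, extra_words=["<UNK>"])
#     # -> ["<UNK>", "a", "b"]: words sorted by frequency, truncated to
#     # max_vocab, with extra_words prepended.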


def get_objectives(objectives, dataset):
    out = []
    for obj_idx, objective in enumerate(objectives):
        if "vocab" in objective:
            with open(objective["vocab"], "rt") as fin:
                vocab = fin.read().splitlines()
        else:
            vocab = get_vocab((w[obj_idx] for _, y in dataset for w in y if w[obj_idx] is not None), -1)

        out.append(
            {
                "vocab": vocab,
                "type": objective["type"],
                "name": objective["name"]
            }
        )
    return out


def merge_all_metrics(metrics):
    out = {}
    for key, metric in metrics.items():
        for subkey, submetric in metric.items():
            if len(key) > 0:
                out[key + "_" + subkey] = submetric
                if subkey not in out:
                    out[subkey] = submetric
                else:
                    out[subkey] += submetric
            else:
                out[subkey] = submetric
    return out
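
# Example (illustrative; the dataset names are hypothetical): given
# {"ptb": {"nll": 1.0, "nll_total": 10}, "notes": {"nll": 2.0, "nll_total": 5}},
# the merged output contains the prefixed keys ("ptb_nll", "notes_nll", ...)
# alongside summed unprefixed keys ("nll": 3.0, "nll_total": 15).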


def log_outcome(logger, outcome, step, name):
    for k, v in sorted(outcome.items()):
        if "total" in k:
            continue
        else:
            total = outcome[k + "_total"]
            if total == 0:
                continue
            logger.log(k, v / total, step=step)
    logger.writer.flush()


def compute_f1(metrics, objectives, report_class_f1):
    total_f1 = 0.0
    total_precision = 0.0
    total_recall = 0.0
    total = 0
    for objective in objectives:
        name = objective["name"]
        key = "%s_true_positives" % (name,)
        if key not in metrics:
            continue
        tp = metrics[key]
        fp = metrics["%s_false_positives" % (name,)]
        fn = metrics["%s_false_negatives" % (name,)]
        del metrics[key]
        del metrics["%s_false_positives" % (name,)]
        del metrics["%s_false_negatives" % (name,)]



        precision = 1.* tp / np.maximum((tp + fp), 1e-6)
        recall = 1. * tp / np.maximum((tp + fn), 1e-6)
        f1 = 2.0 * precision * recall / np.maximum((precision + recall), 1e-6)

        support = tp + fn

        full_f1 = np.average(f1, weights=support) * 100.0
        full_recall = np.average(recall, weights=support) * 100.0
        full_precision = np.average(precision, weights=support) * 100.0

        total_f1 += full_f1
        total_recall += full_recall
        total_precision += full_precision
        total += 1
        if report_class_f1:
            print("F1 %s: %r" % (name, full_f1))
            print("Name\tF1\tTP\tFP\tFN")
            has_support = support > 0
            rows = zip([label for label, keep in zip(objective["vocab"], has_support)
                        if keep],
                       f1[has_support], tp[has_support],
                       fp[has_support], fn[has_support])
            for val, f1_val, val_tp, val_fp, val_fn in rows:
                print("%s\t%r\t%d\t%d\t%d" % (
                    val, f1_val, val_tp, val_fp, val_fn))
            print("")
    if total > 0:
        metrics["F1"] = total_f1
        metrics["recall"] = total_recall
        metrics["precision"] = total_precision
        metrics["F1_total"] = total
        metrics["recall_total"] = total
        metrics["precision_total"] = total


def accuracy(model, session, datasets, batch_size, train,
             report_metrics_per_axis, report_class_f1,
             callback=None,
             callback_period=None, writer=None):
    pbar = get_progress_bar("train" if train else "validation", item="batches")
    if not isinstance(datasets, dict):
        datasets = {'':datasets}
    all_metrics_agg = {}

    if callback is not None:
        if callback_period is None:
            raise ValueError("callback_period cannot be None if "
                             "callback is used.")
    else:
        callback_period = None

    if train:
        train_op = model.train_op
    else:
        train_op = model.noop
    is_training = model.is_training
    metrics = {"nll": model.nll, "nll_total": model.nll_total}
    summaries = []

    if not train:
        metric_iter = zip(
            model.objectives,
            model.token_correct,
            model.token_correct_total,
            model.sentence_correct,
            model.sentence_correct_total,
            model.true_positives,
            model.false_positives,
            model.false_negatives
        )
        for metric_vars in metric_iter:
            (
                objective,
                token_correct,
                token_correct_total,
                sentence_correct,
                sentence_correct_total,
                true_positives,
                false_positives,
                false_negatives
            ) = metric_vars
            name = objective["name"]
            if report_metrics_per_axis:
                metrics["%s_token_correct" % (name,)] = token_correct
                metrics["%s_token_correct_total" % (name,)] = token_correct_total
                metrics["%s_sentence_correct" % (name,)] = sentence_correct
                metrics["%s_sentence_correct_total" % (name,)] = sentence_correct_total
            if true_positives is not None:
                metrics["%s_true_positives" % (name,)] = true_positives
                metrics["%s_false_positives" % (name,)] = false_positives
                metrics["%s_false_negatives" % (name,)] = false_negatives
        metrics["token_correct"] = model.token_correct_all
        metrics["token_correct_total"] = model.token_correct_all_total
        metrics["sentence_correct"] = model.sentence_correct_all
        metrics["sentence_correct_total"] = model.sentence_correct_all_total
        summaries = []
    else:
        if writer is not None and model.train_summaries is not None:
            summaries = model.train_summaries

    metrics_values = [v for _, v in sorted(metrics.items())]
    metrics_names = [name for name, _ in sorted(metrics.items())]
    outputs_val = [train_op, model.global_step, summaries, metrics_values]
    for title, dataset in datasets.items():
        batches = iter_batches_single_threaded(
            model=model,
            dataset=dataset,
            batch_size=batch_size,
            train=train,
            pbar=pbar
        )
        metrics_agg = {}
        iteration = 0
        for feed_dict in batches:
            feed_dict[is_training] = train
            _, step, summary_out, outputs = session.run(outputs_val, feed_dict)
            if writer is not None:
                writer.add_summary(summary_out, step)
            for key, value in zip(metrics_names, outputs[:len(metrics_names)]):
                if key not in metrics_agg:
                    metrics_agg[key] = value
                else:
                    metrics_agg[key] += value
            iteration += 1
            if callback_period is not None and iteration % callback_period == 0:
                callback(iteration)

            if np.isnan(metrics_agg['nll']):
                print("loss is NaN.", flush=True, file=sys.stderr)
                sys.exit(1)

        compute_f1(metrics_agg, model.objectives, report_class_f1)
        all_metrics_agg[title] = metrics_agg
        del batches
    return merge_all_metrics(all_metrics_agg)


def present_outcome(outcome, epoch, name):
    string_rows = []
    for k, v in sorted(outcome.items()):
        if "total" in k:
            continue
        else:
            total = outcome[k + "_total"]
            if total == 0:
                continue
            if "correct" in k:
                string_rows.append(
                    [
                        k,
                        "%.2f%%" % (100.0 * v / total),
                        "(%d correct / %d)" % (v, total)
                    ]
                )
            else:
                string_rows.append(
                    [
                        k,
                        "%.3f" % (v / total),
                        ""
                    ]
                )
    max_len_cols = [
        max(len(row[colidx]) for row in string_rows)
        for colidx in range(len(string_rows[0]))
    ] if len(string_rows) > 0 else []
    rows = []
    for row in string_rows:
        rows.append(
            " ".join(
                [col + " " * (max_len_cols[colidx] - len(col))
                 for colidx, col in enumerate(row)]
            )
        )
    return "\n".join(["Epoch {epoch}: {name}".format(epoch=epoch, name=name)] + rows)


def print_outcome(outcome, objectives, epoch, step, name, logger=None):
    outcome_report = present_outcome(outcome, epoch, name)
    if logger is not None:
        log_outcome(logger, outcome, step, name)
    print(outcome_report)



class SequenceTagger(object):
    def __init__(self, path, device="gpu", faux_cudnn=False, rebuild_graph=False):
        tf.reset_default_graph()
        session_conf = tf.ConfigProto(
            allow_soft_placement=True
        )
        self.session = tf.InteractiveSession(config=session_conf)
        with tf.device(device):
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", UserWarning)
                self._model = SequenceModel.load(
                    self.session,
                    path,
                    args=None,
                    verbose=False,
                    trainable=False,
                    rebuild_graph=rebuild_graph,
                    faux_cudnn=faux_cudnn
                )

    @property
    def objectives(self):
        return self._model.objectives

    def predict_proba(self, tokens):
        blank_labels = tuple(None for _ in self._model.objectives)
        batches = list(iter_batches_single_threaded(
            model=self._model,
            dataset=[
                (tokens, [blank_labels for t in tokens])
            ],
            batch_size=1,
            train=False,
            autoresize=False
        ))
        outputs = []
        batches[0][self._model.is_training] = False
        probs_out = self._model.predict_proba(
            self.session, batches[0]
        )
        return probs_out


    def predict_proba_sentences(self, sentences):
        blank_labels = tuple(None for _ in self._model.objectives)
        batches = iter_batches_single_threaded(
            model=self._model,
            dataset=[
                (sentence, [blank_labels for t in sentence])
                for sentence in sentences
            ],
            batch_size=min(256, len(sentences)),
            train=False,
            autoresize=False
        )
        for batch in batches:
            batch[self._model.is_training] = False
            yield self._model.predict_proba(
                self.session, batch
            )

    def predict_topk_sentences(self, sentences, k=5):
        blank_labels = tuple(None for _ in self._model.objectives)
        batches = iter_batches_single_threaded(
            model=self._model,
            dataset=[
                (sentence, [blank_labels for t in sentence])
                for sentence in sentences
            ],
            batch_size=min(256, len(sentences)),
            train=False,
            autoresize=False
        )
        for batch in batches:
            outputs = self._model.predict_proba(
                self.session, batch
            )
            named_outputs = {}
            for objective in self._model.objectives:
                obj_name = objective["name"]
                tags, scores = outputs[obj_name]
                if objective["type"] == "crf":
                    named_outputs[obj_name] = [
                        [(token, [objective["vocab"][tag]], [score]) for token, tag in zip(tokens, tags)]
                        for tokens, tags, score in zip(sentences, tags, scores)
                    ]
                elif objective["type"] == 'softmax':
                    all_sent_scores = []

                    for tokens, scores in zip(sentences, scores):
                        sent_scores = []
                        for token, token_scores in zip(tokens, scores):
                            topk = np.argsort(token_scores)[::-1][:k]
                            sent_scores.append(
                                (
                                    token,
                                    [objective["vocab"][idx] for idx in topk],
                                    [token_scores[idx] for idx in topk]
                                )
                            )
                        all_sent_scores.append(sent_scores)
                    named_outputs[obj_name] = all_sent_scores
                else:
                    raise ValueError("unknown objective type %r." % (objective["type"],))
            yield named_outputs

    def tag_sentences(self, sentences):
        if len(sentences) == 0:
            return {
                objective["name"]: []
                for objective in self._model.objectives
            }
        blank_labels = tuple(None for _ in self._model.objectives)
        batches = list(iter_batches_single_threaded(
            self._model,
            [
                (sentence, [blank_labels for t in sentence])
                for sentence in sentences
            ],
            batch_size=min(256, len(sentences)),
            train=False,
            autoresize=False
        ))

        named_outputs = {}
        sentence_idx = 0

        for batch in batches:
            outputs = self._model.predict(self.session, batch)
            for objective in self._model.objectives:
                obj_name = objective["name"]
                if obj_name not in named_outputs:
                    named_outputs[obj_name] = []
                tags, scores = outputs[obj_name]
                nsentences = len(tags)
                if objective["type"] == "crf":
                    named_outputs[obj_name].extend([
                        [(token, objective["vocab"][tag], score) for token, tag in zip(tokens, tags)]
                        for tokens, tags, score in zip(sentences[sentence_idx:sentence_idx+nsentences], tags, scores)
                    ])
                elif objective["type"] == 'softmax':
                    named_outputs[obj_name].extend([
                        [(token, objective["vocab"][tag], score)
                         for token, tag, score in zip(tokens, tags, scores)]
                        for tokens, tags, scores in zip(sentences[sentence_idx:sentence_idx+nsentences], tags, scores)
                    ])
                else:
                    raise ValueError("unknown objective type %r." % (objective["type"],))
            sentence_idx += nsentences

        return named_outputs
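
# Example usage (illustrative sketch; the model directory is hypothetical):
#
#     tagger = SequenceTagger("/path/to/saved_model", device="cpu")
#     tagged = tagger.tag_sentences([["The", "cat", "sat"]])
#     # tagged maps each objective name to one list per sentence of
#     # (token, predicted_label, score) triples.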


def count_number_of_parameters():
    return int(sum([np.prod(var.get_shape().as_list())
                    for var in tf.trainable_variables()]))


class TestCallback(object):
    def __init__(self, model, session, dataset, epoch, args, logger):
        self.model = model
        self.session = session
        self.dataset = dataset
        self.epoch = epoch
        self.args = args
        self.logger = logger
        self.report_metrics_per_axis = args.report_metrics_per_axis
        self.report_class_f1 = args.report_class_f1

    def test(self, iteration):
        dev_outcome = accuracy(self.model, self.session, self.dataset, self.args.batch_size,
            train=False, report_metrics_per_axis=self.report_metrics_per_axis,
            report_class_f1=self.report_class_f1)
        print_outcome(dev_outcome, self.model.objectives,
            epoch="{}-{}".format(self.epoch, iteration),
            step=self.session.run(self.model.global_step),
            name="validation",
            logger=self.logger
        )
        if self.args.save_dir is not None:
            self.model.save(self.session, self.args.save_dir)


def compute_epoch(session, model, train_set,
                  validation_set, test_callback, epoch,
                  train_writer, test_writer,
                  args):
    test_callback.epoch = epoch
    train_outcome = accuracy(model,
                             session,
                             train_set,
                             args.batch_size,
                             train=True,
                             callback_period=args.test_every,
                             writer=train_writer.writer if train_writer is not None else None,
                             report_metrics_per_axis=args.report_metrics_per_axis,
                             report_class_f1=args.report_class_f1,
                             callback=test_callback.test)
    global_step = session.run(model.global_step)
    print_outcome(train_outcome,
                  model.objectives,
                  epoch=epoch,
                  name="train",
                  step=global_step,
                  logger=train_writer)
    dev_outcome = accuracy(
        model, session, validation_set, args.batch_size,
        train=False,
        report_metrics_per_axis=args.report_metrics_per_axis,
        report_class_f1=args.report_class_f1)
    print_outcome(dev_outcome,
                  model.objectives,
                  epoch=epoch,
                  step=global_step,
                  name="validation",
                  logger=test_writer)
    if args.save_dir is not None:
        model.save(session, args.save_dir)
    return dev_outcome


def main():
    args = parse_args()
    config = Config.load(args.config)
    validation_set = config.load_dataset("dev", merge=False)
    session_conf = tf.ConfigProto(allow_soft_placement=True)

    with tf.Session(config=session_conf) as session, tf.device(args.device):
        if args.load_dir is not None:
            model = SequenceModel.load(session, args.load_dir,
                args=args, rebuild_graph=args.rebuild_graph, faux_cudnn=args.faux_cudnn,
                replace_to=args.name,
                replace_from=args.old_name)
            dev_outcome = accuracy(
                model, session, validation_set, args.batch_size, train=False,
                report_metrics_per_axis=args.report_metrics_per_axis,
                report_class_f1=args.report_class_f1)
            print_outcome(dev_outcome,
                          model.objectives, 0,
                          name="loaded validation",
                          step=session.run(model.global_step),
                          logger=None)
            # dev_outcome = None
            if args.rebuild_graph and args.save_dir is not None:
                model.save(session, args.save_dir)
            train_set = config.load_dataset("train")
        else:
            # load classes and index2word from a file.
            dev_outcome = None
            train_set = config.load_dataset("train")
            model = SequenceModel(
                objectives=get_objectives(config.objectives, train_set),
                features=config.features,
                feature_index2words=get_feature_vocabs(config.features, train_set, ["<UNK>"]),
                lr=args.lr,
                anneal_rate=args.anneal_rate,
                weight_noise=args.weight_noise,
                freeze_rate=args.freeze_rate,
                freeze_rate_anneal=args.freeze_rate_anneal,
                clip_norm=args.clip_norm,
                hidden_sizes=args.hidden_sizes,
                solver=args.solver,
                fused=args.fused,
                class_weights_normalize=args.class_weights_normalize,
                class_weights=args.class_weights,
                class_weights_clipval=args.class_weights_clipval,
                keep_prob=args.keep_prob,
                input_keep_prob=args.input_keep_prob,
                name=args.name,
                cudnn=args.cudnn,
                faux_cudnn=args.faux_cudnn,
                create_variables=True)
            session.run(tf.global_variables_initializer())
            if args.restore_input_features is not None:
                restore_session(
                    session, args.restore_input_features,
                    verbose=True,
                    use_metagraph=False,
                    only_features=True)

        print("Model has {} trainable parameters.".format(count_number_of_parameters()), flush=True)
        best_dev_score = 0.0
        patience = 0
        best_epoch = 0
        best_outcome = None
        improvement_key = args.improvement_key
        if dev_outcome is not None:
            best_dev_score = dev_outcome[improvement_key]
            best_epoch = -1
            best_outcome = dev_outcome

        if args.save_dir is not None:
            train_writer = Logger(session, tf.summary.FileWriter(join(args.save_dir, "train")))
            test_writer = Logger(session, tf.summary.FileWriter(join(args.save_dir, "test")))
        else:
            train_writer, test_writer = None, None

        test_callback = TestCallback(model,
                                     session,
                                     validation_set,
                                     -1,
                                     args,
                                     logger=test_writer)
        if len(train_set) > 0:
            train_set.set_randomize(True)
            train_set.set_rng(model.rng)
            for epoch in range(args.max_epochs):
                dev_outcome = compute_epoch(
                    session, model,
                    train_set=train_set, validation_set=validation_set,
                    epoch=epoch, test_callback=test_callback,
                    train_writer=train_writer,
                    test_writer=test_writer,
                    args=args)

                if dev_outcome[improvement_key] > best_dev_score:
                    best_dev_score = dev_outcome[improvement_key]
                    best_epoch = epoch
                    best_outcome = dev_outcome
                    patience = 0
                    if args.save_dir is not None:
                        model.save(session, join(args.save_dir, "best"))
                else:
                    patience += 1
                    if patience >= args.max_patience:
                        print("No improvements for {} epochs. Stopping.".format(args.max_patience))
                        break
                del dev_outcome
        print_outcome(
            best_outcome,
            model.objectives,
            epoch=best_epoch,
            name="validation-best",
            step=session.run(model.global_step),
            logger=None)


if __name__ == "__main__":
    main()
