learning/train_type.py
import json
import pickle
import math
import sys
import argparse
import warnings
from os import makedirs
from os.path import basename, join, exists, dirname, splitext, realpath
from wikidata_linker_utils.progressbar import get_progress_bar
from dataset import TSVDataset, CombinedDataset, H5Dataset, ClassificationHandler
from batchifier import (iter_batches_single_threaded,
requires_vocab,
requires_character_convolution,
get_feature_vocabs)
import tensorflow as tf
import numpy as np
try:
RNNCell = tf.nn.rnn_cell.RNNCell
TFLSTMCell = tf.nn.rnn_cell.LSTMCell
MultiRNNCell = tf.nn.rnn_cell.MultiRNNCell
LSTMStateTuple = tf.nn.rnn_cell.LSTMStateTuple
from tensorflow.contrib.cudnn_rnn import CudnnLSTM
except AttributeError:
RNNCell = tf.contrib.rnn.RNNCell
TFLSTMCell = tf.contrib.rnn.LSTMCell
MultiRNNCell = tf.contrib.rnn.MultiRNNCell
LSTMStateTuple = tf.contrib.rnn.LSTMStateTuple
from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnLSTM
from tensorflow.python.client import device_lib
class LazyAdamOptimizer(tf.train.AdamOptimizer):
"""Variant of the Adam optimizer that handles sparse updates more efficiently.
The original Adam algorithm maintains two moving-average accumulators for
each trainable variable; the accumulators are updated at every step.
This class provides lazier handling of gradient updates for sparse variables.
It only updates moving-average accumulators for sparse variable indices that
appear in the current batch, rather than updating the accumulators for all
indices. Compared with the original Adam optimizer, it can provide large
improvements in model training throughput for some applications. However, it
provides slightly different semantics than the original Adam algorithm, and
may lead to different empirical results.
"""
def _apply_sparse(self, grad, var):
beta1_power = tf.cast(self._beta1_power, var.dtype.base_dtype)
beta2_power = tf.cast(self._beta2_power, var.dtype.base_dtype)
lr_t = tf.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = tf.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = tf.cast(self._beta2_t, var.dtype.base_dtype)
epsilon_t = tf.cast(self._epsilon_t, var.dtype.base_dtype)
lr = (lr_t * tf.sqrt(1 - beta2_power) / (1 - beta1_power))
# m := beta1 * m + (1 - beta1) * g_t
# We use a slightly different version of the moving-average update formula
# that does a better job of handling concurrent lockless updates:
# m -= (1 - beta1) * (m - g_t)
m = self.get_slot(var, "m")
m_t_delta = tf.gather(m, grad.indices) - grad.values
m_t = tf.scatter_sub(m, grad.indices,
(1 - beta1_t) * m_t_delta,
use_locking=self._use_locking)
# v := beta2 * v + (1 - beta2) * (g_t * g_t)
# We reformulate the update as:
# v -= (1 - beta2) * (v - g_t * g_t)
v = self.get_slot(var, "v")
v_t_delta = tf.gather(v, grad.indices) - tf.square(grad.values)
v_t = tf.scatter_sub(v, grad.indices,
(1 - beta2_t) * v_t_delta,
use_locking=self._use_locking)
# variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))
m_t_slice = tf.gather(m_t, grad.indices)
v_t_slice = tf.gather(v_t, grad.indices)
denominator_slice = tf.sqrt(v_t_slice) + epsilon_t
var_update = tf.scatter_sub(var, grad.indices,
lr * m_t_slice / denominator_slice,
use_locking=self._use_locking)
return tf.group(var_update, m_t, v_t)
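# Illustrative sketch, never called anywhere in this module: LazyAdamOptimizer
# is used as a drop-in replacement for tf.train.AdamOptimizer. With an
# embedding lookup the gradient arrives as tf.IndexedSlices, so the lazy
# _apply_sparse path above only touches the accumulator rows for ids that
# actually appear in the batch. Sizes and the learning rate are arbitrary.
def _example_lazy_adam_usage():
    embeddings = tf.get_variable("example_embeddings", [1000, 32])
    ids = tf.constant([[1, 2, 3]])
    loss = tf.reduce_sum(tf.nn.embedding_lookup(embeddings, ids))
    return LazyAdamOptimizer(learning_rate=0.001).minimize(loss)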
def get_available_gpus():
local_device_protos = device_lib.list_local_devices()
return [x.name for x in local_device_protos if x.device_type == 'GPU']
def split(values, axis, num_splits, name=None):
return tf.split(values, num_splits, axis=axis, name=name)
def reverse(values, axis):
return tf.reverse(values, [axis])
def sparse_softmax_cross_entropy_with_logits(logits, labels):
return tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=logits, labels=labels)
def concat(values, axis, name=None):
if len(values) == 1:
return values[0]
return tf.concat(values, axis, name=name)
def concat_tensor_array(values, name=None):
return values.stack(name=name)
def batch_gather_3d(values, indices):
return tf.gather(tf.reshape(values, [-1, tf.shape(values)[2]]),
tf.range(0, tf.shape(values)[0]) * tf.shape(values)[1] +
indices)
def batch_gather_2d(values, indices):
return tf.gather(tf.reshape(values, [-1]),
tf.range(0, tf.shape(values)[0]) * tf.shape(values)[1] +
indices)
def viterbi_decode(score, transition_params, sequence_lengths, back_prop=False,
parallel_iterations=1):
"""Decode the highest scoring sequence of tags inside of TensorFlow!!!
This can be used anytime.
Args:
score: A [batch, seq_len, num_tags] matrix of unary potentials.
transition_params: A [num_tags, num_tags] matrix of binary potentials.
sequence_lengths: A [batch] int32 vector of the length of each score
sequence.
Returns:
viterbi: A [batch, seq_len] list of integers containing the highest
scoring tag indices.
viterbi_score: A vector of float containing the score for the Viterbi
sequence.
"""
sequence_lengths = tf.convert_to_tensor(
sequence_lengths, name="sequence_lengths")
score = tf.convert_to_tensor(score, name="score")
transition_params = tf.convert_to_tensor(
transition_params, name="transition_params")
if sequence_lengths.dtype != tf.int32:
sequence_lengths = tf.cast(sequence_lengths, tf.int32)
def condition(t, *args):
"""Stop when full score sequence has been read in."""
return tf.less(t, tf.shape(score)[1])
def body(t, trellis, backpointers, trellis_val):
"""Perform forward viterbi pass."""
v = tf.expand_dims(trellis_val, 2) + tf.expand_dims(transition_params, 0)
new_trellis_val = score[:, t, :] + tf.reduce_max(v, axis=1)
new_trellis = trellis.write(t, new_trellis_val)
new_backpointers = backpointers.write(
t, tf.cast(tf.argmax(v, axis=1), tf.int32))
return t + 1, new_trellis, new_backpointers, new_trellis_val
trellis_arr = tf.TensorArray(score.dtype, size=0,
dynamic_size=True, clear_after_read=False, infer_shape=False)
first_trellis_val = score[:, 0, :]
trellis_arr = trellis_arr.write(0, first_trellis_val)
backpointers_arr = tf.TensorArray(tf.int32, size=0,
dynamic_size=True, clear_after_read=False, infer_shape=False)
backpointers_arr = backpointers_arr.write(0,
tf.zeros_like(score[:, 0, :], dtype=tf.int32))
_, trellis_out, backpointers_out, _ = tf.while_loop(
condition, body,
(tf.constant(1, name="t", dtype=tf.int32), trellis_arr, backpointers_arr, first_trellis_val),
parallel_iterations=parallel_iterations,
back_prop=back_prop)
trellis_out = concat_tensor_array(trellis_out)
backpointers_out = concat_tensor_array(backpointers_out)
# make batch-major:
trellis_out = tf.transpose(trellis_out, [1, 0, 2])
backpointers_out = tf.transpose(backpointers_out, [1, 0, 2])
def condition(t, *args):
return tf.less(t, tf.shape(score)[1])
def body(t, viterbi, last_decision):
backpointers_timestep = batch_gather_3d(
backpointers_out, tf.maximum(sequence_lengths - t, 0))
new_last_decision = batch_gather_2d(
backpointers_timestep, last_decision)
new_viterbi = viterbi.write(t, new_last_decision)
return t + 1, new_viterbi, new_last_decision
last_timestep = batch_gather_3d(trellis_out, sequence_lengths - 1)
# get scores for last timestep of each batch element inside
# trellis:
scores = tf.reduce_max(last_timestep, axis=1)
# get choice index for last timestep:
last_decision = tf.cast(tf.argmax(last_timestep, axis=1), tf.int32)
# decode backwards using backpointers:
viterbi = tf.TensorArray(tf.int32, size=0,
dynamic_size=True, clear_after_read=False, infer_shape=False)
viterbi = viterbi.write(0, last_decision)
_, viterbi_out, _ = tf.while_loop(
condition, body,
(tf.constant(1, name="t", dtype=tf.int32), viterbi, last_decision),
parallel_iterations=parallel_iterations,
back_prop=back_prop)
viterbi_out = concat_tensor_array(viterbi_out)
# make batch-major:
viterbi_out = tf.transpose(viterbi_out, [1, 0])
viterbi_out_fwd = tf.reverse_sequence(
viterbi_out, sequence_lengths, seq_dim=1)
return viterbi_out_fwd, scores
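# Minimal usage sketch for viterbi_decode (hypothetical shapes, never called):
# unary potentials are batch-major [batch, seq_len, num_tags], transitions are
# shared across the batch, and the returned tag sequence is already flipped
# back into left-to-right order by the final tf.reverse_sequence.
def _example_viterbi_decode():
    unary = tf.placeholder(tf.float32, [None, None, 5], name="example_unary")
    transitions = tf.get_variable("example_transitions", [5, 5])
    lengths = tf.placeholder(tf.int32, [None], name="example_lengths")
    tags, tag_scores = viterbi_decode(unary, transitions, lengths)
    return tags, tag_scores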
def sum_list(elements):
total = elements[0]
for el in elements[1:]:
total += el
return total
def explicitly_set_fields():
received = set()
for argument in sys.argv:
if argument.startswith("--"):
received.add(argument[2:])
if argument[2:].startswith("no"):
received.add(argument[4:])
return received
def save_session(session, saver, path, verbose=False):
"""
Call save on tf.train.Saver on a specific path to store all the variables
of the current tensorflow session to a file for later restoring.
    Arguments:
        session : tf.Session
        saver : tf.train.Saver used to write the checkpoint.
        path : str, place to save session
        verbose : bool, print status messages.
"""
makedirs(path, exist_ok=True)
if not path.endswith("/"):
path = path + "/"
path = join(path, "model.ckpt")
if verbose:
print("Saving session under %r" % (path,), flush=True)
saver.save(session, path)
print("Saved", flush=True)
### constants for saving & loading
# model config:
OBJECTIVE_NAMES = "OBJECTIVE_NAMES"
OBJECTIVE_TYPES = "OBJECTIVE_TYPES"
# inputs:
INPUT_PLACEHOLDERS = "INPUT_PLACEHOLDERS"
LABEL_PLACEHOLDERS = "LABEL_PLACEHOLDERS"
LABEL_MASK_PLACEHOLDERS = "LABEL_MASK_PLACEHOLDERS"
TRAIN_OP = "TRAIN_OP"
SEQUENCE_LENGTHS = "SEQUENCE_LENGTHS"
IS_TRAINING = "IS_TRAINING"
# outputs:
DECODED = "DECODED"
DECODED_SCORES = "DECODED_SCORES"
UNARY_SCORES = "UNARY_SCORES"
# per objective metrics:
TOKEN_CORRECT = "TOKEN_CORRECT"
TOKEN_CORRECT_TOTAL = "TOKEN_CORRECT_TOTAL"
SENTENCE_CORRECT = "SENTENCE_CORRECT"
SENTENCE_CORRECT_TOTAL = "SENTENCE_CORRECT_TOTAL"
# aggregate metrics over all objectives
NLL = "NLL"
NLL_TOTAL = "NLL_TOTAL"
TOKEN_CORRECT_ALL = "TOKEN_CORRECT_ALL"
TOKEN_CORRECT_ALL_TOTAL = "TOKEN_CORRECT_ALL_TOTAL"
SENTENCE_CORRECT_ALL = "SENTENCE_CORRECT_ALL"
SENTENCE_CORRECT_ALL_TOTAL = "SENTENCE_CORRECT_ALL_TOTAL"
CONFUSION_MATRIX = "CONFUSION_MATRIX"
GLOBAL_STEP = "global_step"
SUMMARIES_ASSIGNS = "SUMMARIES_ASSIGNS"
SUMMARIES_PLACEHOLDERS = "SUMMARIES_PLACEHOLDERS"
SUMMARIES_NAMES = "SUMMARIES_NAMES"
TRAIN_SUMMARIES = "TRAIN_SUMMARIES"
TRUE_POSITIVES = "TRUE_POSITIVES"
FALSE_POSITIVES = "FALSE_POSITIVES"
FALSE_NEGATIVES = "FALSE_NEGATIVES"
def maybe_dropout(inputs, keep_prob, is_training):
return tf.cond(is_training,
lambda : tf.nn.dropout(inputs, keep_prob),
lambda : inputs
) if keep_prob < 1 else inputs
def compute_sentence_correct(correct, sequence_mask):
any_label = tf.reduce_max(tf.cast(sequence_mask, tf.int32), 1)
sentence_correct_total = tf.reduce_sum(any_label)
# is 1 when all is correct, 0 otherwise
sentence_correct = tf.reduce_sum(tf.reduce_prod(
tf.cast(
tf.logical_or(correct, tf.logical_not(sequence_mask)),
tf.int32
),
1
) * any_label)
return sentence_correct, sentence_correct_total
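# Tiny worked example (never called): with the values below every *labelled*
# token is predicted correctly, so the sentence counts as correct even though
# the middle, masked-out token disagrees; the returned total only counts
# sentences that contain at least one labelled token.
def _example_sentence_correct():
    correct = tf.constant([[True, False, True]])
    sequence_mask = tf.constant([[True, False, True]])
    # -> sentence_correct == 1, sentence_correct_total == 1
    return compute_sentence_correct(correct, sequence_mask)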
def lstm_activation(inputs, input_h, input_c, W, b, activation,
                    forget_bias=0.0, keep_prob=1.0, is_training=False):
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
cell_inputs = concat([inputs, input_h], axis=1)
lstm_matrix = tf.nn.xw_plus_b(cell_inputs, W, b)
preactiv = split(lstm_matrix, axis=1, num_splits=4)
# from CUDNN docs:
# Values 0 and 4 reference the input gate.
# Values 1 and 5 reference the forget gate.
# Values 2 and 6 reference the new memory gate.
# Values 3 and 7 reference the output gate
i, f, j, o = (
preactiv[CUDNN_MAPPING["i"]],
preactiv[CUDNN_MAPPING["f"]],
preactiv[CUDNN_MAPPING["j"]],
preactiv[CUDNN_MAPPING["o"]]
)
    # forget_bias follows the usual LSTMCell semantics: it is added to the
    # forget gate's pre-activation before the sigmoid.
    c = (tf.nn.sigmoid(f + forget_bias) * input_c +
         tf.nn.sigmoid(i) * activation(j))
    m = tf.nn.sigmoid(o) * activation(c)
    # dropout on the hidden state; a no-op when keep_prob == 1:
    m = maybe_dropout(m, keep_prob, is_training)
    return (c, m)
class Logger(object):
def __init__(self, session, writer):
self.session = session
self.writer = writer
self._placeholders = {}
summaries = tf.get_collection(SUMMARIES_ASSIGNS)
summaries_pholders = tf.get_collection(SUMMARIES_PLACEHOLDERS)
summaries_names = [name.decode("utf-8")
for name in tf.get_collection(SUMMARIES_NAMES)]
for summary, pholder, name in zip(summaries, summaries_pholders, summaries_names):
self._placeholders[name] = (pholder, summary)
def log(self, name, value, step):
if name not in self._placeholders:
pholder = tf.placeholder(tf.float32, [], name=name)
summary = tf.summary.scalar(name, pholder)
tf.add_to_collection(SUMMARIES_ASSIGNS, summary)
tf.add_to_collection(SUMMARIES_NAMES, name)
tf.add_to_collection(SUMMARIES_PLACEHOLDERS, pholder)
self._placeholders[name] = (pholder, summary)
pholder, summary = self._placeholders[name]
res = self.session.run(summary, {pholder:value})
self.writer.add_summary(res, step)
class ParametrizedLSTMCell(RNNCell):
def __init__(self, weights, biases, hidden_size):
self._weights = weights
self._biases = biases
self.hidden_size = hidden_size
@property
def state_size(self):
return (self.hidden_size, self.hidden_size)
@property
def output_size(self):
return self.hidden_size
def __call__(self, inputs, state, scope=None):
input_h, input_c = state
c, m = lstm_activation(inputs,
input_h=input_h,
input_c=input_c,
b=self._biases,
W=self._weights,
activation=tf.nn.tanh)
return m, (m, c)
class LSTMCell(TFLSTMCell):
def __init__(self,
num_units,
keep_prob=1.0,
is_training=False):
self._is_training = is_training
self._keep_prob = keep_prob
TFLSTMCell.__init__(
self,
num_units=num_units,
state_is_tuple=True
)
def __call__(self, inputs, state, scope=None):
(c_prev, m_prev) = state
dtype = inputs.dtype
input_size = inputs.get_shape().with_rank(2)[1]
if input_size.value is None:
raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
with tf.variable_scope(scope or type(self).__name__,
initializer=self._initializer): # "LSTMCell"
concat_w = _get_concat_variable(
"W", [input_size.value + self._num_units, 4 * self._num_units],
dtype, 1)
b = tf.get_variable(
"B", shape=[4 * self._num_units],
initializer=tf.zeros_initializer(), dtype=dtype)
c, m = lstm_activation(inputs,
input_c=c_prev,
input_h=m_prev,
W=concat_w,
b=b,
activation=self._activation,
keep_prob=self._keep_prob,
is_training=self._is_training,
forget_bias=self._forget_bias)
return m, LSTMStateTuple(c, m)
def cudnn_lstm_parameter_size(input_size, hidden_size):
"""Number of parameters in a single CuDNN LSTM cell."""
biases = 8 * hidden_size
weights = 4 * (hidden_size * input_size) + 4 * (hidden_size * hidden_size)
return biases + weights
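# Worked example of the count above (assumed sizes, never called): with
# input_size=128 and hidden_size=256 a single-direction cell needs
# 8 * 256 = 2,048 biases plus 4 * (256 * 128) + 4 * (256 * 256) = 393,216
# weights.
def _example_cudnn_parameter_size():
    assert cudnn_lstm_parameter_size(128, 256) == 2048 + 393216  # 395,264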
def direction_to_num_directions(direction):
if direction == "unidirectional":
return 1
elif direction == "bidirectional":
return 2
else:
raise ValueError("Unknown direction: %r." % (direction,))
def estimate_cudnn_parameter_size(num_layers,
input_size,
hidden_size,
input_mode,
direction):
"""
Compute the number of parameters needed to
construct a stack of LSTMs. Assumes the hidden states
of bidirectional LSTMs are concatenated before being
sent to the next layer up.
"""
num_directions = direction_to_num_directions(direction)
params = 0
isize = input_size
for layer in range(num_layers):
for direction in range(num_directions):
params += cudnn_lstm_parameter_size(
isize, hidden_size
)
isize = hidden_size * num_directions
return params
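# Sketch of the stacking rule above (hypothetical sizes, never called): in a
# bidirectional stack, layer 0 consumes `input_size` features while every
# deeper layer consumes the concatenated 2 * hidden_size outputs of the layer
# below, and each layer holds one parameter block per direction.
def _example_estimate_cudnn_parameter_size():
    expected = 2 * (cudnn_lstm_parameter_size(128, 256) +
                    cudnn_lstm_parameter_size(2 * 256, 256))
    assert estimate_cudnn_parameter_size(num_layers=2,
                                         input_size=128,
                                         hidden_size=256,
                                         input_mode="linear_input",
                                         direction="bidirectional") == expected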
# cudnn conversion to dynamic RNN:
CUDNN_LAYER_WEIGHT_ORDER = [
"x", "x", "x", "x", "h", "h", "h", "h"
]
CUDNN_LAYER_BIAS_ORDER = [
"bx", "bx", "bx", "bx", "bh", "bh", "bh", "bh"
]
CUDNN_TRANSPOSED = True
CUDNN_MAPPING = {"i": 0, "f": 1, "j": 2, "o": 3}
def consume_biases_direction(params, old_offset, hidden_size, isize):
offset = old_offset
layer_biases_x = []
layer_biases_h = []
for piece in CUDNN_LAYER_BIAS_ORDER:
if piece == "bx":
layer_biases_x.append(
params[offset:offset + hidden_size]
)
offset += hidden_size
elif piece == "bh":
layer_biases_h.append(
params[offset:offset + hidden_size]
)
offset += hidden_size
else:
raise ValueError("Unknown cudnn piece %r." % (piece,))
b = concat(layer_biases_x, axis=0) + concat(layer_biases_h, axis=0)
return b, offset
def consume_weights_direction(params, old_offset, hidden_size, isize):
offset = old_offset
layer_weights_x = []
layer_weights_h = []
for piece in CUDNN_LAYER_WEIGHT_ORDER:
if piece == "x":
layer_weights_x.append(
tf.reshape(
params[offset:offset + hidden_size * isize],
[hidden_size, isize] if CUDNN_TRANSPOSED else [isize, hidden_size]
)
)
offset += hidden_size * isize
elif piece == "h":
layer_weights_h.append(
tf.reshape(
params[offset:offset + hidden_size * hidden_size],
[hidden_size, hidden_size]
)
)
offset += hidden_size * hidden_size
else:
raise ValueError("Unknown cudnn piece %r." % (piece,))
if CUDNN_TRANSPOSED:
W_T = concat([concat(layer_weights_x, axis=0), concat(layer_weights_h, axis=0)], axis=1)
W = tf.transpose(W_T)
else:
W = concat([concat(layer_weights_x, axis=1), concat(layer_weights_h, axis=1)], axis=0)
return W, offset
def decompose_layer_params(params, num_layers,
hidden_size, cell_input_size,
input_mode, direction, create_fn):
"""
This operation converts the opaque cudnn params into a set of
usable weight matrices.
Args:
params : Tensor, opaque cudnn params tensor
num_layers : int, number of stacked LSTMs.
hidden_size : int, number of neurons in each LSTM.
cell_input_size : int, input size for the LSTMs.
        input_mode: whether a pre-projection was used or not. Currently only
            'linear_input' is supported (i.e. CuDNN does its own input
            projection internally).
direction : str, 'unidirectional' or 'bidirectional'.
create_fn: callback for weight creation. Receives parameter slice (op),
layer (int), direction (0 = fwd, 1 = bwd),
parameter_index (0 = W, 1 = b).
Returns:
weights : list of lists of Tensors in the format:
first list is indexed layers,
inner list is indexed by direction (fwd, bwd),
tensors in the inner list are (Weights, biases)
"""
if input_mode != "linear_input":
raise ValueError("Only input_mode == linear_input supported for now.")
num_directions = direction_to_num_directions(direction)
offset = 0
all_weights = [[[] for j in range(num_directions)]
for i in range(num_layers)]
isize = cell_input_size
with tf.variable_scope("DecomposeCudnnParams"):
for layer in range(num_layers):
with tf.variable_scope("Layer{}".format(layer)):
for direction in range(num_directions):
with tf.variable_scope("fwd" if direction == 0 else "bwd"):
with tf.variable_scope("weights"):
W, offset = consume_weights_direction(
params,
old_offset=offset,
hidden_size=hidden_size,
isize=isize)
all_weights[layer][direction].append(
create_fn(W, layer, direction, 0))
isize = hidden_size * num_directions
isize = cell_input_size
for layer in range(num_layers):
with tf.variable_scope("Layer{}".format(layer)):
for direction in range(num_directions):
with tf.variable_scope("fwd" if direction == 0 else "bwd"):
with tf.variable_scope("biases"):
b, offset = consume_biases_direction(
params,
old_offset=offset,
hidden_size=hidden_size,
isize=isize)
all_weights[layer][direction].append(
create_fn(b, layer, direction, 1))
isize = hidden_size * num_directions
return all_weights
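# Shape note for the structure returned above: weights[layer][direction] is
# [W, b], where W has shape [isize + hidden_size, 4 * hidden_size] (ready for
# the tf.nn.xw_plus_b call in lstm_activation) and b has shape
# [4 * hidden_size]; isize is cell_input_size for layer 0 and
# num_directions * hidden_size for every layer above it.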
def create_decomposed_variable(param, lidx, didx, pidx):
with tf.device("cpu"):
return tf.get_variable("w" if pidx == 0 else "b",
shape=param.get_shape().as_list(),
dtype=param.dtype,
trainable=False,
collections=[tf.GraphKeys.GLOBAL_VARIABLES,
"excluded_variables"])
def cpu_cudnn_params(params, num_layers, hidden_size, cell_input_size, input_mode,
direction):
"""
This operation converts the opaque cudnn params into a set of
usable weight matrices, and caches the conversion.
Args:
params : Tensor, opaque cudnn params tensor
num_layers : int, number of stacked LSTMs.
hidden_size : int, number of neurons in each LSTM.
cell_input_size : int, input size for the LSTMs.
        input_mode: whether a pre-projection was used or not. Currently only
            'linear_input' is supported (i.e. CuDNN does its own input
            projection internally).
        direction : str, 'unidirectional' or 'bidirectional'.
Returns:
weights : list of lists of Tensors in the format:
first list is indexed layers,
inner list is indexed by direction (fwd, bwd),
tensors in the inner list are (Weights, biases)
"""
# create a boolean status variable that checks whether the
# weights have been converted to cpu format:
with tf.device("cpu"):
cpu_conversion_status = tf.get_variable(
name="CudnnConversionStatus", dtype=tf.float32,
initializer=tf.zeros_initializer(), shape=[],
trainable=False,
collections=[tf.GraphKeys.GLOBAL_VARIABLES])
# create a fresh copy of the weights (not trainable)
reshaped = decompose_layer_params(
params,
num_layers=num_layers,
hidden_size=hidden_size,
cell_input_size=cell_input_size,
input_mode=input_mode,
direction=direction,
create_fn=create_decomposed_variable)
def cpu_convert():
all_assigns = decompose_layer_params(
params,
num_layers=num_layers,
hidden_size=hidden_size,
cell_input_size=cell_input_size,
input_mode=input_mode,
direction=direction,
create_fn=lambda p, lidx, didx, pidx: tf.assign(reshaped[lidx][didx][pidx], p))
all_assigns = [assign for layer_assign in all_assigns
for dir_assign in layer_assign
for assign in dir_assign]
all_assigns.append(tf.assign(cpu_conversion_status, tf.constant(1.0, dtype=tf.float32)))
all_assigns.append(tf.Print(cpu_conversion_status, [0],
message="Converted cudnn weights to CPU format. "))
with tf.control_dependencies(all_assigns):
ret = tf.identity(cpu_conversion_status)
return ret
# cache the reshaping/concatenating
ensure_conversion = tf.cond(tf.greater(cpu_conversion_status, 0),
lambda: cpu_conversion_status,
cpu_convert)
# if weights are already reshaped, go ahead:
with tf.control_dependencies([ensure_conversion]):
# wrap with identity to ensure there is a dependency between assignment
# and using the weights:
all_params = [[[tf.identity(p) for p in dir_param]
for dir_param in layer_param]
for layer_param in reshaped]
return all_params
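# Note on the caching above: CudnnConversionStatus starts at 0, so the first
# session.run of anything that depends on `ensure_conversion` executes
# cpu_convert(), copying slices of the opaque cudnn buffer into the per-layer
# variables and flipping the status to 1; subsequent runs take the cheap
# tf.cond branch and reuse the already-assigned variables.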
class CpuCudnnLSTM(object):
def __init__(self, num_layers, hidden_size,
cell_input_size, input_mode, direction):
self.num_layers = num_layers
self.hidden_size = hidden_size
self.cell_input_size = cell_input_size
self.input_mode = input_mode
self.direction = direction
def __call__(self,
inputs,
input_h,
input_c,
params,
is_training=True):
layer_params = cpu_cudnn_params(params,
num_layers=self.num_layers,
hidden_size=self.hidden_size,
cell_input_size=self.cell_input_size,
input_mode=self.input_mode,
direction=self.direction)
REVERSED = 1
layer_inputs = inputs
cell_idx = 0
for layer_param in layer_params:
hidden_fwd_bwd = []
final_output_c = []
final_output_h = []
for direction, (W, b) in enumerate(layer_param):
if direction == REVERSED:
layer_inputs = reverse(layer_inputs, axis=0)
hiddens, (output_h, output_c) = tf.nn.dynamic_rnn(
cell=ParametrizedLSTMCell(W, b, self.hidden_size),
inputs=layer_inputs,
dtype=inputs.dtype,
time_major=True,
initial_state=(input_h[cell_idx], input_c[cell_idx]))
if direction == REVERSED:
hiddens = reverse(hiddens, axis=0)
hidden_fwd_bwd.append(hiddens)
final_output_c.append(tf.expand_dims(output_c, 0))
final_output_h.append(tf.expand_dims(output_h, 0))
cell_idx += 1
if len(hidden_fwd_bwd) > 1:
layer_inputs = concat(hidden_fwd_bwd, axis=2)
final_output_c = concat(final_output_c, axis=0)
final_output_h = concat(final_output_h, axis=0)
else:
layer_inputs = hidden_fwd_bwd[0]
final_output_c = final_output_c[0]
final_output_h = final_output_h[0]
return layer_inputs, final_output_h, final_output_c
def highway(x, activation_fn=tf.nn.relu, scope=None):
size = x.get_shape()[-1].value
with tf.variable_scope(scope or "HighwayLayer"):
activ = tf.contrib.layers.fully_connected(
x, size * 2, activation_fn=None, scope="FC"
)
transform = tf.sigmoid(activ[..., :size], name="transform_gate")
hidden = activation_fn(activ[..., size:])
carry = 1.0 - transform
return tf.add(hidden * transform, x * carry, "y")
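# Minimal sketch of the highway layer above (assumed feature size, never
# called). In formula form: y = H(x) * T(x) + x * (1 - T(x)), where the
# transform gate T(x) and candidate H(x) both come from the single
# size * 2 fully-connected projection.
def _example_highway():
    x = tf.placeholder(tf.float32, [None, None, 64], name="example_highway_in")
    # the output keeps the input's shape, so highway layers stack freely:
    return highway(x, activation_fn=tf.tanh, scope="ExampleHighway")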
def conv2d(inputs, output_dim, k_h, k_w,
stddev=0.02, scope=None,
weight_noise=0.0, is_training=True):
with tf.variable_scope(scope or "Conv2D"):
w = tf.get_variable('w', [k_h, k_w, inputs.get_shape()[-1], output_dim],
initializer=tf.truncated_normal_initializer(stddev=stddev))
if weight_noise > 0 and not isinstance(is_training, bool):
w = add_weight_noise(w, is_training=is_training, stddev=weight_noise)
return tf.nn.conv2d(inputs, w, strides=[1, 1, 1, 1], padding="VALID")
def character_convolution(inputs, feature):
inputs_2d = tf.reshape(inputs,
[tf.shape(inputs)[0] * tf.shape(inputs)[1], tf.shape(inputs)[2]]
)
inputs_3d = embedding_lookup(
inputs_2d,
dim=feature["dimension"],
# 255 different bytes (uint8)
# & start and end symbol:
size=257,
dtype=tf.float32,
mask_negative=True)
inputs_4d = tf.expand_dims(inputs_3d, 1)
feature_pools = []
for idx, conv_filter in enumerate(feature["filters"]):
width, channels = conv_filter["width"], conv_filter["channels"]
# [batch * time x 1 x word_length x embed_dim x feature_map_dim]
conv = tf.squeeze(conv2d(inputs_4d, channels, 1, width, scope="CharacterConvolution%d" % (idx,)), [1])
# remove word dimension
pool = tf.reduce_max(conv, 1)
feature_pools.append(pool)
activations = concat(feature_pools, axis=1)
channels_out = sum(conv_filter["channels"] for conv_filter in feature["filters"])
activations = tf.reshape(
tf.tanh(activations),
[tf.shape(inputs)[0], tf.shape(inputs)[1], channels_out],
name="CharacterConvolutionPooled")
for idx in range(feature["highway_layers"]):
activations = highway(activations, scope="HighwayLayer%d" % (idx,),
activation_fn=tf.tanh)
return activations
def feature_dtype(feat):
if requires_vocab(feat):
return tf.int32
elif feat["type"] in {"digit", "punctuation_count", "uppercase"}:
return tf.float32
elif requires_character_convolution(feat):
return tf.int32
else:
raise ValueError("unknown feature %r." % (feat,))
def feature_shape(feature):
if requires_vocab(feature) or feature["type"] in {'digit', 'punctuation_count', 'uppercase'}:
return [None, None]
elif requires_character_convolution(feature):
return [None, None, None]
else:
raise ValueError("unknown feature %r." % (feature,))
def build_inputs(features, objectives, fused, class_weights,
class_weights_clipval):
input_placeholders = []
labels = []
labels_mask = []
labels_class_weights = []
max_output_vocab = max(len(obj["vocab"]) for obj in objectives)
with tf.variable_scope("Inputs"):
is_training = tf.placeholder(tf.bool, [], name="is_training")
tf.add_to_collection(IS_TRAINING, is_training)
for idx, feat in enumerate(features):
input_placeholder = tf.placeholder(
feature_dtype(feat), feature_shape(feat),
name="input_placeholders_%d" % (idx,)
)
input_placeholders.append(input_placeholder)
tf.add_to_collection(INPUT_PLACEHOLDERS, input_placeholder)
if fused:
label_placeholder = tf.placeholder(
tf.int32, [None, None, len(objectives)]
)
labels_mask_placeholder = tf.placeholder(
tf.bool, [None, None, len(objectives)], name="labels_mask"
)
labels.append(label_placeholder)
labels_mask.append(labels_mask_placeholder)
tf.add_to_collection(LABEL_PLACEHOLDERS, label_placeholder)
tf.add_to_collection(LABEL_MASK_PLACEHOLDERS, labels_mask_placeholder)
if class_weights:
with tf.variable_scope("FusedClassWeights"):
init_class_weights = tf.get_variable(
name="class_weights",
shape=[len(objectives) * max_output_vocab],
initializer=tf.constant_initializer(1),
dtype=tf.int64,
trainable=False)
init_class_count = tf.get_variable(
name="class_weights_denominator",
shape=[len(objectives)],
initializer=tf.constant_initializer(1),
dtype=tf.int64,
trainable=False)
def update_class_weights():
mask_as_ints = tf.cast(tf.reshape(labels_mask_placeholder, [-1, len(objectives)]), tf.int64)
updated_cls_weights = tf.scatter_add(
init_class_weights,
tf.reshape(label_placeholder + tf.reshape(tf.range(len(objectives)) * max_output_vocab, [1, 1, len(objectives)]), [-1]),
tf.reshape(mask_as_ints, [-1])
)
updated_class_count = tf.assign_add(init_class_count, tf.reduce_sum(mask_as_ints, 0))
# class weight: weight_i = total / class_i
weights = tf.clip_by_value(tf.expand_dims(updated_class_count, 1) /
tf.reshape(updated_cls_weights, [len(objectives), max_output_vocab]),
1e-6, class_weights_clipval)
return tf.cast(weights, tf.float32)
def return_class_weights():
# class weight: weight_i = total / class_i
return tf.cast(
tf.clip_by_value(tf.expand_dims(init_class_count, 1) /
tf.reshape(init_class_weights, [len(objectives), max_output_vocab]),
1e-6, class_weights_clipval), tf.float32)
labels_class_weights.append(
tf.cond(is_training,
update_class_weights,
return_class_weights))
else:
labels_class_weights.append(None)
else:
for objective in objectives:
with tf.variable_scope(objective["name"]):
label_placeholder = tf.placeholder(
tf.int32, [None, None], name="labels"
)
labels.append(label_placeholder)
if objective["type"] == "crf":
labels_mask_placeholder = tf.placeholder(
tf.bool, [None], name="labels_mask"
)
labels_class_weights.append(None)
elif objective["type"] == "softmax":
labels_mask_placeholder = tf.placeholder(
tf.bool, [None, None], name="labels_mask"
)
if class_weights:
init_class_weights = tf.get_variable(
name="class_weights",
shape=len(objective["vocab"]),
initializer=tf.constant_initializer(1),
dtype=tf.int64,
trainable=False)
init_class_count = tf.get_variable(
name="class_weights_denominator",
shape=[],
initializer=tf.constant_initializer(1),
dtype=tf.int64,
trainable=False)
def update_class_weights():
mask_as_ints = tf.cast(tf.reshape(labels_mask_placeholder, [-1]), tf.int64)
updated_cls_weights = tf.scatter_add(
init_class_weights,
tf.reshape(label_placeholder, [-1]),
mask_as_ints
)
updated_class_count = tf.assign_add(init_class_count, tf.reduce_sum(mask_as_ints))
# class weight: weight_i = total / class_i
weights = tf.clip_by_value(updated_class_count / updated_cls_weights,
1e-6, class_weights_clipval)
return tf.cast(weights, tf.float32)
def return_class_weights():
# class weight: weight_i = total / class_i
return tf.cast(
tf.clip_by_value(init_class_count / init_class_weights,
1e-6, class_weights_clipval), tf.float32)
labels_class_weights.append(
tf.cond(is_training, update_class_weights, return_class_weights)
)
else:
labels_class_weights.append(None)
else:
raise ValueError(
"unknown objective type %r." % (
objective["type"]
)
)
labels_mask.append(labels_mask_placeholder)
tf.add_to_collection(LABEL_PLACEHOLDERS, label_placeholder)
tf.add_to_collection(LABEL_MASK_PLACEHOLDERS, labels_mask_placeholder)
sequence_lengths = tf.placeholder(tf.int32, [None],
name="sequence_lengths")
tf.add_to_collection(SEQUENCE_LENGTHS, sequence_lengths)
return (input_placeholders,
labels,
labels_mask,
labels_class_weights,
sequence_lengths,
is_training)
def add_weight_noise(x, is_training, stddev):
return tf.cond(is_training,
lambda: x + tf.random_normal(
shape=tf.shape(x), stddev=stddev),
lambda: x)
def build_recurrent(inputs, cudnn, faux_cudnn, hidden_sizes, is_training,
keep_prob, weight_noise):
dtype = tf.float32
if cudnn:
if len(hidden_sizes) == 0:
raise ValueError("hidden_sizes must be a list of length > 1.")
hidden_size = hidden_sizes[0]
if any(hidden_size != hsize for hsize in hidden_sizes):
raise ValueError("cudnn RNN requires all hidden units "
"to be the same size (got %r)" % (
hidden_sizes,
))
num_layers = len(hidden_sizes)
cell_input_size = inputs.get_shape()[-1].value
est_size = estimate_cudnn_parameter_size(
num_layers=num_layers,
hidden_size=hidden_size,
input_size=cell_input_size,
input_mode="linear_input",
direction="bidirectional"
)
        # all of the stacked LSTM weights and biases live in a single flat
        # parameter buffer, as expected by the cudnn kernels:
cudnn_params = tf.get_variable("RNNParams",
shape=[est_size],
dtype=tf.float32,
initializer=tf.contrib.layers.variance_scaling_initializer())
if weight_noise > 0:
cudnn_params = add_weight_noise(cudnn_params,
stddev=weight_noise, is_training=is_training)
if faux_cudnn:
cudnn_cell = CpuCudnnLSTM(num_layers,
hidden_size,
cell_input_size,
input_mode="linear_input",
direction="bidirectional")
else:
cpu_cudnn_params(cudnn_params,
num_layers=num_layers,
hidden_size=hidden_size,
cell_input_size=cell_input_size,
input_mode="linear_input",
direction="bidirectional")
cudnn_cell = CudnnLSTM(num_layers,
hidden_size,
cell_input_size,
input_mode="linear_input",
direction="bidirectional")
init_state = tf.fill(
(2 * num_layers, tf.shape(inputs)[1], hidden_size),
tf.constant(np.float32(0.0)))
hiddens, output_h, output_c = cudnn_cell(
inputs,
input_h=init_state,
input_c=init_state,
params=cudnn_params,
is_training=True)
hiddens = maybe_dropout(
hiddens,
keep_prob,
is_training)
else:
cell = MultiRNNCell(
[LSTMCell(hsize, is_training=is_training, keep_prob=keep_prob)
for hsize in hidden_sizes]
)
hiddens, _ = bidirectional_dynamic_rnn(
cell,
inputs,
time_major=True,
dtype=dtype,
swap_memory=True
)
return hiddens
def build_embed(inputs, features, index2words, keep_prob, is_training):
embeddings = []
for idx, (values, feature, index2word) in enumerate(zip(inputs, features, index2words)):
if requires_vocab(feature):
with tf.variable_scope("embedding_%d" % (idx,)):
embedding = embedding_lookup(
values,
dim=feature["dimension"],
size=len(index2word),
dtype=tf.float32,
mask_negative=True
)
embeddings.append(embedding)
elif requires_character_convolution(feature):
embeddings.append(
character_convolution(values, feature)
)
else:
embeddings.append(tf.expand_dims(values, 2))
return maybe_dropout(concat(embeddings, axis=2), keep_prob, is_training)
def crf_metrics(unary_scores, labels, transition_params, sequence_lengths,
mask):
"""
Computes CRF output metrics.
Receives:
unary_scores : batch-major order
labels : batch-major order
transition_params : nclasses x nclasses matrix.
sequence_lengths : length of each time-sequence
mask : batch-major example mask
Returns:
token_correct,
token_correct_total,
sentence_correct,
sentence_correct_total
"""
classes = unary_scores.get_shape()[-1].value
decoded, scores = viterbi_decode(unary_scores,
transition_params,
sequence_lengths)
tf.add_to_collection(UNARY_SCORES, unary_scores)
tf.add_to_collection(DECODED, decoded)
tf.add_to_collection(DECODED_SCORES, scores)
equals_label = tf.equal(labels, decoded)
token_correct = tf.reduce_sum(
tf.cast(
tf.logical_and(equals_label, mask),
tf.int32
)
)
token_correct_total = tf.reduce_sum(tf.cast(mask, tf.int32))
tf.add_to_collection(TOKEN_CORRECT, token_correct)
tf.add_to_collection(TOKEN_CORRECT_TOTAL, token_correct_total)
sentence_correct, _ = compute_sentence_correct(equals_label, mask)
sentence_correct_total = tf.reduce_sum(tf.cast(mask[:, 0], tf.int32))
tf.add_to_collection(SENTENCE_CORRECT, sentence_correct)
tf.add_to_collection(SENTENCE_CORRECT_TOTAL, sentence_correct_total)
build_true_false_positives(decoded, mask, labels,
classes, equals_label)
return (token_correct, token_correct_total,
sentence_correct, sentence_correct_total)
def build_true_false_positives(decoded, mask_batch_major, labels_batch_major,
classes, equals_label):
masked_equals_label = tf.logical_and(equals_label, mask_batch_major)
# now for each class compute tp, fp, fn
# [nclasses x batch x time]
masked_per_class = tf.logical_and(
tf.equal(labels_batch_major[None, :, :], tf.range(classes)[:, None, None]),
mask_batch_major)
# correct, and on label
correct = tf.reduce_sum(tf.cast(tf.logical_and(masked_per_class, equals_label[None, :, :]), tf.int32),
axis=[1, 2])
# predicted a particular class
guessed = tf.reduce_sum(tf.cast(tf.logical_and(tf.equal(decoded[None, :, :], tf.range(classes)[:, None, None]), mask_batch_major), tf.int32),
axis=[1, 2])
total = tf.reduce_sum(tf.cast(masked_per_class, tf.int32), axis=[1, 2])
tp, fp, fn = correct, guessed - correct, total - correct
tf.add_to_collection(TRUE_POSITIVES, tp)
tf.add_to_collection(FALSE_POSITIVES, fp)
tf.add_to_collection(FALSE_NEGATIVES, fn)
def softmax_metrics(unary_scores, labels, mask):
"""
    Compute softmax output statistics: correct counts and totals, both
    per-token and per-sentence.
    Receives:
        unary_scores : time-major
        labels : time-major
        mask : time-major
Returns:
token_correct,
token_correct_total,
sentence_correct,
sentence_correct_total
"""
classes = unary_scores.get_shape()[-1].value
unary_scores_batch_major = tf.transpose(unary_scores, [1, 0, 2])
labels_batch_major = tf.transpose(labels, [1, 0])
mask_batch_major = tf.transpose(mask, [1, 0])
decoded = tf.cast(tf.argmax(unary_scores_batch_major, 2), labels.dtype)
unary_probs_batch_major = tf.nn.softmax(unary_scores_batch_major)
scores = tf.reduce_max(unary_probs_batch_major, 2)
tf.add_to_collection(UNARY_SCORES, unary_probs_batch_major)
tf.add_to_collection(DECODED, decoded)
tf.add_to_collection(DECODED_SCORES, scores)
equals_label = tf.equal(decoded, labels_batch_major)
token_correct = tf.reduce_sum(
tf.cast(
tf.logical_and(
equals_label,
mask_batch_major
),
tf.int32
)
)
token_correct_total = tf.reduce_sum(tf.cast(mask, tf.int32))
tf.add_to_collection(TOKEN_CORRECT, token_correct)
tf.add_to_collection(TOKEN_CORRECT_TOTAL, token_correct_total)
sentence_correct, sentence_correct_total = compute_sentence_correct(
equals_label, mask_batch_major
)
tf.add_to_collection(SENTENCE_CORRECT, sentence_correct)
tf.add_to_collection(SENTENCE_CORRECT_TOTAL, sentence_correct_total)
build_true_false_positives(decoded, mask_batch_major, labels_batch_major,
classes, equals_label)
return (token_correct, token_correct_total,
sentence_correct, sentence_correct_total)
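# Note: crf_metrics and softmax_metrics publish their outputs through the
# graph collections declared near the top of this file (UNARY_SCORES, DECODED,
# DECODED_SCORES, TOKEN_CORRECT, ...). SequenceModel.recover_graph_variables()
# reads those same collections back, which is how a model restored from a
# metagraph finds its prediction and metric tensors without rebuilding them.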
def add_objective_names_types(objectives):
for objective in objectives:
with tf.variable_scope(objective["name"]):
# store objective names in graph:
tf.add_to_collection(OBJECTIVE_NAMES,
tf.constant(objective["name"], name="objective_name")
)
tf.add_to_collection(OBJECTIVE_TYPES,
tf.constant(objective["type"], name="objective_type")
)
def build_loss(inputs, objectives, labels, labels_mask,
labels_class_weights, fused, sequence_lengths,
class_weights_normalize):
"""
Compute loss function given the objectives.
Assumes inputs are of the form [time, batch, features].
Arguments:
----------
        inputs : tf.Tensor
        objectives : list<dict>, objective specs
        labels : list<tf.Tensor>
        labels_mask : list<tf.Tensor>
        labels_class_weights : list<tf.Tensor>
        fused : bool, score all objectives with a single fully-connected layer.
        sequence_lengths : tf.Tensor
        class_weights_normalize : bool, normalize the weighted loss by the
            weighted (rather than raw) number of predictions.
Returns:
loss : tf.Tensor (scalar)
"""
losses = []
negative_log_likelihoods = []
sentence_corrects = []
sentence_corrects_total = []
token_corrects = []
token_corrects_total = []
max_output_vocab = max(len(obj["vocab"]) for obj in objectives)
total_output_size = len(objectives) * max_output_vocab
add_objective_names_types(objectives)
if fused:
with tf.variable_scope("FusedOutputs"):
objective_labels = labels[0]
mask = labels_mask[0]
objective_class_weights = labels_class_weights[0]
# perform all classifications at once:
unary_scores = tf.contrib.layers.fully_connected(
inputs, total_output_size,
activation_fn=None
)
unary_scores = tf.reshape(unary_scores,
[tf.shape(unary_scores)[0],
tf.shape(unary_scores)[1],
len(objectives),
max_output_vocab])
negative_log_likelihood = sparse_softmax_cross_entropy_with_logits(
logits=unary_scores,
labels=objective_labels
)
labels_mask_casted = tf.cast(mask, negative_log_likelihood.dtype)
masked_negative_log_likelihood = negative_log_likelihood * labels_mask_casted
if objective_class_weights is not None:
class_weights_mask = tf.gather(
tf.reshape(objective_class_weights, [-1]),
objective_labels +
tf.reshape(tf.range(len(objectives)) * max_output_vocab, [1, 1, len(objectives)]))
if class_weights_normalize:
masked_weighed_negative_log_likelihood_sum = masked_negative_log_likelihood * class_weights_mask
num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted * class_weights_mask), 1e-6)
normed_loss = masked_weighed_negative_log_likelihood_sum / (num_predictions / len(objectives))
else:
masked_weighed_negative_log_likelihood_sum = masked_negative_log_likelihood * class_weights_mask
num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted), 1e-6)
normed_loss = masked_weighed_negative_log_likelihood_sum / (num_predictions / len(objectives))
else:
masked_weighed_negative_log_likelihood_sum = masked_negative_log_likelihood
num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted), 1e-6)
normed_loss = masked_weighed_negative_log_likelihood_sum / (num_predictions / len(objectives))
masked_negative_log_likelihood_sum = tf.reduce_sum(masked_negative_log_likelihood)
losses.append(normed_loss)
negative_log_likelihoods.append(masked_negative_log_likelihood_sum)
for idx, objective in enumerate(objectives):
with tf.variable_scope(objective["name"]):
(token_correct,
token_correct_total,
sentence_correct,
sentence_correct_total) = softmax_metrics(unary_scores[:, :, idx, :len(objective["vocab"])],
labels=objective_labels[:, :, idx],
mask=mask[:, :, idx])
token_corrects.append(token_correct)
token_corrects_total.append(token_correct_total)
sentence_corrects.append(sentence_correct)
sentence_corrects_total.append(sentence_correct_total)
else:
for objective, objective_labels, mask, objective_class_weights in zip(objectives, labels, labels_mask, labels_class_weights):
with tf.variable_scope(objective["name"]):
if objective["type"] == "crf":
unary_scores = tf.contrib.layers.fully_connected(
inputs,
len(objective["vocab"]),
activation_fn=None
)
unary_scores_batch_major = tf.transpose(unary_scores, [1, 0, 2])
labels_batch_major = tf.transpose(objective_labels, [1, 0])
padded_unary_scores_batch_major = tf.cond(tf.greater(tf.shape(unary_scores_batch_major)[1], 1),
lambda: unary_scores_batch_major,
lambda: tf.pad(unary_scores_batch_major, [[0, 0], [0, 1], [0, 0]]))
padded_labels_batch_major = tf.cond(tf.greater(tf.shape(labels_batch_major)[1], 1),
lambda: labels_batch_major,
lambda: tf.pad(labels_batch_major, [[0, 0], [0, 1]]))
log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
padded_unary_scores_batch_major, padded_labels_batch_major, sequence_lengths
)
labels_mask_casted = tf.cast(mask, log_likelihood.dtype)
masked_log_likelihood = (
log_likelihood * labels_mask_casted
)
masked_negative_log_likelihood_sum = -tf.reduce_sum(masked_log_likelihood)
num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted), 1e-6)
losses.append(masked_negative_log_likelihood_sum / num_predictions)
negative_log_likelihoods.append(masked_negative_log_likelihood_sum)
sequence_mask = tf.logical_and(
tf.sequence_mask(sequence_lengths),
                # broadcast the per-example mask across the time dimension:
tf.expand_dims(mask, 1)
)
(token_correct,
token_correct_total,
sentence_correct,
sentence_correct_total) = crf_metrics(unary_scores_batch_major,
labels=labels_batch_major,
mask=sequence_mask,
transition_params=transition_params,
sequence_lengths=sequence_lengths)
elif objective["type"] == 'softmax':
unary_scores = tf.contrib.layers.fully_connected(
inputs,
len(objective["vocab"]),
activation_fn=None
)
negative_log_likelihood = sparse_softmax_cross_entropy_with_logits(
logits=unary_scores,
labels=objective_labels
)
labels_mask_casted = tf.cast(mask, negative_log_likelihood.dtype)
masked_negative_log_likelihood = (
negative_log_likelihood * labels_mask_casted
)
if objective_class_weights is not None:
class_weights_mask = tf.gather(objective_class_weights, objective_labels)
masked_weighed_negative_log_likelihood_sum = masked_negative_log_likelihood * class_weights_mask
masked_negative_log_likelihood_sum = tf.reduce_sum(masked_negative_log_likelihood)
if class_weights_normalize:
num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted * class_weights_mask), 1e-6)
normed_loss = masked_weighed_negative_log_likelihood_sum / num_predictions
else:
num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted), 1e-6)
normed_loss = masked_weighed_negative_log_likelihood_sum / num_predictions
else:
masked_weighed_negative_log_likelihood_sum = masked_negative_log_likelihood
masked_negative_log_likelihood_sum = tf.reduce_sum(masked_negative_log_likelihood)
num_predictions = tf.maximum(tf.reduce_sum(labels_mask_casted), 1e-6)
normed_loss = masked_weighed_negative_log_likelihood_sum / num_predictions
losses.append(normed_loss)
negative_log_likelihoods.append(masked_negative_log_likelihood_sum)
(token_correct,
token_correct_total,
sentence_correct,
sentence_correct_total) = softmax_metrics(unary_scores,
labels=objective_labels,
mask=mask)
else:
raise ValueError(
"unknown objective type %r" % (objective["type"],)
)
token_corrects.append(token_correct)
token_corrects_total.append(token_correct_total)
sentence_corrects.append(sentence_correct)
sentence_corrects_total.append(sentence_correct_total)
# aggregate metrics for all objectives:
total_loss = tf.reduce_sum(sum_list(losses))
tf.summary.scalar("BatchLoss", total_loss)
neg_log_likelihood_total = sum_list(negative_log_likelihoods)
tf.summary.scalar("BatchNLL", neg_log_likelihood_total)
tf.add_to_collection(NLL, neg_log_likelihood_total)
tf.add_to_collection(NLL_TOTAL, tf.shape(inputs)[1])
sentence_corrects_total = sum_list(sentence_corrects_total)
sentence_corrects = sum_list(sentence_corrects)
tf.add_to_collection(SENTENCE_CORRECT_ALL, sentence_corrects)
tf.add_to_collection(SENTENCE_CORRECT_ALL_TOTAL, sentence_corrects_total)
token_corrects_total = sum_list(token_corrects_total)
token_corrects = sum_list(token_corrects)
tf.add_to_collection(TOKEN_CORRECT_ALL, token_corrects)
tf.add_to_collection(TOKEN_CORRECT_ALL_TOTAL, token_corrects_total)
return total_loss
def build_model(name,
trainable,
features,
feature_index2words,
objectives,
keep_prob,
input_keep_prob,
hidden_sizes,
freeze_rate,
freeze_rate_anneal,
solver,
cudnn,
fused,
faux_cudnn,
class_weights,
class_weights_normalize,
class_weights_clipval,
lr,
weight_noise,
anneal_rate,
clip_norm):
# mixed output fusing is currently unsupported
if fused and any(obj["type"] != "softmax" for obj in objectives):
raise ValueError("cannot fuse outputs and use non-softmax output.")
    # clear all existing collections to ensure every new collection
    # is created fresh:
graph = tf.get_default_graph()
for collection_name in graph.get_all_collection_keys():
graph.clear_collection(collection_name)
# build a model under the model's name to prevent collisions
# when multiple models are restored simultaneously
with tf.variable_scope(name):
global_step = tf.Variable(0, trainable=False, name="global_step")
tf.add_to_collection(GLOBAL_STEP, global_step)
# model placeholders:
(input_placeholders,
labels,
labels_mask,
labels_class_weights,
sequence_lengths,
is_training) = build_inputs(features,
objectives=objectives,
fused=fused,
class_weights=class_weights,
class_weights_clipval=class_weights_clipval)
embed = build_embed(input_placeholders,
features=features,
index2words=feature_index2words,
is_training=is_training,
keep_prob=input_keep_prob)
hiddens = embed
if len(hidden_sizes) > 0:
hiddens = build_recurrent(hiddens,
cudnn=cudnn,
faux_cudnn=faux_cudnn,
hidden_sizes=hidden_sizes,
keep_prob=keep_prob,
weight_noise=weight_noise,
is_training=is_training)
loss = build_loss(hiddens,
objectives=objectives,
fused=fused,
labels=labels,
labels_mask=labels_mask,
labels_class_weights=labels_class_weights,
class_weights_normalize=class_weights_normalize,
sequence_lengths=sequence_lengths)
if trainable:
learning_rate = tf.train.exponential_decay(lr, global_step,
33000, anneal_rate, staircase=True)
if solver == "adam":
optimizer = LazyAdamOptimizer(learning_rate)
elif solver == "sgd":
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
else:
raise ValueError("Unknown solver %r." % (solver))
grad_vars = optimizer.compute_gradients(loss)
if clip_norm > 0:
grad_vars = [(grad if isinstance(grad, tf.IndexedSlices) else tf.clip_by_norm(grad, clip_norm), var) for grad, var in grad_vars]
train_op = optimizer.apply_gradients(grad_vars, global_step=global_step)
else:
train_op = tf.no_op()
tf.add_to_collection(TRAIN_OP, train_op)
tf.add_to_collection(TRAIN_SUMMARIES, tf.summary.merge_all())
def restore_session(session,
path,
replace_to=None,
replace_from=None,
verbose=False,
use_metagraph=True,
only_features=False):
"""
    Restore all the variables of the current tensorflow session from a
    checkpoint previously stored under `path` (e.g. by `save_session`).
    Arguments:
        session : tf.Session
        path : str, place containing the session data to restore
        replace_to, replace_from : str, optionally map each variable whose
            current name contains `replace_to` onto the checkpoint name with
            `replace_from` substituted back in.
        verbose : bool, print status messages.
        use_metagraph : bool, restore by re-creating saved metagraph.
        only_features : bool, restore only embedding, RNN, character
            convolution and highway-layer weights.
Returns:
bool : success or failure of the restoration
"""
makedirs(path, exist_ok=True)
if not path.endswith("/"):
path = path + "/"
checkpoint = tf.train.get_checkpoint_state(path)
if verbose:
print("Looking for saved session under %r" % (path,), flush=True)
if checkpoint is None or checkpoint.model_checkpoint_path is None:
if verbose:
print("No saved session found", flush=True)
return False
fname = basename(checkpoint.model_checkpoint_path)
if verbose:
print("Restoring saved session from %r" % (join(path, fname),), flush=True)
if use_metagraph:
param_saver = tf.train.import_meta_graph(join(path, fname + ".meta"),
clear_devices=True)
missing_vars = []
else:
if only_features:
to_restore = {}
whitelist = ["embedding", "/RNN/", "/RNNParams", "CharacterConvolution", "HighwayLayer"]
for var in tf.global_variables():
if any(keyword in var.name for keyword in whitelist):
to_restore[var.name[:-2]] = var
param_saver = tf.train.Saver(to_restore)
else:
if replace_to is not None and replace_from is not None:
to_restore = {}
for var in tf.global_variables():
var_name = var.name[:var.name.rfind(":")]
old_name = var_name.replace(replace_to, replace_from)
to_restore[old_name] = var
param_saver = tf.train.Saver(to_restore)
missing_vars = []
else:
reader = tf.train.NewCheckpointReader(join(path, fname))
saved_shapes = reader.get_variable_to_shape_map()
found_vars = [var for var in tf.global_variables()
if var.name.split(':')[0] in saved_shapes]
missing_vars = [var for var in tf.global_variables()
if var.name.split(':')[0] not in saved_shapes]
param_saver = tf.train.Saver(found_vars)
param_saver.restore(session, join(path, fname))
session.run([var.initializer for var in missing_vars])
return True
def bidirectional_dynamic_rnn(cell, inputs, dtype, time_major=True, swap_memory=False):
with tf.variable_scope("forward"):
out_fwd, final_fwd = tf.nn.dynamic_rnn(
cell,
inputs,
time_major=time_major,
dtype=dtype,
swap_memory=swap_memory
)
if time_major:
reverse_axis = 0
else:
reverse_axis = 1
with tf.variable_scope("backward"):
out_bwd, final_bwd = tf.nn.dynamic_rnn(
cell,
reverse(inputs, axis=reverse_axis),
time_major=time_major,
dtype=dtype,
swap_memory=swap_memory
)
out_bwd = reverse(out_bwd, axis=reverse_axis)
return concat([out_fwd, out_bwd], axis=2), (final_fwd, final_bwd)
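# Minimal usage sketch (assumed sizes, never called): inputs are time-major
# [time, batch, features]; the backward pass runs on the reversed sequence and
# is re-reversed before the feature-axis concat, so the result has
# 2 * num_units features per timestep.
def _example_bidirectional_dynamic_rnn():
    inputs = tf.placeholder(tf.float32, [None, None, 64],
                            name="example_birnn_in")
    cell = LSTMCell(128)  # defaults: keep_prob=1.0, so no dropout is applied
    outputs, _ = bidirectional_dynamic_rnn(cell, inputs, dtype=tf.float32,
                                           time_major=True)
    return outputs  # [time, batch, 256]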
def get_embedding_lookup(size, dim, dtype, reuse=None, trainable=True):
with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
W = tf.get_variable(
name="embedding",
shape=[size, dim],
dtype=dtype,
initializer=tf.random_uniform_initializer(
-1.0 / math.sqrt(dim),
1.0 / math.sqrt(dim)
),
trainable=trainable
)
return W
def embedding_lookup(inputs,
size,
dim,
dtype,
reuse=None,
mask_negative=False,
trainable=True,
place_on_cpu_if_big=True):
"""
Construct an Embedding layer that gathers
elements from a matrix with `size` rows,
and `dim` features using the indices stored in `x`.
Arguments:
----------
inputs : tf.Tensor, of integer type
size : int, how many symbols in the lookup table
dim : int, how many columns per symbol.
dtype : data type for the lookup table (e.g. tf.float32)
reuse : bool, (default None) whether the lookup table
was already used before (thus this is weight sharing).
mask_negative : bool, (default False) should -1s in the
lookup input indicate padding (e.g. no lookup),
and thus should those values be masked out post-lookup.
trainable : bool (default True), whether the parameters of
this lookup table can be backpropagated into (e.g.
for Glove word vectors that are fixed pre-trained, this
can be set to False).
place_on_cpu_if_big : bool, if matrix is big, store it on cpu.
Returns:
--------
tf.Tensor, result of tf.nn.embedding_lookup(LookupTable, inputs)
"""
W = get_embedding_lookup(size, dim, dtype, reuse, trainable=trainable)
if mask_negative:
embedded = tf.nn.embedding_lookup(W, tf.maximum(inputs, 0))
null_mask = tf.expand_dims(
tf.cast(
tf.not_equal(inputs, -1),
dtype
),
-1
)
return embedded * null_mask
else:
return tf.nn.embedding_lookup(W, inputs)
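# Short sketch of the mask_negative behaviour documented above (hypothetical
# sizes, never called): ids of -1 mark padding; they are clamped to 0 for the
# gather and their embedding rows are then zeroed out, so padding positions
# contribute nothing downstream.
def _example_embedding_lookup():
    ids = tf.placeholder(tf.int32, [None, None], name="example_ids")
    return embedding_lookup(ids, size=1000, dim=32, dtype=tf.float32,
                            mask_negative=True)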
def _get_sharded_variable(name, shape, dtype, num_shards):
"""Get a list of sharded variables with the given dtype."""
if num_shards > shape[0]:
raise ValueError("Too many shards: shape=%s, num_shards=%d" %
(shape, num_shards))
unit_shard_size = int(math.floor(shape[0] / num_shards))
remaining_rows = shape[0] - unit_shard_size * num_shards
shards = []
for i in range(num_shards):
current_size = unit_shard_size
if i < remaining_rows:
current_size += 1
shards.append(
tf.get_variable(
name + "_%d" % i,
[current_size] + shape[1:],
dtype=dtype
)
)
return shards
def _get_concat_variable(name, shape, dtype, num_shards):
"""Get a sharded variable concatenated into one tensor."""
sharded_variable = _get_sharded_variable(name, shape, dtype, num_shards)
if len(sharded_variable) == 1:
return sharded_variable[0]
concat_name = name + "/concat"
concat_full_name = tf.get_variable_scope().name + "/" + concat_name + ":0"
for value in tf.get_collection(tf.GraphKeys.CONCATENATED_VARIABLES):
if value.name == concat_full_name:
return value
    concat_variable = tf.concat(sharded_variable, 0, name=concat_name)
tf.add_to_collection(tf.GraphKeys.CONCATENATED_VARIABLES, concat_variable)
return concat_variable
class SequenceModel(object):
def __init__(self,
objectives,
features,
feature_index2words,
hidden_sizes,
keep_prob,
lr,
solver,
seed=1234,
input_keep_prob=0.7,
clip_norm=-1,
name="SequenceTagger",
cudnn=False,
anneal_rate=0.99,
trainable=True,
weight_noise=0.0,
class_weights_normalize=False,
faux_cudnn=False,
class_weights=False,
class_weights_clipval=1000.0,
freeze_rate=1.0,
fused=False,
freeze_rate_anneal=0.8,
create_variables=True):
if fused and objectives[0]["type"] == "crf":
fused = False
self.keep_prob = keep_prob
self.input_keep_prob = input_keep_prob
self.hidden_sizes = hidden_sizes
self.name = name
self.objectives = objectives
self.features = features
self.feature_index2words = feature_index2words
self.seed = seed
self.lr = lr
self.fused = fused
self.weight_noise = weight_noise
self.anneal_rate = anneal_rate
self.clip_norm = clip_norm
self.solver = solver
self.class_weights_normalize = class_weights_normalize
self.class_weights = class_weights
self.class_weights_clipval = class_weights_clipval
self.rng = np.random.RandomState(seed)
self.cudnn = cudnn
self.feature_word2index = [
{w: k for k, w in enumerate(index2word)} if index2word is not None else None
for index2word in self.feature_index2words
]
self.label2index = [
{w: k for k, w in enumerate(objective["vocab"])}
for objective in self.objectives
]
if create_variables:
# 1) build graph here (TF functional code pattern)
build_model(name=self.name,
trainable=trainable,
objectives=self.objectives,
features=self.features,
feature_index2words=self.feature_index2words,
hidden_sizes=self.hidden_sizes,
keep_prob=self.keep_prob,
solver=self.solver,
freeze_rate=freeze_rate,
class_weights_normalize=self.class_weights_normalize,
class_weights=self.class_weights,
class_weights_clipval=self.class_weights_clipval,
freeze_rate_anneal=freeze_rate_anneal,
cudnn=self.cudnn,
lr=self.lr,
fused=self.fused,
weight_noise=self.weight_noise,
anneal_rate=self.anneal_rate,
input_keep_prob=self.input_keep_prob,
faux_cudnn=faux_cudnn,
clip_norm=self.clip_norm)
# 2) and use meta graph to recover these fields:
self.recover_graph_variables()
def recover_graph_variables(self):
"""Use TF meta graph to obtain key metrics
and outputs from model."""
self.labels = tf.get_collection(LABEL_PLACEHOLDERS)
self.labels_mask = tf.get_collection(LABEL_MASK_PLACEHOLDERS)
self.input_placeholders = tf.get_collection(INPUT_PLACEHOLDERS)
self.sequence_lengths = tf.get_collection(SEQUENCE_LENGTHS)[0]
self.decoded = tf.get_collection(DECODED)
self.decoded_scores = tf.get_collection(DECODED_SCORES)
self.unary_scores = tf.get_collection(UNARY_SCORES)
self.token_correct = tf.get_collection(TOKEN_CORRECT)
self.token_correct_total = tf.get_collection(TOKEN_CORRECT_TOTAL)
self.sentence_correct = tf.get_collection(SENTENCE_CORRECT)
self.sentence_correct_total = tf.get_collection(SENTENCE_CORRECT_TOTAL)
self.token_correct_all = tf.get_collection(TOKEN_CORRECT_ALL)[0]
self.token_correct_all_total = tf.get_collection(TOKEN_CORRECT_ALL_TOTAL)[0]
self.sentence_correct_all = tf.get_collection(SENTENCE_CORRECT_ALL)[0]
self.sentence_correct_all_total = tf.get_collection(SENTENCE_CORRECT_ALL_TOTAL)[0]
self.true_positives = tf.get_collection(TRUE_POSITIVES)
self.false_positives = tf.get_collection(FALSE_POSITIVES)
self.false_negatives = tf.get_collection(FALSE_NEGATIVES)
if len(self.true_positives) == 0 and len(self.token_correct) != 0:
self.true_positives = [None for _ in self.token_correct]
self.false_positives = [None for _ in self.token_correct]
self.false_negatives = [None for _ in self.token_correct]
if len(tf.get_collection(GLOBAL_STEP)) > 0:
self.global_step = tf.get_collection(GLOBAL_STEP)[0]
else:
try:
self.global_step = tf.get_default_graph().get_tensor_by_name(
self.name + "/" + "global_step:0")
except KeyError:
self.global_step = tf.Variable(0, trainable=False, name="global_step")
tf.add_to_collection(GLOBAL_STEP, self.global_step)
self.is_training = tf.get_collection(IS_TRAINING)[0]
self.noop = tf.no_op()
self.train_op = tf.get_collection(TRAIN_OP)[0]
train_summaries = tf.get_collection(TRAIN_SUMMARIES)
self.train_summaries = train_summaries[0] if len(train_summaries) > 0 else None
self.nll = tf.get_collection(NLL)[0]
self.nll_total = tf.get_collection(NLL_TOTAL)[0]
self.saver = tf.train.Saver()
@classmethod
def overrideable_fields(cls):
return [
"keep_prob",
"name",
"lr",
"clip_norm",
"class_weights_normalize",
"class_weights_clipval",
"cudnn",
"anneal_rate",
"weight_noise",
"input_keep_prob"
]
@classmethod
def fields_to_save(cls):
return [
"hidden_sizes",
"objectives",
"name",
"cudnn",
"class_weights",
"features",
"fused",
"class_weights_normalize",
"weight_noise",
"anneal_rate",
"feature_index2words",
"solver",
"lr",
"clip_norm",
"keep_prob",
"input_keep_prob",
"class_weights_clipval"
]
def predict(self, session, feed_dict):
feed_dict[self.is_training] = False
outputs, outputs_probs = session.run(
(self.decoded, self.decoded_scores), feed_dict
)
predictions_out = {}
for value, val_prob, objective in zip(outputs, outputs_probs, self.objectives):
predictions_out[objective["name"]] = (value, val_prob)
return predictions_out
def predict_proba(self, session, feed_dict):
feed_dict[self.is_training] = False
outputs = session.run(
self.unary_scores, feed_dict
)
predictions_out = {}
for value, objective in zip(outputs, self.objectives):
predictions_out[objective["name"]] = value
return predictions_out
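# Shape of the two prediction helpers above (illustrative, assuming two objectives
# named "type" and "ner"):
#   predict(...)       -> {"type": (decoded, decoded_scores), "ner": (...)}
#   predict_proba(...) -> {"type": unary_scores, "ner": unary_scores}
# where decoded / decoded_scores / unary_scores are the per-objective values fetched
# from the DECODED, DECODED_SCORES and UNARY_SCORES collections.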
def save(self, session, path):
makedirs(path, exist_ok=True)
with open(join(path, "model.json"), "wt") as fout:
save_dict = {}
for field in type(self).fields_to_save():
save_dict[field] = getattr(self, field)
json.dump(save_dict, fout)
with open(join(path, "rng.pkl"), "wb") as fout:
pickle.dump(self.rng, fout)
save_session(session, self.saver, path, verbose=True)
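# save(...) above writes, under `path` (sketch of the expected layout):
#   model.json   -- the fields listed in fields_to_save()
#   rng.pkl      -- the pickled numpy RandomState
#   plus the TensorFlow checkpoint files written by save_session(...).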
@classmethod
def load(cls, session, path, args=None, verbose=True, trainable=True,
rebuild_graph=False, faux_cudnn=False, replace_to=None, replace_from=None):
"""Convenience method for using a tensorflow session to reload
a previously saved + serialized model from disk."""
with open(join(path, "model.json"), "rt") as fin:
model_props = json.load(fin)
# update fields based on CLI:
if args is not None:
ex_fields = explicitly_set_fields()
for field in cls.overrideable_fields():
if field in ex_fields:
model_props[field] = getattr(args, field)
# prune old fields based on changes to saveable fields:
relevant_props = {}
for field in cls.fields_to_save():
if field in model_props:
relevant_props[field] = model_props[field]
relevant_props["trainable"] = trainable
relevant_props["faux_cudnn"] = faux_cudnn
if rebuild_graph:
print("Using rebuild_graph mode: creating a new graph.", flush=True)
relevant_props["create_variables"] = True
model = cls(**relevant_props)
restore_session(
session, path,
replace_to=replace_to,
replace_from=replace_from,
verbose=verbose,
use_metagraph=False
)
else:
if model_props.get("cudnn", False):
import tensorflow.contrib.cudnn_rnn
relevant_props["create_variables"] = False
restore_session(
session, path,
verbose=verbose,
use_metagraph=True
)
model = cls(**relevant_props)
rng_path = join(path, "rng.pkl")
if exists(rng_path):
# apply the saved random number generator to this
# model:
with open(rng_path, "rb") as fin:
model.rng = pickle.load(fin)
return model
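# Example (hypothetical path), restoring a saved model for further training:
#
#     with tf.Session() as session:
#         model = SequenceModel.load(session, "/path/to/saved_model/",
#                                    args=None, verbose=True, trainable=True)
#
# When rebuild_graph=True the graph is reconstructed from model.json and only the
# variable values are restored; otherwise the serialized meta graph is re-imported.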
def make_path_absolute(obj, basepath):
copied = obj.copy()
for key in ["path", "vocab"]:
if key in copied:
copied[key] = join(basepath, copied[key])
return copied
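# Example (doctest-style, POSIX path, values are illustrative):
#
#     >>> make_path_absolute({"type": "train", "path": "data/train.tsv"}, "/configs")
#     {'type': 'train', 'path': '/configs/data/train.tsv'}
#
# Only the "path" and "vocab" keys are rewritten; everything else is copied untouched.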
class Config(object):
def __init__(self, datasets, features, objectives,
wikidata_path, classification_path):
assert(len(features) > 0)
self.datasets = datasets
self.features = features
self.objectives = objectives
self.classifications = None
self.wikidata_path = wikidata_path
self.classification_path = classification_path
# build the objective names:
self._named_objectives = [obj["name"] for obj in self.objectives]
@classmethod
def load(cls, path):
with open(path, "rt") as fin:
config = json.load(fin)
config_dirname = dirname(path)
return cls(
datasets=[make_path_absolute(dataset, config_dirname) for dataset in config['datasets']],
features=[make_path_absolute(feat, config_dirname) for feat in config['features']],
objectives=[make_path_absolute(objective, config_dirname) for objective in config['objectives']],
wikidata_path=config.get("wikidata_path", None),
classification_path=(
join(config_dirname, config.get("classification_path", None))
if "classification_path" in config else None)
)
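# Sketch of a config file consumed by Config.load. The keys mirror the accesses in
# this class and in load_dataset_separate; the value shapes for "x"/"y" depend on
# the dataset classes and are not spelled out here:
#
#     {
#         "datasets": [{"type": "train", "path": "train.tsv", "x": ..., "y": ...}],
#         "features": [...],
#         "objectives": [{"name": "type", "type": "softmax"}],
#         "wikidata_path": "...",          # optional, needed for H5 datasets
#         "classification_path": "..."     # optional, needed for H5 datasets
#     }
#
# Relative "path"/"vocab" entries are resolved against the directory containing the
# config file (see make_path_absolute above).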
def load_dataset_separate(self, dataset_type):
paths = [dataset for dataset in self.datasets if dataset["type"] == dataset_type]
all_examples = {}
for dataset in paths:
_, extension = splitext(dataset["path"])
if extension == ".h5" or extension == ".hdf5":
if self.classifications is None:
if self.wikidata_path is None or self.classification_path is None:
raise ValueError("missing wikidata_path and "
"classification_path, cannot "
"construct H5Dataset.")
self.classifications = ClassificationHandler(
self.wikidata_path,
self.classification_path
)
examples = H5Dataset(
dataset["path"],
dataset["x"],
dataset["y"],
self._named_objectives,
ignore_value=dataset.get('ignore', None),
classifications=self.classifications)
else:
examples = TSVDataset(
dataset["path"],
dataset["x"],
dataset["y"],
self._named_objectives,
comment=dataset.get('comment', '#'),
ignore_value=dataset.get('ignore', None),
retokenize=dataset.get('retokenize', False))
title = dataset["path"].split('/')[-1].split(".")[0]
name = title
iteration = 1
while name in all_examples:
name = title + "-%d" % (iteration,)
iteration += 1
all_examples[name] = examples
return all_examples
def load_dataset(self, dataset_type, merge=True):
datasets = self.load_dataset_separate(dataset_type)
if merge:
return CombinedDataset(list(datasets.values()))
return datasets
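# Typical use (sketch): load_dataset("train") returns one CombinedDataset over every
# dataset entry whose "type" is "train"; load_dataset("dev", merge=False) returns a
# dict keyed by the file's basename (deduplicated with "-1", "-2", ... suffixes),
# which is how the validation metrics get their per-dataset prefixes.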
def boolean_argument(parser, name, default):
parser.add_argument("--" + name, action="store_true", default=default)
parser.add_argument("--no" + name, action="store_false", dest=name)
def parse_args(args=None):
parser = argparse.ArgumentParser()
parser.add_argument('config', type=str)
parser.add_argument('--lr', type=float, default=0.001)
parser.add_argument('--anneal_rate', type=float, default=0.99)
parser.add_argument('--clip_norm', type=float, default=-1)
parser.add_argument('--weight_noise', type=float, default=0.0)
parser.add_argument('--hidden_sizes', type=int, nargs="*", default=[200, 200])
parser.add_argument('--load_dir', type=str, default=None)
parser.add_argument('--restore_input_features', type=str, default=None)
parser.add_argument('--improvement_key', type=str, default="token_correct")
parser.add_argument('--freeze_rate', type=float, default=1.0)
parser.add_argument('--freeze_rate_anneal', type=float, default=0.8)
parser.add_argument('--save_dir', type=str, default=None)
parser.add_argument('--max_epochs', type=int, default=1000)
parser.add_argument('--test_every', type=int, default=10000,
help="Number of training iterations after which testing should occur.")
parser.add_argument('--batch_size', type=int, default=128)
parser.add_argument('--max_patience', type=int, default=10)
parser.add_argument('--class_weights_clipval', type=float, default=1000.0)
parser.add_argument('--device', type=str, default="gpu:0")
parser.add_argument('--keep_prob', type=float, default=0.5)
parser.add_argument('--input_keep_prob', type=float, default=0.7)
parser.add_argument('--solver', type=str, default="adam",
choices=["adam", "sgd"])
parser.add_argument("--name", type=str, default="SequenceTagger")
parser.add_argument("--old_name", type=str, default=None)
boolean_argument(parser, "cudnn", True)
boolean_argument(parser, "faux_cudnn", False)
boolean_argument(parser, "class_weights", False)
boolean_argument(parser, "rebuild_graph", False)
boolean_argument(parser, "class_weights_normalize", False)
boolean_argument(parser, "fused", True)
boolean_argument(parser, "report_metrics_per_axis", True)
boolean_argument(parser, "report_class_f1", False)
return parser.parse_args(args=args)
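# Typical invocation (paths are placeholders; flags map one-to-one onto the
# arguments registered above):
#
#     python3 train_type.py my_config.json --save_dir /tmp/type_model \
#         --batch_size 256 --max_epochs 100 --nofused --noclass_weights
#
# Boolean options follow the --<flag>/--no<flag> convention from boolean_argument.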
def get_vocab(dataset, max_vocab=-1, extra_words=None):
index2word = []
occurrence = {}
for el in dataset:
if el not in occurrence:
index2word.append(el)
occurrence[el] = 1
else:
occurrence[el] += 1
index2word = sorted(index2word, key=lambda x: occurrence[x], reverse=True)
if max_vocab > 0:
index2word = index2word[:max_vocab]
if extra_words is not None:
index2word = extra_words + index2word
return index2word
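# get_vocab sorts by frequency (ties keep first-seen order), truncates to max_vocab,
# then prepends extra_words. Example (doctest-style):
#
#     >>> get_vocab(["a", "b", "a", "c"], max_vocab=2, extra_words=["<UNK>"])
#     ['<UNK>', 'a', 'b']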
def get_objectives(objectives, dataset):
out = []
for obj_idx, objective in enumerate(objectives):
if "vocab" in objective:
with open(objective["vocab"], "rt") as fin:
vocab = fin.read().splitlines()
else:
vocab = get_vocab((w[obj_idx] for _, y in dataset for w in y if w[obj_idx] is not None), -1)
out.append(
{
"vocab": vocab,
"type": objective["type"],
"name": objective["name"]
}
)
return out
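# get_objectives returns one entry per configured objective, e.g. (illustrative):
#   [{"vocab": ["person", "place", ...], "type": "softmax", "name": "type"}]
# The vocab is read from the objective's "vocab" file when one is given, otherwise
# it is collected from the labels observed in the training set.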
def merge_all_metrics(metrics):
out = {}
for key, metric in metrics.items():
for subkey, submetric in metric.items():
if len(key) > 0:
out[key + "_" + subkey] = submetric
if subkey not in out:
out[subkey] = submetric
else:
out[subkey] += submetric
else:
out[subkey] = submetric
return out
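# merge_all_metrics flattens {dataset_title: {metric: value}} into a single dict,
# keeping per-dataset keys and summing the unprefixed aggregate. Example:
#
#     >>> merge_all_metrics({"dev": {"nll": 4.0, "nll_total": 2},
#     ...                    "test": {"nll": 6.0, "nll_total": 3}})
#     {'dev_nll': 4.0, 'nll': 10.0, 'dev_nll_total': 2, 'nll_total': 5, 'test_nll': 6.0, 'test_nll_total': 3}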
def log_outcome(logger, outcome, step, name):
for k, v in sorted(outcome.items()):
if "total" in k:
continue
else:
total = outcome[k + "_total"]
if total == 0:
continue
logger.log(k, v / total, step=step)
logger.writer.flush()
def compute_f1(metrics, objectives, report_class_f1):
total_f1 = 0.0
total_precision = 0.0
total_recall = 0.0
total = 0
for objective in objectives:
name = objective["name"]
key = "%s_true_positives" % (name,)
if key not in metrics:
continue
tp = metrics[key]
fp = metrics["%s_false_positives" % (name,)]
fn = metrics["%s_false_negatives" % (name,)]
del metrics[key]
del metrics["%s_false_positives" % (name,)]
del metrics["%s_false_negatives" % (name,)]
precision = 1.0 * tp / np.maximum(tp + fp, 1e-6)
recall = 1.0 * tp / np.maximum(tp + fn, 1e-6)
f1 = 2.0 * precision * recall / np.maximum(precision + recall, 1e-6)
support = tp + fn
full_f1 = np.average(f1, weights=support) * 100.0
full_recall = np.average(recall, weights=support) * 100.0
full_precision = np.average(precision, weights=support) * 100.0
total_f1 += full_f1
total_recall += full_recall
total_precision += full_precision
total += 1
if report_class_f1:
print("F1 %s: %r" % (name, full_f1))
print("Name\tF1\tTP\tFP\tFN")
has_support = support > 0
# keep the label list and the per-class arrays aligned by filtering both:
rows = zip([label for label, keep in zip(objective["vocab"], has_support) if keep],
f1[has_support], tp[has_support], fp[has_support], fn[has_support])
for val, f1_val, val_tp, val_fp, val_fn in rows:
print("%s\t%r\t%d\t%d\t%d" % (
val, f1_val, val_tp, val_fp, val_fn))
print("")
if total > 0:
metrics["F1"] = total_f1
metrics["recall"] = total_recall
metrics["precision"] = total_precision
metrics["F1_total"] = total
metrics["recall_total"] = total
metrics["precision_total"] = total
def accuracy(model, session, datasets, batch_size, train,
report_metrics_per_axis, report_class_f1,
callback=None,
callback_period=None, writer=None):
pbar = get_progress_bar("train" if train else "validation", item="batches")
if not isinstance(datasets, dict):
datasets = {'':datasets}
all_metrics_agg = {}
if callback is not None:
if callback_period is None:
raise ValueError("callback_period cannot be None if "
"callback is used.")
else:
callback_period = None
if train:
train_op = model.train_op
else:
train_op = model.noop
is_training = model.is_training
metrics = {"nll": model.nll, "nll_total": model.nll_total}
summaries = []
if not train:
metric_iter = zip(
model.objectives,
model.token_correct,
model.token_correct_total,
model.sentence_correct,
model.sentence_correct_total,
model.true_positives,
model.false_positives,
model.false_negatives
)
for metric_vars in metric_iter:
(
objective,
token_correct,
token_correct_total,
sentence_correct,
sentence_correct_total,
true_positives,
false_positives,
false_negatives
) = metric_vars
name = objective["name"]
if report_metrics_per_axis:
metrics["%s_token_correct" % (name,)] = token_correct
metrics["%s_token_correct_total" % (name,)] = token_correct_total
metrics["%s_sentence_correct" % (name,)] = sentence_correct
metrics["%s_sentence_correct_total" % (name,)] = sentence_correct_total
if true_positives is not None:
metrics["%s_true_positives" % (name,)] = true_positives
metrics["%s_false_positives" % (name,)] = false_positives
metrics["%s_false_negatives" % (name,)] = false_negatives
metrics["token_correct"] = model.token_correct_all
metrics["token_correct_total"] = model.token_correct_all_total
metrics["sentence_correct"] = model.sentence_correct_all
metrics["sentence_correct_total"] = model.sentence_correct_all_total
summaries = []
else:
if writer is not None and model.train_summaries is not None:
summaries = model.train_summaries
metrics_values = [v for _, v in sorted(metrics.items())]
metrics_names = [name for name, _ in sorted(metrics.items())]
outputs_val = [train_op, model.global_step, summaries, metrics_values]
for title, dataset in datasets.items():
batches = iter_batches_single_threaded(
model=model,
dataset=dataset,
batch_size=batch_size,
train=train,
pbar=pbar
)
metrics_agg = {}
iteration = 0
for feed_dict in batches:
feed_dict[is_training] = train
_, step, summary_out, outputs = session.run(outputs_val, feed_dict)
if writer is not None:
writer.add_summary(summary_out, step)
for key, value in zip(metrics_names, outputs[:len(metrics_names)]):
if key not in metrics_agg:
metrics_agg[key] = value
else:
metrics_agg[key] += value
iteration += 1
if callback_period is not None and iteration % callback_period == 0:
callback(iteration)
if np.isnan(metrics_agg['nll']):
print("loss is NaN.", flush=True, file=sys.stderr)
sys.exit(1)
compute_f1(metrics_agg, model.objectives, report_class_f1)
all_metrics_agg[title] = metrics_agg
del batches
return merge_all_metrics(all_metrics_agg)
def present_outcome(outcome, epoch, name):
string_rows = []
for k, v in sorted(outcome.items()):
if "total" in k:
continue
else:
total = outcome[k + "_total"]
if total == 0:
continue
if "correct" in k:
string_rows.append(
[
k,
"%.2f%%" % (100.0 * v / total),
"(%d correct / %d)" % (v, total)
]
)
else:
string_rows.append(
[
k,
"%.3f" % (v / total),
""
]
)
max_len_cols = [
max(len(row[colidx]) for row in string_rows)
for colidx in range(len(string_rows[0]))
] if len(string_rows) > 0 else []
rows = []
for row in string_rows:
rows.append(
" ".join(
[col + " " * (max_len_cols[colidx] - len(col))
for colidx, col in enumerate(row)]
)
)
return "\n".join(["Epoch {epoch}: {name}".format(epoch=epoch, name=name)] + rows)
def print_outcome(outcome, objectives, epoch, step, name, logger=None):
outcome_report = present_outcome(outcome, epoch, name)
if logger is not None:
log_outcome(logger, outcome, step, name)
print(outcome_report)
class SequenceTagger(object):
def __init__(self, path, device="gpu", faux_cudnn=False, rebuild_graph=False):
tf.reset_default_graph()
session_conf = tf.ConfigProto(
allow_soft_placement=True
)
self.session = tf.InteractiveSession(config=session_conf)
with tf.device(device):
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
self._model = SequenceModel.load(
self.session,
path,
args=None,
verbose=False,
trainable=False,
rebuild_graph=rebuild_graph,
faux_cudnn=faux_cudnn
)
@property
def objectives(self):
return self._model.objectives
def predict_proba(self, tokens):
blank_labels = tuple(None for _ in self._model.objectives)
batches = list(iter_batches_single_threaded(
model=self._model,
dataset=[
(tokens, [blank_labels for t in tokens])
],
batch_size=1,
train=False,
autoresize=False
))
batches[0][self._model.is_training] = False
probs_out = self._model.predict_proba(
self.session, batches[0]
)
return probs_out
def predict_proba_sentences(self, sentences):
blank_labels = tuple(None for _ in self._model.objectives)
batches = iter_batches_single_threaded(
model=self._model,
dataset=[
(sentence, [blank_labels for t in sentence])
for sentence in sentences
],
batch_size=min(256, len(sentences)),
train=False,
autoresize=False
)
for batch in batches:
batch[self._model.is_training] = False
yield self._model.predict_proba(
self.session, batch
)
def predict_topk_sentences(self, sentences, k=5):
blank_labels = tuple(None for _ in self._model.objectives)
batches = iter_batches_single_threaded(
model=self._model,
dataset=[
(sentence, [blank_labels for t in sentence])
for sentence in sentences
],
batch_size=min(256, len(sentences)),
train=False,
autoresize=False
)
sentence_idx = 0
for batch in batches:
outputs = self._model.predict_proba(
self.session, batch
)
named_outputs = {}
nsentences = 0
for objective in self._model.objectives:
obj_name = objective["name"]
tags, scores = outputs[obj_name]
nsentences = len(tags)
# pair this batch's predictions with the slice of sentences it actually covers:
batch_sentences = sentences[sentence_idx:sentence_idx + nsentences]
if objective["type"] == "crf":
named_outputs[obj_name] = [
[(token, [objective["vocab"][tag]], [sent_score]) for token, tag in zip(sent_tokens, sent_tags)]
for sent_tokens, sent_tags, sent_score in zip(batch_sentences, tags, scores)
]
elif objective["type"] == 'softmax':
all_sent_scores = []
for sent_tokens, sent_scores in zip(batch_sentences, scores):
sent_topk = []
for token, token_scores in zip(sent_tokens, sent_scores):
topk = np.argsort(token_scores)[::-1][:k]
sent_topk.append(
(
token,
[objective["vocab"][idx] for idx in topk],
[token_scores[idx] for idx in topk]
)
)
all_sent_scores.append(sent_topk)
named_outputs[obj_name] = all_sent_scores
else:
raise ValueError("unknown objective type %r." % (objective["type"],))
sentence_idx += nsentences
yield named_outputs
def tag_sentences(self, sentences):
if len(sentences) == 0:
return {
objective["name"]: []
for objective in self._model.objectives
}
blank_labels = tuple(None for _ in self._model.objectives)
batches = list(iter_batches_single_threaded(
self._model,
[
(sentence, [blank_labels for t in sentence])
for sentence in sentences
],
batch_size=min(256, len(sentences)),
train=False,
autoresize=False
))
named_outputs = {}
sentence_idx = 0
for batch in batches:
outputs = self._model.predict(self.session, batch)
for objective in self._model.objectives:
obj_name = objective["name"]
if obj_name not in named_outputs:
named_outputs[obj_name] = []
tags, scores = outputs[obj_name]
nsentences = len(tags)
if objective["type"] == "crf":
named_outputs[obj_name].extend([
[(token, objective["vocab"][tag], score) for token, tag in zip(tokens, tags)]
for tokens, tags, score in zip(sentences[sentence_idx:sentence_idx+nsentences], tags, scores)
])
elif objective["type"] == 'softmax':
named_outputs[obj_name].extend([
[(token, objective["vocab"][tag], score)
for token, tag, score in zip(tokens, tags, scores)]
for tokens, tags, scores in zip(sentences[sentence_idx:sentence_idx+nsentences], tags, scores)
])
else:
raise ValueError("unknown objective type %r." % (objective["type"],))
sentence_idx += nsentences
return named_outputs
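# Example of using the inference wrapper above (path and sentence are placeholders):
#
#     tagger = SequenceTagger("/path/to/saved_model/", device="gpu")
#     tagged = tagger.tag_sentences([["The", "prime", "minister", "spoke", "."]])
#     # -> {objective_name: [[(token, predicted_label, score), ...]], ...}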
def count_number_of_parameters():
return int(sum([np.prod(var.get_shape().as_list())
for var in tf.trainable_variables()]))
class TestCallback(object):
def __init__(self, model, session, dataset, epoch, args, logger):
self.model = model
self.session = session
self.dataset = dataset
self.epoch = epoch
self.args = args
self.logger = logger
self.report_metrics_per_axis = args.report_metrics_per_axis
self.report_class_f1 = args.report_class_f1
def test(self, iteration):
dev_outcome = accuracy(self.model, self.session, self.dataset, self.args.batch_size,
train=False, report_metrics_per_axis=self.report_metrics_per_axis,
report_class_f1=self.report_class_f1)
print_outcome(dev_outcome, self.model.objectives,
epoch="{}-{}".format(self.epoch, iteration),
step=self.session.run(self.model.global_step),
name="validation",
logger=self.logger
)
if self.args.save_dir is not None:
self.model.save(self.session, self.args.save_dir)
def compute_epoch(session, model, train_set,
validation_set, test_callback, epoch,
train_writer, test_writer,
args):
test_callback.epoch = epoch
train_outcome = accuracy(model,
session,
train_set,
args.batch_size,
train=True,
callback_period=args.test_every,
writer=train_writer.writer if train_writer is not None else None,
report_metrics_per_axis=args.report_metrics_per_axis,
report_class_f1=args.report_class_f1,
callback=test_callback.test)
global_step = session.run(model.global_step)
print_outcome(train_outcome,
model.objectives,
epoch=epoch,
name="train",
step=global_step,
logger=train_writer)
dev_outcome = accuracy(
model, session, validation_set, args.batch_size,
train=False,
report_metrics_per_axis=args.report_metrics_per_axis,
report_class_f1=args.report_class_f1)
print_outcome(dev_outcome,
model.objectives,
epoch=epoch,
step=global_step,
name="validation",
logger=test_writer)
if args.save_dir is not None:
model.save(session, args.save_dir)
return dev_outcome
def main():
args = parse_args()
config = Config.load(args.config)
validation_set = config.load_dataset("dev", merge=False)
session_conf = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=session_conf) as session, tf.device(args.device):
if args.load_dir is not None:
model = SequenceModel.load(session, args.load_dir,
args=args, rebuild_graph=args.rebuild_graph, faux_cudnn=args.faux_cudnn,
replace_to=args.name,
replace_from=args.old_name)
dev_outcome = accuracy(
model, session, validation_set, args.batch_size, train=False,
report_metrics_per_axis=args.report_metrics_per_axis,
report_class_f1=args.report_class_f1)
print_outcome(dev_outcome,
model.objectives, 0,
name="loaded validation",
step=session.run(model.global_step),
logger=None)
# dev_outcome = None
if args.rebuild_graph and args.save_dir is not None:
model.save(session, args.save_dir)
train_set = config.load_dataset("train")
else:
# build a fresh model: objective vocabs come from the configured vocab files
# when provided, otherwise they are derived from the training set
# (see get_objectives / get_feature_vocabs).
dev_outcome = None
train_set = config.load_dataset("train")
model = SequenceModel(
objectives=get_objectives(config.objectives, train_set),
features=config.features,
feature_index2words=get_feature_vocabs(config.features, train_set, ["<UNK>"]),
lr=args.lr,
anneal_rate=args.anneal_rate,
weight_noise=args.weight_noise,
freeze_rate=args.freeze_rate,
freeze_rate_anneal=args.freeze_rate_anneal,
clip_norm=args.clip_norm,
hidden_sizes=args.hidden_sizes,
solver=args.solver,
fused=args.fused,
class_weights_normalize=args.class_weights_normalize,
class_weights=args.class_weights,
class_weights_clipval=args.class_weights_clipval,
keep_prob=args.keep_prob,
input_keep_prob=args.input_keep_prob,
name=args.name,
cudnn=args.cudnn,
faux_cudnn=args.faux_cudnn,
create_variables=True)
session.run(tf.global_variables_initializer())
if args.restore_input_features is not None:
restore_session(
session, args.restore_input_features,
verbose=True,
use_metagraph=False,
only_features=True)
print("Model has {} trainable parameters.".format(count_number_of_parameters()), flush=True)
best_dev_score = 0.0
patience = 0
best_epoch = 0
best_outcome = None
improvement_key = args.improvement_key
if dev_outcome is not None:
best_dev_score = dev_outcome[improvement_key]
best_epoch = -1
best_outcome = dev_outcome
if args.save_dir is not None:
train_writer = Logger(session, tf.summary.FileWriter(join(args.save_dir, "train")))
test_writer = Logger(session, tf.summary.FileWriter(join(args.save_dir, "test")))
else:
train_writer, test_writer = None, None
test_callback = TestCallback(model,
session,
validation_set,
-1,
args,
logger=test_writer)
if len(train_set) > 0:
train_set.set_randomize(True)
train_set.set_rng(model.rng)
for epoch in range(args.max_epochs):
dev_outcome = compute_epoch(
session, model,
train_set=train_set, validation_set=validation_set,
epoch=epoch, test_callback=test_callback,
train_writer=train_writer,
test_writer=test_writer,
args=args)
if dev_outcome[improvement_key] > best_dev_score:
best_dev_score = dev_outcome[improvement_key]
best_epoch = epoch
best_outcome = dev_outcome
patience = 0
if args.save_dir is not None:
model.save(session, join(args.save_dir, "best"))
else:
patience += 1
if patience >= args.max_patience:
print("No improvements for {} epochs. Stopping.".format(args.max_patience))
break
del dev_outcome
print_outcome(
best_outcome,
model.objectives,
epoch=best_epoch,
name="validation-best",
step=session.run(model.global_step),
logger=None)
if __name__ == "__main__":
main()
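# Resuming a previous run (placeholder paths):
#
#     python3 train_type.py path/to/config.json --load_dir /tmp/type_model --save_dir /tmp/type_model
#
# main() trains for up to `max_epochs`, evaluates on the "dev" datasets every
# `test_every` iterations and at each epoch end, and stops early after
# `max_patience` epochs without improvement on `improvement_key`.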