in src/cpp/modules/manual/lstm_d.cpp [139:271]
void lstm_predict_d(int l, int b,
const MainParams<T>& main_params, const ExtraParams<T>& extra_params,
State<T>& state,
const T* input,
LayerStateJacobianPredict<T>& zero_layer_jacobian,
ModelJacobian<T>& layer_state_d,
T* output,
StateJacobianPredict<T>& state_jacobian,
PredictionJacobian<T>& output_jacobian)
{
// Intial setup (from predict())
for (int i = 0; i < b; ++i) {
output[i] = input[i] * extra_params.in_weight[i];
// note that the rest of zero_layer_jacobian.d_hidden and zero_layer_jacobian.d_cell are unused
*zero_layer_jacobian.d_hidden[i].d_extra_in_weight = input[i];
}
// Pointer to current output/next layer's input
T* layer_output = output;
// Pointer to the jacobian of the previous layer
LayerStateJacobianPredict<T>* prev_layer_jacobian = &zero_layer_jacobian;
// Main LSTM loop (from predict())
for (int i = 0; i < l; ++i)
{
lstm_model_d(b, main_params.layer_params[i], state.layer_state[i], layer_output, layer_state_d);
layer_output = state.layer_state[i].hidden;
// set state_jacobian.layer[i]
for (int j = 0; j < b; ++j)
{
T hidden_j_d_input = layer_state_d.hidden[j].d_input;
T cell_j_d_input = layer_state_d.cell[j].d_input;
// derivatives by variables on which layer_output depends
*state_jacobian.layer[i].d_hidden[j].d_extra_in_weight = hidden_j_d_input * (*prev_layer_jacobian->d_hidden[j].d_extra_in_weight);
*state_jacobian.layer[i].d_cell[j].d_extra_in_weight = cell_j_d_input * (*prev_layer_jacobian->d_hidden[j].d_extra_in_weight);
for (int k = 0; k < i ; ++k)
{
state_jacobian.layer[i].d_hidden[j].d_weight_forget[k] = hidden_j_d_input * prev_layer_jacobian->d_hidden[j].d_weight_forget[k];
state_jacobian.layer[i].d_hidden[j].d_weight_ingate[k] = hidden_j_d_input * prev_layer_jacobian->d_hidden[j].d_weight_ingate[k];
state_jacobian.layer[i].d_hidden[j].d_weight_outgate[k] = hidden_j_d_input * prev_layer_jacobian->d_hidden[j].d_weight_outgate[k];
state_jacobian.layer[i].d_hidden[j].d_weight_change[k] = hidden_j_d_input * prev_layer_jacobian->d_hidden[j].d_weight_change[k];
state_jacobian.layer[i].d_hidden[j].d_bias_forget[k] = hidden_j_d_input * prev_layer_jacobian->d_hidden[j].d_bias_forget[k];
state_jacobian.layer[i].d_hidden[j].d_bias_ingate[k] = hidden_j_d_input * prev_layer_jacobian->d_hidden[j].d_bias_ingate[k];
state_jacobian.layer[i].d_hidden[j].d_bias_outgate[k] = hidden_j_d_input * prev_layer_jacobian->d_hidden[j].d_bias_outgate[k];
state_jacobian.layer[i].d_hidden[j].d_bias_change[k] = hidden_j_d_input * prev_layer_jacobian->d_hidden[j].d_bias_change[k];
state_jacobian.layer[i].d_hidden[j].d_hidden[k] = hidden_j_d_input * prev_layer_jacobian->d_hidden[j].d_hidden[k];
state_jacobian.layer[i].d_hidden[j].d_cell[k] = hidden_j_d_input * prev_layer_jacobian->d_hidden[j].d_cell[k];
state_jacobian.layer[i].d_cell[j].d_weight_forget[k] = cell_j_d_input * prev_layer_jacobian->d_hidden[j].d_weight_forget[k];
state_jacobian.layer[i].d_cell[j].d_weight_ingate[k] = cell_j_d_input * prev_layer_jacobian->d_hidden[j].d_weight_ingate[k];
state_jacobian.layer[i].d_cell[j].d_weight_outgate[k] = cell_j_d_input * prev_layer_jacobian->d_hidden[j].d_weight_outgate[k];
state_jacobian.layer[i].d_cell[j].d_weight_change[k] = cell_j_d_input * prev_layer_jacobian->d_hidden[j].d_weight_change[k];
state_jacobian.layer[i].d_cell[j].d_bias_forget[k] = cell_j_d_input * prev_layer_jacobian->d_hidden[j].d_bias_forget[k];
state_jacobian.layer[i].d_cell[j].d_bias_ingate[k] = cell_j_d_input * prev_layer_jacobian->d_hidden[j].d_bias_ingate[k];
state_jacobian.layer[i].d_cell[j].d_bias_outgate[k] = cell_j_d_input * prev_layer_jacobian->d_hidden[j].d_bias_outgate[k];
state_jacobian.layer[i].d_cell[j].d_bias_change[k] = cell_j_d_input * prev_layer_jacobian->d_hidden[j].d_bias_change[k];
state_jacobian.layer[i].d_cell[j].d_hidden[k] = cell_j_d_input * prev_layer_jacobian->d_hidden[j].d_hidden[k];
state_jacobian.layer[i].d_cell[j].d_cell[k] = cell_j_d_input * prev_layer_jacobian->d_hidden[j].d_cell[k];
}
// derivatives by variables on which lstm_model_d depends directly
state_jacobian.layer[i].d_hidden[j].d_weight_forget[i] = layer_state_d.hidden[j].d_weight.forget;
state_jacobian.layer[i].d_hidden[j].d_weight_ingate[i] = layer_state_d.hidden[j].d_weight.ingate;
state_jacobian.layer[i].d_hidden[j].d_weight_outgate[i] = layer_state_d.hidden[j].d_weight.outgate;
state_jacobian.layer[i].d_hidden[j].d_weight_change[i] = layer_state_d.hidden[j].d_weight.change;
state_jacobian.layer[i].d_hidden[j].d_bias_forget[i] = layer_state_d.hidden[j].d_bias.forget;
state_jacobian.layer[i].d_hidden[j].d_bias_ingate[i] = layer_state_d.hidden[j].d_bias.ingate;
state_jacobian.layer[i].d_hidden[j].d_bias_outgate[i] = layer_state_d.hidden[j].d_bias.outgate;
state_jacobian.layer[i].d_hidden[j].d_bias_change[i] = layer_state_d.hidden[j].d_bias.change;
state_jacobian.layer[i].d_hidden[j].d_hidden[i] = layer_state_d.hidden[j].d_hidden;
state_jacobian.layer[i].d_hidden[j].d_cell[i] = layer_state_d.hidden[j].d_cell;
state_jacobian.layer[i].d_cell[j].d_weight_forget[i] = layer_state_d.cell[j].d_weight.forget;
state_jacobian.layer[i].d_cell[j].d_weight_ingate[i] = layer_state_d.cell[j].d_weight.ingate;
state_jacobian.layer[i].d_cell[j].d_weight_outgate[i] = layer_state_d.cell[j].d_weight.outgate;
state_jacobian.layer[i].d_cell[j].d_weight_change[i] = layer_state_d.cell[j].d_weight.change;
state_jacobian.layer[i].d_cell[j].d_bias_forget[i] = layer_state_d.cell[j].d_bias.forget;
state_jacobian.layer[i].d_cell[j].d_bias_ingate[i] = layer_state_d.cell[j].d_bias.ingate;
state_jacobian.layer[i].d_cell[j].d_bias_outgate[i] = layer_state_d.cell[j].d_bias.outgate;
state_jacobian.layer[i].d_cell[j].d_bias_change[i] = layer_state_d.cell[j].d_bias.change;
state_jacobian.layer[i].d_cell[j].d_hidden[i] = layer_state_d.cell[j].d_hidden;
state_jacobian.layer[i].d_cell[j].d_cell[i] = layer_state_d.cell[j].d_cell;
// derivatives by variable on which lstm_model_d does not depend (zero)
for (int k = i + 1; k < l; ++k)
{
state_jacobian.layer[i].d_hidden[j].d_weight_forget[k] = 0.;
state_jacobian.layer[i].d_hidden[j].d_weight_ingate[k] = 0.;
state_jacobian.layer[i].d_hidden[j].d_weight_outgate[k] = 0.;
state_jacobian.layer[i].d_hidden[j].d_weight_change[k] = 0.;
state_jacobian.layer[i].d_hidden[j].d_bias_forget[k] = 0.;
state_jacobian.layer[i].d_hidden[j].d_bias_ingate[k] = 0.;
state_jacobian.layer[i].d_hidden[j].d_bias_outgate[k] = 0.;
state_jacobian.layer[i].d_hidden[j].d_bias_change[k] = 0.;
state_jacobian.layer[i].d_hidden[j].d_hidden[k] = 0.;
state_jacobian.layer[i].d_hidden[j].d_cell[k] = 0.;
state_jacobian.layer[i].d_cell[j].d_weight_forget[k] = 0.;
state_jacobian.layer[i].d_cell[j].d_weight_ingate[k] = 0.;
state_jacobian.layer[i].d_cell[j].d_weight_outgate[k] = 0.;
state_jacobian.layer[i].d_cell[j].d_weight_change[k] = 0.;
state_jacobian.layer[i].d_cell[j].d_bias_forget[k] = 0.;
state_jacobian.layer[i].d_cell[j].d_bias_ingate[k] = 0.;
state_jacobian.layer[i].d_cell[j].d_bias_outgate[k] = 0.;
state_jacobian.layer[i].d_cell[j].d_bias_change[k] = 0.;
state_jacobian.layer[i].d_cell[j].d_hidden[k] = 0.;
state_jacobian.layer[i].d_cell[j].d_cell[k] = 0.;
}
}
prev_layer_jacobian = &state_jacobian.layer[i];
}
// Final changes (from predict())
for (int i = 0; i < b; ++i)
{
T cur_out_weight = extra_params.out_weight[i];
// compute output
output[i] = layer_output[i] * cur_out_weight + extra_params.out_bias[i];
// compute the derivatives of output
*output_jacobian.d_prediction[i].d_extra_in_weight = cur_out_weight * (*prev_layer_jacobian->d_hidden[i].d_extra_in_weight);
*output_jacobian.d_prediction[i].d_extra_out_weight = layer_output[i];
*output_jacobian.d_prediction[i].d_extra_out_bias = 1.;
for (int j = 0; j < l; ++j)
{
output_jacobian.d_prediction[i].d_weight_forget[j] = cur_out_weight * prev_layer_jacobian->d_hidden[i].d_weight_forget[j];
output_jacobian.d_prediction[i].d_weight_ingate[j] = cur_out_weight * prev_layer_jacobian->d_hidden[i].d_weight_ingate[j];
output_jacobian.d_prediction[i].d_weight_outgate[j] = cur_out_weight * prev_layer_jacobian->d_hidden[i].d_weight_outgate[j];
output_jacobian.d_prediction[i].d_weight_change[j] = cur_out_weight * prev_layer_jacobian->d_hidden[i].d_weight_change[j];
output_jacobian.d_prediction[i].d_bias_forget[j] = cur_out_weight * prev_layer_jacobian->d_hidden[i].d_bias_forget[j];
output_jacobian.d_prediction[i].d_bias_ingate[j] = cur_out_weight * prev_layer_jacobian->d_hidden[i].d_bias_ingate[j];
output_jacobian.d_prediction[i].d_bias_outgate[j] = cur_out_weight * prev_layer_jacobian->d_hidden[i].d_bias_outgate[j];
output_jacobian.d_prediction[i].d_bias_change[j] = cur_out_weight * prev_layer_jacobian->d_hidden[i].d_bias_change[j];
output_jacobian.d_prediction[i].d_hidden[j] = cur_out_weight * prev_layer_jacobian->d_hidden[i].d_hidden[j];
output_jacobian.d_prediction[i].d_cell[j] = cur_out_weight * prev_layer_jacobian->d_hidden[i].d_cell[j];
}
}
}