in src/model.cpp [353:485]
float EmbedModel::trainOneBatch(shared_ptr<InternDataHandler> data,
const vector<ParseResults>& batch_exs,
size_t negSearchLimit,
Real rate0,
bool trainWord) {
using namespace boost::numeric::ublas;
// Keep all the activations on the stack so we can asynchronously
// update.
int batch_sz = batch_exs.size();
std::vector<Matrix<Real>> lhs(batch_sz), rhsP(batch_sz);
std::vector<Real> posSim(batch_sz);
std::vector<Real> labelRate(batch_sz, -rate0);
auto cols = args_->dim;
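// Project each example's LHS (input features) and positive RHS (label)
// into the embedding space, and cache the positive similarity for the
// margin loss below.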
for (auto i = 0; i < batch_sz; i++) {
const auto& items = batch_exs[i].LHSTokens;
const auto& labels = batch_exs[i].RHSTokens;
projectLHS(items, lhs[i]);
check(lhs[i]);
projectRHS(labels, rhsP[i]);
check(rhsP[i]);
posSim[i] = similarity(lhs[i], rhsP[i]);
}
// Some simple helpers to characterize the current triple we're
// considering.
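// tripleLoss is the margin ranking loss for one (lhs, rhs+, rhs-) triple:
// max(0, margin - sim(lhs, rhs+) + sim(lhs, rhs-)), clamped at kMaxLoss.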
auto tripleLoss = [&] (Real posSim, Real negSim) {
auto val = args_->margin - posSim + negSim;
assert(!std::isnan(posSim));
assert(!std::isnan(negSim));
assert(!std::isinf(posSim));
assert(!std::isinf(negSim));
// We want the max representable loss to have some wiggle room to
// compute with.
const auto kMaxLoss = 10e7;
auto retval = (std::max)((std::min)(val, kMaxLoss), 0.0);
return retval;
};
// Get a random batch of negatives
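// The same negSearchLimit candidates are shared by every example in the
// batch; trainWord selects word-level negatives instead of full RHS ones.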
std::vector<Matrix<Real>> rhsN(negSearchLimit);
std::vector<std::vector<Base>> batch_negLabels;
for (unsigned int i = 0; i < negSearchLimit; i++) {
std::vector<Base> negLabels;
if (trainWord) {
data->getRandomWord(negLabels);
} else {
data->getRandomRHS(negLabels);
}
projectRHS(negLabels, rhsN[i]);
check(rhsN[i]);
batch_negLabels.push_back(negLabels);
}
// Select negative examples
Real total_loss = 0.0;
std::vector<Real> loss(batch_sz);
std::vector<Matrix<Real>> negMean(batch_sz);
std::vector<int> num_negs(batch_sz);
std::vector<std::vector<Real>> nRate(batch_sz);
std::vector<std::vector<bool>> update_flag(batch_sz);
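// For each example, count the negatives that violate the margin
// (num_negs), accumulate their embeddings in negMean, and record a
// per-negative learning rate in nRate for the ones that get updated.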
for (int i = 0; i < batch_sz; i++) {
num_negs[i] = 0;
loss[i] = 0.0;
negMean[i].matrix = zero_matrix<Real>(1, cols);
update_flag[i].resize(negSearchLimit, false);
nRate[i].resize(negSearchLimit, 0);
for (unsigned int j = 0; j < negSearchLimit; j++) {
nRate[i][j] = 0.0;
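// Skip accidental hits: a sampled negative identical to the true label.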
if (batch_exs[i].RHSTokens == batch_negLabels[j]) {
continue;
}
auto thisLoss = tripleLoss(posSim[i], similarity(lhs[i], rhsN[j]));
if (thisLoss > 0.0) {
num_negs[i]++;
loss[i] += thisLoss;
negMean[i].add(rhsN[j]);
assert(loss[i] >= 0.0);
update_flag[i][j] = true;
if (num_negs[i] == args_->maxNegSamples) {
break;
}
}
}
if (num_negs[i] == 0) {
continue;
}
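// loss is averaged over the whole search budget; negMean only over the
// negatives that actually violated the margin.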
loss[i] /= negSearchLimit;
negMean[i].matrix /= num_negs[i];
total_loss += loss[i];
// gradW for example i: turn negMean into mean(t-) - t+ (see dE/dw below).
negMean[i].add(rhsP[i], -1);
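// rate0 is split evenly across the violating negatives; negatives that
// never violated the margin keep a zero rate.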
for (unsigned int j = 0; j < negSearchLimit; j++) {
if (update_flag[i][j]) {
nRate[i][j] = rate0 / num_negs[i];
}
}
}
// Couldn't find a negative example given reasonable effort, so
// give up.
if (total_loss == 0.0) return 0.0;
assert(!std::isinf(total_loss));
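// A zero learning rate means there is nothing to update; just report the loss.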
if (rate0 == 0.0) return total_loss;
// Let w be the average of the input features, t+ be the positive
// example and t- be the average of the negative examples.
// Our error E is:
//
// E = k - dot(w, t+) + dot(w, t-)
//
// Differentiating term-by-term we get:
//
// dE / dw = t- - t+
// dE / dt- = w
// dE / dt+ = -w
//
// gradW = mean_j(t_j-) - t+; negMean already holds this per example, so we
// pass it straight to backward().
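// Schematically, one gradient-descent step on E per example (the exact
// bookkeeping, including labelRate and nRate, lives in backward()):
//   lhs  <- lhs  - rate0 * negMean              // negMean == t- - t+
//   t+   <- t+   + rate0 * lhs                  // since dE/dt+ = -w
//   t-_j <- t-_j - (rate0 / num_negs) * lhs     // for each violating j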
backward(batch_exs, batch_negLabels,
negMean, lhs, num_negs,
rate0, labelRate, nRate);
return total_loss;
}