in inference/src/translator/html.cpp [752:809]
// Turns the soft alignment scores stored in `response` into hard alignments:
// for every target token, the index of exactly one source token.
//
// One vector is appended to `alignments` per target sentence; anything already
// in `alignments` is left untouched. Each appended vector holds one entry per
// non-final target token, followed by a final entry that always points at the
// last source token of that sentence.
//
// `sourceTokenSpans` is addressed as (running offset + 1 + source token index).
// The running offset grows by the sentence's source word count plus one after
// every sentence (the extra slot is the per-sentence prefix gap).
void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
                          std::vector<SpanIterator> const &sourceTokenSpans) {
  size_t spanOffset = 0;  // where the current sentence starts in sourceTokenSpans

  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
    alignments.emplace_back();
    std::vector<size_t> &hard = alignments.back();

    auto const &soft = response.alignments[sentenceIdx];
    const auto targetWordCount = response.target.numWords(sentenceIdx);
    const auto sourceWordCount = response.source.numWords(sentenceIdx);

    // True when target token `i` continues the word started by token `i - 1`.
    auto continuesPrevious = [&](size_t i) {
      return isContinuation(response.target.word(sentenceIdx, i - 1), response.target.word(sentenceIdx, i));
    };

    // Step 1: arg-max. Every target token except the last one gets the source
    // token with the highest score. The last target token is the
    // end-of-sentence token and is handled separately in step 3.
    for (size_t t = 0; t + 1 < targetWordCount; ++t) {
      auto const &scores = soft[t];
      hard.push_back(std::max_element(scores.begin(), scores.end()) - scores.begin());
    }

    // Step 2: smooth the choices so that all tokens making up one word share
    // a single source token.
    for (size_t t = 1; t + 1 < targetWordCount; ++t) {
      if (!continuesPrevious(t)) continue;

      // Comparing against the direct predecessor is sufficient: it has already
      // been through this very step.
      const size_t currSource = hard[t];
      const size_t prevSource = hard[t - 1];
      const float currScore = soft[t][currSource];
      const float prevScore = soft[t - 1][prevSource];

      TagStack const &currTags = sourceTokenSpans[spanOffset + 1 + currSource]->tags;
      TagStack const &prevTags = sourceTokenSpans[spanOffset + 1 + prevSource]->tags;

      // The current token wins when its source token carries more markup than
      // the predecessor's, or when it scores at least as well. Otherwise the
      // current token simply inherits the predecessor's source token.
      const bool currentWins = extends(currTags, prevTags) || currScore >= prevScore;
      if (!currentWins) {
        hard[t] = prevSource;
        continue;
      }

      // Propagate the winner backwards over the whole word, stopping at the
      // first token of the sentence or at the token that starts the word.
      size_t i = t;
      while (true) {
        hard[i] = currSource;
        if (i == 0 || !continuesPrevious(i)) break;
        --i;
      }
    }

    // Step 3: the end of the target sentence always maps to the end of the
    // source sentence.
    hard.push_back(sourceWordCount - 1);

    spanOffset += sourceWordCount + 1;  // +1 for prefix gap
  }
}