void HTML::hardAlignments()

in inference/src/translator/html.cpp [752:809]


void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
                          std::vector<SpanIterator> const &sourceTokenSpans) {
  size_t offset = 0;  // sentence offset in sourceTokenSpans

  // For each sentence...
  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
    alignments.emplace_back();

    // Hard-align: pick for each target token the source token with the highest
    // alignment score.
    // Note: skip the last target token, which is the end-of-sentence token and
    // can only align with the source's end-of-sentence token (handled below).
    for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
      alignments.back().push_back(
          std::max_element(response.alignments[sentenceIdx][t].begin(), response.alignments[sentenceIdx][t].end()) -
          response.alignments[sentenceIdx][t].begin());
    }

    // Next, we try to smooth out these selected alignments with a few heuristics
    for (size_t t = 1; t + 1 < response.target.numWords(sentenceIdx); ++t) {
      // If this token is a continuation of the previous token (i.e. part of the
      // same word), make the whole word share a single source alignment so it
      // picks up one consistent set of tags.
      if (isContinuation(response.target.word(sentenceIdx, t - 1), response.target.word(sentenceIdx, t))) {
        // Note: only looking at the previous token since that will already
        // have this treatment applied to it.
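        // Source token indices (within this sentence) that the current and
        // previous target tokens were hard-aligned to in the loop above.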
        size_t currSentenceIdx = alignments.back()[t];
        size_t prevSentenceIdx = alignments.back()[t - 1];
        float currScore = response.alignments[sentenceIdx][t][currSentenceIdx];
        float prevScore = response.alignments[sentenceIdx][t - 1][prevSentenceIdx];

        TagStack const &currTagStack = sourceTokenSpans[offset + 1 + currSentenceIdx]->tags;
        TagStack const &prevTagStack = sourceTokenSpans[offset + 1 + prevSentenceIdx]->tags;
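        // The offset + 1 above skips the gap span that precedes this sentence's
        // source tokens in sourceTokenSpans (cf. the "+1 for prefix gap" in the
        // offset update below).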

        // If this token's source carries more markup, or has a better alignment
        // score, than the previous token's source (and the two tokens are part
        // of the same word-ish unit), align the whole word to this token's
        // source. Otherwise just copy the previous token's source alignment.
        if (extends(currTagStack, prevTagStack) || currScore >= prevScore) {
          // Apply this to all previous tokens in the word
          for (size_t i = t;; --i) {
            alignments.back()[i] = currSentenceIdx;

            // Stop if this was the first token or the beginning of the word
            if (i == 0 ||
                !isContinuation(response.target.word(sentenceIdx, i - 1), response.target.word(sentenceIdx, i)))
              break;
          }
        } else {
          alignments.back()[t] = prevSentenceIdx;
        }
      }
    }

    // Always align target end with source end
    alignments.back().push_back(response.source.numWords(sentenceIdx) - 1);

    offset += response.source.numWords(sentenceIdx) + 1;  // +1 for prefix gap
  }
}
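
For a standalone illustration of the two steps above, the sketch below hard-aligns each target token to its highest-scoring source token and then lets every token of a multi-token word share one choice. All names and data here are made-up stand-ins (including the simplistic isContinuationOf), not the translator's real types or tokenization rules:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Illustrative stand-in for isContinuation(): a token that does not start with
// a space is treated as a continuation of the previous token.
static bool isContinuationOf(std::string const &prev, std::string const &curr) {
  (void)prev;
  return !curr.empty() && curr.front() != ' ';
}

int main() {
  // One sentence: a soft-alignment row per target token over three source tokens.
  std::vector<std::vector<float>> soft = {
      {0.8f, 0.1f, 0.1f},  // " Hel"
      {0.2f, 0.7f, 0.1f},  // "lo"  (continuation of the previous token)
      {0.1f, 0.1f, 0.8f},  // " world"
  };
  std::vector<std::string> target = {" Hel", "lo", " world"};

  // Hard-align: argmax over each row, as in the per-token loop above.
  std::vector<size_t> hard;
  for (auto const &row : soft)
    hard.push_back(std::max_element(row.begin(), row.end()) - row.begin());

  // Smooth: back-fill one source index over all tokens of the word. Here we
  // simply keep the later token's choice; the real code also compares scores
  // and tag stacks before deciding which token's choice wins.
  for (size_t t = 1; t < target.size(); ++t) {
    if (!isContinuationOf(target[t - 1], target[t])) continue;
    for (size_t i = t;; --i) {
      hard[i] = hard[t];
      if (i == 0 || !isContinuationOf(target[i - 1], target[i])) break;
    }
  }

  for (size_t t = 0; t < hard.size(); ++t)
    std::cout << '"' << target[t] << "\" -> source token " << hard[t] << "\n";
  // " Hel" and "lo" both end up aligned to source token 1, " world" to token 2.
}

The real function additionally pins the final target token to the final source token, since the end-of-sentence markers only ever align with each other.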