in inference/src/translator/translation_model.cpp [139:202]
Ptr<marian::data::CorpusBatch> TranslationModel::convertToMarianBatch(Batch &batch) {
std::vector<data::SentenceTuple> batchVector;
auto &sentences = batch.sentences();
size_t batchSequenceNumber{0};
for (auto &sentence : sentences) {
data::SentenceTuple sentence_tuple(batchSequenceNumber);
Segment segment = sentence.getUnderlyingSegment();
sentence_tuple.push_back(segment);
batchVector.push_back(sentence_tuple);
++batchSequenceNumber;
}
// Usually one would expect inputs to be [B x T], where B = batch-size and T = max seq-len among the sentences in the
// batch. However, marian's library supports multi-source and ensembling through different source-vocabulary but same
// target vocabulary. This means the inputs are 3 dimensional when converted into marian's library formatted batches.
//
// Consequently B x T projects to N x B x T, where N = ensemble size. This adaptation does not fully force the idea of
// N = 1 (the code remains general, but N iterates only from 0-1 in the nested loop).
size_t batchSize = batchVector.size();
std::vector<size_t> sentenceIds;
std::vector<int> maxDims;
for (auto &example : batchVector) {
if (maxDims.size() < example.size()) {
maxDims.resize(example.size(), 0);
}
for (size_t i = 0; i < example.size(); ++i) {
if (example[i].size() > static_cast<size_t>(maxDims[i])) {
maxDims[i] = static_cast<int>(example[i].size());
}
}
sentenceIds.push_back(example.getId());
}
using SubBatch = marian::data::SubBatch;
std::vector<Ptr<SubBatch>> subBatches;
for (size_t j = 0; j < maxDims.size(); ++j) {
subBatches.emplace_back(New<SubBatch>(batchSize, maxDims[j], vocabs_.sources().at(j)));
}
std::vector<size_t> words(maxDims.size(), 0);
for (size_t i = 0; i < batchSize; ++i) {
for (size_t j = 0; j < maxDims.size(); ++j) {
for (size_t k = 0; k < batchVector[i][j].size(); ++k) {
subBatches[j]->data()[k * batchSize + i] = batchVector[i][j][k];
subBatches[j]->mask()[k * batchSize + i] = 1.f;
words[j]++;
}
}
}
for (size_t j = 0; j < maxDims.size(); ++j) {
subBatches[j]->setWords(words[j]);
}
using CorpusBatch = marian::data::CorpusBatch;
Ptr<CorpusBatch> corpusBatch = New<CorpusBatch>(subBatches);
corpusBatch->setSentenceIds(sentenceIds);
return corpusBatch;
}