in inference/src/translator/quality_estimator.cpp [187:239]
// Builds the per-word feature matrix from subword log probabilities.
//
// The result has one row per entry of `wordIndices` and exactly four columns:
//   I_MEAN         - mean of the log probabilities of the subwords of the word
//   I_MIN          - minimum log probability among the subwords of the word
//   I_NUM_SUBWORDS - number of subwords that compose the word
//   I_OVERALL_MEAN - overall log probability score of the entire sequence
//                    (the same value is written to every row)
//
// @param wordIndices half-open [begin, end) index ranges into `logProbs`, one per word.
// @param logProbs    log probabilities addressed by those ranges. Indices are not
//                    bounds-checked here, so every range must lie inside `logProbs`.
// @return the feature matrix, or an empty (0 x 0) matrix when `wordIndices` is empty
//         or when none of the ranges covers a subword.
LogisticRegressorQualityEstimator::Matrix LogisticRegressorQualityEstimator::extractFeatures(
    const std::vector<SubwordRange>& wordIndices, const std::vector<float>& logProbs) const {
  if (wordIndices.empty()) {
    // Returning the prvalue directly lets the compiler elide the copy/move.
    return Matrix(0, 0);
  }

  // Column positions inside a feature row. The number of features must currently be 4.
  constexpr size_t I_MEAN{0}, I_MIN{1}, I_NUM_SUBWORDS{2}, I_OVERALL_MEAN{3};

  // NOTE(review): the accumulation into I_MEAN below assumes that Matrix starts out
  // zero-initialized - confirm against the Matrix constructor.
  Matrix features(wordIndices.size(), /*numFeatures =*/4);

  float overallMean = 0.0f;
  size_t numlogProbs = 0;
  size_t featureRow = 0;

  for (const SubwordRange& wordIndice : wordIndices) {
    // An empty range still owns a row; its per-word columns are left as constructed.
    if (wordIndice.begin == wordIndice.end) {
      ++featureRow;
      continue;
    }

    float minScore = std::numeric_limits<float>::max();

    for (size_t i = wordIndice.begin; i < wordIndice.end; ++i) {
      const float logProb = logProbs[i];
      ++numlogProbs;
      overallMean += logProb;
      features.at(featureRow, I_MEAN) += logProb;
      minScore = std::min<float>(logProb, minScore);
    }

    features.at(featureRow, I_MEAN) /= static_cast<float>(wordIndice.size());
    features.at(featureRow, I_MIN) = minScore;
    features.at(featureRow, I_NUM_SUBWORDS) = static_cast<float>(wordIndice.size());

    ++featureRow;
  }

  // Every range was empty, so there is nothing to score.
  if (numlogProbs == 0) {
    return Matrix(0, 0);
  }

  // NOTE(review): the divisor is the end index of the last word, not numlogProbs.
  // The two agree only when the ranges are contiguous and start at 0 - confirm that
  // callers guarantee this before changing it, since a trained model may depend on it.
  overallMean /= wordIndices.back().end;

  // `features` was built with wordIndices.size() rows, so iterate with an unsigned
  // index over that count rather than comparing a signed int against the row count.
  for (size_t row = 0; row < wordIndices.size(); ++row) {
    features.at(row, I_OVERALL_MEAN) = overallMean;
  }

  // Plain return of a local: eligible for NRVO, otherwise moved implicitly.
  return features;
}