in recipes/self_training/pseudo_labeling/AnalyzeDataset.cpp [21:61]
/**
 * Compares a file of predicted transcripts against a ground-truth file.
 *
 * Reads the transcript dictionaries named by `FLAGS_infile` (predictions) and
 * `FLAGS_groundtruthfile` (ground truth), then prints to stdout:
 *   - the sample-count ratio prediction / ground truth,
 *   - the total-duration ratio prediction / ground truth,
 *   - the word-level edit distance (WER) of the predictions against the
 *     ground truth with the same id.
 *
 * Predictions whose id has no ground-truth entry are excluded from the WER
 * and their count is reported on stderr.
 *
 * @param argc Command-line argument count, forwarded to gflags.
 * @param argv Command-line arguments, forwarded to gflags.
 * @return 0 once all statistics have been printed.
 */
int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, false);

  auto predictionDict =
      filter::dataset::createTranscriptDictFromFile(FLAGS_infile);
  auto groundtruthDict =
      filter::dataset::createTranscriptDictFromFile(FLAGS_groundtruthfile);

  fl::EditDistanceMeter wer;
  size_t predictionDuration{0};
  size_t missingGroundtruth{0};
  for (const auto& sample : predictionDict) {
    const auto& prediction = sample.second;
    predictionDuration += prediction->getDuration();

    // Use find() rather than operator[]: operator[] would insert an empty
    // entry for an unknown id, which would then be dereferenced here and
    // would also inflate the ground-truth sample count reported below.
    const auto groundtruthIt = groundtruthDict.find(sample.first);
    if (groundtruthIt == groundtruthDict.end()) {
      ++missingGroundtruth;
      continue;
    }
    wer.add(prediction->transcriptWords, groundtruthIt->second->transcriptWords);
  }
  if (missingGroundtruth > 0) {
    std::cerr << "Warning: " << missingGroundtruth
              << " prediction sample(s) have no groundtruth entry and were"
              << " excluded from WER" << std::endl;
  }

  size_t groundtruthDuration{0};
  for (const auto& sample : groundtruthDict) {
    groundtruthDuration += sample.second->getDuration();
  }

  // Num samples
  std::cout << "Prediction samples / groundtruth samples = "
            << predictionDict.size() << " / " << groundtruthDict.size()
            << " = "
            << static_cast<float>(predictionDict.size()) /
          static_cast<float>(groundtruthDict.size())
            << '\n';

  // Duration
  // NOTE(review): the raw totals are labelled "(seconds)" but the hours
  // conversion divides by 60 * 60 * 1000, which implies getDuration() returns
  // milliseconds. Confirm the unit and fix either the label or the divisor.
  std::cout << "Prediction duration / groundtruth duration = "
            << predictionDuration << " / " << groundtruthDuration
            << " (seconds) = " << predictionDuration / (60.0 * 60.0 * 1000.0)
            << " / " << groundtruthDuration / (60.0 * 60.0 * 1000.0)
            << " (hours) = "
            << static_cast<float>(predictionDuration) /
          static_cast<float>(groundtruthDuration)
            << '\n';

  // WER
  std::cout << "WER is " << wer.value()[0] << std::endl;
  return 0;
}