// int main()
// in recipes/self_training/pseudo_labeling/AnalyzeDataset.cpp [21:61]


/**
 * Compares a prediction transcript file against a ground-truth transcript
 * file and prints summary statistics to stdout.
 *
 * Reads the transcripts named by FLAGS_infile (predictions) and
 * FLAGS_groundtruthfile (ground truth), then reports:
 *   - the number of prediction samples relative to ground-truth samples,
 *   - the summed prediction duration relative to the summed ground-truth
 *     duration,
 *   - the first value of the edit-distance meter fed with the word
 *     transcripts of every prediction that has a ground-truth entry.
 *
 * Predictions whose sample id is absent from the ground-truth dictionary are
 * left out of the edit-distance computation and reported on stderr.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments, parsed by gflags
 * @return 0 on completion
 */
int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, false);

  auto predictionDict =
      filter::dataset::createTranscriptDictFromFile(FLAGS_infile);
  auto groundtruthDict =
      filter::dataset::createTranscriptDictFromFile(FLAGS_groundtruthfile);

  fl::EditDistanceMeter wer;
  size_t predictionDuration{0};
  size_t missingGroundtruth{0};
  for (const auto& sample : predictionDict) {
    const auto& prediction = sample.second;
    predictionDuration += prediction->getDuration();

    // Look the sample up with find() rather than operator[]: operator[] would
    // insert an empty entry for an unknown id, which would then be
    // dereferenced here and again in the ground-truth duration loop below,
    // and would inflate the ground-truth sample count.
    const auto match = groundtruthDict.find(sample.first);
    if (match == groundtruthDict.end()) {
      ++missingGroundtruth;
      continue;
    }
    wer.add(prediction->transcriptWords, match->second->transcriptWords);
  }

  if (missingGroundtruth > 0) {
    std::cerr << "Warning: " << missingGroundtruth
              << " prediction sample(s) have no groundtruth entry and were "
                 "excluded from the WER computation"
              << '\n';
  }

  size_t groundtruthDuration{0};
  for (const auto& sample : groundtruthDict) {
    groundtruthDuration += sample.second->getDuration();
  }

  // NOTE(review): the raw totals are labelled "(seconds)" below, yet this
  // divisor converts milliseconds to hours. The unit returned by
  // getDuration() is not visible here -- confirm which of the two is right.
  constexpr double kDurationUnitsPerHour = 60.0 * 60.0 * 1000.0;

  // Num samples
  std::cout << "Prediction samples / groundtruth samples = "
            << predictionDict.size() << " / " << groundtruthDict.size() << " = "
            << static_cast<float>(predictionDict.size()) /
          static_cast<float>(groundtruthDict.size())
            << '\n';
  // Duration
  std::cout << "Prediction duration / groundtruth duration = "
            << predictionDuration << " / " << groundtruthDuration
            << " (seconds) = " << predictionDuration / kDurationUnitsPerHour
            << " / " << groundtruthDuration / kDurationUnitsPerHour
            << " (hours) = "
            << static_cast<float>(predictionDuration) /
          static_cast<float>(groundtruthDuration)
            << '\n';
  // WER
  std::cout << "WER is " << wer.value()[0] << '\n';

  return 0;
}