void EmbedModel::loadTsv()

in src/model.cpp [777:826]


void EmbedModel::loadTsv(const char* fname, const string sep) {
  cout << "Loading model from file " << fname << endl;
  auto cols = args_->dim;

  std::ifstream ifs(fname);
  auto filelen = [&](ifstream& f) {
    auto pos = f.tellg();
    f.seekg(0, ios_base::end);
    auto retval = f.tellg();
    f.seekg(pos, ios_base::beg);
    return retval;
  };

  auto len = filelen(ifs);
  auto numThreads = getNumberOfCores();
  vector<off_t> partitions(numThreads + 1);
  partitions[0] = 0;
  partitions[numThreads] = len;

  string unused;
  for (int i = 1; i < numThreads; i++) {
    ifs.seekg((len / numThreads) * i);
    getline(ifs, unused);
    partitions[i] = ifs.tellg();
  }

  // It's possible that the ranges in partitions overlap; consider,
  // e.g., a machine with 100 hardware threads and only 99 lines
  // in the file. In this case, we'll do some excess work but loadTsvLine
  // is idempotent, so it is ok.
  std::vector<thread> threads;
  for (int i = 0; i < numThreads; i++) {
    auto body = [this, fname, cols, sep, i, &partitions]() {
      // Get our own seek pointer.
      ifstream ifs(fname);
      ifs.seekg(partitions[i]);
      string line;
      while (ifs.tellg() < partitions[i + 1] && getline(ifs, line)) {
        // We don't know the line number. Super-bummer.
        loadTsvLine(line, -1, cols, sep);
      }
    };
    threads.emplace_back(body);
  }
  for (auto& t: threads) {
    t.join();
  }

  cout << "Model loaded.\n";
}