in src/model.cpp [777:826]
void EmbedModel::loadTsv(const char* fname, const string sep) {
cout << "Loading model from file " << fname << endl;
auto cols = args_->dim;
std::ifstream ifs(fname);
auto filelen = [&](ifstream& f) {
auto pos = f.tellg();
f.seekg(0, ios_base::end);
auto retval = f.tellg();
f.seekg(pos, ios_base::beg);
return retval;
};
auto len = filelen(ifs);
auto numThreads = getNumberOfCores();
vector<off_t> partitions(numThreads + 1);
partitions[0] = 0;
partitions[numThreads] = len;
string unused;
for (int i = 1; i < numThreads; i++) {
ifs.seekg((len / numThreads) * i);
getline(ifs, unused);
partitions[i] = ifs.tellg();
}
// It's possible that the ranges in partitions overlap; consider,
// e.g., a machine with 100 hardware threads and only 99 lines
// in the file. In this case, we'll do some excess work but loadTsvLine
// is idempotent, so it is ok.
std::vector<thread> threads;
for (int i = 0; i < numThreads; i++) {
auto body = [this, fname, cols, sep, i, &partitions]() {
// Get our own seek pointer.
ifstream ifs(fname);
ifs.seekg(partitions[i]);
string line;
while (ifs.tellg() < partitions[i + 1] && getline(ifs, line)) {
// We don't know the line number. Super-bummer.
loadTsvLine(line, -1, cols, sep);
}
};
threads.emplace_back(body);
}
for (auto& t: threads) {
t.join();
}
cout << "Model loaded.\n";
}