void Dictionary::readFromFile()

in src/dict.cpp [133:210]


// Builds the dictionary from the training data in `file`.
//
// Each input line is tokenized with `parser->parseForDict` and every
// resulting token is passed to `insert`. While reading, whenever `size_`
// exceeds 75% of MAX_VOCAB_SIZE the running threshold is raised by one and
// `threshold` is applied, so the table is pruned before it fills up. After
// all input is consumed a final `threshold(args_->minCount,
// args_->minCountLabel)` pass is applied and summary statistics are written
// to stderr.
//
// When built with COMPRESS_FILE and `args_->compressFile == "gzip"`, the
// input is read from the shards `<file>00.gz`, `<file>01.gz`, ... up to
// `args_->numGzFile` shards; shards that cannot be opened are skipped.
//
// Terminates the process with EXIT_FAILURE if the plain input file cannot be
// opened, if no line was read at all, or if the vocabulary is empty after the
// final threshold pass.
void Dictionary::readFromFile(
    const std::string& file,
    shared_ptr<DataParser> parser) {

  int64_t minThreshold = 1;
  size_t lines_read = 0;

  // Consumes one stream line by line and feeds every token to the dictionary.
  auto readFromInputStream = [&](std::istream& in) {
    string line;
    // Reused across lines so its capacity is kept; it is emptied before each
    // parse so the parser always starts from an empty vector.
    vector<string> tokens;
    while (getline(in, line, '\n')) {
      tokens.clear();
      parser->parseForDict(line, tokens);
      lines_read++;
      // Bind by const reference: tokens are only read here, no copy needed.
      for (const auto& token : tokens) {
        insert(token);
        if ((ntokens_ % 1000000 == 0) && args_->verbose) {
          std::cerr << "\rRead " << ntokens_  / 1000000 << "M words" << std::flush;
        }
        // Prune early once the table is more than 75% full, raising the
        // threshold a little further on every pruning round.
        if (size_ > 0.75 * MAX_VOCAB_SIZE) {
          minThreshold++;
          threshold(minThreshold, minThreshold);
        }
      }
    }
  };

  // Reads the single uncompressed input file; shared by both build variants.
  auto readPlainFile = [&]() {
    cout << "Build dict from input file : " << file << endl;
    ifstream fin(file);
    if (!fin.is_open()) {
      cerr << "Input file cannot be opened!" << endl;
      exit(EXIT_FAILURE);
    }
    readFromInputStream(fin);
    // fin is closed by its destructor.
  };

#ifdef COMPRESS_FILE
  if (args_->compressFile == "gzip") {
    cout << "Build dict from compressed input file.\n";
    for (int i = 0; i < args_->numGzFile; i++) {
      auto str_idx = boost::str(boost::format("%02d") % i);
      auto fname = file + str_idx + ".gz";
      // Compressed data must be read in binary mode so no newline translation
      // is applied to the raw bytes. The file stream is declared before the
      // filtering chain so that the chain, which refers to it, is destroyed
      // first.
      ifstream ifs(fname, std::ios_base::in | std::ios_base::binary);
      if (!ifs.good()) {
        // Missing or unreadable shard: skipped on purpose; if no shard yields
        // any line, the empty-input check below reports the failure.
        continue;
      }
      filtering_istream in;
      in.push(gzip_decompressor());
      in.push(ifs);
      readFromInputStream(in);
    }
  } else {
    readPlainFile();
  }
#else
  readPlainFile();
#endif

  // Final pruning with the user-configured minimum counts.
  threshold(args_->minCount, args_->minCountLabel);

  std::cerr << "\rRead " << ntokens_  / 1000000 << "M words" << std::endl;
  std::cerr << "Number of words in dictionary:  " << nwords_ << std::endl;
  std::cerr << "Number of labels in dictionary: " << nlabels_ << std::endl;
  if (lines_read == 0) {
    std::cerr << "ERROR: Empty file." << std::endl;
    exit(EXIT_FAILURE);
  }
  if (size_ == 0) {
    std::cerr << "Empty vocabulary. Try a smaller -minCount value."
              << std::endl;
    exit(EXIT_FAILURE);
  }
}