size_t Data::libsvmFillEntries()

in cpp/src/common/mmaped.cpp [577:769]


size_t Data::libsvmFillEntries(char*buf,
  SparseMatrixuf& data,
  SparseMatrixuf& label,
  dataCount_t max_entries,
  featureCount_t num_cols,
  uint64_t fileSize,
  EdgeML::DataFormat& format_type)
{
  std::vector <Trip> data_triplet;
  std::vector <Trip> label_triplet;

  FP_TYPE value = 0;
  int dec = -INF; // Need to keep track of number of decimal points. Seamlessly works with int also. 
  off_t nRead = 0; // How many lines have we read?
  featureCount_t col = 0; // Which column are we trying to read?
  bool is_positive = true;
  bool index_flag = false;
  int index_value = 0;
  bool exp_flag = false;
  bool exp_is_positive = true;
  bool nan_flag = false;
  int exp_val = 0;

  for (off_t i = 0; i < fileSize; ++i) { // Iterate over chars in a file
    if (nRead == max_entries) break;
    assert(col <= num_cols);
    switch (buf[i]) {
    case '\r':
      //	assert(col == num_cols-1);
      break;
    case '\n':
      if (index_flag == true) {

        if (exp_flag)
          value = value * (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
        if (!is_positive)
          value *= -1;
        if (dec > 0)
          value *= (FP_TYPE)pow(0.1, dec);

#ifdef ZERO_BASED_IO
        index_value++;
#endif	    	  
        if (!(index_value > 0 && index_value <= NUM_FEATURES)) {
          LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
            + "Index value = " + std::to_string(index_value) + " incompatible with data-format restrictions specied in README."
            + "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
          assert(false);
        }
        data_triplet.push_back(Trip(index_value - 1, nRead, value));
      }

      exp_flag = false; is_positive = true; dec = -INF;
      index_flag = false; index_value = 0;
      value = 0; col = 0;	nRead++;
      break;

    case ':':
      assert(index_flag == false);
      index_value = (labelCount_t)std::round(value);
      index_flag = true;
      value = 0;
      break;

    case '\t': case ' ': case ',':
      if (exp_flag)
        value *= (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
      if (!is_positive)
        value *= -1;
      if (dec > 0)
        value *= (FP_TYPE)pow(0.1, dec);

      if (index_flag == true) {
#ifdef ZERO_BASED_IO
        index_value++;
#endif	    
        if (!(index_value > 0 && index_value <= NUM_FEATURES)) {
          LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
            + "Index value = " + std::to_string(index_value) + " incompatible with data-format restrictions specied in README."
            + "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
          assert(false);
        }
        data_triplet.push_back(Trip(index_value - 1, nRead, value));
        index_flag = false; index_value = 0;
      }
      else {
        labelCount_t labelValue = (labelCount_t)std::round(value);

#ifdef ZERO_BASED_IO
        labelValue++;
#endif
        if (!(labelValue > 0 && labelValue <= NUM_LABELS)) {
          LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
            + "Label value = " + std::to_string(labelValue) + " incompatible with data-format restrictions specied in README."
            + "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
          assert(false);
        }
        label_triplet.push_back(Trip(labelValue - 1, nRead, 1.0f));

        assert((dec < 0) && (exp_flag == false) && (is_positive == true));
      }
      exp_flag = false; is_positive = true; dec = -INF;

      dec = -INF;
      value = 0;
      col++;
      break;

    case '-':
      if (exp_flag)
        exp_is_positive = false;
      else
        is_positive = false;
      break;

    case 'e':
      exp_flag = true; exp_val = 0; exp_is_positive = true;
      break;

    case 'N':
    case 'a':
      if (nan_flag == 0)
        LOG_WARNING("NaN possibly exists in the file. This code is not designed to handle Inf elaborately, and works only in specific cases.");
      nan_flag = 1;
      break;

    case 'I':
    case 'n':
    case 'f':
      if (nan_flag == 0)
        LOG_WARNING("Inf possibly exists in the file. This code is not designed to handle Inf elaborately, and works only in specific cases.");
      nan_flag = 1;
      break;

    case '0': case '1': case '2': case '3':case '4':
    case '5': case '6': case '7': case '8': case '9':
      if (exp_flag) {
        exp_val *= 10; exp_val += buf[i] - '0';
        break;
      }
      value *= 10; value += buf[i] - '0';
      dec++;
      break;

    case '.':
      dec = 0;
      break;

    default:
      exp_flag = false; is_positive = true; exp_is_positive = true;
      value = 0.0f; exp_val = 0;  dec = -INF;
      col = 0;
      LOG_ERROR("Bad format in line: " + std::to_string(nRead) + "; character read: '" + std::string(1, buf[i]) + "'");

      while (buf[i] != '\n')
        i++;
      i++;
    }
  }

  if ((nRead != max_entries) && (col == num_cols - 1)) { // Didnt reach "\n" on last line
    if (exp_flag)
      value = value * (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
    if (!is_positive)
      value *= -1;
    if (dec > 0)
      value *= (FP_TYPE)pow(0.1, dec);

#ifdef ZERO_BASED_IO
    index_value++;
#endif
    if (!(index_value > 0 && index_value <= NUM_FEATURES)) {
      LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
        + "Index value = " + std::to_string(index_value) + " incompatible with data-format restrictions specied in README."
        + "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
      assert(false);
    }
    data_triplet.push_back(Trip(index_value - 1, nRead, value));
    nRead++;
  }
  assert(nRead <= max_entries && "more entries in file than specified");

  data = SparseMatrixuf(NUM_FEATURES, nRead);
  label = SparseMatrixuf(NUM_LABELS, nRead);
  LOG_INFO("Number of non-zero entries in data-matrix = " + std::to_string(data_triplet.size()));
  LOG_INFO("Number of non-zero entries in label-matrix = " + std::to_string(label_triplet.size()));

  data.setFromTriplets(data_triplet.begin(), data_triplet.end());
  label.setFromTriplets(label_triplet.begin(), label_triplet.end());

  LOG_INFO("#Lines of data read: " + std::to_string(nRead) + "\n");
  return nRead;
}