size_t Data::libsvmFillEntries()

in cpp/src/common/mmaped.cpp [383:570]


size_t Data::libsvmFillEntries(char*buf,
  MatrixXuf& data,
  MatrixXuf& label,
  dataCount_t max_entries,
  featureCount_t num_cols,
  uint64_t fileSize,
  EdgeML::DataFormat& format_type)
{
  data = MatrixXuf::Zero(NUM_FEATURES, max_entries);
  label = MatrixXuf::Zero(NUM_LABELS, max_entries);

  FP_TYPE value = 0;
  int dec = -INF; // Need to keep track of number of decimal points. Seamlessly works with int also. 
  off_t nRead = 0; // How many lines have we read?
  featureCount_t col = 0; // Which column are we trying to read?
  bool is_positive = true;
  bool index_flag = false;
  int index_value = 0;
  bool exp_flag = false;
  bool exp_is_positive = true;
  bool nan_flag = false;
  int exp_val = 0;

  for (off_t i = 0; i < fileSize; ++i) { // Iterate over chars in a file
    if (nRead == max_entries) break;
    assert(col <= num_cols);
    switch (buf[i]) {
    case '\r':
      //	assert(col == num_cols-1);
      break;
    case '\n':
      if (exp_flag)
        value = value * (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
      if (!is_positive)
        value *= -1;
      if (dec > 0)
        value *= (FP_TYPE)pow(0.1, dec);

      assert(index_flag == true);

#ifdef ZERO_BASED_IO
      index_value++;
#endif	    
      if (!(index_value > 0 && index_value <= NUM_FEATURES)) {
        LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
          + "Index value = " + std::to_string(index_value) + " incompatible with data-format restrictions specied in README."
          + "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
        assert(false);
      }
      data(index_value - 1, nRead) = value;

      exp_flag = false; is_positive = true; dec = -INF;
      index_flag = false; index_value = 0;
      value = 0; col = 0;	nRead++;
      break;

    case ':':
      assert(index_flag == false);
      index_value = (int)std::round(value);
      index_flag = true;
      value = 0.0f;
      break;

    case '\t': case ' ': case ',':
      if (exp_flag)
        value *= (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
      if (!is_positive)
        value *= -1;
      if (dec > 0)
        value *= (FP_TYPE)pow(0.1, dec);

      if (index_flag == true) {
#ifdef ZERO_BASED_IO
        index_value++;
#endif	    
        if (!(index_value > 0 && index_value <= NUM_FEATURES)) {
          LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
            + "Index value = " + std::to_string(index_value) + " incompatible with data-format restrictions specied in README."
            + "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
          assert(false);
        }
        data(index_value - 1, nRead) = value;
        index_flag = false; index_value = 0;
      }
      else {
        labelCount_t labelValue = (labelCount_t)std::round(value);

#ifdef ZERO_BASED_IO
        labelValue++;
#endif	  
        if (!(labelValue > 0 && labelValue <= NUM_LABELS)) {
          LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
            + "Label value = " + std::to_string(labelValue) + " incompatible with data-format restrictions specied in README."
            + "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
          assert(false);
        }
        label(labelValue - 1, nRead) = 1.0;

        assert((dec < 0) && (exp_flag == false) && (is_positive == true));
      }
      exp_flag = false; is_positive = true; dec = -INF;

      dec = -INF;
      value = 0.0f;
      col++;
      break;

    case '-':
      if (exp_flag)
        exp_is_positive = false;
      else
        is_positive = false;
      break;

    case 'e':
      exp_flag = true; exp_val = 0; exp_is_positive = true;
      break;

    case 'N':
    case 'a':
      if (nan_flag == 0)
        LOG_WARNING("NaN possibly exists in the file. This code is not designed to handle NaN elaborately, and works only in specific cases.");
      nan_flag = 1;
      break;

    case 'I':
    case 'n':
    case 'f':
      if (nan_flag == 0)
        LOG_WARNING("Inf possibly exists in the file. This code is not designed to handle Inf elaborately, and works only in specific cases.");
      nan_flag = 1;
      break;

    case '0': case '1': case '2': case '3':case '4':
    case '5': case '6': case '7': case '8': case '9':
      if (exp_flag) {
        exp_val *= 10; exp_val += buf[i] - '0';
        break;
      }
      value *= 10; value += buf[i] - '0';
      dec++;
      break;

    case '.':
      dec = 0;
      break;

    default:
      exp_flag = false; is_positive = true; exp_is_positive = true;
      value = 0.0f; exp_val = 0;  dec = -INF;
      col = 0;
      LOG_ERROR("Bad format in line: " + std::to_string(nRead) + "; character read: '" + std::string(1, buf[i]) + "'");

      while (buf[i] != '\n')
        i++;
      i++;
    }
  }

  if ((nRead != max_entries) && (col == num_cols - 1)) { // Didnt reach "\n" on last line
    if (exp_flag)
      value = value * (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
    if (!is_positive)
      value *= -1;
    if (dec > 0)
      value *= (FP_TYPE)pow(0.1, dec);

#ifdef ZERO_BASED_IO
    index_value++;
#endif
    if (!(index_value > 0 && index_value <= NUM_FEATURES)) {
      LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
        + "Index value = " + std::to_string(index_value) + " incompatible with data-format restrictions specied in README."
        + "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
      assert(false);
    }
    data(index_value - 1, nRead) = value;

    nRead++;
  }
  assert(nRead <= max_entries && "more entries in file than specified");

  data.conservativeResize(NUM_FEATURES, nRead);
  label.conservativeResize(NUM_LABELS, nRead);

  LOG_INFO("#Lines of data read: " + std::to_string(nRead) + "\n");
  return nRead;
}