size_t Data::fillEntries()

in cpp/src/common/mmaped.cpp [206:376]


size_t Data::fillEntries(
  char*buf,
  MatrixXuf& data,
  MatrixXuf& label,
  dataCount_t max_entries,
  featureCount_t num_cols,
  uint64_t fileSize,
  EdgeML::DataFormat& formatType)
{
  data = MatrixXuf::Zero(NUM_FEATURES, max_entries);
  label = MatrixXuf::Zero(NUM_LABELS, max_entries);

  FP_TYPE value = 0;
  LABEL_TYPE lab = 0;
  int dec = -INF; // Need to keep track of number of decimal points. Seamlessly works with int also. 
  off_t nRead = 0; // How many lines have we read?
  featureCount_t col = 0; // Which column are we trying to read?
  bool is_positive = true;
  bool exp_flag = false;
  bool exp_is_positive = true;
  bool nan_flag = false;
  int exp_val = 0;

  for (off_t i = 0; i < fileSize; ++i) { // Iterate over chars in a file
    if (nRead == max_entries) break;
    assert(col <= num_cols);
    if (col == COL_LABEL) {
      if (formatType == EdgeML::tsvFormat) {
        switch (buf[i]) {
        case '\t':
#ifdef ZERO_BASED_IO
          lab++;
#endif	  
          if (!(lab > 0 && lab <= NUM_LABELS)) {
            LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
              + "Label value = " + std::to_string(lab) + " incompatible with data-format restrictions specied in README."
              + "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
            assert(false);
          }
          label(lab - 1, nRead) = 1.0; lab = 0; col++;
          break;
        case '0': case '1': case '2': case '3':case '4':
        case '5': case '6': case '7': case '8': case '9':
          lab *= 10; lab += buf[i] - '0';
          break;
        default:
          LOG_ERROR("Bad format in line: " + std::to_string(nRead) + "; character read: '" + std::string(1, buf[i]) + "'");
        }
      }
    }
    else if (col < COL_FEATURE) {
      switch (buf[i]) {
      case '\t':
        col++;
      default:
        lab = 0;
        break;
      }
    }
    else if (col >= COL_FEATURE) {
      switch (buf[i]) {
      case '\r':
        //	assert(col == num_cols-1);
        break;
      case '\n':
        if (exp_flag)
          value = value * (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
        if (!is_positive)
          value *= -1;
        if (dec > 0)
          value *= (FP_TYPE)pow(0.1, dec);

        data(col - COL_FEATURE, nRead) = value;

        if (col != num_cols - 1)
          for (auto c = col + 1; c < num_cols; ++c)
            data(c - COL_FEATURE, nRead) = 0;

        exp_flag = false; is_positive = true; dec = -INF;
        value = 0.0f; col = 0;	nRead++;
        break;

      case '\t':
        if (exp_flag)
          value *= (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
        if (!is_positive)
          value *= -1;
        if (dec > 0)
          value *= (FP_TYPE)pow(0.1, dec);

        data(col - COL_FEATURE, nRead) = value;

        exp_flag = false; is_positive = true; dec = -INF;

        dec = -INF;
        value = 0.0f;
        col++;
        break;

      case '-':
        if (exp_flag)
          exp_is_positive = false;
        else
          is_positive = false;
        break;

      case 'e':
        exp_flag = true; exp_val = 0; exp_is_positive = true;
        break;

      case 'N':
      case 'a':
        if (nan_flag == 0)
          LOG_WARNING("NaN possibly exists in the file. This code is not designed to handle NaN elaborately, and works only in specific cases.");
        nan_flag = 1;
        break;

      case 'I':
      case 'n':
      case 'f':
        if (nan_flag == 0)
          LOG_WARNING("Inf possibly exists in the file. This code is not designed to handle Inf elaborately, and works only in specific cases.");
        nan_flag = 1;
        break;

      case '0': case '1': case '2': case '3':case '4':
      case '5': case '6': case '7': case '8': case '9':
        if (exp_flag) {
          exp_val *= 10; exp_val += buf[i] - '0';
          break;
        }
        value *= 10.0f; value += buf[i] - '0';
        dec++;
        break;

      case '.':
        dec = 0;
        break;

      default:
        exp_flag = false; is_positive = true; exp_is_positive = true;
        value = 0.0f; exp_val = 0;  dec = -INF;
        col = lab = 0;
        LOG_ERROR("Bad format in line: " + std::to_string(nRead) + "; character read: '" + std::string(1, buf[i]) + "'");

        while (buf[i] != '\n')
          i++;
        i++;
      }
    }
  }
  if ((nRead != max_entries) && (col == num_cols - 1)) { // Didnt reach "\n" on last line
    if (exp_flag)
      value = value * (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
    if (!is_positive)
      value *= -1.0f;
    if (dec > 0)
      value *= (FP_TYPE)pow(0.1, dec);

    data(NUM_FEATURES - 1, nRead) = value;

    nRead++;
  }
  assert(nRead <= max_entries && "more entries in file than specified");

  data.conservativeResize(NUM_FEATURES, nRead);
  label.conservativeResize(NUM_LABELS, nRead);

  LOG_INFO("#Lines of data read: " + std::to_string(nRead));
  return nRead;
}