in cpp/src/common/mmaped.cpp [577:769]
size_t Data::libsvmFillEntries(char*buf,
SparseMatrixuf& data,
SparseMatrixuf& label,
dataCount_t max_entries,
featureCount_t num_cols,
uint64_t fileSize,
EdgeML::DataFormat& format_type)
{
std::vector <Trip> data_triplet;
std::vector <Trip> label_triplet;
FP_TYPE value = 0;
int dec = -INF; // Need to keep track of number of decimal points. Seamlessly works with int also.
off_t nRead = 0; // How many lines have we read?
featureCount_t col = 0; // Which column are we trying to read?
bool is_positive = true;
bool index_flag = false;
int index_value = 0;
bool exp_flag = false;
bool exp_is_positive = true;
bool nan_flag = false;
int exp_val = 0;
for (off_t i = 0; i < fileSize; ++i) { // Iterate over chars in a file
if (nRead == max_entries) break;
assert(col <= num_cols);
switch (buf[i]) {
case '\r':
// assert(col == num_cols-1);
break;
case '\n':
if (index_flag == true) {
if (exp_flag)
value = value * (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
if (!is_positive)
value *= -1;
if (dec > 0)
value *= (FP_TYPE)pow(0.1, dec);
#ifdef ZERO_BASED_IO
index_value++;
#endif
if (!(index_value > 0 && index_value <= NUM_FEATURES)) {
LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
+ "Index value = " + std::to_string(index_value) + " incompatible with data-format restrictions specied in README."
+ "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
assert(false);
}
data_triplet.push_back(Trip(index_value - 1, nRead, value));
}
exp_flag = false; is_positive = true; dec = -INF;
index_flag = false; index_value = 0;
value = 0; col = 0; nRead++;
break;
case ':':
assert(index_flag == false);
index_value = (labelCount_t)std::round(value);
index_flag = true;
value = 0;
break;
case '\t': case ' ': case ',':
if (exp_flag)
value *= (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
if (!is_positive)
value *= -1;
if (dec > 0)
value *= (FP_TYPE)pow(0.1, dec);
if (index_flag == true) {
#ifdef ZERO_BASED_IO
index_value++;
#endif
if (!(index_value > 0 && index_value <= NUM_FEATURES)) {
LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
+ "Index value = " + std::to_string(index_value) + " incompatible with data-format restrictions specied in README."
+ "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
assert(false);
}
data_triplet.push_back(Trip(index_value - 1, nRead, value));
index_flag = false; index_value = 0;
}
else {
labelCount_t labelValue = (labelCount_t)std::round(value);
#ifdef ZERO_BASED_IO
labelValue++;
#endif
if (!(labelValue > 0 && labelValue <= NUM_LABELS)) {
LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
+ "Label value = " + std::to_string(labelValue) + " incompatible with data-format restrictions specied in README."
+ "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
assert(false);
}
label_triplet.push_back(Trip(labelValue - 1, nRead, 1.0f));
assert((dec < 0) && (exp_flag == false) && (is_positive == true));
}
exp_flag = false; is_positive = true; dec = -INF;
dec = -INF;
value = 0;
col++;
break;
case '-':
if (exp_flag)
exp_is_positive = false;
else
is_positive = false;
break;
case 'e':
exp_flag = true; exp_val = 0; exp_is_positive = true;
break;
case 'N':
case 'a':
if (nan_flag == 0)
LOG_WARNING("NaN possibly exists in the file. This code is not designed to handle Inf elaborately, and works only in specific cases.");
nan_flag = 1;
break;
case 'I':
case 'n':
case 'f':
if (nan_flag == 0)
LOG_WARNING("Inf possibly exists in the file. This code is not designed to handle Inf elaborately, and works only in specific cases.");
nan_flag = 1;
break;
case '0': case '1': case '2': case '3':case '4':
case '5': case '6': case '7': case '8': case '9':
if (exp_flag) {
exp_val *= 10; exp_val += buf[i] - '0';
break;
}
value *= 10; value += buf[i] - '0';
dec++;
break;
case '.':
dec = 0;
break;
default:
exp_flag = false; is_positive = true; exp_is_positive = true;
value = 0.0f; exp_val = 0; dec = -INF;
col = 0;
LOG_ERROR("Bad format in line: " + std::to_string(nRead) + "; character read: '" + std::string(1, buf[i]) + "'");
while (buf[i] != '\n')
i++;
i++;
}
}
if ((nRead != max_entries) && (col == num_cols - 1)) { // Didnt reach "\n" on last line
if (exp_flag)
value = value * (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
if (!is_positive)
value *= -1;
if (dec > 0)
value *= (FP_TYPE)pow(0.1, dec);
#ifdef ZERO_BASED_IO
index_value++;
#endif
if (!(index_value > 0 && index_value <= NUM_FEATURES)) {
LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
+ "Index value = " + std::to_string(index_value) + " incompatible with data-format restrictions specied in README."
+ "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
assert(false);
}
data_triplet.push_back(Trip(index_value - 1, nRead, value));
nRead++;
}
assert(nRead <= max_entries && "more entries in file than specified");
data = SparseMatrixuf(NUM_FEATURES, nRead);
label = SparseMatrixuf(NUM_LABELS, nRead);
LOG_INFO("Number of non-zero entries in data-matrix = " + std::to_string(data_triplet.size()));
LOG_INFO("Number of non-zero entries in label-matrix = " + std::to_string(label_triplet.size()));
data.setFromTriplets(data_triplet.begin(), data_triplet.end());
label.setFromTriplets(label_triplet.begin(), label_triplet.end());
LOG_INFO("#Lines of data read: " + std::to_string(nRead) + "\n");
return nRead;
}