in cpp/src/common/mmaped.cpp [206:376]
size_t Data::fillEntries(
char*buf,
MatrixXuf& data,
MatrixXuf& label,
dataCount_t max_entries,
featureCount_t num_cols,
uint64_t fileSize,
EdgeML::DataFormat& formatType)
{
data = MatrixXuf::Zero(NUM_FEATURES, max_entries);
label = MatrixXuf::Zero(NUM_LABELS, max_entries);
FP_TYPE value = 0;
LABEL_TYPE lab = 0;
int dec = -INF; // Need to keep track of number of decimal points. Seamlessly works with int also.
off_t nRead = 0; // How many lines have we read?
featureCount_t col = 0; // Which column are we trying to read?
bool is_positive = true;
bool exp_flag = false;
bool exp_is_positive = true;
bool nan_flag = false;
int exp_val = 0;
for (off_t i = 0; i < fileSize; ++i) { // Iterate over chars in a file
if (nRead == max_entries) break;
assert(col <= num_cols);
if (col == COL_LABEL) {
if (formatType == EdgeML::tsvFormat) {
switch (buf[i]) {
case '\t':
#ifdef ZERO_BASED_IO
lab++;
#endif
if (!(lab > 0 && lab <= NUM_LABELS)) {
LOG_ERROR("Error in line " + std::to_string(nRead) + " of input file.\n"
+ "Label value = " + std::to_string(lab) + " incompatible with data-format restrictions specied in README."
+ "\nCheck also if ZERO_BASED_IO flag is set or not (in config.mk or elsewhere).");
assert(false);
}
label(lab - 1, nRead) = 1.0; lab = 0; col++;
break;
case '0': case '1': case '2': case '3':case '4':
case '5': case '6': case '7': case '8': case '9':
lab *= 10; lab += buf[i] - '0';
break;
default:
LOG_ERROR("Bad format in line: " + std::to_string(nRead) + "; character read: '" + std::string(1, buf[i]) + "'");
}
}
}
else if (col < COL_FEATURE) {
switch (buf[i]) {
case '\t':
col++;
default:
lab = 0;
break;
}
}
else if (col >= COL_FEATURE) {
switch (buf[i]) {
case '\r':
// assert(col == num_cols-1);
break;
case '\n':
if (exp_flag)
value = value * (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
if (!is_positive)
value *= -1;
if (dec > 0)
value *= (FP_TYPE)pow(0.1, dec);
data(col - COL_FEATURE, nRead) = value;
if (col != num_cols - 1)
for (auto c = col + 1; c < num_cols; ++c)
data(c - COL_FEATURE, nRead) = 0;
exp_flag = false; is_positive = true; dec = -INF;
value = 0.0f; col = 0; nRead++;
break;
case '\t':
if (exp_flag)
value *= (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
if (!is_positive)
value *= -1;
if (dec > 0)
value *= (FP_TYPE)pow(0.1, dec);
data(col - COL_FEATURE, nRead) = value;
exp_flag = false; is_positive = true; dec = -INF;
dec = -INF;
value = 0.0f;
col++;
break;
case '-':
if (exp_flag)
exp_is_positive = false;
else
is_positive = false;
break;
case 'e':
exp_flag = true; exp_val = 0; exp_is_positive = true;
break;
case 'N':
case 'a':
if (nan_flag == 0)
LOG_WARNING("NaN possibly exists in the file. This code is not designed to handle NaN elaborately, and works only in specific cases.");
nan_flag = 1;
break;
case 'I':
case 'n':
case 'f':
if (nan_flag == 0)
LOG_WARNING("Inf possibly exists in the file. This code is not designed to handle Inf elaborately, and works only in specific cases.");
nan_flag = 1;
break;
case '0': case '1': case '2': case '3':case '4':
case '5': case '6': case '7': case '8': case '9':
if (exp_flag) {
exp_val *= 10; exp_val += buf[i] - '0';
break;
}
value *= 10.0f; value += buf[i] - '0';
dec++;
break;
case '.':
dec = 0;
break;
default:
exp_flag = false; is_positive = true; exp_is_positive = true;
value = 0.0f; exp_val = 0; dec = -INF;
col = lab = 0;
LOG_ERROR("Bad format in line: " + std::to_string(nRead) + "; character read: '" + std::string(1, buf[i]) + "'");
while (buf[i] != '\n')
i++;
i++;
}
}
}
if ((nRead != max_entries) && (col == num_cols - 1)) { // Didnt reach "\n" on last line
if (exp_flag)
value = value * (FP_TYPE)pow(10, exp_val*(exp_is_positive ? 1 : -1));
if (!is_positive)
value *= -1.0f;
if (dec > 0)
value *= (FP_TYPE)pow(0.1, dec);
data(NUM_FEATURES - 1, nRead) = value;
nRead++;
}
assert(nRead <= max_entries && "more entries in file than specified");
data.conservativeResize(NUM_FEATURES, nRead);
label.conservativeResize(NUM_LABELS, nRead);
LOG_INFO("#Lines of data read: " + std::to_string(nRead));
return nRead;
}