in src/frontend/lightgbm.cc [85:124]
/*!
 * \brief Read an entire stream and split its contents into lines.
 *
 * The stream is consumed in fixed-size chunks (16 MB). Both '\n' and '\r' are
 * treated as line delimiters, and any run of consecutive delimiters is
 * collapsed, so empty lines are never emitted. A line that straddles two
 * chunks is stitched back together before being stored.
 *
 * \param fi input stream; read via fi->Read() until it returns 0
 * \return the non-empty lines of the stream, in order, without delimiters.
 *         If the stream does not end with a delimiter, the trailing partial
 *         line is still returned (and a message is logged).
 */
inline std::vector<std::string> LoadText(dmlc::Stream* fi) {
  const size_t bufsize = 16 * 1024 * 1024;  // 16 MB
  std::vector<char> buf(bufsize);
  std::vector<std::string> lines;
  // Text of the line currently being assembled; carries a partial line over
  // from one chunk to the next.
  std::string leftover;
  size_t byte_read;
  while ((byte_read = fi->Read(buf.data(), sizeof(char) * bufsize)) > 0) {
    size_t i = 0;
    size_t tok_begin = 0;
    while (i < byte_read) {
      if (buf[i] == '\n' || buf[i] == '\r') {  // delimiter for lines
        // Complete the current line. For the first line of a chunk this
        // joins it with whatever was carried over from the previous chunk.
        // buf.data() + offset is used instead of &buf[offset] so that a
        // zero-length range at the very end of the buffer is well-defined.
        leftover.append(buf.data() + tok_begin, i - tok_begin);
        // The line is empty only when a chunk starts with a delimiter and
        // nothing was carried over (e.g. a CR/LF pair split across chunks).
        // Skip it so the result does not depend on chunk boundaries.
        if (!leftover.empty()) {
          lines.push_back(leftover);
          leftover.clear();
        }
        // Skip all delimiters afterwards. The bounds check must come first:
        // i may equal byte_read (and bufsize) when the chunk ends with a
        // delimiter, and buf[i] must not be read in that case.
        while (i < byte_read && (buf[i] == '\n' || buf[i] == '\r')) {
          ++i;
        }
        tok_begin = i;
      } else {
        ++i;
      }
    }
    // Unterminated tail of this chunk; continued in the next chunk.
    leftover.append(buf.data() + tok_begin, byte_read - tok_begin);
  }
  if (!leftover.empty()) {
    LOG(INFO)
      << "Warning: input file was not terminated with end-of-line character.";
    lines.push_back(leftover);
  }
  return lines;
}