in src/io/dataset_loader.cpp [35:191]
// Resolves the column settings in config_ (label_column, ignore_column, weight_column,
// group_column, categorical_feature) into label_idx_, ignore_features_, weight_idx_,
// group_idx_ and categorical_features_.
//
// Every setting is either a plain integer index or "name:<column name>". Names are looked
// up in the column names taken from the first line of `filename` (when config_.header is
// set) or from the "header" entry of the parser config file.
//
// The label / ignore / weight / group settings are only processed when `filename` is not
// null and CheckCanLoadFromBin(filename) returns an empty string (presumably meaning that
// no binary dataset file can be used instead - confirm against CheckCanLoadFromBin).
// categorical_feature is always processed; without column names, a "name:" value for it
// ends in Log::Fatal because nothing can be resolved.
//
// Indexing note: the label column name is removed from feature_names_ before the
// name -> index map is built, so indices resolved from names do not count the label
// column. Numeric indices are stored exactly as given by the user.
//
// Any value that cannot be parsed or resolved is reported through Log::Fatal.
//
// \param filename Path of the text data file, may be nullptr.
void DatasetLoader::SetHeader(const char* filename) {
  // maps a column name to its index after the label column has been removed
  std::unordered_map<std::string, int> name2idx;
  const std::string name_prefix("name:");
  if (filename != nullptr && CheckCanLoadFromBin(filename) == "") {
    TextReader<data_size_t> text_reader(filename, config_.header);
    // get column names
    if (config_.header) {
      std::string first_line = text_reader.first_line();
      feature_names_ = Common::Split(first_line.c_str(), "\t,");
    } else if (!config_.parser_config_file.empty()) {
      // support to get header from parser config, so could utilize following label name to id mapping logic.
      TextReader<data_size_t> parser_config_reader(config_.parser_config_file.c_str(), false);
      parser_config_reader.ReadAllLines();
      std::string parser_config_str = parser_config_reader.JoinedLines();
      if (!parser_config_str.empty()) {
        std::string header_in_parser_config = Common::GetFromParserConfig(parser_config_str, "header");
        if (!header_in_parser_config.empty()) {
          Log::Info("Get raw column names from parser config.");
          feature_names_ = Common::Split(header_in_parser_config.c_str(), "\t,");
        }
      }
    }

    // load label idx first
    if (config_.label_column.size() > 0) {
      if (Common::StartsWith(config_.label_column, name_prefix)) {
        std::string name = config_.label_column.substr(name_prefix.size());
        // linear search: name2idx is not built yet, it is filled after the label is removed
        label_idx_ = -1;
        for (int i = 0; i < static_cast<int>(feature_names_.size()); ++i) {
          if (name == feature_names_[i]) {
            label_idx_ = i;
            break;
          }
        }
        if (label_idx_ >= 0) {
          Log::Info("Using column %s as label", name.c_str());
        } else {
          Log::Fatal("Could not find label column %s in data file \n"
                     "or data file doesn't contain header", name.c_str());
        }
      } else {
        if (!Common::AtoiAndCheck(config_.label_column.c_str(), &label_idx_)) {
          Log::Fatal("label_column is not a number,\n"
                     "if you want to use a column name,\n"
                     "please add the prefix \"name:\" to the column name");
        }
        Log::Info("Using column number %d as label", label_idx_);
      }
    }

    if (!config_.parser_config_file.empty()) {
      // if parser config file exists, feature names may be changed after customized parser applied.
      // clear here so could use default filled feature names during dataset construction.
      // may improve by saving real feature names defined in parser in the future.
      feature_names_.clear();
    }

    if (!feature_names_.empty()) {
      // A numeric label_column is only checked for being an integer above, so make sure it
      // addresses an existing column name: erasing at an out-of-range position is undefined behavior.
      const int num_column_names = static_cast<int>(feature_names_.size());
      if (label_idx_ < 0 || label_idx_ >= num_column_names) {
        Log::Fatal("Label column index %d is out of range, only %d column names are available",
                   label_idx_, num_column_names);
      }
      // erase label column name
      feature_names_.erase(feature_names_.begin() + label_idx_);
      for (size_t i = 0; i < feature_names_.size(); ++i) {
        name2idx[feature_names_[i]] = static_cast<int>(i);
      }
    }

    // load ignore columns
    if (config_.ignore_column.size() > 0) {
      if (Common::StartsWith(config_.ignore_column, name_prefix)) {
        std::string names = config_.ignore_column.substr(name_prefix.size());
        for (const auto& name : Common::Split(names.c_str(), ',')) {
          const auto found = name2idx.find(name);
          if (found != name2idx.end()) {
            ignore_features_.emplace(found->second);
          } else {
            Log::Fatal("Could not find ignore column %s in data file", name.c_str());
          }
        }
      } else {
        for (const auto& token : Common::Split(config_.ignore_column.c_str(), ',')) {
          int tmp = 0;
          if (!Common::AtoiAndCheck(token.c_str(), &tmp)) {
            Log::Fatal("ignore_column is not a number,\n"
                       "if you want to use a column name,\n"
                       "please add the prefix \"name:\" to the column name");
          }
          ignore_features_.emplace(tmp);
        }
      }
    }

    // load weight idx
    if (config_.weight_column.size() > 0) {
      if (Common::StartsWith(config_.weight_column, name_prefix)) {
        std::string name = config_.weight_column.substr(name_prefix.size());
        const auto found = name2idx.find(name);
        if (found != name2idx.end()) {
          weight_idx_ = found->second;
          Log::Info("Using column %s as weight", name.c_str());
        } else {
          Log::Fatal("Could not find weight column %s in data file", name.c_str());
        }
      } else {
        if (!Common::AtoiAndCheck(config_.weight_column.c_str(), &weight_idx_)) {
          Log::Fatal("weight_column is not a number,\n"
                     "if you want to use a column name,\n"
                     "please add the prefix \"name:\" to the column name");
        }
        Log::Info("Using column number %d as weight", weight_idx_);
      }
      // the weight column is also registered as an ignored feature
      ignore_features_.emplace(weight_idx_);
    }

    // load group idx
    if (config_.group_column.size() > 0) {
      if (Common::StartsWith(config_.group_column, name_prefix)) {
        std::string name = config_.group_column.substr(name_prefix.size());
        const auto found = name2idx.find(name);
        if (found != name2idx.end()) {
          group_idx_ = found->second;
          Log::Info("Using column %s as group/query id", name.c_str());
        } else {
          Log::Fatal("Could not find group/query column %s in data file", name.c_str());
        }
      } else {
        if (!Common::AtoiAndCheck(config_.group_column.c_str(), &group_idx_)) {
          Log::Fatal("group_column is not a number,\n"
                     "if you want to use a column name,\n"
                     "please add the prefix \"name:\" to the column name");
        }
        Log::Info("Using column number %d as group/query id", group_idx_);
      }
      // the group/query column is also registered as an ignored feature
      ignore_features_.emplace(group_idx_);
    }
  }

  // load categorical features; this runs even when no column names were read above,
  // in which case name2idx is empty and only numeric indices can be resolved
  if (config_.categorical_feature.size() > 0) {
    if (Common::StartsWith(config_.categorical_feature, name_prefix)) {
      std::string names = config_.categorical_feature.substr(name_prefix.size());
      for (const auto& name : Common::Split(names.c_str(), ',')) {
        const auto found = name2idx.find(name);
        if (found != name2idx.end()) {
          categorical_features_.emplace(found->second);
        } else {
          Log::Fatal("Could not find categorical_feature %s in data file", name.c_str());
        }
      }
    } else {
      for (const auto& token : Common::Split(config_.categorical_feature.c_str(), ',')) {
        int tmp = 0;
        if (!Common::AtoiAndCheck(token.c_str(), &tmp)) {
          Log::Fatal("categorical_feature is not a number,\n"
                     "if you want to use a column name,\n"
                     "please add the prefix \"name:\" to the column name");
        }
        categorical_features_.emplace(tmp);
      }
    }
  }
}