in src/io/dataset.cpp [1320:1472]
// Appends every feature of `other` to this Dataset (column-wise merge).
//
// Preconditions, each reported through Log::Fatal when violated:
//   - both datasets have the same num_data_;
//   - both datasets agree on has_raw_.
//
// Two merge strategies are used:
//   - If at most one of the two datasets contains a multi-val group
//     (IsMultiGroup), the groups of `other` are appended after this dataset's
//     groups and the index tables are extended with the matching offsets.
//   - If both contain a multi-val group, the multi-val group of `other` is
//     merged into this dataset's multi-val group, the remaining groups of
//     `other` are appended, and the per-feature / per-group index tables are
//     rebuilt from scratch.
//
// Feature names of `other` that collide with an existing name are renamed to
// "D<k>_<name>" (k starting at 2) and a warning is logged.
void Dataset::AddFeaturesFrom(Dataset* other) {
  if (other->num_data_ != num_data_) {
    Log::Fatal(
        "Cannot add features from other Dataset with a different number of "
        "rows");
  }
  if (other->has_raw_ != has_raw_) {
    Log::Fatal("Can only add features from other Dataset if both or neither have raw data.");
  }
  // Locate the multi-val group in each dataset (-1 when there is none). If
  // several groups qualify, the last one found is used.
  int mv_gid = -1;
  int other_mv_gid = -1;
  for (int i = 0; i < num_groups_; ++i) {
    if (IsMultiGroup(i)) {
      mv_gid = i;
    }
  }
  for (int i = 0; i < other->num_groups_; ++i) {
    if (other->IsMultiGroup(i)) {
      other_mv_gid = i;
    }
  }
  if (mv_gid < 0 || other_mv_gid < 0) {
    // At most one of the two datasets has a multi-val group, so the groups of
    // `other` can simply be appended after this dataset's groups.
    PushVector(&feature2subfeature_, other->feature2subfeature_);
    PushVector(&group_feature_cnt_, other->group_feature_cnt_);
    // reserve() takes the total capacity, not the number of extra elements.
    feature_groups_.reserve(feature_groups_.size() +
                            other->feature_groups_.size());
    for (auto& fg : other->feature_groups_) {
      const int cur_group_id = static_cast<int>(feature_groups_.size());
      feature_groups_.emplace_back(new FeatureGroup(*fg, true, cur_group_id));
    }
    for (auto feature_idx : other->used_feature_map_) {
      if (feature_idx >= 0) {
        // Shift by the number of features this dataset had before the merge.
        used_feature_map_.push_back(feature_idx + num_features_);
      } else {
        used_feature_map_.push_back(-1);  // Unused feature.
      }
    }
    PushOffset(&real_feature_idx_, other->real_feature_idx_,
               num_total_features_);
    PushOffset(&feature2group_, other->feature2group_, num_groups_);
    auto bin_offset = group_bin_boundaries_.back();
    // Skip the leading 0 when copying group_bin_boundaries.
    for (auto i = other->group_bin_boundaries_.begin() + 1;
         i < other->group_bin_boundaries_.end(); ++i) {
      group_bin_boundaries_.push_back(*i + bin_offset);
    }
    PushOffset(&group_feature_start_, other->group_feature_start_,
               num_features_);
    num_groups_ += other->num_groups_;
    num_features_ += other->num_features_;
  } else {
    // Both datasets have a multi-val group: merge those two groups into one,
    // append the remaining groups of `other`, then rebuild the index tables.

    // features_in_group[g] lists the real (total) feature indices stored in
    // group g of the merged dataset, in sub-feature order.
    std::vector<std::vector<int>> features_in_group;
    features_in_group.reserve(num_groups_ + other->num_groups_);
    for (int i = 0; i < num_groups_; ++i) {
      int f_start = group_feature_start_[i];
      int f_cnt = group_feature_cnt_[i];
      features_in_group.emplace_back();
      for (int j = 0; j < f_cnt; ++j) {
        const int real_fidx = real_feature_idx_[f_start + j];
        features_in_group.back().push_back(real_fidx);
      }
    }
    feature_groups_[mv_gid]->AddFeaturesFrom(
        other->feature_groups_[other_mv_gid].get(), mv_gid);
    for (int i = 0; i < other->num_groups_; ++i) {
      int f_start = other->group_feature_start_[i];
      int f_cnt = other->group_feature_cnt_[i];
      if (i == other_mv_gid) {
        // Features of other's multi-val group now live in this dataset's
        // multi-val group.
        for (int j = 0; j < f_cnt; ++j) {
          const int real_fidx = other->real_feature_idx_[f_start + j] + num_total_features_;
          features_in_group[mv_gid].push_back(real_fidx);
        }
      } else {
        // Every other group is appended as a new group.
        features_in_group.emplace_back();
        for (int j = 0; j < f_cnt; ++j) {
          const int real_fidx = other->real_feature_idx_[f_start + j] + num_total_features_;
          features_in_group.back().push_back(real_fidx);
        }
        feature_groups_.emplace_back(
            new FeatureGroup(*other->feature_groups_[i], false, -1));
      }
    }
    // regenerate other fields
    // The two multi-val groups collapsed into one, hence the "- 1".
    num_groups_ += other->num_groups_ - 1;
    CHECK(num_groups_ == static_cast<int>(features_in_group.size()));
    num_features_ += other->num_features_;
    int cur_fidx = 0;
    used_feature_map_ =
        std::vector<int>(num_total_features_ + other->num_total_features_, -1);
    real_feature_idx_.resize(num_features_);
    feature2group_.resize(num_features_);
    feature2subfeature_.resize(num_features_);
    group_feature_start_.resize(num_groups_);
    group_feature_cnt_.resize(num_groups_);
    group_bin_boundaries_.clear();
    uint64_t num_total_bin = 0;
    group_bin_boundaries_.push_back(num_total_bin);
    for (int i = 0; i < num_groups_; ++i) {
      // Bind by reference: no need to copy the per-group index list.
      const auto& cur_features = features_in_group[i];
      int cur_cnt_features = static_cast<int>(cur_features.size());
      group_feature_start_[i] = cur_fidx;
      group_feature_cnt_[i] = cur_cnt_features;
      for (int j = 0; j < cur_cnt_features; ++j) {
        int real_fidx = cur_features[j];
        used_feature_map_[real_fidx] = cur_fidx;
        real_feature_idx_[cur_fidx] = real_fidx;
        feature2group_[cur_fidx] = i;
        feature2subfeature_[cur_fidx] = j;
        ++cur_fidx;
      }
      num_total_bin += feature_groups_[i]->num_total_bin_;
      group_bin_boundaries_.push_back(num_total_bin);
    }
  }
  // Append other's feature names, renaming any that would collide with a name
  // already present (including names added earlier in this same loop).
  std::unordered_set<std::string> feature_names_set;
  for (const auto& val : feature_names_) {
    feature_names_set.emplace(val);
  }
  for (const auto& val : other->feature_names_) {
    std::string new_name = val;
    int cnt = 2;
    while (feature_names_set.count(new_name)) {
      new_name = "D" + std::to_string(cnt) + "_" + val;
      ++cnt;
    }
    if (new_name != val) {
      Log::Warning(
          "Find the same feature name (%s) in Dataset::AddFeaturesFrom, change "
          "its name to (%s)",
          val.c_str(), new_name.c_str());
    }
    feature_names_set.emplace(new_name);
    feature_names_.push_back(new_name);
  }
  PushVector(&forced_bin_bounds_, other->forced_bin_bounds_);
  // Must run before num_total_features_ is updated: it is passed the
  // pre-merge feature count of this dataset.
  PushClearIfEmpty(&max_bin_by_feature_, num_total_features_,
                   other->max_bin_by_feature_, other->num_total_features_, -1);
  num_total_features_ += other->num_total_features_;
  // Append other's numeric feature map, shifting valid entries past this
  // dataset's existing numeric features. The entries must be read from
  // `other`; indexing this dataset's own map here would copy the wrong values
  // and can read out of bounds when this map is empty.
  // NOTE(review): entries are appended in other's order; presumably the map is
  // indexed by inner feature index, which the multi-val merge branch above
  // renumbers - confirm the ordering is still consistent in that branch.
  for (size_t i = 0; i < (other->numeric_feature_map_).size(); ++i) {
    const int feat_ind = other->numeric_feature_map_[i];
    if (feat_ind > -1) {
      numeric_feature_map_.push_back(feat_ind + num_numeric_features_);
    } else {
      numeric_feature_map_.push_back(-1);
    }
  }
  num_numeric_features_ += other->num_numeric_features_;
  if (has_raw_) {
    // One raw_data_ entry is appended per numeric feature of `other`.
    for (int i = 0; i < other->num_numeric_features_; ++i) {
      raw_data_.push_back(other->raw_data_[i]);
    }
  }
}