in src/core/storage/sframe_data/testing_utils.cpp [163:519]
/**
 * Builds an sframe filled with deterministic pseudo-random test data.
 *
 * Every value is derived by hashing the row index together with the
 * (hashed) seed, so the value written for row i depends only on i, the
 * seed and column_types -- not on how the rows are divided among threads.
 *
 * \param n_rows          Number of rows to generate.
 * \param column_types    One character per column selecting what the
 *                        column contains.  Column k (0-based) is named
 *                        "X<k+1>-<code>".  Recognized codes:
 *
 *     n / N   FLOAT in [0, 1]; 'N' yields NAN for roughly 1 draw in 100.
 *     r / R   FLOAT in [-100, 100] / [-1000, 1000] ('R' with NANs).
 *     b       INTEGER in {0, 1}.
 *     z / Z   INTEGER in [1, 10] / [1, 100].
 *     s S c C STRING "C-<k>", k drawn from a pool of 10 / 100 / 1000 /
 *             100000 values.
 *     x / X   STRING of 32 / 64 hex digits from a pool of 1000 / 100000.
 *     h / H   STRING of 32 / 64 hex digits, pool of size_t(-1).
 *     v / V   VECTOR of 10 / 100 doubles in [0, 1].
 *     w / W   Same as v / V, but elements may be NAN.
 *     l / L   LIST of up to 10 / 100 integers in [0, 100] / [0, 1000].
 *     m / M   Same as l / L with each integer rendered as "C-<k>".
 *     u / U   Same as m / M, de-duplicated and sorted.
 *     d / D   DICT of up to 10 / 100 entries, keys "K-<k>" with k in
 *             [1, 100] / [1, 1000], values in [0, 1].
 *     1 2 3 4 ND_VECTOR of shape {10}, {4,3}, {4,3,2}, {4,3,2,2}, built
 *             with an empty stride argument.
 *     A       ND_VECTOR of shape {2,3,4} whose strides follow a random
 *             permutation of the axes.
 *
 *                        Any other character triggers ASSERT_MSG.
 * \param generate_target If true, an extra INTEGER column "target" is
 *                        appended.  Its value is the sum of a
 *                        contribution from every rng_int / rng_dbl draw
 *                        made while filling the row, so it is a function
 *                        of the row's other values.
 * \param _random_seed    Seed; hashed once before use.
 *
 * \return The populated sframe, already closed for writing.
 */
sframe make_random_sframe(
    size_t n_rows, std::string column_types,
    bool generate_target, size_t _random_seed) {

  sframe data;

  const size_t num_columns = column_types.size();
  size_t n_threads = thread::cpu_count();

  std::vector<std::string> names(num_columns);
  std::vector<flex_type_enum> types(num_columns);

  ////////////////////////////////////////////////////////////////////////////////
  // Derive the name and storage type of each column from its type code.

  for(size_t c_idx = 0; c_idx < num_columns; c_idx++) {

    names[c_idx] = std::string("X") + std::to_string(c_idx + 1) + "-" + column_types[c_idx];

    switch(column_types[c_idx]) {
      case 'n':
      case 'N':
      case 'r':
      case 'R':
        types[c_idx] = flex_type_enum::FLOAT;
        break;

      case 'b':
      case 'z':
      case 'Z':
        types[c_idx] = flex_type_enum::INTEGER;
        break;

      case 'c':
      case 'C':
      case 's':
      case 'S':
      case 'x':
      case 'X':
      case 'h':
      case 'H':
        types[c_idx] = flex_type_enum::STRING;
        break;

      case 'v':
      case 'V':
      case 'w':
      case 'W':
        types[c_idx] = flex_type_enum::VECTOR;
        break;

      case 'l':
      case 'L':
      case 'm':
      case 'M':
      case 'u':
      case 'U':
        types[c_idx] = flex_type_enum::LIST;
        break;

      case 'd':
      case 'D':
        types[c_idx] = flex_type_enum::DICT;
        break;

      case '1':
      case '2':
      case '3':
      case '4':
      case 'A':
        types[c_idx] = flex_type_enum::ND_VECTOR;
        break;

      default: {
        std::string msg = (std::string("Column type ") + column_types[c_idx] + " not recognized.");
        ASSERT_MSG(false, msg.c_str());
        break;
      }
    }
  }

  // The target, when requested, is stored in one extra trailing column.
  size_t target_column = names.size();

  if(generate_target) {
    names.push_back("target");
    // NOTE(review): the original comment here read "Changed to float later
    // on", but nothing in this function converts the column; it is written
    // as an integer below.  Confirm whether a caller performs a conversion.
    types.push_back(flex_type_enum::INTEGER);
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Create the sframe with each of the columns as determined above.
  // NOTE(review): the segment count passed here is n_threads; this assumes
  // in_parallel never hands out a thread_idx >= n_threads -- confirm.

  data.open_for_write(names, types, "", n_threads);

  // Constants used to build a target that can mostly be learned from the
  // other columns.
  static const size_t n_bins = 16;
  static const size_t n_target_precision = (1 << 24);

  // Hash the seed once for a bit of extra randomness.
  uint64_t random_seed = hash64(_random_seed);

  // Per-bin offsets, each in [-n_target_precision/2, n_target_precision/2),
  // that integer draws add to the target.
  std::vector<flex_int> target_adjust;

  if(generate_target) {
    target_adjust.resize(n_bins);

    size_t c = 0;
    for(flex_int & x : target_adjust) {
      // Done in signed arithmetic so the negative half of the range does
      // not rely on unsigned wraparound.
      x = static_cast<flex_int>(hash64(++c, random_seed) % n_target_precision)
          - static_cast<flex_int>(n_target_precision / 2);
    }
  }

  in_parallel([&](size_t thread_idx, size_t num_segments) {

    auto it_out = data.get_output_iterator(thread_idx);

    // Reused for every row written by this thread.
    std::vector<flexible_type> row(num_columns + (generate_target ? 1 : 0));

    // This thread's contiguous share of the rows.
    size_t start_idx = (thread_idx * n_rows) / num_segments;
    size_t end_idx = ((thread_idx + 1) * n_rows) / num_segments;

    for(size_t i = start_idx; i < end_idx; ++i, ++it_out) {

      /** Base random number generator state.  It is seeded from the row
       *  index alone and advanced by one for every draw; the order of the
       *  draws therefore determines the generated values.
       */
      size_t _rng_state = hash64(i, random_seed);

      // Running target value for this row.
      flex_int target_value = 0;

      // Go through the columns, randomly filling each.
      for(size_t c_idx = 0; c_idx < num_columns; ++c_idx) {

        /** Base random number generators.  If there is a target
         *  present, then these also affect the target.
         */

        // Uniform integer in [lb, ub].  The range must not cover all of
        // size_t, since (ub - lb + 1) would then wrap to a zero modulus.
        auto rng_int = [&](size_t lb, size_t ub) GL_GCC_ONLY(GL_HOT_INLINE_FLATTEN) {
          size_t z = size_t(hash64(++_rng_state) % (ub - lb + 1));

          if(generate_target) {
            target_value += target_adjust[z % target_adjust.size()];
          }

          return z + lb;
        };

        // Uniform double in [lb, ub], rounded to a multiple of 2^-12.
        auto rng_dbl = [&](double lb, double ub) GL_GCC_ONLY(GL_HOT_INLINE_FLATTEN) {
          double v01 = double(hash64(++_rng_state)) / std::numeric_limits<uint64_t>::max();

          if(generate_target) {
            target_value += static_cast<flex_int>(
                std::round(n_target_precision * v01) - (n_target_precision/2));
          }

          double v = lb + (ub - lb) * v01;

          // Round to the nearest multiple of 2^-12 in order to accommodate
          // possible float32 vs float64 issues.
          const double C = double(1 << 12);
          return std::round(v * C) / C;
        };

        /** Composite random number generators.
         */

        // Like rng_dbl, but first spends one draw deciding whether to
        // return NAN instead (about 1 time in 100).  A NAN result adds
        // nothing to the target.
        auto rng_dbl_nan = [&](double lb, double ub) GL_GCC_ONLY(GL_HOT_INLINE_FLATTEN) {
          return (hash64(++_rng_state) < (std::numeric_limits<uint64_t>::max() / 100)
                  ? NAN : rng_dbl(lb,ub));
        };

        // Generate a random string of the form "C-###", where ### is a
        // decimal index in [0, pool_size - 1].  pool_size bounds the number
        // of distinct strings and must be non-zero.
        auto rng_str = [&](size_t pool_size) GL_GCC_ONLY(GL_HOT_INLINE_FLATTEN) {
          return std::string("C-") + std::to_string(rng_int(0, pool_size - 1));
        };

        // Generate a random hex key of exactly `length` characters.  The key
        // is a deterministic function of one of pool_size slots, which
        // limits the number of distinct keys.  pool_size must be non-zero.
        // This draw does not contribute to the target.
        auto rng_hex = [&](size_t length, size_t pool_size) GL_GCC_ONLY(GL_HOT_INLINE_FLATTEN) {
          static const char charset[] = "0123456789abcdef";

          size_t x = hash64(random_seed, hash64(++_rng_state) % pool_size);

          std::string ret;
          ret.reserve(length);

          // Each 64-bit word supplies 16 hex digits, low nibble first;
          // re-hash the word whenever more digits are needed.
          while(ret.size() < length) {
            uint64_t number = x;

            for(size_t j = 0; j < 16; ++j) {
              ret.push_back(charset[number & 0xF]);
              number >>= 4;

              if(ret.size() >= length) {
                return ret;
              }
            }

            x = hash64(x);
          }

          return ret;
        };

        // Generate a random list of between 0 and max_size elements, each
        // drawn from [0, key_pool_size].  With string_values the elements
        // become "C-<k>" strings; with unique the elements are
        // de-duplicated and the final list is sorted.
        auto rng_list =
            [&](size_t max_size, size_t key_pool_size, bool string_values, bool unique)
            GL_GCC_ONLY(GL_HOT_INLINE_FLATTEN) {

          size_t n_elements = rng_int(0, max_size);

          std::vector<flex_int> v(n_elements);

          for (flex_int& f : v) {
            f = rng_int(0, key_pool_size);
          }

          if(unique) {
            std::sort(v.begin(), v.end());
            v.erase(std::unique(v.begin(), v.end()), v.end());
          }

          flex_list ret(v.begin(), v.end());

          if(string_values) {
            for(flexible_type& f : ret) {
              f = "C-" + f.to<flex_string>();
            }
          }

          // Sorted again because string order differs from numeric order.
          if(unique) {
            std::sort(ret.begin(), ret.end());
          }

          return ret;
        };

        // Generate a random vector of s values in [0, 1].
        auto rng_vec = [&](size_t s) GL_GCC_ONLY(GL_HOT_INLINE_FLATTEN) {
          flex_vec v(s);
          for(double& f : v) f = rng_dbl(0,1);
          return v;
        };

        // Generate a random vector of s values in [0, 1], possibly with NANs.
        auto rng_vec_nan = [&](size_t s) GL_GCC_ONLY(GL_HOT_INLINE_FLATTEN) {
          flex_vec v(s);
          for(double& f : v) f = rng_dbl_nan(0,1);
          return v;
        };

        // Generate a random dictionary.  Up to max_size (key, value) draws
        // are made; repeated keys overwrite, so the result may be smaller.
        // Entries are emitted in increasing numeric key order.
        auto rng_dict = [&](size_t max_size, size_t key_pool_size) GL_GCC_ONLY(GL_HOT_INLINE_FLATTEN) {

          std::map<size_t,double> m;

          size_t n_draws = rng_int(0, max_size);

          for(size_t k = 0; k < n_draws; ++k) {
            size_t index = rng_int(1, key_pool_size);
            double value = rng_dbl(0, 1);
            m[index] = value;
          }

          flex_dict d;
          d.reserve(m.size());

          for(const auto& p : m) {
            d.push_back( {flex_string("K-" + std::to_string(p.first)), p.second} );
          }

          return d;
        };

        // Generate an nd-vector with the given shape and stride whose
        // elements are drawn from [0, 1].
        auto rng_nd_vec = [&](const flex_nd_vec::index_range_type& shape,
                              const flex_nd_vec::index_range_type& stride) {
          flex_nd_vec v(shape, stride, 0.0);
          size_t n = v.num_elem();
          for (size_t vidx = 0; vidx < n; ++vidx) {
            v[vidx] = rng_dbl(0, 1);
          }
          return v;
        };

        // Based on the column output type, write it out.
        switch(column_types[c_idx]) {
          case 'n': { row[c_idx] = rng_dbl(0,1);           break; }
          case 'N': { row[c_idx] = rng_dbl_nan(0,1);       break; }
          case 'r': { row[c_idx] = rng_dbl(-100,100);      break; }
          case 'R': { row[c_idx] = rng_dbl_nan(-1000,1000); break; }
          case 'b': { row[c_idx] = rng_int(0, 1);          break; }
          case 'z': { row[c_idx] = rng_int(1, 10);         break; }
          case 'Z': { row[c_idx] = rng_int(1, 100);        break; }
          case 's': { row[c_idx] = rng_str(10);            break; }
          case 'S': { row[c_idx] = rng_str(100);           break; }
          case 'c': { row[c_idx] = rng_str(1000);          break; }
          case 'C': { row[c_idx] = rng_str(100000);        break; }
          case 'x': { row[c_idx] = rng_hex(32, 1000);      break; }
          case 'X': { row[c_idx] = rng_hex(64, 100000);    break; }
          case 'h': { row[c_idx] = rng_hex(32, size_t(-1)); break; }
          case 'H': { row[c_idx] = rng_hex(64, size_t(-1)); break; }
          case 'v': { row[c_idx] = rng_vec(10);            break; }
          case 'V': { row[c_idx] = rng_vec(100);           break; }
          case 'w': { row[c_idx] = rng_vec_nan(10);        break; }
          case 'W': { row[c_idx] = rng_vec_nan(100);       break; }
          case 'l': { row[c_idx] = rng_list(10, 100, false, false);   break; }
          case 'L': { row[c_idx] = rng_list(100, 1000, false, false); break; }
          case 'm': { row[c_idx] = rng_list(10, 100, true, false);    break; }
          case 'M': { row[c_idx] = rng_list(100, 1000, true, false);  break; }
          case 'u': { row[c_idx] = rng_list(10, 100, true, true);     break; }
          case 'U': { row[c_idx] = rng_list(100, 1000, true, true);   break; }
          case 'd': { row[c_idx] = rng_dict(10, 100);      break; }
          case 'D': { row[c_idx] = rng_dict(100, 1000);    break; }
          case '1': { row[c_idx] = rng_nd_vec({10}, {});       break; }
          case '2': { row[c_idx] = rng_nd_vec({4,3}, {});      break; }
          case '3': { row[c_idx] = rng_nd_vec({4,3,2}, {});    break; }
          case '4': { row[c_idx] = rng_nd_vec({4,3,2,2}, {});  break; }
          case 'A': {
            flex_nd_vec::index_range_type shape = {2,3,4};
            flex_nd_vec::index_range_type stride(3);
            flex_nd_vec::index_range_type _order = {0, 1, 2};
            // NOTE(review): presumably a no-op after the sized construction
            // above if index_range_type is a std::vector -- kept as-is.
            stride.resize(3, 0);

            // Pick the axes in random order; the first axis picked gets
            // stride 1 and each later one the product of the extents of
            // the axes picked before it.
            size_t cur_stride = 1;
            while (!_order.empty()) {
              size_t pick_index =
                  rng_int(0, _order.size() - 1);
              size_t index = _order[pick_index];
              _order.erase(_order.begin() + pick_index);
              stride[index] = cur_stride;
              cur_stride *= shape[index];
            }
            row[c_idx] = rng_nd_vec(shape, stride);
            break;
          }

          default: {
            std::string msg = (std::string("Column type ") + column_types[c_idx] + " not recognized.");
            ASSERT_MSG(false, msg.c_str());
          }
        }
      }

      if(generate_target) {
        row[target_column] = target_value;
      }

      *it_out = row;
    }
  });

  data.close();

  return data;
}