in src/backends/cuda/cuda.cpp [82:162]
std::string gen_access(const LoopTree <, const Auxiliary &aux,
const Allocation &alloc, LoopTree::TreeRef use,
const UnrollMap &unroll, int external_idx = -1) {
std::stringstream ss;
std::vector<LoopTree::Loop> parent_chain;
auto parent = lt.parent(use);
while (parent != -1) {
parent_chain.emplace_back(lt.loop(parent));
parent = lt.parent(parent);
}
std::reverse(parent_chain.begin(), parent_chain.end());
auto order = parent_chain; // lt.loop_order(lt.node(use));
auto idx_vec = gen_idx_vector(lt, aux, alloc, use);
// To vectorize, every inner size needs to be
// evenly divisible by 4 (unless innermost)
// TODO relax this by adding modular arithmetic or casting
bool unrolled_vectorize = false;
for (const auto &p : idx_vec) {
bool innermost = &p == &idx_vec.front();
unrolled_vectorize &= (p.second % 4 == 0) || (innermost && (p.second == 1));
}
bool vectorize = false;
bool flatten = true; // cast from float4 to float
if (alloc.size % 4 == 0) {
// we can index directly into the vector
if (unrolled_vectorize) {
vectorize = true;
flatten = false;
} else {
flatten = true;
}
}
// memory name
if (flatten) {
ss << "((float*)";
}
if (external_idx > -1) { // for reads/writes to real memory
ss << "ext_" << external_idx;
} else {
ss << "mem_" << alloc.idx;
}
if (flatten) {
ss << ")";
}
// memory index
std::string extra = "";
ss << "[";
for (const auto &p : idx_vec) {
auto loop = order.at(p.first);
auto size = p.second;
bool innermost = &p == &idx_vec.front();
if (vectorize) {
ASSERT(size % 4 == 0 || (innermost && (size == 1)))
<< "invalid unroll for " << lt.ir.dump(lt.node(use))
<< " found innermost size to be " << size << " for var at depth "
<< p.first << "\n";
}
std::pair<IR::VarRef, int> key = {loop.var, loop.var_depth};
if (unroll.count(key)) {
ss << unroll.at(key) * size / (vectorize ? 4 : 1);
ASSERT((size % (vectorize ? 4 : 1) == 0) || (innermost && (size == 1)));
if (innermost && vectorize) {
extra = "." + (std::vector<std::string>(
{"x", "y", "z", "w"})[unroll.at(key) % 4]);
}
} else {
auto s = (size / (vectorize ? 4 : 1));
ss << lt.ir.var(loop.var).name() << "_" << loop.var_depth << " * " << s;
}
ss << " + ";
}
ss << "0"; // keeps the expression well formed
ss << "]" << extra;
return ss.str();
};