in src/backends/cuda/cuda.cpp [190:255]
std::string gen_node(const LoopTree <, const Auxiliary &aux,
UnrollMap &unroll, LoopTree::TreeRef ref) {
std::stringstream ss;
auto depth = lt.tree_node(ref).depth;
auto node_ref = lt.node(ref);
auto out_alloc = aux.allocs.at(node_ref);
const auto &node = lt.ir.node(node_ref);
if (node.op() == Operation::add) {
ss << indent(depth);
ss << gen_compute(lt, aux, unroll, ref, "+");
} else if (node.op() == Operation::multiply) {
ss << indent(depth);
ss << gen_compute(lt, aux, unroll, ref, "*");
} else if (node.op() == Operation::read) {
int external_memory = -1;
for (auto i = 0; i < lt.ir.inputs().size(); ++i) {
if (lt.ir.inputs()[i] == lt.node(ref)) {
external_memory = i;
}
}
ASSERT(external_memory > -1 && "No input found!");
auto out_alloc = aux.allocs.at(node_ref);
auto inp_alloc = out_alloc;
inp_alloc.lca = -1; // TODO clean up read hacks
ss << indent(depth);
ss << gen_access(lt, aux, out_alloc, ref, unroll);
ss << " = ";
ss << gen_access(lt, aux, inp_alloc, ref, unroll, external_memory);
ss << ";\n";
} else if (node.op() == Operation::write) {
int external_memory = -1;
for (auto i = 0; i < lt.ir.outputs().size(); ++i) {
if (lt.ir.outputs()[i] == lt.node(ref)) {
external_memory = i + lt.ir.inputs().size();
}
}
ASSERT(external_memory > -1 && "No output found!");
auto out_alloc = aux.allocs.at(node_ref);
ASSERT(node.inputs().size() == 1);
auto inp_alloc = aux.allocs.at(node.inputs().at(0));
ss << indent(depth);
ss << gen_access(lt, aux, out_alloc, ref, unroll, external_memory);
ss << " = ";
ss << gen_access(lt, aux, inp_alloc, ref, unroll);
ss << ";\n";
} else if (node.op() == Operation::view) {
auto out_alloc = aux.allocs.at(node_ref);
ASSERT(node.inputs().size() == 1)
<< "Cuda backend can only emit simple views";
auto dep_ref = node.inputs().at(0);
auto inp_alloc = aux.allocs.at(dep_ref);
ss << indent(depth);
ss << gen_access(lt, aux, out_alloc, ref, unroll);
ss << " = ";
ss << gen_access(lt, aux, inp_alloc, ref, unroll);
ss << ";\n";
} else {
ASSERT(0) << "node in IR yet supported in CUDA " << lt.ir.dump(node_ref);
}
return ss.str();
}