std::string gen_access()

in src/backends/cuda/cuda.cpp [82:162]


std::string gen_access(const LoopTree &lt, const Auxiliary &aux,
                       const Allocation &alloc, LoopTree::TreeRef use,
                       const UnrollMap &unroll, int external_idx = -1) {
  std::stringstream ss;

  std::vector<LoopTree::Loop> parent_chain;
  auto parent = lt.parent(use);
  while (parent != -1) {
    parent_chain.emplace_back(lt.loop(parent));
    parent = lt.parent(parent);
  }
  std::reverse(parent_chain.begin(), parent_chain.end());
  auto order = parent_chain;  // lt.loop_order(lt.node(use));
  auto idx_vec = gen_idx_vector(lt, aux, alloc, use);

  // To vectorize, every inner size needs to be
  // evenly divisible by 4 (unless innermost)
  // TODO relax this by adding modular arithmetic or casting
  bool unrolled_vectorize = false;
  for (const auto &p : idx_vec) {
    bool innermost = &p == &idx_vec.front();
    unrolled_vectorize &= (p.second % 4 == 0) || (innermost && (p.second == 1));
  }

  bool vectorize = false;
  bool flatten = true;  // cast from float4 to float
  if (alloc.size % 4 == 0) {
    // we can index directly into the vector
    if (unrolled_vectorize) {
      vectorize = true;
      flatten = false;
    } else {
      flatten = true;
    }
  }

  // memory name
  if (flatten) {
    ss << "((float*)";
  }
  if (external_idx > -1) {  // for reads/writes to real memory
    ss << "ext_" << external_idx;
  } else {
    ss << "mem_" << alloc.idx;
  }
  if (flatten) {
    ss << ")";
  }

  // memory index
  std::string extra = "";
  ss << "[";
  for (const auto &p : idx_vec) {
    auto loop = order.at(p.first);
    auto size = p.second;
    bool innermost = &p == &idx_vec.front();
    if (vectorize) {
      ASSERT(size % 4 == 0 || (innermost && (size == 1)))
          << "invalid unroll for " << lt.ir.dump(lt.node(use))
          << " found innermost size to be " << size << " for var at depth "
          << p.first << "\n";
    }
    std::pair<IR::VarRef, int> key = {loop.var, loop.var_depth};
    if (unroll.count(key)) {
      ss << unroll.at(key) * size / (vectorize ? 4 : 1);
      ASSERT((size % (vectorize ? 4 : 1) == 0) || (innermost && (size == 1)));
      if (innermost && vectorize) {
        extra = "." + (std::vector<std::string>(
                          {"x", "y", "z", "w"})[unroll.at(key) % 4]);
      }
    } else {
      auto s = (size / (vectorize ? 4 : 1));
      ss << lt.ir.var(loop.var).name() << "_" << loop.var_depth << " * " << s;
    }
    ss << " + ";
  }
  ss << "0";  // keeps the expression well formed
  ss << "]" << extra;

  return ss.str();
};