AdjacencyMatrix get_intermediate_node()

in fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp [112:191]


AdjacencyMatrix<Node> get_intermediate_node(AdjacencyMatrix<Links> links) {
  auto world_size = at::cuda::getNumGPUs();
  auto intermediate_node = [&](Node i, Node j) {
    if (i == j) {
      return std::vector<Node>{-1};
    }
    if (links(i, j) != 0) {
      return std::vector<Node>{-1};
    }

    std::vector<std::pair<Node, Links>> paths;
    for (const auto k : c10::irange(world_size)) {
      if (k != i && k != j && links(i, k) != 0 && links(k, j) != 0) {
        paths.push_back({k, links(i, k) + links(k, j)});
      }
    }
    if (paths.empty()) {
      LOG(WARNING)
          << "Expect very bad performance for p2p copies, we are going via sys path for GPU "
          << i << " -> GPU " << j;
      return std::vector<Node>{-1};
    }
    auto mp = std::max_element(
                  paths.begin(),
                  paths.end(),
                  [](std::pair<Node, Links> a, std::pair<Node, Links> b) {
                    return a.second < b.second;
                  })
                  ->second;
    std::vector<Node> candidates;
    for (const auto& p : paths) {
      if (p.second == mp) {
        candidates.push_back(p.first);
      }
    }
    return candidates;
  };

  std::vector<Node> assignments(world_size * world_size);
  // Use a two-phase assignment protocol as the greedy approach
  // can lead to unbalanced usage.
  std::unordered_map<Node, int64_t> uses;
  for (const auto i : c10::irange(world_size)) {
    for (const auto j : c10::irange(world_size)) {
      auto ims = intermediate_node(i, j);
      if (ims.size() == 1) {
        auto v = ims.front();
        if (v != -1) {
          uses[v] += 1;
        }
        assignments[i * world_size + j] = v;
      }
    }
  }

  for (const auto i : c10::irange(world_size)) {
    for (const auto j : c10::irange(world_size)) {
      auto ims = intermediate_node(i, j);
      if (ims.size() > 1) {
        auto v = *std::min_element(ims.begin(), ims.end(), [&](Node a, Node b) {
          return uses[a] < uses[b];
        });
        uses[v] += 1;
        assignments[i * world_size + j] = v;
      }
    }
  }
  if (std::any_of(assignments.begin(), assignments.end(), [](Node n) {
        return n != -1;
      })) {
    auto tensor = at::from_blob(
        assignments.data(),
        {world_size, world_size},
        at::TensorOptions().dtype(at::kLong));
    LOG(INFO) << "Detected a multi-hop NVLink configuration: \n" << tensor;
    return [=](Node i, Node j) { return assignments[i * world_size + j]; };
  } else {
    return [](Node, Node) { return -1; };
  }
}