in fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp [112:191]
AdjacencyMatrix<Node> get_intermediate_node(AdjacencyMatrix<Links> links) {
auto world_size = at::cuda::getNumGPUs();
auto intermediate_node = [&](Node i, Node j) {
if (i == j) {
return std::vector<Node>{-1};
}
if (links(i, j) != 0) {
return std::vector<Node>{-1};
}
std::vector<std::pair<Node, Links>> paths;
for (const auto k : c10::irange(world_size)) {
if (k != i && k != j && links(i, k) != 0 && links(k, j) != 0) {
paths.push_back({k, links(i, k) + links(k, j)});
}
}
if (paths.empty()) {
LOG(WARNING)
<< "Expect very bad performance for p2p copies, we are going via sys path for GPU "
<< i << " -> GPU " << j;
return std::vector<Node>{-1};
}
auto mp = std::max_element(
paths.begin(),
paths.end(),
[](std::pair<Node, Links> a, std::pair<Node, Links> b) {
return a.second < b.second;
})
->second;
std::vector<Node> candidates;
for (const auto& p : paths) {
if (p.second == mp) {
candidates.push_back(p.first);
}
}
return candidates;
};
std::vector<Node> assignments(world_size * world_size);
// Use a two-phase assignment protocol as the greedy approach
// can lead to unbalanced usage.
std::unordered_map<Node, int64_t> uses;
for (const auto i : c10::irange(world_size)) {
for (const auto j : c10::irange(world_size)) {
auto ims = intermediate_node(i, j);
if (ims.size() == 1) {
auto v = ims.front();
if (v != -1) {
uses[v] += 1;
}
assignments[i * world_size + j] = v;
}
}
}
for (const auto i : c10::irange(world_size)) {
for (const auto j : c10::irange(world_size)) {
auto ims = intermediate_node(i, j);
if (ims.size() > 1) {
auto v = *std::min_element(ims.begin(), ims.end(), [&](Node a, Node b) {
return uses[a] < uses[b];
});
uses[v] += 1;
assignments[i * world_size + j] = v;
}
}
}
if (std::any_of(assignments.begin(), assignments.end(), [](Node n) {
return n != -1;
})) {
auto tensor = at::from_blob(
assignments.data(),
{world_size, world_size},
at::TensorOptions().dtype(at::kLong));
LOG(INFO) << "Detected a multi-hop NVLink configuration: \n" << tensor;
return [=](Node i, Node j) { return assignments[i * world_size + j]; };
} else {
return [](Node, Node) { return -1; };
}
}