in src/tuner/nccl_ofi_regions.cpp [273:598]
/*
 * Register the tuner regions for the p5en platform.
 *
 * For each supported collective (AllReduce, AllGather, ReduceScatter) a table
 * of polygonal regions is built and handed to set_regions(). Every region
 * carries an algorithm, a protocol, and a list of vertices; num_vertices must
 * equal the number of entries in the vertices initializer.
 *
 * NOTE(review): vertices appear to be {message size, rank count} pairs,
 * matching the {TUNER_MAX_SIZE, TUNER_MAX_RANKS} ordering used below --
 * confirm against the nccl_ofi_tuner_point_t definition.
 *
 * NOTE(review): extend_region(a, b, limit) presumably projects the segment
 * a -> b out to the limit point; the returned point is used as the shared
 * outer corner of two neighbouring regions. Neighbouring regions repeat the
 * vertices of their common boundary (in reverse order) so that they line up.
 *
 * Two topologies are handled:
 *   - num_ranks == 8 * num_nodes
 *   - num_ranks == num_nodes
 * For any other topology no regions are registered and ncclSuccess is
 * returned.
 *
 * @param region_ctx  region context; dims.num_ranks and dims.num_nodes are
 *                    read, and the context is passed on to set_regions().
 * @return ncclSuccess, or the first non-success value returned by
 *         set_regions().
 */
static ncclResult_t region_init_internal_p5en(nccl_ofi_tuner_region_context_t *region_ctx)
{
	ncclResult_t ret = ncclSuccess;
	ncclFunc_t collType;
	size_t nRanks = region_ctx->dims.num_ranks;
	size_t nNodes = region_ctx->dims.num_nodes;

	if (nRanks == 8 * nNodes) {
		/* 8 ranks per node */
		{
			collType = ncclFuncAllReduce;

			/* Outer corners shared by adjacent regions. */
			nccl_ofi_tuner_point_t extended_tree_ll =
				extend_region((nccl_ofi_tuner_point_t){262144, 192},
					      (nccl_ofi_tuner_point_t){262144, 1024},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_tree_ll128 =
				extend_region((nccl_ofi_tuner_point_t){150994944, 128},
					      (nccl_ofi_tuner_point_t){251658240, 1024},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_nvlstree_simple =
				extend_region((nccl_ofi_tuner_point_t){6442450944, 256},
					      (nccl_ofi_tuner_point_t){17179869184, 768},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_LL,
				 .num_vertices = 4,
				 .vertices = {{0, 16}, {262144, 16}, {262144, 1024}, extended_tree_ll}},
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 7,
				 .vertices = {extended_tree_ll,
					      {262144, 1024},
					      {262144, 16},
					      {14680064, 16},
					      {150994944, 128},
					      {251658240, 1024},
					      extended_tree_ll128}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 5,
				 .vertices = {{150994944, 128},
					      {14680064, 16},
					      {33554432, 16},
					      {536870912, 32},
					      {536870912, 128}}},
				{.algorithm = NCCL_ALGO_NVLS_TREE,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 8,
				 .vertices = {extended_tree_ll128,
					      {251658240, 1024},
					      {150994944, 128},
					      {536870912, 128},
					      {536870912, 32},
					      {6442450944, 256},
					      {17179869184, 768},
					      extended_nvlstree_simple}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 7,
				 .vertices = {extended_nvlstree_simple,
					      {17179869184, 768},
					      {6442450944, 256},
					      {536870912, 32},
					      {33554432, 16},
					      {1073741824, 16},
					      {TUNER_MAX_SIZE, 16}}}};

			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
		{
			collType = ncclFuncAllGather;

			nccl_ofi_tuner_point_t extended_ring_ll =
				extend_region((nccl_ofi_tuner_point_t){8388608, 256},
					      (nccl_ofi_tuner_point_t){33554432, 1024},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_ring_ll128 =
				extend_region((nccl_ofi_tuner_point_t){8589934592, 512},
					      (nccl_ofi_tuner_point_t){17179869184, 1024},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL,
				 .num_vertices = 7,
				 .vertices = {{0, 16},
					      {131072, 16},
					      {262144, 32},
					      {8388608, 256},
					      {33554432, 1024},
					      extended_ring_ll,
					      {0, TUNER_MAX_RANKS}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 10,
				 .vertices = {extended_ring_ll,
					      {33554432, 1024},
					      {8388608, 256},
					      {262144, 32},
					      {131072, 16},
					      {268435456, 16},
					      {2147483648, 128},
					      {8589934592, 512},
					      {17179869184, 1024},
					      extended_ring_ll128}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 6,
				 .vertices = {extended_ring_ll128,
					      {17179869184, 1024},
					      {8589934592, 512},
					      {268435456, 16},
					      {17179869184, 16},
					      {TUNER_MAX_SIZE, 16}}}};

			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
		{
			/* Same region table as AllGather above. */
			collType = ncclFuncReduceScatter;

			nccl_ofi_tuner_point_t extended_ring_ll =
				extend_region((nccl_ofi_tuner_point_t){8388608, 256},
					      (nccl_ofi_tuner_point_t){33554432, 1024},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_ring_ll128 =
				extend_region((nccl_ofi_tuner_point_t){8589934592, 512},
					      (nccl_ofi_tuner_point_t){17179869184, 1024},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL,
				 .num_vertices = 7,
				 .vertices = {{0, 16},
					      {131072, 16},
					      {262144, 32},
					      {8388608, 256},
					      {33554432, 1024},
					      extended_ring_ll,
					      {0, TUNER_MAX_RANKS}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 10,
				 .vertices = {extended_ring_ll,
					      {33554432, 1024},
					      {8388608, 256},
					      {262144, 32},
					      {131072, 16},
					      {268435456, 16},
					      {2147483648, 128},
					      {8589934592, 512},
					      {17179869184, 1024},
					      extended_ring_ll128}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 6,
				 .vertices = {extended_ring_ll128,
					      {17179869184, 1024},
					      {8589934592, 512},
					      {268435456, 16},
					      {17179869184, 16},
					      {TUNER_MAX_SIZE, 16}}}};

			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
	} else if (nRanks == nNodes) {
		/* 1 rank per node */
		{
			collType = ncclFuncAllReduce;

			nccl_ofi_tuner_point_t extended_tree_ll128 =
				extend_region((nccl_ofi_tuner_point_t){524288, 8},
					      (nccl_ofi_tuner_point_t){1048576, 96},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_tree_simple =
				extend_region((nccl_ofi_tuner_point_t){8388608, 32},
					      (nccl_ofi_tuner_point_t){33554432, 128},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_ring_ll128 =
				extend_region((nccl_ofi_tuner_point_t){50331648, 16},
					      (nccl_ofi_tuner_point_t){301989888, 128},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_LL,
				 .num_vertices = 4,
				 .vertices = {{0, 2}, {65536, 2}, {65536, 64}, {65536, TUNER_MAX_RANKS}}},
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 7,
				 .vertices = {{65536, TUNER_MAX_RANKS},
					      {65536, 64},
					      {65536, 2},
					      {262144, 2},
					      {524288, 8},
					      {1048576, 96},
					      extended_tree_ll128}},
				/*
				 * The boundary with TREE/LL128 must reuse the exact
				 * {524288, 8} -> {1048576, 96} segment that
				 * extended_tree_ll128 is derived from; any other
				 * rank value here makes the two regions overlap.
				 */
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 7,
				 .vertices = {extended_tree_ll128,
					      {1048576, 96},
					      {524288, 8},
					      {262144, 2},
					      {8388608, 32},
					      {33554432, 128},
					      extended_tree_simple}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 8,
				 .vertices = {extended_tree_simple,
					      {33554432, 128},
					      {8388608, 32},
					      {262144, 2},
					      {6291456, 2},
					      {50331648, 16},
					      {301989888, 128},
					      extended_ring_ll128}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 5,
				 .vertices = {extended_ring_ll128,
					      {301989888, 128},
					      {50331648, 16},
					      {6291456, 2},
					      {TUNER_MAX_SIZE, 2}}}};

			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
		{
			collType = ncclFuncAllGather;

			nccl_ofi_tuner_point_t extended_pat_simple =
				extend_region((nccl_ofi_tuner_point_t){50331648, 64},
					      (nccl_ofi_tuner_point_t){117440512, 128},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_ring_ll128 =
				extend_region((nccl_ofi_tuner_point_t){50331648, 16},
					      (nccl_ofi_tuner_point_t){301989888, 128},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_PAT,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 10,
				 .vertices = {{0, 2},
					      {65536, 2},
					      {1048576, 2},
					      {16777216, 32},
					      {50331648, 64},
					      {117440512, 128},
					      extended_pat_simple,
					      {TUNER_MAX_SIZE, TUNER_MAX_RANKS},
					      {65536, TUNER_MAX_RANKS},
					      {0, TUNER_MAX_RANKS}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 9,
				 .vertices = {extended_pat_simple,
					      {117440512, 128},
					      {50331648, 64},
					      {16777216, 32},
					      {1048576, 2},
					      {4194304, 2},
					      {50331648, 16},
					      {301989888, 128},
					      extended_ring_ll128}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 5,
				 .vertices = {extended_ring_ll128,
					      {301989888, 128},
					      {50331648, 16},
					      {4194304, 2},
					      {TUNER_MAX_SIZE, 2}}}};

			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
		{
			/* Same region table as AllGather above. */
			collType = ncclFuncReduceScatter;

			nccl_ofi_tuner_point_t extended_pat_simple =
				extend_region((nccl_ofi_tuner_point_t){50331648, 64},
					      (nccl_ofi_tuner_point_t){117440512, 128},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_ring_ll128 =
				extend_region((nccl_ofi_tuner_point_t){50331648, 16},
					      (nccl_ofi_tuner_point_t){301989888, 128},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_PAT,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 10,
				 .vertices = {{0, 2},
					      {65536, 2},
					      {1048576, 2},
					      {16777216, 32},
					      {50331648, 64},
					      {117440512, 128},
					      extended_pat_simple,
					      {TUNER_MAX_SIZE, TUNER_MAX_RANKS},
					      {65536, TUNER_MAX_RANKS},
					      {0, TUNER_MAX_RANKS}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 9,
				 .vertices = {extended_pat_simple,
					      {117440512, 128},
					      {50331648, 64},
					      {16777216, 32},
					      {1048576, 2},
					      {4194304, 2},
					      {50331648, 16},
					      {301989888, 128},
					      extended_ring_ll128}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 5,
				 .vertices = {extended_ring_ll128,
					      {301989888, 128},
					      {50331648, 16},
					      {4194304, 2},
					      {TUNER_MAX_SIZE, 2}}}};

			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
	}
	/* Any other topology: no regions are registered; ret stays ncclSuccess. */

exit:
	return ret;
}