static ncclResult_t region_init_internal_p5en()

in src/tuner/nccl_ofi_regions.cpp [273:598]


/*
 * Populate the tuner region tables selected by this function (named for
 * "p5en") for AllReduce, AllGather and ReduceScatter.
 *
 * Two topologies are handled, chosen from region_ctx->dims:
 *   - num_ranks == 8 * num_nodes (eight ranks per node)
 *   - num_ranks == num_nodes     (one rank per node)
 * For any other shape no regions are registered and ncclSuccess is returned.
 *
 * Each region pairs an (algorithm, protocol) with a polygon. Judging by where
 * TUNER_MAX_SIZE and TUNER_MAX_RANKS appear, the first coordinate of a vertex
 * is a message size and the second a rank count -- TODO confirm against the
 * nccl_ofi_tuner_point_t definition. Neighbouring polygons share an edge and
 * list that edge's vertices in reverse order, so a shared vertex must be
 * spelled identically in both regions.
 *
 * extend_region(a, b, max) is defined elsewhere; it is used here to obtain the
 * far end of the a->b boundary (presumably projected out to the limits given
 * by max -- verify against its definition).
 *
 * @param region_ctx  Context whose dims are read and which is passed to
 *                    set_regions() for every collective.
 * @return ncclSuccess, or the first non-success code returned by
 *         set_regions().
 */
static ncclResult_t region_init_internal_p5en(nccl_ofi_tuner_region_context_t *region_ctx)
{
	ncclResult_t ret = ncclSuccess;
	ncclFunc_t collType;
	size_t nRanks = region_ctx->dims.num_ranks;
	size_t nNodes = region_ctx->dims.num_nodes;

	/*
	 * Each collective lives in its own brace scope so that `goto exit`
	 * never jumps over an initialized declaration in the same scope.
	 */
	if (nRanks == 8 * nNodes) {
		/* Eight ranks per node. */
		{
			collType = ncclFuncAllReduce;
			nccl_ofi_tuner_point_t extended_tree_ll =
				extend_region((nccl_ofi_tuner_point_t){262144, 192},
							  (nccl_ofi_tuner_point_t){262144, 1024},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
			nccl_ofi_tuner_point_t extended_tree_ll128 =
				extend_region((nccl_ofi_tuner_point_t){150994944, 128},
							  (nccl_ofi_tuner_point_t){251658240, 1024},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
			nccl_ofi_tuner_point_t extended_nvlstree_simple =
				extend_region((nccl_ofi_tuner_point_t){6442450944, 256},
							  (nccl_ofi_tuner_point_t){17179869184, 768},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_LL,
				 .num_vertices = 4,
				 .vertices = {{0, 16}, {262144, 16}, {262144, 1024}, extended_tree_ll}},
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 7,
				 .vertices = {extended_tree_ll,
							  {262144, 1024},
							  {262144, 16},
							  {14680064, 16},
							  {150994944, 128},
							  {251658240, 1024},
							  extended_tree_ll128}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 5,
				 .vertices = {{150994944, 128},
							  {14680064, 16},
							  {33554432, 16},
							  {536870912, 32},
							  {536870912, 128}}},
				{.algorithm = NCCL_ALGO_NVLS_TREE,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 8,
				 .vertices = {extended_tree_ll128,
							  {251658240, 1024},
							  {150994944, 128},
							  {536870912, 128},
							  {536870912, 32},
							  {6442450944, 256},
							  {17179869184, 768},
							  extended_nvlstree_simple}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 7,
				 .vertices = {extended_nvlstree_simple,
							  {17179869184, 768},
							  {6442450944, 256},
							  {536870912, 32},
							  {33554432, 16},
							  {1073741824, 16},
							  {TUNER_MAX_SIZE, 16}}}};
			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
		{
			collType = ncclFuncAllGather;
			nccl_ofi_tuner_point_t extended_ring_ll =
				extend_region((nccl_ofi_tuner_point_t){8388608, 256},
							  (nccl_ofi_tuner_point_t){33554432, 1024},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
			nccl_ofi_tuner_point_t extended_ring_ll128 =
				extend_region((nccl_ofi_tuner_point_t){8589934592, 512},
							  (nccl_ofi_tuner_point_t){17179869184, 1024},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL,
				 .num_vertices = 7,
				 .vertices = {{0, 16}, {131072, 16}, {262144, 32}, {8388608, 256}, {33554432, 1024}, extended_ring_ll, {0, TUNER_MAX_RANKS}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 10,
				 .vertices = {extended_ring_ll,
							  {33554432, 1024},
							  {8388608, 256},
							  {262144, 32},
							  {131072, 16},
							  {268435456, 16},
							  {2147483648, 128},
							  {8589934592, 512},
							  {17179869184, 1024},
							  extended_ring_ll128}},
				/*
				 * {2147483648, 128} is omitted below: it lies on the
				 * straight line from {268435456, 16} to
				 * {8589934592, 512}, so the boundary is unchanged.
				 */
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 6,
				 .vertices = {extended_ring_ll128,
							  {17179869184, 1024},
							  {8589934592, 512},
							  {268435456, 16},
							  {17179869184, 16},
							  {TUNER_MAX_SIZE, 16}}}};
			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
		{
			/* Same polygons as the AllGather table for this topology. */
			collType = ncclFuncReduceScatter;
			nccl_ofi_tuner_point_t extended_ring_ll =
				extend_region((nccl_ofi_tuner_point_t){8388608, 256},
							  (nccl_ofi_tuner_point_t){33554432, 1024},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
			nccl_ofi_tuner_point_t extended_ring_ll128 =
				extend_region((nccl_ofi_tuner_point_t){8589934592, 512},
							  (nccl_ofi_tuner_point_t){17179869184, 1024},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL,
				 .num_vertices = 7,
				 .vertices = {{0, 16}, {131072, 16}, {262144, 32}, {8388608, 256}, {33554432, 1024}, extended_ring_ll, {0, TUNER_MAX_RANKS}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 10,
				 .vertices = {extended_ring_ll,
							  {33554432, 1024},
							  {8388608, 256},
							  {262144, 32},
							  {131072, 16},
							  {268435456, 16},
							  {2147483648, 128},
							  {8589934592, 512},
							  {17179869184, 1024},
							  extended_ring_ll128}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 6,
				 .vertices = {extended_ring_ll128,
							  {17179869184, 1024},
							  {8589934592, 512},
							  {268435456, 16},
							  {17179869184, 16},
							  {TUNER_MAX_SIZE, 16}}}};
			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
	} else if (nRanks == nNodes) {
		/* One rank per node. */
		{
			collType = ncclFuncAllReduce;
			nccl_ofi_tuner_point_t extended_tree_ll128 =
				extend_region((nccl_ofi_tuner_point_t){524288, 8},
							  (nccl_ofi_tuner_point_t){1048576, 96},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
			nccl_ofi_tuner_point_t extended_tree_simple =
				extend_region((nccl_ofi_tuner_point_t){8388608, 32},
							  (nccl_ofi_tuner_point_t){33554432, 128},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
			nccl_ofi_tuner_point_t extended_ring_ll128 =
				extend_region((nccl_ofi_tuner_point_t){50331648, 16},
							  (nccl_ofi_tuner_point_t){301989888, 128},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_LL,
				 .num_vertices = 4,
				 .vertices = {{0, 2}, {65536, 2}, {65536, 64}, {65536, TUNER_MAX_RANKS}}},
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 7,
				 .vertices = {{65536, TUNER_MAX_RANKS},
							  {65536, 64},
							  {65536, 2},
							  {262144, 2},
							  {524288, 8},
							  {1048576, 96},
							  extended_tree_ll128}},
				/*
				 * The first three vertices retrace the TREE/LL128
				 * boundary in reverse, so the second vertex must be
				 * {1048576, 96} -- the same point used to compute
				 * extended_tree_ll128 -- otherwise this polygon would
				 * not follow the LL128 boundary.
				 */
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 7,
				 .vertices = {extended_tree_ll128,
							  {1048576, 96},
							  {524288, 8},
							  {262144, 2},
							  {8388608, 32},
							  {33554432, 128},
							  extended_tree_simple}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 8,
				 .vertices = {extended_tree_simple,
							  {33554432, 128},
							  {8388608, 32},
							  {262144, 2},
							  {6291456, 2},
							  {50331648, 16},
							  {301989888, 128},
							  extended_ring_ll128}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 5,
				 .vertices = {extended_ring_ll128,
							  {301989888, 128},
							  {50331648, 16},
							  {6291456, 2},
							  {TUNER_MAX_SIZE, 2}}}};
			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
		{
			collType = ncclFuncAllGather;
			nccl_ofi_tuner_point_t extended_pat_simple =
				extend_region((nccl_ofi_tuner_point_t){50331648, 64},
							  (nccl_ofi_tuner_point_t){117440512, 128},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
			nccl_ofi_tuner_point_t extended_ring_ll128 =
				extend_region((nccl_ofi_tuner_point_t){50331648, 16},
							  (nccl_ofi_tuner_point_t){301989888, 128},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_PAT,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 10,
				 .vertices = {{0, 2},
							  {65536, 2},
							  {1048576, 2},
							  {16777216, 32},
							  {50331648, 64},
							  {117440512, 128},
							  extended_pat_simple,
							  {TUNER_MAX_SIZE, TUNER_MAX_RANKS},
							  {65536, TUNER_MAX_RANKS},
							  {0, TUNER_MAX_RANKS}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 9,
				 .vertices = {extended_pat_simple,
							  {117440512, 128},
							  {50331648, 64},
							  {16777216, 32},
							  {1048576, 2},
							  {4194304, 2},
							  {50331648, 16},
							  {301989888, 128},
							  extended_ring_ll128}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 5,
				 .vertices = {extended_ring_ll128,
							  {301989888, 128},
							  {50331648, 16},
							  {4194304, 2},
							  {TUNER_MAX_SIZE, 2}}}};
			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
		{
			/* Same polygons as the AllGather table for this topology. */
			collType = ncclFuncReduceScatter;
			nccl_ofi_tuner_point_t extended_pat_simple =
				extend_region((nccl_ofi_tuner_point_t){50331648, 64},
							  (nccl_ofi_tuner_point_t){117440512, 128},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
			nccl_ofi_tuner_point_t extended_ring_ll128 =
				extend_region((nccl_ofi_tuner_point_t){50331648, 16},
							  (nccl_ofi_tuner_point_t){301989888, 128},
							  (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_PAT,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 10,
				 .vertices = {{0, 2},
							  {65536, 2},
							  {1048576, 2},
							  {16777216, 32},
							  {50331648, 64},
							  {117440512, 128},
							  extended_pat_simple,
							  {TUNER_MAX_SIZE, TUNER_MAX_RANKS},
							  {65536, TUNER_MAX_RANKS},
							  {0, TUNER_MAX_RANKS}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 9,
				 .vertices = {extended_pat_simple,
							  {117440512, 128},
							  {50331648, 64},
							  {16777216, 32},
							  {1048576, 2},
							  {4194304, 2},
							  {50331648, 16},
							  {301989888, 128},
							  extended_ring_ll128}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 5,
				 .vertices = {extended_ring_ll128,
							  {301989888, 128},
							  {50331648, 16},
							  {4194304, 2},
							  {TUNER_MAX_SIZE, 2}}}};
			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
	}
	/* Any other rank/node shape: nothing registered, ret stays ncclSuccess. */
exit:
	return ret;
}