static ncclResult_t region_init_internal_p5_p5e()

in src/tuner/nccl_ofi_regions.cpp [604:899]


static ncclResult_t region_init_internal_p5_p5e(nccl_ofi_tuner_region_context_t *region_ctx)
{
	ncclResult_t ret = ncclSuccess;
	ncclFunc_t collType;
	size_t nRanks = region_ctx->dims.num_ranks;
	size_t nNodes = region_ctx->dims.num_nodes;

	if (nRanks == 8 * nNodes) {
		{
			collType = ncclFuncAllReduce;
			nccl_ofi_tuner_point_t extended_tree_ll128 =
				extend_region((nccl_ofi_tuner_point_t){402653184, 2048},
					      (nccl_ofi_tuner_point_t){402653184, 4096},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_nvlstree_simple_1 =
				extend_region((nccl_ofi_tuner_point_t){8053063680, 160},
					      (nccl_ofi_tuner_point_t){9663676416, 192},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_nvlstree_simple_2 =
				extend_region((nccl_ofi_tuner_point_t){402653184, 2048},
					      (nccl_ofi_tuner_point_t){402653184, 4096},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_ring_simple =
				extend_region((nccl_ofi_tuner_point_t){8053063680, 160},
					      (nccl_ofi_tuner_point_t){9663676416, 192},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 12,
				 .vertices = {{0, 16},
					      {31457280, 16},
					      {37748736, 32},
					      {117440512, 64},
					      {301989888, 128},
					      {301989888, 256},
					      {335544320, 512},
					      {536870912, 1024},
					      {402653184, 2048},
					      {402653184, 4096},
					      extended_tree_ll128,
					      {0, extended_tree_ll128.y}}},
				{.algorithm = NCCL_ALGO_NVLS_TREE,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 3,
				 .vertices = {{31457281, 16}, {TUNER_MAX_SIZE, 16}, {31457281, 16}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 11,
				 .vertices = {{31457280, 17},
					      {1073741824, 17},
					      {2147483648, 64},
					      {2147483648, 128},
					      {1342177280, 160},
					      {2147483648, 256},
					      {1074790400, 256},
					      {444596224, 160},
					      {301989888, 128},
					      {117440512, 64},
					      {37748736, 32}}},
				{.algorithm = NCCL_ALGO_NVLS_TREE,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 17,
				 .vertices = {{2147483648, 128},
					      {6442450944, 128},
					      {8053063680, 160},
					      {9663676416, 192},
					      extended_nvlstree_simple_1,
					      extended_nvlstree_simple_2,
					      {402653184, 4096},
					      {402653184, 2048},
					      {536870912, 1024},
					      {335544320, 512},
					      {301989888, 256},
					      {310378496, 160},
					      {444596224, 160},
					      {1074790400, 256},
					      {2684354560, 256},
					      {2147483648, 224},
					      {1342177280, 160}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 7,
				 .vertices = {{1073741824, 17},
					      {extended_ring_simple.x, 17},
					      extended_ring_simple,
					      {9663676416, 192},
					      {8053063680, 160},
					      {2684354560, 64},
					      {1610612736, 32}}}};

			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
	} else if (nRanks == 2 * nNodes) {
		{
			collType = ncclFuncAllReduce;
			nccl_ofi_tuner_point_t extended_tree_ll128 =
				extend_region((nccl_ofi_tuner_point_t){88160256, 128},
					      (nccl_ofi_tuner_point_t){178163712, 256},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_tree_simple_1 =
				extend_region((nccl_ofi_tuner_point_t){787480576, 128},
					      (nccl_ofi_tuner_point_t){1073741824, 256},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_tree_simple_2 =
				extend_region((nccl_ofi_tuner_point_t){257114112, 128},
					      (nccl_ofi_tuner_point_t){269484032, 256},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			nccl_ofi_tuner_point_t extended_nvlstree_simple =
				extend_region((nccl_ofi_tuner_point_t){787480576, 128},
					      (nccl_ofi_tuner_point_t){1073741824, 256},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 11,
				 .vertices = {{0, 4},
					      {1314816, 4},
					      {1051648, 8},
					      {1051648, 12},
					      {2367488, 16},
					      {5525504, 32},
					      {9473024, 64},
					      {88160256, 128},
					      {178163712, 256},
					      extended_tree_ll128,
					      {0, extended_tree_ll128.y}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 14,
				 .vertices = {{1314816, 4},
					      {19736576, 4},
					      {41842688, 8},
					      {296747008, 64},
					      {257114112, 128},
					      {269484032, 256},
					      {178163712, 256},
					      {88160256, 128},
					      {9473024, 64},
					      {5525504, 32},
					      {2367488, 16},
					      {1051648, 12},
					      {1051648, 8},
					      {1314816, 4}}},
				{.algorithm = NCCL_ALGO_NVLS_TREE,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 6,
				 .vertices = {{19736576, 4},
					      {81844224, 4},
					      {275775488, 8},
					      {275775488, 48},
					      {296747008, 64},
					      {41842688, 8}}},
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 3,
				 .vertices = {{81844224, 4}, {269484032, 4}, {81844224, 4}}},
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 3,
				 .vertices = {{269484032, 4}, {TUNER_MAX_SIZE, 4}, {269484032, 4}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 10,
				 .vertices = {{81844224, 5},
					      {TUNER_MAX_SIZE, 5},
					      {TUNER_MAX_SIZE, 32},
					      {1073741824, 40},
					      {1073741824, 128},
					      {787480576, 128},
					      {296747008, 64},
					      {275775488, 48},
					      {275775488, 8},
					      {81844224, 5}}},
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 7,
				 .vertices = {{296747008, 64},
					      {787480576, 128},
					      {1073741824, 256},
					      extended_tree_simple_1,
					      extended_tree_simple_2,
					      {269484032, 256},
					      {257114112, 128}}},
				{.algorithm = NCCL_ALGO_NVLS_TREE,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 6,
				 .vertices = {extended_nvlstree_simple,
					      {1073741824, 256},
					      {787480576, 128},
					      {1073741824, 128},
					      {1073741824, 40},
					      {TUNER_MAX_SIZE, 32}}}};

			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
	} else if (nRanks == nNodes) {
		{
			collType = ncclFuncAllReduce;
			nccl_ofi_tuner_point_t extended_tree_ll128 =
				extend_region((nccl_ofi_tuner_point_t){9999360, 64},
					      (nccl_ofi_tuner_point_t){119477248, 128},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
			nccl_ofi_tuner_point_t extended_ring_ll128 =
				extend_region((nccl_ofi_tuner_point_t){4736000, 2},
					      (nccl_ofi_tuner_point_t){269484032, 128},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {
				{.algorithm = NCCL_ALGO_TREE,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 5,
				 .vertices = {{0, 16}, {2367488, 16}, {9999360, 64}, {119477248, 128}, extended_tree_ll128}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_LL128,
				 .num_vertices = 9,
				 .vertices = {{0, 2},
					      {4736000, 2},
					      {269484032, 128},
					      extended_ring_ll128,
					      extended_tree_ll128,
					      {119477248, 128},
					      {9999360, 64},
					      {2367488, 16},
					      {0, 16}}},
				{.algorithm = NCCL_ALGO_RING,
				 .protocol = NCCL_PROTO_SIMPLE,
				 .num_vertices = 4,
				 .vertices = {{4736000, 2}, {TUNER_MAX_SIZE, 2}, extended_ring_ll128, {269484032, 128}}}};

			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
		{
			collType = ncclFuncAllGather;
			nccl_ofi_tuner_point_t extended_ring_simple =
				extend_region((nccl_ofi_tuner_point_t){4194304, 2},
					      (nccl_ofi_tuner_point_t){8589934592, 2048},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {{.algorithm = NCCL_ALGO_RING,
								    .protocol = NCCL_PROTO_SIMPLE,
								    .num_vertices = 4,
								    .vertices = {{4194304, 2},
										 {TUNER_MAX_SIZE, 2},
										 extended_ring_simple,
										 {8589934592, 2048}}}};

			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
		{
			collType = ncclFuncReduceScatter;
			nccl_ofi_tuner_point_t extended_ring_simple =
				extend_region((nccl_ofi_tuner_point_t){8388608, 2},
					      (nccl_ofi_tuner_point_t){4294967296, 1024},
					      (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});

			const nccl_ofi_tuner_region_t regions[] = {{.algorithm = NCCL_ALGO_RING,
								    .protocol = NCCL_PROTO_SIMPLE,
								    .num_vertices = 4,
								    .vertices = {{8388608, 2},
										 {TUNER_MAX_SIZE, 2},
										 extended_ring_simple,
										 {4294967296, 1024}}}};

			ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions);
			if (ret != ncclSuccess) {
				goto exit;
			}
		}
	} else {
		/* Fall back to NCCL's tuner, so no regions */
	}

exit:
	return ret;
}