void cpuinfo_arm_mach_init()

in src/arm/mach/init.c [255:619]


void cpuinfo_arm_mach_init(void) {
	struct cpuinfo_processor* processors = NULL;
	struct cpuinfo_core* cores = NULL;
	struct cpuinfo_cluster* clusters = NULL;
	struct cpuinfo_package* packages = NULL;
	struct cpuinfo_uarch_info* uarchs = NULL;
	struct cpuinfo_cache* l1i = NULL;
	struct cpuinfo_cache* l1d = NULL;
	struct cpuinfo_cache* l2 = NULL;
	struct cpuinfo_cache* l3 = NULL;

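	/* Detect the Mach topology (threads, cores, packages) and allocate one descriptor per entity */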
	struct cpuinfo_mach_topology mach_topology = cpuinfo_mach_detect_topology();
	processors = calloc(mach_topology.threads, sizeof(struct cpuinfo_processor));
	if (processors == NULL) {
		cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" logical processors",
			mach_topology.threads * sizeof(struct cpuinfo_processor), mach_topology.threads);
		goto cleanup;
	}
	cores = calloc(mach_topology.cores, sizeof(struct cpuinfo_core));
	if (cores == NULL) {
		cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" cores",
			mach_topology.cores * sizeof(struct cpuinfo_core), mach_topology.cores);
		goto cleanup;
	}
	packages = calloc(mach_topology.packages, sizeof(struct cpuinfo_package));
	if (packages == NULL) {
		cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" packages",
			mach_topology.packages * sizeof(struct cpuinfo_package), mach_topology.packages);
		goto cleanup;
	}

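	/* Mach reports only totals, so assume threads and cores are distributed uniformly across packages */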
	const uint32_t threads_per_core = mach_topology.threads / mach_topology.cores;
	const uint32_t threads_per_package = mach_topology.threads / mach_topology.packages;
	const uint32_t cores_per_package = mach_topology.cores / mach_topology.packages;

	for (uint32_t i = 0; i < mach_topology.packages; i++) {
		packages[i] = (struct cpuinfo_package) {
			.processor_start = i * threads_per_package,
			.processor_count = threads_per_package,
			.core_start = i * cores_per_package,
			.core_count = cores_per_package,
		};
		decode_package_name(packages[i].name);
	}

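	/* Identify the CPU via the hw.cpufamily/hw.cputype/hw.cpusubtype sysctls and infer baseline ISA features */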
	const uint32_t cpu_family = get_sys_info_by_name("hw.cpufamily");
	const uint32_t cpu_type = get_sys_info_by_name("hw.cputype");
	const uint32_t cpu_subtype = get_sys_info_by_name("hw.cpusubtype");
	switch (cpu_type) {
		case CPU_TYPE_ARM64:
			cpuinfo_isa.aes = true;
			cpuinfo_isa.sha1 = true;
			cpuinfo_isa.sha2 = true;
			cpuinfo_isa.pmull = true;
			cpuinfo_isa.crc32 = true;
			break;
#if CPUINFO_ARCH_ARM
		case CPU_TYPE_ARM:
			switch (cpu_subtype) {
				case CPU_SUBTYPE_ARM_V8:
					cpuinfo_isa.armv8 = true;
					cpuinfo_isa.aes = true;
					cpuinfo_isa.sha1 = true;
					cpuinfo_isa.sha2 = true;
					cpuinfo_isa.pmull = true;
					cpuinfo_isa.crc32 = true;
					/* Fall-through to add ARMv7S features */
				case CPU_SUBTYPE_ARM_V7S:
				case CPU_SUBTYPE_ARM_V7K:
					cpuinfo_isa.fma = true;
					/* Fall-through to add ARMv7F features */
				case CPU_SUBTYPE_ARM_V7F:
					cpuinfo_isa.armv7mp = true;
					cpuinfo_isa.fp16 = true;
					/* Fall-through to add ARMv7 features */
				case CPU_SUBTYPE_ARM_V7:
					break;
				default:
					break;
			}
			break;
#endif
	}
	/*
	 * Support for ARMv8.1 Atomics & FP16 arithmetic instructions is supposed to be detected via
	 * sysctlbyname calls with "hw.optional.armv8_1_atomics" and "hw.optional.neon_fp16" arguments
	 * (see https://devstreaming-cdn.apple.com/videos/wwdc/2018/409t8zw7rumablsh/409/409_whats_new_in_llvm.pdf),
	 * but on new iOS versions these calls just fail with EPERM.
	 *
	 * Thus, we whitelist CPUs known to support these instructions.
	 */
	switch (cpu_family) {
		case CPUFAMILY_ARM_MONSOON_MISTRAL:
		case CPUFAMILY_ARM_VORTEX_TEMPEST:
		case CPUFAMILY_ARM_LIGHTNING_THUNDER:
		case CPUFAMILY_ARM_FIRESTORM_ICESTORM:
			#if CPUINFO_ARCH_ARM64
				cpuinfo_isa.atomics = true;
			#endif
			cpuinfo_isa.fp16arith = true;
	}

	/*
	 * There does not yet seem to exist an OS mechanism to detect support for
	 * ARMv8.2 optional dot-product instructions, so we currently whitelist CPUs
	 * known to support these instructions.
	 */
	switch (cpu_family) {
		case CPUFAMILY_ARM_LIGHTNING_THUNDER:
		case CPUFAMILY_ARM_FIRESTORM_ICESTORM:
			cpuinfo_isa.dot = true;
	}

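	/* Populate cores; cores sharing a uarch are assumed adjacent, so each uarch change starts a new cluster */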
	uint32_t num_clusters = 1;
	for (uint32_t i = 0; i < mach_topology.cores; i++) {
		cores[i] = (struct cpuinfo_core) {
			.processor_start = i * threads_per_core,
			.processor_count = threads_per_core,
			.core_id = i % cores_per_package,
			.package = packages + i / cores_per_package,
			.vendor = cpuinfo_vendor_apple,
			.uarch = decode_uarch(cpu_family, cpu_subtype, i, mach_topology.cores),
		};
		if (i != 0 && cores[i].uarch != cores[i - 1].uarch) {
			num_clusters++;
		}
	}
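	/* Map each logical processor to its core and package by index arithmetic */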
	for (uint32_t i = 0; i < mach_topology.threads; i++) {
		const uint32_t smt_id = i % threads_per_core;
		const uint32_t core_id = i / threads_per_core;
		const uint32_t package_id = i / threads_per_package;

		processors[i].smt_id = smt_id;
		processors[i].core = &cores[core_id];
		processors[i].package = &packages[package_id];
	}

	clusters = calloc(num_clusters, sizeof(struct cpuinfo_cluster));
	if (clusters == NULL) {
		cpuinfo_log_error(
			"failed to allocate %zu bytes for descriptions of %"PRIu32" clusters",
			num_clusters * sizeof(struct cpuinfo_cluster), num_clusters);
		goto cleanup;
	}
	uarchs = calloc(num_clusters, sizeof(struct cpuinfo_uarch_info));
	if (uarchs == NULL) {
		cpuinfo_log_error(
			"failed to allocate %zu bytes for descriptions of %"PRIu32" uarchs",
			num_clusters * sizeof(struct cpuinfo_uarch_info), num_clusters);
		goto cleanup;
	}
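	/* cluster_idx starts at UINT32_MAX so that the first iteration wraps it to 0 */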
	uint32_t cluster_idx = UINT32_MAX;
	for (uint32_t i = 0; i < mach_topology.cores; i++) {
		if (i == 0 || cores[i].uarch != cores[i - 1].uarch) {
			cluster_idx++;
			uarchs[cluster_idx] = (struct cpuinfo_uarch_info) {
				.uarch = cores[i].uarch,
				.processor_count = 1,
				.core_count = 1,
			};
			clusters[cluster_idx] = (struct cpuinfo_cluster) {
				.processor_start = i * threads_per_core,
				.processor_count = 1,
				.core_start = i,
				.core_count = 1,
				.cluster_id = cluster_idx,
				.package = cores[i].package,
				.vendor = cores[i].vendor,
				.uarch = cores[i].uarch,
			};
		} else {
			uarchs[cluster_idx].processor_count++;
			uarchs[cluster_idx].core_count++;
			clusters[cluster_idx].processor_count++;
			clusters[cluster_idx].core_count++;
		}
		cores[i].cluster = &clusters[cluster_idx];
	}

	for (uint32_t i = 0; i < mach_topology.threads; i++) {
		const uint32_t core_id = i / threads_per_core;
		processors[i].cluster = cores[core_id].cluster;
	}

	for (uint32_t i = 0; i < mach_topology.packages; i++) {
		packages[i].cluster_start = 0;
		packages[i].cluster_count = num_clusters;
	}

	const uint32_t cacheline_size = get_sys_info(HW_CACHELINE, "HW_CACHELINE");
	const uint32_t l1d_cache_size = get_sys_info(HW_L1DCACHESIZE, "HW_L1DCACHESIZE");
	const uint32_t l1i_cache_size = get_sys_info(HW_L1ICACHESIZE, "HW_L1ICACHESIZE");
	const uint32_t l2_cache_size = get_sys_info(HW_L2CACHESIZE, "HW_L2CACHESIZE");
	const uint32_t l3_cache_size = get_sys_info(HW_L3CACHESIZE, "HW_L3CACHESIZE");
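	/* Mach does not report associativity, partitions, or sharing; the values below are fixed assumptions */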
	const uint32_t l1_cache_associativity = 4;
	const uint32_t l2_cache_associativity = 8;
	const uint32_t l3_cache_associativity = 16;
	const uint32_t cache_partitions = 1;
	const uint32_t cache_flags = 0;

	uint32_t threads_per_l1 = 0, l1_count = 0;
	if (l1i_cache_size != 0 || l1d_cache_size != 0) {
		/* Assume L1 caches are private to each core */
		threads_per_l1 = 1;
		l1_count = mach_topology.threads / threads_per_l1;
		cpuinfo_log_debug("detected %"PRIu32" L1 caches", l1_count);
	}

	uint32_t threads_per_l2 = 0, l2_count = 0;
	if (l2_cache_size != 0) {
		/* Assume L2 cache is shared between all cores */
		threads_per_l2 = mach_topology.cores;
		l2_count = 1;
		cpuinfo_log_debug("detected %"PRIu32" L2 caches", l2_count);
	}

	uint32_t threads_per_l3 = 0, l3_count = 0;
	if (l3_cache_size != 0) {
		/* Assume L3 cache is shared between all cores */
		threads_per_l3 = mach_topology.cores;
		l3_count = 1;
		cpuinfo_log_debug("detected %"PRIu32" L3 caches", l3_count);
	}

	if (l1i_cache_size != 0) {
		l1i = calloc(l1_count, sizeof(struct cpuinfo_cache));
		if (l1i == NULL) {
			cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L1I caches",
				l1_count * sizeof(struct cpuinfo_cache), l1_count);
			goto cleanup;
		}
		for (uint32_t c = 0; c < l1_count; c++) {
			l1i[c] = (struct cpuinfo_cache) {
				.size            = l1i_cache_size,
				.associativity   = l1_cache_associativity,
				.sets            = l1i_cache_size / (l1_cache_associativity * cacheline_size),
				.partitions      = cache_partitions,
				.line_size       = cacheline_size,
				.flags           = cache_flags,
				.processor_start = c * threads_per_l1,
				.processor_count = threads_per_l1,
			};
		}
		for (uint32_t t = 0; t < mach_topology.threads; t++) {
			processors[t].cache.l1i = &l1i[t / threads_per_l1];
		}
	}

	if (l1d_cache_size != 0) {
		l1d = calloc(l1_count, sizeof(struct cpuinfo_cache));
		if (l1d == NULL) {
			cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L1D caches",
				l1_count * sizeof(struct cpuinfo_cache), l1_count);
			goto cleanup;
		}
		for (uint32_t c = 0; c < l1_count; c++) {
			l1d[c] = (struct cpuinfo_cache) {
				.size            = l1d_cache_size,
				.associativity   = l1_cache_associativity,
				.sets            = l1d_cache_size / (l1_cache_associativity * cacheline_size),
				.partitions      = cache_partitions,
				.line_size       = cacheline_size,
				.flags           = cache_flags,
				.processor_start = c * threads_per_l1,
				.processor_count = threads_per_l1,
			};
		}
		for (uint32_t t = 0; t < mach_topology.threads; t++) {
			processors[t].cache.l1d = &l1d[t / threads_per_l1];
		}
	}

	if (l2_count != 0) {
		l2 = calloc(l2_count, sizeof(struct cpuinfo_cache));
		if (l2 == NULL) {
			cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L2 caches",
				l2_count * sizeof(struct cpuinfo_cache), l2_count);
			goto cleanup;
		}
		for (uint32_t c = 0; c < l2_count; c++) {
			l2[c] = (struct cpuinfo_cache) {
				.size            = l2_cache_size,
				.associativity   = l2_cache_associativity,
				.sets            = l2_cache_size / (l2_cache_associativity * cacheline_size),
				.partitions      = cache_partitions,
				.line_size       = cacheline_size,
				.flags           = cache_flags,
				.processor_start = c * threads_per_l2,
				.processor_count = threads_per_l2,
			};
		}
		for (uint32_t t = 0; t < mach_topology.threads; t++) {
			processors[t].cache.l2 = &l2[0];
		}
	}

	if (l3_count != 0) {
		l3 = calloc(l3_count, sizeof(struct cpuinfo_cache));
		if (l3 == NULL) {
			cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L3 caches",
				l3_count * sizeof(struct cpuinfo_cache), l3_count);
			goto cleanup;
		}
		for (uint32_t c = 0; c < l3_count; c++) {
			l3[c] = (struct cpuinfo_cache) {
				.size            = l3_cache_size,
				.associativity   = l3_cache_associativity,
				.sets            = l3_cache_size / (l3_cache_associativity * cacheline_size),
				.partitions      = cache_partitions,
				.line_size       = cacheline_size,
				.flags           = cache_flags,
				.processor_start = c * threads_per_l3,
				.processor_count = threads_per_l3,
			};
		}
		for (uint32_t t = 0; t < mach_topology.threads; t++) {
			processors[t].cache.l3 = &l3[0];
		}
	}

	/* Commit changes */
	cpuinfo_processors = processors;
	cpuinfo_cores = cores;
	cpuinfo_clusters = clusters;
	cpuinfo_packages = packages;
	cpuinfo_uarchs = uarchs;
	cpuinfo_cache[cpuinfo_cache_level_1i] = l1i;
	cpuinfo_cache[cpuinfo_cache_level_1d] = l1d;
	cpuinfo_cache[cpuinfo_cache_level_2]  = l2;
	cpuinfo_cache[cpuinfo_cache_level_3]  = l3;

	cpuinfo_processors_count = mach_topology.threads;
	cpuinfo_cores_count = mach_topology.cores;
	cpuinfo_clusters_count = num_clusters;
	cpuinfo_packages_count = mach_topology.packages;
	cpuinfo_uarchs_count = num_clusters;
	cpuinfo_cache_count[cpuinfo_cache_level_1i] = l1_count;
	cpuinfo_cache_count[cpuinfo_cache_level_1d] = l1_count;
	cpuinfo_cache_count[cpuinfo_cache_level_2]  = l2_count;
	cpuinfo_cache_count[cpuinfo_cache_level_3]  = l3_count;
	cpuinfo_max_cache_size = cpuinfo_compute_max_cache_size(&processors[0]);

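	/* Full memory barrier: publish the descriptor writes above before flagging initialization */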
	__sync_synchronize();

	cpuinfo_is_initialized = true;

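	/* Ownership has moved to the globals; clear the locals so the cleanup path does not free them */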
	processors = NULL;
	cores = NULL;
	clusters = NULL;
	packages = NULL;
	uarchs = NULL;
	l1i = l1d = l2 = l3 = NULL;

cleanup:
	free(processors);
	free(cores);
	free(clusters);
	free(packages);
	free(uarchs);
	free(l1i);
	free(l1d);
	free(l2);
	free(l3);
}
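
The state committed above is what the public accessors in cpuinfo.h read. For context, a minimal consumer might look like the following sketch; it assumes the public cpuinfo.h API, with cpuinfo_initialize() dispatching to cpuinfo_arm_mach_init() on Mach/ARM builds, and the per-cluster printout is purely illustrative.

#include <stdint.h>
#include <stdio.h>
#include <cpuinfo.h>

int main(void) {
	/* Runs the platform initializer (cpuinfo_arm_mach_init() on Mach/ARM) */
	if (!cpuinfo_initialize()) {
		fprintf(stderr, "cpuinfo initialization failed\n");
		return 1;
	}
	printf("packages=%u clusters=%u cores=%u threads=%u\n",
		cpuinfo_get_packages_count(),
		cpuinfo_get_clusters_count(),
		cpuinfo_get_cores_count(),
		cpuinfo_get_processors_count());
	/* Clusters expose the per-uarch grouping computed in the loop above */
	for (uint32_t i = 0; i < cpuinfo_get_clusters_count(); i++) {
		const struct cpuinfo_cluster* cluster = cpuinfo_get_cluster(i);
		printf("cluster %u: %u cores\n", cluster->cluster_id, cluster->core_count);
	}
	cpuinfo_deinitialize();
	return 0;
}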