in erts/emulator/asmjit/x86/x86features.cpp [134:443]
// Detects the host CPU by issuing CPUID queries (and XGETBV when the CPU and
// OS both advertise XSAVE support) and populates `cpu` with:
//
//   - architecture / sub-architecture,
//   - vendor, family, model, stepping, brand id, and processor type,
//   - maximum logical processor count and cache line size,
//   - the brand string (simplified by `simplifyCpuBrand()`),
//   - every feature flag this function knows how to decode.
//
// `cpu` is reset first, so any previous content is discarded. Features whose
// register state must be enabled by the OS (AVX, AVX-512, AMX, MPX) are only
// reported when the corresponding XCR0 bits are set.
ASMJIT_FAVOR_SIZE void detectCpu(CpuInfo& cpu) noexcept {
  using Support::bitTest;

  cpuid_t regs;
  // Stays zeroed unless XGETBV is usable, which makes all XCR0-dependent
  // checks below fail safely.
  xgetbv_t xcr0 { 0, 0 };
  Features& features = cpu._features.as<Features>();

  cpu.reset();
  cpu._arch = Environment::kArchHost;
  cpu._subArch = Environment::kSubArchUnknown;
  cpu._reserved = 0;
  cpu._maxLogicalProcessors = 1;
  features.add(Features::kI486);

  // --------------------------------------------------------------------------
  // [CPUID EAX=0]
  // --------------------------------------------------------------------------

  // Get vendor string/id. EAX holds the highest supported standard leaf.
  cpuidQuery(&regs, 0x0);

  uint32_t maxId = regs.eax;
  uint32_t maxSubLeafId_0x7 = 0;

  // The vendor string is stored in EBX, EDX, ECX (in that order).
  simplifyCpuVendor(cpu, regs.ebx, regs.edx, regs.ecx);

  // --------------------------------------------------------------------------
  // [CPUID EAX=1]
  // --------------------------------------------------------------------------

  if (maxId >= 0x1) {
    // Get feature flags in ECX/EDX and family/model in EAX.
    cpuidQuery(&regs, 0x1);

    // Fill family and model fields.
    uint32_t modelId = (regs.eax >> 4) & 0x0F;
    uint32_t familyId = (regs.eax >> 8) & 0x0F;

    // Use extended family and model fields. The extended model is only
    // meaningful for families 0x06 and 0x0F, the extended family only for
    // family 0x0F.
    if (familyId == 0x06u || familyId == 0x0Fu)
      modelId += (((regs.eax >> 16) & 0x0Fu) << 4);

    if (familyId == 0x0Fu)
      familyId += ((regs.eax >> 20) & 0xFFu);

    cpu._modelId = modelId;
    cpu._familyId = familyId;
    cpu._brandId = ((regs.ebx) & 0xFF);
    cpu._processorType = ((regs.eax >> 12) & 0x03);
    cpu._maxLogicalProcessors = ((regs.ebx >> 16) & 0xFF);
    cpu._stepping = ((regs.eax) & 0x0F);
    // EBX[15:8] is scaled by 8 to obtain the size in bytes.
    cpu._cacheLineSize = ((regs.ebx >> 8) & 0xFF) * 8;

    if (bitTest(regs.ecx, 0)) features.add(Features::kSSE3);
    if (bitTest(regs.ecx, 1)) features.add(Features::kPCLMULQDQ);
    if (bitTest(regs.ecx, 3)) features.add(Features::kMONITOR);
    if (bitTest(regs.ecx, 5)) features.add(Features::kVMX);
    if (bitTest(regs.ecx, 6)) features.add(Features::kSMX);
    if (bitTest(regs.ecx, 9)) features.add(Features::kSSSE3);
    if (bitTest(regs.ecx, 13)) features.add(Features::kCMPXCHG16B);
    if (bitTest(regs.ecx, 19)) features.add(Features::kSSE4_1);
    if (bitTest(regs.ecx, 20)) features.add(Features::kSSE4_2);
    if (bitTest(regs.ecx, 22)) features.add(Features::kMOVBE);
    if (bitTest(regs.ecx, 23)) features.add(Features::kPOPCNT);
    if (bitTest(regs.ecx, 25)) features.add(Features::kAESNI);
    if (bitTest(regs.ecx, 26)) features.add(Features::kXSAVE);
    if (bitTest(regs.ecx, 27)) features.add(Features::kOSXSAVE);
    if (bitTest(regs.ecx, 30)) features.add(Features::kRDRAND);
    if (bitTest(regs.edx, 0)) features.add(Features::kFPU);
    if (bitTest(regs.edx, 4)) features.add(Features::kRDTSC);
    if (bitTest(regs.edx, 5)) features.add(Features::kMSR);
    if (bitTest(regs.edx, 8)) features.add(Features::kCMPXCHG8B);
    if (bitTest(regs.edx, 15)) features.add(Features::kCMOV);
    if (bitTest(regs.edx, 19)) features.add(Features::kCLFLUSH);
    if (bitTest(regs.edx, 23)) features.add(Features::kMMX);
    if (bitTest(regs.edx, 24)) features.add(Features::kFXSR);
    if (bitTest(regs.edx, 25)) features.add(Features::kSSE);
    // SSE2 implies SSE.
    if (bitTest(regs.edx, 26)) features.add(Features::kSSE, Features::kSSE2);
    if (bitTest(regs.edx, 28)) features.add(Features::kMT);

    // Get the content of XCR0 if supported by the CPU and enabled by the OS.
    if (features.hasXSAVE() && features.hasOSXSAVE()) {
      xgetbvQuery(&xcr0, 0);
    }

    // Detect AVX+.
    if (bitTest(regs.ecx, 28)) {
      // - XCR0[2:1] == 11b
      //   XMM & YMM states need to be enabled by OS.
      if ((xcr0.eax & 0x00000006u) == 0x00000006u) {
        features.add(Features::kAVX);

        // FMA and F16C are only reported together with AVX.
        if (bitTest(regs.ecx, 12)) features.add(Features::kFMA);
        if (bitTest(regs.ecx, 29)) features.add(Features::kF16C);
      }
    }
  }

  // - XCR0[18:17] == 11b - AMX states need to be enabled by OS.
  constexpr uint32_t kXCR0_AMX_Bits = 0x3u << 17;
  bool amxEnabledByOS = (xcr0.eax & kXCR0_AMX_Bits) == kXCR0_AMX_Bits;

#if defined(__APPLE__)
  // Apple platform provides on-demand AVX512 support. When an AVX512 instruction is used
  // the first time it results in #UD, which would cause the thread being promoted to use
  // AVX512 support by the OS in addition to enabling the necessary bits in XCR0 register.
  bool avx512EnabledByOS = true;
#else
  // - XCR0[2:1] == 11b  - XMM/YMM states need to be enabled by OS.
  // - XCR0[7:5] == 111b - Opmask, upper 256 bits of ZMM0-ZMM15, and ZMM16-ZMM31
  //                       states need to be enabled by OS.
  constexpr uint32_t kXCR0_AVX512_Bits = (0x3u << 1) | (0x7u << 5);
  bool avx512EnabledByOS = (xcr0.eax & kXCR0_AVX512_Bits) == kXCR0_AVX512_Bits;
#endif

  // --------------------------------------------------------------------------
  // [CPUID EAX=7 ECX=0]
  // --------------------------------------------------------------------------

  // Detect new features if the processor supports CPUID-07.
  // MPX additionally needs confirmation from leaf 0xD, so only remember the
  // CPUID-07 bit here.
  bool maybeMPX = false;

  if (maxId >= 0x7) {
    cpuidQuery(&regs, 0x7);

    maybeMPX = bitTest(regs.ebx, 14);
    // EAX holds the highest supported sub-leaf of leaf 7.
    maxSubLeafId_0x7 = regs.eax;

    if (bitTest(regs.ebx, 0)) features.add(Features::kFSGSBASE);
    if (bitTest(regs.ebx, 3)) features.add(Features::kBMI);
    if (bitTest(regs.ebx, 4)) features.add(Features::kHLE);
    if (bitTest(regs.ebx, 7)) features.add(Features::kSMEP);
    if (bitTest(regs.ebx, 8)) features.add(Features::kBMI2);
    if (bitTest(regs.ebx, 9)) features.add(Features::kERMS);
    if (bitTest(regs.ebx, 11)) features.add(Features::kRTM);
    if (bitTest(regs.ebx, 18)) features.add(Features::kRDSEED);
    if (bitTest(regs.ebx, 19)) features.add(Features::kADX);
    if (bitTest(regs.ebx, 20)) features.add(Features::kSMAP);
    if (bitTest(regs.ebx, 23)) features.add(Features::kCLFLUSHOPT);
    if (bitTest(regs.ebx, 24)) features.add(Features::kCLWB);
    if (bitTest(regs.ebx, 29)) features.add(Features::kSHA);
    if (bitTest(regs.ecx, 0)) features.add(Features::kPREFETCHWT1);
    if (bitTest(regs.ecx, 4)) features.add(Features::kOSPKE);
    if (bitTest(regs.ecx, 5)) features.add(Features::kWAITPKG);
    if (bitTest(regs.ecx, 8)) features.add(Features::kGFNI);
    if (bitTest(regs.ecx, 9)) features.add(Features::kVAES);
    if (bitTest(regs.ecx, 10)) features.add(Features::kVPCLMULQDQ);
    if (bitTest(regs.ecx, 22)) features.add(Features::kRDPID);
    if (bitTest(regs.ecx, 25)) features.add(Features::kCLDEMOTE);
    if (bitTest(regs.ecx, 27)) features.add(Features::kMOVDIRI);
    if (bitTest(regs.ecx, 28)) features.add(Features::kMOVDIR64B);
    if (bitTest(regs.ecx, 29)) features.add(Features::kENQCMD);
    if (bitTest(regs.edx, 14)) features.add(Features::kSERIALIZE);
    if (bitTest(regs.edx, 16)) features.add(Features::kTSXLDTRK);
    if (bitTest(regs.edx, 18)) features.add(Features::kPCONFIG);

    // Detect 'TSX' - Requires at least one of `HLE` and `RTM` features.
    if (features.hasHLE() || features.hasRTM())
      features.add(Features::kTSX);

    // Detect 'AVX2' - Requires AVX as well.
    if (bitTest(regs.ebx, 5) && features.hasAVX())
      features.add(Features::kAVX2);

    // Detect 'AMX'.
    if (amxEnabledByOS) {
      if (bitTest(regs.edx, 22)) features.add(Features::kAMX_BF16);
      if (bitTest(regs.edx, 24)) features.add(Features::kAMX_TILE);
      if (bitTest(regs.edx, 25)) features.add(Features::kAMX_INT8);
    }

    // Detect 'AVX_512' - all extensions require the AVX512_F foundation.
    if (avx512EnabledByOS && bitTest(regs.ebx, 16)) {
      features.add(Features::kAVX512_F);

      if (bitTest(regs.ebx, 17)) features.add(Features::kAVX512_DQ);
      if (bitTest(regs.ebx, 21)) features.add(Features::kAVX512_IFMA);
      if (bitTest(regs.ebx, 26)) features.add(Features::kAVX512_PFI);
      if (bitTest(regs.ebx, 27)) features.add(Features::kAVX512_ERI);
      if (bitTest(regs.ebx, 28)) features.add(Features::kAVX512_CDI);
      if (bitTest(regs.ebx, 30)) features.add(Features::kAVX512_BW);
      if (bitTest(regs.ebx, 31)) features.add(Features::kAVX512_VL);
      if (bitTest(regs.ecx, 1)) features.add(Features::kAVX512_VBMI);
      if (bitTest(regs.ecx, 6)) features.add(Features::kAVX512_VBMI2);
      if (bitTest(regs.ecx, 11)) features.add(Features::kAVX512_VNNI);
      if (bitTest(regs.ecx, 12)) features.add(Features::kAVX512_BITALG);
      if (bitTest(regs.ecx, 14)) features.add(Features::kAVX512_VPOPCNTDQ);
      if (bitTest(regs.edx, 2)) features.add(Features::kAVX512_4VNNIW);
      if (bitTest(regs.edx, 3)) features.add(Features::kAVX512_4FMAPS);
      if (bitTest(regs.edx, 8)) features.add(Features::kAVX512_VP2INTERSECT);
    }
  }

  // --------------------------------------------------------------------------
  // [CPUID EAX=7 ECX=1]
  // --------------------------------------------------------------------------

  // Only queried when AVX512_F was detected, as the single feature decoded
  // here is an AVX-512 extension.
  if (features.hasAVX512_F() && maxSubLeafId_0x7 >= 1) {
    cpuidQuery(&regs, 0x7, 1);

    if (bitTest(regs.eax, 5)) features.add(Features::kAVX512_BF16);
  }

  // --------------------------------------------------------------------------
  // [CPUID EAX=0xD ECX=0/1]
  // --------------------------------------------------------------------------

  if (maxId >= 0xD) {
    cpuidQuery(&regs, 0xD, 0);

    // Both CPUID result and XCR0 has to be enabled to have support for MPX.
    if (((regs.eax & xcr0.eax) & 0x00000018u) == 0x00000018u && maybeMPX)
      features.add(Features::kMPX);

    cpuidQuery(&regs, 0xD, 1);

    if (bitTest(regs.eax, 0)) features.add(Features::kXSAVEOPT);
    if (bitTest(regs.eax, 1)) features.add(Features::kXSAVEC);
    if (bitTest(regs.eax, 3)) features.add(Features::kXSAVES);
  }

  // --------------------------------------------------------------------------
  // [CPUID EAX=0x14 ECX=0]
  // --------------------------------------------------------------------------

  // PTWRITE is enumerated by the Intel Processor Trace leaf, which is 14h
  // (hexadecimal, i.e. 0x14), not leaf 0xE.
  if (maxId >= 0x14) {
    cpuidQuery(&regs, 0x14, 0);

    if (bitTest(regs.ebx, 4)) features.add(Features::kPTWRITE);
  }

  // --------------------------------------------------------------------------
  // [CPUID EAX=0x80000000...maxId]
  // --------------------------------------------------------------------------

  // `maxId` is re-purposed for the extended leaf range. It starts at the first
  // extended leaf and is updated once that leaf reports the real maximum.
  maxId = 0x80000000u;
  uint32_t i = maxId;

  // The highest EAX that we understand.
  constexpr uint32_t kHighestProcessedEAX = 0x8000001Fu;

  // Several CPUID calls are required to get the whole brand string. It's easy
  // to copy one DWORD at a time instead of performing a byte copy.
  uint32_t* brand = cpu._brand.u32;

  do {
    cpuidQuery(&regs, i);

    switch (i) {
      case 0x80000000u:
        // EAX reports the highest extended leaf; never go past what we decode.
        maxId = Support::min<uint32_t>(regs.eax, kHighestProcessedEAX);
        break;

      case 0x80000001u:
        if (bitTest(regs.ecx, 0)) features.add(Features::kLAHFSAHF);
        if (bitTest(regs.ecx, 2)) features.add(Features::kSVM);
        if (bitTest(regs.ecx, 5)) features.add(Features::kLZCNT);
        if (bitTest(regs.ecx, 6)) features.add(Features::kSSE4A);
        if (bitTest(regs.ecx, 7)) features.add(Features::kMSSE);
        if (bitTest(regs.ecx, 8)) features.add(Features::kPREFETCHW);
        if (bitTest(regs.ecx, 12)) features.add(Features::kSKINIT);
        if (bitTest(regs.ecx, 15)) features.add(Features::kLWP);
        if (bitTest(regs.ecx, 21)) features.add(Features::kTBM);
        if (bitTest(regs.ecx, 29)) features.add(Features::kMONITORX);
        if (bitTest(regs.edx, 20)) features.add(Features::kNX);
        if (bitTest(regs.edx, 21)) features.add(Features::kFXSROPT);
        if (bitTest(regs.edx, 22)) features.add(Features::kMMX2);
        if (bitTest(regs.edx, 27)) features.add(Features::kRDTSCP);
        // PREFETCHW is also reported when EDX bit 29 or bit 31 is set.
        if (bitTest(regs.edx, 29)) features.add(Features::kPREFETCHW);
        if (bitTest(regs.edx, 30)) features.add(Features::k3DNOW2, Features::kMMX2);
        if (bitTest(regs.edx, 31)) features.add(Features::kPREFETCHW);

        // XOP and FMA4 are only reported together with AVX.
        if (cpu.hasFeature(Features::kAVX)) {
          if (bitTest(regs.ecx, 11)) features.add(Features::kXOP);
          if (bitTest(regs.ecx, 16)) features.add(Features::kFMA4);
        }

        // These seem to be only supported by AMD.
        if (cpu.isVendor("AMD")) {
          if (bitTest(regs.ecx, 4)) features.add(Features::kALTMOVCR8);
        }
        break;

      case 0x80000002u:
      case 0x80000003u:
      case 0x80000004u:
        // Each of these three leaves yields four DWORDs of the brand string.
        *brand++ = regs.eax;
        *brand++ = regs.ebx;
        *brand++ = regs.ecx;
        *brand++ = regs.edx;

        // Go directly to the next one we are interested in (the loop
        // increment adds the missing one).
        if (i == 0x80000004u) i = 0x80000008u - 1;
        break;

      case 0x80000008u:
        if (bitTest(regs.ebx, 0)) features.add(Features::kCLZERO);
        // RDPRU is EBX bit 4; bit 0 is CLZERO.
        if (bitTest(regs.ebx, 4)) features.add(Features::kRDPRU);
        if (bitTest(regs.ebx, 8)) features.add(Features::kMCOMMIT);
        if (bitTest(regs.ebx, 9)) features.add(Features::kWBNOINVD);

        // Go directly to the next one we are interested in.
        i = 0x8000001Fu - 1;
        break;

      case 0x8000001Fu:
        if (bitTest(regs.eax, 4)) features.add(Features::kSNP);
        break;
    }
  } while (++i <= maxId);

  // Simplify CPU brand string a bit by removing some unnecessary spaces.
  simplifyCpuBrand(cpu._brand.str);
}