in kernels/fmha/softmax.h [219:234]
// Overwrites every elt_[row][col] with apply_exp2_(element, row maximum),
// where both operands are first expressed in base-2 units.
//
// Rationale for base 2: exp(x - max) is evaluated as
// exp2(x * log2(e) - max * log2(e)). Written this way the multiply and the
// subtract can be fused by the compiler into a single ffma instead of being
// issued as a separate fmul and fadd.
//
// max: one maximum per first index of elt_ (MMAS_M * 2 entries).
//
// max_in_base2 / elt_in_base2 are declared outside this block
// (NOTE(review): presumably compile-time flags -- confirm). When a flag is
// set, the corresponding value is used as-is, without the log2(e) rescaling.
inline __device__ void apply_exp(const float (&max)[MMAS_M * 2]) {
    constexpr float kLog2e = M_LOG2E;

    #pragma unroll
    for( int row = 0; row < MMAS_M * 2; ++row ) {
        // Row maximum in base-2 units; computed once and shared by the whole row.
        float row_max_base2 = max[row];
        if( !max_in_base2 ) {
            row_max_base2 *= kLog2e;
        }

        #pragma unroll
        for( int col = 0; col < MMAS_N * 4; ++col ) {
            // Bring the element to base-2 units unless it already is.
            float scaled_elt = elt_[row][col];
            if( !elt_in_base2 ) {
                scaled_elt *= kLog2e;
            }
            // Base-2 counterpart of the older apply_exp_(elt_[row][col], max[row]) form.
            elt_[row][col] = apply_exp2_(scaled_elt, row_max_base2);
        }
    }
}