in candle-flash-attn-v1/kernels/fmha/softmax.h [238:252]
    // Rewrites every entry of elt_ in place as
    //   apply_exp2_(elt_[m][n] * scale_ * log2(e), max[m] * k)
    // where k is scale_ * log2(e) when scale_max is true and log2(e) otherwise.
    //
    // max    : one value per first-index slot of elt_ (MMAS_M * 2 entries).
    // scale_ : multiplier applied to the elements (and to max when scale_max is set).
    //
    // NOTE(review): apply_exp2_ is defined elsewhere; presumably it evaluates
    // exp2(a - b), which makes the result exp(scale_ * x - max_term) -- confirm.
    inline __device__ void scale_apply_exp(const float (&max)[MMAS_M * 2], const float scale_) {
        // Fold log2(e) into both multipliers once, outside the loops. Working in
        // base 2 lets the subtraction of the scaled max pair with the element
        // multiply, so the compiler can issue a single ffma rather than a
        // separate fmul and fadd.
        const float elt_factor = scale_ * M_LOG2E;
        const float max_factor = scale_max ? scale_ * M_LOG2E : M_LOG2E;

        #pragma unroll
        for( int m_idx = 0; m_idx < MMAS_M * 2; ++m_idx ) {
            // The scaled max is shared by every element with this first index.
            const float shifted_max = max[m_idx] * max_factor;

            #pragma unroll
            for( int n_idx = 0; n_idx < MMAS_N * 4; ++n_idx ) {
                elt_[m_idx][n_idx] = apply_exp2_(elt_[m_idx][n_idx] * elt_factor, shifted_max);
            }
        }
    }