in candle-flash-attn-v1/kernels/fmha/softmax.h [256:266]
inline __device__ void apply_exp_col(const float (&max)[MMAS_N * 4]) {
    // Column-wise variant: max[] holds one entry per ni index (MMAS_N * 4 of them),
    // and every elt_[mi][ni] sharing that ni is overwritten in place with
    // apply_exp2_(elt_[mi][ni] * log2(e), max-in-base-2).
    // NOTE(review): apply_exp2_ is defined outside this view; presumably it
    // evaluates exp2(x - max) -- confirm against its definition.

    // log2(e), narrowed to float once; scaling by it moves a value from the
    // natural-exponent domain to the base-2 domain.
    constexpr float kLog2e = M_LOG2E;

    #pragma unroll
    for( int col = 0; col < MMAS_N * 4; ++col ) {
        // Use the supplied maximum as-is when max_in_base2 is set; otherwise
        // rescale it by log2(e) so it matches the rescaled elements below.
        float col_max_base2 = max[col];
        if( !max_in_base2 ) {
            col_max_base2 = max[col] * kLog2e;
        }

        #pragma unroll
        for( int row = 0; row < MMAS_M * 2; ++row ) {
            elt_[row][col] = apply_exp2_(elt_[row][col] * kLog2e, col_max_base2);
        }
    }
}