in sparse_autoencoder/kernels.py [0:0]
def triton_mse_loss_fp16_kernel( output_ptr, target_ptr, out_ptr, stride_a_output, stride_a_target, a, b, BLOCK_SIZE_B: tl.constexpr,