in candle-flash-attn-v1/kernels/fmha/smem_tile.h [244:255]
// Step the write offset (smem_write_offset_) to the next buffer of the tile.
//
// With a single buffer per tile (BUFFERS_PER_TILE <= 1) this is a no-op.
// Otherwise the offset is either advanced by BYTES_PER_BUFFER, or, once it has
// reached BYTES_PER_TILE_INC_BOUNDARY, pulled back by that same amount.
// NOTE(review): presumably this makes the offset cycle through the buffers of
// a multi-buffered shared-memory tile -- confirm against the definitions of
// BYTES_PER_BUFFER and BYTES_PER_TILE_INC_BOUNDARY in the enclosing class.
inline __device__ void move_to_next_write_buffer() {
    // Nothing to rotate when the tile is not multi-buffered.
    if( BUFFERS_PER_TILE <= 1 ) {
        return;
    }

    // True once the offset has reached the boundary and must be rewound.
    const bool past_boundary = this->smem_write_offset_ >= BYTES_PER_TILE_INC_BOUNDARY;

    if( past_boundary ) {
        this->smem_write_offset_ -= BYTES_PER_TILE_INC_BOUNDARY;
    } else {
        this->smem_write_offset_ += BYTES_PER_BUFFER;
    }
}