xformers/sparse/csr_tensor.py (4 lines):
    - line 150: # TODO: check if need to return this or not
    - line 195: # TODO add bias here
    - line 282: # TODO this is not always true, but is a fast approximation for now
    - line 429: # TODO: check this

experimental/ragged_inference/triton_v2_ragged_qk_dotprod.py (3 lines):
    - line 23: # TODO: tune these
    - line 153: # TODO: link to a drawing of what these tensors are
    - line 321: # TODO: flag use zeros for garbage

experimental/ragged_inference/garbage_pad_ragged_acts.py (3 lines):
    - line 101: # TODO: flag use zeros for garbage
    - line 170: # TODO: Build LUT
    - line 179: # TODO: Add the QK dotprod to get scores

xformers/components/attention/nystrom.py (2 lines):
    - line 96: # TODO: update defaults for use_razavi_pinverse and inv_iterations
    - line 128: # TODO: should be able to not have to pass in num_heads

xformers/triton/k_dropout.py (1 line):
    - line 179: seed = SEEDS + col_id  # FIXME index the seed properly

xformers/factory/model_factory.py (1 line):
    - line 237: # TODO: pass in key and value independently.

xformers/components/attention/core.py (1 line):
    - line 192: # TODO assume we have (N, S, hs) instead of (B, nh, S, hs), with N = B x nh

xformers/components/attention/ortho.py (1 line):
    - line 108: # FIXME: Should we still accept a mask in that case ?

xformers/components/attention/_sputnik_sparse.py (1 line):
    - line 12: # TODO: this is here for BC

xformers/components/attention/csrc/cpu/matmul.cpp (1 line):
    - line 17: int64_t grain_size = 128;  // TODO: tune this

xformers/triton/k_layer_norm.py (1 line):
    - line 243: # FIXME: @lefaudeux tensor shape changes are not well handled, see shape3

xformers/components/attention/csrc/autograd/matmul.cpp (1 line):
    - line 39: // TODO: compute grad only if they require grad

xformers/components/attention/csrc/cuda/spmm.cu (1 line):
    - line 829: // TODO investigate misaligned address errors in values ptr
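As context for the autograd TODO above ("compute grad only if they require grad" in csrc/autograd/matmul.cpp), the usual PyTorch fix is to gate each backward computation on `needs_input_grad`. The sketch below is a generic Python illustration of that pattern, assuming a plain dense matmul; the `GatedMatMul` class and its shapes are hypothetical and not the actual xformers C++ implementation.

    import torch

    class GatedMatMul(torch.autograd.Function):
        """Toy custom matmul showing the gradient-gating pattern."""

        @staticmethod
        def forward(ctx, a, b):
            ctx.save_for_backward(a, b)
            return a @ b

        @staticmethod
        def backward(ctx, grad_out):
            a, b = ctx.saved_tensors
            grad_a = grad_b = None
            # Only spend compute on gradients that will actually be used.
            if ctx.needs_input_grad[0]:
                grad_a = grad_out @ b.transpose(-2, -1)
            if ctx.needs_input_grad[1]:
                grad_b = a.transpose(-2, -1) @ grad_out
            return grad_a, grad_b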