modules/SwissArmyTransformer/sat/generation/autoregressive_sampling.py (4 lines):
- line 105: log_attention_weights_part = log_attention_weights[..., index: counter+1, :counter+1] # TODO memlen
- line 115: attention_mask=attention_mask[..., index: counter+1, :counter+1], # TODO memlen
- line 196: log_attention_weights_part = log_attention_weights[..., index: counter+1, :counter+1] # TODO memlen
- line 206: attention_mask=attention_mask[..., index: counter+1, :counter+1], # TODO memlen

modules/SwissArmyTransformer/sat/ops/csrc/adam/multi_tensor_apply.cuh (2 lines):
- line 24: // TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson)
- line 65: // TODO: Print which tensor fails.

modules/SwissArmyTransformer/sat/generation/sampling_strategies/beam_search_strategy.py (2 lines):
- line 96: ngram_prefix = tokens[i, -(self.ngram-1):].tolist() # TODO ngram=1
- line 143: ngram_prefix = tuple(tokens[next_indices[i], -(self.ngram-1):].tolist()) # TODO ngram=1

sat/sgm/modules/diffusionmodules/openaimodel.py (2 lines):
- line 386: # TODO add crossframe attention and use mixed checkpoint
- line 389: ) # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!

diffusers-version/tora/i2v_pipeline.py (1 line):
- line 800: # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline

modules/SwissArmyTransformer/sat/training/model_io.py (1 line):
- line 394: # TODO: Remove `args` and the parsing logic when BC allows.

modules/SwissArmyTransformer/sat/model/transformer.py (1 line):
- line 656: # TODO add warning for depth>=2 grad tensors

modules/SwissArmyTransformer/sat/data_utils/configure_data.py (1 line):
- line 60: if distributed: # TODO reformat this, but it is not urgent

sat/app.py (1 line):
- line 1302: # TODO: passing 'base' params through the command line

modules/SwissArmyTransformer/sat/training/deepspeed_training.py (1 line):
- line 152: # TODO add rng states for data parallel and wrap drops in main path.

sat/sgm/modules/attention.py (1 line):
- line 325: # TODO: Use this directly in the attention operation, as a bias

sat/sgm/modules/autoencoding/regularizers/quantize.py (1 line):
- line 152: # TODO: shape not yet optional

modules/SwissArmyTransformer/sat/model/base_model.py (1 line):
- line 432: try: # TODO: is this useful?

sat/data_video.py (1 line):
- line 229: duration: preknow the duration to speed up by seeking to sampled start. TODO by_pass if unknown.

modules/SwissArmyTransformer/sat/model/encoder_decoder_model.py (1 line):
- line 101: def from_pretrained(cls, args, name, *, home_path=None, url=None): # TODO update model-only mode

modules/SwissArmyTransformer/sat/arguments.py (1 line):
- line 65: # TODO: fully test it, support the generation.

diffusers-version/tora/t2v_pipeline.py (1 line):
- line 724: # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline

sat/vae_modules/attention.py (1 line):
- line 325: # TODO: Use this directly in the attention operation, as a bias
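The two `# TODO ngram=1` entries in beam_search_strategy.py most likely flag a Python slicing edge case: when `self.ngram == 1`, the slice start `-(self.ngram-1)` evaluates to `-0`, which Python treats as `0`, so the "prefix" covers the whole sequence instead of being empty. The snippet below is a minimal standalone sketch of that behavior (the `ngram_prefix` helper is invented for illustration and is not the repository's class):

```python
import torch

def ngram_prefix(tokens: torch.Tensor, ngram: int) -> list:
    # Mirrors the slice used in beam_search_strategy.py: tokens[..., -(ngram-1):]
    # For ngram >= 2 this yields the last (ngram - 1) tokens.
    return tokens[-(ngram - 1):].tolist()

tokens = torch.tensor([5, 7, 9, 11])
print(ngram_prefix(tokens, 3))  # [9, 11]        -- last two tokens, as intended
print(ngram_prefix(tokens, 1))  # [5, 7, 9, 11]  -- -(1-1) == 0, whole sequence, not []
```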
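The "Prepare extra step kwargs" TODOs in the Tora i2v/t2v pipelines refer to the standard diffusers boilerplate that inspects the scheduler's `step()` signature before forwarding `eta` and `generator`. The sketch below is an assumption-laden illustration of that pattern, not the pipelines' actual code; the free function and the `_DummyScheduler` are hypothetical stand-ins:

```python
import inspect

def prepare_extra_step_kwargs(scheduler, generator=None, eta: float = 0.0) -> dict:
    # Only pass `eta` / `generator` if the scheduler's step() actually accepts them.
    params = set(inspect.signature(scheduler.step).parameters)
    extra = {}
    if "eta" in params:        # e.g. DDIM-style schedulers take eta
        extra["eta"] = eta
    if "generator" in params:
        extra["generator"] = generator
    return extra

class _DummyScheduler:
    def step(self, model_output, timestep, sample, eta=0.0):
        return sample

print(prepare_extra_step_kwargs(_DummyScheduler(), eta=0.5))  # {'eta': 0.5}
```

Because the logic depends only on the scheduler, factoring it out of each pipeline (as the TODO suggests) would avoid duplicating it across i2v_pipeline.py and t2v_pipeline.py.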