fairseq/models/speech_to_text/modules/augmented_memory_attention.py (5 lines):
- line 83: # TODO: fix positional embedding
- line 100: # TODO: Consider mask here
- line 142: # TODO reseach new sum_query method
- line 260: # TODO: positional embedding on memory
- line 263: # TODO: need to fix here

fairseq/tasks/online_backtranslation.py (5 lines):
- line 234: # TODO: could we do the BT using denoise sample ?
- line 255: # TODO: should we shuffle ? we are already sorting batch by sizes so ?
- line 494: # TODO: allow more complex mapping
- line 528: # TODO: Could we translate to several language at once ?
- line 641: # TODO: we should reuse the pretrained model dict which already has

fairseq/data/multilingual/multilingual_data_manager.py (5 lines):
- line 707: # TODO: Unifiy with alter_dataset_langtok
- line 721: # TODO: Unifiy with alter_dataset_langtok
- line 836: # TODO: handle modified lang toks for mined data and dae data
- line 946: # TODO: to extend with extra datasets and keys and loop over different shard data paths
- line 1087: # TODO: to investigate why TransformEosLangPairDataset doesn't work with ConcatDataset

fairseq/tasks/multilingual_language_modeling.py (4 lines):
- line 50: # TODO common var add to parent
- line 125: # TODO: legacy parameter kept for compatibility
- line 137: # TODO common vars below add to parent
- line 427: # TODO: add an option for shrinking all size ratios to below 1

fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py (3 lines):
- line 163: # HACK for now, need to fix (TODO sidgoyal)
- line 436: # TODO remove this once we update apex with the fix
- line 499: # TODO: add back prev_self_attn_state, prev_attn_state,

fairseq/models/transformer/transformer_config.py (2 lines):
- line 108: # TODO should really be in the encoder config
- line 114: # TODO should really be in the decoder config

fairseq/iterative_refinement_generator.py (2 lines):
- line 108: # TODO: iterative refinement generator does not support ensemble for now.
- line 129: # TODO: better encoder inputs?

fairseq/tasks/multilingual_translation.py (2 lines):
- line 373: # TODO make summing of the sample sizes configurable
- line 400: # TODO make summing of the sample sizes configurable

fairseq/models/speech_to_text/modules/emformer.py (2 lines):
- line 1814: scaled_init=True, # TODO: use constant for now.
- line 1840: # TODO: make it configurable from the args

fairseq/models/roberta/enc_dec.py (2 lines):
- line 106: # TODO: hide setting "encoder_attn" layers behind a flag.
- line 139: # TODO: this would become easier if encoder/decoder where using a similar

fairseq/data/iterators.py (2 lines):
- line 507: # TODO: Below is a lazy implementation which discard the final batch regardless
- line 549: # TODO: [Hack] Here the grouped iterator modifies the base iterator size so that

fairseq/data/round_robin_zip_datasets.py (1 line):
- line 87: # TODO make it configurable whether to use max() or sum() here

fairseq_cli/generate.py (1 line):
- line 403: # TODO: replace this workaround with refactoring of `AudioPretraining`

fairseq/optim/nag.py (1 line):
- line 22: # TODO common vars in parent class

fairseq/data/audio/data_cfg.py (1 line):
- line 171: # TODO: move this into individual transforms

fairseq/models/nat/nonautoregressive_transformer.py (1 line):
- line 394: # TODO: implementing length-beam

fairseq/data/audio/speech_to_speech_dataset.py (1 line):
- line 204: "tgt_speaker": tgt_speakers, # TODO: unify "speaker" and "tgt_speaker"

fairseq/optim/cpu_adam.py (1 line):
- line 49: # TODO common vars below in parent

fairseq/data/transform_eos_concat_langpair_dataset.py (1 line):
- line 127: # TODO: support different padding direction on target side

fairseq/models/transformer/transformer_base.py (1 line):
- line 56: # -- TODO T96535332

fairseq/data/legacy/block_pair_dataset.py (1 line):
- line 218: TODO: ids in skip_ids should be consecutive, we can extend it to more generic version later

fairseq/data/legacy/masked_lm_dataset.py (1 line):
- line 196: # TODO: Can we add deteminism without this constraint?

fairseq/logging/progress_bar.py (1 line):
- line 367: # TODO add hparams to Tensorboard

fairseq/models/wav2vec/wav2vec2.py (1 line):
- line 490: # FIXME: what happens if padding_count is specified?

fairseq/tasks/semisupervised_translation.py (1 line):
- line 419: # TODO make summing of the sample sizes configurable

fairseq/models/speech_to_text/utils.py (1 line):
- line 495: TODO:

fairseq/criterions/legacy_masked_lm.py (1 line):
- line 102: # TODO: Remove this after refactor of BERTModel

fairseq/models/lstm.py (1 line):
- line 450: # TODO make bias configurable

fairseq/models/wav2vec/wav2vec2_asr.py (1 line):
- line 586: # TODO: update this when transformer gets converted to dataclass configs

fairseq/data/transform_eos_lang_pair_dataset.py (1 line):
- line 82: # TODO: support different padding direction on target side

fairseq/tasks/speech_to_speech.py (1 line):
- line 57: # TODO: incorporate max_len_a and max_len_b

fairseq/sequence_generator.py (1 line):
- line 463: # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it

fairseq/utils.py (1 line):
- line 223: # TODO: Very rare cases where the replacement is '' should be handled gracefully

fairseq/criterions/wav2vec_criterion.py (1 line):
- line 219: # FIXME: revert when gather based xla reduction is implemented

fairseq/data/noising.py (1 line):
- line 121: # TODO: speed up the following loop

fairseq/optim/adam.py (1 line):
- line 39: # TODO common vars below in parent

fairseq/models/nat/insertion_transformer.py (1 line):
- line 181: # TODO: decoding for InsertionTransformer

fairseq/tasks/language_modeling.py (1 line):
- line 96: # TODO common vars below add to parent

fairseq/modules/positional_embedding.py (1 line):
- line 21: # TODO: The right place for this offset would be inside
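
A listing like the one above can be regenerated by scanning the source tree for marker comments. The sketch below is a hypothetical helper and not part of fairseq: the `scan` and `report` functions, the `TODO|FIXME|HACK` marker set, and the "most markers first" ordering are assumptions chosen to approximate the format shown here, using only the Python standard library.

```python
#!/usr/bin/env python3
"""Minimal sketch: rebuild a per-file TODO/FIXME/HACK inventory for a source tree."""
import re
from collections import defaultdict
from pathlib import Path

# Comment markers to collect; extend as needed.
MARKER = re.compile(r"\b(TODO|FIXME|HACK)\b")


def scan(root: str) -> dict[str, list[tuple[int, str]]]:
    """Return {file path: [(line number, stripped line), ...]} for marked lines."""
    hits: dict[str, list[tuple[int, str]]] = defaultdict(list)
    for path in sorted(Path(root).rglob("*.py")):
        text = path.read_text(encoding="utf-8", errors="ignore")
        for lineno, line in enumerate(text.splitlines(), start=1):
            if MARKER.search(line):
                hits[str(path)].append((lineno, line.strip()))
    return hits


def report(hits: dict[str, list[tuple[int, str]]]) -> None:
    """Print files with the most marked lines first, in the listing format above."""
    for path, items in sorted(hits.items(), key=lambda kv: -len(kv[1])):
        label = "line" if len(items) == 1 else "lines"
        print(f"{path} ({len(items)} {label}):")
        for lineno, line in items:
            print(f"- line {lineno}: {line}")
        print()


if __name__ == "__main__":
    # Run from the repository root, e.g. after cloning fairseq.
    report(scan("fairseq"))
```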