src/nanotron/models/starcoder2.py (21 lines):
- line 104: # TODO @nouamane: Figure out why we can't set `DTypeInvariantTensor` ...
- line 121: # TODO @nouamane: Once we figure out how to do the DTypeInvariantTensor, this can be removed and changed to an assert
- line 165: # TODO @nouamane: support position_ids
- line 184: # TODO @thomasw21: refactor so that we store that default in a single place.
- line 259: # TODO @thomasw21: Compute once, instead of computing for each layer.
- line 293: # TODO @thomasw21: It's really hard to make sure that our sliced view keeps the same memory space as the original gradient
- line 300: # TODO @thomasw21: Can I trigger hooks that we've set in `register_hook`
- line 630: # TODO @thomasw21: refactor so that we store that default in a single place.
- line 676: ) # TODO @nouamane: compute based on free memory, because in rope we can surpass max_position_embeddings
- line 763: # TODO @nouamane: support custom masking
- line 840: # TODO @nouamane: seems like this doesn't help to indicate padding in (for first iteration it's just 0)
- line 956: ) # TODO @nouamane: compute based on free memory, because in rope we can surpass max_position_embeddings
- line 978: ) # TODO @nouamane: can we transpose qkv instead?
- line 1004: # TODO @nouamane: support custom masking
- line 1085: # TODO @nouamane: seems like this doesn't help to indicate padding in (for first iteration it's just 0)
- line 1180: raise ValueError("Either `multi_query` or `grouped_query` must be True") # TODO: @nouamane not necessarily
- line 1330: # TODO @thomasw21: refactor so that we store that default in a single place.
- line 1398: ) # TODO @nouamane: case where TP=1 should be simpler
- line 1399: # TODO @thomasw21: It's unclear what kind of normalization we want to do.
- line 1529: # TODO @thomasw21: Sometimes we actually want 0
- line 1681: hardware_flops = model_flops # TODO @nouamanetazi: This is a placeholder for now

src/nanotron/generation/decode.py (14 lines):
- line 211: - Everyone receives ALL the input text. # TODO @thomasw21: technically only specific ranks need to receive input.
- line 212: - Only a specific rank will output the generated text_ids as `torch.Tensor`, the others return a `TensorPointer`. # TODO @thomasw21: Maybe all ranks should return the text.
- line 362: # TODO @thomasw21: Handle this correctly, i.e. from some point after this we should only generate masked tokens
- line 363: # TODO @thomasw21: Actually I can probably build this thing on the next device directly. Will save some communication
- line 370: # TODO @thomasw21: We need to have a stop condition.
- line 520: # TODO @thomasw21: We could actually have all ranks return the output, since it's already been broadcast
- line 548: - Everyone receives ALL the input text. # TODO @thomasw21: technically only specific ranks need to receive input.
- line 549: - Only a specific rank will output the generated text_ids as `torch.Tensor`, the others return a `TensorPointer`. # TODO @thomasw21: Maybe all ranks should return the text.
- line 570: # TODO @thomasw21: Fix this as we shouldn't get P2P like that
- line 627: # TODO @thomasw21: Make a diagram to show how this works
- line 680: # TODO @thomasw21: Handle this correctly, i.e. from some point after this we should only generate masked tokens
- line 681: # TODO @thomasw21: Actually I can probably build this thing on the next device directly. Will save some communication
- line 688: # TODO @thomasw21: We need to have a stop condition.
- line 802: # TODO @thomasw21: We could actually have all ranks return the output, since it's already been broadcast
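The decode.py docstring notes above (lines 211-212, 548-549) describe the return contract of distributed generation: exactly one rank ends up with the generated ids as a real `torch.Tensor`, while every other rank gets a `TensorPointer` placeholder. Below is a minimal sketch of how a caller might handle that contract; it assumes only that `TensorPointer` lives in the module listed further down (src/nanotron/parallel/pipeline_parallel/tensor_pointer.py), and the helper name is illustrative, not nanotron's API.

```python
# Minimal sketch (assumed helper, not nanotron's actual API) of the
# "Tensor on one rank, TensorPointer everywhere else" contract described above.
import torch
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer

def collect_generated_ids(output):
    """Return the generated ids on the rank that owns them, None elsewhere."""
    if isinstance(output, TensorPointer):
        # This rank only holds a reference to where the data lives; the TODOs
        # above suggest broadcasting so that every rank could return the text.
        return None
    assert isinstance(output, torch.Tensor)
    return output
```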
src/nanotron/models/llama.py (13 lines):
- line 61: # TODO @nouamane: Figure out why we can't set `DTypeInvariantTensor` ...
- line 62: # TODO @thomasw21: Complex buffers break DDP, instead we store float and view them as complex
- line 76: # TODO @nouamane: Once we figure out how to do the DTypeInvariantTensor, this can be removed and changed to an assert
- line 100: ) or seq_length >= self.end: # TODO @nouamane: check if this causes cpu-gpu sync
- line 217: # TODO @thomasw21: refactor so that we store that default in a single place.
- line 256: # TODO @thomasw21: GPT has a weird `d_kv` config which I'm guessing is essentially a `d_qkv`
- line 277: # TODO @thomasw21: Compute once, instead of computing for each layer.
- line 375: # TODO @thomasw21: refactor so that we store that default in a single place.
- line 435: ) # TODO @nouamane: compute based on free memory, because in rope we can surpass max_position_embeddings
- line 522: # TODO @nouamane: support custom masking
- line 653: # TODO @nouamane: seems like this doesn't help to indicate padding in (for first iteration it's just 0)
- line 890: # TODO @thomasw21: refactor so that we store that default in a single place.
- line 1222: hardware_flops = model_flops # TODO: This is a placeholder for now

src/nanotron/trainer.py (11 lines):
- line 201: # TODO: find a better way to handle this
- line 533: # TODO @nouamanetazi: refactor this
- line 566: # TODO: only works for BlendableDataset
- line 584: self.metadata.consumed_train_samples += self.global_batch_size # TODO: Legacy: idc abt this
- line 651: # TODO @thomasw21: This is too memory-hungry; instead we run all_reduce
- line 656: # TODO @nouamane: Put this in hooks so we can overlap communication with gradient computation on the last backward pass.
- line 748: # TODO @nouamanetazi: Megatron-LM seems to be using a barrier to report their interval time. Check if this is necessary. https://github.com/NouamaneTazi/Megatron-LM/blob/e241a96c3085b18e36c6cee1d68a8155de77b5a6/megatron/training.py#L607
- line 987: # TODO: add max_position_embeddings
- line 1126: # TODO @nouamanetazi: better memory logs
- line 1155: # TODO @thomasw21: DDP doesn't support broadcasting complex buffers (and we don't really need that broadcasting anyway)
- line 1261: self.config.general.consumed_train_samples = self.metadata.consumed_train_samples # TODO: idc abt this

src/nanotron/parallel/pipeline_parallel/engine.py (9 lines):
- line 91: # TODO @nouamane: this fixes interleaved AFAB but makes 1F1B hang
- line 94: # for activation in reversed(activations): # TODO @nouamane: need to bwd only 2nd chunk
- line 141: state = PipelineTrainBatchState() # TODO: do I need state?
- line 151: # TODO @thomasw21: Somehow this needs to be done somewhere else to support interleaving. Somewhere right after a "stage"
- line 201: # TODO @thomasw21: Somehow this needs to be done somewhere else to support interleaving. Somewhere right after a "stage"
- line 270: # TODO @thomasw21: Somehow this needs to be done somewhere else to support interleaving. Somewhere right after a "stage"
- line 281: # TODO @thomasw21: Somehow this needs to be done somewhere else to support interleaving. Somewhere right after a "stage"
- line 326: # TODO @thomasw21: Somehow this needs to be done somewhere else to support interleaving. Somewhere right after a "stage"
- line 339: # TODO @thomasw21: Somehow this needs to be done somewhere else to support interleaving. Somewhere right after a "stage"

src/nanotron/optim/zero.py (8 lines):
- line 97: # TODO @nouamanetazi: handle syncing param group attrs (e.g. if we update lr)
- line 127: # TODO @thomasw21: This is a call to a torch internal API, we need to fix this
- line 130: # TODO @thomasw21: This is a call to a torch internal API, we need to fix this
- line 137: # TODO @thomasw21: This is a call to a torch internal API, we need to fix this
- line 279: # TODO: clone storage aliasing
- line 289: # TODO @thomasw21: Make it so that you can never update this value
- line 323: # TODO @thomasw21: This is unfortunately necessary since we might pass `SliceTensor` to the optimizer.
- line 334: # TODO @thomasw21: Figure out why this function doesn't get inherited. https://github.com/pytorch/pytorch/issues/102337#issuecomment-1634363356

src/nanotron/serialize/weights.py (7 lines):
- line 53: # TODO @nouamanetazi: Handle buffers
- line 64: # TODO @thomasw21: We could rotate in order to balance the load.
- line 159: # TODO @thomasw21: Choose only a slice if we switch the TP topology
- line 176: # TODO @thomasw21: Interestingly enough we don't actually need to instantiate the entire model at all.
- line 246: # TODO @nouamane: do we consider exp_size=1 expert_sharded?
- line 262: # TODO @thomasw21: Choose only a slice if we switch the TP topology
- line 278: # TODO @thomasw21: Make it so that we don't need to code this logic anywhere other than in `get_path`

src/nanotron/serialize/optimizer.py (5 lines):
- line 54: # TODO: Figure out if I need to save param groups. Right now I'm assuming no, as we only store what's trainable
- line 55: # TODO: We can probably "rotate" so that every process stores something (maybe doesn't matter if we're I/O bound)
- line 216: # TODO: this does not handle the edge case of different pipeline-parallel optimizer state shards saving different state keys
- line 262: # TODO: maybe better to allocate memory for all states at once
- line 333: # TODO @thomasw21: Load the optimizer type and check that it's compatible, otherwise we might be loading something else completely

src/nanotron/parallel/pipeline_parallel/state.py (5 lines):
- line 109: # TODO @thomasw21: We assume that each rank has a single contiguous list of blocks. This also means that we only send activations from higher ranks
- line 113: # TODO @thomasw21: We assume that each rank has a single contiguous list of blocks. This also means that we only recv activations from lower ranks
- line 117: # TODO @thomasw21: We assume that each rank has a single contiguous list of blocks. This also means that we only send gradients to lower ranks
- line 121: # TODO @thomasw21: We assume that each rank has a single contiguous list of blocks. This also means that we only recv gradients from higher ranks
- line 174: # TODO @thomasw21: I need some mechanism to point to whatever is now sorted in a buffer, typically some id that would point to the correct tensor in our buffer instead of relying on the sorted list.

src/nanotron/nn/rotary.py (4 lines):
- line 50: self.freqs_cis = self.freqs_cis.to(torch.float) # TODO @nouamane: Fix using `DTypeInvariantTensor` ...
- line 56: # TODO @nouamane: Using position_ids means we compute redundant embeddings for the same positions
- line 71: # TODO @nouamane: Using position_ids means we compute redundant embeddings for the same positions. Only use them in SFT
- line 138: # TODO @nouamane: support cu_seqlens from position_ids
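Several notes above and below circle around the same constraint: DDP cannot broadcast complex buffers (llama.py line 62, trainer.py line 1155, and the p2p.py entries below), so rotary frequencies are kept in float storage and re-viewed as complex at use time. Here is a minimal sketch of that pattern; the module name, buffer name, and rope base are illustrative assumptions, not nanotron's actual code.

```python
# Minimal sketch of the "store float, view as complex" workaround the TODOs
# describe; names, shapes and the 10000.0 base are illustrative assumptions.
import torch
import torch.nn as nn

class RotaryFreqs(nn.Module):
    def __init__(self, dim: int, end: int, theta: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
        freqs = torch.outer(torch.arange(end).float(), inv_freq)
        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
        # DDP can broadcast float buffers but not complex ones, so we register
        # the real view of shape (end, dim // 2, 2) instead of the complex tensor.
        self.register_buffer("freqs_cis_real", torch.view_as_real(freqs_cis))

    def forward(self) -> torch.Tensor:
        # Re-view the float storage as complex only when it is consumed.
        return torch.view_as_complex(self.freqs_cis_real)
```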
src/nanotron/parallel/pipeline_parallel/p2p.py (4 lines):
- line 61: # TODO @thomasw21: Find the issue with send/recv complex tensors
- line 68: # TODO @thomasw21: Find the issue with send/recv complex tensors
- line 81: # TODO @nouamane: avoid having two metadata comms, and preallocate shape/stride instead
- line 302: # TODO @thomasw21: Find the issue with send/recv complex tensors

src/nanotron/config/models_config.py (4 lines):
- line 227: attention_softmax_in_fp32: bool = True # TODO: not used
- line 229: bos_token_id: int = 49152 # TODO: not used
- line 235: initializer_range: float = 0.02 # TODO: not used
- line 249: use_position_embeddings: bool = False # TODO @nouamane: this is not used

src/nanotron/scaling/parametrization.py (4 lines):
- line 40: # TODO: double-check if this is the correct initialization for the grouped MLP
- line 61: # TODO @nouamane: should we use trunc_normal_?
- line 75: # TODO @nouamane: should we use trunc_normal_?
- line 105: # TODO @nouamane: should we use trunc_normal_?

run_train.py (4 lines):
- line 114: # TODO @nouamanetazi: this may time out before the 1st device finishes processing the dataset. Can we have a ctxmanager to modify the timeout?
- line 115: # TODO: generalise to include validation/test splits
- line 231: num_samples=trainer.config.tokens.train_steps * trainer.global_batch_size, # TODO: this overshoots what's needed by the current stage, but it doesn't matter?
- line 349: ) # TODO: check this

src/nanotron/serialize/metadata.py (4 lines):
- line 28: sequence_length: Optional[int] = None # TODO: put back as non-optional
- line 33: self.sequence_length = 4096 # TODO: temp
- line 51: consumed_train_samples: int # TODO: Legacy. This assumed the same sequence length across all stages. Not used anymore
- line 53: consumed_tokens_total: Optional[int] = None # TODO: put back as non-optional

src/nanotron/serialize/main.py (4 lines):
- line 69: # TODO @nouamane: catch full-disk error
- line 123: # TODO @thomas21: sanity check, not sure whether that needs to happen at testing or now (depends how much it costs)
- line 183: # FIXME @thomasw21: Some data is actually on `cpu`, just for this test we move it to `cuda`
- line 222: # TODO @thomasw21: make a better structure system so that we get the typing correct

src/nanotron/parallel/tied_parameters.py (3 lines):
- line 50: # TODO @thomasw21: When we support Zero3 this isn't true anymore
- line 113: param.get_tied_info().global_ranks, # TODO @nouamane: merge groups which tie the same parameter
- line 122: module: nn.Module, # TODO: NanotronModel

src/nanotron/serialize/random.py (3 lines):
- line 23: # TODO @thomasw21: That's annoying, but this actually uses pickle; we might need to change that for something else
- line 28: # TODO @thomasw21: This basically assumes that we have exactly the same topology as the one we used when saving.
- line 35: # TODO @thomasw21: That's annoying, but this actually uses pickle; we might need to change that for something else

src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py (3 lines):
- line 64: # TODO @thomasw21: gather along another dimension
- line 102: # TODO @thomasw21: shard along another dimension
- line 108: # TODO @thomasw21: Collectives seem to require tensors to be contiguous

src/nanotron/parallel/tensor_parallel/functional.py (3 lines):
- line 265: # TODO @thomasw21: gather along another dimension
- line 450: # TODO @thomasw21: This sounds like we don't have the optimal physical layout
- line 691: # TODO @thomasw21: This sounds like we don't have the optimal physical layout

src/nanotron/config/config.py (3 lines):
- line 115: # TODO @nouamane: which config do we want for SFT?
- line 277: consumed_train_samples: Optional[int] = None # TODO: remove this
- line 544: ), f"num_key_value_heads ({self.model.model_config.num_key_value_heads}) must be >= tp ({self.parallelism.tp})" # TODO: remove this once we ensure KV heads get duplicated correctly

src/nanotron/sanity_checks.py (3 lines):
- line 36: # TODO @nouamane: Getting "Greatest absolute difference: 4.6e-10" at large scale when syncing tied weights
- line 40: # TODO @nouamanetazi: remove this with SANITY_CHECKS
- line 112: # TODO: add checks for memory contiguousness

src/nanotron/models/qwen.py (3 lines):
- line 219: # TODO: support SFT
- line 283: max_seqlen = seq_length # TODO: should this be max position_ids?
- line 1089: hardware_flops = model_flops # TODO: This is a placeholder for now

src/nanotron/data/clm_collator.py (2 lines):
- line 50: # TODO @nouamanetazi: Is it better to have examples as np.array or torch.Tensor?
- line 128: # } # TODO: @nouamane in case of memory issues, try keeping numpy here.

src/nanotron/data/nanoset.py (2 lines):
- line 94: # self.dataset_index, self.dataset_sample_index = self.new_build_nanoset_index() # TODO: Fix this
- line 241: # TODO: Add 0.5% (the 1.005 factor) so in case the blending dataset does

src/nanotron/data/nemo_dataset/__init__.py (2 lines):
- line 441: # TODO: Do we handle the following point from the FIM paper?
- line 615: # TODO: check that this is indeed not problematic

src/nanotron/s3_checkpoints/s3_mover.py (2 lines):
- line 23: # TODO @eliebak: update the doc to state that it is also the function used to download it to the disk with start_downloading
- line 228: time.sleep(1) # TODO @nouamane: make this configurable

src/nanotron/utils.py (2 lines):
- line 112: # TODO @nouamanetazi: we pass `self` (which is the module) to checkpoint, so it's stored in `ctx.inputs`, whereas some other methods create a custom fwd and pass only tensors without `self`. Need to investigate which is better
- line 146: # TODO @thomasw21: Figure out the best PyTorch way of building a tensor from a storage.

src/nanotron/nn/attention.py (2 lines):
- line 20: # TODO: check if some bugs cause push backs on the exact version
- line 226: # TODO @nouamane: optimize this, and make sure it works with flashattn and flexattn

src/nanotron/data/dataloader.py (2 lines):
- line 88: # TODO @thomasw21: That's really hard to test as input gets sharded across the PP, let's assume it works for now.
- line 91: # TODO @nouamanetazi: add this test

src/nanotron/parallel/pipeline_parallel/tensor_pointer.py (2 lines):
- line 9: # TODO @thomasw21: Maybe add which group it belongs to as well? Typically this is highly correlated to `p2p.pg`
- line 11: # TODO @thomasw21: Maybe add a tag (torch.distributed.send/recv allow for tagging)

src/nanotron/data/nemo_dataset/blendable_dataset.py (2 lines):
- line 146: return self.datasets[dataset_idx][sample_idx + self.offsets_in_samples[dataset_idx]] # TODO: is it okay to not respect dataset_sample_index? Since it's sequential, it's okay for now
- line 189: ), "Only S3 paths are supported for consumption stats" # TODO: remove this

src/nanotron/nn/moe.py (2 lines):
- line 141: # TODO: duplicate the shared expert gate
- line 146: ) # TODO: ensure shared_expert_gate is tied across TP

src/nanotron/models/base.py (2 lines):
- line 195: # TODO: classes don't take the same args
- line 213: # TODO: https://github.com/huggingface/nanotron/issues/65

src/nanotron/random.py (2 lines):
- line 39: # TODO @thomasw21: We make a copy as a safety measure.
- line 141: # TODO @thomasw21: broadcast the tensor using `broadcast` in order not to use pickle

src/nanotron/optim/gradient_accumulator.py (2 lines):
- line 225: # TODO @thomasw21: Is it better to set to zero instead?
- line 265: # TODO @nouamane: should we use a fused kernel to copy?

src/nanotron/parallel/tensor_parallel/nn.py (1 line):
- line 255: # TODO @thomasw21: Fix and remove that constraint. Typically there's no reason to have such a constraint.

src/nanotron/parallel/context.py (1 line):
- line 82: ) # TODO: ep should be a subset of dp

src/nanotron/parallel/pipeline_parallel/block.py (1 line):
- line 105: # TODO @thomasw21: Figure out a way to build dummy_input in a generic sense, and remove the necessity to have Dict[str, torch.Tensor] as output

src/nanotron/optim/clip_grads.py (1 line):
- line 65: # TODO @nouamanetazi: Check if we should calculate the norm per parameter (remove .pow(norm_type))

src/nanotron/generation/sampler.py (1 line):
- line 103: # TODO: We're assuming that TensorColumnLinear shards in a specific manner, i.e. rank 0 gets the first.

src/nanotron/distributed.py (1 line):
- line 263: # TODO @thomasw21: Maybe figure out a way to do distributed `cpu` training at some point

src/nanotron/data/nemo_dataset/dataset_utils.py (1 line):
- line 67: # TODO: check data leakage between train/val/test?

src/nanotron/s3_checkpoints/fsspec.py (1 line):
- line 22: # TODO @thomasw21: pass storage options.

src/nanotron/data/sft_processing.py (1 line):
- line 167: # TODO: Implement sequence packing for SFT

src/nanotron/data/tokenized_bytes.py (1 line):
- line 509: max_tokens=max_tokens, # TODO: remove

src/nanotron/nn/ring_attention_lucidrain.py (1 line):
- line 1061: # TODO @nouamane: what's up with triton?
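For context on the clip_grads.py note above (line 65): the `.pow(norm_type)` belongs to the standard total-norm reduction, where per-parameter norms are raised to the p-th power, summed, and a single root is taken; a per-parameter clipping scheme would drop that reduction. A minimal, self-contained sketch of the reduction (illustrative only, not nanotron's clip_grads implementation):

```python
# Minimal sketch of the total-norm reduction referenced by the clip_grads.py
# note; illustrative only, not nanotron's implementation.
import torch

def total_grad_norm(grads, norm_type: float = 2.0) -> torch.Tensor:
    # ||g||_total = (sum_p ||g_p||^norm_type) ** (1 / norm_type): each
    # per-parameter norm is raised to `norm_type`, summed, then a single root
    # is taken over the result.
    per_param = torch.stack([g.norm(norm_type) for g in grads])
    return per_param.pow(norm_type).sum().pow(1.0 / norm_type)
```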