Path Lines of Code run_train.py 281 src/nanotron/config/config.py 515 src/nanotron/config/models_config.py 211 src/nanotron/data/clm_collator.py 158 src/nanotron/data/dataloader.py 250 src/nanotron/data/nanoset.py 164 src/nanotron/data/nemo_dataset/__init__.py 690 src/nanotron/data/nemo_dataset/blendable_dataset.py 177 src/nanotron/data/nemo_dataset/dataset_utils.py 65 src/nanotron/data/sft_processing.py 57 src/nanotron/data/tokenized_bytes.py 572 src/nanotron/distributed.py 179 src/nanotron/generation/decode.py 649 src/nanotron/generation/sampler.py 166 src/nanotron/models/base.py 219 src/nanotron/models/llama.py 926 src/nanotron/models/qwen.py 810 src/nanotron/models/starcoder2.py 1228 src/nanotron/nn/attention.py 167 src/nanotron/nn/moe.py 136 src/nanotron/nn/ring_attention_lucidrain.py 1315 src/nanotron/nn/rotary.py 150 src/nanotron/optim/clip_grads.py 63 src/nanotron/optim/gradient_accumulator.py 250 src/nanotron/optim/zero.py 323 src/nanotron/parallel/context.py 124 src/nanotron/parallel/pipeline_parallel/block.py 115 src/nanotron/parallel/pipeline_parallel/engine.py 235 src/nanotron/parallel/pipeline_parallel/p2p.py 390 src/nanotron/parallel/pipeline_parallel/state.py 194 src/nanotron/parallel/pipeline_parallel/tensor_pointer.py 4 src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py 81 src/nanotron/parallel/tensor_parallel/functional.py 487 src/nanotron/parallel/tensor_parallel/nn.py 245 src/nanotron/parallel/tied_parameters.py 123 src/nanotron/random.py 105 src/nanotron/s3_checkpoints/fsspec.py 25 src/nanotron/s3_checkpoints/s3_mover.py 332 src/nanotron/sanity_checks.py 241 src/nanotron/scaling/parametrization.py 162 src/nanotron/serialize/main.py 223 src/nanotron/serialize/metadata.py 140 src/nanotron/serialize/optimizer.py 265 src/nanotron/serialize/random.py 25 src/nanotron/serialize/weights.py 293 src/nanotron/trainer.py 1104 src/nanotron/utils.py 103