huggingface / smollm
File Size

The distribution of file sizes (measured in lines of code).

Intro
Learn more...
File Size Overall
35% | 15% | 15% | 15% | 17%
Legend:
1001+
501-1000
201-500
101-200
1-100


explore: grouped by folders | grouped by size | sunburst | 3D view
File Size per Extension
1001+
501-1000
201-500
101-200
1-100
py: 32% | 13% | 18% | 14% | 20%
yaml: 34% | 27% | 0% | 28% | 8%
ipynb: 81% | 18% | 0% | 0% | 0%
html: 0% | 0% | 0% | 100% | 0%
toml: 0% | 0% | 0% | 0% | 100%
js: 0% | 0% | 0% | 0% | 100%
File Size per Logical Decomposition
primary
1001+
501-1000
201-500
101-200
1-100
vision: 37% | 15% | 14% | 14% | 17%
tools: 0% | 53% | 0% | 21% | 25%
text: 0% | 0% | 42% | 41% | 16%
Longest Files (Top 50)
File | # lines | # units
3272 -
build_ds_sft.py
in vision/data/datasets_processing_scripts/build_concatenation_datasets_sft
3203 80
3158 -
3015 -
SmolVLM2_Video_FT.ipynb
in vision/finetuning
2909 -
2291 11
trainer.py
in vision/m4/training
1513 36
modeling_vllama3.py
in vision/m4/models/vllama3
1197 49
packing.py
in vision/m4/training
1099 12
modeling_vmistral.py
in vision/m4/models/vmistral
1088 43
dataset.py
in vision/m4/training
1086 69
llava_onevision_config_qual.yaml
in vision/smolvlm2/scripts/mixtures
1012 -
onevision_less_mammoth_more_videos_balanced.yaml
in vision/smolvlm2/scripts/mixtures
1011 -
onevision_less_mammoth_more_videos.yaml
in vision/smolvlm2/scripts/mixtures
1011 -
web_document_filtering.py
in vision/m4/sourcing/data_collection/processors
1002 38
onevision_no_mammoth_more_image_balanced.yaml
in vision/smolvlm2/scripts/mixtures
983 -
modeling_vgpt2.py
in vision/m4/models/vgpt2
970 40
modeling_idefics.py
in vision/m4/models/idefics
909 42
onevision_less_mammoth.yaml
in vision/smolvlm2/scripts/mixtures
860 -
create_table_datasets.py
in vision/data/datasets_processing_scripts/create_fine_tuning_datasets
780 13
utils.py
in vision/m4/training
715 40
web_document_and_filtering_visualization.py
in vision/m4/sourcing/data_collection/visualization
675 8
__init__.py
in vision/m4/evaluation/tasks
651 -
Smol_VLM_FT.ipynb
in vision/finetuning
646 -
dataset_clip_sampling.py
in vision/smolvlm2/smolvlm/datasets
611 19
llava_onevision_config.yaml
in vision/smolvlm2/scripts/mixtures
581 -
580 -
pair_visualization.py
in vision/m4/sourcing/data_collection/visualization
566 13
558 -
demo_tkinter.py
in tools/smol_tools
534 19
dataset.py
in vision/smolvlm2/smolvlm/datasets
530 17
525 10
testing_utils.py
in vision/m4
504 64
pipeline.py
in text/data/smoltalk/magpie_ultra_v1
498 7
build_concat_ds_sft.py
in vision/data/datasets_processing_scripts/build_concatenation_datasets_sft
483 2
456 10
utils.py
in vision/m4/evaluation/custom_metrics
443 10
train_bin_classif.py
in vision/data/datasets_processing_scripts/build_laion_coco_dataset/python_scripts
431 5
filtering_utils.py
in vision/m4/sourcing/data_collection/utils
427 -
perceiver.py
in vision/m4/models/perceiver
397 12
dataset_utils.py
in vision/m4/training
391 19
config.py
in vision/m4/training
381 5
web_document_extractor.py
in vision/m4/sourcing/data_collection/processors
374 20
pair_stat_dashboard.py
in vision/m4/sourcing/data_collection/visualization
359 15
math_utils.py
in text/evaluation
346 7
global_visualization.py
in vision/m4/sourcing/data_collection/visualization/wikipedia
322 7
global_visualization.py
in vision/m4/sourcing/data_collection/visualization
321 7
tasks.py
in text/evaluation
316 7
builder.py
in vision/smolvlm2/smolvlm/datasets
316 10
create_webdataset_tar.py
in vision/m4/utils/datasets
316 15
Files With Most Units (Top 50)
File | # lines | # units
build_ds_sft.py
in vision/data/datasets_processing_scripts/build_concatenation_datasets_sft
3203 80
dataset.py
in vision/m4/training
1086 69
testing_utils.py
in vision/m4
504 64
modeling_vllama3.py
in vision/m4/models/vllama3
1197 49
modeling_vmistral.py
in vision/m4/models/vmistral
1088 43
modeling_idefics.py
in vision/m4/models/idefics
909 42
utils.py
in vision/m4/training
715 40
modeling_vgpt2.py
in vision/m4/models/vgpt2
970 40
web_document_filtering.py
in vision/m4/sourcing/data_collection/processors
1002 38
trainer.py
in vision/m4/training
1513 36
loader_builder.py
in vision/m4/sourcing/pmd
276 34
logging.py
in vision/m4/utils
115 22
web_document_extractor.py
in vision/m4/sourcing/data_collection/processors
374 20
demo_tkinter.py
in tools/smol_tools
534 19
dataset_clip_sampling.py
in vision/smolvlm2/smolvlm/datasets
611 19
dataset_utils.py
in vision/m4/training
391 19
dom_tree_simplificator.py
in vision/m4/sourcing/data_collection/processors
245 19
pair_filtering.py
in vision/m4/sourcing/data_collection/processors
307 19
dataset.py
in vision/smolvlm2/smolvlm/datasets
530 17
helpers.py
in vision/m4/sourcing/pmd
190 16
create_webdataset_tar.py
in vision/m4/utils/datasets
316 15
activation_tracker.py
in vision/m4/utils
126 15
pair_stat_dashboard.py
in vision/m4/sourcing/data_collection/visualization
359 15
chatter.py
in tools/smol_tools/smol_tools
110 14
create_table_datasets.py
in vision/data/datasets_processing_scripts/create_fine_tuning_datasets
780 13
pair_visualization.py
in vision/m4/sourcing/data_collection/visualization
566 13
packing.py
in vision/m4/training
1099 12
perceiver.py
in vision/m4/models/perceiver
397 12
313 12
web_document_image_deduplication.py
in vision/m4/sourcing/data_collection/processors
202 12
mm_utils.py
in vision/smolvlm2/smolvlm
206 11
13_final_processing.py
in vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts
276 11
timer.py
in vision/m4/utils/training
47 11
progress.py
in vision/m4/utils
59 11
2291 11
custom_modules.py
in vision/m4/models
192 11
pre_extraction_simplificator.py
in vision/m4/sourcing/data_collection/processors
159 11
agent.py
in tools/smol_tools/smol_tools
84 10
create_mixture.py
in vision/smolvlm2/scripts
115 10
builder.py
in vision/smolvlm2/smolvlm/datasets
316 10
conversation.py
in vision/smolvlm2/smolvlm
302 10
04_screenshot_html_codes.py
in vision/data/datasets_processing_scripts/build_websight_v02/python_scripts
127 10
02_02_generate_html_codes_prompt_2.py
in vision/data/datasets_processing_scripts/build_websight_v02/python_scripts
142 10
525 10
237 10
456 10
utils.py
in vision/m4/evaluation/custom_metrics
443 10
merge_on_image_individual_dataset.py
in vision/data/datasets_processing_scripts/build_concatenation_datasets_sft
129 9
02_01_generate_html_codes_prompt_1.py
in vision/data/datasets_processing_scripts/build_websight_v02/python_scripts
139 9
03_filtering_html_codes.py
in vision/data/datasets_processing_scripts/build_websight_v02/python_scripts
62 9
Files With Long Lines (Top 50)

There are 74 files with lines longer than 120 characters. In total, there are 468 long lines.

File | # lines | # units | # long lines
config.yaml
in vision/experiments/pretraining/vloom/tr_350_smolvlm_2B_2nd_stage
150 - 46
config.yaml
in vision/experiments/pretraining/vloom/tr_345_vsmollm2_256M_2nd_stage
150 - 46
config.yaml
in vision/experiments/pretraining/vloom/tr_347_smolvlm_500M_2nd_stage
150 - 46
config.yaml
in vision/experiments/pretraining/vloom/tr_346_vsmollm2_256M_3rd_stage
133 - 44
config.yaml
in vision/experiments/pretraining/vloom/tr_349_vsmollm2_500M_3rd_stage
133 - 44
SmolVLM2_Video_FT.ipynb
in vision/finetuning
2909 - 25
config.yaml
in vision/experiments/pretraining/vloom/tr_341_smolvlm_025b_1st_stage
136 - 20
config.yaml
in vision/experiments/pretraining/vloom/tr_343_smolvlm_05b_1st_stage
136 - 20
config.yaml
in vision/experiments/pretraining/vloom/tr_348_smolvlm_2B
137 - 20
pipeline.py
in text/data/smoltalk/magpie_ultra_v1
498 7 17
build_ds_sft.py
in vision/data/datasets_processing_scripts/build_concatenation_datasets_sft
3203 80 13
Smol_VLM_FT.ipynb
in vision/finetuning
646 - 11
create_mixture.py
in vision/smolvlm2/scripts
115 10 7
dataset_clip_sampling.py
in vision/smolvlm2/smolvlm/datasets
611 19 7
dataset.py
in vision/smolvlm2/smolvlm/datasets
530 17 6
13_final_processing.py
in vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts
276 11 4
dataset.py
in text/data/smoltalk/rewrite/pipeline
90 6 3
modeling_smollmm.py
in vision/smolvlm2/smolvlm/model
201 4 3
conversation.py
in vision/smolvlm2/smolvlm
302 10 3
DOM_tree_viz.html
in vision/data/datasets_processing_scripts/clean_m4_prelimenary_experiments/explore/assets
172 - 3
14_01_filter_perplexity_with_language_model.py
in vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts
121 3 3
web_document_and_filtering_visualization.py
in vision/m4/sourcing/data_collection/visualization
675 8 3
DOM_tree_viz.html
in vision/m4/sourcing/data_collection/visualization/assets
172 - 3
rewriter.py
in tools/smol_tools/smol_tools
16 2 2
demo_tkinter.py
in tools/smol_tools
534 19 2
mm_utils.py
in vision/smolvlm2/smolvlm
206 11 2
15_04_remove_opt_out_images.py
in vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts
76 3 2
09_05_merge_domain_to_duplicated_texts_sharded.py
in vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts
86 - 2
11_03_set_img_urls_dedup.py
in vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts
94 3 2
12_02_remove_opt_out_images.py
in vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts
74 3 2
15_03_remove_opt_out_documents.py
in vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts
67 3 2
05_binary_classification.py
in vision/data/datasets_processing_scripts/build_laion_coco_dataset/python_scripts
149 6 2
train_bin_classif.py
in vision/data/datasets_processing_scripts/build_laion_coco_dataset/python_scripts
431 5 2
packing.py
in vision/m4/training
1099 12 2
3015 - 2
2291 11 2
456 10 2
modeling_vllama3.py
in vision/m4/models/vllama3
1197 49 2
3272 - 2
3158 - 2
136 7 2
42 - 2
update_model_embeds.py
in vision/m4/scripts
53 - 2
base.py
in tools/smol_tools/smol_tools
55 4 1
summarizer.py
in tools/smol_tools/smol_tools
35 2 1
agent.py
in tools/smol_tools/smol_tools
84 10 1
titler.py
in tools/smol_tools/smol_tools
15 2 1
llava_onevision_config.yaml
in vision/smolvlm2/scripts/mixtures
581 - 1
create_scienceqa_old_setup.py
in vision/data/datasets_processing_scripts/create_evaluation_datasets
51 - 1
create_scienceqa.py
in vision/data/datasets_processing_scripts/create_evaluation_datasets
51 - 1