def main()

in scripts/ft_gemma3n_image_trl.py [0:0]


def main():
    """Fine-tune an image-text-to-text model on a chat-formatted dataset.

    The function runs the whole script end to end:

    1. Parse ``ScriptArguments``, ``SFTConfig`` and ``ModelConfig`` with
       ``TrlParser``.
    2. Load the processor and the model (``Gemma3nForConditionalGeneration``
       when the model name contains ``"gemma-3n"``, otherwise
       ``AutoModelForImageTextToText``).
    3. Load the dataset and apply the dataset-specific preparation step.
    4. Train with ``SFTTrainer`` using a custom collator that renders the chat
       template, processes the images and masks padding / image marker tokens
       out of the loss.
    5. Save the model and, if requested, push model and processor to the hub.
    """
    parser = TrlParser((ScriptArguments, SFTConfig, ModelConfig))
    script_args, training_args, model_args = parser.parse_args_and_config()
    training_args.gradient_checkpointing_kwargs = dict(use_reentrant=False)
    # The collator below reads the raw "messages"/"images" columns itself, so
    # column removal and dataset preparation are switched off.
    training_args.remove_unused_columns = False
    training_args.dataset_kwargs = {"skip_prepare_dataset": True}

    ################
    # Model, Tokenizer & Processor
    ################
    # "auto"/None are passed through untouched; any other value is treated as
    # the name of a torch dtype attribute (e.g. "bfloat16").
    torch_dtype = (
        model_args.torch_dtype
        if model_args.torch_dtype in ["auto", None]
        else getattr(torch, model_args.torch_dtype)
    )
    quantization_config = get_quantization_config(model_args)
    model_kwargs = dict(
        revision=model_args.model_revision,
        attn_implementation=model_args.attn_implementation,
        torch_dtype=torch_dtype,
        device_map=get_kbit_device_map() if quantization_config is not None else None,
        quantization_config=quantization_config,
    )
    processor = AutoProcessor.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code
    )
    tokenizer = processor.tokenizer
    tokenizer.padding_side = "right"

    # Evaluated once: selects both the model class and the label-masking rule.
    is_gemma3n = "gemma-3n" in model_args.model_name_or_path.lower()

    # Use appropriate model class based on model name
    model_cls = (
        Gemma3nForConditionalGeneration if is_gemma3n else AutoModelForImageTextToText
    )
    model = model_cls.from_pretrained(
        model_args.model_name_or_path,
        trust_remote_code=model_args.trust_remote_code,
        **model_kwargs,
    )

    # Token ids that must not contribute to the loss. They do not change
    # between batches, so they are resolved once instead of inside the collator.
    if is_gemma3n:
        # Gemma3n specific tokens; each marker is only masked when the
        # tokenizer actually exposes the corresponding attribute.
        ignored_token_ids = [tokenizer.pad_token_id]
        for attr_name in ("image_token_id", "boi_token_id", "eoi_token_id"):
            if hasattr(tokenizer, attr_name):
                ignored_token_ids.append(getattr(tokenizer, attr_name))
    else:
        # Original masking for other models. The id is kept as a scalar:
        # comparing a tensor with a Python list does not give an element-wise
        # mask, so a list-wrapped id would silently mask nothing.
        boi_token_id = tokenizer.convert_tokens_to_ids(
            tokenizer.special_tokens_map["boi_token"]
        )
        # 262144 is a hard-coded id; presumably the Gemma 3 image soft token
        # -- TODO confirm against the model config.
        ignored_token_ids = [tokenizer.pad_token_id, boi_token_id, 262144]
    # An unset id (None) can never match a label, so drop it up front.
    ignored_token_ids = [tid for tid in ignored_token_ids if tid is not None]

    def collate_fn(examples):
        """Turn a list of dataset rows into a padded model batch with labels."""
        texts = []
        images_list = []

        for example in examples:
            # Apply chat template to get text
            text = processor.apply_chat_template(
                example["messages"], tokenize=False, add_generation_prompt=False
            ).strip()
            texts.append(text)

            # Extract images
            if "images" in example:  # single-image case
                images = [img.convert("RGB") for img in example["images"]]
            else:  # multi-image case or intersection dataset
                images = process_vision_info(example["messages"])
            images_list.append(images)

        # Tokenize the texts and process the images
        batch = processor(
            text=texts, images=images_list, return_tensors="pt", padding=True
        )

        # The labels are the input_ids, with padding and image marker tokens
        # replaced by -100 to exclude them from the loss computation.
        labels = batch["input_ids"].clone()
        for token_id in ignored_token_ids:
            labels[labels == token_id] = -100

        batch["labels"] = labels
        return batch

    ################
    # Dataset
    ################
    dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)

    # Handle different dataset formats
    if script_args.dataset_name == "FanqingM/MMIU-Benchmark":
        dataset = prepare_dataset(
            dataset, script_args.dataset_name, script_args.dataset_train_split
        )
    elif script_args.dataset_name == "ariG23498/intersection-dataset":
        # Format intersection dataset
        dataset = dataset.map(
            format_intersection_data, batched=True, batch_size=4, num_proc=4
        )

    ################
    # Training
    ################
    # Only hand over an eval split when evaluation is enabled.
    eval_dataset = (
        dataset[script_args.dataset_test_split]
        if training_args.eval_strategy != "no"
        else None
    )
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        data_collator=collate_fn,
        train_dataset=dataset[script_args.dataset_train_split],
        eval_dataset=eval_dataset,
        processing_class=tokenizer,
        peft_config=my_get_peft_config(model_args),
    )

    trainer.train()

    # Save and push to hub
    trainer.save_model(training_args.output_dir)
    if training_args.push_to_hub:
        trainer.push_to_hub(dataset_name=script_args.dataset_name)
        # The trainer only received the tokenizer, so the full processor is
        # pushed separately, and only from the main process.
        if trainer.accelerator.is_main_process:
            processor.push_to_hub(training_args.hub_model_id)