optimum_benchmark/preprocessors/dataset_preprocessor.py [13:47]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    dataset: Dataset,
    pretrained_processor: PretrainedProcessor,
    scenario_config: EnergyStarConfig,
    pretrained_config: PretrainedConfig,
) -> Dataset:
    if scenario_config.input_shapes["batch_size"] == 1:
        # Remove empty samples when batch_size is 1 because empty inputs will make the model fail
        dataset = dataset.filter(lambda example: example[scenario_config.text_column_name] != "")

    # Optionally cap the dataset at the configured number of samples (-1 keeps all)
    if scenario_config.num_samples != -1:
        dataset = dataset.select(range(scenario_config.num_samples))

    # Processors that ship without a pad token (e.g. GPT-2-style tokenizers) fall back to eos
    if getattr(pretrained_processor, "pad_token", None) is None:
        pretrained_processor.pad_token = pretrained_processor.eos_token

    # Pad only when batching; cap sequence length at the model's context window
    padding = scenario_config.input_shapes["batch_size"] != 1
    max_length = getattr(pretrained_config, "max_position_embeddings", 512)

    def tokenize_function(examples):
        return pretrained_processor(
            examples[scenario_config.text_column_name],
            truncation=scenario_config.truncation,
            max_length=max_length,
            padding=padding,
        )

    # Tokenize in batches, drop the raw columns, and expose torch tensors
    dataset = dataset.map(
        function=tokenize_function,
        desc="Running tokenizer on dataset",
        remove_columns=dataset.features,
        writer_batch_size=50,
        batched=True,
    ).with_format("torch")

    return dataset
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
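
A quick illustration of the bool `padding` flag computed above: padding is skipped entirely when batch_size is 1, and each batch is padded to its longest member otherwise. A minimal sketch, assuming GPT-2's tokenizer as a stand-in for the benchmark's pretrained processor:

# Illustration of the bool `padding` flag; "gpt2" is an illustrative
# stand-in tokenizer, not part of the benchmark code.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # same eos fallback as above

texts = ["short", "a somewhat longer example sentence"]

unpadded = tokenizer(texts, truncation=True, max_length=512, padding=False)
padded = tokenizer(texts, truncation=True, max_length=512, padding=True)

print([len(ids) for ids in unpadded["input_ids"]])  # ragged lengths, e.g. [1, 6]
print([len(ids) for ids in padded["input_ids"]])    # uniform, e.g. [6, 6]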



optimum_benchmark/preprocessors/dataset_preprocessor.py [89:123]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    dataset: Dataset,
    pretrained_processor: PretrainedProcessor,
    scenario_config: EnergyStarConfig,
    pretrained_config: PretrainedConfig,
) -> Dataset:
    if scenario_config.input_shapes["batch_size"] == 1:
        # Remove empty samples when batch_size is 1 because empty inputs will make the model fail
        dataset = dataset.filter(lambda example: example[scenario_config.text_column_name] != "")

    # Optionally cap the dataset at the configured number of samples (-1 keeps all)
    if scenario_config.num_samples != -1:
        dataset = dataset.select(range(scenario_config.num_samples))

    # Processors that ship without a pad token (e.g. GPT-2-style tokenizers) fall back to eos
    if getattr(pretrained_processor, "pad_token", None) is None:
        pretrained_processor.pad_token = pretrained_processor.eos_token

    # Pad only when batching; cap sequence length at the model's context window
    padding = scenario_config.input_shapes["batch_size"] != 1
    max_length = getattr(pretrained_config, "max_position_embeddings", 512)

    def tokenize_function(examples):
        return pretrained_processor(
            examples[scenario_config.text_column_name],
            truncation=scenario_config.truncation,
            max_length=max_length,
            padding=padding,
        )

    # Tokenize in batches, drop the raw columns, and expose torch tensors
    dataset = dataset.map(
        function=tokenize_function,
        desc="Running tokenizer on dataset",
        remove_columns=dataset.features,
        writer_batch_size=50,
        batched=True,
    ).with_format("torch")

    return dataset
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
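
A minimal end-to-end sketch of the same filter/select/tokenize/map flow on a toy in-memory dataset, assuming GPT-2's tokenizer (which ships without a pad token, so the eos fallback above actually fires); the model name and toy texts are illustrative stand-ins, not values from the benchmark config:

# Minimal end-to-end sketch of the preprocessing flow shown above.
# Assumes `datasets` and `transformers` are installed; "gpt2" and the
# toy texts are illustrative stand-ins, not benchmark config values.
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # same fallback as above

dataset = Dataset.from_dict({"text": ["hello world", "", "energy star"]})
dataset = dataset.filter(lambda ex: ex["text"] != "")  # drop empty samples

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding=True,  # pad each mapped batch to its longest member
    )

dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
).with_format("torch")

print(dataset[0]["input_ids"])  # torch tensor of token ids

Note that .with_format("torch") only changes how rows are presented at access time; the underlying Arrow data is unchanged.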



