assets/training/finetune_acft_hf_nlp/components/preprocess/translation/spec.yaml (81 lines of code) (raw):
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command

# Component identity
name: translation_datapreprocess
display_name: Translation DataPreProcess
version: 0.0.73
description: >-
  Component to preprocess data for translation task. See
  [docs](https://aka.ms/azureml/components/translation_datapreprocess) to learn
  more.

# Execution settings: curated environment and the source folder that holds
# the script invoked by `command`.
is_deterministic: true
environment: azureml://registries/azureml/environments/acft-hf-nlp-gpu/versions/87
code: ../../../src/preprocess
inputs:
  # ---------------------------------------------------------------------------
  # Task arguments
  # ---------------------------------------------------------------------------
  # Sample input record:
  #   {"en": "Others have dismissed him as a joke.", "ro": "Al\u021bii l-au numit o glum\u0103."}
  # If the dataset follows the pattern above, `source_lang` is `en` and
  # `target_lang` is `ro`.
  #
  # Source language codes
  #   t5    - English (en)
  #   mbart - Arabic (ar_AR), Czech (cs_CZ), German (de_DE), English (en_XX),
  #           Spanish (es_XX), Estonian (et_EE), Finnish (fi_FI), French (fr_XX),
  #           Gujarati (gu_IN), Hindi (hi_IN), Italian (it_IT), Japanese (ja_XX),
  #           Kazakh (kk_KZ), Korean (ko_KR), Lithuanian (lt_LT), Latvian (lv_LV),
  #           Burmese (my_MM), Nepali (ne_NP), Dutch (nl_XX), Romanian (ro_RO),
  #           Russian (ru_RU), Sinhala (si_LK), Turkish (tr_TR),
  #           Vietnamese (vi_VN), Chinese, Simplified (zh_CN)
  #
  # Target language codes
  #   t5    - French (fr), German (de), Romanian (ro)
  #   mbart - Arabic (ar_AR), Czech (cs_CZ), German (de_DE), English (en_XX),
  #           Spanish (es_XX), Estonian (et_EE), Finnish (fi_FI), French (fr_XX),
  #           Gujarati (gu_IN), Hindi (hi_IN), Italian (it_IT), Japanese (ja_XX),
  #           Kazakh (kk_KZ), Korean (ko_KR), Lithuanian (lt_LT), Latvian (lv_LV),
  #           Burmese (my_MM), Nepali (ne_NP), Dutch (nl_XX), Romanian (ro_RO),
  #           Russian (ru_RU), Sinhala (si_LK), Turkish (tr_TR),
  #           Vietnamese (vi_VN), Chinese, Simplified (zh_CN)
  source_lang:
    type: string
    optional: false
    description: key for source language text in an example. This key should be an abbreviated/coded form of the language as understood by tokenizer. Please check the respective model's language codes while updating this information

  target_lang:
    type: string
    optional: false
    description: key for target language text in an example. This key should be an abbreviated/coded form of the language as understood by tokenizer. Please check the respective model's language codes while updating this information

  batch_size:
    type: integer
    min: 1
    optional: true
    default: 1000
    description: Number of examples to batch before calling the tokenization function

  # ---------------------------------------------------------------------------
  # Tokenization params
  # ---------------------------------------------------------------------------
  # Declared as a string enum ("true"/"false") rather than a YAML boolean.
  pad_to_max_length:
    type: string
    enum:
      - "true"
      - "false"
    default: "false"
    optional: true
    description: If set to True, the returned sequences will be padded according to the model's padding side and padding index, up to their `max_seq_length`. If no `max_seq_length` is specified, the padding is done up to the model's max length.

  # -1 is the sentinel for "use the model's max length" (see description).
  max_seq_length:
    type: integer
    optional: true
    default: -1
    description: Controls the maximum length to use when pad_to_max_length parameter is set to `true`. Default is -1 which means the padding is done up to the model's max length. Else will be padded to `max_seq_length`.

  # ---------------------------------------------------------------------------
  # Data inputs
  # ---------------------------------------------------------------------------
  # Either `train_file_path` or `train_mltable_path` must be passed. If both
  # are passed, the mltable path takes precedence. The validation and test
  # paths are optional; an automatic split of the train data happens when they
  # are not passed:
  #   - If both validation and test data are missing, 10% of the train data is
  #     assigned to each of them and the remaining 80% is used for training.
  #   - If only one of them is missing, 20% of the train data is assigned to it
  #     and the remaining 80% is used for training.
  train_file_path:
    type: uri_file
    optional: true
    description: Path to the registered training data asset. The supported data formats are `jsonl`, `json`, `csv`, `tsv` and `parquet`.
    mode: rw_mount

  validation_file_path:
    type: uri_file
    optional: true
    description: Path to the registered validation data asset. The supported data formats are `jsonl`, `json`, `csv`, `tsv` and `parquet`.
    mode: rw_mount

  test_file_path:
    type: uri_file
    optional: true
    description: Path to the registered test data asset. The supported data formats are `jsonl`, `json`, `csv`, `tsv` and `parquet`.
    mode: rw_mount

  train_mltable_path:
    type: mltable
    optional: true
    description: Path to the registered training data asset in `mltable` format.

  validation_mltable_path:
    type: mltable
    optional: true
    description: Path to the registered validation data asset in `mltable` format.

  test_mltable_path:
    type: mltable
    optional: true
    description: Path to the registered test data asset in `mltable` format.

  # ---------------------------------------------------------------------------
  # Model inputs
  # ---------------------------------------------------------------------------
  model_selector_output:
    type: uri_folder
    optional: false
    description: output folder of model selector containing model metadata like config, checkpoints, tokenizer config
    mode: rw_mount
outputs:
  # Single output folder; passed to the script as `--output_dir`.
  output_dir:
    type: uri_folder
    description: The folder contains the tokenized output of the train, validation and test data along with the tokenizer files used to tokenize the data
    mode: rw_mount
# Invokes preprocess.py for the Translation task. The folded scalar (`>-`)
# joins the lines below with single spaces, so the parsed command is one line.
# Arguments wrapped in `$[[...]]` correspond to inputs declared
# `optional: true`; Azure ML omits them when the input is not supplied.
command: >-
  python preprocess.py
  --task_name Translation
  --source_lang '${{inputs.source_lang}}'
  --target_lang '${{inputs.target_lang}}'
  $[[--batch_size '${{inputs.batch_size}}']]
  $[[--pad_to_max_length '${{inputs.pad_to_max_length}}']]
  $[[--max_seq_length '${{inputs.max_seq_length}}']]
  $[[--train_file_path '${{inputs.train_file_path}}']]
  $[[--validation_file_path '${{inputs.validation_file_path}}']]
  $[[--test_file_path '${{inputs.test_file_path}}']]
  $[[--train_mltable_path '${{inputs.train_mltable_path}}']]
  $[[--validation_mltable_path '${{inputs.validation_mltable_path}}']]
  $[[--test_mltable_path '${{inputs.test_mltable_path}}']]
  --model_selector_output '${{inputs.model_selector_output}}'
  --output_dir '${{outputs.output_dir}}'