in codegen_sources/model/src/data/loader.py [0:0]
def load_mono_data(params, data):
"""
Load monolingual data.
"""
data["mono"] = {}
data["mono_stream"] = {}
for lang in params.mono_dataset.keys():
logger.info("============ Monolingual data (%s)" % lang)
assert lang in params.langs and lang not in data["mono"]
data["mono"][lang] = {}
data["mono_stream"][lang] = {}
for splt, data_path in params.mono_dataset[lang].items():
            if splt == SELF_TRAINED and lang not in params.st_src_langs:
                # skip the self-training split when this language is not a self-training source
                continue
# no need to load training data for evaluation
if splt in TRAIN_SPLITS and params.eval_only:
continue
# load data / update dictionary parameters / update data
mono_data = load_binarized(data_path, params)
set_dico_parameters(params, data, mono_data["dico"])
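            # (set_dico_parameters also stores the dictionary on data["dico"] and
            # checks it is consistent with any previously loaded dataset)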
# create stream dataset
bs = params.batch_size if splt == "train" else 1
data["mono_stream"][lang][splt] = StreamDataset(
mono_data["sentences"], mono_data["positions"], bs, params
)
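            # (StreamDataset iterates over `bs` parallel streams of contiguous
            # tokens; a single stream at evaluation time covers the split in order)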
            # if there are several processes on the same machine, we can split the dataset
            n_stream_batches = data["mono_stream"][lang][splt].n_batches
            if (
                splt in TRAIN_SPLITS
                and params.split_data
                and 1 < params.n_gpu_per_node <= n_stream_batches
            ):
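                # carve out this process's contiguous slice of the stream batches;
                # e.g. (hypothetical numbers) 100 batches over 4 GPUs: rank r keeps
                # batches [25 * r, 25 * (r + 1)), and any remainder is dropped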
                n_batches = n_stream_batches // params.n_gpu_per_node
                a = n_batches * params.local_rank
                b = a + n_batches
                data["mono_stream"][lang][splt].select_data(a, b)
            # denoising auto-encoding, online back-translation, cmt / disc steps, and
            # self-training need a non-stream (batched) dataset
            if (
                lang in params.ae_steps
                or lang in params.bt_src_langs
                or lang in [l1 for l1, l2 in params.cmt_steps]
                or lang in [l1 for l1, l2 in params.disc_steps]
                or (lang in params.st_src_langs and splt == SELF_TRAINED)
            ):
# create batched dataset
dataset = Dataset(
mono_data["sentences"],
mono_data["positions"],
params,
has_sentence_ids=(splt, (lang,)) in params.has_sentence_ids,
unit_tests_st=splt == SELF_TRAINED,
)
# remove empty and too long sentences
if splt in TRAIN_SPLITS:
dataset.remove_empty_sentences()
dataset.remove_long_sentences(params.max_len)
                if splt == SELF_TRAINED:
                    dataset.compute_st_scores(params, data["dico"])
                    data["java_st_unit_tests"] = dataset.unit_tests
                    data["java_st_tests_scores"] = dataset.st_tests_scores
# if there are several processes on the same machine, we can split the dataset
if (
splt in TRAIN_SPLITS
and params.n_gpu_per_node > 1
and params.split_data
):
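                    # same rank-based slicing as for the stream dataset above,
                    # but over sentences rather than stream batches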
                    n_sent = len(dataset) // params.n_gpu_per_node
                    a = n_sent * params.local_rank
                    b = a + n_sent
                    dataset.select_data(a, b)
data["mono"][lang][splt] = dataset
logger.info("")
logger.info("")