in muss/fairseq/roberta.py
def train_roberta(
    dataset,                        # name/path of a preprocessed fairseq dataset
    sample_break_mode='complete',   # fairseq --sample-break-mode: fill samples with complete sentences
    batch_size=8192,                # target effective batch size, in sentences, across all GPUs
    max_sentences=16,               # per-GPU batch size in sentences (fairseq --batch-size)
    max_tokens=12000,               # per-GPU batch cap in tokens (fairseq --max-tokens)
    tokens_per_sample=512,          # maximum sequence length per sample
    checkpoints_dir=None,           # directory where checkpoints are written
    distributed_world_size=None,    # number of GPUs for distributed training
    sentencepiece_model_path=None,  # SentencePiece model used for tokenization
    arch='roberta_base',            # fairseq model architecture (--arch)
    dropout=0.1,                    # dropout probability
    total_updates=500000,           # total number of optimizer updates (--max-update)
    log_interval=100,               # log training stats every N updates
    peak_lr=0.0007,                 # peak learning rate for the warmup/decay schedule
    clip_norm=None,                 # gradient clipping threshold (--clip-norm); disabled if None
    no_epoch_checkpoint=False,      # skip per-epoch checkpoints (--no-epoch-checkpoints)
    validate_interval=1,            # run validation every N epochs
    save_interval=1,                # save a checkpoint every N epochs
    save_interval_updates=5000,     # also save a checkpoint every N updates
    warmup_updates=10000,           # linearly warm up the learning rate over N updates
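A minimal usage sketch, assuming the function is importable from the file path above; the dataset name and both paths below are hypothetical placeholders, not values from the repository, and every other keyword argument keeps the defaults shown in the signature:

from muss.fairseq.roberta import train_roberta

# Hypothetical invocation: 'enwiki', the checkpoints directory, and the
# SentencePiece model path are placeholders chosen for illustration.
train_roberta(
    'enwiki',                                    # preprocessed fairseq dataset (placeholder)
    checkpoints_dir='checkpoints/roberta_base',  # placeholder output directory
    distributed_world_size=8,                    # e.g. train on 8 GPUs
    sentencepiece_model_path='models/sentencepiece.model',  # placeholder path
)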