code/scripts/lstm_alone.py [93:160]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    log_interval = 100
    batch_size = 32
    lr = 1e-3
    optimizer = 'adam'
    accumulate = None
    epochs = 20

    ## Load BERT model and vocabulary
    bert, vocabulary = nlp.model.get_model('bert_12_768_12',
                                           dataset_name='wiki_multilingual_uncased',
                                           pretrained=True,
                                           ctx=ctx,
                                           use_pooler=False,
                                           use_decoder=False,
                                           use_classifier=False)

    model = ICSL(len(vocabulary), num_slot_labels=len(label2idx), num_intents=len(intent2idx))
    model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
    model.hybridize(static_alloc=True)

    icsl_loss_function = ICSLLoss()
    icsl_loss_function.hybridize(static_alloc=True)

    ic_metric = mx.metric.Accuracy()
    sl_metric = mx.metric.Accuracy()

    ## Load labeled data
    field_separator = nlp.data.Splitter('\t')
    # fields to select from the file: utterance, slot labels, intent, uid
    field_indices = [1, 3, 4, 0]
    train_data = nlp.data.TSVDataset(filename=train_input,
                                     field_separator=field_separator,
                                     num_discard_samples=1,
                                     field_indices=field_indices)

    # use the vocabulary from pre-trained model for tokenization
    bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
    train_data_transform = train_data.transform(fn=lambda x: icsl_transform(x, vocabulary, label2idx, intent2idx, bert_tokenizer)[0])
    # create data loader
    pad_token_id = vocabulary[PAD]
    pad_label_id = label2idx[PAD]
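    # collate a mini-batch: pad the variable-length token ids and slot labels
    # to the batch maximum, stack the remaining fixed-size fields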
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Stack(),
        nlp.data.batchify.Pad(axis=0, pad_val=pad_token_id),
        nlp.data.batchify.Pad(axis=0, pad_val=pad_label_id),
        nlp.data.batchify.Stack('float32'),
        nlp.data.batchify.Stack('float32'))
    train_sampler = nlp.data.FixedBucketSampler(lengths=[len(item[1]) for item in train_data_transform],
                                                batch_size=batch_size,
                                                shuffle=True)
    train_dataloader = mx.gluon.data.DataLoader(train_data_transform,
                                                batchify_fn=batchify_fn,
                                                batch_sampler=train_sampler)

    optimizer_params = {'learning_rate': lr}
    trainer = gluon.Trainer(model.collect_params(), optimizer,
                            optimizer_params, update_on_kvstore=False)

    # Collect differentiable parameters
    params = [p for p in model.collect_params().values() if p.grad_req != 'null']
    # Set grad_req if gradient accumulation is required
    if accumulate:
        for p in params:
            p.grad_req = 'add'

    epoch_tic = time.time()
    total_num = 0
    log_num = 0
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
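
A rough sketch of the training loop that typically follows this setup in Gluon is shown below.
The batch field order and the ICSL forward/loss signatures are assumptions made for illustration,
not taken from the script:

    for epoch_id in range(epochs):
        for batch_id, batch in enumerate(train_dataloader):
            # assumed layout matching the batchify Tuple above:
            # (uid, token_ids, slot_labels, intent, valid_length)
            uid, token_ids, slot_labels, intent, valid_length = \
                [x.as_in_context(ctx) for x in batch]
            with mx.autograd.record():
                # assumed ICSL forward and loss signatures
                intent_scores, slot_scores = model(token_ids, valid_length)
                ls = icsl_loss_function(intent_scores, slot_scores,
                                        intent, slot_labels).mean()
            ls.backward()
            trainer.step(batch_size)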



code/scripts/lstm_mt.py [93:160]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    log_interval = 100
    batch_size = 32
    lr = 1e-3
    optimizer = 'adam'
    accumulate = None
    epochs = 20

    ## Load BERT model and vocabulary
    bert, vocabulary = nlp.model.get_model('bert_12_768_12',
                                           dataset_name='wiki_multilingual_uncased',
                                           pretrained=True,
                                           ctx=ctx,
                                           use_pooler=False,
                                           use_decoder=False,
                                           use_classifier=False)

    model = ICSL(len(vocabulary), num_slot_labels=len(label2idx), num_intents=len(intent2idx))
    model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
    model.hybridize(static_alloc=True)

    icsl_loss_function = ICSLLoss()
    icsl_loss_function.hybridize(static_alloc=True)

    ic_metric = mx.metric.Accuracy()
    sl_metric = mx.metric.Accuracy()

    ## Load labeled data
    field_separator = nlp.data.Splitter('\t')
    # fields to select from the file: utterance, slot labels, intent, uid
    field_indices = [1, 3, 4, 0]
    train_data = nlp.data.TSVDataset(filename=train_input,
                                     field_separator=field_separator,
                                     num_discard_samples=1,
                                     field_indices=field_indices)

    # use the vocabulary from pre-trained model for tokenization
    bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
    train_data_transform = train_data.transform(fn=lambda x: icsl_transform(x, vocabulary, label2idx, intent2idx, bert_tokenizer)[0])
    # create data loader
    pad_token_id = vocabulary[PAD]
    pad_label_id = label2idx[PAD]
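    # collate a mini-batch: pad the variable-length token ids and slot labels
    # to the batch maximum, stack the remaining fixed-size fields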
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Stack(),
        nlp.data.batchify.Pad(axis=0, pad_val=pad_token_id),
        nlp.data.batchify.Pad(axis=0, pad_val=pad_label_id),
        nlp.data.batchify.Stack('float32'),
        nlp.data.batchify.Stack('float32'))
    train_sampler = nlp.data.FixedBucketSampler(lengths=[len(item[1]) for item in train_data_transform],
                                                batch_size=batch_size,
                                                shuffle=True)
    train_dataloader = mx.gluon.data.DataLoader(train_data_transform,
                                                batchify_fn=batchify_fn,
                                                batch_sampler=train_sampler)

    optimizer_params = {'learning_rate': lr}
    trainer = gluon.Trainer(model.collect_params(), optimizer,
                            optimizer_params, update_on_kvstore=False)

    # Collect differentiable parameters
    params = [p for p in model.collect_params().values() if p.grad_req != 'null']
    # Set grad_req if gradient accumulation is required
    if accumulate:
        for p in params:
            p.grad_req = 'add'

    epoch_tic = time.time()
    total_num = 0
    log_num = 0
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
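
When accumulate is set, gradients are summed across mini-batches (grad_req='add'), so the
parameter update and gradient reset have to be driven manually every accumulate steps instead
of calling trainer.step after every batch. A minimal sketch of that pattern, with the
forward/backward part elided (variable names are the ones from the snippet; the loop body
itself is an assumption):

    for batch_id, batch in enumerate(train_dataloader):
        # ... forward pass, loss, ls.backward() as in the loop sketched above ...
        if not accumulate or (batch_id + 1) % accumulate == 0:
            # aggregate gradients explicitly (allowed since update_on_kvstore=False)
            trainer.allreduce_grads()
            trainer.update(accumulate if accumulate else 1)
            if accumulate:
                for p in params:
                    p.zero_grad()  # clear the summed gradients before the next window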



