def fit()

in tools/utilities/pythonlibs/audio/training/train_classifier.py


    def fit(self, training_data, validation_data, options, model, device=None, detail=False, run=None):
        """
        Perform the training.  This is not called "train" because the base class already defines
        that method with a different meaning.  The base class "train" method puts the Module into
        "training mode".
        """
        print("Training {} using {} rows of featurized training input...".format(self.name(), training_data.num_rows))

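        # Cache the featurizer's mean and standard deviation as tensors on the target
        # device so each audio batch can be normalized before the forward pass below.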
        if training_data.mean is not None:
            self.mean = torch.from_numpy(np.array([[training_data.mean]])).to(device)
            self.std = torch.from_numpy(np.array([[training_data.std]])).to(device)
        else:
            self.mean = None
            self.std = None

        start = time.time()
        loss_function = nn.NLLLoss()
        initial_rate = options.learning_rate
        lr_scheduler = options.lr_scheduler
        oo = options.optimizer_options
        self.training = True

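        # Construct the optimizer named in options.optimizer; each branch forwards only
        # the hyperparameters (from options.optimizer_options) that optimizer accepts.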
        if options.optimizer == "Adadelta":
            optimizer = optim.Adadelta(self.parameters(), lr=initial_rate, weight_decay=oo.weight_decay,
                                       rho=oo.rho, eps=oo.eps)
        elif options.optimizer == "Adagrad":
            optimizer = optim.Adagrad(self.parameters(), lr=initial_rate, weight_decay=oo.weight_decay,
                                      lr_decay=oo.lr_decay)
        elif options.optimizer == "Adam":
            optimizer = optim.Adam(self.parameters(), lr=initial_rate, weight_decay=oo.weight_decay,
                                   betas=oo.betas, eps=oo.eps)
        elif options.optimizer == "Adamax":
            optimizer = optim.Adamax(self.parameters(), lr=initial_rate, weight_decay=oo.weight_decay,
                                     betas=oo.betas, eps=oo.eps)
        elif options.optimizer == "ASGD":
            optimizer = optim.ASGD(self.parameters(), lr=initial_rate, weight_decay=oo.weight_decay,
                                   lambd=oo.lambd, alpha=oo.alpha, t0=oo.t0)
        elif options.optimizer == "RMSprop":
            optimizer = optim.RMSprop(self.parameters(), lr=initial_rate, weight_decay=oo.weight_decay,
                                      eps=oo.eps, alpha=oo.alpha, momentum=oo.momentum, centered=oo.centered)
        elif options.optimizer == "Rprop":
            optimizer = optim.Rprop(self.parameters(), lr=initial_rate, etas=oo.etas,
                                    step_sizes=oo.step_sizes)
        elif options.optimizer == "SGD":
            optimizer = optim.SGD(self.parameters(), lr=initial_rate, weight_decay=oo.weight_decay,
                                  momentum=oo.momentum, dampening=oo.dampening)
        else:
            raise ValueError("Unsupported optimizer: {}".format(options.optimizer))

        print(optimizer)
        num_epochs = options.max_epochs
        batch_size = options.batch_size
        learning_rate = options.learning_rate
        lr_min = options.lr_min
        lr_peaks = options.lr_peaks
        ticks = training_data.num_rows / batch_size  # iterations per epoch

        # Calculation of total iterations for non-rolling vs. rolling training,
        # where ticks = num_rows / batch_size (iterations per epoch).
        # Non-rolling training:
        #     total iterations = num_epochs * ticks
        # Rolling training, with irl = initial rolling length (2 here):
        #     if num_epochs <= max_rolling_length:
        #         total iterations = sum(range(irl, irl + num_epochs))
        #     if num_epochs > max_rolling_length:
        #         total iterations = sum(range(irl, irl + max_rolling_length)) + (num_epochs - max_rolling_length) * ticks
        if options.rolling:
            rolling_length = 2
            max_rolling_length = int(ticks)
            if max_rolling_length > options.max_rolling_length + rolling_length:
                max_rolling_length = options.max_rolling_length + rolling_length
            bag_count = 100
            hidden_bag_size = batch_size * bag_count
            if num_epochs + rolling_length < max_rolling_length:
                max_rolling_length = num_epochs + rolling_length
            total_iterations = sum(range(rolling_length, max_rolling_length))
            if num_epochs + rolling_length > max_rolling_length:
                epochs_remaining = num_epochs + rolling_length - max_rolling_length
                total_iterations += epochs_remaining * training_data.num_rows / batch_size
            ticks = total_iterations / num_epochs
        else:
            total_iterations = ticks * num_epochs
        gamma = options.lr_gamma

        if not lr_min:
            lr_min = learning_rate
        scheduler = None
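        # Optionally wrap the optimizer in a learning rate scheduler.  TriangularLR and
        # ExponentialResettingLR are not part of torch.optim; they appear to be custom
        # schedulers used by this trainer.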
        if lr_scheduler == "TriangleLR":
            steps = lr_peaks * 2 + 1
            stepsize = num_epochs / steps
            scheduler = TriangularLR(optimizer, stepsize * ticks, lr_min, learning_rate, gamma)
        elif lr_scheduler == "CosineAnnealingLR":
            # divide by odd number to finish on the minimum learning rate
            cycles = lr_peaks * 2 + 1
            scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_iterations / cycles,
                                                             eta_min=lr_min)
        elif lr_scheduler == "ExponentialLR":
            scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma)
        elif lr_scheduler == "StepLR":
            scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=options.lr_step_size, gamma=gamma)
        elif lr_scheduler == "ExponentialResettingLR":
            reset = (num_epochs * ticks) / 3  # reset at the 1/3 mark.
            scheduler = ExponentialResettingLR(optimizer, gamma, reset)

        log = []
        iteration = 0  # global batch counter across all epochs (used for per-iteration logging)
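        # Main training loop: one full pass over the featurized training data per epoch.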
        for epoch in range(num_epochs):
            self.train()
            if options.rolling:
                rolling_length += 1
                if rolling_length <= max_rolling_length:
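                    # Allocate a "bag" of saved hidden (and, for LSTMs, cell) states per layer;
                    # each batch below scatters its final state into random slots of the bag and
                    # draws its next initial state from the front of the bag.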
                    hidden1_bag = torch.from_numpy(np.zeros([1, hidden_bag_size, model.hidden_units],
                                                            dtype=np.float32)).to(device)
                    if model.architecture == 'LSTM':
                        cell1_bag = torch.from_numpy(np.zeros([1, hidden_bag_size, model.hidden_units],
                                                              dtype=np.float32)).to(device)
                    if model.num_layers >= 2:
                        hidden2_bag = torch.from_numpy(np.zeros([1, hidden_bag_size, model.hidden_units],
                                                                dtype=np.float32)).to(device)
                        if model.architecture == 'LSTM':
                            cell2_bag = torch.from_numpy(np.zeros([1, hidden_bag_size, model.hidden_units],
                                                                  dtype=np.float32)).to(device)
                    if model.num_layers == 3:
                        hidden3_bag = torch.from_numpy(np.zeros([1, hidden_bag_size, training_data.num_keywords],
                                                                dtype=np.float32)).to(device)
                        if model.architecture == 'LSTM':
                            cell3_bag = torch.from_numpy(np.zeros([1, hidden_bag_size, training_data.num_keywords],
                                                                  dtype=np.float32)).to(device)
            for i_batch, (audio, labels) in enumerate(training_data.get_data_loader(batch_size)):
                if not self.batch_first:
                    audio = audio.transpose(1, 0)  # GRU wants seq,batch,feature

                if device:
                    audio = audio.to(device)
                    labels = labels.to(device)

                # Also, we need to clear out the hidden state,
                # detaching it from its history on the last instance.
                if options.rolling:
                    if rolling_length <= max_rolling_length:
                        if (i_batch + 1) % rolling_length == 0:
                            self.init_hidden()
                            break
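                    # Scatter the current hidden state into batch_size random slots of the bag,
                    # then reseed the hidden state from the first batch_size slots.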
                    shuffled_indices = list(range(hidden_bag_size))
                    np.random.shuffle(shuffled_indices)
                    temp_indices = shuffled_indices[:batch_size]
                    if model.architecture == 'LSTM':
                        if self.hidden1 is not None:
                            hidden1_bag[:, temp_indices, :], cell1_bag[:, temp_indices, :] = self.hidden1
                            self.hidden1 = (hidden1_bag[:, 0:batch_size, :], cell1_bag[:, 0:batch_size, :])
                            if model.num_layers >= 2:
                                hidden2_bag[:, temp_indices, :], cell2_bag[:, temp_indices, :] = self.hidden2
                                self.hidden2 = (hidden2_bag[:, 0:batch_size, :], cell2_bag[:, 0:batch_size, :])
                            if model.num_layers == 3:
                                hidden3_bag[:, temp_indices, :], cell3_bag[:, temp_indices, :] = self.hidden3
                                self.hidden3 = (hidden3_bag[:, 0:batch_size, :], cell3_bag[:, 0:batch_size, :])
                    else:
                        if self.hidden1 is not None:
                            hidden1_bag[:, temp_indices, :] = self.hidden1
                            self.hidden1 = hidden1_bag[:, 0:batch_size, :]
                            if model.num_layers >= 2:
                                hidden2_bag[:, temp_indices, :] = self.hidden2
                                self.hidden2 = hidden2_bag[:, 0:batch_size, :]
                            if model.num_layers == 3:
                                hidden3_bag[:, temp_indices, :] = self.hidden3
                                self.hidden3 = hidden3_bag[:, 0:batch_size, :]
                else:
                    self.init_hidden()

                # Before the backward pass, use the optimizer object to zero all of the
                # gradients for the variables it will update (which are the learnable
                # weights of the model). This is because by default, gradients are
                # accumulated in buffers( i.e, not overwritten) whenever .backward()
                # is called. Checkout docs of torch.autograd.backward for more details.
                optimizer.zero_grad()

                # optionally normalize the audio
                if self.mean is not None:
                    audio = (audio - self.mean) / self.std

                # Run our forward pass.
                keyword_scores = self(audio)

                # Compute the loss, gradients
                loss = loss_function(keyword_scores, labels)

                # Backward pass: compute gradient of the loss with respect to all the learnable
                # parameters of the model. Internally, the parameters of each Module are stored
                # in Tensors with requires_grad=True, so this call will compute gradients for
                # all learnable parameters in the model.
                loss.backward()
                # Calling the step function on an Optimizer makes an update to its parameters,
                # applying the gradients we computed during back propagation.
                optimizer.step()

                # Move to the next learning rate.  The scheduler is stepped once per batch,
                # after optimizer.step(), as current PyTorch versions expect.
                if scheduler:
                    scheduler.step()

                iteration += 1
                learning_rate = optimizer.param_groups[0]['lr']
                if detail:
                    log += [{'iteration': iteration, 'loss': loss.item(), 'learning_rate': learning_rate}]
            # Find the best prediction in each sequence and return its accuracy.
            passed, total, rate = self.evaluate(validation_data, batch_size, device)
            learning_rate = optimizer.param_groups[0]['lr']
            current_loss = float(loss.item())
            print("Epoch {}, Loss {:.3f}, Validation Accuracy {:.3f}, Learning Rate {}".format(
                  epoch, current_loss, rate * 100, learning_rate))
            log += [{'epoch': epoch, 'loss': current_loss, 'accuracy': rate, 'learning_rate': learning_rate}]
            if run is not None:
                run.log('progress', epoch / num_epochs)
                run.log('epoch', epoch)
                run.log('accuracy', rate)
                run.log('loss', current_loss)
                run.log('learning_rate', learning_rate)

        end = time.time()
        self.training = False
        print("Trained in {:.2f} seconds".format(end - start))
        return log
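
The rolling-training branch above keeps a "bag" of recent hidden states and reseeds each batch's initial state from it. Below is a minimal standalone sketch of that trick; the sizes and variable names are illustrative only and are not taken from train_classifier.py.

    import numpy as np
    import torch

    batch_size = 32
    bag_count = 100            # the code above also uses a bag of 100 batches
    hidden_units = 128
    hidden_bag_size = batch_size * bag_count

    # Bag of saved hidden states: (num_layers * num_directions, bag, hidden_units).
    hidden_bag = torch.zeros(1, hidden_bag_size, hidden_units)

    # Stand-in for the hidden state produced by the previous forward pass.
    current_hidden = torch.randn(1, batch_size, hidden_units)

    # Scatter the current hidden state into batch_size random slots of the bag ...
    slots = list(range(hidden_bag_size))
    np.random.shuffle(slots)
    slots = slots[:batch_size]
    hidden_bag[:, slots, :] = current_hidden

    # ... then seed the next batch from the front of the bag.  The bag tensor carries
    # no autograd history, so back-propagation stops at the batch boundary.
    next_hidden = hidden_bag[:, 0:batch_size, :]
    print(next_hidden.shape)  # torch.Size([1, 32, 128])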