def from_nmt()

in src/mlm/loaders.py [0:0]


    def from_nmt(cls, fp: TextIO, max_utts=None, vocab=None, tokenizer=None):
        """Loads hypotheses from Toan's NMT beam output format

        Each non-blank line is split on whitespace: the second-to-last field
        is parsed as the score that is kept, the last field is a second score
        that is parsed but not stored, and everything before them is the
        hypothesis text. A blank line closes the current group of hypotheses,
        which is stored under the next integer index (starting at 0). A final
        group that is not followed by a blank line is stored as well.

        Args:
            fp (TextIO): Open text stream over the .nobpe file, read line by line
            max_utts (None, optional): Number of utterances to process
            vocab (None, optional): Vocabulary, passed through to Hypotheses
            tokenizer (None, optional): Tokenizer, passed through to Hypotheses

        Returns:
            Predictions: Initialized predictions object

        Raises:
            ValueError: If either score field cannot be parsed as a float
            IndexError: If a non-blank line consists of a single numeric field
        """

        # Just a dictionary for now
        # but equipped with this factory method
        preds = cls()

        pair_idx = 0
        sents = []
        scores = []
        for line in fp:
            if max_utts is not None and max_utts <= pair_idx:
                break
            line = line.strip()
            if line == '':
                # Blank line: the current group is complete.
                preds[pair_idx] = Hypotheses(sents, scores, vocab, tokenizer)
                pair_idx += 1
                sents = []
                scores = []
                continue

            line_parts = line.split()

            # The final field is not stored, but it is still parsed so that a
            # malformed line raises ValueError here instead of passing silently.
            float(line_parts[-1])

            # TEMPORARY: FOR CATCHING IMPROPER PROCESSING, e.g. ... gedi-25.58
            # If the score field does not start with '-', assume a word was
            # adjoined to it: keep only the text after the last '-' and restore
            # the sign.
            # NOTE(review): this also negates a field that contains no '-' at
            # all (e.g. '25.58' becomes -25.58), and the adjoined word is not
            # added back to the hypothesis text -- confirm both are intended.
            neg_log_prob_ln_str = line_parts[-2]
            if not neg_log_prob_ln_str.startswith('-'):
                neg_log_prob_ln_str = '-' + neg_log_prob_ln_str.rpartition('-')[2]
            neg_log_prob_ln = float(neg_log_prob_ln_str)

            sents.append(' '.join(line_parts[:-2]))
            scores.append(neg_log_prob_ln)

        # The last group may not be terminated by a blank line (e.g. the file
        # has no trailing empty line); store it rather than silently dropping it.
        if sents and (max_utts is None or pair_idx < max_utts):
            preds[pair_idx] = Hypotheses(sents, scores, vocab, tokenizer)

        return preds