src/predict.py [187:257]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        assert seq_tokenizer is not None or struct_tokenizer is not None or args.embedding_type is not None
        if seq_tokenizer:
            if subword:
                seq_to_list = subword.process_line(protein_seq).split(" ")
            else:
                seq_to_list = list(protein_seq)
            cur_seq_len = len(seq_to_list)
            # Reserve two positions for the special tokens added by encode_plus below.
            if cur_seq_len > args.seq_max_length - 2:
                if args.trunc_type == "left":
                    # keep the tail of the sequence
                    seq_to_list = seq_to_list[2 - args.seq_max_length:]
                else:
                    # keep the head of the sequence
                    seq_to_list = seq_to_list[:args.seq_max_length - 2]
            seq = " ".join(seq_to_list)
            inputs = seq_tokenizer.encode_plus(
                seq,
                None,
                add_special_tokens=True,
                max_length=args.seq_max_length,
                truncation=True
            )
            # input_ids: token index list
            # token_type_ids: token type index list
            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
            real_token_len = len(input_ids)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = args.seq_max_length - len(input_ids)

            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

            assert len(input_ids) == args.seq_max_length, "Error with input length {} vs {}".format(len(input_ids), args.seq_max_length)
            assert len(attention_mask) == args.seq_max_length, "Error with input length {} vs {}".format(len(attention_mask), args.seq_max_length)
            assert len(token_type_ids) == args.seq_max_length, "Error with input length {} vs {}".format(len(token_type_ids), args.seq_max_length)
        else:
            input_ids = None
            attention_mask = None
            token_type_ids = None
            real_token_len = None
        if struct_tokenizer:
            # for structure
            cur_seq_len = len(protein_seq)
            seq_list = list(protein_seq)
            if cur_seq_len > args.struct_max_length:
                if args.trunc_type == "left":
                    seq_list = seq_list[-args.struct_max_length:]
                else:
                    seq_list = seq_list[:args.struct_max_length]
            seq = " ".join(seq_list)
            inputs = struct_tokenizer.encode_plus(
                seq,
                None,
                add_special_tokens=False,
                max_length=args.struct_max_length,
                truncation=True,
                return_token_type_ids=False,
            )
            struct_input_ids = inputs["input_ids"]
            real_struct_node_size = len(struct_input_ids)
            padding_length = max(0, args.struct_max_length - real_struct_node_size)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
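
Side note: for the common configuration here (mask_padding_with_zero=True, pad_on_left=False),
the manual truncate-and-pad bookkeeping above is what recent Hugging Face transformers releases
expose directly via padding="max_length". A minimal sketch, assuming such a tokenizer; the
checkpoint name and the seq_max_length value are placeholders for the example:

    from transformers import AutoTokenizer

    seq_max_length = 128
    tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert")

    protein_seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"
    seq = " ".join(list(protein_seq))  # residues are space-separated, as in the scripts above

    inputs = tokenizer.encode_plus(
        seq,
        add_special_tokens=True,
        max_length=seq_max_length,
        truncation=True,
        padding="max_length",  # pads input_ids / attention_mask / token_type_ids in one step
    )
    assert len(inputs["input_ids"]) == seq_max_length
    assert len(inputs["attention_mask"]) == seq_max_length

Left padding (the pad_on_left branch) maps to tokenizer.padding_side = "left", but the custom
left/right truncation branch has no single-call equivalent, which is presumably why both
scripts slice the token list by hand before encoding.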



src/predict_many_samples.py [183:253]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    assert seq_tokenizer is not None or struct_tokenizer is not None or args.embedding_type is not None
    if seq_tokenizer:
        if subword:
            seq_to_list = subword.process_line(protein_seq).split(" ")
        else:
            seq_to_list = list(protein_seq)
        cur_seq_len = len(seq_to_list)
        # Reserve two positions for the special tokens added by encode_plus below.
        if cur_seq_len > args.seq_max_length - 2:
            if args.trunc_type == "left":
                # keep the tail of the sequence
                seq_to_list = seq_to_list[2 - args.seq_max_length:]
            else:
                # keep the head of the sequence
                seq_to_list = seq_to_list[:args.seq_max_length - 2]
        seq = " ".join(seq_to_list)
        inputs = seq_tokenizer.encode_plus(
            seq,
            None,
            add_special_tokens=True,
            max_length=args.seq_max_length,
            truncation=True
        )
        # input_ids: token index list
        # token_type_ids: token type index list
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        real_token_len = len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = args.seq_max_length - len(input_ids)

        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == args.seq_max_length, "Error with input length {} vs {}".format(len(input_ids), args.seq_max_length)
        assert len(attention_mask) == args.seq_max_length, "Error with input length {} vs {}".format(len(attention_mask), args.seq_max_length)
        assert len(token_type_ids) == args.seq_max_length, "Error with input length {} vs {}".format(len(token_type_ids), args.seq_max_length)
    else:
        input_ids = None
        attention_mask = None
        token_type_ids = None
        real_token_len = None
    if struct_tokenizer:
        # for structure
        cur_seq_len = len(protein_seq)
        seq_list = list(protein_seq)
        if cur_seq_len > args.struct_max_length:
            if args.trunc_type == "left":
                seq_list = seq_list[-args.struct_max_length:]
            else:
                seq_list = seq_list[:args.struct_max_length]
        seq = " ".join(seq_list)
        inputs = struct_tokenizer.encode_plus(
            seq,
            None,
            add_special_tokens=False,
            max_length=args.struct_max_length,
            truncation=True,
            return_token_type_ids=False,
        )
        struct_input_ids = inputs["input_ids"]
        real_struct_node_size = len(struct_input_ids)
        padding_length = max(0, args.struct_max_length - real_struct_node_size)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
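
The truncation slices in both excerpts are easy to misread: with trunc_type == "left" the code
keeps the last seq_max_length - 2 residues (seq_to_list[2 - args.seq_max_length:] is a
negative-start slice), otherwise the first seq_max_length - 2; the two reserved positions are
for the special tokens that encode_plus adds afterwards. A small self-contained illustration,
with toy values chosen only to make the slice behavior visible:

    seq_max_length = 8
    protein_seq = "ACDEFGHIKLMN"        # 12 residues > 8 - 2, so truncation applies
    seq_to_list = list(protein_seq)

    kept_left = seq_to_list[2 - seq_max_length:]    # trunc_type == "left": last 6 residues
    kept_right = seq_to_list[:seq_max_length - 2]   # otherwise: first 6 residues

    print("".join(kept_left))    # HIKLMN
    print("".join(kept_right))   # ACDEFG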



