src/app/app.py [85:243]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    '''
    load the config, tokenizers, model, and label mappings
    :param args: runtime arguments (device, encoder flags, label_filepath, ...)
    :param model_dir: directory containing config.json, the tokenizer vocabularies, and the model weights
    :return: config, subword, seq_tokenizer, struct_tokenizer, model, label_id_2_name, label_name_2_id
    '''
    # load tokenizer and model
    device = torch.device(args.device)
    config_class, model_class, tokenizer_class = BertConfig, SequenceAndStructureFusionNetwork, BertTokenizer

    # load the model config with an explicit encoding and close the file handle afterwards
    with open(os.path.join(model_dir, "config.json"), "r", encoding="UTF-8") as config_fp:
        config = config_class(**json.load(config_fp))
    # for sequence
    subword = None
    if args.has_seq_encoder:
        seq_tokenizer = tokenizer_class.from_pretrained(
            os.path.join(model_dir, "sequence"),
            do_lower_case=args.do_lower_case
        )
        if args.subword:
            # the BPE codes file is fully consumed by the BPE constructor, so the handle can be closed afterwards
            with codecs.open(args.codes_file) as bpe_codes_prot:
                subword = BPE(bpe_codes_prot, merges=-1, separator='')
    else:
        seq_tokenizer = None

    if args.has_struct_encoder:
        struct_tokenizer = tokenizer_class.from_pretrained(
            os.path.join(model_dir, "struct"),
            do_lower_case=args.do_lower_case
        )
    else:
        struct_tokenizer = None

    model = model_class.from_pretrained(model_dir, args=args)

    model.to(device)
    model.eval()

    # load labels
    label_filepath = args.label_filepath
    label_id_2_name = {}
    label_name_2_id = {}
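    # expected label file layout (illustrative class names): a "label" header line,
    # then one class name per line, e.g.
    #   label
    #   Enzyme
    #   Non-enzyme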
    with open(label_filepath, "r") as fp:
        for line in fp:
            # skip the header row
            if line.strip() == "label":
                continue
            label_name = line.strip()
            label_id_2_name[len(label_id_2_name)] = label_name
            label_name_2_id[label_name] = len(label_name_2_id)

    print("-" * 25 + "label_id_2_name:" + "-" * 25)
    if len(label_id_2_name) < 20:
        print(label_id_2_name)
    print("label size: ", len(label_id_2_name))
    print("-" * 60)

    return config, subword, seq_tokenizer, struct_tokenizer, model, label_id_2_name, label_name_2_id
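
# --- illustrative usage sketch; not part of the original source ---
# Assuming the loader above is exposed as `load_model(args, model_dir)` (the enclosing
# `def` line falls outside this excerpt) and that `args` carries the flags it reads
# (device, has_seq_encoder, has_struct_encoder, subword, codes_file, do_lower_case,
# label_filepath), the returned objects would feed the transform below roughly like so:
#
#   config, subword, seq_tokenizer, struct_tokenizer, model, label_id_2_name, label_name_2_id = \
#       load_model(args, model_dir)
#   row = ["P12345", "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"]  # hypothetical [protein_id, seq]
#   transform_sample_2_feature(args, row, seq_tokenizer, subword, struct_tokenizer)
#   # (the transform's return value is not shown in this excerpt)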


def transform_sample_2_feature(
        args,
        row,
        seq_tokenizer,
        subword,
        struct_tokenizer,
        pad_on_left=False,
        pad_token=0,
        pad_token_segment_id=0,
        mask_padding_with_zero=True
):
    '''
    transform one sample into model input features
    :param args: runtime arguments (seq_max_length, struct_max_length, trunc_type, embedding_type, ...)
    :param row: [protein_id, seq]
    :param seq_tokenizer: tokenizer for the sequence encoder (None if unused)
    :param subword: BPE subword model (None for character-level tokenization)
    :param struct_tokenizer: tokenizer for the structure encoder (None if unused)
    :param pad_on_left: pad on the left instead of the right
    :param pad_token: id used to pad input_ids
    :param pad_token_segment_id: id used to pad token_type_ids
    :param mask_padding_with_zero: use 1 for real tokens and 0 for padding in the attention mask
    :return:
    '''
    features = []
    batch_info = []
    # id, seq
    prot_id, protein_seq = row[0], row[1]
    batch_info.append(row)
    assert seq_tokenizer is not None or struct_tokenizer is not None or args.embedding_type is not None
    if seq_tokenizer:
        if subword:
            # BPE subword tokenization of the protein sequence
            seq_to_list = subword.process_line(protein_seq).split(" ")
        else:
            # character-level tokenization: one token per residue
            seq_to_list = list(protein_seq)
        cur_seq_len = len(seq_to_list)
        # reserve 2 positions for the special tokens ([CLS] and [SEP]) added by encode_plus
        if cur_seq_len > args.seq_max_length - 2:
            if args.trunc_type == "left":
                seq_to_list = seq_to_list[2 - args.seq_max_length:]
            else:
                seq_to_list = seq_to_list[:args.seq_max_length - 2]
        seq = " ".join(seq_to_list)
        inputs = seq_tokenizer.encode_plus(
            seq,
            None,
            add_special_tokens=True,
            max_length=args.seq_max_length,
            truncation=True
        )
        # input_ids: token index list
        # token_type_ids: token type index list
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        real_token_len = len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = args.seq_max_length - len(input_ids)
        attention_mask_padding_length = padding_length

        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * attention_mask_padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * attention_mask_padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
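
        # illustrative worked example of the padding above (hypothetical token ids,
        # seq_max_length=8, pad_on_left=False, pad_token=0, mask_padding_with_zero=True):
        #   encode_plus   -> input_ids      = [CLS, M, K, T, SEP]            (real_token_len = 5)
        #   after padding    input_ids      = [CLS, M, K, T, SEP, 0, 0, 0]
        #                    attention_mask = [1,   1, 1, 1, 1,   0, 0, 0]
        #                    token_type_ids = [0,   0, 0, 0, 0,   0, 0, 0]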

        assert len(input_ids) == args.seq_max_length, "Error with input length {} vs {}".format(len(input_ids), args.seq_max_length)
        assert len(attention_mask) == args.seq_max_length, "Error with input length {} vs {}".format(len(attention_mask), args.seq_max_length)
        assert len(token_type_ids) == args.seq_max_length, "Error with input length {} vs {}".format(len(token_type_ids), args.seq_max_length)
    else:
        input_ids = None
        attention_mask = None
        token_type_ids = None
        real_token_len = None
    if struct_tokenizer:
        # for structure: character-level tokens, no special tokens, truncated to struct_max_length
        cur_seq_len = len(protein_seq)
        seq_list = list(protein_seq)
        if cur_seq_len > args.struct_max_length:
            if args.trunc_type == "left":
                seq_list = seq_list[-args.struct_max_length:]
            else:
                seq_list = seq_list[:args.struct_max_length]
        seq = " ".join(seq_list)
        inputs = struct_tokenizer.encode_plus(
            seq,
            None,
            add_special_tokens=False,
            max_length=args.struct_max_length,
            truncation=True,
            return_token_type_ids=False,
        )
        struct_input_ids = inputs["input_ids"]
        real_struct_node_size = len(struct_input_ids)
        padding_length = max(args.struct_max_length - real_struct_node_size, 0)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
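
The excerpt above delegates token-to-id mapping to BertTokenizer.encode_plus. The sketch below is a minimal, self-contained re-implementation of only the sequence-side steps it shows (character-level tokenization, truncation that reserves two positions for the special tokens, and right padding); the CLS/SEP/pad ids and the toy vocabulary are hypothetical placeholders rather than the real tokenizer's vocabulary.

def build_seq_features(protein_seq, seq_max_length=8, trunc_type="right",
                       cls_id=2, sep_id=3, pad_token=0):
    # character-level tokenization: one token per residue
    tokens = list(protein_seq)
    # reserve 2 positions for the special tokens, truncating on the requested side
    if len(tokens) > seq_max_length - 2:
        tokens = tokens[2 - seq_max_length:] if trunc_type == "left" else tokens[:seq_max_length - 2]
    # hypothetical vocabulary: map each residue character to an id (ids 0-3 reserved)
    vocab = {ch: idx + 4 for idx, ch in enumerate(sorted(set(tokens)))}
    input_ids = [cls_id] + [vocab[ch] for ch in tokens] + [sep_id]
    attention_mask = [1] * len(input_ids)   # 1 for real tokens
    token_type_ids = [0] * len(input_ids)   # single segment
    # zero-pad on the right up to seq_max_length
    padding_length = seq_max_length - len(input_ids)
    input_ids += [pad_token] * padding_length
    attention_mask += [0] * padding_length
    token_type_ids += [0] * padding_length
    return input_ids, attention_mask, token_type_ids


if __name__ == "__main__":
    # prints ([2, 7, 6, 8, 4, 9, 5, 3], [1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0])
    print(build_seq_features("MKTAYIAKQR"))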



src/predict_one_sample.py [97:255]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    '''
    load the config, tokenizers, model, and label mappings
    :param args: runtime arguments (device, encoder flags, label_filepath, ...)
    :param model_dir: directory containing config.json, the tokenizer vocabularies, and the model weights
    :return: config, subword, seq_tokenizer, struct_tokenizer, model, label_id_2_name, label_name_2_id
    '''
    # load tokenizer and model
    device = torch.device(args.device)
    config_class, model_class, tokenizer_class = BertConfig, SequenceAndStructureFusionNetwork, BertTokenizer

    # load the model config with an explicit encoding and close the file handle afterwards
    with open(os.path.join(model_dir, "config.json"), "r", encoding="UTF-8") as config_fp:
        config = config_class(**json.load(config_fp))
    # for sequence
    subword = None
    if args.has_seq_encoder:
        seq_tokenizer = tokenizer_class.from_pretrained(
            os.path.join(model_dir, "sequence"),
            do_lower_case=args.do_lower_case
        )
        if args.subword:
            # the BPE codes file is fully consumed by the BPE constructor, so the handle can be closed afterwards
            with codecs.open(args.codes_file) as bpe_codes_prot:
                subword = BPE(bpe_codes_prot, merges=-1, separator='')
    else:
        seq_tokenizer = None

    if args.has_struct_encoder:
        struct_tokenizer = tokenizer_class.from_pretrained(
            os.path.join(model_dir, "struct"),
            do_lower_case=args.do_lower_case
        )
    else:
        struct_tokenizer = None

    model = model_class.from_pretrained(model_dir, args=args)

    model.to(device)
    model.eval()

    # load labels
    label_filepath = args.label_filepath
    label_id_2_name = {}
    label_name_2_id = {}
    with open(label_filepath, "r") as fp:
        for line in fp:
            # skip the header row
            if line.strip() == "label":
                continue
            label_name = line.strip()
            label_id_2_name[len(label_id_2_name)] = label_name
            label_name_2_id[label_name] = len(label_name_2_id)

    print("-" * 25 + "label_id_2_name:" + "-" * 25)
    if len(label_id_2_name) < 20:
        print(label_id_2_name)
    print("label size: ", len(label_id_2_name))
    print("-" * 60)

    return config, subword, seq_tokenizer, struct_tokenizer, model, label_id_2_name, label_name_2_id


def transform_sample_2_feature(
        args,
        row,
        seq_tokenizer,
        subword,
        struct_tokenizer,
        pad_on_left=False,
        pad_token=0,
        pad_token_segment_id=0,
        mask_padding_with_zero=True
):
    '''
    transform one sample into model input features
    :param args: runtime arguments (seq_max_length, struct_max_length, trunc_type, embedding_type, ...)
    :param row: [protein_id, seq]
    :param seq_tokenizer: tokenizer for the sequence encoder (None if unused)
    :param subword: BPE subword model (None for character-level tokenization)
    :param struct_tokenizer: tokenizer for the structure encoder (None if unused)
    :param pad_on_left: pad on the left instead of the right
    :param pad_token: id used to pad input_ids
    :param pad_token_segment_id: id used to pad token_type_ids
    :param mask_padding_with_zero: use 1 for real tokens and 0 for padding in the attention mask
    :return:
    '''
    features = []
    batch_info = []
    # id, seq
    prot_id, protein_seq = row[0], row[1]
    batch_info.append(row)
    assert seq_tokenizer is not None or struct_tokenizer is not None or args.embedding_type is not None
    if seq_tokenizer:
        if subword:
            # BPE subword tokenization of the protein sequence
            seq_to_list = subword.process_line(protein_seq).split(" ")
        else:
            # character-level tokenization: one token per residue
            seq_to_list = list(protein_seq)
        cur_seq_len = len(seq_to_list)
        # reserve 2 positions for the special tokens ([CLS] and [SEP]) added by encode_plus
        if cur_seq_len > args.seq_max_length - 2:
            if args.trunc_type == "left":
                seq_to_list = seq_to_list[2 - args.seq_max_length:]
            else:
                seq_to_list = seq_to_list[:args.seq_max_length - 2]
        seq = " ".join(seq_to_list)
        inputs = seq_tokenizer.encode_plus(
            seq,
            None,
            add_special_tokens=True,
            max_length=args.seq_max_length,
            truncation=True
        )
        # input_ids: token index list
        # token_type_ids: token type index list
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        real_token_len = len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = args.seq_max_length - len(input_ids)
        attention_mask_padding_length = padding_length

        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * attention_mask_padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * attention_mask_padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == args.seq_max_length, "Error with input length {} vs {}".format(len(input_ids), args.seq_max_length)
        assert len(attention_mask) == args.seq_max_length, "Error with input length {} vs {}".format(len(attention_mask), args.seq_max_length)
        assert len(token_type_ids) == args.seq_max_length, "Error with input length {} vs {}".format(len(token_type_ids), args.seq_max_length)
    else:
        input_ids = None
        attention_mask = None
        token_type_ids = None
        real_token_len = None
    if struct_tokenizer:
        # for structure: character-level tokens, no special tokens, truncated to struct_max_length
        cur_seq_len = len(protein_seq)
        seq_list = list(protein_seq)
        if cur_seq_len > args.struct_max_length:
            if args.trunc_type == "left":
                seq_list = seq_list[-args.struct_max_length:]
            else:
                seq_list = seq_list[:args.struct_max_length]
        seq = " ".join(seq_list)
        inputs = struct_tokenizer.encode_plus(
            seq,
            None,
            add_special_tokens=False,
            max_length=args.struct_max_length,
            truncation=True,
            return_token_type_ids=False,
        )
        struct_input_ids = inputs["input_ids"]
        real_struct_node_size = len(struct_input_ids)
        padding_length = max(args.struct_max_length - real_struct_node_size, 0)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



