# def prepare_dataset_ogb_biokg()
# in src/preprocess_datasets.py [0:0]

def prepare_dataset_ogb_biokg(name):
    """Preprocess the ogbl-biokg OGB link-property-prediction dataset.

    Note that the input formats and evaluation protocols differ from
    conventional KBC datasets:

    - training input: (h, r, t, h_type, t_type); the last 2 columns are
      entity-type indices.
    - valid/test input: (h, r, t, h_neg..., t_neg..., h_type, t_type),
      including 500 provided negatives each for head and tail.

    Entities of every type are re-indexed into a single contiguous global id
    space, and the resulting train/valid/test arrays plus a ``meta_info``
    dict are pickled under ``DATA_PATH / name``.

    Args:
        name: OGB dataset name (e.g. ``'ogbl-biokg'``), also used as the
            output directory name.
    """
    dataset = LinkPropPredDataset(name)
    split_edge = dataset.get_edge_split()
    train_triples = split_edge["train"]
    valid_triples = split_edge["valid"]
    test_triples = split_edge["test"]

    # Assign each entity type an index and a contiguous [start, end) slice of
    # the global entity-id space (offsets accumulate across types).
    num_nodes_dict = dataset[0]['num_nodes_dict']
    cur_idx, type_dict, entity_dict = 0, {}, {}
    for cur_type_idx, key in enumerate(num_nodes_dict):
        type_dict[key] = cur_type_idx
        entity_dict[key] = (cur_idx, cur_idx + num_nodes_dict[key])
        cur_idx += num_nodes_dict[key]

    def index_triples_across_type(triples, entity_dict, type_dict):
        # Shift per-type local entity ids (head/tail and any provided
        # negatives) by the type's global offset, and record type indices.
        # NOTE: mutates `triples` in place and also returns it.
        triples['head_type_idx'] = np.zeros_like(triples['head'])
        triples['tail_type_idx'] = np.zeros_like(triples['tail'])
        for i in range(len(triples['head'])):
            h_type = triples['head_type'][i]
            triples['head_type_idx'][i] = type_dict[h_type]
            triples['head'][i] += entity_dict[h_type][0]
            if 'head_neg' in triples:
                triples['head_neg'][i] += entity_dict[h_type][0]
            t_type = triples['tail_type'][i]
            triples['tail_type_idx'][i] = type_dict[t_type]
            triples['tail'][i] += entity_dict[t_type][0]
            if 'tail_neg' in triples:
                triples['tail_neg'][i] += entity_dict[t_type][0]
        return triples

    print('Indexing triples across different entity types ...')
    train_triples = index_triples_across_type(train_triples, entity_dict, type_dict)
    valid_triples = index_triples_across_type(valid_triples, entity_dict, type_dict)
    test_triples = index_triples_across_type(test_triples, entity_dict, type_dict)

    nentity = sum(num_nodes_dict.values())
    # Ids are 0-based after re-indexing, so the max id must be strictly below
    # nentity (the original `<=` check missed an off-by-one overflow).
    assert train_triples['head'].max() < nentity

    def pack_eval_split(triples):
        # Column layout: h | r | t | h_neg (500 cols) | t_neg (500 cols)
        #                | h_type | t_type
        return np.concatenate((triples['head'].reshape(-1, 1),
                               triples['relation'].reshape(-1, 1),
                               triples['tail'].reshape(-1, 1),
                               triples['head_neg'],
                               triples['tail_neg'],
                               triples['head_type_idx'].reshape(-1, 1),
                               triples['tail_type_idx'].reshape(-1, 1),
                               ), axis=1)

    # Column layout: h | r | t | h_type | t_type (no negatives for train).
    train_array = np.concatenate((train_triples['head'].reshape(-1, 1),
                                  train_triples['relation'].reshape(-1, 1),
                                  train_triples['tail'].reshape(-1, 1),
                                  train_triples['head_type_idx'].reshape(-1, 1),
                                  train_triples['tail_type_idx'].reshape(-1, 1),
                                  ), axis=1)
    valid_array = pack_eval_split(valid_triples)
    test_array = pack_eval_split(test_triples)

    print('Saving arrays ...')
    out_dir = Path(DATA_PATH) / name
    out_dir.mkdir(parents=True, exist_ok=True)
    for split_name, array in (('train', train_array),
                              ('valid', valid_array),
                              ('test', test_array)):
        with open(out_dir / (split_name + '.pickle'), 'wb') as out:
            # uint64 keeps all ids non-negative and compact on disk.
            pickle.dump(array.astype('uint64'), out)

    print('Saving meta_info ...')
    meta_info = {'n_provided_neg_head': valid_triples['head_neg'].shape[1],
                 'n_provided_neg_tail': valid_triples['tail_neg'].shape[1],
                 'type_dict': type_dict,      # type name -> type idx
                 'entity_dict': entity_dict,  # type name -> (first id, last id + 1)
                 }
    # Use a context manager so the file is closed even if pickling fails
    # (the original used a bare open()/close() pair here).
    with open(out_dir / ('meta_info' + '.pickle'), 'wb') as out:
        pickle.dump(meta_info, out)
    print('Done processing!')