in src/preprocess_datasets.py [0:0]
def prepare_dataset_ogb_wikikg2(name):
    """ogbl-wikikg2 is a OGB link property prediction dataset.
    Note that the evaluation protocol is different from conventional KBC datasets.
    training input: (h,r,t)
    valid/test input: (h, r, t, h_neg, t_neg), including 500 negatives respectively for h and t.

    Side effects: writes <DATA_PATH>/<name>/{train,valid,test}.npy (uint64 arrays)
    and <DATA_PATH>/<name>/meta_info.pickle (dict with the provided negative counts).

    :param name: OGB dataset name passed straight to LinkPropPredDataset
        (expected to be 'ogbl-wikikg2').
    """
    dataset = LinkPropPredDataset(name)
    split_edge = dataset.get_edge_split()
    train_triples = split_edge["train"]
    valid_triples = split_edge["valid"]
    test_triples = split_edge["test"]

    # Counts are inferred from the training split only.
    # NOTE(review): valid/test triples (and their negatives) could in principle
    # reference entity ids >= nentity if they never occur in train — confirm
    # against the OGB dataset guarantees before relying on these counts.
    nrelation = int(max(train_triples['relation'])) + 1
    nentity = int(max(np.concatenate((train_triples['head'],
                                      train_triples['tail'])))) + 1
    print(nentity, nrelation)

    # train rows: (h, r, t)
    train_array = np.concatenate((train_triples['head'].reshape(-1, 1),
                                  train_triples['relation'].reshape(-1, 1),
                                  train_triples['tail'].reshape(-1, 1),
                                  ), axis=1)
    # valid/test rows: (h, r, t, h_neg..., t_neg...) — negatives are kept
    # alongside the positive triple so evaluation can use the provided
    # candidate sets instead of ranking against all entities.
    valid_array = np.concatenate((valid_triples['head'].reshape(-1, 1),
                                  valid_triples['relation'].reshape(-1, 1),
                                  valid_triples['tail'].reshape(-1, 1),
                                  valid_triples['head_neg'],
                                  valid_triples['tail_neg'],
                                  ), axis=1)
    test_array = np.concatenate((test_triples['head'].reshape(-1, 1),
                                 test_triples['relation'].reshape(-1, 1),
                                 test_triples['tail'].reshape(-1, 1),
                                 test_triples['head_neg'],
                                 test_triples['tail_neg'],
                                 ), axis=1)

    print('Saving arrays ...')
    out_dir = DATA_PATH / name
    out_dir.mkdir(parents=True, exist_ok=True)
    # using npy since it is too big for pickling
    for split, array in (('train', train_array),
                         ('valid', valid_array),
                         ('test', test_array)):
        with open(out_dir / (split + '.npy'), 'wb') as out:
            np.save(out, array.astype('uint64'))

    print('Saving meta_info ...')
    meta_info = {'n_provided_neg_head': valid_triples['head_neg'].shape[1],
                 'n_provided_neg_tail': valid_triples['tail_neg'].shape[1],
                 }
    # Fix: the original opened this file without a context manager and would
    # leak the handle if pickle.dump raised.
    with open(out_dir / ('meta_info' + '.pickle'), 'wb') as out:
        pickle.dump(meta_info, out)
    print('Done processing!')