in src/preprocess_datasets.py
import pickle
from pathlib import Path

import numpy as np
from ogb.linkproppred import LinkPropPredDataset

# DATA_PATH (a pathlib.Path pointing at the data directory) is assumed to be
# defined elsewhere in this module.


def prepare_dataset_ogb_biokg(name):
    """ogbl-biokg is an OGB link property prediction dataset.

    Note that its input format and evaluation protocol differ from conventional
    KBC datasets:
    - training input: (h, r, t, h_type, t_type), where the last two entries are
      entity-type ids;
    - valid/test input: (h, r, t, h_neg, t_neg, h_type, t_type), including 500
      negatives each for h and t.
    """
    dataset = LinkPropPredDataset(name)
    split_edge = dataset.get_edge_split()
    train_triples, valid_triples, test_triples = split_edge["train"], split_edge["valid"], split_edge["test"]

    # Map each entity type to an integer type id and to a contiguous range of
    # global entity ids, so per-type local ids can be shifted into one id space.
    cur_idx, cur_type_idx, type_dict, entity_dict = 0, 0, {}, {}
    for key in dataset[0]['num_nodes_dict']:
        type_dict[key] = cur_type_idx
        cur_type_idx += 1
        entity_dict[key] = (cur_idx, cur_idx + dataset[0]['num_nodes_dict'][key])
        cur_idx += dataset[0]['num_nodes_dict'][key]
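    # Illustrative example (counts as reported by OGB; subject to change between
    # dataset versions): with num_nodes_dict = {'disease': 10687, 'drug': 10533, ...}
    # iterated in that order, type_dict = {'disease': 0, 'drug': 1, ...} and
    # entity_dict = {'disease': (0, 10687), 'drug': (10687, 21220), ...}, i.e.
    # half-open [first, one-past-last) ranges in the global id space.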
    def index_triples_across_type(triples, entity_dict, type_dict):
        """Shift local (per-type) entity ids into the global id space and
        record the head/tail type ids alongside each triple."""
        triples['head_type_idx'] = np.zeros_like(triples['head'])
        triples['tail_type_idx'] = np.zeros_like(triples['tail'])
        for i in range(len(triples['head'])):
            h_type = triples['head_type'][i]
            triples['head_type_idx'][i] = type_dict[h_type]
            triples['head'][i] += entity_dict[h_type][0]
            if 'head_neg' in triples:  # negatives share the positive's type
                triples['head_neg'][i] += entity_dict[h_type][0]
            t_type = triples['tail_type'][i]
            triples['tail_type_idx'][i] = type_dict[t_type]
            triples['tail'][i] += entity_dict[t_type][0]
            if 'tail_neg' in triples:
                triples['tail_neg'][i] += entity_dict[t_type][0]
        return triples
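    # For instance, a head of type 'protein' with local id 7 becomes global id
    # entity_dict['protein'][0] + 7; each row's ~500 negatives are shifted in one
    # vectorized numpy add. The per-triple Python loop could itself be vectorized
    # with per-type boolean masks, but is kept simple here.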
    print('Indexing triples across different entity types ...')
    train_triples = index_triples_across_type(train_triples, entity_dict, type_dict)
    valid_triples = index_triples_across_type(valid_triples, entity_dict, type_dict)
    test_triples = index_triples_across_type(test_triples, entity_dict, type_dict)

    nrelation = int(max(train_triples['relation'])) + 1
    nentity = sum(dataset[0]['num_nodes_dict'].values())
    assert train_triples['head'].max() < nentity  # global ids are 0-based
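    # For the current ogbl-biokg release this yields nrelation = 51 and
    # nentity = 93,773 (summed over the five entity types).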
    train_array = np.concatenate((train_triples['head'].reshape(-1, 1),
                                  train_triples['relation'].reshape(-1, 1),
                                  train_triples['tail'].reshape(-1, 1),
                                  train_triples['head_type_idx'].reshape(-1, 1),
                                  train_triples['tail_type_idx'].reshape(-1, 1),
                                  ), axis=1)
    valid_array = np.concatenate((valid_triples['head'].reshape(-1, 1),
                                  valid_triples['relation'].reshape(-1, 1),
                                  valid_triples['tail'].reshape(-1, 1),
                                  valid_triples['head_neg'],
                                  valid_triples['tail_neg'],
                                  valid_triples['head_type_idx'].reshape(-1, 1),
                                  valid_triples['tail_type_idx'].reshape(-1, 1),
                                  ), axis=1)
    test_array = np.concatenate((test_triples['head'].reshape(-1, 1),
                                 test_triples['relation'].reshape(-1, 1),
                                 test_triples['tail'].reshape(-1, 1),
                                 test_triples['head_neg'],
                                 test_triples['tail_neg'],
                                 test_triples['head_type_idx'].reshape(-1, 1),
                                 test_triples['tail_type_idx'].reshape(-1, 1),
                                 ), axis=1)
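    # Resulting row layouts:
    #   train:      (h, r, t, h_type, t_type)                     -> 5 columns
    #   valid/test: (h, r, t, h_neg..., t_neg..., h_type, t_type)
    #               -> 3 + 500 + 500 + 2 = 1005 columns with the provided negatives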
    print('Saving arrays ...')
    out_dir = Path(DATA_PATH) / name
    out_dir.mkdir(parents=True, exist_ok=True)
    # uint64 is safe: every stored value (ids, type ids, negatives) is non-negative.
    with open(out_dir / 'train.pickle', 'wb') as out:
        pickle.dump(train_array.astype('uint64'), out)
    with open(out_dir / 'valid.pickle', 'wb') as out:
        pickle.dump(valid_array.astype('uint64'), out)
    with open(out_dir / 'test.pickle', 'wb') as out:
        pickle.dump(test_array.astype('uint64'), out)
    print('Saving meta_info ...')
    meta_info = {'n_provided_neg_head': valid_triples['head_neg'].shape[1],
                 'n_provided_neg_tail': valid_triples['tail_neg'].shape[1],
                 'type_dict': type_dict,      # type name -> type id
                 'entity_dict': entity_dict,  # type name -> (first global id, one past the last)
                 }
    with open(out_dir / 'meta_info.pickle', 'wb') as out:
        pickle.dump(meta_info, out)
    print('Done processing!')
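

# Minimal usage sketch (assumes the `ogb` package is installed and that DATA_PATH
# is the module-level data directory referenced above):
if __name__ == '__main__':
    prepare_dataset_ogb_biokg('ogbl-biokg')
    # The processed splits reload as plain numpy arrays:
    with open(Path(DATA_PATH) / 'ogbl-biokg' / 'train.pickle', 'rb') as f:
        train = pickle.load(f)
    print(train.shape)  # (n_train_triples, 5)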