in para_graph_sampler/graph_engine/frontend/data_converter.py [0:0]
def _save_adj_as_csr(path: str, values, rows, cols, num_node: int):
    """Build a CSR adjacency from COO triplets, save it to `path`, return its index dtype."""
    adj = sp.coo_matrix((values, (rows, cols)), shape=(num_node, num_node)).tocsr()
    # Cast the index arrays to the dtype picked by get_adj_dtype before writing to disk.
    dtype = get_adj_dtype(adj=adj)
    adj.indptr = adj.indptr.astype(dtype, copy=False)
    adj.indices = adj.indices.astype(dtype, copy=False)
    sp.save_npz(path, adj)
    return dtype


def _save_link_split(path: str, split_edge, dtype) -> None:
    """Save the positive / negative edge samples of the train, valid and test splits to `path`."""

    def _edges(key_split: str, key_edge: str):
        return split_edge[key_split][key_edge].numpy().astype(dtype, copy=False)

    # The payload is a dict, so np.save stores it as a pickled object array.
    np.save(
        path,
        {
            TRAIN: {'pos': _edges('train', 'edge')},
            VALID: {'pos': _edges('valid', 'edge'), 'neg': _edges('valid', 'edge_neg')},
            TEST: {'pos': _edges('test', 'edge'), 'neg': _edges('test', 'edge_neg')},
        },
    )


def _convert_ogb2shadow_link(data_meta, dir_shadow: str, dir_ogb: str) -> None:
    """
    For link prediction tasks: convert from OGB format to shaDow-GNN format.

    Files written (each path is obtained via `dir_shadow.format(<file name>)`):
        feat_full.npy               node features, stored as float32
        adj_full_raw.npz            CSR adjacency of the graph returned by OGB
        adj_full_raw_with_val.npz   ('collab' only) the same adjacency, with the
                                    validation edges added as undirected, weight-1 edges
        split.npy                   dict of positive / negative edges per split

    Args:
        data_meta: object exposing `name` (selects the conversion branch; 'collab'
            and 'ppa' are supported) and `original_name` (dataset name handed to OGB).
        dir_shadow: format template with one placeholder for the output file name.
        dir_ogb: path whose parent directory is used as the OGB download root. The
            parent is everything before the last '/'; it is created when missing.

    Raises:
        NotImplementedError: if `data_meta.name` is neither 'collab' nor 'ppa'. This
            is raised only after the dataset was loaded and feat_full.npy was written.
    """
    name_data = data_meta.name
    # Imported lazily so that ogb is only required when a link dataset is converted.
    from ogb.linkproppred import PygLinkPropPredDataset
    print("Preparing shaDow-GNN 'link' dataset from OGB format")
    dir_ogb_parent = '/'.join(dir_ogb.split('/')[:-1])
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(dir_ogb_parent, exist_ok=True)
    dataset = PygLinkPropPredDataset(data_meta.original_name, root=dir_ogb_parent)
    split_edge = dataset.get_edge_split()
    graph = dataset[0]
    num_node = graph.x.shape[0]
    assert num_node == graph.num_nodes
    # feat_full.npy
    np.save(dir_shadow.format('feat_full.npy'), graph.x.numpy().astype(np.float32, copy=False))
    graph.x = None  # drop the reference to the features once they are on disk
    if name_data == 'collab':
        # split_edge:
        #   train   edge/weight/year
        #   valid   edge/weight/year/edge_neg
        #   test    edge/weight/year/edge_neg
        # where edge & edge_neg are 2D: m x 2; and weight & year are 1D: m.
        # The leading dims of edge & weight & year are the same, and
        #   split_edge['train']['edge'].shape[0] + split_edge['valid']['edge'].shape[0]
        #   + split_edge['test']['edge'].shape[0]
        # matches the edge number in the ogb paper.
        assert graph.edge_index.shape[1] == graph.edge_weight.shape[0] == graph.edge_year.shape[0]
        # adj -- prepare two versions:
        #   vanilla setting:     adj_full only contains edges in the training set
        #   alternative setting: adj_full contains validation edges as well
        #                        (i.e., use_valedges_as_input)
        # The vanilla version is coalesced first, since the raw graph can contain
        # multiple edges between the same pair of nodes.
        from torch_sparse import coalesce
        edge_index, edge_weight = coalesce(graph.edge_index, graph.edge_weight, num_node, num_node)
        row_full, col_full = edge_index.numpy()
        _save_adj_as_csr(
            dir_shadow.format('adj_full_raw.npz'),
            edge_weight.numpy().flatten(),
            row_full,
            col_full,
            num_node,
        )
        # valedge as input: append the undirected validation edges, each with weight 1,
        # to the raw (non-coalesced) graph edges.
        valedges_und = to_undirected(split_edge['valid']['edge'].t()).numpy()
        row_train_val, col_train_val = np.concatenate([graph.edge_index.numpy(), valedges_und], axis=1)
        edge_weight_train_val = np.concatenate(
            [graph.edge_weight.numpy().flatten(), np.ones(valedges_und.shape[1])]
        )
        # split.npy uses the index dtype of this second (train + valid) adjacency.
        dtype = _save_adj_as_csr(
            dir_shadow.format('adj_full_raw_with_val.npz'),
            edge_weight_train_val,
            row_train_val,
            col_train_val,
            num_node,
        )
    elif name_data == 'ppa':
        row, col = graph.edge_index
        # Every edge gets weight 1.
        dtype = _save_adj_as_csr(
            dir_shadow.format('adj_full_raw.npz'),
            np.ones(graph.num_edges),
            row.numpy(),
            col.numpy(),
            num_node,
        )
    else:
        raise NotImplementedError(
            f"Link-task conversion is not implemented for dataset '{name_data}'"
        )
    graph = None  # drop the reference to the graph before the splits are materialised
    # skip adj_train for link task --> currently don't consider inductive link prediction
    # split.npy --> positive and negative sample of edges
    _save_link_split(dir_shadow.format('split.npy'), split_edge, dtype)
    print(f"Successfully saved shaDow-GNN dataset into {'/'.join(dir_shadow.split('/')[:-1])}")