in para_graph_sampler/graph_engine/frontend/data_converter.py [0:0]
def _convert_ogb2shadow_node(data_meta, dir_shadow: str, dir_ogb: str) -> None:
    """
    For node classification tasks: convert from OGB format to shaDow-GNN format.

    Args:
        data_meta:  metadata object; only `original_name` (the OGB dataset name)
                    is read here.
        dir_shadow: output path template containing a `{}` placeholder for the
                    individual file names (e.g. 'feat_full.npy').
        dir_ogb:    path where the raw OGB dataset lives / will be downloaded.

    Writes feat_full.npy, label_full.npy, adj_full_raw.npz, adj_train_raw.npz
    and split.npy under the shaDow directory. Returns None.
    """
    from ogb.nodeproppred import PygNodePropPredDataset
    print(f"Preparing shaDow-GNN 'node' dataset from OGB format")
    dir_ogb_parent = '/'.join(dir_ogb.split('/')[:-1])
    if not os.path.exists(dir_ogb_parent):
        os.makedirs(dir_ogb_parent)
    dataset = PygNodePropPredDataset(data_meta.original_name, root=dir_ogb_parent)
    split_idx = dataset.get_idx_split()
    graph = dataset[0]
    num_node = graph.y.shape[0]
    num_edge = graph.edge_index.shape[1]
    # feat_full.npy
    np.save(dir_shadow.format('feat_full.npy'), graph.x.numpy().astype(np.float32, copy=False))
    graph.x = None      # done with x, so dereference the pointer to save some memory
    # label_full.npy NOTE only for single class classification. Otherwise, cannot use 1D label arr
    # y == y filters out NaN entries (NaN != NaN), so dtype is picked from real labels only.
    y_non_nan = graph.y[graph.y == graph.y]
    assert y_non_nan.min().item() == 0
    # Pick the narrowest unsigned dtype that can hold the largest class id.
    if y_non_nan.max().item() < 2**8:
        dtype_l = np.uint8
    elif y_non_nan.max().item() < 2**16:
        dtype_l = np.uint16
    elif y_non_nan.max().item() < 2**32:        # Almost impossible to have so many classes
        dtype_l = np.uint32
    else:
        dtype_l = np.int64
    # assert all train / valid / test nodes are not nan
    for _k, v in split_idx.items():
        assert not graph.y[v].isnan().any().item()
    np.save(dir_shadow.format('label_full.npy'), graph.y.numpy().flatten().astype(dtype_l, copy=False))
    # adj_full_raw.npz
    row_full, col_full = graph.edge_index.numpy()
    # NOTE: `bool` (not the removed `np.bool` alias) keeps this working on NumPy >= 1.24.
    adj_full = sp.coo_matrix(
        (
            np.broadcast_to(np.ones(1, dtype=bool), row_full.size),
            (row_full, col_full),
        ),
        shape=(num_node, num_node)
    ).tocsr()
    # index dtype is chosen from the full adj and reused for the train adj and the split arrays
    dtype = get_adj_dtype(adj=adj_full)
    adj_full.indptr = adj_full.indptr.astype(dtype, copy=False)
    adj_full.indices = adj_full.indices.astype(dtype, copy=False)
    sp.save_npz(dir_shadow.format('adj_full_raw.npz'), adj_full)
    adj_full = graph = None
    # adj_train_raw.npz: keep only edges whose BOTH endpoints are training nodes.
    # Vectorized boolean-mask filter replaces the original per-edge Python loop
    # (identical result, O(E) NumPy work instead of O(E) Python iterations).
    print("Converting adj into the shaDow format")
    is_train = np.zeros(num_node, dtype=bool)
    is_train[split_idx['train'].numpy()] = True
    edge_keep = is_train[row_full] & is_train[col_full]
    row_train = row_full[edge_keep]
    col_train = col_full[edge_keep]
    adj_train = sp.coo_matrix(
        (
            np.broadcast_to(np.ones(1, dtype=bool), row_train.size),
            (row_train, col_train),
        ),
        shape=(num_node, num_node)
    ).tocsr()
    row_train = col_train = None
    adj_train.indptr = adj_train.indptr.astype(dtype, copy=False)
    adj_train.indices = adj_train.indices.astype(dtype, copy=False)
    sp.save_npz(dir_shadow.format('adj_train_raw.npz'), adj_train)
    adj_train = None
    # split.npy (need to do as the last step, since dtype should be determined by adj_full)
    np.save(
        dir_shadow.format('split.npy'),
        {
            TRAIN: split_idx['train'].numpy().astype(dtype, copy=False),
            VALID: split_idx['valid'].numpy().astype(dtype, copy=False),
            TEST : split_idx['test'].numpy().astype(dtype, copy=False)
        }
    )
    print(f"Successfully saved shaDow-GNN dataset into {'/'.join(dir_shadow.split('/')[:-1])}")