def _convert_ogb2shadow_node()

in para_graph_sampler/graph_engine/frontend/data_converter.py [0:0]
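
Module-level names assumed by the function below (the excerpt omits the file's own imports); get_adj_dtype and the TRAIN / VALID / TEST split-key constants are defined elsewhere in data_converter.py:

import os
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm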


def _convert_ogb2shadow_node(data_meta, dir_shadow: str, dir_ogb: str) -> None:
    """
    For node classification tasks: convert from OGB format to shaDow-GNN format
    """
    from ogb.nodeproppred import PygNodePropPredDataset
    print(f"Preparing shaDow-GNN 'node' dataset from OGB format")
    dir_ogb_parent = '/'.join(dir_ogb.split('/')[:-1])
    os.makedirs(dir_ogb_parent, exist_ok=True)
    dataset = PygNodePropPredDataset(data_meta.original_name, root=dir_ogb_parent)
    split_idx = dataset.get_idx_split()
    graph = dataset[0]
    num_node = graph.y.shape[0]
    num_edge = graph.edge_index.shape[1]
    # feat_full.npy
    np.save(dir_shadow.format('feat_full.npy'), graph.x.numpy().astype(np.float32, copy=False))
    graph.x = None          # done with x; drop the reference so the feature tensor can be freed
    # label_full.npy       NOTE: only valid for single-label classification; multi-label tasks cannot use a 1D label array
    y_non_nan = graph.y[graph.y == graph.y]    # NaN != NaN, so this keeps only nodes with a valid label
    assert y_non_nan.min().item() == 0
    if y_non_nan.max().item() < 2**8:
        dtype_l = np.uint8
    elif y_non_nan.max().item() < 2**16:
        dtype_l = np.uint16
    elif y_non_nan.max().item() < 2**32:   # Almost impossible to have so many classes
        dtype_l = np.uint32
    else:
        dtype_l = np.int64
    # ensure that no train / valid / test node has a NaN label
    for v in split_idx.values():
        assert not graph.y[v].isnan().any().item()
    np.save(dir_shadow.format('label_full.npy'), graph.y.numpy().flatten().astype(dtype_l, copy=False))
    # adj_full_raw.npz
    row_full, col_full = graph.edge_index.numpy()
    adj_full = sp.coo_matrix(
        (
            np.broadcast_to(np.ones(1, dtype=bool), row_full.size),
            (row_full, col_full),
        ),
        shape=(num_node, num_node)
    ).tocsr()
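    # get_adj_dtype (module helper, not shown here) picks an integer dtype for the
    # index arrays; downcasting indptr / indices shrinks the saved .npz file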
    dtype = get_adj_dtype(adj=adj_full)
    adj_full.indptr = adj_full.indptr.astype(dtype, copy=False)
    adj_full.indices = adj_full.indices.astype(dtype, copy=False)
    sp.save_npz(dir_shadow.format('adj_full_raw.npz'), adj_full)
    adj_full = graph = None
    # adj_train_raw.npz
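    # keep only the edges whose two endpoints are both training nodes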
    idx_train_set = set(split_idx['train'].numpy().tolist())
    row_train, col_train = [], []
    print("Converting adj into the shaDow format")
    for i in tqdm(range(row_full.size)):
        if row_full[i] in idx_train_set and col_full[i] in idx_train_set:
            row_train.append(row_full[i])
            col_train.append(col_full[i])
    adj_train = sp.coo_matrix(
        (
            np.broadcast_to(np.ones(1, dtype=bool), len(row_train)),
            (np.asarray(row_train), np.asarray(col_train)),
        ),
        shape=(num_node, num_node)
    ).tocsr()
    row_train = col_train = None
    adj_train.indptr = adj_train.indptr.astype(dtype, copy=False)
    adj_train.indices = adj_train.indices.astype(dtype, copy=False)
    sp.save_npz(dir_shadow.format('adj_train_raw.npz'), adj_train)
    adj_train = None
    # split.npy (need to do as the last step, since dtype should be determined by adj_full)
    np.save(
        dir_shadow.format('split.npy'), 
        {
            TRAIN: split_idx['train'].numpy().astype(dtype, copy=False),
            VALID: split_idx['valid'].numpy().astype(dtype, copy=False),
            TEST : split_idx['test'].numpy().astype(dtype, copy=False)
        }
    )
    print(f"Successfully saved shaDow-GNN dataset into {'/'.join(dir_shadow.split('/')[:-1])}")