def _convert_ogb2shadow_link()

in para_graph_sampler/graph_engine/frontend/data_converter.py [0:0]


def _save_adj_csr(path_npz, weights, rows, cols, num_node):
    """
    Build a (num_node x num_node) CSR adjacency from COO triplets and save it as .npz.

    The `indptr` and `indices` arrays are cast to the dtype returned by
    `get_adj_dtype` before saving. That dtype is returned so the caller can
    reuse it for the edge-split arrays.
    """
    adj = sp.coo_matrix((weights, (rows, cols)), shape=(num_node, num_node)).tocsr()
    dtype = get_adj_dtype(adj=adj)
    adj.indptr = adj.indptr.astype(dtype, copy=False)
    adj.indices = adj.indices.astype(dtype, copy=False)
    sp.save_npz(path_npz, adj)
    return dtype


def _save_link_split(path_npy, split_edge, dtype):
    """
    Save the positive / negative edge samples of each split into a single .npy file.

    `split_edge` is the dict returned by the OGB dataset's `get_edge_split()`.
    Only the training split has no negative samples stored.
    """
    def _edges(split, key):
        return split_edge[split][key].numpy().astype(dtype, copy=False)

    np.save(
        path_npy,
        {
            TRAIN: {'pos': _edges('train', 'edge')},
            VALID: {'pos': _edges('valid', 'edge'), 'neg': _edges('valid', 'edge_neg')},
            TEST: {'pos': _edges('test', 'edge'), 'neg': _edges('test', 'edge_neg')},
        },
    )


def _convert_ogb2shadow_link(data_meta, dir_shadow: str, dir_ogb: str) -> None:
    """
    For link prediction tasks: convert from OGB format to shaDow-GNN format

    Args:
        data_meta: object exposing `name` (short dataset name, 'collab' or 'ppa')
            and `original_name` (the name handed to `PygLinkPropPredDataset`).
        dir_shadow: format template for output paths; `dir_shadow.format(fname)`
            must yield the full path of output file `fname`.
        dir_ogb: path of the OGB dataset directory; its parent directory is used
            as the OGB `root` and is created if missing.

    Files written (via `dir_shadow`):
        feat_full.npy               node features as float32
        adj_full_raw.npz            CSR adjacency of the OGB graph
        adj_full_raw_with_val.npz   ('collab' only) adjacency plus validation edges
        split.npy                   positive / negative edges per split

    Raises:
        NotImplementedError: if `data_meta.name` is not 'collab' or 'ppa'.
            Raised before anything is downloaded or written.
    """
    name_data = data_meta.name
    # Fail fast: reject unsupported datasets before downloading or writing anything.
    if name_data not in ('collab', 'ppa'):
        raise NotImplementedError(
            f"Link-task conversion is not implemented for dataset '{name_data}'"
        )
    from ogb.linkproppred import PygLinkPropPredDataset
    print("Preparing shaDow-GNN 'link' dataset from OGB format")
    # `or '.'`: a bare directory name has an empty parent, which makedirs rejects.
    dir_ogb_parent = os.path.dirname(dir_ogb) or '.'
    os.makedirs(dir_ogb_parent, exist_ok=True)
    dataset = PygLinkPropPredDataset(data_meta.original_name, root=dir_ogb_parent)
    split_edge = dataset.get_edge_split()
    graph = dataset[0]
    num_node = graph.x.shape[0]
    assert num_node == graph.num_nodes
    # feat_full.npy
    np.save(dir_shadow.format('feat_full.npy'), graph.x.numpy().astype(np.float32, copy=False))
    graph.x = None      # features are on disk; drop the reference
    if name_data == 'collab':
        # split_edge:
        #     train   edge/weight/year
        #     valid   edge/weight/year/edge_neg
        #     test    edge/weight/year/edge_neg
        # where edge & edge_neg are 2D: m x 2; and weight & year are 1D: m
        #     leading dim of edge & weight & year are the same
        # split_edge['train']['edge'].shape[0] + split_edge['valid']['edge'].shape[0]
        #     + split_edge['test']['edge'].shape[0] matches the edge number in ogb paper
        assert graph.edge_index.shape[1] == graph.edge_weight.shape[0] == graph.edge_year.shape[0]
        # adj -- prepare two versions
        #     in the vanilla setting, adj_full only contains edges in the training set
        #     in the alternative setting, adj_full contains validation edges as well
        #     (i.e., use_valedges_as_input)
        # By default we perform coalescing and store in adj_full_raw.
        #     without coalescing, there will be multiple edges between nodes, and thus csr is invalid
        from torch_sparse import coalesce
        edge_index, edge_weight = coalesce(graph.edge_index, graph.edge_weight, num_node, num_node)
        row_full, col_full = edge_index.numpy()
        _save_adj_csr(
            dir_shadow.format('adj_full_raw.npz'),
            edge_weight.numpy().flatten(),
            row_full,
            col_full,
            num_node,
        )
        # valedge as input: validation edges (made undirected) get weight 1.
        # NOTE(review): this variant concatenates the raw (non-coalesced) edge list;
        # duplicates are presumably merged by scipy's coo -> csr conversion -- confirm.
        valedges_und = to_undirected(split_edge['valid']['edge'].t()).numpy()
        row_train_val, col_train_val = np.concatenate(
            [graph.edge_index.numpy(), valedges_und], axis=1
        )
        edge_weight_train_val = np.concatenate(
            [graph.edge_weight.numpy().flatten(), np.ones(valedges_und.shape[1])]
        )
        # The dtype of this second adjacency is the one used for split.npy below.
        dtype = _save_adj_csr(
            dir_shadow.format('adj_full_raw_with_val.npz'),
            edge_weight_train_val,
            row_train_val,
            col_train_val,
            num_node,
        )
        graph = None    # no longer needed; only split_edge is used from here on
        # skip adj_train for link task --> current don't consider inductive link prediction
        # split.npy     --> positive and negative sample of edges
        _save_link_split(dir_shadow.format('split.npy'), split_edge, dtype)
    else:
        # name_data == 'ppa' (guaranteed by the check at the top): unweighted edges
        row, col = graph.edge_index
        dtype = _save_adj_csr(
            dir_shadow.format('adj_full_raw.npz'),
            np.ones(graph.num_edges),
            row.numpy(),
            col.numpy(),
            num_node,
        )
        graph = None    # no longer needed; only split_edge is used from here on
        # same as collab
        _save_link_split(dir_shadow.format('split.npy'), split_edge, dtype)
    print(f"Successfully saved shaDow-GNN dataset into {'/'.join(dir_shadow.split('/')[:-1])}")