def read_input()

in hugegraph-ml/src/hugegraph_ml/utils/dgl2hugegraph_utils.py [0:0]


def read_input(input_folder="dataset/avazu"):
    """Load a BGNN-style dataset folder into a single DGL graph.

    Reads ``X.csv``, ``y.csv``, ``graph.dgl`` and ``masks.json`` (plus the
    optional ``cat_features.txt``) from *input_folder* and attaches
    everything as node data to the first graph stored in ``graph.dgl``:

    * ``feat``: every value of ``X`` converted with ``int()``, stored as int32.
    * ``class``: the values of ``y``, stored as float64.
    * ``<name>_mask``: one int32 0/1 vector per entry of ``masks["0"]``,
      set to 1 at the node ids listed for that entry.
    * ``cat_features``: positional indices of the categorical columns of
      ``X`` (int32), repeated on every node. When there is no
      ``cat_features.txt`` or it names no column of ``X``, this is an empty
      ``(num_nodes, 0)`` tensor instead of an error.

    Reference: https://github.com/dmlc/dgl/blob/master/examples/pytorch/bgnn/run.py
    (X, y, cat_features and the masks are additionally stored in the graph).

    Args:
        input_folder: Directory holding the dataset files. Defaults to
            ``"dataset/avazu"``.

    Returns:
        The loaded graph with the node data described above added.
    """
    X = pd.read_csv(os.path.join(input_folder, "X.csv"))
    y = pd.read_csv(os.path.join(input_folder, "y.csv"))

    # cat_features.txt is optional: one categorical column name per line,
    # blank lines ignored.
    categorical_columns = []
    cat_features_path = os.path.join(input_folder, "cat_features.txt")
    if os.path.exists(cat_features_path):
        with open(cat_features_path, encoding="utf-8") as f:
            categorical_columns = [line.strip() for line in f if line.strip()]

    # Computed unconditionally: with no categorical columns the mask is all
    # False and np.where yields an empty index array, so the tensor built
    # below is empty rather than torch.tensor(None) raising.
    cat_features = np.where(X.columns.isin(categorical_columns))[0]
    for col in X.columns[cat_features]:
        # Same cast as the original code; int() below parses the strings back.
        X[col] = X[col].astype(str)

    gs, _ = load_graphs(os.path.join(input_folder, "graph.dgl"))
    graph = gs[0]
    num_nodes = graph.number_of_nodes()

    with open(os.path.join(input_folder, "masks.json"), encoding="utf-8") as f:
        masks = json.load(f)

    # add X
    features = [[int(x) for x in row] for row in X.values]
    graph.ndata["feat"] = torch.tensor(features, dtype=torch.int32)

    # add y
    graph.ndata["class"] = torch.tensor(y.values, dtype=torch.float64)

    # add masks
    # NOTE(review): only the entry under key "0" is used; presumably the
    # first split/seed of the dataset -- confirm against masks.json.
    for mask_name, node_ids in masks["0"].items():
        mask_tensor = torch.zeros(num_nodes, dtype=torch.int32)
        mask_tensor[node_ids] = 1
        graph.ndata[f"{mask_name}_mask"] = mask_tensor

    # add cat_features: the same index row is replicated for every node so it
    # can be stored as node data.
    cat_features_tensor = torch.tensor(cat_features, dtype=torch.int32)
    graph.ndata["cat_features"] = torch.repeat_interleave(
        cat_features_tensor[None, :], repeats=num_nodes, dim=0
    )

    return graph