in hugegraph-ml/src/hugegraph_ml/utils/dgl2hugegraph_utils.py [0:0]
def read_input(input_folder="dataset/avazu"):
    """Load a tabular node dataset and attach it to its DGL graph.

    Reads the following files from ``input_folder``:

    * ``X.csv`` -- node feature table (values must be convertible to ``int``).
    * ``y.csv`` -- node target table.
    * ``cat_features.txt`` -- optional; one categorical column name per line.
    * ``graph.dgl`` -- serialized graph(s); only the first one is used.
    * ``masks.json`` -- mapping of split id to ``{mask_name: [node ids]}``;
      only split ``"0"`` is used.

    The returned graph carries these node data entries:

    * ``feat`` -- int32 tensor built from ``X``.
    * ``class`` -- float64 tensor built from ``y``.
    * ``<mask_name>_mask`` -- int32 0/1 tensor for every mask in split ``"0"``.
    * ``cat_features`` -- int32 tensor holding the positional indices of the
      categorical columns of ``X``, repeated once per node. It has zero
      columns when no categorical column is declared.

    Args:
        input_folder: Directory containing the files listed above. Defaults
            to ``"dataset/avazu"``.

    Returns:
        The first graph stored in ``graph.dgl`` with the node data added.
    """
    # reference: https://github.com/dmlc/dgl/blob/master/examples/pytorch/bgnn/run.py
    # Unlike the reference, X, y, cat_features and the masks are stored on the graph.
    X = pd.read_csv(os.path.join(input_folder, "X.csv"))
    y = pd.read_csv(os.path.join(input_folder, "y.csv"))

    # The categorical column list is optional; a missing or blank file means "none".
    categorical_columns = []
    cat_features_path = os.path.join(input_folder, "cat_features.txt")
    if os.path.exists(cat_features_path):
        with open(cat_features_path, encoding="utf-8") as f:
            categorical_columns = [line.strip() for line in f if line.strip()]

    # Positional indices of the categorical columns inside X. This is always an
    # array (possibly empty) so the tensor conversion below never receives None.
    cat_features = np.where(X.columns.isin(categorical_columns))[0]
    for col in X.columns[cat_features]:
        X[col] = X[col].astype(str)

    gs, _ = load_graphs(os.path.join(input_folder, "graph.dgl"))
    graph = gs[0]
    num_nodes = graph.number_of_nodes()

    with open(os.path.join(input_folder, "masks.json"), encoding="utf-8") as f:
        masks = json.load(f)

    # add X (categorical columns were stringified above, so parse each cell back to int)
    features = [[int(x) for x in row] for row in X.values]
    graph.ndata["feat"] = torch.tensor(features, dtype=torch.int32)

    # add y
    graph.ndata["class"] = torch.tensor(y.values, dtype=torch.float64)

    # add masks: one 0/1 indicator vector per mask of split "0"
    for mask_name, node_ids in masks["0"].items():
        mask_tensor = torch.zeros(num_nodes, dtype=torch.int32)
        mask_tensor[node_ids] = 1
        graph.ndata[f"{mask_name}_mask"] = mask_tensor

    # add cat_features: node data needs one row per node, so the index vector
    # is replicated num_nodes times along dim 0.
    cat_features_tensor = torch.tensor(cat_features, dtype=torch.int32)
    graph.ndata["cat_features"] = torch.repeat_interleave(
        cat_features_tensor[None, :], repeats=num_nodes, dim=0
    )
    return graph