def read_data()

in source/sagemaker/baseline/train_pytorch_mlp_entity_resolution.py


import logging
import os

import numpy as np
import pandas as pd


def read_data(training_dir, user_features, url_features, transient_edges, train_edges):
    """Load feature and edge files, build the per-user feature matrix, and index the training pairs."""
    user_features_df = pd.read_csv(os.path.join(training_dir, user_features), header=None).set_index(0)
    logging.info("Read user features from {}".format(os.path.join(training_dir, user_features)))

    url_features_df = pd.read_csv(os.path.join(training_dir, url_features), header=None).set_index(0)
    logging.info("Read url features from {}".format(os.path.join(training_dir, url_features)))

    transient_interactions = pd.read_csv(os.path.join(training_dir, transient_edges), header=None)
    logging.info("Read transient_interactions from {}".format(os.path.join(training_dir, transient_edges)))

    # Collapse each user's transient url interactions into a single
    # comma-separated string of url ids, indexed by user id.
    transient_interactions = (
        transient_interactions.groupby([0])[1]
        .apply(','.join)
        .reset_index()
        .drop_duplicates()
        .set_index(0)
    )
    logging.info("Grouped transient_interactions")

    # Each user's feature vector is their own features concatenated with the
    # mean of the features of the urls they transiently interacted with.
    n_user, d_user = user_features_df.shape
    d_url = url_features_df.shape[1]
    features = np.zeros((n_user, d_user + d_url))
    for i, (uid, row) in enumerate(user_features_df.iterrows()):
        features[i, :d_user] = row
        url_ids = transient_interactions.loc[uid].values[0].split(',')
        features[i, d_user:] = url_features_df.loc[url_ids].mean(axis=0)

    train_pairs = pd.read_csv(os.path.join(training_dir, train_edges), header=None)
    logging.info("Read ground truth training pairs from {}".format(os.path.join(training_dir, train_edges)))

    # Map user ids to row indices in the feature matrix so the training pairs
    # can be expressed as positions rather than raw ids.
    uid_to_idx = {uid: i for i, uid in enumerate(user_features_df.index.values)}
    true_i = train_pairs[0].apply(lambda uid: uid_to_idx[uid])
    true_j = train_pairs[1].apply(lambda uid: uid_to_idx[uid])
    return features.astype(np.float32), true_i, true_j, uid_to_idx
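
For context, a minimal sketch of how read_data might be wired up in the training entry point. The file names and the SM_CHANNEL_TRAIN fallback are assumptions for illustration, not taken from the source:

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Assumed file names; the real entry point likely passes these as CLI arguments.
    features, true_i, true_j, uid_to_idx = read_data(
        training_dir=os.environ.get("SM_CHANNEL_TRAIN", "./data"),  # SageMaker training channel (assumed)
        user_features="user_features.csv",
        url_features="url_features.csv",
        transient_edges="transient_edges.csv",
        train_edges="train_edges.csv",
    )
    logging.info("Feature matrix shape: {}, training pairs: {}".format(features.shape, len(true_i)))

Returning uid_to_idx alongside the indexed pairs lets downstream code translate additional id-keyed edge lists (e.g. validation or test pairs) into the same row positions without re-reading the feature file.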