in source/sagemaker/baseline/train_pytorch_mlp_entity_resolution.py [0:0]
def read_data(training_dir, user_features, url_features, transient_edges, train_edges):
    """Load user/url features and ground-truth pairs for entity resolution training.

    Each user's feature vector is the concatenation of its own features and the
    mean of the features of the urls it transiently interacted with.

    Args:
        training_dir: Directory containing all input CSV files.
        user_features: CSV of user features; column 0 is the user id (no header).
        url_features: CSV of url features; column 0 is the url id (no header).
        transient_edges: CSV of (user_id, url_id) interaction edges (no header).
        train_edges: CSV of (user_id, user_id) ground-truth matching pairs (no header).

    Returns:
        Tuple of (features, true_i, true_j, uid_to_idx) where `features` is a
        float32 array of shape (n_users, d_user + d_url), `true_i`/`true_j` are
        Series of row indices for the matched pairs, and `uid_to_idx` maps
        user id -> row index in `features`.

    Raises:
        KeyError: if a user in `train_edges` is absent from `user_features`,
            or a user has no transient interactions / references an unknown url.
    """
    user_features_path = os.path.join(training_dir, user_features)
    user_features_df = pd.read_csv(user_features_path, header=None).set_index(0)
    # Fixed: original format string had no placeholder, so the path was never logged.
    logging.info("Read user features from %s", user_features_path)

    url_features_path = os.path.join(training_dir, url_features)
    url_features_df = pd.read_csv(url_features_path, header=None).set_index(0)
    logging.info("Read url features from %s", url_features_path)

    transient_edges_path = os.path.join(training_dir, transient_edges)
    transient_interactions = pd.read_csv(transient_edges_path, header=None)
    logging.info("Read transient_interactions %s", transient_edges_path)

    # Collapse each user's interacted urls into one comma-joined string, indexed by user id.
    transient_interactions = (
        transient_interactions.groupby([0])[1]
        .apply(','.join)
        .reset_index()
        .drop_duplicates()
        .set_index(0)
    )
    logging.info("Grouped transient_interactions")

    n_user, d_user = user_features_df.shape
    d_url = url_features_df.shape[1]

    # Row i: [user i's own features | mean of features of urls user i interacted with].
    # NOTE(review): assumes every user appears in transient_edges — a missing user
    # raises KeyError on .loc[uid]; confirm upstream data guarantees this.
    features = np.zeros((n_user, d_user + d_url))
    for i, (uid, row) in enumerate(user_features_df.iterrows()):
        features[i, :d_user] = row
        url_ids = transient_interactions.loc[uid].values[0].split(',')
        features[i, d_user:] = url_features_df.loc[url_ids].mean(axis=0)

    train_edges_path = os.path.join(training_dir, train_edges)
    train_pairs = pd.read_csv(train_edges_path, header=None)
    logging.info("Read ground truth training pairs %s", train_edges_path)

    uid_to_idx = {uid: i for i, uid in enumerate(user_features_df.index.values)}
    # __getitem__ (not Series.map) so an unknown uid raises KeyError, as before.
    true_i = train_pairs[0].apply(uid_to_idx.__getitem__)
    true_j = train_pairs[1].apply(uid_to_idx.__getitem__)
    return features.astype(np.float32), true_i, true_j, uid_to_idx