in source/sagemaker/data-preprocessing/data_preprocessing.py [0:0]
def process_logs(data_dir, log_file, output_dir):
    """Transform raw user website-visit logs into graph node/edge CSV files.

    Reads ``log_file`` (a CSV with at least the columns ``uid``, ``urls``,
    ``titles``, ``ts``) from ``data_dir`` and writes four CSVs to
    ``output_dir``:

    - ``transient_edges.csv``      — deduplicated user -> url edges
    - ``transient_nodes.csv``      — per-user features derived from timestamps
    - ``website_nodes.csv``        — per-url features derived from page titles
    - ``website_group_edges.csv``  — url -> domain grouping edges

    :param data_dir: directory containing the input log file
    :param log_file: name of the CSV log file to read
    :param output_dir: directory where the four output CSVs are written
    """
    # Build the input path once; it is needed both for reading and logging.
    log_path = os.path.join(data_dir, log_file)
    logs = pd.read_csv(log_path)
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logging.info("Read user website visit logs from: %s", log_path)

    # User -> url edges; duplicates carry no extra information for the graph.
    transient_edges = os.path.join(output_dir, 'transient_edges.csv')
    save_file(logs[['uid', 'urls']].drop_duplicates(), transient_edges,
              "Saved user -> url transient edges to {}".format(transient_edges))

    # Per-user node features computed from visit timestamps.
    transient_nodes_file = os.path.join(output_dir, 'transient_nodes.csv')
    user_features = get_user_features(logs[['uid', 'ts']])
    save_file(user_features, transient_nodes_file,
              "Saved transient user features to {}".format(transient_nodes_file))

    # Per-website node features; missing titles become empty strings so the
    # feature extractor never sees NaN.
    website_nodes_file = os.path.join(output_dir, 'website_nodes.csv')
    website_features = get_website_features(
        logs[['urls', 'titles']].drop_duplicates().fillna(""))
    save_file(website_features, website_nodes_file,
              "Saved website_features to {}".format(website_nodes_file))

    # url -> domain edges grouping individual pages under their site.
    website_group_file = os.path.join(output_dir, 'website_group_edges.csv')
    website_groupings = get_website_groupings(logs[['urls']].drop_duplicates())
    save_file(website_groupings, website_group_file,
              "Saved url -> domain edges to {}".format(website_group_file))