# process_logs()
#
# From: source/sagemaker/data-preprocessing/data_preprocessing.py [0:0]


def process_logs(data_dir, log_file, output_dir):
    """Read raw website-visit logs and write the derived graph CSVs.

    Loads ``log_file`` from ``data_dir`` and emits four files into
    ``output_dir``:

    * ``transient_edges.csv``       — deduplicated user -> url edges
    * ``transient_nodes.csv``       — per-user features (from uid/ts)
    * ``website_nodes.csv``         — per-website features (from urls/titles)
    * ``website_group_edges.csv``   — url -> domain grouping edges

    Args:
        data_dir: Directory containing the raw log file.
        log_file: Name of the CSV log file to read.
        output_dir: Directory the derived CSV files are written to.
    """
    # Load the raw visit logs once; every output below is derived from it.
    log_path = os.path.join(data_dir, log_file)
    logs = pd.read_csv(log_path)
    logging.info(f"Read user website visit logs from: {log_path}")

    # user -> url edges, deduplicated.
    edges_path = os.path.join(output_dir, 'transient_edges.csv')
    edge_frame = logs[['uid', 'urls']].drop_duplicates()
    save_file(edge_frame, edges_path,
              f"Saved user -> url transient edges to {edges_path}")

    # Per-user node features derived from visit timestamps.
    user_nodes_path = os.path.join(output_dir, 'transient_nodes.csv')
    user_frame = get_user_features(logs[['uid', 'ts']])
    save_file(user_frame, user_nodes_path,
              f"Saved transient user features to {user_nodes_path}")

    # Per-website node features; missing titles become empty strings.
    site_nodes_path = os.path.join(output_dir, 'website_nodes.csv')
    site_frame = get_website_features(
        logs[['urls', 'titles']].drop_duplicates().fillna(""))
    save_file(site_frame, site_nodes_path,
              f"Saved website_features to {site_nodes_path}")

    # url -> domain grouping edges over the unique urls.
    group_path = os.path.join(output_dir, 'website_group_edges.csv')
    group_frame = get_website_groupings(logs[['urls']].drop_duplicates())
    save_file(group_frame, group_path,
              f"Saved url -> domain edges to {group_path}")