in source/sagemaker/data-preparation/data_prep.py [0:0]
def merge_websites_with_user_visits(data_dir, facts, url_data, primary_key, output_dir, logs):
    """Join each user's visit facts with URL metadata and write one CSV.

    The file ``data_dir/facts`` is read line by line. Every non-blank line is
    parsed as a JSON object from which the ``"facts"`` and ``"uid"`` entries
    are read. The facts are flattened with ``pd.json_normalize``, concatenated
    column-wise with the rows of ``url_data`` selected by the facts'
    ``primary_key`` values, tagged with the record's ``uid`` and appended to
    ``output_dir/logs``.

    Args:
        data_dir: Directory containing the input file.
        facts: Name of the JSON-lines input file inside ``data_dir``.
        url_data: Pandas object supporting ``.loc[...]`` lookups by the
            ``primary_key`` values (presumably a DataFrame indexed by that
            key; confirm against the caller).
        primary_key: Name of the field in each fact used for the lookup.
        output_dir: Directory the CSV is written to.
        logs: Name of the CSV output file inside ``output_dir``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If ``primary_key`` is missing from the flattened facts or a
            key cannot be found in ``url_data``.

    Notes:
        The output file is always created (and truncated), even when the
        input holds no records, so stale output from an earlier run is never
        left behind. The header row is written once, with the first record.
    """
    facts_path = os.path.join(data_dir, facts)
    logs_path = os.path.join(output_dir, logs)

    # The input is opened first so a missing facts file fails before the
    # output is truncated. ``newline=""`` is what pandas asks for when a file
    # object (rather than a path) is handed to ``to_csv``.
    with open(facts_path, encoding="utf-8") as f_in, \
            open(logs_path, "w", encoding="utf-8", newline="") as f_out:
        write_header = True
        for line in f_in:
            payload = line.strip()
            if not payload:
                # Blank lines (e.g. a trailing empty line) carry no record.
                continue

            record = json.loads(payload)
            user_visits = pd.json_normalize(record.get("facts"))
            fids = user_visits[primary_key].values
            user_visits = pd.concat(
                (user_visits.set_index(primary_key), url_data.loc[fids]),
                axis=1,
            )
            user_visits["uid"] = record.get("uid")

            # index=False: the primary_key values (now the index) are not
            # written to the CSV, exactly as before.
            user_visits.to_csv(f_out, index=False, header=write_header)
            write_header = False