# DataScience/dashboard_utils.py
import gzip
import json
import os
import sys
import time

import ds_parse  # sibling module providing the DSJSON line parsers and the progress helper
def create_stats(log_fp, log_type='cb', d=None, predictions_files=None, is_summary=False, report_progress=True):
    t0 = time.time()
    if d is None:
        d = {}

    if predictions_files is None:
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp+'.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)
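    # Prediction files are expected to follow the naming convention
    # '<log_fp>.<policy_name>.pred' (inferred from the filters above).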
    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            if is_summary:
                name = os.path.basename(pred_fp).split('.')[-2]
            else:
                name = pred_fp.split('.')[-2]  # check that the policy name is encoded in the file name
            if name:
                if log_type == 'cb':
                    # one prediction per non-empty line
                    with open(pred_fp) as f:
                        pred[name] = [x.strip() for x in f if x.strip()]
                elif log_type == 'ccb':
                    # slot predictions are grouped in blank-line-separated blocks
                    with open(pred_fp) as f:
                        pred[name] = []
                        slot = []
                        for x in f:
                            x = x.strip()
                            if x:
                                slot.append(x)
                            else:
                                pred[name].append(slot)
                                slot = []
                        if slot:  # keep the last block when the file lacks a trailing blank line
                            pred[name].append(slot)
                print('Loaded {} predictions from {}'.format(len(pred[name]), pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.format(pred_fp))
            sys.exit()
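    # Aggregation looks up each policy's prediction by event index, so the checks
    # here and after the main loop require all prediction files to line up
    # one-to-one with the events in the log.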
    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(len(pred[name]) for name in pred):
        print('Error: Prediction file lengths ({}) must be equal for all files'.format([len(pred[name]) for name in pred]))
        sys.exit()
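    # Stream the log line by line; gzip-compressed logs ('.gz') are decompressed on the fly.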
    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    i = -1  # so the final report below works even for an empty log file
    for i, x in enumerate(gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        if report_progress:
            # display progress
            bytes_count += len(x)
            if (i+1) % 1000 == 0:
                if log_fp.endswith('.gz'):
                    # compressed size is not comparable to decompressed bytes, so report the line count only
                    ds_parse.update_progress(i+1)
                else:
                    ds_parse.update_progress(bytes_count, tot_bytes)
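        # Recognize one DSJSON event per line: CB lines start with '{"_label_cost":'
        # and CCB lines with '{"Timestamp"'; anything else is skipped.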
        data = None
        if log_type == 'ccb':
            if x.startswith(b'{"Timestamp"') and x.strip().endswith(b'}'):
                data = ds_parse.ccb_json_cooked(x)
            if data is None:
                continue
            aggregates_ccb_data(data, pred, d, evts)
        elif log_type == 'cb':
            if is_summary:
                data = json.loads(x.decode("utf-8"))
            elif x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x, do_decode=True)

            # Skip wrongly formatted lines or not-activated (skipLearn) lines
            if data is None or data['skipLearn']:
                continue

            aggregates_cb_data(data, pred, d, evts)
        evts += 1  # count only lines that produced an aggregated event
    if report_progress:
        if log_fp.endswith('.gz'):
            len_text = ds_parse.update_progress(i+1)
        else:
            len_text = ds_parse.update_progress(bytes_count, tot_bytes)
        # erase the progress line before printing the summary
        sys.stdout.write("\r" + " "*len_text + "\r")
        sys.stdout.flush()
    print('Read {} lines - Processed {} events'.format(i+1, evts))
    if any(len(pred[name]) != evts for name in pred):
        print('Error: Prediction file length ({}) is different from the number of events in the log file ({})'.format([len(pred[name]) for name in pred], evts))
        sys.exit()

    print('Total Elapsed Time: {:.1f} sec.'.format(time.time()-t0))
    return d
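
# --- Usage sketch (illustrative; the file names below are hypothetical, not from this repo) ---
# A minimal sketch of how create_stats might be invoked on a decision-service
# log 'app.json' that has sibling prediction files such as 'app.json.policy1.pred':
if __name__ == '__main__':
    stats = create_stats('app.json', log_type='cb')
    print('Aggregated {} entries'.format(len(stats)))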