# create_stats()
#
# From DataScience/dashboard_utils.py [0:0]

def _find_prediction_files(log_fp):
    """Return paths of sibling prediction files named '<log_fp>.*.pred'."""
    print('Searching prediction files for log file: {}'.format(log_fp))
    files = []
    for entry in os.scandir(os.path.dirname(log_fp)):
        if entry.path.startswith(log_fp + '.') and entry.name.endswith('.pred'):
            files.append(entry.path)
    return files


def _load_prediction_file(pred_fp, log_type):
    """Load one .pred file.

    Returns a list of stripped non-empty lines for 'cb', or a list of
    slot-lists (groups of lines separated by blank lines) for 'ccb'.
    The file handle is closed via the context manager (the original
    leaked it).
    """
    with open(pred_fp) as f:
        if log_type == 'cb':
            return [x.strip() for x in f if x.strip()]
        # ccb: blank lines delimit slots belonging to one event
        preds = []
        slot = []
        for x in f:
            x = x.strip()
            if x:
                slot.append(x)
            else:
                preds.append(slot)
                slot = []
        # Flush the last slot when the file lacks a trailing blank line;
        # the original silently dropped it in that case.
        if slot:
            preds.append(slot)
        return preds


def create_stats(log_fp, log_type='cb', d=None, predictions_files=None, is_summary=False, report_progress=True):
    """Aggregate per-event statistics from a decision-service log file.

    Args:
        log_fp: path to the log file ('.gz' suffix triggers gzip reading).
        log_type: 'cb' (contextual bandit) or 'ccb' (conditional CB).
        d: dict accumulator to update in place; a new one is created if None.
        predictions_files: explicit list of .pred file paths; when None,
            sibling files named '<log_fp>.*.pred' are auto-discovered.
        is_summary: cb lines are full JSON summaries rather than cooked logs.
        report_progress: print an in-place progress indicator while reading.

    Returns:
        The (possibly newly created) accumulator dict ``d``.

    Exits the process (sys.exit) on missing prediction files or on
    prediction/event count mismatches, preserving the original behavior.
    """
    t0 = time.time()
    if d is None:
        d = {}

    if predictions_files is None:
        predictions_files = _find_prediction_files(log_fp)

    # Load predictions, keyed by the policy name encoded in the file name.
    pred = {}
    for pred_fp in predictions_files:
        if not os.path.isfile(pred_fp):
            print('Error loading policy predictions. Pred file not found: {}'.format(pred_fp))
            sys.exit()
        if is_summary:
            name = pred_fp.split('/')[-1].split('.')[-2]
        else:
            name = pred_fp.split('.')[-2]  # check that policy name is encoded in file_name
        if not name:
            print('Name is not valid - Skip: {}'.format(pred_fp))
            continue
        pred[name] = _load_prediction_file(pred_fp, log_type)
        print('Loaded {} predictions from {}'.format(len(pred[name]), pred_fp))

    # All prediction files must describe the same number of events.
    lengths = [len(p) for p in pred.values()]
    if len(pred) > 1 and min(lengths) != max(lengths):
        print('Error: Prediction file length ({}) must be equal for all files'.format(lengths))
        sys.exit()

    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    lines_read = 0  # avoids NameError below when the log file is empty
    opener = gzip.open if log_fp.endswith('.gz') else open
    with opener(log_fp, 'rb') as log_file:  # ensure the handle is closed
        for i, x in enumerate(log_file):
            lines_read = i + 1
            if report_progress:
                bytes_count += len(x)
                if lines_read % 1000 == 0:
                    if log_fp.endswith('.gz'):
                        # byte progress is meaningless for compressed input
                        ds_parse.update_progress(lines_read)
                    else:
                        ds_parse.update_progress(bytes_count, tot_bytes)

            if log_type == 'ccb':
                if x.startswith(b'{"Timestamp"') and x.strip().endswith(b'}'):
                    data = ds_parse.ccb_json_cooked(x)
                    aggregates_ccb_data(data, pred, d, evts)
            elif log_type == 'cb':
                if is_summary:
                    data = json.loads(x.decode("utf-8"))
                elif x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
                    data = ds_parse.json_cooked(x, do_decode=True)
                else:
                    data = None

                # Skip wrongly formatted lines or not activated lines
                if data is None or data['skipLearn']:
                    continue

                aggregates_cb_data(data, pred, d, evts)
            # NOTE(review): for ccb, evts also counts lines that failed the
            # format check (unlike cb, which 'continue's past them) —
            # preserved from the original; confirm it is intentional.
            evts += 1

    if report_progress:
        # Erase the progress line.
        if log_fp.endswith('.gz'):
            len_text = ds_parse.update_progress(lines_read)
        else:
            len_text = ds_parse.update_progress(bytes_count, tot_bytes)
        sys.stdout.write("\r" + " " * len_text + "\r")
        sys.stdout.flush()

    print('Read {} lines - Processed {} events'.format(lines_read, evts))

    if any(len(p) != evts for p in pred.values()):
        print('Error: Prediction file length ({}) is different from number of events in log file ({})'.format([len(p) for p in pred.values()], evts))
        sys.exit()
    print('Total Elapsed Time: {:.1f} sec.'.format(time.time() - t0))
    return d