def main()

in sockeye/evaluate.py [0:0]


def main():
    params = argparse.ArgumentParser(description='Evaluate translations by calculating metrics with '
                                                 'respect to a reference set. If multiple hypothesis files are given, '
                                                 'the mean and standard deviation of the metrics are reported.')
    arguments.add_evaluate_args(params)
    arguments.add_logging_args(params)
    args = params.parse_args()
    setup_main_logger(file_logging=False)

    if args.quiet:
        logger.setLevel(logging.ERROR)

    utils.check_condition(args.offset >= 0, "Offset should be non-negative.")
    log_sockeye_version(logger)

    logger.info("Command: %s", " ".join(sys.argv))
    logger.info("Arguments: %s", args)

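    # References are read once and re-joined into whitespace-delimited strings;
    # each entry in args.hypotheses is a separate hypothesis file, i.e. one
    # hypothesis set per file. Unless --not-strict is given, every set must
    # contain exactly as many lines as there are references.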
    references = [' '.join(e) for e in data_io_pt.read_content(args.references)]
    all_hypotheses = [[h.strip() for h in hypotheses] for hypotheses in args.hypotheses]
    if not args.not_strict:
        for hypotheses in all_hypotheses:
            utils.check_condition(len(hypotheses) == len(references),
                                  "Number of hypotheses (%d) and references (%d) does not match." % (len(hypotheses),
                                                                                                     len(references)))
    logger.info("%d hypothesis set(s) | %d hypotheses | %d references",
                len(all_hypotheses), len(all_hypotheses[0]), len(references))

    metric_info = ["%s\t(s_opt)" % name for name in args.metrics]
    logger.info("\t".join(metric_info))

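    # Normalize every requested metric to the shared signature
    # (hypotheses, references) -> float. BLEU also needs the smoothing offset,
    # which functools.partial binds up front.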
    metrics = []  # type: List[Tuple[str, Callable]]
    for name in args.metrics:
        if name == C.BLEU:
            func = partial(raw_corpus_bleu, offset=args.offset)
        elif name == C.CHRF:
            func = raw_corpus_chrf
        elif name == C.ROUGE1:
            func = raw_corpus_rouge1
        elif name == C.ROUGE2:
            func = raw_corpus_rouge2
        elif name == C.ROUGEL:
            func = raw_corpus_rougel
        else:
            raise ValueError("Unknown metric %s." % name)
        metrics.append((name, func))

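    # Corpus mode scores each full hypothesis set against the references and
    # reports mean/std across sets; with --sentence, every hypothesis/reference
    # pair is scored and printed on its own line.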
    if not args.sentence:
        scores = defaultdict(list)  # type: Dict[str, List[float]]
        for hypotheses in all_hypotheses:
            for name, metric in metrics:
                scores[name].append(metric(hypotheses, references))
        _print_mean_std_score(metrics, scores)
    else:
        for hypotheses in all_hypotheses:
            for h, r in zip(hypotheses, references):
                scores = defaultdict(list)  # type: Dict[str, List[float]]
                for name, metric in metrics:
                    scores[name].append(metric([h], [r]))
                _print_mean_std_score(metrics, scores)
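
For context, main() relies on module-level names from sockeye/evaluate.py that are not part of this excerpt. A rough reconstruction, inferred from the calls above (exact import paths are assumptions):

import argparse
import logging
import sys
from collections import defaultdict
from functools import partial
from typing import Callable, Dict, List, Tuple

from sockeye import arguments, constants as C, data_io_pt, utils
from sockeye.log import setup_main_logger
from sockeye.utils import log_sockeye_version

# raw_corpus_bleu, raw_corpus_chrf, raw_corpus_rouge1, raw_corpus_rouge2,
# raw_corpus_rougel and _print_mean_std_score are defined elsewhere in
# sockeye/evaluate.py.
logger = logging.getLogger(__name__)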
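
_print_mean_std_score is defined elsewhere in the file. Based on how it is called here (one list of scores per metric, collected across hypothesis sets), a minimal sketch of the expected behavior, with the exact output format assumed:

import statistics
from typing import Callable, Dict, List, Tuple

def _print_mean_std_score(metrics: List[Tuple[str, Callable]],
                          scores: Dict[str, List[float]]) -> None:
    """Sketch: print one tab-separated field per metric, in metric order."""
    fields = []
    for name, _ in metrics:
        values = scores[name]
        if len(values) > 1:
            # Multiple hypothesis sets: report mean and sample std deviation.
            fields.append("%.3f\t%.3f" % (statistics.mean(values),
                                          statistics.stdev(values)))
        else:
            fields.append("%.3f" % values[0])
    print("\t".join(fields))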
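
A hypothetical invocation of the resulting CLI (flag names inferred from the args attributes used above; see arguments.add_evaluate_args for the authoritative set):

sockeye-evaluate --references ref.txt \
                 --hypotheses hyp.sys1.txt hyp.sys2.txt \
                 --metrics bleu chrf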