in sockeye/evaluate.py [0:0]
def main():
    params = argparse.ArgumentParser(description='Evaluate translations by calculating metrics with '
                                                 'respect to a reference set. If multiple hypotheses files are given '
                                                 'the mean and standard deviation of the metrics are reported.')
    arguments.add_evaluate_args(params)
    arguments.add_logging_args(params)
    args = params.parse_args()

    setup_main_logger(file_logging=False)
    if args.quiet:
        logger.setLevel(logging.ERROR)

    utils.check_condition(args.offset >= 0, "Offset should be non-negative.")
    log_sockeye_version(logger)

    logger.info("Command: %s", " ".join(sys.argv))
    logger.info("Arguments: %s", args)

    references = [' '.join(e) for e in data_io_pt.read_content(args.references)]
    all_hypotheses = [[h.strip() for h in hypotheses] for hypotheses in args.hypotheses]

    if not args.not_strict:
        for hypotheses in all_hypotheses:
            utils.check_condition(len(hypotheses) == len(references),
                                  "Number of hypotheses (%d) and references (%d) does not match."
                                  % (len(hypotheses), len(references)))
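    # When args.not_strict is set, the length check above is skipped; in the
    # sentence-level branch below, zip() will then silently truncate to the shorter list.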
logger.info("%d hypothesis set(s) | %d hypotheses | %d references",
len(all_hypotheses), len(all_hypotheses[0]), len(references))
metric_info = ["%s\t(s_opt)" % name for name in args.metrics]
logger.info("\t".join(metric_info))
metrics = [] # type: List[Tuple[str, Callable]]
for name in args.metrics:
if name == C.BLEU:
func = partial(raw_corpus_bleu, offset=args.offset)
elif name == C.CHRF:
func = raw_corpus_chrf
elif name == C.ROUGE1:
func = raw_corpus_rouge1
elif name == C.ROUGE2:
func = raw_corpus_rouge2
elif name == C.ROUGEL:
func = raw_corpus_rougel
else:
raise ValueError("Unknown metric %s." % name)
metrics.append((name, func))
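    # Every metric callable is assumed to share the signature
    # metric(hypotheses, references) -> float; BLEU's smoothing offset is the one
    # extra parameter, which is why it is bound up front via functools.partial.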
    if not args.sentence:
        # Corpus-level scoring: one score per metric for each hypotheses file.
        scores = defaultdict(list)  # type: Dict[str, List[float]]
        for hypotheses in all_hypotheses:
            for name, metric in metrics:
                scores[name].append(metric(hypotheses, references))
        _print_mean_std_score(metrics, scores)
    else:
        # Sentence-level scoring: one line of scores per hypothesis/reference pair.
        for hypotheses in all_hypotheses:
            for h, r in zip(hypotheses, references):
                scores = defaultdict(list)  # type: Dict[str, List[float]]
                for name, metric in metrics:
                    scores[name].append(metric([h], [r]))
                _print_mean_std_score(metrics, scores)
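_print_mean_std_score is defined elsewhere in sockeye/evaluate.py and is not part of this excerpt. As a rough illustration of the output format implied above (one column per metric, mean plus standard deviation when several hypotheses files are scored), a minimal compatible helper could look like the sketch below; this is an assumption about its behavior, not the actual implementation.

    from statistics import mean, stdev
    from typing import Callable, Dict, List, Tuple

    def _print_mean_std_score(metrics, scores):
        # type: (List[Tuple[str, Callable]], Dict[str, List[float]]) -> None
        # Illustrative sketch only: prints one tab-separated "mean<TAB>(std)" pair per
        # metric, in the same order as the header logged in main(). With a single
        # hypotheses file there is nothing to average, so a placeholder is printed
        # instead of a standard deviation.
        columns = []
        for name, _ in metrics:
            values = scores[name]
            if len(values) > 1:
                columns.append("%.3f\t(%.3f)" % (mean(values), stdev(values)))
            else:
                columns.append("%.3f\t(-)" % values[0])
        print("\t".join(columns))

A typical command line (flag names assumed from arguments.add_evaluate_args, not shown here) would be something like: python -m sockeye.evaluate --references ref.txt --hypotheses hyp.sys1.txt hyp.sys2.txt --metrics bleu chrf, with --sentence switching to the per-sentence branch shown above.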