def score()

in paper/experiments/mturk/prepare_mturk.py [0:0]
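
The listing below assumes the module-level imports and helpers defined elsewhere in prepare_mturk.py (dataset_fields, add_closest_score, compute_stat_sig). A plausible import block, inferred from the names used in the function rather than taken from the file itself, would be:

import json
import math
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import scikit_posthocs as sp
from mlxtend.evaluate import mcnemar, mcnemar_table
from sklearn.metrics import classification_report, cohen_kappa_score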


def score(data_folder, out_folder, task, score_folder):
    """Aggregate the MTurk annotations for `task` ("fidelity_annotations" or "fluency"),
    compare systems (and, for fidelity, the automatic SFC/SER metrics), and write
    summary statistics to `{data_folder}/{task}.json`."""
    data_folder = Path(data_folder)
    out_folder = Path(out_folder)
    datasets = ["ldc", "viggo", "webnlg", "e2e"]
    systems = ["systemNoFcNoFs", "systemNoFc", "systemFcPost", "sota", "human"]
    stats = {}
    # Paired annotator judgements across all datasets, used for Cohen's kappa at the end.
    first = []
    second = []
    for dataset in datasets:

        print(f"processing {dataset}")
        systems_data = {}

        for system in systems:
            with open(data_folder / dataset / f"{system}.json") as f:
                systems_data[system] = json.load(f)

        print(f"dataset: {dataset}")
        all_scored = defaultdict(list)
        score_folder = Path(score_folder)
        score_file = score_folder / task / f"{dataset}.csv"
        total_texts = 5  # number of texts shown in each HIT
        try:
            df = pd.read_csv(score_file)
        except FileNotFoundError:
            print(f"{score_file} not available.")
            continue
        scores = df.to_dict(orient="records")
        try:
            input_df = pd.read_csv(out_folder / task / f"mturk_{dataset}.csv")
        except FileNotFoundError:
            print(f"ignoring {dataset}")
            continue
        input_data = input_df.to_dict(orient="records")

        if task == "fidelity_annotations":
            # Group the binary "accurate" judgements by (item index, shown text).
            for item in scores:
                for i in range(total_texts):
                    text = item[f"Input.text{i + 1}"]
                    index = item["Input.index"]
                    accurate = f"Answer.text{i + 1}_accurate.text{i + 1}_accurate"
                    key = f"{index}_{text}"
                    try:
                        all_scored[key].append({"accurate": item[accurate]})
                    except KeyError:
                        # Drop into the debugger if an expected answer column is missing.
                        import ipdb

                        ipdb.set_trace()

            fidelity_scores = []

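            # Automatic-metric labels (SFC and SER) and the human-derived reference
            # labels they are evaluated against.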
            all_ser_scores = []
            all_sfc_scores = []
            true_scores_sfc = []
            true_scores_ser = []
            sfc_data = defaultdict(list)
            ser_data = defaultdict(list)

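            # Pair up the two crowd judgements per text; Cohen's kappa over these pairs
            # is reported after all datasets have been processed.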
            for x in all_scored:
                try:
                    one = all_scored[x][0]["accurate"]
                    two = all_scored[x][1]["accurate"]
                    first.append(one)
                    second.append(two)
                except IndexError:
                    # Texts judged by fewer than two annotators are skipped here.
                    pass

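            # Map each judged text back to the system that produced it and to the
            # corresponding entry in that system's output file.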
            for item in input_data:
                for i in range(total_texts):
                    text_i = item[f"text{i + 1}"]
                    system = item[f"system{i + 1}"]
                    index = item["index"]
                    key = f"{index}_{text_i}"

                    if key in all_scored:
                        obj = systems_data[system][index]
                        score = np.mean([int(x["accurate"]) for x in all_scored[key]])
                        # these have to be reconciled if disagreeing: take ceil or floor

                        sample_type = "A_D" if obj["sfc_correct"] else "E_D"
                        if dataset != "ldc":
                            sample_type += f',{"A_H" if obj["ser_correct"] else "E_H"}'

                        fidelity_scores.append(
                            {
                                "ind": index,
                                "system": system,
                                "value": math.ceil(score),
                                "sample_type": sample_type,
                                "text": text_i,
                                "data": item["data"],
                                "original_text": obj["original_" + dataset_fields[dataset]["text"].strip()],
                                "sfc_correct": obj["sfc_correct"],
                                "ser_correct": obj["ser_correct"] if "ser_correct" in obj else "",
                            }
                        )
                        # Reconciled cases are those where the expert annotators disagreed; they
                        # discussed these cases and agreed on the labels below.
                        reconciled = {
                            "Example 1": 0,
                            "Example 2": 1,
                        }
                        if text_i in reconciled:
                            true_scores_sfc.append(reconciled[text_i])
                            true_scores_ser.append(reconciled[text_i])
                        else:
                            add_closest_score(score, true_scores_sfc, obj["sfc_correct"])
                            if dataset != "ldc":
                                add_closest_score(score, true_scores_ser, obj["ser_correct"])

                        all_sfc_scores.append(obj["sfc_correct"])

                        sfc_data[system].append(obj["sfc_correct"])

                        if dataset != "ldc":
                            all_ser_scores.append(obj["ser_correct"])
                            ser_data[system].append(obj["ser_correct"])

            if dataset != "ldc":
                c_report = classification_report(true_scores_ser, all_ser_scores)
                stats[f"{dataset}_ser_report"] = classification_report(
                    true_scores_ser, all_ser_scores, output_dict=True
                )
                print("SER")
                print(c_report)

            c_report = classification_report(true_scores_sfc, all_sfc_scores)
            stats[f"{dataset}_sfc_report"] = classification_report(true_scores_sfc, all_sfc_scores, output_dict=True)
            print("SFC")
            print(c_report)

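            # Per-system means and counts of the crowd fidelity judgements.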
            mturk_df = pd.DataFrame(fidelity_scores)

            agg_stats = mturk_df.groupby(["system"]).agg(["mean", "count"])
            print(agg_stats)
            stats[f"{dataset}_score"] = agg_stats.to_dict()[("value", "mean")]
            stats[f"{dataset}_count"] = agg_stats.to_dict()[("value", "count")]
            print(mturk_df.groupby(["system", "sample_type"]).agg(["mean", "count"]))

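            # McNemar's test on whether SFC and SER differ in how often they match the
            # human labels.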
            if dataset != "ldc":
                tb_b = mcnemar_table(
                    y_target=np.array(true_scores_sfc),
                    y_model1=np.array(all_sfc_scores),
                    y_model2=np.array(all_ser_scores),
                )
                print(tb_b)
                chi2, p = mcnemar(ary=tb_b, corrected=True)
                print(f"mcnemar chi2: {chi2}, p-value {p}")

            for measure in ["sfc_correct", "ser_correct"]:
                if measure == "ser_correct" and dataset == "ldc":
                    continue
                stats[f"{dataset}_significance_{measure}"] = compute_stat_sig(systems_data, system, measure)

        elif task == "fluency":

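            # Average each text's fluency ratings across the annotators who rated it.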
            for item in scores:
                for i in range(total_texts):
                    field = f"Input.text{i + 1}"
                    answer_field = f"Answer.fluency{i + 1}"
                    all_scored[item[field]].append(item[answer_field])

            for x in all_scored:
                all_scored[x] = {"average": np.mean(all_scored[x]), "count": len(all_scored[x])}

            fluency_scores = defaultdict(list)

            for item in input_data:
                for i in range(total_texts):
                    if item[f"text{i + 1}"] in all_scored:
                        score = all_scored[item[f"text{i + 1}"]]["average"]
                        system = item[f"system{i + 1}"]
                        fluency_scores[system].append(score)

            fluency_df_values = []
            for system in fluency_scores:
                fluency_df_values.extend(
                    {"system": system, "value": v} for v in fluency_scores[system]
                )

            mturk_df = pd.DataFrame(fluency_df_values)
            agg_stats = mturk_df.groupby(["system"]).agg(["mean", "count", "median"])
            print(agg_stats)
            stats[dataset] = agg_stats.to_dict()[("value", "mean")]

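            # Pairwise Wilcoxon signed-rank tests between systems; zero differences are
            # split between positive and negative ranks ("zsplit").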
            test_stats = sp.posthoc_wilcoxon(
                mturk_df, val_col="value", group_col="system", sort=True, zero_method="zsplit"
            )
            print(test_stats)
            significance = defaultdict(list)
            for system in ["systemNoFcNoFs", "systemNoFc", "systemFcPost", "sota"]:
                for other_system in ["sota", "human"]:
                    p_value = test_stats.loc[system, other_system]
                    if 0 <= p_value <= 0.05:
                        significance[system].append(other_system[0])
                significance[system] = ",".join(significance[system])
            stats[f"{dataset}_significance"] = significance

    # Inter-annotator agreement across all fidelity judgements.
    print(cohen_kappa_score(first, second))
    with open(data_folder / f"{task}.json", "w") as f:
        json.dump(stats, f, indent=2)
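
A hypothetical invocation (the folder names below are illustrative, not taken from the repository):

score(
    data_folder="mturk/system_outputs",
    out_folder="mturk/batches",
    task="fidelity_annotations",
    score_folder="mturk/results",
)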