# paper/experiments/mturk/prepare_mturk.py
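# Imports inferred from the calls below (a best-guess sketch; the original module may
# import or alias these differently). `add_closest_score`, `compute_stat_sig`, and
# `dataset_fields` are assumed to be defined elsewhere in this file.
import json
import math
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import scikit_posthocs as sp
from mlxtend.evaluate import mcnemar, mcnemar_table
from sklearn.metrics import classification_report, cohen_kappa_score
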
def score(data_folder, out_folder, task, score_folder):
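    """Aggregate MTurk annotation results for one task and write summary stats.

    For the fidelity task, crowd "accurate" judgments are matched back to system
    outputs and compared against the automatic SFC/SER correctness flags; for the
    fluency task, per-system ratings are averaged and tested for significance.
    All stats are dumped to `<data_folder>/<task>.json`.
    """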
    data_folder = Path(data_folder)
    out_folder = Path(out_folder)
    datasets = ["ldc", "viggo", "webnlg", "e2e"]
    systems = ["systemNoFcNoFs", "systemNoFc", "systemFcPost", "sota", "human"]
    stats = {}
    first = []
    second = []
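    # Each dataset is processed independently; per-dataset results accumulate in `stats`,
    # while `first`/`second` collect paired judgments across datasets for Cohen's kappa.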
    for dataset in datasets:
        print(f"processing {dataset}")
        systems_data = {}
        for system in systems:
            systems_data[system] = json.load(open(data_folder / dataset / f"{system}.json"))
        print(f"dataset: {dataset}")
        all_scored = defaultdict(list)
        score_folder = Path(score_folder)
        score_file = score_folder / task / f"{dataset}.csv"
        total_texts = 5
        try:
            df = pd.read_csv(score_file)
        except FileNotFoundError:
            print(f"{score_file} not available.")
            continue
        scores = df.to_dict(orient="records")
        try:
            input_df = pd.read_csv(out_folder / task / f"mturk_{dataset}.csv")
        except FileNotFoundError:
            print(f"ignoring {dataset}")
            continue
        input_data = input_df.to_dict(orient="records")
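        # Two MTurk tasks are handled below: fidelity (binary "accurate" judgments per
        # text) and fluency (numeric ratings per text).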
if task == "fidelity_annotations":
for item in scores:
for i in range(total_texts):
text = item[f"Input.text{i + 1}"]
index = item["Input.index"]
accurate = f"Answer.text{i + 1}_accurate.text{i + 1}_accurate"
key = f"{index}_{text}"
try:
all_scored[key].append({"accurate": item[accurate]})
except:
import ipdb
ipdb.set_trace()
            fidelity_scores = []
            all_ser_scores = []
            all_sfc_scores = []
            true_scores_sfc = []
            true_scores_ser = []
            sfc_data = defaultdict(list)
            ser_data = defaultdict(list)
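            # Keep the first two judgments per item for inter-annotator agreement
            # (Cohen's kappa is printed once at the end, across all datasets).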
            for x in all_scored:
                if len(all_scored[x]) >= 2:
                    first.append(all_scored[x][0]["accurate"])
                    second.append(all_scored[x][1]["accurate"])
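            # Match each scored text back to the system that produced it and average
            # the binary "accurate" judgments it received.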
            for item in input_data:
                for i in range(total_texts):
                    text_i = item[f"text{i + 1}"]
                    system = item[f"system{i + 1}"]
                    index = item["index"]
                    key = f"{index}_{text_i}"
                    if key in all_scored:
                        obj = systems_data[system][index]
                        mean_score = np.mean([int(x["accurate"]) for x in all_scored[key]])
                        # these have to be reconciled if disagreeing: take ceil or floor
                        sample_type = "A_D" if obj["sfc_correct"] else "E_D"
                        if dataset != "ldc":
                            sample_type += "," + ("A_H" if obj["ser_correct"] else "E_H")
                        fidelity_scores.append(
                            {
                                "ind": index,
                                "system": system,
                                "value": math.ceil(mean_score),
                                "sample_type": sample_type,
                                "text": text_i,
                                "data": item["data"],
                                "original_text": obj["original_" + dataset_fields[dataset]["text"].strip()],
                                "sfc_correct": obj["sfc_correct"],
                                "ser_correct": obj.get("ser_correct", ""),
                            }
                        )
                        # Reconciled cases are those where the expert annotators disagreed.
                        # They discussed these and reached the following agreements.
                        reconciled = {
                            "Example 1": 0,
                            "Example 2": 1,
                        }
                        if text_i in reconciled:
                            true_scores_sfc.append(reconciled[text_i])
                            true_scores_ser.append(reconciled[text_i])
                        else:
                            add_closest_score(mean_score, true_scores_sfc, obj["sfc_correct"])
                            if dataset != "ldc":
                                add_closest_score(mean_score, true_scores_ser, obj["ser_correct"])
                        all_sfc_scores.append(obj["sfc_correct"])
                        sfc_data[system].append(obj["sfc_correct"])
                        if dataset != "ldc":
                            all_ser_scores.append(obj["ser_correct"])
                            ser_data[system].append(obj["ser_correct"])
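            # Evaluate the automatic SFC/SER correctness flags against the reconciled
            # human fidelity labels.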
if dataset != "ldc":
c_report = classification_report(true_scores_ser, all_ser_scores)
stats[f"{dataset}_ser_report"] = classification_report(
true_scores_ser, all_ser_scores, output_dict=True
)
print("SER")
print(c_report)
c_report = classification_report(true_scores_sfc, all_sfc_scores)
stats[f"{dataset}_sfc_report"] = classification_report(true_scores_sfc, all_sfc_scores, output_dict=True)
print("SFC")
print(c_report)
            mturk_df = pd.DataFrame(fidelity_scores)
            agg_stats = mturk_df.groupby(["system"]).agg(["mean", "count"])
            print(agg_stats)
            stats[f"{dataset}_score"] = agg_stats.to_dict()[("value", "mean")]
            stats[f"{dataset}_count"] = agg_stats.to_dict()[("value", "count")]
            print(mturk_df.groupby(["system", "sample_type"]).agg(["mean", "count"]))
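            # McNemar's test (mlxtend) on the paired SFC vs. SER decisions relative to
            # the human labels.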
if dataset != "ldc":
tb_b = mcnemar_table(
y_target=np.array(true_scores_sfc),
y_model1=np.array(all_sfc_scores),
y_model2=np.array(all_ser_scores),
)
print(tb_b)
chi2, p = mcnemar(ary=tb_b, corrected=True)
print(f"mcnemar chi2: {chi2}, p-value {p}")
            for measure in ["sfc_correct", "ser_correct"]:
                if measure == "ser_correct" and dataset == "ldc":
                    continue
                stats[f"{dataset}_significance_{measure}"] = compute_stat_sig(systems_data, system, measure)
elif task == "fluency":
            for item in scores:
                for i in range(total_texts):
                    field = f"Input.text{i + 1}"
                    answer_field = f"Answer.fluency{i + 1}"
                    all_scored[item[field]].append(item[answer_field])
            for x in all_scored:
                all_scored[x] = {"average": np.mean(all_scored[x]), "count": len(all_scored[x])}
            fluency_scores = defaultdict(list)
            for item in input_data:
                for i in range(total_texts):
                    if item[f"text{i + 1}"] in all_scored:
                        avg_score = all_scored[item[f"text{i + 1}"]]["average"]
                        system = item[f"system{i + 1}"]
                        fluency_scores[system].append(avg_score)
            fluency_df_values = []
            for system in fluency_scores:
                fluency_df_values.extend(
                    [{"system": system, "value": v} for v in fluency_scores[system]]
                )
            mturk_df = pd.DataFrame(fluency_df_values)
            agg_stats = mturk_df.groupby(["system"]).agg(["mean", "count", "median"])
            print(agg_stats)
            stats[dataset] = agg_stats.to_dict()[("value", "mean")]
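            # Pairwise post-hoc Wilcoxon tests between systems (scikit-posthocs); a system
            # is marked significant against "sota"/"human" at p <= 0.05.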
            test_stats = sp.posthoc_wilcoxon(
                mturk_df, val_col="value", group_col="system", sort=True, zero_method="zsplit"
            )
            print(test_stats)
            significance = defaultdict(list)
            for system in ["systemNoFcNoFs", "systemNoFc", "systemFcPost", "sota"]:
                for other_system in ["sota", "human"]:
                    p_value = test_stats.loc[system, other_system]
                    if 0 <= p_value <= 0.05:
                        significance[system].append(other_system[0])
                significance[system] = ",".join(significance[system])
            stats[f"{dataset}_significance"] = significance
    print(cohen_kappa_score(first, second))
    with open(data_folder / f"{task}.json", "w") as f:
        json.dump(stats, f, indent=2)