in scripts/batch_eval_KB_completion.py [0:0]
def _label_round_trips(model, obj_label):
    """Return True when the model's ids for ``obj_label`` decode back to it.

    ``model.get_id`` is asked for the ids of the label; the ids are looked up
    in ``model.vocab`` and joined with single spaces. A missing/empty id list,
    an empty reconstruction, or a reconstruction that differs from the label
    all count as a failed round trip.
    """
    label_ids = model.get_id(obj_label)
    if not label_ids:
        return False
    rebuilt = " ".join(model.vocab[idx] for idx in label_ids).strip()
    return bool(rebuilt) and rebuilt == obj_label


def _is_negative_evidence(judgments):
    """Return True when strictly more judgments are not "yes" than are "yes"."""
    votes = [entry["judgment"] == "yes" for entry in judgments]
    yes_votes = sum(votes)
    return len(votes) - yes_votes > yes_votes


def filter_samples(model, samples, vocab_subset, max_sentence_length, template):
    """Drop samples that fail the checks below and report why.

    A sample is excluded (and counted) when any of the following holds:

    * it lacks an ``"obj_label"`` or ``"sub_label"`` key;
    * no template is given and the whitespace-token count of its joined
      ``"masked_sentences"`` exceeds ``max_sentence_length``;
    * ``vocab_subset`` is non-empty and some space-separated token of
      ``obj_label`` is not in it;
    * the label does not round-trip through ``model.get_id`` /
      ``model.vocab`` (only checked when neither of the two previous
      reasons applies, matching the original message precedence).

    Samples carrying ``"judgments"`` (only for Google-RE) with more non-"yes"
    than "yes" votes are skipped silently: they are neither returned nor
    counted as excluded.

    Each excluded sample is counted exactly once, even when both the length
    and the vocab-subset checks reject it; both reasons are still logged.

    Args:
        model: object exposing ``get_id(label)`` and an indexable ``vocab``.
        samples: iterable of dict samples.
        vocab_subset: optional container of allowed label tokens; falsy
            disables the check.
        max_sentence_length: maximum allowed number of whitespace tokens.
        template: when truthy, the sentence-length check is skipped and
            ``"masked_sentences"`` is not read.

    Returns:
        A ``(new_samples, msg)`` tuple: the kept samples in input order and a
        log string ending with the ``samples exluded`` total.
    """
    log_lines = []
    kept = []
    excluded_count = 0

    for sample in samples:
        if "obj_label" not in sample or "sub_label" not in sample:
            log_lines.append(
                "\tEXCLUDED since 'obj_label' not sample or 'sub_label' not in sample: {}\n".format(
                    sample
                )
            )
            excluded_count += 1
            continue

        obj_label = sample["obj_label"]
        reasons = []

        # The raw sentences are only length-checked when no template will
        # replace them.
        if not template:
            masked_sentences = sample["masked_sentences"]
            if len(" ".join(masked_sentences).split()) > max_sentence_length:
                reasons.append(
                    "\tEXCLUDED for exeeding max sentence length: {}\n".format(
                        masked_sentences
                    )
                )

        # MAKE SURE THAT obj_label IS IN VOCABULARIES
        if vocab_subset and any(
            token not in vocab_subset for token in obj_label.split(" ")
        ):
            reasons.append(
                "\tEXCLUDED object label {} not in vocab subset\n".format(obj_label)
            )

        # The model-vocabulary reason is only reported when nothing else
        # already rejected the sample.
        if not reasons and not _label_round_trips(model, obj_label):
            reasons.append(
                "\tEXCLUDED object label {} not in model vocabulary\n".format(
                    obj_label
                )
            )

        if reasons:
            log_lines.extend(reasons)
            # One increment per sample, regardless of how many reasons fired.
            excluded_count += 1
            continue

        # only for Google-RE: SKIP NEGATIVE EVIDENCE (not counted as excluded)
        if "judgments" in sample and _is_negative_evidence(sample["judgments"]):
            continue

        kept.append(sample)

    log_lines.append("samples exluded : {}\n".format(excluded_count))
    return kept, "".join(log_lines)