in src/mlm/loaders.py [0:0]
def from_dict(cls, obj_dict: Dict[str, Dict[str, Any]], max_utts: Optional[int] = None, vocab: Optional[nlp.Vocab] = None, tokenizer = None):
    """Build per-utterance n-best hypotheses from an already-parsed dictionary.

    The expected layout follows the JSON format of Shin et al.: every
    utterance ID maps to a dictionary whose ``hyp_<k>`` entries (``k``
    counted from 1) each carry a ``'text'`` and a ``'score'``. Entries whose
    key does not start with ``hyp_`` (for example ``'ref'``) are ignored.

    Args:
        obj_dict: Mapping from utterance ID to that utterance's entries.
        max_utts: If not None, only the first ``max_utts`` utterances, taken
            in sorted utterance-ID order, are loaded.
        vocab: Passed through unchanged to ``Hypotheses``.
        tokenizer: Passed through unchanged to ``Hypotheses``.

    Returns:
        A new ``cls`` instance that maps each utterance ID to a
        ``Hypotheses`` built from the whitespace-stripped texts and the
        scores, positioned by hypothesis number.

    Raises:
        IndexError: If a hypothesis number lies outside ``1..N``, where
            ``N`` is the number of ``hyp_`` entries of that utterance.
    """
    # Just a dictionary for now
    # but equipped with this factory method
    preds = cls()
    item_list = sorted(obj_dict.items())
    if max_utts is not None:
        item_list = item_list[:max_utts]
    for utt_id, hyps_dict in item_list:
        # hyps_dict key-values look like:
        #   'ref': "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel"
        #   'hyp_100' {'score': -10.107752799987793, 'text': ' mister quillter is the apostle of the middle classes and weir glad to welcome his gospel'}
        hyp_items = [
            (hyp_id, hyp_data)
            for hyp_id, hyp_data in hyps_dict.items()
            if hyp_id.startswith("hyp_")
        ]
        num_hyps = len(hyp_items)
        # Filled by position because the dictionary order need not follow
        # the hypothesis numbering.
        sents = [None] * num_hyps
        scores = [None] * num_hyps
        for hyp_id, hyp_data in hyp_items:
            # 'hyp_100' --> 99
            idx = int(hyp_id.split('_')[1]) - 1
            # A non-positive hypothesis number would yield a negative index,
            # which Python resolves from the end of the list and would
            # silently misplace or overwrite an entry. Reject it with the
            # same exception type a too-large number already produces.
            if not 0 <= idx < num_hyps:
                raise IndexError(
                    f"hypothesis key {hyp_id!r} of utterance {utt_id!r} is "
                    f"outside the expected range hyp_1..hyp_{num_hyps}"
                )
            sents[idx] = hyp_data['text'].strip()
            scores[idx] = hyp_data['score']
        preds[utt_id] = Hypotheses(sents, scores, vocab, tokenizer)
    return preds