in tensorflow_datasets/text_simplification/wiki_auto/wiki_auto.py [0:0]
def _generate_examples(self, filepaths, split):
    """Yields `(key, example)` pairs for the selected builder config.

    The layout of the yielded example depends on `self.builder_config.name`:

    * `'manual'`: one example per line of the tab-separated file at
      `filepaths[split]`; columns are mapped positionally onto the field names.
    * `'auto_acl'`, `'auto_full_no_split'`, `'auto_full_with_split'`: one
      example per pair of lines read in lockstep from `filepaths['normal']`
      and `filepaths['simple']`.
    * any other config: one example per top-level entry of the JSON object
      stored at `filepaths[split]`.

    Args:
      filepaths: Mapping from a split name (or `'normal'` / `'simple'`) to the
        path of the file to read.
      split: Key into `filepaths`; ignored by the three `auto_*` configs.

    Yields:
      Tuples `(id_, example)` where `id_` is the running index of the example
      within the file(s) being read and `example` is a dict.
    """
    config_name = self.builder_config.name

    if config_name == 'manual':
        keys = [
            'alignment_label', 'simple_sentence_id', 'normal_sentence_id',
            'simple_sentence', 'normal_sentence', 'GLEU-score'
        ]
        with tf.io.gfile.GFile(filepaths[split]) as f:
            for id_, line in enumerate(f):
                values = line.strip().split('\t')
                # zip() stops at the shorter sequence, so a row with fewer
                # columns than `keys` yields a dict without the trailing keys.
                yield id_, dict(zip(keys, values))

    elif config_name in ('auto_acl', 'auto_full_no_split',
                         'auto_full_with_split'):
        with tf.io.gfile.GFile(filepaths['normal']) as fi, \
                tf.io.gfile.GFile(filepaths['simple']) as fo:
            # Lines are passed through exactly as read (no stripping here).
            for id_, (norm_se, simp_se) in enumerate(zip(fi, fo)):
                yield id_, {
                    'normal_sentence': norm_se,
                    'simple_sentence': simp_se,
                }

    else:
        # Open the file in a context manager so the handle is closed once the
        # JSON payload has been parsed (previously it was never closed).
        with tf.io.gfile.GFile(filepaths[split]) as f:
            dataset_dict = json.load(f)

        for id_, (eid, example_dict) in enumerate(dataset_dict.items()):
            normal = example_dict['normal']
            simple = example_dict['simple']
            normal_content = normal['content']
            simple_content = simple['content']
            # Alignments are optional; each entry is unpacked as a
            # (simple_id, normal_id) pair.
            paragraph_pairs = example_dict.get('paragraph_alignment', [])
            sentence_pairs = example_dict.get('sentence_alignment', [])

            res = {
                'example_id': eid,
                'normal': {
                    'normal_article_id': normal['id'],
                    'normal_article_title': normal['title'],
                    'normal_article_url': normal['url'],
                    'normal_article_content': {
                        'normal_sentence_id': list(normal_content.keys()),
                        'normal_sentence': list(normal_content.values()),
                    },
                },
                'simple': {
                    'simple_article_id': simple['id'],
                    'simple_article_title': simple['title'],
                    'simple_article_url': simple['url'],
                    'simple_article_content': {
                        'simple_sentence_id': list(simple_content.keys()),
                        'simple_sentence': list(simple_content.values()),
                    },
                },
                'paragraph_alignment': {
                    'normal_paragraph_id': [
                        norm_id for _simp_id, norm_id in paragraph_pairs
                    ],
                    'simple_paragraph_id': [
                        simp_id for simp_id, _norm_id in paragraph_pairs
                    ],
                },
                'sentence_alignment': {
                    'normal_sentence_id': [
                        norm_id for _simp_id, norm_id in sentence_pairs
                    ],
                    'simple_sentence_id': [
                        simp_id for simp_id, _norm_id in sentence_pairs
                    ],
                },
            }
            yield id_, res