in datasets.py [0:0]
def load_data_single_split(self, split: str, seed: Optional[int] = None) -> List[InputExample]:
    """
    Load data for a single split (train, dev, or test).

    Reads ``{split}.json`` from ``self.data_dir()``, one JSON document per line.
    Each document holds parallel lists ``sentences``, ``ner``, and ``relations``;
    entity spans are document-level token offsets, so a running ``offset`` converts
    them to sentence-level indices. A sentence yields no example when it has no
    entities, or when any relation endpoint matches more than one entity span
    (ambiguous head/tail).

    :param split: which split to load ('train', 'dev', or 'test').
    :param seed: unused in this method; kept for interface compatibility with callers.
    :return: list of InputExample objects, one per usable sentence.
    :raises ValueError: if a relation endpoint matches no entity (malformed data).
    """
    file_path = os.path.join(self.data_dir(), f'{split}.json')
    examples = []
    num_documents = 0
    num_entities = 0
    num_relations = 0

    with open(file_path, 'r') as f:
        for j, l in enumerate(f):
            document = json.loads(l)
            num_documents += 1
            offset = 0  # document-level token index of the current sentence's first token

            for i, tokens in enumerate(document['sentences']):
                num_entities += len(document['ner'][i])
                num_relations += len(document['relations'][i])

                if len(document['ner'][i]) > 0:
                    # `end - offset + 1`: convert inclusive document-level spans
                    # to exclusive sentence-level spans.
                    entities = [
                        Entity(type=self.entity_types[entity_type], start=start - offset, end=end - offset + 1)
                        for start, end, entity_type in document['ner'][i]
                    ]

                    relations = []
                    skip = False
                    for start1, end1, start2, end2, relation_type in document['relations'][i]:
                        # Compute each endpoint's match list exactly once (the original
                        # evaluated these comprehensions twice per endpoint).
                        head_matches = [e for e in entities
                                        if e.start == start1 - offset and e.end == end1 - offset + 1]
                        tail_matches = [e for e in entities
                                        if e.start == start2 - offset and e.end == end2 - offset + 1]
                        if len(head_matches) > 1 or len(tail_matches) > 1:
                            # Ambiguous endpoint: two entities share the same span -> drop sentence.
                            skip = True
                            break
                        # Single-element unpack raises ValueError when an endpoint has
                        # no matching entity, same as the original behavior.
                        [head] = head_matches
                        [tail] = tail_matches
                        relations.append(
                            Relation(type=self.relation_types[relation_type], head=head, tail=tail)
                        )

                    if not skip:
                        example = InputExample(
                            id=f'{split}-{j}-{i}',
                            tokens=tokens,
                            entities=entities,
                            relations=relations,
                        )
                        examples.append(example)

                offset += len(tokens)

    logging.info(f'Constructed {len(examples)} examples (from {num_documents} documents) for {self.name} ({split})')
    return examples