in datasets.py
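# module-level imports this method relies on: bisect, json, logging, os,
# and List from typing; Entity, InputExample and CorefDocument are the
# project's own data structures used below.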
def load_data_single_split(self, split: str, seed: int = None) -> List[InputExample]:
    """
    Load data for a single split (train, dev, or test).
    """
    file_path = os.path.join(self.data_dir(), f'{split}.json')
    self.documents = {}
    examples = []
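    # pick the chunking parameters (chunk size and overlap between consecutive
    # chunks); evaluation can use values different from the training ones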
    if self.is_eval:
        chunk_size = self.data_args.chunk_size_eval
        chunk_overlap = self.data_args.chunk_overlap_eval
    else:
        chunk_size = self.data_args.chunk_size
        chunk_overlap = self.data_args.chunk_overlap
    with open(file_path, 'r') as f:
        for i, line in enumerate(f):
            raw_document = json.loads(line)
            document_id = f'{split}-{i}'
            tokens_data = raw_document['preprocessing']['segments']['tokens']
            tokens = [x['extent'] for x in tokens_data]
            tokens_start_char = [x['start'] for x in tokens_data]
            tokens_end_char = [x['end'] for x in tokens_data]
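            # character offsets of each token, used below to map mention
            # character spans onto token indices (bisect assumes they are
            # in increasing order)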
            groups = []
            for raw_group in raw_document['annotations']['coreference']['groups']:
                mentions = []
                for raw_mention in raw_group['mentions']:
                    # find start and end tokens via binary search over character
                    # offsets (end index is exclusive)
                    start = bisect.bisect_left(tokens_start_char, raw_mention['start'])
                    end = bisect.bisect_left(tokens_end_char, raw_mention['end']) + 1
                    mentions.append(Entity(start=start, end=end))
                groups.append(mentions)
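            # each group is now a coreference cluster represented as a list of
            # token-level Entity spans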
            # create chunks
            chunks = []
            pos = 0
            chunk_id = 0
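            # slide a window of chunk_size tokens over the document; consecutive
            # windows overlap by chunk_overlap tokens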
            while pos < len(tokens):
                # create a chunk starting at this position
                chunk_tokens = tokens[pos:pos + chunk_size]
                chunk_groups = []
                for group in groups:
                    # keep only mentions fully contained in this chunk, shifted
                    # to chunk-local token indices; drop clusters that end up
                    # with fewer than two mentions
                    mentions = [
                        Entity(start=mention.start - pos, end=mention.end - pos, type=mention.type)
                        for mention in group
                        if mention.start >= pos and mention.end <= pos + chunk_size
                    ]
                    if len(mentions) >= 2:
                        chunk_groups.append(mentions)
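                # offset is the index of the chunk's first token in the full
                # document, so chunk-local spans can be mapped back later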
                example = InputExample(
                    id=f'{split}-{i}-{chunk_id}',
                    tokens=chunk_tokens,
                    offset=pos,
                    groups=chunk_groups,
                    document_id=document_id,
                    chunk_id=chunk_id,
                )
                examples.append(example)
                chunks.append(example)
                if pos + chunk_size >= len(tokens):
                    # this chunk reaches the end of the document, so we can stop
                    break
                pos += chunk_size - chunk_overlap
                chunk_id += 1
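            # keep the full document together with its chunks; chunk_centers holds
            # the document-level index of each chunk's middle token (presumably used
            # to decide which chunk to trust for a given document position)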
            self.documents[document_id] = CorefDocument(
                id=document_id,
                tokens=tokens,
                groups=groups,
                chunks=chunks,
                chunk_centers=[example.offset + len(example.tokens) // 2 for example in chunks],
            )
    logging.info(f"Loaded {len(self.documents)} documents split into {len(examples)} chunks"
                 f" for split {split} of {self.name}")
    return examples