in datasets.py
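# module-level imports this method relies on: bisect, json, logging, os,
# and List from typing; Entity, InputExample and CorefDocument are the
# project's own data structures used below.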
def load_data_single_split(self, split: str, seed: int = None) -> List[InputExample]:
    """
    Load data for a single split (train, dev, or test).
    """
    file_path = os.path.join(self.data_dir(), f'{split}.json')
    self.documents = {}
    examples = []
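    # pick the chunking parameters (chunk size and overlap between consecutive
    # chunks); evaluation can use values different from the training ones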
    if self.is_eval:
        chunk_size = self.data_args.chunk_size_eval
        chunk_overlap = self.data_args.chunk_overlap_eval
    else:
        chunk_size = self.data_args.chunk_size
        chunk_overlap = self.data_args.chunk_overlap
    with open(file_path, 'r') as f:
        for i, line in enumerate(f):
            raw_document = json.loads(line)
            document_id = f'{split}-{i}'
            tokens_data = raw_document['preprocessing']['segments']['tokens']
            tokens = [x['extent'] for x in tokens_data]
            tokens_start_char = [x['start'] for x in tokens_data]
            tokens_end_char = [x['end'] for x in tokens_data]
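            # character offsets of each token, used below to map mention
            # character spans onto token indices (bisect assumes they are
            # in increasing order)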
            groups = []
            for raw_group in raw_document['annotations']['coreference']['groups']:
                mentions = []
                for raw_mention in raw_group['mentions']:
                    # find start and end tokens via binary search over character
                    # offsets (end index is exclusive)
                    start = bisect.bisect_left(tokens_start_char, raw_mention['start'])
                    end = bisect.bisect_left(tokens_end_char, raw_mention['end']) + 1
                    mentions.append(Entity(start=start, end=end))
                groups.append(mentions)
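            # each group is now a coreference cluster represented as a list of
            # token-level Entity spans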
            # create chunks
            chunks = []
            pos = 0
            chunk_id = 0
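            # slide a window of chunk_size tokens over the document; consecutive
            # windows overlap by chunk_overlap tokens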
            while pos < len(tokens):
                # create a chunk starting at this position
                chunk_tokens = tokens[pos:pos + chunk_size]
                chunk_groups = []
                for group in groups:
                    # keep only mentions fully contained in this chunk, shifted
                    # to chunk-local token indices; drop clusters that end up
                    # with fewer than two mentions
                    mentions = [
                        Entity(start=mention.start - pos, end=mention.end - pos, type=mention.type)
                        for mention in group
                        if mention.start >= pos and mention.end <= pos + chunk_size
                    ]
                    if len(mentions) >= 2:
                        chunk_groups.append(mentions)
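                # offset is the index of the chunk's first token in the full
                # document, so chunk-local spans can be mapped back later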
                example = InputExample(
                    id=f'{split}-{i}-{chunk_id}',
                    tokens=chunk_tokens,
                    offset=pos,
                    groups=chunk_groups,
                    document_id=document_id,
                    chunk_id=chunk_id,
                )
                examples.append(example)
                chunks.append(example)
                if pos + chunk_size >= len(tokens):
                    # this chunk reaches the end of the document, so we can stop
                    break
                pos += chunk_size - chunk_overlap
                chunk_id += 1
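            # keep the full document together with its chunks; chunk_centers holds
            # the document-level index of each chunk's middle token (presumably used
            # to decide which chunk to trust for a given document position)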
            self.documents[document_id] = CorefDocument(
                id=document_id,
                tokens=tokens,
                groups=groups,
                chunks=chunks,
                chunk_centers=[example.offset + len(example.tokens) // 2 for example in chunks],
            )
    logging.info(f"Loaded {len(self.documents)} documents split into {len(examples)} chunks"
                 f" for split {split} of {self.name}")
    return examples