in data_measurements/tokenize.py [0:0]
def do_tokenization(self):
    """
    Tokenizes a Hugging Face dataset in the self.feature field.

    Builds a tokenizer from ``self.cvec`` (a CountVectorizer-like object)
    and maps it over ``self.text_dset`` in batches, lowercasing the text
    first when ``self.lowercase`` is set.

    :return: Hugging Face Dataset with tokenized text in self.tok_feature.
    """
    sent_tokenizer = self.cvec.build_tokenizer()
    # Resolve the casing choice once, outside the per-batch closure, instead
    # of duplicating the whole tokenizing comprehension in two branches.
    lowercase = self.lowercase

    def tokenize_batch(examples):
        texts = examples[self.feature]
        if lowercase:
            texts = (text.lower() for text in texts)
        # tuple() wrapping preserved from the original — presumably so the
        # token sequences are hashable/immutable downstream; confirm.
        return {
            self.tok_feature: [tuple(sent_tokenizer(text)) for text in texts]
        }

    tokenized_dset = self.text_dset.map(
        tokenize_batch,
        batched=True
    )
    logs.info("Tokenized the dataset.")
    return tokenized_dset