def _convert_examples_to_features()

in parsers/Hatespeech/Hatespeech_Preprocess_Ngrams.py [0:0]


import json
import re
from pathlib import Path

import torch


def _convert_examples_to_features(data_folder, examples, pipeline):
    """Converts raw examples into (n-gram features, highlights, target) tuples."""

    dataset_features = []

    with open(Path(data_folder, 'word_to_idx.json'), 'r', encoding='utf-8') as f:
        word_to_idx = json.load(f)
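    # Assumed layout of word_to_idx.json (inferred from its use below, not
    # stated in the original source): a mapping from character n-grams to
    # feature indices, e.g. {"h": 0, "ha": 1, "hat": 2, ...}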

    no_ngrams = len(word_to_idx)

    for i, example in enumerate(examples, start=1):

        if i % 100 == 0:
            print(f'Parsing sample no {i}', end='\r')

        ngram_feats = torch.zeros(no_ngrams + 1)  # Extra slot for the tweet length
        targets = torch.zeros(1)

        tweet = ' '.join(example['sample'])

        # Remove links and highlights from the tweet
        tweet = re.sub(
            r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
            " ", tweet)

        # Strip highlight markers and normalise leftover whitespace
        tweet = re.sub(r'</?(?:POS|NEG)>', "", tweet)
        tweet = re.sub(r'\n', "", tweet)
        tweet = re.sub(r' {2,}', " ", tweet)

        doc = pipeline(tweet)

        tweet_length = len(doc)

        # Remove stop words (following Waseem and Hovy)
        tokens = [token.text.strip() for token in doc if not token.is_stop]

        # Collect character n-grams (n = 1 to 4) from every token,
        # lower-cased and de-duplicated
        grams = set()
        for n in range(1, 5):
            grams.update((w[i:i + n]).lower() for w in tokens for i in range(len(w) - n + 1))

        # Look up vocabulary indices, skipping n-grams that are not in the
        # vocabulary (avoids a KeyError when parsing unseen data)
        idxs = list({word_to_idx[gram] for gram in grams if gram in word_to_idx})
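        # Illustration (assumed example, not from the original source): for a
        # single token 'hate', the grams collected above are {'h', 'a', 't',
        # 'e'} (n=1), {'ha', 'at', 'te'} (n=2), {'hat', 'ate'} (n=3) and
        # {'hate'} (n=4); idxs then holds their vocabulary indices.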

        ngram_feats[idxs] = 1  # Multi-hot encoding over the n-gram vocabulary
        ngram_feats[-1] = tweet_length  # Last slot stores the tweet length
        targets[0] = example['target']

        dataset_features.append((ngram_feats, example['highlighted'], targets))
    return dataset_features
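
For context, a minimal usage sketch. The spaCy model name, the data folder
path and the example dict below are assumptions inferred from the fields the
function reads ('sample', 'highlighted', 'target'); they are not part of the
original module.

import spacy

nlp = spacy.load('en_core_web_sm')  # assumed spaCy pipeline

examples = [{
    'sample': ['you', 'are', 'all', 'wonderful'],  # pre-tokenised tweet
    'highlighted': [],                             # highlight info, passed through
    'target': 0,                                   # class label
}]

# 'data' is assumed to be a folder containing word_to_idx.json
features = _convert_examples_to_features('data', examples, nlp)
ngram_feats, highlighted, targets = features[0]
print(ngram_feats.shape, targets)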