def _convert_examples_to_features()

in parsers/Hatespeech/Hatespeech_Fasttext_Preprocess.py [0:0]


import re  # needed for the URL-stripping regex below


def _convert_examples_to_features(examples, seq_length):
    """Converts `examples` into a list of (target, highlight, features)
    tuples, with one `HatespeechInputFeatures` per sentence."""

    # One entry per example: a (target, highlight, features) tuple, where
    # features is the list of per-sentence HatespeechInputFeatures
    dataset_features = []
    for example in examples:
        # get example unique ID
        example_unique_id = example.unique_id

        example_highlight = example.highlight

        # get target label associated to the document
        example_target = example.target

        # get the sentences
        sentences = example.text_a  # text_b always None

        # Remove links from the tweet
        sentences = [re.sub(
            r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
            " ", s) for s in sentences]

        # Instantiate a list of features, one per sentence
        example_features = []

        # The example's sentences, tokenized and lowercased
        # (the <pos>/<neg> annotation tags are still in the token stream)
        parsed_example = []

        for sentence in sentences:
            tokens = sentence.split()
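            # Annotation tags are whitespace-separated tokens, e.g.
            # "<neg> You idiot </neg>" splits to ["<neg>", "You", "idiot", "</neg>"]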
            # Append parsed sentence to the parsed example (document)
            parsed_example.append([t.lower() for t in tokens])

        # Strip the <pos>/<neg> tags and derive a binary annotation per token

        annotate_as_neg = False  # Needed to associate an annotation to each token
        annotate_as_pos = False  # Needed to associate an annotation to each token

        sentences = []
        for sentence in parsed_example:

            if len(sentence) == 0 or sentence[0] == '':
                continue

            input_type_ids = []
            annotations = []
            tokens = []

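            # For illustration: the tokens ["<neg>", "you", "idiot", "</neg>", "ok"]
            # yield tokens ["you", "idiot", "ok"] with annotations [1, 1, 0]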
            for token in sentence:

                if token == '<neg>':
                    assert not annotate_as_pos
                    annotate_as_neg = True
                elif token == '<pos>':
                    assert not annotate_as_neg
                    annotate_as_pos = True
                elif token == '</neg>':
                    assert annotate_as_neg
                    assert not annotate_as_pos
                    annotate_as_neg = False
                elif token == '</pos>':
                    assert annotate_as_pos, sentence
                    assert not annotate_as_neg
                    annotate_as_pos = False
                else:
                    if annotate_as_neg or annotate_as_pos:
                        annotations.append(1)
                    else:
                        annotations.append(0)
                    tokens.append(token)

            assert len(tokens) != 0, example.text_a
            sentences.append((tokens, annotations, input_type_ids))

        # Skip examples that yielded no non-empty sentences
        if len(sentences) == 0:
            continue

        # Per-sentence unique IDs are assigned sequentially, starting
        # from the example's own unique ID
        sentence_unique_id = example_unique_id

        for tokens, annotations, input_type_ids in sentences:

            example_features.append(
                HatespeechInputFeatures(
                    unique_example_id=example_unique_id,
                    unique_sentence_id=sentence_unique_id,
                    tokens=tokens,
                    annotations=annotations,
                    input_ids=None,
                    input_mask=None,
                    input_type_ids=input_type_ids)
            )
            sentence_unique_id += 1

        dataset_features.append((example_target, example_highlight, example_features))
    return dataset_features
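
A minimal usage sketch, for illustration only: SimpleNamespace stands in for the real example class (the function only reads the unique_id, highlight, target and text_a attributes), and HatespeechInputFeatures is assumed to be defined elsewhere in the module, as in the source file.

from types import SimpleNamespace

# Hypothetical input; text_a is a list of sentences, as the function expects
example = SimpleNamespace(
    unique_id=0,
    highlight=None,
    target=1,  # document-level label
    text_a=["<neg> you idiot </neg> go away", "visit www.example.com now"],
)

# seq_length is accepted but unused in the snippet shown above
features = _convert_examples_to_features([example], seq_length=128)
target, highlight, sentence_features = features[0]
for f in sentence_features:
    print(f.tokens, f.annotations)
# Expected (roughly):
# ['you', 'idiot', 'go', 'away'] [1, 1, 0, 0]
# ['visit', 'now'] [0, 0]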