in parsers/Hatespeech/Hatespeech_Fasttext_Preprocess.py [0:0]
import re


def _convert_examples_to_features(examples, seq_length):
    """Loads a data file into a list of `HatespeechInputFeatures`."""
    # Holds one entry per example: a (target, highlight, features) triple,
    # where features is a list of per-sentence feature objects
    dataset_features = []
    for example in examples:
        # Get the example's unique ID and its highlight annotation
        example_unique_id = example.unique_id
        example_highlight = example.highlight
        # Get the target label associated with the document
        example_target = example.target
        # Get the sentences (text_b is always None for this task)
        sentences = example.text_a
        # Remove links from the tweet
        sentences = [re.sub(
            r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
            " ", s) for s in sentences]
        # Instantiate a list of features, one per sentence
        example_features = []
        # The tokenized document: one list of lowercased tokens per sentence,
        # with the <pos>/<neg> annotation tags still inline
        parsed_example = []
        for sentence in sentences:
            tokens = sentence.split()
            # Append the parsed sentence to the parsed example (document)
            parsed_example.append([t.lower() for t in tokens])
        # We now prepare the data for BERT: strip the <pos>/<neg> tags and
        # attach a 0/1 annotation to each remaining token
        annotate_as_neg = False  # True while inside a <neg>...</neg> span
        annotate_as_pos = False  # True while inside a <pos>...</pos> span
        # Note that the flags persist across sentences, so a span opened in one
        # sentence keeps annotating tokens until its closing tag is seen.
        # Reuse `sentences` to collect (tokens, annotations, input_type_ids) triples
        sentences = []
        for sentence in parsed_example:
            # Skip empty sentences
            if len(sentence) == 0 or sentence[0] == '':
                continue
            input_type_ids = []
            annotations = []
            tokens = []
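            # Worked example (illustrative): ['you', 'are', '<neg>', 'awful', '</neg>']
            # yields tokens ['you', 'are', 'awful'] and annotations [0, 0, 1]:
            # tag tokens are dropped, tokens inside a tagged span are marked 1,
            # and all other tokens are marked 0.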
            for token in sentence:
                if token == '<neg>':
                    assert not annotate_as_pos
                    annotate_as_neg = True
                elif token == '<pos>':
                    assert not annotate_as_neg
                    annotate_as_pos = True
                elif token == '</neg>':
                    assert annotate_as_neg
                    assert not annotate_as_pos
                    annotate_as_neg = False
                elif token == '</pos>':
                    assert annotate_as_pos, sentence
                    assert not annotate_as_neg
                    annotate_as_pos = False
                else:
                    # Regular token: mark it 1 inside a tagged span, 0 otherwise
                    if annotate_as_neg or annotate_as_pos:
                        annotations.append(1)
                    else:
                        annotations.append(0)
                    tokens.append(token)
            assert len(tokens) != 0, example.text_a
            sentences.append((tokens, annotations, input_type_ids))
        # Now it is time to store things; skip examples with no usable sentences
        if len(sentences) == 0:
            continue
        # Each sentence also gets its own unique ID, counting up from the example ID
        sentence_unique_id = example_unique_id
        for tokens, annotations, input_type_ids in sentences:
            example_features.append(
                HatespeechInputFeatures(
                    unique_example_id=example_unique_id,
                    unique_sentence_id=sentence_unique_id,
                    tokens=tokens,
                    annotations=annotations,
                    # input_ids and input_mask are left unset at this stage
                    input_ids=None,
                    input_mask=None,
                    input_type_ids=input_type_ids)
            )
            sentence_unique_id += 1
        dataset_features.append((example_target, example_highlight, example_features))
    return dataset_features
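

# Usage sketch, not part of the original file. It assumes that
# `HatespeechInputFeatures` is defined elsewhere in this module (a
# hypothetical stand-in is created below only if it is missing) and that
# upstream examples expose the fields unique_id, highlight, target and
# text_a, which SimpleNamespace mimics here.
if __name__ == "__main__":
    from types import SimpleNamespace

    if "HatespeechInputFeatures" not in globals():
        from dataclasses import dataclass
        from typing import List, Optional

        @dataclass
        class HatespeechInputFeatures:  # hypothetical stand-in, demo only
            unique_example_id: int
            unique_sentence_id: int
            tokens: List[str]
            annotations: List[int]
            input_ids: Optional[List[int]]
            input_mask: Optional[List[int]]
            input_type_ids: List[int]

    demo_example = SimpleNamespace(
        unique_id=0,
        highlight=None,
        target=1,
        text_a=["you are <neg> awful </neg> check https://example.com"],
    )
    for target, highlight, features in _convert_examples_to_features(
            [demo_example], seq_length=128):
        for f in features:
            # Expected output: ['you', 'are', 'awful', 'check'] [0, 0, 1, 0]
            print(f.tokens, f.annotations)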