parsers/Hatespeech/Hatespeech_Dataset_Builder.py [119:156]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                processed_example = {'target': torch.tensor([target], dtype=torch.long), 'tokens': [],
                                     'tokens_embeddings': None, 'sentence_embeddings': None,
                                     'tokens_annotations': None}

                # Needed to compute a single indexing across all tokens in the document
                starting_token_idx = 0

                # sentence_idx is used by the reference embeddings
                for sentence_idx, sentence in enumerate(example):
                    unique_sentence_id = sentence['unique_sentence_id']
                    sentence_example_id = sentence['example_id']

                    # The baseline will take the mean of the embeddings at runtime!
                    tokens_annotations = torch.from_numpy(np.array(sentence['tokens_annotations'])).long()  # CLS and SEP already removed
                    tokens_embeddings = torch.from_numpy(sentence['tokens_embeddings'])  # CLS and SEP already removed
                    sentence_embeddings = torch.from_numpy(sentence['sentence_embeddings'])  # CLS and SEP already removed
                    sentence_tokens = sentence['tokens']  # CLS and SEP already removed

                    # Number of tokens in this sentence; used below to advance the document-level token index
                    no_tokens = len(sentence_tokens)

                    # Now update example info by concatenating everything
                    for key, val in [('tokens_embeddings', tokens_embeddings),
                                     ('tokens_annotations', tokens_annotations),
                                     ('sentence_embeddings', sentence_embeddings)]:

                        if processed_example[key] is None:
                            processed_example[key] = val
                        else:
                            processed_example[key] = torch.cat((processed_example[key], val), dim=0)

                    starting_token_idx += no_tokens

                    processed_example['tokens'].extend(sentence_tokens)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
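
Both copies of this fragment grow the document-level tensors by calling torch.cat once per sentence, which re-copies everything accumulated so far on each iteration. A minimal sketch of the usual list-then-cat alternative (the helper name and return layout are assumptions for illustration, not taken from the repository):

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import numpy as np
import torch


def accumulate_sentence_tensors(sentences):
    """Hypothetical helper: gather per-sentence tensors, concatenate once.

    Collecting the pieces in Python lists and calling torch.cat a single time
    yields the same tensors as the in-loop concatenation above, but copies
    each element only once instead of re-copying the running result.
    """
    buffers = {'tokens_embeddings': [], 'tokens_annotations': [],
               'sentence_embeddings': []}
    tokens = []

    for sentence in sentences:
        buffers['tokens_embeddings'].append(
            torch.from_numpy(sentence['tokens_embeddings']))
        buffers['tokens_annotations'].append(
            torch.from_numpy(np.array(sentence['tokens_annotations'])).long())
        buffers['sentence_embeddings'].append(
            torch.from_numpy(sentence['sentence_embeddings']))
        tokens.extend(sentence['tokens'])

    processed = {key: torch.cat(vals, dim=0) for key, vals in buffers.items()}
    processed['tokens'] = tokens
    return processed
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -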



parsers/MovieReview/MovieReview_Dataset_Builder.py [120:157]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                processed_example = {'target': torch.tensor([target], dtype=torch.long), 'tokens': [],
                                     'tokens_embeddings': None, 'sentence_embeddings': None,
                                     'tokens_annotations': None}

                # Needed to compute a single indexing across all tokens in the document
                starting_token_idx = 0

                # sentence_idx is used by the reference embeddings
                for sentence_idx, sentence in enumerate(example):
                    unique_sentence_id = sentence['unique_sentence_id']
                    sentence_example_id = sentence['example_id']

                    # The baseline will take the mean of the embeddings at runtime!
                    tokens_annotations = torch.from_numpy(np.array(sentence['tokens_annotations'])).long()  # CLS and SEP already removed
                    tokens_embeddings = torch.from_numpy(sentence['tokens_embeddings'])  # CLS and SEP already removed
                    sentence_embeddings = torch.from_numpy(sentence['sentence_embeddings'])  # CLS and SEP already removed
                    sentence_tokens = sentence['tokens']  # CLS and SEP already removed

                    # Number of tokens in this sentence; used below to advance the document-level token index
                    no_tokens = len(sentence_tokens)

                    # Now update example info by concatenating everything
                    for key, val in [('tokens_embeddings', tokens_embeddings),
                                     ('tokens_annotations', tokens_annotations),
                                     ('sentence_embeddings', sentence_embeddings)]:

                        if processed_example[key] is None:
                            processed_example[key] = val
                        else:
                            processed_example[key] = torch.cat((processed_example[key], val), dim=0)

                    starting_token_idx += no_tokens

                    processed_example['tokens'].extend(sentence_tokens)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
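
Since the two fragments are verbatim copies, the shared loop body could be hoisted into a common helper that both Hatespeech_Dataset_Builder and MovieReview_Dataset_Builder call. A sketch under that assumption (the function name and placement are hypothetical, not from the repository); it mirrors the original loop and could be combined with the list-then-cat pattern sketched above:

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import numpy as np
import torch


def build_processed_example(example, target):
    """Hypothetical shared helper replicating the duplicated loop body."""
    processed_example = {'target': torch.tensor([target], dtype=torch.long),
                         'tokens': [], 'tokens_embeddings': None,
                         'sentence_embeddings': None, 'tokens_annotations': None}

    for sentence in example:
        # CLS and SEP are assumed to be already removed upstream, as in the originals
        tokens_annotations = torch.from_numpy(np.array(sentence['tokens_annotations'])).long()
        tokens_embeddings = torch.from_numpy(sentence['tokens_embeddings'])
        sentence_embeddings = torch.from_numpy(sentence['sentence_embeddings'])

        for key, val in [('tokens_embeddings', tokens_embeddings),
                         ('tokens_annotations', tokens_annotations),
                         ('sentence_embeddings', sentence_embeddings)]:
            if processed_example[key] is None:
                processed_example[key] = val
            else:
                processed_example[key] = torch.cat((processed_example[key], val), dim=0)

        processed_example['tokens'].extend(sentence['tokens'])

    return processed_example
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Each builder would then reduce its copy to a single call, e.g. processed_example = build_processed_example(example, target), keeping any builder-specific bookkeeping (starting_token_idx, the sentence ids) at the call site.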



