parsers/MovieReview/MovieReview_Finetune_Preprocess.py [16:158]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class MovieReviewInputFeatures(object):
    """A single set of features of data."""

    def __init__(self, unique_example_id, unique_sentence_id, tokens, annotations, input_ids, input_mask, input_type_ids):
        self.unique_example_id = unique_example_id
        self.unique_sentence_id = unique_sentence_id
        self.tokens = tokens
        self.annotations = annotations
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.input_type_ids = input_type_ids


# Defines a single Input Example
class MovieReviewInputExample(object):

    def __init__(self, unique_id, target, text_a, text_b):
        self.unique_id = unique_id
        self.target = target
        self.text_a = text_a
        self.text_b = text_b
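

# Hedged usage sketch (not part of the original sources): how the two containers
# above are typically filled for a single sentence, assuming a BERT-style
# tokenizer that exposes tokenize() and convert_tokens_to_ids(). The concrete
# values (unique_id=0, the sample sentence, annotations of all zeros) are
# illustrative assumptions, not dataset values.
def _example_build_feature(tokenizer, seq_length=128):
    example = MovieReviewInputExample(
        unique_id=0,
        target=1,                                        # +1 = positive, -1 = negative
        text_a=['this movie was surprisingly good .'],   # list of sentence strings
        text_b=None,                                     # unused, always None here
    )

    sentence = example.text_a[0]
    tokens = ['[CLS]'] + tokenizer.tokenize(sentence)[:seq_length - 2] + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    # Zero-pad up to seq_length so every sentence has a fixed-size encoding.
    padding = [0] * (seq_length - len(input_ids))

    return MovieReviewInputFeatures(
        unique_example_id=example.unique_id,
        unique_sentence_id=0,
        tokens=tokens,
        annotations=[0] * len(tokens),                   # assumed: 0 = not a rationale token
        input_ids=input_ids + padding,
        input_mask=input_mask + padding,
        input_type_ids=[0] * seq_length,                 # single-segment input
    )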


def load_moviereview(data_folder):
    no_rats_neg, no_rats_pos = 'noRats_neg', 'noRats_pos'
    rats_neg, rats_pos = 'withRats_neg', 'withRats_pos'

    # Pair construction: to avoid computational issues with full documents, we split each document into sentences,
    # build ordered pairs within each sentence, and then combine all the per-sentence pairs together.

    # The TEST SET consists of the LAST 100 non-annotated negative and the LAST 100 non-annotated positive documents

    annotated_dataset = []
    test_set = []  # will hold the last 100 examples from each non-annotated folder

    max_length = 0    # longest document seen (diagnostic only; not returned)
    doc_lengths = []  # per-document lengths (diagnostic only; not returned)

    # Create training + validation set
    for polarity in [rats_neg, rats_pos]:
        for filename in sorted(os.listdir(Path(data_folder, 'review_polarity_rationales', polarity))):
            target = 1 if polarity == rats_pos else -1
            with open(Path(data_folder, 'review_polarity_rationales', polarity, filename), 'r') as f:
                data = f.read()
                annotated_dataset.append({'sample': data, 'target': target, 'filename': filename})

    # Create test set out of the entire data
    for polarity in [no_rats_neg, no_rats_pos]:
        polarity_set = []
        for filename in sorted(os.listdir(Path(data_folder, 'review_polarity_rationales', polarity))):
            target = 1 if polarity == no_rats_pos else -1
            with open(Path(data_folder, 'review_polarity_rationales', polarity, filename), 'r') as f:
                data = f.read()

                doc_len = len(data.split())  # Rough length measure (whitespace split, no tokenization). TODO: use the tokenizer instead?
                doc_lengths.append(doc_len)
                max_length = max(max_length, doc_len)
                polarity_set.append({'sample': data, 'target': target, 'filename': filename})

        polarity_set = polarity_set[-100:]
        test_set.extend(polarity_set)

    return annotated_dataset, test_set
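

# Hedged usage sketch (not part of the original sources): loading the corpus,
# assuming `data_folder` points at a directory containing the
# review_polarity_rationales/ layout read above. The default path is an
# illustrative assumption.
def _example_load(data_folder='data/moviereview'):
    annotated, test = load_moviereview(data_folder)
    # Each entry is a dict: {'sample': <raw text>, 'target': +1 or -1, 'filename': <str>}
    print(f'{len(annotated)} annotated documents (train/validation pool)')
    print(f'{len(test)} non-annotated documents held out as the test set')
    return annotated, test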


def split_sentences(pipeline, data_folder, train, test):

    for dataset_type, dataset in [('train', train), ('test', test)]:

        idx = 0
        for example in dataset:
            idx += 1
            if idx % 100 == 0:
                print(f'Parsing sample no {idx}', end='\r')

            doc = example['sample']
            doc = pipeline(doc)

            new_example = []
            first_sentence = True
            sentence = []

            for token in doc:
                if token.is_sent_start and first_sentence:
                    sentence.append(token.text)
                    first_sentence = False

                elif token.is_sent_start and not first_sentence:
                    new_example.append(" ".join(sentence))
                    sentence = []
                    sentence.append(token.text)
                else:
                    sentence.append(token.text)

            new_example.append(" ".join(sentence))
            example['sample'] = new_example
            # print(new_example)

        print('')

        print(f'{len(dataset)} {dataset_type} examples have been split into sentences')
        with open(Path(data_folder, f'splitted_{dataset_type}_sentences.json'), 'w', encoding='utf-8') as f:
            json.dump(dataset, f)
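

# Hedged usage sketch (not part of the original sources): split_sentences expects
# a spaCy-style pipeline whose tokens expose `.is_sent_start` and `.text`. The
# model name below and the data folder are illustrative assumptions.
def _example_split(data_folder='data/moviereview'):
    import spacy

    nlp = spacy.load('en_core_web_sm')   # any pipeline that sets sentence boundaries
    train, test = load_moviereview(data_folder)
    # Afterwards each example['sample'] is a list of sentence strings, and the
    # splits are also written to splitted_train_sentences.json / splitted_test_sentences.json.
    split_sentences(nlp, data_folder, train, test)
    return train, test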


def _convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputFeature`s."""

    # Holds one entry per example; each entry is the list of per-sentence features for that example
    dataset_features = []
    example_no = 0
    for example in examples:
        # get example unique ID
        example_unique_id = example.unique_id

        # get target label associated to the document
        example_target = example.target

        # get the sentences
        sentences = example.text_a  # text_b always None

        # instantiate a list of features, one per sentence
        example_features = []

        # The parsed sentence with <pos> and <neg> tags recombined
        parsed_example = []

        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)

            tokens_sentence = []  # the tokens with "corrected" annotation placeholders

            # ------ Finite State Machine to replace specific substrings  ------ #
            left_out_tokens = []
            possible_match = False
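            # Descriptive note (inferred, hedged): WordPiece-style tokenization
            # typically breaks the rationale markers <pos> / <neg> into separate
            # pieces such as '<', 'pos', '>'. The loop below buffers pieces in
            # `left_out_tokens` once a '<' is seen; a completed tag is re-emitted
            # as a single annotation placeholder, while non-matching pieces are
            # flushed back into `tokens_sentence` unchanged.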

            for token in tokens:
                if not possible_match:
                    if token == '<':
                        possible_match = True  # start tracking possible tag
                        left_out_tokens.append(token)
                    else:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



parsers/MovieReview/MovieReview_Preprocess.py [16:158]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class MovieReviewInputFeatures(object):
    """A single set of features of data."""

    def __init__(self, unique_example_id, unique_sentence_id, tokens, annotations, input_ids, input_mask, input_type_ids):
        self.unique_example_id = unique_example_id
        self.unique_sentence_id = unique_sentence_id
        self.tokens = tokens
        self.annotations = annotations
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.input_type_ids = input_type_ids


# Defines a single Input Example
class MovieReviewInputExample(object):

    def __init__(self, unique_id, target, text_a, text_b):
        self.unique_id = unique_id
        self.target = target
        self.text_a = text_a
        self.text_b = text_b


def load_moviereview(data_folder):
    no_rats_neg, no_rats_pos = 'noRats_neg', 'noRats_pos'
    rats_neg, rats_pos = 'withRats_neg', 'withRats_pos'

    # Pair construction: to avoid computational issues with full documents, we split each document into sentences,
    # build ordered pairs within each sentence, and then combine all the per-sentence pairs together.

    # The TEST SET consists of the LAST 100 non-annotated negative and the LAST 100 non-annotated positive documents

    annotated_dataset = []
    test_set = []  # will hold the last 100 examples from each non-annotated folder

    max_length = 0    # longest document seen (diagnostic only; not returned)
    doc_lengths = []  # per-document lengths (diagnostic only; not returned)

    # Create training + validation set
    for polarity in [rats_neg, rats_pos]:
        for filename in sorted(os.listdir(Path(data_folder, 'review_polarity_rationales', polarity))):
            target = 1 if polarity == rats_pos else -1
            with open(Path(data_folder, 'review_polarity_rationales', polarity, filename), 'r') as f:
                data = f.read()
                annotated_dataset.append({'sample': data, 'target': target, 'filename': filename})

    # Create test set out of the entire data
    for polarity in [no_rats_neg, no_rats_pos]:
        polarity_set = []
        for filename in sorted(os.listdir(Path(data_folder, 'review_polarity_rationales', polarity))):
            target = 1 if polarity == no_rats_pos else -1
            with open(Path(data_folder, 'review_polarity_rationales', polarity, filename), 'r') as f:
                data = f.read()

                doc_len = len(data.split())  # Rough length measure (whitespace split, no tokenization). TODO: use the tokenizer instead?
                doc_lengths.append(doc_len)
                max_length = max(max_length, doc_len)
                polarity_set.append({'sample': data, 'target': target, 'filename': filename})

        polarity_set = polarity_set[-100:]
        test_set.extend(polarity_set)

    return annotated_dataset, test_set


def split_sentences(pipeline, data_folder, train, test):

    for dataset_type, dataset in [('train', train), ('test', test)]:

        idx = 0
        for example in dataset:
            idx += 1
            if idx % 100 == 0:
                print(f'Parsing sample no {idx}', end='\r')

            doc = example['sample']
            doc = pipeline(doc)

            new_example = []
            first_sentence = True
            sentence = []

            for token in doc:
                if token.is_sent_start and first_sentence:
                    sentence.append(token.text)
                    first_sentence = False

                elif token.is_sent_start and not first_sentence:
                    new_example.append(" ".join(sentence))
                    sentence = []
                    sentence.append(token.text)
                else:
                    sentence.append(token.text)

            new_example.append(" ".join(sentence))
            example['sample'] = new_example
            # print(new_example)

        print('')

        print(f'{len(dataset)} {dataset_type} examples have been split into sentences')
        with open(Path(data_folder, f'splitted_{dataset_type}_sentences.json'), 'w', encoding='utf-8') as f:
            json.dump(dataset, f)


def _convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputFeature`s."""

    # Holds one entry per example; each entry is the list of per-sentence features for that example
    dataset_features = []
    example_no = 0
    for example in examples:
        # get example unique ID
        example_unique_id = example.unique_id

        # get target label associated to the document
        example_target = example.target

        # get the sentences
        sentences = example.text_a  # text_b always None

        # instantiate a list of features, one per sentence
        example_features = []

        # The parsed sentence with <pos> and <neg> tags recombined
        parsed_example = []

        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)

            tokens_sentence = []  # the tokens with "corrected" annotation placeholders

            # ------ Finite State Machine to replace specific substrings  ------ #
            left_out_tokens = []
            possible_match = False

            for token in tokens:
                if not possible_match:
                    if token == '<':
                        possible_match = True  # start tracking possible tag
                        left_out_tokens.append(token)
                    else:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



