def __init__()

in pretrain/PyTorch/sources.py


    def __init__(self, path, tokenizer: BertTokenizer, max_seq_length: int = 512, readin: int = 2000000, dupe_factor: int = 6, small_seq_prob: float = 0.1):
        # NOTE: readin is accepted but never used in the body shown here
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        instances = []
        with open(path, encoding='utf-8') as fd:
            document = []
            for i, line in enumerate(tqdm(fd)):
                line = line.rstrip('\n')
                if line.startswith("[["):  # a "[[" title line opens a new article, so flush the previous document
                    documents.append(document)
                    document = []
                if len(line.split(' ')) > 2:  # keep only lines with more than two words
                    document.append(tokenizer.tokenize(line))
            if len(document) > 0:  # flush the last document at end of file
                documents.append(document)

        documents = [x for x in documents if x]  # drop documents that ended up empty
        self.documents = documents
        # repeat instance creation dupe_factor times so each document
        # contributes several independently sampled training instances
        for _ in range(self.dupe_factor):
            for index in range(len(self.documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)  # assumes "from random import shuffle" at module level
        self.instances = instances
        self.len = len(self.instances)
        self.documents = None  # release the tokenized documents; only the shuffled instances are kept
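
A minimal usage sketch. The class name WikiPretrainingData, the corpus
filename, and the tokenizer import path are assumptions for illustration;
only the constructor arguments come from the signature above, and the
expected file layout (one sentence per line, with "[[" title lines
separating articles) is inferred from the parsing loop.

    from pytorch_pretrained_bert import BertTokenizer  # assumed import path

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    data = WikiPretrainingData(      # hypothetical name for the enclosing class
        path='corpus.txt',           # hypothetical corpus: one sentence per line, "[[Title]]" article breaks
        tokenizer=tokenizer,
        max_seq_length=512,
        dupe_factor=6,               # every document is processed 6 times
        small_seq_prob=0.1,          # per the parameter name, likely the chance of a short sequence
    )
    print(data.len)                  # number of shuffled training instances

Note that create_training_instance, called in the loop above, is presumably
defined elsewhere in sources.py and is not shown here.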