in pretrain/PyTorch/sources.py [0:0]
def __init__(self, path, tokenizer: BertTokenizer, max_seq_length: int = 512,
             readin: int = 2000000, dupe_factor: int = 6, small_seq_prob: float = 0.1):
    # Note: requires `from random import shuffle` and `from tqdm import tqdm`
    # at the top of sources.py. (`readin` is unused in this constructor.)
    self.dupe_factor = dupe_factor
    self.max_seq_length = max_seq_length
    self.small_seq_prob = small_seq_prob

    documents = []
    instances = []
    with open(path, encoding='utf-8') as fd:
        document = []
        for line in tqdm(fd):
            line = line.rstrip('\n')
            # A "[[" header marks the start of a new article, which means the
            # previous document is complete: flush it and start a fresh one.
            if len(line) > 0 and line[:2] == "[[":
                documents.append(document)
                document = []
            # Keep only lines with more than two whitespace-separated tokens;
            # each kept line is stored as its list of wordpiece tokens.
            if len(line.split(' ')) > 2:
                document.append(tokenizer.tokenize(line))
        # The final document has no trailing "[[" header, so flush it here.
        if len(document) > 0:
            documents.append(document)

    # Drop empty documents (produced e.g. by consecutive "[[" headers).
    documents = [x for x in documents if x]
    self.documents = documents

    # Build training instances from every document dupe_factor times;
    # instance creation is randomized (cf. small_seq_prob), so duplicates differ.
    for _ in range(self.dupe_factor):
        for index in range(len(self.documents)):
            instances.extend(self.create_training_instance(index))

    shuffle(instances)
    self.instances = instances
    self.len = len(self.instances)

    # Release the raw documents; only the pre-built instances are kept.
    self.documents = None
    documents = None
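
For context, a minimal usage sketch of this constructor. The class name PretrainingDataCreator is a placeholder, since the excerpt does not show the enclosing class, and the corpus path and pretrained model name are illustrative; only the constructor signature above is taken from the source. BertTokenizer.from_pretrained is the standard loader in both pytorch-pretrained-bert and transformers.

# Usage sketch -- PretrainingDataCreator is a placeholder name for the class
# that owns the __init__ above; the path and model name are illustrative.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
creator = PretrainingDataCreator('data/wiki_corpus.txt', tokenizer,
                                 max_seq_length=512,
                                 dupe_factor=6,
                                 small_seq_prob=0.1)

print(creator.len)             # total number of shuffled training instances
sample = creator.instances[0]  # one pre-built pretraining instance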