# preprocess-bigpatent.py
def big_patent_preprocess(n_jobs, batchsize=100):
    """Tokenize the BIGPATENT dataset and save each document as a tensor file.

    For every split ('train', 'test', 'validation'), each document's abstract
    and description are concatenated, encoded with the module-level
    ``tokenizer``, and saved to ``bigpatent-{split}-{index}.pt``.  Documents
    are processed in batches of ``batchsize`` fanned out over ``n_jobs``
    joblib workers.

    Args:
        n_jobs: number of parallel joblib worker processes.
        batchsize: number of documents handled by each worker task.
    """
    # Local imports: the original referenced torch / Parallel / delayed
    # without importing them anywhere visible; bring them into scope here.
    import torch
    from datasets import load_dataset
    from joblib import Parallel, delayed

    # BUG FIX: the original never assigned `dataset`, so every reference to
    # it raised NameError.  Also removed a stray ':q' (leaked editor command)
    # and an unused import from the original line 4.
    # TODO(review): confirm the dataset name/config against the project's
    # other scripts — 'big_patent' typically requires a config such as 'all'.
    dataset = load_dataset('big_patent')

    def text2tensor(dset, start):
        """Encode and save documents [start, start + batchsize) of one split."""
        split = dataset[dset]
        stop = min(start + batchsize, len(split))
        for j in range(start, stop):
            doc = split[j]
            txt = 'abstract : ' + doc['abstract']
            txt += ' description : ' + doc['description']
            # NOTE(review): `tokenizer` must be defined at module level —
            # it is not created in this function.
            encoded = torch.tensor(tokenizer.encode(txt))
            torch.save(encoded, f'bigpatent-{dset}-{j}.pt')

    for dset in ['train', 'test', 'validation']:
        # Bind `dset` eagerly by passing it through delayed() instead of
        # capturing it in a lambda (avoids the late-binding closure pitfall).
        Parallel(n_jobs=n_jobs)(
            delayed(text2tensor)(dset, i)
            for i in range(0, len(dataset[dset]), batchsize)
        )