# big_patent_preprocess()
#
# Extracted from preprocess-bigpatent.py [0:0]

def big_patent_preprocess(n_jobs, batchsize=100):
    """Tokenize the BIG-PATENT corpus and save each document as a tensor file.

    For every split ('train', 'test', 'validation'), each document's abstract
    and description are concatenated into a single string, encoded with
    ``tokenizer``, and written to ``bigpatent-{split}-{index}.pt`` in the
    current directory. Documents are processed in parallel batches.

    Args:
        n_jobs: Number of joblib worker processes.
        batchsize: Number of documents handled by one worker task.
    """
    import torch
    from joblib import Parallel, delayed
    from datasets import load_dataset
    # NOTE(review): imported but unused in this function in the original —
    # kept in case the import has side effects; confirm before removing.
    from gpt2.data.dataset import CorpusDataset, UserLvlDataset

    # Bug fix: the original imported load_dataset but never called it, so
    # `dataset` was undefined (NameError). Load BIG-PATENT here.
    # TODO(review): confirm the config name ('all' covers every CPC section).
    dataset = load_dataset('big_patent', 'all')

    def text2tensor(split, start):
        # Encode and save documents [start, start + batchsize) of one split.
        for j in range(start, start + batchsize):
            if j >= len(dataset[split]):
                break
            doc = dataset[split][j]
            txt = 'abstract : ' + doc['abstract']
            txt += ' description : ' + doc['description']
            # NOTE(review): `tokenizer` is not defined in this chunk —
            # presumably a module-level GPT-2 tokenizer; confirm.
            txt = torch.tensor(tokenizer.encode(txt))
            torch.save(txt, f'bigpatent-{split}-{j}.pt')

    for split in ['train', 'test', 'validation']:
        # Pass arguments directly through delayed() instead of wrapping the
        # call in a per-iteration lambda (avoids late-binding pitfalls and
        # pickling issues with some joblib backends).
        Parallel(n_jobs=n_jobs)(
            delayed(text2tensor)(split, i)
            for i in range(0, len(dataset[split]), batchsize)
        )