def tokenize_all_files()

in tokenizers/python/tokenizepythoncorpus.py [0:0]


def tokenize_all_files(directory: str, output_folder: str, only_ids: bool=False):
    """Tokenize every ``*.py`` file under *directory* (recursively) and save
    the results as a gzipped JSONL file in *output_folder*.

    The output file is named ``<basename(directory)>-tokens.jsonl.gz``.

    :param directory: root folder scanned recursively for Python files.
    :param output_folder: destination folder for the tokens file (created
        if it does not already exist).
    :param only_ids: forwarded to ``tokenize_file``; presumably restricts
        output to identifier tokens — confirm against that helper.
    """
    print('Tokenizing in folder %s.' % directory)

    def all_file_tokenizer():
        # Yield lazily so the whole corpus never has to fit in memory.
        for file in glob.iglob(os.path.join(directory, '**', '*.py'), recursive=True):
            # Guard against directories whose names happen to end in ".py".
            if os.path.isdir(file):
                continue
            yield tokenize_file(file, only_ids)

    # normpath strips a trailing separator; without it, basename('foo/')
    # is '' and the output file would be named just '-tokens.jsonl.gz'.
    directory_name = os.path.basename(os.path.normpath(directory))
    os.makedirs(output_folder, exist_ok=True)  # ensure destination exists
    save_jsonl_gz(all_file_tokenizer(), os.path.join(output_folder, directory_name + '-tokens.jsonl.gz'))