in tokenizers/python/tokenizepythoncorpus.py [0:0]
import glob
import os

def tokenize_all_files(directory: str, output_folder: str, only_ids: bool = False):
    print('Tokenizing in folder %s.' % directory)

    def all_file_tokenizer():
        # Recursively yield the token sequence of every .py file under `directory`.
        for file in glob.iglob(os.path.join(directory, '**', '*.py'), recursive=True):
            if os.path.isdir(file):  # skip directories whose names happen to match '*.py'
                continue
            yield tokenize_file(file, only_ids)  # `tokenize_file` is defined elsewhere in this module

    directory_name = os.path.basename(os.path.normpath(directory))  # robust to a trailing path separator
    # `save_jsonl_gz` streams the generator into a single gzipped JSON-lines file.
    save_jsonl_gz(all_file_tokenizer(), os.path.join(output_folder, directory_name + '-tokens.jsonl.gz'))
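
# A minimal usage sketch, assuming hypothetical paths (`corpus/my-project`,
# `output`) that are not part of the original file: this would tokenize every
# .py file under corpus/my-project and write output/my-project-tokens.jsonl.gz.
if __name__ == '__main__':
    tokenize_all_files('corpus/my-project', 'output', only_ids=False)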