in src/datatrove/pipeline/tokens/tokenizer.py [0:0]
def __init__(
self,
output_folder: DataFolderLike,
tokenizer_name_or_path: str, # tokenizer to use, from HF or a local path
local_working_dir: DataFolderLike | None = None,
save_filename: str | None = None, # if defined, the final output filename will be this
# if not None, will override the postprocessor to add the EOS token after each document.
# Pass the token text (e.g. "<|endoftext|>"), not its id
eos_token: str | None = None,
save_index: bool = True, # save the beginning and end of each document in the index file
save_loss_metadata: bool = False, # save the loss information
save_final_metadata: bool = True, # save a small .metadata file at the end with token count and the name of the tokenizer
batch_size: int = 10000, # batch size for tokenization
max_tokens_per_file: int | None = None, # max tokens per file to get more (smaller) shuffled output files
seed: int | None = None,  # seed used for shuffling
# you can set this if your s3 uploads are failing with "Part number must be an
# integer between 1 and 10000, inclusive". Example: 20 * 2**20 (20MB)
upload_block_size: int | None = None,
shuffle_documents: bool = True,  # shuffle the order of documents in the output files
shuffle_chunk_size: int | None = None,  # if set, shuffle fixed-size token chunks of this size instead of whole documents
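
# --- Hypothetical usage sketch (not part of the original signature above) ---
# Assumes this __init__ belongs to datatrove's DocumentTokenizer pipeline step and that
# JsonlReader / LocalPipelineExecutor come from the same library; the s3 paths below are
# placeholders, and defaults not shown here follow the signature above.
from datatrove.executor import LocalPipelineExecutor
from datatrove.pipeline.readers import JsonlReader
from datatrove.pipeline.tokens import DocumentTokenizer

pipeline = [
    JsonlReader("s3://my-bucket/cleaned-data/"),  # placeholder input folder
    DocumentTokenizer(
        output_folder="s3://my-bucket/tokenized/",  # where the tokenized output is written
        tokenizer_name_or_path="gpt2",              # HF hub name or local tokenizer path
        eos_token="<|endoftext|>",                  # token text appended after each document
        batch_size=10000,
        shuffle_documents=True,
        seed=42,
        upload_block_size=20 * 2**20,               # 20MB blocks, for multipart s3 uploads
    ),
]

LocalPipelineExecutor(pipeline=pipeline, tasks=4).run()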