def read_special_tokens()

in src/datatuner/lm/model_loader.py
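
The excerpt relies on three module-level names defined elsewhere in model_loader.py: Path, logger, and PAD_TOKEN. A minimal sketch of that surrounding context, assuming a standard logging setup and a "<pad>" value for PAD_TOKEN (the value is an assumption, not confirmed by this excerpt):

import logging
from pathlib import Path

logger = logging.getLogger(__name__)
PAD_TOKEN = "<pad>"  # assumed value; the real constant lives elsewhere in DataTuner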


def read_special_tokens(task_config=None, special_tokens_file=None, dataset_path=None):
    """Read special tokens from file and from the task configuration"""
    tokens = []
    # If no special tokens file is explicitly passed, try finding a special_tokens.txt file in the dataset directory
    if special_tokens_file is None:
        if dataset_path is not None:
            special_tokens_file = Path(dataset_path) / "special_tokens.txt"
    else:
        # Coerce to a Path so the .exists() and .read_text() calls below also accept plain string paths
        special_tokens_file = Path(special_tokens_file)

    # Add any special tokens indicated in the file
    if special_tokens_file is not None and special_tokens_file.exists():
        tokens += [x for x in special_tokens_file.read_text().split("\n") if x.strip()]
        logger.info(f"read {len(tokens)} special tokens from {special_tokens_file}")

    if task_config is not None:
        # Add any special tokens declared in the task configuration's data shape
        for item in task_config["data_shape"]:
            if item["type"] == "special":
                tokens += [item["id"]]

        if "extra_special_tokens" in task_config:
            tokens.extend(task_config["extra_special_tokens"])

    # Add basic eos and padding tokens
    tokens += [PAD_TOKEN, "<eos>"]

    return tokens
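
For context, here is a hypothetical call site showing how the pieces fit together. The task_config layout mirrors the data_shape and extra_special_tokens keys read above; the token names, the dataset path, and the use of a Hugging Face transformers tokenizer downstream are all illustrative assumptions:

task_config = {
    "data_shape": [
        {"id": "<data>", "type": "special"},  # special marker token (hypothetical)
        {"id": "source", "type": "text"},     # ordinary text field; contributes no token
    ],
    "extra_special_tokens": ["<text>"],
}

tokens = read_special_tokens(task_config=task_config, dataset_path="data/my_dataset")
# -> any lines from data/my_dataset/special_tokens.txt, followed by
#    ["<data>", "<text>", "<pad>", "<eos>"]

# Registering the tokens with a Hugging Face tokenizer (an assumed downstream use):
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_tokens(tokens)  # returns the number of tokens newly added to the vocabulary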