in src/datatuner/lm/model_loader.py [0:0]
def read_special_tokens(task_config=None, special_tokens_file=None, dataset_path=None):
    """Collect special tokens from an optional file and the task configuration.

    Args:
        task_config: optional task-config dict; tokens are pulled from its
            ``"data_shape"`` items whose ``"type"`` is ``"special"`` and from
            an optional ``"extra_special_tokens"`` list.
        special_tokens_file: optional path (str or Path) to a newline-delimited
            token file. Blank lines are ignored.
        dataset_path: directory searched for a ``special_tokens.txt`` file when
            ``special_tokens_file`` is not given explicitly.

    Returns:
        list: the collected tokens, always ending with ``PAD_TOKEN`` and
        ``"<eos>"``.
    """
    tokens = []
    # If no special tokens file is explicitly passed, fall back to a
    # special_tokens.txt file inside the dataset directory.
    if special_tokens_file is None and dataset_path is not None:
        special_tokens_file = Path(dataset_path) / "special_tokens.txt"
    if special_tokens_file is not None:
        # Coerce to Path so plain-string arguments don't crash on .exists().
        special_tokens_file = Path(special_tokens_file)
        if special_tokens_file.exists():
            # splitlines() handles both \n and \r\n endings; split("\n") would
            # leave a trailing "\r" inside each token on CRLF files.
            tokens += [x for x in special_tokens_file.read_text().splitlines() if x.strip()]
            logger.info(f"read {len(tokens)} special tokens from {special_tokens_file}")
    if task_config is not None:
        # add any special tokens defined in the tokenization
        for item in task_config["data_shape"]:
            if item["type"] == "special":
                tokens.append(item["id"])
        if "extra_special_tokens" in task_config:
            tokens.extend(task_config["extra_special_tokens"])
    # Add basic eos and padding tokens
    tokens += [PAD_TOKEN, "<eos>"]
    return tokens