in src/nanotron/config/config.py [0:0]
def __post_init__(self):
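    # (Relies on glob, os, and transformers.AutoTokenizer, imported at the top of config.py.)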
    if isinstance(self.dataset_folder, str):  # Case 1: a single dataset folder
        self.dataset_folder = [self.dataset_folder]
        self.dataset_weights = [1]

    # Check that dataset_weights, if provided, matches the number of dataset folders
    if self.dataset_weights is not None and len(self.dataset_weights) != len(self.dataset_folder):
        raise ValueError(
            f"Number of dataset weights ({len(self.dataset_weights)}) does not match number of dataset folders ({len(self.dataset_folder)})"
        )
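    # Each weight pairs with the folder at the same index, e.g. two folders with
    # dataset_weights=[0.7, 0.3] for a 70/30 blend (illustrative values; the weights
    # presumably act as relative sampling proportions when datasets are mixed).
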
    # Read the first metadata file in each dataset folder to extract the tokenizer
    # name and token size, and check that they agree across folders.
    for folder in self.dataset_folder:
        # Find all metadata files in the folder
        metadata_files = glob.glob(os.path.join(folder, "*.metadata"))
        if metadata_files:
            # Read the first line of the first metadata file
            with open(metadata_files[0], "r") as f:
                first_line = f.readline().strip()
                if "|" in first_line:
                    tokenizer_name, token_size_in_bytes = first_line.split("|")
                    if self.tokenizer_name is None:
                        # First folder seen: adopt its tokenizer name and token size
                        self.tokenizer_name = tokenizer_name
                        self.token_size_in_bytes = int(token_size_in_bytes)
                        self.vocab_size = len(AutoTokenizer.from_pretrained(tokenizer_name).get_vocab())
                    else:
                        # Subsequent folders must match the first one
                        assert (
                            self.tokenizer_name == tokenizer_name
                        ), f"Tokenizer name mismatch while reading dataset metadata files, found both {self.tokenizer_name} and {tokenizer_name}"
                        assert self.token_size_in_bytes == int(
                            token_size_in_bytes
                        ), f"Token size mismatch while reading dataset metadata files, found both {self.token_size_in_bytes} and {token_size_in_bytes}"

    # Check that dataset_read_path, if provided, matches the number of dataset folders
    if self.dataset_read_path is not None and len(self.dataset_read_path) != len(self.dataset_folder):
        raise ValueError(
            f"Number of dataset read paths ({len(self.dataset_read_path)}) does not match number of dataset folders ({len(self.dataset_folder)})"
        )
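
A minimal sketch of the metadata handshake this method expects, assuming each dataset folder carries a *.metadata file whose first line is <tokenizer_name>|<token_size_in_bytes>. The file name and the gpt2|2 payload below are illustrative assumptions (2 bytes per token, i.e. a vocabulary that fits in uint16), not values taken from nanotron:

import glob
import os
import tempfile

# Hypothetical folder layout: one dataset folder containing one metadata file.
with tempfile.TemporaryDirectory() as folder:
    with open(os.path.join(folder, "train.metadata"), "w") as f:
        f.write("gpt2|2\n")  # tokenizer name "gpt2", 2 bytes per token

    # Mirrors the discovery and parsing steps in __post_init__ above.
    metadata_files = glob.glob(os.path.join(folder, "*.metadata"))
    with open(metadata_files[0], "r") as f:
        first_line = f.readline().strip()
    tokenizer_name, token_size_in_bytes = first_line.split("|")
    assert tokenizer_name == "gpt2" and int(token_size_in_bytes) == 2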