def __post_init__()

in src/nanotron/config/config.py


    def __post_init__(self):
        if isinstance(self.dataset_folder, str):  # Case 1: a single dataset folder passed as a string
            self.dataset_folder = [self.dataset_folder]
            self.dataset_weights = [1]

        # If dataset_weights is provided, its length must match the number of dataset folders
        if self.dataset_weights is not None and len(self.dataset_weights) != len(self.dataset_folder):
            raise ValueError(
                f"Number of dataset weights ({len(self.dataset_weights)}) does not match number of dataset folders ({len(self.dataset_folder)})"
            )

        # For each dataset folder, read the first line of its first metadata file to extract the
        # tokenizer name and token size, and verify that every folder was tokenized consistently.
        for folder in self.dataset_folder:
            # Find all metadata files in the folder
            metadata_files = glob.glob(os.path.join(folder, "*.metadata"))
            if metadata_files:
                # Read the first line of the first metadata file
                with open(metadata_files[0], "r") as f:
                    first_line = f.readline().strip()
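                    # Expected metadata line format: "<tokenizer_name>|<token_size_in_bytes>"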
                    if "|" in first_line:
                        tokenizer_name, token_size_in_bytes = first_line.split("|")
                        if self.tokenizer_name is None:
                            self.tokenizer_name = tokenizer_name
                            self.token_size_in_bytes = int(token_size_in_bytes)
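                            # vocab_size is not stored in the metadata file; derive it from the tokenizer itself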
                            self.vocab_size = len(AutoTokenizer.from_pretrained(tokenizer_name).get_vocab())
                        else:
                            assert (
                                self.tokenizer_name == tokenizer_name
                            ), f"Tokenizer name mismatch while reading datasets metadata file, found both {self.tokenizer_name} and {tokenizer_name}"
                            assert self.token_size_in_bytes == int(
                                token_size_in_bytes
                            ), f"Token size mismatch while reading datasets metadata file, found both {self.token_size_in_bytes} and {token_size_in_bytes}"

        # If dataset_read_path is provided, its length must match the number of dataset folders
        if self.dataset_read_path is not None and len(self.dataset_read_path) != len(self.dataset_folder):
            raise ValueError(
                f"Number of dataset read paths ({len(self.dataset_read_path)}) does not match number of dataset folders ({len(self.dataset_folder)})"
            )
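
For reference, here is a minimal, self-contained sketch of the same normalization and metadata-parsing pattern. The MiniDatasetArgs dataclass and the 000.metadata file name are hypothetical stand-ins for the real config class and its tokenized-dataset files, and the AutoTokenizer vocab lookup is elided so the sketch runs without transformers installed:

    import glob
    import os
    import tempfile
    from dataclasses import dataclass
    from typing import List, Optional, Union

    @dataclass
    class MiniDatasetArgs:  # hypothetical stand-in for the real config class
        dataset_folder: Union[str, List[str]]
        dataset_weights: Optional[List[float]] = None
        tokenizer_name: Optional[str] = None
        token_size_in_bytes: Optional[int] = None

        def __post_init__(self):
            if isinstance(self.dataset_folder, str):  # single folder -> one-element list, weight 1
                self.dataset_folder = [self.dataset_folder]
                self.dataset_weights = [1]
            for folder in self.dataset_folder:
                metadata_files = glob.glob(os.path.join(folder, "*.metadata"))
                if metadata_files:
                    with open(metadata_files[0]) as f:
                        name, size = f.readline().strip().split("|")
                        self.tokenizer_name = name
                        self.token_size_in_bytes = int(size)

    # Usage: write a metadata file, then let __post_init__ pick it up.
    folder = tempfile.mkdtemp()
    with open(os.path.join(folder, "000.metadata"), "w") as f:
        f.write("gpt2|2\n")  # "<tokenizer_name>|<token_size_in_bytes>"

    args = MiniDatasetArgs(dataset_folder=folder)
    assert args.dataset_folder == [folder] and args.dataset_weights == [1]
    assert args.tokenizer_name == "gpt2" and args.token_size_in_bytes == 2

Note that the real method goes further: it cross-checks the tokenizer name and token size across all folders, resolves vocab_size via AutoTokenizer, and validates the lengths of dataset_weights and dataset_read_path against the folder list.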