in src/hyperpod_nemo_adapter/conf/config_schemas.py [0:0]
def after_model_validations(self) -> "BaseTrainerConfig":
    """Cross-check ``devices`` and ``num_nodes`` against the launcher environment.

    The check only runs when both ``LOCAL_WORLD_SIZE`` and ``WORLD_SIZE`` are
    present in the process environment (these are the variables torchrun
    exports). ``LOCAL_WORLD_SIZE`` is taken as the real per-node device count,
    and ``WORLD_SIZE // LOCAL_WORLD_SIZE`` as the real node count.

    ``devices`` is compared only when it is an ``int``; any other kind of
    value is left unchecked. ``num_nodes`` is always compared.

    Returns:
        ``self``, unmodified.

    Raises:
        ValueError: If an integer ``devices`` differs from the per-node
            device count, or if ``num_nodes`` differs from the derived node
            count. Also raised by ``int()`` when either environment variable
            is not a valid integer literal.
    """
    env = os.environ

    # Guard clause: with either variable missing there is nothing to compare.
    if "LOCAL_WORLD_SIZE" not in env or "WORLD_SIZE" not in env:
        return self

    # Values exported by the launcher (torchrun).
    procs_per_node = int(env["LOCAL_WORLD_SIZE"])
    node_count = int(env["WORLD_SIZE"]) // procs_per_node

    # Non-integer 'devices' settings are deliberately not validated here.
    devices_mismatch = isinstance(self.devices, int) and self.devices != procs_per_node
    if devices_mismatch:
        raise ValueError(
            f"'devices' ({self.devices}) does not equal actual number of devices ({procs_per_node})"
        )

    nodes_mismatch = self.num_nodes != node_count
    if nodes_mismatch:
        raise ValueError(
            f"'num_nodes' ({self.num_nodes}) does not equal actual number of nodes ({node_count})"
        )

    return self