in src/nanotron/eval/one_job_runner.py [0:0]
def eval_single_checkpoint(self, uploaded_files: List[dict]) -> Tuple[str, str]:
    """Launch a lighteval Slurm job for the checkpoint found in ``uploaded_files``.

    The checkpoint location is derived from the single config file
    (``config.yaml`` or ``config.py``) present among the uploaded files.

    Args:
        uploaded_files: One dict per uploaded file; each dict must have a
            ``"destination"`` key holding the path the file was uploaded to.

    Returns:
        ``(slurm_job_id, slurm_log)`` as returned by ``run_slurm_one_job`` when
        a job is launched. ``(None, None)`` on every path where no job is
        launched (step not on the evaluation interval, or zero / several
        config files found), so callers can always unpack two values.
    """
    step = self.config.general.step

    # Cheap early exit before inspecting the uploaded files. A ``None``
    # interval means "no interval filtering".
    configured_interval = self.config.lighteval.eval_interval
    if configured_interval is not None and step % configured_interval != 0:
        logger.debug(
            f"Skipping evaluation at step {step} because eval_interval is {configured_interval}"
        )
        return None, None

    config_files = [
        f for f in uploaded_files if "config.py" in f["destination"] or "config.yaml" in f["destination"]
    ]

    # Sanity check: exactly one config file is needed to locate the checkpoint.
    if len(config_files) != 1:
        if not config_files:
            message = "No config files founds in uploaded checkpoints. Not running evaluation."
        else:
            message = f"Found multiple config files in uploaded checkpoints: {config_files}"
        log_rank(
            message,
            logger=logger,
            level=logging.ERROR,
            group=self.parallel_context.dp_pg if self.parallel_context is not None else None,
            rank=0,
        )
        return None, None

    # Strip the config file name to obtain the checkpoint directory. Both names
    # accepted by the filter above are removed, so a ``config.py`` match does
    # not leave the file name inside the checkpoint path.
    checkpoint_path = config_files[0]["destination"]
    for config_name in ("config.yaml", "config.py"):
        checkpoint_path = checkpoint_path.replace(config_name, "")
    logger.warning(
        f"Lighteval Runner got {len(uploaded_files)} files. Using {checkpoint_path} as checkpoint path."
    )

    # NOTE(review): ``self.lighteval_config`` is presumably the same object as
    # ``self.config.lighteval`` -- confirm. It is checked separately here in
    # case the two differ; ``None`` is guarded so it cannot reach ``%``.
    runner_interval = self.lighteval_config.eval_interval
    if runner_interval is not None and step % runner_interval != 0:
        logger.warning(
            f"Skipping evaluation at step {step} because it's not a multiple of {runner_interval}"
        )
        return None, None

    slurm_job_id, slurm_log = run_slurm_one_job(
        config=self.config,
        lighteval_config=self.lighteval_config,
        model_checkpoint_path=checkpoint_path,
        current_step=step,
    )
    return slurm_job_id, slurm_log