in chatlearn/schedule/model_manager.py [0:0]
def remote(self) -> list:
    """
    Convert every local model into a remote DistModel and place it on devices.

    The conversion runs at most once: later calls return the cached
    ``dist_models`` list. Placement happens in two passes — colocated model
    groups (declared in ``runtime_args.colocation``) first, then every model
    not covered by a colocation group — after which the distributed
    environment is configured for all placed models.

    Returns:
        list: one DistModel per local model.

    Raises:
        RuntimeError: if the job requires more GPUs than were applied for.
    """
    logger.info(f"{LOG_START} model_manager start to convert model to remote")
    ts_start = time.time()
    if self.converted:
        # Conversion already happened on an earlier call; reuse the result.
        return self.dist_models

    self._name2distmodel = {}
    placed_by_colocation = set()

    # Wrap each local model in a DistModel and index it by name.
    for local_model in self.local_models:
        wrapped = self._to_dist_model(local_model)
        self.dist_models.append(wrapped)
        self._name2distmodel[local_model.name] = wrapped

    # Validate the GPU budget before any placement work starts.
    required_gpus = self._get_total_gpu_required()
    applied_gpus = self.resouce_manager.total_gpu
    if required_gpus > applied_gpus:
        raise RuntimeError(
            f"The number of required gpus for current job is {required_gpus}, "
            f"while the number of applied gpus is {applied_gpus}")
    if applied_gpus > required_gpus:
        # Over-provisioning is allowed but worth surfacing to the operator.
        logger.warning(
            f"The number of applied gpus is {applied_gpus}, "
            f"while the number of required gpus is {required_gpus}, "
            f"there is {applied_gpus - required_gpus} wasted gpus")
    ts_checked = time.time()
    logger.info(
        f"{LOG_START} model_manager convert model to remote, "
        f"get_total_gpu_required(s):{(ts_checked-ts_start)}")

    # First pass: place each colocation group together and flag its members.
    dist_env_args = []
    for group in self.runtime_args.colocation:
        group_models = [self._name2distmodel[name] for name in group]
        self.place_models_to_remote_devices(group_models, dist_env_args)
        if len(group_models) > 1:
            pending = []
            for group_model in group_models:
                group_model.is_colocate = True
                pending.extend(group_model.set_colocate(True))
            future.wait(pending)
        placed_by_colocation.update(group)
    ts_colocated = time.time()
    logger.info(
        f"{LOG_START} model_manager convert model to remote, "
        f"set_colocate(s):{(ts_colocated-ts_checked)}")

    # Second pass: place every model not handled by a colocation group.
    for dist_model in self.dist_models:
        if dist_model.name not in placed_by_colocation:
            self.place_models_to_remote_devices([dist_model], dist_env_args)

    self.set_dist_env_concurrent(dist_env_args)
    self.converted = True
    ts_placed = time.time()
    logger.info(
        f"{LOG_START} model_manager convert model to remote, "
        f"place_models_to_remote_devices(s):{(ts_placed-ts_colocated)}")
    return self.dist_models