in src/torch_ucc_comm.cpp [134:158]
// Completion step for an out-of-band allgather request, coordinated through
// the key-value store referenced by the request.
//
// Each caller increments the shared "ag_done" counter. The caller whose
// increment brings the counter to info->size deletes the "ag_done" key and
// every "teamr<r>" key, then increments "ag_free<r>" for every rank r. All
// other callers wait on their own "ag_free<rank>" key (presumably this blocks
// until the last caller has created it -- confirm against the store's wait()
// contract). Every caller finally deletes its own "ag_free<rank>" key.
//
// @param req  Opaque request handle; it is treated as a pointer to
//             torch_ucc_oob_coll_info_t and must not be null.
// @return UCC_OK on success, or UCC_ERR_NO_MESSAGE if any store operation
//         throws a std::exception (the exception text is logged).
ucc_status_t oob_allgather_free(void* req) {
  // Converting void* back to an object pointer only needs static_cast.
  auto* info = static_cast<torch_ucc_oob_coll_info_t*>(req);
  try {
    const auto done_key = info->getKey("ag_done");
    // Keep the counter in whatever type the store returns rather than
    // forcing it into int before the comparison below.
    const auto num_done = info->store->add({done_key}, 1);
    // Built once: used both for the wait and for the final delete.
    const auto own_free_key =
        info->getKey("ag_free" + std::to_string(info->rank));

    if (num_done == info->size) {
      // Last caller to arrive: remove the shared keys first, and only then
      // signal the per-rank "ag_free" keys that the other callers wait on.
      info->store->deleteKey(done_key);
      for (int r = 0; r < info->size; ++r) {
        info->store->deleteKey(info->getKey("teamr" + std::to_string(r)));
      }
      for (int r = 0; r < info->size; ++r) {
        info->store->add({info->getKey("ag_free" + std::to_string(r))}, 1);
      }
    } else {
      info->store->wait({own_free_key});
    }
    info->store->deleteKey(own_free_key);
  } catch (const std::exception& ex) {
    // NOTE(review): the tag says "(oob_allgather)" although this is
    // oob_allgather_free; left unchanged in case log consumers match on it.
    LOG(ERROR) << "(oob_allgather) Caught exception in Store Operation .. "
               << "[" << ex.what() << "]";
    return UCC_ERR_NO_MESSAGE;
  }
  return UCC_OK;
}