in src/nccl_ofi_rdma.cpp [371:455]
/**
 * Write the topology to an anonymous in-memory file and export its path
 * through the `NCCL_TOPO_FILE` environment variable.
 *
 * The file is created with memfd_create() and referenced as
 * `/proc/self/fd/<fd>`. On success the stream (and therefore the
 * descriptor) is intentionally left open so the exported path stays
 * valid; it is only closed on the error paths.
 *
 * @param topo
 *	Topology handed to nccl_ofi_topo_write()
 * @return 0 on success, or if `NCCL_TOPO_FILE` was already set
 *	   negative errno value if memfd_create(), fdopen(), fflush() or
 *	   setenv() fails
 *	   -EIO if the descriptor path could not be formatted
 *	   the non-zero return code of nccl_ofi_topo_write() if it fails
 */
static int write_topo_file(nccl_ofi_topo_t *topo)
{
	/* All locals are declared up front so that no `goto` below jumps
	 * over a declaration. */
	int ret = 0;
	int saved_errno = 0;
	int path_len = 0;
	int topo_fd = -1;
	FILE *file = NULL;
	const char *preset_path = NULL;
	char filename[32];

	/**
	 * If `NCCL_TOPO_FILE` is already set, don't set it again.
	 *
	 * Note about forking behavior: in some Python applications, after calling
	 * plugin init, the process will fork(). This `NCCL_TOPO_FILE` environment
	 * variable, as well as the file descriptor it refers to, will be copied
	 * to the child process, and will continue to point to a valid topology
	 * file until the child process exits.
	 */
	preset_path = getenv("NCCL_TOPO_FILE");
	if (preset_path != NULL) {
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
			       "NCCL_TOPO_FILE environment variable is already set to %s",
			       preset_path);
		goto exit;
	}

	/* Create file descriptor */
	topo_fd = memfd_create("ofi_nccl_topo", 0);
	if (topo_fd == -1) {
		/* Capture errno before logging; the logger may overwrite it */
		saved_errno = errno;
		NCCL_OFI_WARN("Failed to create anonymous topology file. ERROR: %s",
			      strerror(saved_errno));
		ret = -saved_errno;
		goto exit;
	}

	/* Open file from file descriptor */
	file = fdopen(topo_fd, "w");
	if (file == NULL) {
		saved_errno = errno;
		NCCL_OFI_WARN("Failed to open NCCL topology file using file descriptor. ERROR %s",
			      strerror(saved_errno));
		ret = -saved_errno;
		/* No stream owns the descriptor yet, so close it directly */
		close(topo_fd);
		goto exit;
	}

	ret = nccl_ofi_topo_write(topo, file);
	if (ret) {
		NCCL_OFI_WARN("Failed to write NCCL topology using file descriptor. RC: %d", ret);
		goto error;
	}

	/* Flush buffered writes to file. We don't close the file here so that
	   the underlying descriptor remains open, which we will reference
	   in `NCCL_TOPO_FILE`. */
	if (fflush(file) == EOF) {
		saved_errno = errno;
		NCCL_OFI_WARN("Unable to flush NCCL topology file. ERROR: %s",
			      strerror(saved_errno));
		ret = -saved_errno;
		goto error;
	}

	/* Treat truncation like a formatting error: a shortened path would
	 * point NCCL at the wrong (or a non-existent) descriptor. */
	path_len = snprintf(filename, sizeof(filename), "/proc/self/fd/%d", topo_fd);
	if (path_len < 0 || (size_t)path_len >= sizeof(filename)) {
		NCCL_OFI_WARN("Errror preparing topo file name");
		ret = -EIO;
		goto error;
	}

	/* Set topology file path environment variable `NCCL_TOPO_FILE` */
	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
		      "Setting NCCL_TOPO_FILE environment variable to %s",
		      filename);
	if (setenv("NCCL_TOPO_FILE", filename, 1) != 0) {
		saved_errno = errno;
		NCCL_OFI_WARN("Unable to set NCCL_TOPO_FILE. ERROR: %s",
			      strerror(saved_errno));
		ret = -saved_errno;
		goto error;
	}

	goto exit;

 error:
	/* fclose() also closes the underlying descriptor */
	if (file) {
		fclose(file);
	}
 exit:
	return ret;
}