def generate_topo_file()

in azure-slurm/slurmcc/topology.py [0:0]


    def generate_topo_file(self) -> None:
        """
        Generate the topology file for SHARP (Scalable Hierarchical Aggregation and Reduction Protocol).

        Builds a ``sharp_cmd topology`` shell command and runs it through ``slutil.srun`` on the
        first host in ``self.hosts`` within ``self.partition``. On success the command's stdout
        is logged at debug level.

        Environment Variables:
            SHARP_CMD: Optional, read from this process's environment. If present (even when
                empty) it is used as the base path for the SHARP command instead of
                ``self.sharp_cmd_path``.
            SHARP_SMX_UCX_INTERFACE: Set to "mlx5_ib0:1" inline, as a prefix of the command
                that is executed.

        Attributes:
            sharp_cmd_path (str): Base path for the SHARP command when SHARP_CMD is not set.
            guids_file (str): Path passed as ``--guids_file``.
            topo_file (str): Path passed as ``--topology_file``.
            hosts: Only the first entry is used as the host to run the command on.
            partition: Passed to ``slutil.srun`` as ``partition``.

        Raises:
            SystemExit: With the command's return code when ``slutil.srun`` raises
                ``slutil.SrunExitCodeException``, or with status 1 when it raises
                ``subprocesslib.TimeoutExpired``.
            Any other exception raised by ``slutil.srun`` propagates unchanged.
        """
        # An explicit SHARP_CMD always wins; membership (not truthiness) is tested so that an
        # empty value is still honoured, exactly as before.
        if "SHARP_CMD" in os.environ:
            sharp_base = os.environ["SHARP_CMD"]
        else:
            sharp_base = self.sharp_cmd_path

        # The base path is concatenated as-is, so it must already end with a path separator.
        command = (
            "SHARP_SMX_UCX_INTERFACE=mlx5_ib0:1 "
            f"{sharp_base}sharp/bin/sharp_cmd topology "
            "--ib-dev mlx5_ib0:1 "
            f"--guids_file {self.guids_file} "
            f"--topology_file {self.topo_file}"
        )

        host = self.hosts[0]
        try:
            # shell=True is required because the command carries an inline VAR=value prefix.
            output = slutil.srun([host], command, shell=True, partition=self.partition)
        except slutil.SrunExitCodeException as e:
            log.error("Error running sharp_command on host %s", host)
            if e.stderr_content:
                log.error(e.stderr_content)
            log.error(e.stderr)
            sys.exit(e.returncode)
        except subprocesslib.TimeoutExpired:
            # Previously this path exited silently; record why the process is terminating.
            log.error("Timed out running sharp_command on host %s", host)
            sys.exit(1)
        else:
            log.debug(output.stdout)