in src/nccl_healthcheck/nccl_startup.py [0:0]
def parse_nccl_result(test_result: str) -> NcclResults:
    """Extract per-message-size rows and the average bus bandwidth from NCCL output.

    Each line of the output is stripped and split on whitespace. A line
    becomes an ``NcclResult`` only when all of the following hold:

    * it is non-empty and does not start with ``#``;
    * it has exactly ``_NCCL_RESULT_LENGTH`` whitespace-separated fields;
    * its type field equals ``"float"``;
    * its message size maps to a non-``None`` entry in at least one of
      ``MESSAGE_SIZE_TO_BANDWIDTH_LABEL`` / ``MESSAGE_SIZE_TO_LATENCY_LABEL``.

    The in-place bandwidth and in-place time fields are parsed as floats and
    then truncated to ints.

    The average bandwidth is taken from the first
    ``# Avg bus bandwidth : <digits>`` occurrence in the full text; only the
    leading run of digits is captured, so any fractional part is dropped.

    Args:
        test_result (str): Raw text output of the NCCL test.

    Returns:
        NcclResults: The collected rows, the average bandwidth (or
        ``_NO_BANDWIDTH_VALUE`` when the summary line is absent), and a
        ``success`` flag that is True only when the summary line was found.

    Raises:
        ValueError: If a numeric field of an accepted row cannot be parsed
            as a float.
    """
    results = []
    for raw_line in test_result.splitlines():
        stripped = raw_line.strip()
        # Blank lines and '#'-prefixed lines never hold a data row.
        if stripped == "" or stripped.startswith("#"):
            continue

        fields = stripped.split()
        # The length check comes first so the type index is only read on
        # rows that have the expected number of fields.
        is_data_row = (
            len(fields) == _NCCL_RESULT_LENGTH
            and fields[_NCCL_RESULT_TYPE_INDEX] == "float"
        )
        if not is_data_row:
            continue

        size = fields[_NCCL_RESULT_MESSAGE_SIZE_INDEX]
        # Keep the row only if this message size is labelled in either table.
        label_tables = (
            MESSAGE_SIZE_TO_BANDWIDTH_LABEL,
            MESSAGE_SIZE_TO_LATENCY_LABEL,
        )
        if not any(table.get(size) is not None for table in label_tables):
            continue

        # Values are reported as floats; downstream logic works with ints,
        # so both are truncated here.
        results.append(
            NcclResult(
                message_size=size,
                in_place_bw=int(float(fields[_NCCL_RESULT_IN_PLACE_BW_INDEX])),
                in_place_time=int(
                    float(fields[_NCCL_RESULTS_IN_PLACE_TIME_INDEX])
                ),
            )
        )

    # The summary line is searched for in the whole text rather than per line.
    match = re.search(r"# Avg bus bandwidth\s*:\s*(\d+)", test_result)
    success = match is not None
    if success:
        bandwidth = int(match.group(1))
        print(f"Found bandwidth: {bandwidth}")
    else:
        bandwidth = _NO_BANDWIDTH_VALUE

    print(f"results: {results}")
    return NcclResults(
        avg_bandwidth=bandwidth,
        results=results,
        success=success,
    )