cli/dependencies.py (169 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common dependencies for configcheck."""
import re
import shlex
import subprocess

import config
import dependency_version_parser
import local_dependency_version_parser
def _parse_driver_version(name, cmd_result: str) -> config.DependencyConfig:
  """Extracts the single driver version reported in the command result.

  The output is split on whitespace and de-duplicated; every token must be the
  same version string.

  Args:
    name: Dependency name to record on the returned config.
    cmd_result: Raw command output holding whitespace-separated version
      strings.

  Returns:
    A DependencyConfig carrying `name` and the one distinct version found.

  Raises:
    ValueError: If the output contains zero, or more than one, distinct
      version.
  """
  # split() without a separator already ignores leading/trailing whitespace.
  distinct_versions = set(cmd_result.split())

  if len(distinct_versions) == 1:
    (driver_version,) = distinct_versions
    return config.DependencyConfig(name=name, version=driver_version)

  # The mismatch is both printed and raised.
  print(f'Expected exactly one driver version, got: {distinct_versions}')
  raise ValueError(
      f'Expected exactly one driver version, got: {distinct_versions}'
  )
def _parse_generic_version(name, cmd_result: str) -> config.DependencyConfig:
  """Builds a dependency config from a plain X.Y....Z style version output.

  Args:
    name: Dependency name to record on the returned config.
    cmd_result: Raw command output containing the version.

  Returns:
    A DependencyConfig whose version is `cmd_result` with every character
    other than ASCII letters, digits and dots removed (this also drops
    whitespace and newlines).
  """
  cleaned_version = re.sub(r'[^A-Za-z0-9\.]+', '', cmd_result)
  return config.DependencyConfig(name=name, version=cleaned_version)
def _parse_nccl_configs(name, cmd_result: str) -> config.DependencyConfig:
  """Parses the nccl configs from the command result.

  Args:
    name: Dependency name to record on the returned config.
    cmd_result: Command output made of `KEY=VALUE` lines, one setting per
      line.

  Returns:
    A DependencyConfig whose `config_settings` maps each KEY to its VALUE.
    The `version` field is a fixed placeholder string because the individual
    settings are what gets compared.
  """
  configs = {}
  for line in cmd_result.strip().split('\n'):
    # Split on the first '=' only, so values that themselves contain '='
    # are kept intact instead of being truncated.
    key, separator, value = line.partition('=')
    if not separator:
      # No '=' on this line (e.g. empty command output): there is no setting
      # to record, so skip it rather than fail with an IndexError.
      continue
    configs[key] = value
  return config.DependencyConfig(
      name=name,
      version='Abridged. See diffs for details.',
      config_settings=configs,
  )
# Location of the nvidia-smi binary used by the GPU driver and CUDA parsers.
_NVIDIA_SMI_PATH = '../.././home/kubernetes/bin/nvidia/bin/nvidia-smi'

# Static parsers that need no per-node context. Each entry pairs a command,
# given as a token list in which '|' tokens separate pipeline stages, with the
# function that turns its output into a DependencyConfig.
# NOTE(review): the '|' tokens suggest the tokens are joined and run through a
# shell by DependencyVersionParser -- confirm against that class.
DEPENDENCY_PARSERS = [
    # Digits-and-dots portion of the BUILD_ID line in /etc/os-release.
    dependency_version_parser.DependencyVersionParser(
        name='cosVersion',
        cmd=[
            'cat', '/etc/os-release',
            '|',
            'grep', 'BUILD_ID',
            '|',
            'egrep', '-o', r"'[0-9\\.]+'",
        ],
        parse_version_fn=_parse_generic_version,
    ),
    # Driver version as reported by nvidia-smi's CSV query output.
    dependency_version_parser.DependencyVersionParser(
        name='gpuDriverVersion',
        cmd=[
            _NVIDIA_SMI_PATH,
            '--query-gpu=driver_version',
            '--format=csv,noheader',
        ],
        parse_version_fn=_parse_driver_version,
    ),
    # Third line of the nvidia-smi output, reduced to the text that follows
    # "CUDA Version:" up to the next '|'.
    dependency_version_parser.DependencyVersionParser(
        name='cudaVersion',
        cmd=[
            _NVIDIA_SMI_PATH,
            '|',
            'sed', '-n', '"3p"',
            '|',
            'sed', r'"s/.*CUDA Version: \+\(.*\)|.*/\1/"',
        ],
        parse_version_fn=_parse_generic_version,
    ),
    # Last libnccl.so entry listed by ldconfig, keeping only what follows
    # ".so.".
    dependency_version_parser.DependencyVersionParser(
        name='ncclVersion',
        cmd=[
            'ldconfig', '-v',
            '|',
            'grep', '"libnccl.so"',
            '|',
            'tail', '-n1',
            '|',
            'sed', '-r', r'"s/^.*\.so\.//"',
        ],
        parse_version_fn=_parse_generic_version,
    ),
]
def _parse_nccl_plugin_version(name, file_path: str) -> config.DependencyConfig:
  """Parses the nccl plugin version from the symbols of a shared library.

  Runs `nm -gD <file_path> | grep ncclNetPlugin_v` and uses the identifier at
  the very end of that output as the version.

  Args:
    name: Dependency name to record on the returned config.
    file_path: Path of the plugin shared library to inspect.

  Returns:
    A DependencyConfig carrying `name` and the symbol name found at the end
    of the filtered `nm` output.

  Raises:
    subprocess.CalledProcessError: If the shell pipeline exits with a
      non-zero status.
    ValueError: If the pipeline output does not end with an identifier.
  """
  # The path is shell-quoted so that spaces or shell metacharacters in it
  # cannot split or alter the pipeline; ordinary paths are left unchanged.
  nm_pipeline = ' '.join(
      ['nm', '-gD', shlex.quote(file_path), '|', 'grep', 'ncclNetPlugin_v']
  )
  plugin_version = subprocess.run(
      nm_pipeline,
      shell=True,
      check=True,
      capture_output=True,
      text=True,
  ).stdout
  # Without re.MULTILINE, '$' anchors at the end of the whole output, so the
  # identifier on the last line is the one that is captured.
  plugin_version_match = re.search(r'\b([a-zA-Z0-9_]+)$', plugin_version)
  if not plugin_version_match:
    raise ValueError(
        f'Failed to parse nccl plugin version from: {plugin_version}'
    )
  return config.DependencyConfig(
      name=name,
      version=plugin_version_match.group(1),
  )
def get_dynamic_dependency_parsers(
    node_name: str,
    zone: str,
    pod_name: str | None = None,
    workload_container: str | None = None,
) -> list[dependency_version_parser.DependencyVersionParser]:
  """Builds the parsers whose commands depend on node or workload context.

  Unlike the static parsers, these need caller-supplied context: the NCCL
  plugin version parser is built from the node name and zone, and the NCCL
  configs are read from the environment of the workload container.

  Args:
    node_name: The name of the node.
    zone: The zone of the node.
    pod_name: The name of the pod. Needed, together with
      `workload_container`, for the NCCL configs to be fetched.
    workload_container: The name of the workload container to fetch configs
      from. If not specified, NCCL configs will not be fetched.

  Returns:
    A list with, in order: the NCCL plugin version parser, a parser echoing
    the workload container name, and the NCCL configs parser (or a
    placeholder parser that echoes an error message).
  """

  def _parse_local_plugin(name, file_path):
    # The file_path handed in is not used; the plugin is always read from the
    # per-node directory under LOCAL_FILE_PATH.
    return _parse_nccl_plugin_version(
        name,
        f'{local_dependency_version_parser.LOCAL_FILE_PATH}/{node_name}/libnccl-net.so',
    )

  dynamic_parsers = [
      local_dependency_version_parser.LocalDependencyVersionParser(
          dep_name='ncclPluginVersion',
          node_name=node_name,
          zone=zone,
          remote_file_path='/home/kubernetes/bin/nvidia/lib64/libnccl-net.so',
          parse_version_fn=_parse_local_plugin,
      ),
      dependency_version_parser.DependencyVersionParser(
          name='Workload Container',
          cmd=['echo', workload_container or 'None Found'],
      ),
  ]

  if not (workload_container and pod_name):
    # NOTE(review): this branch is also taken when only pod_name is missing,
    # although the message mentions just the workload container.
    dynamic_parsers.append(
        dependency_version_parser.DependencyVersionParser(
            name='ncclConfigs',
            cmd=['echo', 'Error: Workload Container required'],
        )
    )
    return dynamic_parsers

  # Dump the container's environment via kubectl exec and keep only the
  # NCCL / LD_LIBRARY related variables.
  nccl_env_cmd = [
      'echo', '-n', pod_name,
      '|',
      'xargs', '-I', '{}',
      'kubectl', 'exec', '{}', '-c', f'{workload_container}', '--', 'env',
      '|',
      'grep', '-E', '"NCCL|LD_LIBRARY"',
  ]
  dynamic_parsers.append(
      dependency_version_parser.DependencyVersionParser(
          name='ncclConfigs',
          cmd=nccl_env_cmd,
          parse_version_fn=_parse_nccl_configs,
      )
  )
  return dynamic_parsers