in dataflux_pytorch/benchmark/checkpointing/simulated/benchmark.py [0:0]
def configure_master_addr(max_coordinator_lookups: int = 50,
                          retry_delay_seconds: float = 5) -> None:
    """Resolve the coordinator's IP address and export it as MASTER_ADDR.

    The host name is read from the ``COORDINATOR_ADDRESS`` environment
    variable and resolved with ``socket.gethostbyname``. A lookup that fails
    with ``socket.gaierror`` is retried after a delay, up to
    ``max_coordinator_lookups`` attempts in total.

    ``MASTER_ADDR`` is always written: it holds the resolved IP address on
    success, and an empty string when ``COORDINATOR_ADDRESS`` is unset or
    every lookup attempt failed.

    Args:
        max_coordinator_lookups: Maximum number of resolution attempts.
        retry_delay_seconds: Seconds to sleep between failed attempts.
    """
    coordinator_address = os.environ.get("COORDINATOR_ADDRESS")
    coordinator_ip_address = ""
    if coordinator_address is not None:
        for lookup_attempt in range(1, max_coordinator_lookups + 1):
            try:
                coordinator_ip_address = socket.gethostbyname(
                    coordinator_address)
                break
            except socket.gaierror:
                print(
                    f"Failed to recognize coordinator address {coordinator_address} on"
                    f" attempt {lookup_attempt}, retrying...")
                # Only wait when another attempt will actually follow.
                if lookup_attempt < max_coordinator_lookups:
                    time.sleep(retry_delay_seconds)
        else:
            # Reached only when no attempt succeeded (the loop never hit
            # `break`); say so explicitly instead of failing silently.
            print(
                f"Giving up on coordinator address {coordinator_address} after"
                f" {max_coordinator_lookups} attempts; MASTER_ADDR will be empty.")
    print(f"Coordinator IP address: {coordinator_ip_address}")
    os.environ["MASTER_ADDR"] = str(coordinator_ip_address)