in train/train.py [0:0]
def make_gossip_buffer(flags, num_agents, mng, device):
    """Build the shared gossip buffer used in GALA mode.

    Args:
        flags: Run configuration. ``num_gala_peers`` and ``sync_freq`` are
            read here, and the whole object is forwarded to
            ``learner.make_train_model``.
        num_agents: Number of agents taking part in gossip.
        mng: Object exposing ``list()``, ``Lock()`` and ``Event()``
            factories (presumably a ``multiprocessing`` manager -- confirm
            against the caller).
        device: Forwarded to ``learner.make_train_model``.

    Returns:
        ``(None, None)`` when ``num_agents <= 1``. Otherwise a tuple
        ``(gossip_buffer, references)`` where ``references`` is a list
        holding every object handed to the ``GossipBuffer`` so that the
        calling process can keep them alive.
    """
    # Nothing to gossip with when there is at most one agent.
    if num_agents <= 1:
        return None, None

    def _event_grid():
        # num_agents x num_agents grid of managed events, one managed list
        # per row, wrapped in an outer managed list.
        rows = []
        for _ in range(num_agents):
            rows.append(mng.list([mng.Event() for _ in range(num_agents)]))
        return mng.list(rows)

    # One communication graph per agent rank.
    topology = [
        Graph(rank, num_agents, peers_per_itr=flags.num_gala_peers)
        for rank in range(num_agents)
    ]

    # Model instance that serves as the parameter buffer.
    model = learner.make_train_model(flags, device)

    # Per-agent count of local iterations since the learner's last sync.
    sync_list = mng.list([0] * num_agents)

    # Per-agent locks guarding proc-safe access to the message buffers.
    buffer_locks = mng.list([mng.Lock() for _ in range(num_agents)])

    # Events signalling between processes that a message was read ...
    read_events = _event_grid()
    # ... and that a message was written.
    write_events = _event_grid()

    # The main process has to keep a reference to every shared object.
    _references = [
        topology,
        model,
        buffer_locks,
        read_events,
        write_events,
        sync_list,
    ]

    gossip_buffer = GossipBuffer(
        topology,
        model,
        buffer_locks,
        read_events,
        write_events,
        sync_list,
        sync_freq=flags.sync_freq,
    )
    return gossip_buffer, _references