in astra-sim-alibabacloud/inputs/topo/gen_Topo_Template.py [0:0]
def Rail_Opti_SingleToR(parameters):
    """Write a one-pod, rail-optimized single-ToR topology file.

    The output file is created in the current working directory. Its name is
    ``"Spectrum-X_"`` (when ``parameters['topology'] == 'Spectrum-X'``) or
    ``"Rail_Opti_SingleToR_"`` followed by
    ``<gpu>g_<gpu_per_server>gps_<bandwidth>_<gpu_type>``.

    File layout, one record per line:

    1. Header: total node count, ``gpu_per_server``, NVSwitch count,
       ASW + PSW count, link count, ``gpu_type``.
    2. Ids of every switch node, each followed by a single space.
    3. For each GPU: one link per NVSwitch of its server, then one link to
       its access switch (ASW).
    4. One link for every (ASW, PSW) pair.

    Node ids are assigned contiguously: GPUs first, then NVSwitches, ASWs
    and PSWs.

    Args:
        parameters: dict providing the integer keys ``gpu``,
            ``gpu_per_server``, ``nics_per_aswitch``, ``asw_switch_num``,
            ``asw_per_psw``, ``psw_switch_num``, ``nv_switch_per_server`` and
            the keys ``topology``, ``bandwidth``, ``gpu_type``, ``nvlink_bw``,
            ``nv_latency``, ``latency``, ``error_rate``, ``ap_bandwidth``.
            ``bandwidth`` and ``gpu_type`` must be strings because they are
            concatenated into the file name. ``asw_switch_num`` is
            overwritten in place (with a warning) when it does not match the
            value derived from the GPU count.

    Raises:
        ValueError: if the required number of segments exceeds what one pod
            can hold, or if NVSwitches are requested while ``gpu`` is not a
            multiple of ``gpu_per_server``.
    """
    gpu_num = parameters['gpu']
    # Each segment holds one ASW per GPU slot of a server.
    asw_switch_num_per_segment = parameters['gpu_per_server']
    gpus_per_segment = parameters['nics_per_aswitch'] * asw_switch_num_per_segment

    # Ceiling division: a partially filled segment still needs its switches.
    segment_num, leftover_gpus = divmod(gpu_num, gpus_per_segment)
    if leftover_gpus:
        segment_num += 1

    expected_asw_num = segment_num * asw_switch_num_per_segment
    if parameters['asw_switch_num'] != expected_asw_num:
        warnings.warn(
            "Error relations between total GPU Nums and total aws_switch_num.\n "
            "The correct asw_switch_num is set to " + str(expected_asw_num)
        )
        parameters['asw_switch_num'] = expected_asw_num
    print("asw_switch_num: " + str(parameters['asw_switch_num']))

    if segment_num > parameters['asw_per_psw'] // asw_switch_num_per_segment:
        raise ValueError("Number of GPU exceeds the capacity of Rail_Optimized_SingleToR(One Pod)")
    pod_num = 1
    print("psw_switch_num: " + str(parameters['psw_switch_num']))
    print("Creating Topology of totally " + str(segment_num) + " segment(s), totally " + str(pod_num) + " pod(s).")

    gpu_per_server = parameters['gpu_per_server']
    nv_switch_per_server = parameters['nv_switch_per_server']
    # A trailing partial server would get no NVSwitches (the count is floored),
    # which previously surfaced as an IndexError after the output file had
    # already been created and partly written. Reject it up front instead.
    if nv_switch_per_server > 0 and gpu_num % gpu_per_server != 0:
        raise ValueError(
            "Number of GPU (" + str(gpu_num) + ") must be a multiple of gpu_per_server ("
            + str(gpu_per_server) + ") when nv_switch_per_server > 0"
        )

    asw_num = parameters['asw_switch_num']
    psw_num = parameters['psw_switch_num']
    nv_switch_num = (gpu_num // gpu_per_server) * nv_switch_per_server
    switch_nodes = psw_num + asw_num + nv_switch_num
    nodes = gpu_num + switch_nodes
    # Integer count of exactly the links written below: every ASW-PSW pair,
    # one GPU-ASW link per GPU and nv_switch_per_server NVLink links per GPU.
    links = psw_num * asw_num + gpu_num + gpu_num * nv_switch_per_server

    if parameters['topology'] == 'Spectrum-X':
        name_prefix = "Spectrum-X_"
    else:
        name_prefix = "Rail_Opti_SingleToR_"
    file_name = (name_prefix + str(gpu_num) + "g_" + str(gpu_per_server) + "gps_"
                 + parameters['bandwidth'] + "_" + parameters['gpu_type'])

    def link_line(*fields):
        # One space-separated record terminated by a newline.
        return " ".join(str(field) for field in fields) + "\n"

    # Switch ids follow the GPU ids: NVSwitches, then ASWs, then PSWs.
    switch_ids = list(range(gpu_num, nodes))
    nv_switch = switch_ids[:nv_switch_num]
    asw_switch = switch_ids[nv_switch_num:nv_switch_num + asw_num]
    psw_switch = switch_ids[nv_switch_num + asw_num:]

    with open(file_name, 'w') as f:
        print(file_name)
        out_lines = [
            link_line(nodes, gpu_per_server, nv_switch_num,
                      switch_nodes - nv_switch_num, links, parameters['gpu_type']),
            # Every id is followed by a space (including the last one).
            "".join(str(switch_id) + " " for switch_id in switch_ids) + "\n",
        ]

        for gpu_id in range(gpu_num):
            server_idx, slot = divmod(gpu_id, gpu_per_server)
            # GPU -> each NVSwitch of its own server.
            first_nv = server_idx * nv_switch_per_server
            for offset in range(nv_switch_per_server):
                out_lines.append(link_line(
                    gpu_id, nv_switch[first_nv + offset], parameters['nvlink_bw'],
                    parameters['nv_latency'], parameters['error_rate']))
            # GPU -> the ASW of its segment that matches its slot in the server.
            segment_idx = gpu_id // gpus_per_segment
            asw_id = asw_switch[segment_idx * asw_switch_num_per_segment + slot]
            out_lines.append(link_line(
                gpu_id, asw_id, parameters['bandwidth'],
                parameters['latency'], parameters['error_rate']))

        # Full mesh between ASWs and PSWs.
        for asw_id in asw_switch:
            for psw_id in psw_switch:
                out_lines.append(link_line(
                    asw_id, psw_id, parameters['ap_bandwidth'],
                    parameters['latency'], parameters['error_rate']))

        f.writelines(out_lines)