in astra-sim-alibabacloud/inputs/topo/gen_Topo_Template.py [0:0]
def Rail_Opti_DualToR_DualPlane(parameters):
    """Write a one-pod, rail-optimized DualToR/DualPlane topology file.

    Node IDs are assigned contiguously in this order: GPUs, NVSwitches,
    plane-1 ASWs, plane-2 ASWs, plane-1 PSWs, plane-2 PSWs.

    The file is created in the current working directory and contains:

    * line 1: total node count, ``gpu_per_server``, NVSwitch count,
      ASW + PSW count, link count and ``gpu_type``;
    * line 2: every switch node ID, each followed by a space;
    * one line per link: ``src dst bandwidth latency error_rate``.

    Every GPU is linked to each NVSwitch of its server and to one ASW in
    each plane (selected by the GPU's position inside its server and by its
    segment).  Every plane-1 ASW is linked to every plane-1 PSW, and likewise
    for plane 2.

    Args:
        parameters: dict of topology settings.  Keys read: ``gpu``,
            ``gpu_per_server``, ``nics_per_aswitch``, ``nv_switch_per_server``,
            ``asw_switch_num``, ``psw_switch_num``, ``asw_per_psw``,
            ``topology``, ``bandwidth``, ``ap_bandwidth``, ``nvlink_bw``,
            ``latency``, ``nv_latency``, ``error_rate`` and ``gpu_type``.
            The count-like values are assumed to be integers (``gpu`` and
            ``nv_switch_per_server`` are passed to ``range``).
            ``parameters['asw_switch_num']`` is overwritten in place (with a
            warning) when it does not match the value derived from the GPU
            count.

    Raises:
        ValueError: if the required number of segments exceeds what one pod
            can hold, or if NVSwitches are requested while ``gpu`` is not a
            multiple of ``gpu_per_server``.
    """
    gpu_num = parameters['gpu']
    gpu_per_server = parameters['gpu_per_server']
    nv_switch_per_server = parameters['nv_switch_per_server']

    # NVSwitches are allocated per whole server; a trailing partial server
    # would index past the end of the NVSwitch IDs while links are emitted.
    # Reject it up front so no truncated file is left behind.
    if nv_switch_per_server > 0 and gpu_num % gpu_per_server != 0:
        raise ValueError(
            "Number of GPU (" + str(gpu_num) + ") must be a multiple of "
            "gpu_per_server (" + str(gpu_per_server) + ") when "
            "nv_switch_per_server > 0")

    # One ASW per plane for each GPU position in a server => 2 * gpu_per_server.
    asw_switch_num_per_segment = gpu_per_server * 2
    gpus_per_segment = parameters['nics_per_aswitch'] * gpu_per_server
    segment_num = -(-gpu_num // gpus_per_segment)  # ceiling division

    expected_asw_num = segment_num * asw_switch_num_per_segment
    if parameters['asw_switch_num'] != expected_asw_num:
        warnings.warn(
            "Error relations between total GPU Nums and total asw_switch_num.\n"
            "The correct asw_switch_num is set to " + str(expected_asw_num))
        parameters['asw_switch_num'] = expected_asw_num
    asw_switch_num = parameters['asw_switch_num']
    print("asw_switch_num: " + str(asw_switch_num))

    if segment_num > parameters['asw_per_psw'] // gpu_per_server:
        raise ValueError("Number of GPU exceeds the capacity of Rail_Optimized_DualToR_DualPlane(One Pod)")

    pod_num = 1  # this generator only ever builds a single pod
    psw_switch_num = parameters['psw_switch_num']
    print("psw_switch_num: " + str(psw_switch_num))
    print("Creating Topology of totally " + str(segment_num) + " segment(s), totally " + str(pod_num) + " pod(s).")

    nv_switch_num = (gpu_num // gpu_per_server) * nv_switch_per_server
    nodes = gpu_num + nv_switch_num + asw_switch_num + psw_switch_num
    links = (psw_switch_num * asw_switch_num // 2   # ASW <-> PSW, per plane
             + gpu_num * 2                           # GPU <-> ASW, one per plane
             + gpu_num * nv_switch_per_server)       # GPU <-> NVSwitch

    if parameters['topology'] == 'AlibabaHPN':
        prefix = "AlibabaHPN_"
    else:
        prefix = "Rail_Opti_"
    file_name = (prefix + str(gpu_num) + "g_" + str(gpu_per_server)
                 + "gps_DualToR_DualPlane_" + str(parameters['bandwidth'])
                 + "_" + str(parameters['gpu_type']))

    # Contiguous ID ranges for each switch group, laid out right after the GPUs.
    nv_start = gpu_num
    asw_start = nv_start + nv_switch_num
    half_asw = asw_switch_num // 2
    psw_start = asw_start + asw_switch_num
    psw_plane1_count = (psw_switch_num + 1) // 2  # plane 1 takes the odd one
    nv_switch = range(nv_start, asw_start)
    asw_switch_1 = range(asw_start, asw_start + half_asw)
    asw_switch_2 = range(asw_start + half_asw, psw_start)
    psw_switch_1 = range(psw_start, psw_start + psw_plane1_count)
    psw_switch_2 = range(psw_start + psw_plane1_count, nodes)

    with open(file_name, 'w') as f:
        print(file_name)

        def emit(*fields):
            # One space-separated record per line.
            f.write(" ".join(str(field) for field in fields) + "\n")

        emit(nodes, gpu_per_server, nv_switch_num,
             asw_switch_num + psw_switch_num, links, parameters['gpu_type'])
        # Each switch ID is followed by a space (including the last one).
        f.write("".join(str(node) + " " for node in range(gpu_num, nodes)) + "\n")

        for gpu in range(gpu_num):
            server, rail = divmod(gpu, gpu_per_server)
            nv_base = server * nv_switch_per_server
            for offset in range(nv_switch_per_server):
                emit(gpu, nv_switch[nv_base + offset], parameters['nvlink_bw'],
                     parameters['nv_latency'], parameters['error_rate'])
            # ASW index: segment offset plus the GPU's position in its server.
            asw_index = (gpu // gpus_per_segment) * gpu_per_server + rail
            for plane in (asw_switch_1, asw_switch_2):
                emit(gpu, plane[asw_index], parameters['bandwidth'],
                     parameters['latency'], parameters['error_rate'])

        # asw - psw: full mesh inside each plane, no cross-plane links.
        for asw_plane, psw_plane in ((asw_switch_1, psw_switch_1),
                                     (asw_switch_2, psw_switch_2)):
            for asw in asw_plane:
                for psw in psw_plane:
                    emit(asw, psw, parameters['ap_bandwidth'],
                         parameters['latency'], parameters['error_rate'])