// astra-sim-alibabacloud/astra-sim/network_frontend/phynet/SimAiMain.cc
/*
*Copyright (c) 2024, Alibaba Group;
*Licensed under the Apache License, Version 2.0 (the "License");
*you may not use this file except in compliance with the License.
*You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*Unless required by applicable law or agreed to in writing, software
*distributed under the License is distributed on an "AS IS" BASIS,
*WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*See the License for the specific language governing permissions and
*limitations under the License.
*/
#include <getopt.h>
#include <unistd.h>
#include <mpi.h>
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "SimAiPhyNetwork.h"
#include "PhySimAi.h"
#include "SimAiEntry.h"
#include "astra-sim/system/AstraComputeAPI.hh"
#include "astra-sim/system/Sys.hh"
#include "astra-sim/system/RecvPacketEventHadndlerData.hh"
#include "astra-sim/system/MockNcclLog.h"
#include "astra-sim/system/BootStrapnet.hh"
#include "astra-sim/system/PhyMultiThread.hh"
#include "astra-sim/system/Common.hh"
#ifdef PHY_RDMA
#include "astra-sim/system/SimAiFlowModelRdma.hh"
#endif
#define RESULT_PATH "/etc/astra-sim/results/ncclFlowModel_"
using namespace std;
extern int local_rank;
extern AstraSim::Sys* global_sys;
extern FlowPhyRdma flow_rdma;
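
// Command-line configuration for the phynet frontend. The defaults describe a
// single 8-GPU A100 server running the microAllReduce workload.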
struct user_param {
int thread;
int gpus;
string workload;
int comm_scale;
GPUType gpu_type;
int nvswitch_num;
int gpus_per_server;
int gid_index;
    user_param() {
        thread = 1;
        gpus = 8;
        workload = "microAllReduce.txt";
        comm_scale = 1;
        gpu_type = GPUType::A100;
        nvswitch_num = 1;
        gpus_per_server = 8;
        gid_index = 0;
    }
    ~user_param() {}
    // Initialize every member so this constructor never leaves fields such as
    // gpu_type or gid_index undefined.
    user_param(int _thread, int _gpus, string _workload, int _comm_scale = 1)
        : thread(_thread),
          gpus(_gpus),
          workload(_workload),
          comm_scale(_comm_scale),
          gpu_type(GPUType::A100),
          nvswitch_num(1),
          gpus_per_server(8),
          gid_index(0) {}
};
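
// Parse command-line flags with getopt_long; returns non-zero when the caller
// should exit early (e.g. right after printing the -h help text).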
static int user_param_parse(int argc, char* argv[], struct user_param* user_param) {
    int opt;
    static struct option long_options[] = {
        {"help", no_argument, 0, 'h'},
        {"workloads", required_argument, 0, 'w'},
        {"gpus", required_argument, 0, 'g'},
        {"comm_scale", required_argument, 0, 's'},
        {"gid_index", required_argument, 0, 'i'},
        {0, 0, 0, 0}};
    while ((opt = getopt_long(argc, argv, "ht:w:g:s:i:", long_options, nullptr)) != -1) {
        switch (opt) {
        case 'h':
            std::cout << "-t number of threads, default 1" << std::endl;
            std::cout << "-w workload, default microAllReduce.txt" << std::endl;
            std::cout << "-g number of gpus, default 8" << std::endl;
            std::cout << "-s comm_scale, default 1" << std::endl;
            std::cout << "-i rdma gid_index, default 0" << std::endl;
            return 1;
        case 't':
            user_param->thread = stoi(optarg);
            break;
        case 'w':
            user_param->workload = optarg;
            break;
        case 'g':
            user_param->gpus = stoi(optarg);
            // Clamp to at least one full server (gpus_per_server defaults to 8).
            if (user_param->gpus <= 8) {
                user_param->gpus = 8;
            }
            break;
        case 's':
            user_param->comm_scale = stoi(optarg);
            break;
        case 'i':
            user_param->gid_index = stoi(optarg);
            break;
        default:
            break;
        }
    }
    return 0;
}
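
// Entry point: bootstrap the multi-process launch, parse options, optionally
// initialize RDMA, build the GPU/NVSwitch topology, and drive the workload
// through AstraSim::Sys on the physical network backend.
// Illustrative launch (binary name and rank count are assumptions, not fixed
// by this file):
//   mpirun -np 8 ./SimAiPhyNet -w microAllReduce.txt -g 8 -i 0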
int main(int argc,char *argv[]){
BootStrapNet(argc,argv);
pid_t pid = getpid();
MockNcclLog::set_log_name("SimAi_"+to_string(local_rank)+".log");
MockNcclLog* NcclLog = MockNcclLog::getInstance();
NcclLog->writeLog(NcclLogLevel::DEBUG," Local rank %d PID %d ",local_rank,pid);
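    // Parse command-line options; exit early on failure or after -h.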
    struct user_param user_param;
    if (user_param_parse(argc, argv, &user_param)) {
        return -1;
    }
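    // In PHY_RDMA builds, bring up the flow-model RDMA backend on the
    // user-selected GID index before any traffic is issued.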
#ifdef PHY_RDMA
flow_rdma = FlowPhyRdma(user_param.gid_index);
flow_rdma.ibv_init();
#endif
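    // Install the SimAI network callbacks used during simulation.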
set_simai_network_callback();
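    // Logical topology: one flattened dimension containing all GPU ranks
    // (NVSwitch nodes are appended below), with one queue per dimension.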
    std::vector<int> physical_dims = {user_param.gpus};
    std::vector<int> NVswitchs;
    std::vector<int> queues_per_dim = {1};
    std::map<int, int> node2nvswitch;
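    // Map each GPU rank to the NVSwitch id of its server; NVSwitch ids
    // start right after the GPU ranks.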
    for (int i = 0; i < user_param.gpus; ++i) {
        node2nvswitch[i] = user_param.gpus + i / user_param.gpus_per_server;
    }
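    // NVSwitch nodes map to themselves and are collected for Sys.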
    for (int i = user_param.gpus; i < user_param.gpus + user_param.nvswitch_num; ++i) {
        node2nvswitch[i] = i;
        NVswitchs.push_back(i);
    }
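    // The flattened dimension also has to count the NVSwitch nodes.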
physical_dims[0] += user_param.nvswitch_num;
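    // Build the per-rank physical network frontend and the AstraSim system
    // layer that replays the workload on top of it.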
SimAiPhyNetWork* phy_network = new SimAiPhyNetWork(local_rank);
global_sys = new AstraSim::Sys(
phy_network,
nullptr,
local_rank,
0,
1,
physical_dims,
queues_per_dim,
"",
user_param.workload,
user_param.comm_scale,
1,
1,
1,
0,
RESULT_PATH,
"phynet_test",
true,
false,
user_param.gpu_type,
{user_param.gpus},
NVswitchs,
user_param.gpus_per_server
);
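    // Tell the system layer which NVSwitch serves this rank and how many GPUs exist.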
global_sys->nvswitch_id = node2nvswitch[local_rank];
global_sys->num_gpus = user_param.gpus;
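    // Start the workload, drive the physical-network event loop to completion,
    // then shut the backend and MPI down.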
global_sys->workload->fire();
PhyNetSim::Run();
PhyNetSim::Stop();
notify_all_thread_finished();
PhyNetSim::Destory();
MPI_Finalize();
return 0;
}