in astra-sim-alibabacloud/astra-sim/workload/Layer.cc [546:831]
// Collects this layer's timing statistics into a LayerData record, adds the
// layer's contribution to the caller-owned running totals and, when
// `seprate_log` is set, prints the statistics to stdout and appends one CSV
// row through `EndToEnd`. For the last layer (layer_num == workload->SIZE - 1)
// with `seprate_log` set, it also writes the end-to-end summary and, if
// `net_work_param.visual` is enabled, a `chart.html` pie chart.
//
// Parameters:
//   run_name        label written into the CSV header and every CSV row.
//   layer_num       index of this layer; 0 triggers the CSV header (together
//                   with stat_row == 0), workload->SIZE - 1 triggers the
//                   summary.
//   total_rows      not used by this function.
//   stat_row        when 0 the layer id is written in the first CSV column.
//   detailed        not used by this function.
//   EndToEnd        CSV writer receiving the per-layer row and the summary.
//   total_compute   in/out: accumulated fwd + weight-grad + input-grad compute.
//   total_exposed   in/out: accumulated exposed communication; overwritten on
//                   the last layer when the mode is not ANALYTICAL.
//   pre_bubble_time in/out: accumulated per-layer time used for the pipeline
//                   bubble estimate; rescaled in place on the last layer.
//   DP_comm, DP_EP_comm, Expose_TP_comm, Expose_EP_comm
//                   in/out: accumulated exposed communication per group type.
//   seprate_log     enables stdout logging and all file output.
//
// Returns the LayerData record filled for this layer.
//
// NOTE: the total_waiting_for_*_comm values are scaled by (1 - overlap ratio)
// in place. They are not locals of this function, so the scaling persists
// after the call; calling report() twice on one layer would apply it twice.
LayerData Layer::report(
    std::string run_name,
    int layer_num,
    int total_rows,
    int stat_row,
    CSVWriter* detailed,
    CSVWriter* EndToEnd,
    double& total_compute,
    double& total_exposed,
    double& pre_bubble_time,
    double& DP_comm,
    double& DP_EP_comm,
    double& Expose_TP_comm,
    double& Expose_EP_comm,
    bool seprate_log) {
  LayerData layerData;
  take_stream_stats_average();

  int TP_size = workload->model_parallel_npu_group;
  int PP_size = workload->pipeline_model_parallelism;
  int vpp = workload->vpp;
  uint32_t pp_commsize = workload->pp_commsize;
  int DP_size = generator->all_gpus[0] / (TP_size * PP_size);
  int GA = workload->GA;
  int EP_size = workload->expert_parallel_npu_group;
  UserParam* param = UserParam::getInstance();

  // Size of the communicator used by each phase, chosen from its group type.
  int input_grad_group_size =
      input_grad_group_type == MockNccl::GroupType::EP ? EP_size : TP_size;
  int fwd_pass_group_size =
      fwd_pass_group_type == MockNccl::GroupType::EP ? EP_size : TP_size;
  int weight_grad_group_size =
      weight_grad_group_type == MockNccl::GroupType::DP_EP ? DP_size / EP_size
                                                           : DP_size;

  // In analytical mode the communication times are computed here and the
  // waiting times are set equal to them.
  if (param->mode == ModeType::ANALYTICAL) {
    total_fwd_comm = compute_time(fwd_pass_comm_type, TP_size, fwd_pass_group_size, fwd_pass_comm_size, fwd_pass_group_type, generator->all_gpus[0], EP_size);
    total_weight_grad_comm = compute_time(weight_grad_comm_type, TP_size, weight_grad_group_size, weight_grad_comm_size, weight_grad_group_type, generator->all_gpus[0], EP_size);
    total_input_grad_comm = compute_time(input_grad_comm_type, TP_size, input_grad_group_size, input_grad_comm_size, input_grad_group_type, generator->all_gpus[0], EP_size);
    total_waiting_for_fwd_comm = total_fwd_comm; // tp forward
    total_waiting_for_ig_comm = total_input_grad_comm; // tp backward
    total_waiting_for_wg_comm = total_weight_grad_comm;
  }

  // Every layer except the embedding layer contributes to the bubble estimate;
  // this uses the waiting times before the overlap scaling below.
  if (id != "embedding_layer") {
    pre_bubble_time += ((total_waiting_for_fwd_comm + total_forward_pass_compute + total_weight_grad_compute + total_input_grad_compute + total_waiting_for_ig_comm) / FREQ);
  }

  // Weight-grad waiting time is scaled by the DP overlap ratio for both group
  // types; only the accumulator it is added to differs.
  total_waiting_for_wg_comm *= (1 - param->net_work_param.dp_overlap_ratio);
  if (weight_grad_group_type == MockNccl::GroupType::DP_EP) {
    DP_EP_comm += (total_waiting_for_wg_comm / FREQ);
  } else {
    DP_comm += (total_waiting_for_wg_comm / FREQ);
  }

  // Forward / input-grad waiting time is scaled by the EP or TP overlap ratio
  // depending on the forward pass group type.
  if (fwd_pass_group_type == MockNccl::GroupType::EP) {
    total_waiting_for_fwd_comm *= (1 - param->net_work_param.ep_overlap_ratio);
    total_waiting_for_ig_comm *= (1 - param->net_work_param.ep_overlap_ratio);
    Expose_EP_comm += ((total_waiting_for_fwd_comm + total_waiting_for_ig_comm) / FREQ);
  } else {
    total_waiting_for_fwd_comm *= (1 - param->net_work_param.tp_overlap_ratio);
    total_waiting_for_ig_comm *= (1 - param->net_work_param.tp_overlap_ratio);
    Expose_TP_comm += ((total_waiting_for_fwd_comm + total_waiting_for_ig_comm) / FREQ);
  }

  total_compute += (total_forward_pass_compute / FREQ);
  total_compute += (total_weight_grad_compute / FREQ);
  total_compute += (total_input_grad_compute / FREQ);
  total_exposed += (total_waiting_for_fwd_comm / FREQ);
  total_exposed += (total_waiting_for_wg_comm / FREQ);
  total_exposed += (total_waiting_for_ig_comm / FREQ);

  layerData.layer_name = id;
  layerData.total_forward_pass_compute = total_forward_pass_compute / FREQ;
  layerData.total_weight_grad_compute = total_weight_grad_compute / FREQ;
  layerData.total_input_grad_compute = total_input_grad_compute / FREQ;
  layerData.total_waiting_for_fwd_comm = total_waiting_for_fwd_comm / FREQ;
  layerData.total_waiting_for_wg_comm = total_waiting_for_wg_comm / FREQ;
  layerData.total_waiting_for_ig_comm = total_waiting_for_ig_comm / FREQ;
  layerData.total_fwd_comm = total_fwd_comm / FREQ;
  layerData.total_weight_grad_comm = total_weight_grad_comm / FREQ;
  layerData.total_input_grad_comm = total_input_grad_comm / FREQ;

  // Pair every entry with its position. The index must advance per entry,
  // otherwise all entries of a list carry the same index. Queuing delays are
  // numbered from 0 and message latencies from 1, as in the original
  // initial values.
  int phase = 0;
  for (auto& qd : queuing_delay) {
    layerData.avg_queuing_delay.push_back(std::make_pair(phase, qd / FREQ));
    ++phase;
  }
  phase = 1;
  for (auto& ml : net_message_latency) {
    layerData.avg_network_message_dealy.push_back(std::make_pair(phase, ml / FREQ));
    ++phase;
  }

  if (seprate_log) {
    std::string data;
    std::pair<float, float> total_bw;
    std::cout << "*******************" << std::endl;
    std::cout << "Layer id: " << id << std::endl;
    std::cout << "Total collectives issued for this layer: " << collective_counter << std::endl;
    std::cout << "************************* Workload stats ************************* " << id << std::endl;

    // The CSV header is written once, by the first layer of the first stat row.
    if (stat_row == 0 && layer_num == 0) {
      data = "layer_name," + run_name + ",fwd compute,wg compute,ig compute,fwd exposed comm,wg exposed comm,ig exposed comm,fwd total comm,algbw,busbw,wg total comm,algbw,busbw,ig total comm,algbw,busbw";
      EndToEnd->write_line(data);
    }

    data.clear();
    if (stat_row == 0) {
      data += id;
    }
    data += "," + run_name;

    // Formats a value with no decimals; non-finite values get a marker text.
    auto format_value = [](double value) {
      std::ostringstream stream;
      if (std::isfinite(value)) {
        stream << std::fixed << std::setprecision(0) << value;
      } else {
        stream << "NaN or Inf";
      }
      return stream.str();
    };
    // Formats a bandwidth value with two decimals.
    auto format_value_bs = [](double value) {
      std::ostringstream stream;
      stream << std::fixed << std::setprecision(2) << value;
      return stream.str();
    };

    std::cout << "id: " << id << " ,Total cycles spent on fwd pass compute: "
              << format_value(total_forward_pass_compute / FREQ) << std::endl;
    data += "," + format_value(total_forward_pass_compute / FREQ);
    std::cout << "id: " << id << " ,Total cycles spent on weight grad compute: "
              << format_value(total_weight_grad_compute / FREQ) << std::endl;
    data += "," + format_value(total_weight_grad_compute / FREQ);
    std::cout << "id: " << id << " ,Total cycles spent on input grad compute: "
              << format_value(total_input_grad_compute / FREQ) << std::endl;
    data += "," + format_value(total_input_grad_compute / FREQ);
    std::cout << "id: " << id
              << " ,Total cycles spent idle waiting for fwd finish: "
              << format_value(total_waiting_for_fwd_comm / FREQ) << std::endl;
    data += "," + format_value(total_waiting_for_fwd_comm / FREQ);
    std::cout << "id: " << id
              << " ,Total cycles spent idle waiting for weight grad finish: "
              << format_value(total_waiting_for_wg_comm / FREQ) << std::endl;
    data += "," + format_value(total_waiting_for_wg_comm / FREQ);
    std::cout << "id: " << id
              << " ,Total cycles spent idle waiting for input grad finish: "
              << format_value(total_waiting_for_ig_comm / FREQ) << std::endl;
    data += "," + format_value(total_waiting_for_ig_comm / FREQ);

    // Each communication phase adds three columns: total time, then the two
    // values returned by compute_busbw (the CSV header names them algbw and
    // busbw).
    std::cout << "id: " << id
              << " ,Total cycles spent on fwd pass comm: " << format_value(total_fwd_comm / FREQ) << std::endl;
    total_bw = compute_busbw(fwd_pass_comm_type, fwd_pass_group_size, fwd_pass_comm_size, total_fwd_comm);
    data += "," + format_value(total_fwd_comm / FREQ);
    data += "," + format_value_bs(total_bw.first);
    data += "," + format_value_bs(total_bw.second);
    std::cout << "id: " << id << " ,Total cycles spent on weight grad comm: "
              << format_value(total_weight_grad_comm / FREQ) << std::endl;
    total_bw = compute_busbw(weight_grad_comm_type, weight_grad_group_size, weight_grad_comm_size, total_weight_grad_comm);
    data += "," + format_value(total_weight_grad_comm / FREQ);
    data += "," + format_value_bs(total_bw.first);
    data += "," + format_value_bs(total_bw.second);
    std::cout << "id: " << id << " ,Total cycles spent on input grad comm: "
              << format_value(total_input_grad_comm / FREQ) << std::endl;
    total_bw = compute_busbw(input_grad_comm_type, input_grad_group_size, input_grad_comm_size, total_input_grad_comm);
    data += "," + format_value(total_input_grad_comm / FREQ);
    data += "," + format_value_bs(total_bw.first);
    data += "," + format_value_bs(total_bw.second);
    // data = data + "," + format_value(((double)Sys::boostedTick()) / FREQ );
    EndToEnd->write_line(data);

    // The last layer additionally emits the end-to-end summary.
    if (layer_num == workload->SIZE - 1) {
      if (param->mode != ModeType::ANALYTICAL) {
        // Outside analytical mode, exposed time is whatever part of the
        // elapsed simulation time is not compute.
        total_exposed = (static_cast<double>(Sys::boostedTick()) / FREQ) - total_compute;
      }
      // pp commtime
      // NOTE(review): if Tick is an integer type, both the initial value and
      // the overlap-scaled value below are truncated - confirm this is
      // intended.
      Tick Expose_PP_time = (2 * vpp * GA * (pp_commsize * GBps / (param->net_work_param.pp) * 1e9) / FREQ );
      Expose_PP_time *= (1 - param->net_work_param.pp_overlap_ratio);
      // pp bubble time
      pre_bubble_time *= static_cast<double>(PP_size - 1) / (GA * vpp);
      // total time
      double total_time = total_compute + total_exposed + pre_bubble_time + Expose_PP_time;

      // Formats `value` as a percentage of total_time with two decimals.
      auto format_percentage = [&](double value) {
        double percentage = (value / total_time) * 100;
        std::ostringstream stream;
        stream << std::fixed << std::setprecision(2) << percentage;
        return stream.str() + "%";
      };

      // Use only the part of param->res after the last '/', if there is one.
      std::string file_name = param->res;
      size_t last_slash_pos = param->res.find_last_of('/');
      if (last_slash_pos != std::string::npos) {
        file_name = param->res.substr(last_slash_pos + 1);
      }

      std::string keys = "File name, Expose DP comm, Expose DP_EP comm, Expose TP comm, Expose_EP_comm, Expose_PP_comm, bubble time, total comp, total exposed comm, Total time";
      std::string values = file_name + ", " +
          format_value(DP_comm) + " (" + format_percentage(DP_comm) + "), " +
          format_value(DP_EP_comm) + " (" + format_percentage(DP_EP_comm) + "), " +
          format_value(Expose_TP_comm) + " (" + format_percentage(Expose_TP_comm) + "), " +
          format_value(Expose_EP_comm) + " (" + format_percentage(Expose_EP_comm) + "), " +
          format_value(Expose_PP_time) + " (" + format_percentage(Expose_PP_time) + "), " +
          format_value(pre_bubble_time) + " (" + format_percentage(pre_bubble_time) + "), " +
          format_value(total_compute) + " (" + format_percentage(total_compute) + "), " +
          format_value(total_exposed) + " (" + format_percentage(total_exposed) + "), " +
          format_value(total_time);
      data = keys + "\n" + values;
      EndToEnd->write_res(data);

      if (param->net_work_param.visual) {
        std::string chart_path = EndToEnd->path;
        const std::string chart_file = chart_path + "chart.html";
        std::ofstream htmlFile(chart_file);
        std::string chart_model_name = getFileName(chart_path);
        if (!htmlFile.is_open()) {
          // Report the failure instead of claiming the chart was created.
          std::cerr << "Failed to create HTML file: " << chart_file << std::endl;
        } else {
          htmlFile << "<!DOCTYPE html>\n";
          htmlFile << "<html>\n<head>\n";
          htmlFile << "<script src=\"https://cdn.jsdelivr.net/npm/chart.js\"></script>\n";
          htmlFile << "<style>\n";
          htmlFile << "body { display: flex; flex-direction: column; justify-content: center; align-items: center; height: 50vh; margin: 0; padding-top: 10%; }\n";
          htmlFile << "canvas { width: 50%; max-width: 400px; height: auto; }\n";
          htmlFile << "h2 { margin: 5px 0; }\n";
          htmlFile << "</style>\n";
          htmlFile << "</head>\n<body>\n";
          htmlFile << "<canvas id=\"myPieChart\"></canvas>\n";
          htmlFile << "<h2>Total Time: " << std::to_string(total_time) << " ns</h2>\n";
          htmlFile << "<h2>model: " << chart_model_name << " </h2>\n";
          htmlFile << "<script>\n";
          htmlFile << "var ctx = document.getElementById('myPieChart').getContext('2d');\n";
          htmlFile << "var myPieChart = new Chart(ctx, {\n";
          htmlFile << " type: 'pie',\n";
          htmlFile << " data: {\n";
          htmlFile << " labels: ['Expose DP comm', 'Expose DP_EP comm','Expose TP comm', 'Expose_EP_comm','Total compute', 'PP Bubble time', 'Expose PP comm'],\n";
          htmlFile << " datasets: [{\n";
          htmlFile << " data: ["
                   << DP_comm << ", "
                   << DP_EP_comm << ", "
                   << Expose_TP_comm << ", "
                   << Expose_EP_comm << ", "
                   << total_compute << ", "
                   << pre_bubble_time << ", "
                   << Expose_PP_time << "],\n";
          htmlFile << " backgroundColor: ['#FF6384', '#36A2EB', '#FFCE56', '#4BC0C0', '#9966FF', '#FF9F40','#FF5733'],\n";
          htmlFile << " }]\n";
          htmlFile << " },\n";
          htmlFile << " options: {\n";
          htmlFile << " responsive: true,\n";
          htmlFile << " maintainAspectRatio: true,\n";
          htmlFile << " plugins: {\n";
          htmlFile << " tooltip: {\n";
          htmlFile << " callbacks: {\n";
          htmlFile << " label: function(context) {\n";
          htmlFile << " var label = context.label || '';\n";
          htmlFile << " if (label) {\n";
          htmlFile << " label += ': ';\n";
          htmlFile << " }\n";
          htmlFile << " if (context.parsed !== null) {\n";
          htmlFile << " label += context.parsed + ' ns';\n";
          htmlFile << " }\n";
          htmlFile << " return label;\n";
          htmlFile << " }\n";
          htmlFile << " }\n";
          htmlFile << " }\n";
          htmlFile << " }\n";
          htmlFile << " }\n";
          htmlFile << "});\n";
          htmlFile << "</script>\n";
          htmlFile << "</body>\n</html>";
          htmlFile.close();
          std::cout << "HTML file created" << std::endl;
        }
      }
    }
  }
  return layerData;
}