in scripts/simulator.cc [912:1051]
float simulate_time(const std::map<Op*, OpConfig>& global, bool print = false)
{
// We need to reset task_global_guid
task_global_guid = 0;
int comm_task_id = 0;
std::set<Task*, TaskCompare> readyQueue;
totalDataXfer = 0.0f;
float accCompTime = 0.0f;
for (int i = 0; i < op_global_guid; i++) {
Op* op = guidToOp[i];
OpConfig config = global.find(op)->second;
for (int j = 0; j < config.nParts; j++) {
tasks[i][j] = new Task(config.map[j], op->compute(config));
accCompTime += tasks[i][j]->computeTime;
}
// Build dependencies
for (int j = 0; j < op->inputTensors.size(); j++) {
Tensor t = op->inputTensors[j];
Op* preOp = t.owner;
if (preOp == NULL) continue;
OpConfig preConfig = global.find(preOp)->second;
for (int dstId = 0; dstId < config.nParts; dstId++) {
Rect dstR = op->get_tensor_shape(config, dstId, t, true/*is_input*/);
Task* dstT = tasks[i][dstId];
//printf("dstOp(%d) idx(%d)\n", op->guid, dstId);
for (int srcId = 0; srcId < preConfig.nParts; srcId++) {
Rect srcR = preOp->get_tensor_shape(preConfig, srcId, t, false/*is_input*/);
Task* srcT = tasks[preOp->guid][srcId];
//printf("srcOp(%d) idx(%d)\n", preOp->guid, srcId);
// Make sure we are adding links in order
assert(preOp->guid < i);
if (intersect(srcR, dstR) > 0) {
// Add dependency between srcT -> dstT
float cost = 0.0f;
if (srcT->workerId != dstT->workerId) {
cost = intersect(srcR, dstR) * LSTM_PER_NODE_LENGTH * 4 / bandwidth(srcT->workerId, dstT->workerId);
totalDataXfer += intersect(srcR, dstR) * LSTM_PER_NODE_LENGTH * 4;
int comm_device_id = (srcId + 1) * MAX_NUM_WORKERS + dstId;
Task* task = new Task(comm_device_id, cost);
commTasks[comm_task_id++] = task;
srcT->add_next_task(task);
task->add_next_task(dstT);
} else {
srcT->add_next_task(dstT);
}
}
}
}
}
// If No dependencies, add tasks to readyQueues
for (int j = 0; j < config.nParts; j++)
if (tasks[i][j]->preTasks.size() == 0) {
readyQueue.insert(tasks[i][j]);
}
}
std::vector<Task*> allTasks;
float gpuTime[MAX_NUM_WORKERS * (MAX_NUM_WORKERS + 1)];
for (int i = 0; i < MAX_NUM_WORKERS * (MAX_NUM_WORKERS + 1); i++) {
gpuTime[i] = 0.0f;
}
while (!readyQueue.empty()) {
// Find the task with earliest start time
Task* t = *readyQueue.begin();
allTasks.push_back(t);
readyQueue.erase(readyQueue.begin());
int gpuId = t->workerId;
gpuTime[gpuId] = std::max(gpuTime[gpuId], t->readyTime);
t->startTime = gpuTime[gpuId];
gpuTime[gpuId] += t->computeTime;
for (int i = 0; i < t->nextTasks.size(); i++) {
Task* next = t->nextTasks[i].task;
float nextReadyTime = t->startTime + t->computeTime;
next->readyTime = std::max(next->readyTime, nextReadyTime);
next->counter ++;
if (next->counter == next->preTasks.size()) {
// The task is ready
readyQueue.insert(next);
}
}
if (print)
printf("[%zu] gpu(%.2lf %.2lf %.2lf %.2lf)\n", allTasks.size(), gpuTime[0], gpuTime[1], gpuTime[2], gpuTime[3]);
}
float totalTime = 0.0f;
for (int i = 0; i < MAX_NUM_WORKERS; i++)
totalTime = std::max(totalTime, gpuTime[i]);
assert(allTasks.size() == task_global_guid);
// Add update cost
for (int i = 0; i < parameters.size(); i++) {
assert(parameters[i].size() > 0);
std::vector<Op*> opList = parameters[i];
std::vector<OpConfig> configs;
std::map<Op*, OpConfig>::const_iterator it;
for (int j = 0; j < opList.size(); j++) {
it = global.find(opList[j]);
configs.push_back(it->second);
}
float updateCost = opList[0]->update(configs);
totalTime += updateCost;
}
#ifdef VERBOSE
for (int i = 0; i < op_global_guid; i++) {
OpConfig c = global.find(guidToOp[i])->second;
printf("c[%d] dim(%d %d) map(%d %d %d %d)\n", i, c.dim[0], c.dim[1], c.map[0], c.map[1], c.map[2], c.map[3]);
}
for (int i = 0; i < op_global_guid; i++) {
OpConfig config = global.find(guidToOp[i])->second;
for (int j = 0; j < config.nParts; j++) {
Task* t = tasks[i][j];
printf("t[%d] ready(%.2lf) start(%.2lf) compute(%.2lf) next(", t->guid, t->readyTime, t->startTime, t->computeTime);
for (int k = 0; k < t->nextTasks.size(); k++)
printf("%d %.2lf, ", t->nextTasks[k].task->guid, t->nextTasks[k].cost);
printf(")\n");
}
}
for (int i = 0; i < allTasks.size(); i++)
for (int j = i+1; j < allTasks.size(); j++)
if (allTasks[i]->guid == allTasks[j]->guid) {
assert(allTasks[i]->startTime + allTasks[i]->computeTime <= allTasks[j]->startTime);
}
#endif
#ifdef NOT_USE_SIMULATE_OPT
for (int i = 0; i < op_global_guid; i++) {
OpConfig config = global.find(guidToOp[i])->second;
for (int j = 0; j < config.nParts; j++)
delete tasks[i][j];
}
for (int i = 0; i < comm_task_id; i++)
delete commTasks[i];
#endif
if (print) {
printf("totalTime = %.2lf\n", totalTime);
printf("totalDataXfer = %.2lf\n", totalDataXfer);
printf("totalCompTime = %.2lf\n", accCompTime);
}
//return totalTime + costDataXfer * DATA_XFER_FACTOR;
return totalTime;
}