float simulate_time()

in scripts/simulator.cc [912:1051]


float simulate_time(const std::map<Op*, OpConfig>& global, bool print = false)
{
  // We need to reset task_global_guid
  task_global_guid = 0;
  int comm_task_id = 0;
  std::set<Task*, TaskCompare> readyQueue;
  totalDataXfer = 0.0f;
  float accCompTime = 0.0f;
  for (int i = 0; i < op_global_guid; i++) {
    Op* op = guidToOp[i];
    OpConfig config = global.find(op)->second;
    for (int j = 0; j < config.nParts; j++) {
      tasks[i][j] = new Task(config.map[j], op->compute(config));
      accCompTime += tasks[i][j]->computeTime;
    }
    // Build dependencies
    for (int j = 0; j < op->inputTensors.size(); j++) {
      Tensor t = op->inputTensors[j];
      Op* preOp = t.owner;
      if (preOp == NULL) continue;
      OpConfig preConfig = global.find(preOp)->second;
      for (int dstId = 0; dstId < config.nParts; dstId++) {
        Rect dstR = op->get_tensor_shape(config, dstId, t, true/*is_input*/);
        Task* dstT = tasks[i][dstId];
        //printf("dstOp(%d) idx(%d)\n", op->guid, dstId);
        for (int srcId = 0; srcId < preConfig.nParts; srcId++) {
          Rect srcR = preOp->get_tensor_shape(preConfig, srcId, t, false/*is_input*/);
          Task* srcT = tasks[preOp->guid][srcId];
          //printf("srcOp(%d) idx(%d)\n", preOp->guid, srcId);
          // Make sure we are adding links in order
          assert(preOp->guid < i);
          if (intersect(srcR, dstR) > 0) {
            // Add dependency between srcT -> dstT
            float cost = 0.0f;
            if (srcT->workerId != dstT->workerId) {
              cost = intersect(srcR, dstR) * LSTM_PER_NODE_LENGTH * 4 / bandwidth(srcT->workerId, dstT->workerId);
              totalDataXfer += intersect(srcR, dstR) * LSTM_PER_NODE_LENGTH * 4;
              int comm_device_id = (srcId + 1) * MAX_NUM_WORKERS + dstId;
              Task* task = new Task(comm_device_id, cost);
              commTasks[comm_task_id++] = task;
              srcT->add_next_task(task);
              task->add_next_task(dstT);
            } else {
              srcT->add_next_task(dstT);
            }
          }
        }
      }
    }
    // If No dependencies, add tasks to readyQueues
    for (int j = 0; j < config.nParts; j++)
      if (tasks[i][j]->preTasks.size() == 0) {
        readyQueue.insert(tasks[i][j]);
      }
  }
  std::vector<Task*> allTasks;
  float gpuTime[MAX_NUM_WORKERS * (MAX_NUM_WORKERS + 1)];
  for (int i = 0; i < MAX_NUM_WORKERS * (MAX_NUM_WORKERS + 1); i++) {
    gpuTime[i] = 0.0f;
  }
  while (!readyQueue.empty()) {
    // Find the task with earliest start time
    Task* t = *readyQueue.begin();
    allTasks.push_back(t);
    readyQueue.erase(readyQueue.begin());
    int gpuId = t->workerId;
    gpuTime[gpuId] = std::max(gpuTime[gpuId], t->readyTime);
    t->startTime = gpuTime[gpuId];
    gpuTime[gpuId] += t->computeTime;
    for (int i = 0; i < t->nextTasks.size(); i++) {
      Task* next = t->nextTasks[i].task;
      float nextReadyTime = t->startTime + t->computeTime;
      next->readyTime = std::max(next->readyTime, nextReadyTime);
      next->counter ++;
      if (next->counter == next->preTasks.size()) {
        // The task is ready
        readyQueue.insert(next);
      }
    }
    if (print)
      printf("[%zu] gpu(%.2lf %.2lf %.2lf %.2lf)\n", allTasks.size(), gpuTime[0], gpuTime[1], gpuTime[2], gpuTime[3]);
  }
  float totalTime = 0.0f;
  for (int i = 0; i < MAX_NUM_WORKERS; i++)
    totalTime = std::max(totalTime, gpuTime[i]);

  assert(allTasks.size() == task_global_guid);
  // Add update cost
  for (int i = 0; i < parameters.size(); i++) {
    assert(parameters[i].size() > 0);
    std::vector<Op*> opList = parameters[i];
    std::vector<OpConfig> configs;
    std::map<Op*, OpConfig>::const_iterator it;
    for (int j = 0; j < opList.size(); j++) {
      it = global.find(opList[j]);
      configs.push_back(it->second);
    }
    float updateCost = opList[0]->update(configs);
    totalTime += updateCost;
  }
#ifdef VERBOSE
  for (int i = 0; i < op_global_guid; i++) {
    OpConfig c = global.find(guidToOp[i])->second;
    printf("c[%d] dim(%d %d) map(%d %d %d %d)\n", i, c.dim[0], c.dim[1], c.map[0], c.map[1], c.map[2], c.map[3]);
  }
  for (int i = 0; i < op_global_guid; i++) {
    OpConfig config = global.find(guidToOp[i])->second;
    for (int j = 0; j < config.nParts; j++) {
      Task* t = tasks[i][j];
      printf("t[%d] ready(%.2lf) start(%.2lf) compute(%.2lf) next(", t->guid, t->readyTime, t->startTime, t->computeTime);
      for (int k = 0; k < t->nextTasks.size(); k++)
        printf("%d %.2lf, ", t->nextTasks[k].task->guid, t->nextTasks[k].cost);
      printf(")\n");
    }
  }
  for (int i = 0; i < allTasks.size(); i++)
    for (int j = i+1; j < allTasks.size(); j++)
      if (allTasks[i]->guid == allTasks[j]->guid) {
        assert(allTasks[i]->startTime + allTasks[i]->computeTime <= allTasks[j]->startTime);
      }
#endif
#ifdef NOT_USE_SIMULATE_OPT
  for (int i = 0; i < op_global_guid; i++) {
    OpConfig config = global.find(guidToOp[i])->second;
    for (int j = 0; j < config.nParts; j++)
      delete tasks[i][j];
  }
  for (int i = 0; i < comm_task_id; i++)
    delete commTasks[i];
#endif

  if (print) {
    printf("totalTime = %.2lf\n", totalTime);
    printf("totalDataXfer = %.2lf\n", totalDataXfer);
    printf("totalCompTime = %.2lf\n", accCompTime);
  }

  //return totalTime + costDataXfer * DATA_XFER_FACTOR;
  return totalTime;
}