void batchForward()

in csrc/AsyncModelWrapper.h [39:95]


  // Inference worker loop: repeatedly pulls a batch from batcher_, runs the
  // TorchScript model on device_, and hands the CPU outputs back to the
  // batcher. Loops until batcher_.get() throws ExitThread (the shutdown
  // signal). Per-stage wall-clock timings (microseconds) are accumulated in
  // timer_; counters reset every kResetEvery iterations and averages are
  // printed to stderr every kPrintEvery iterations.
  void batchForward() {
    torch::NoGradGuard noGrad;  // inference only — no autograd bookkeeping

    constexpr int kResetEvery = 1000;     // reset timer averages this often
    constexpr int kPrintEvery = 1000000;  // print averages this often

    int i = 0;
    while (true) {
      i += 1;
      if (i % kResetEvery == 0) {
        // Evaluate the print gate once per reporting cycle instead of per line.
        const bool print = (i % kPrintEvery == 0);
        if (print) {
          std::cerr << "avg time (over " << kResetEvery << " runs): " << std::endl;
        }
        for (auto& kv : timer_) {
          if (print) {
            std::cerr << kv.first << ", " << kv.second / kResetEvery << std::endl;
          }
          kv.second = 0;  // reset in place; avoids a redundant map lookup
        }
        if (print) std::cerr << "===================" << std::endl;
      }

      auto start = high_resolution_clock::now();

      TensorDict input;
      try {
        input = batcher_.get();  // blocks until a full batch is ready
      } catch (ExitThread &e) {
        break;  // batcher_ requests shutdown by throwing ExitThread
      }
      // NOTE(review): assumes every batch dict carries an "s" tensor whose
      // dim 0 is the batch size — confirm against the batcher's producers.
      timer_["batch_size"] += input["s"].size(0);

      auto stop = high_resolution_clock::now();
      auto duration = duration_cast<microseconds>(stop - start).count();
      timer_["wait_for_batch"] += duration;

      // Stage 1: move the batch onto the model's device.
      start = high_resolution_clock::now();
      std::vector<torch::jit::IValue> jitInput;
      jitInput.push_back(tensorDictToTorchDict(input, device_));
      stop = high_resolution_clock::now();
      duration = duration_cast<microseconds>(stop - start).count();
      timer_["to_device"] += duration;

      // Stage 2: forward pass, then copy results back to the CPU.
      start = high_resolution_clock::now();
      auto jitOutput = model_.forward(jitInput);
      auto output = iValueToTensorDict(jitOutput, torch::kCPU, true);
      stop = high_resolution_clock::now();
      duration = duration_cast<microseconds>(stop - start).count();
      timer_["forward"] += duration;

      // Stage 3: publish results so waiting callers are unblocked.
      start = high_resolution_clock::now();
      batcher_.set(std::move(output));
      stop = high_resolution_clock::now();
      duration = duration_cast<microseconds>(stop - start).count();
      timer_["post_process"] += duration;
    }
  }