in csrc/AsyncModelWrapper.h [39:95]
// Worker-thread loop: repeatedly pulls a batch of inputs from batcher_,
// runs the TorchScript model on device_, and hands the result (moved back
// to CPU) to the batcher for distribution to waiting callers.
//
// Per-stage wall-clock times (microseconds) are accumulated into timer_.
// The counters are reset every kStatWindow iterations, and the averages
// over the last window are printed to stderr every kPrintInterval
// iterations. "batch_size" accumulates input["s"].size(0) per iteration,
// so its printed value is the average batch size over the window.
//
// Runs until batcher_.get() throws ExitThread (shutdown signal).
void batchForward() {
  torch::NoGradGuard noGrad;  // inference only — never build autograd state
  using Clock = std::chrono::high_resolution_clock;
  // Microseconds elapsed since `since`; factored out of the four
  // previously duplicated start/stop/duration_cast sequences.
  auto elapsedUs = [](Clock::time_point since) {
    return std::chrono::duration_cast<std::chrono::microseconds>(
               Clock::now() - since)
        .count();
  };

  constexpr int kStatWindow = 1000;        // iterations per averaging window
  constexpr int kPrintInterval = 1000000;  // iterations between stat dumps
  int iter = 0;
  while (true) {
    ++iter;
    if (iter % kStatWindow == 0) {
      // kPrintInterval is a multiple of kStatWindow, so hoisting the
      // print check out of the per-key test preserves the original logic.
      const bool print = (iter % kPrintInterval == 0);
      if (print) {
        std::cerr << "avg time (over " << kStatWindow
                  << " runs): " << std::endl;
      }
      for (auto& kv : timer_) {
        if (print) {
          std::cerr << kv.first << ", " << kv.second / kStatWindow
                    << std::endl;
        }
        kv.second = 0;  // reset in place; avoids a redundant map lookup
      }
      if (print) std::cerr << "===================" << std::endl;
    }

    // Stage 1: block until the batcher has assembled an input batch.
    auto start = Clock::now();
    TensorDict input;
    try {
      input = batcher_.get();
    } catch (ExitThread& e) {
      break;  // shutdown requested by the batcher
    }
    timer_["batch_size"] += input["s"].size(0);
    timer_["wait_for_batch"] += elapsedUs(start);

    // Stage 2: move the batch onto the compute device.
    start = Clock::now();
    std::vector<torch::jit::IValue> jitInput;
    jitInput.push_back(tensorDictToTorchDict(input, device_));
    timer_["to_device"] += elapsedUs(start);

    // Stage 3: forward pass, then copy outputs back to CPU tensors.
    start = Clock::now();
    auto jitOutput = model_.forward(jitInput);
    auto output = iValueToTensorDict(jitOutput, torch::kCPU, true);
    timer_["forward"] += elapsedUs(start);

    // Stage 4: hand results back to the batcher, waking waiting clients.
    start = Clock::now();
    batcher_.set(std::move(output));
    timer_["post_process"] += elapsedUs(start);
  }
}