int Executor::executeNetwork()

in tools/loader/ExecutorCore.cpp [262:703]
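
Top-level driver for the loader tools: it validates the command-line option
combinations, optionally preloads and preprocesses all input images, runs
inference over the images (per minibatch, streamed from stdin, repeated, or
benchmarked) on one or more worker threads, merges trace events, and returns
the number of post-processing errors.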


int Executor::executeNetwork() {

  parseInputFiles(inputImageFilenames_);

  if (excludedFirstWarmupRuns && excludedFirstWarmupRuns >= warmup) {
    llvm::errs() << "Excluding all warmup runs does not make sense\n";
    return 1;
  }
  // Stream input mode.
  const bool streamInputFilenamesMode = inputImageFilenamesOpt.size() == 1 &&
                                        inputImageFilenamesOpt.front() == "-";

  CHECK(!(streamInputFilenamesMode && emittingBundle()))
      << "Cannot emit a bundle and also stream inputs.";

  // If tracing is enabled, create a TraceContext to merge each run's events
  // into.
  if (!tracePath.empty()) {
    traceContext = glow::make_unique<TraceContext>(TraceLevel::STANDARD);
  }

  // Mini-batch mode.
  const bool miniBatchMode = miniBatch > 0;
  CHECK(((!miniBatchMode) || (!streamInputFilenamesMode)))
      << "The minibatch option is not compatible with the stream input "
         "image mode.";
  CHECK(((!miniBatchMode) || (inputImageFilenames_[0].size() % miniBatch == 0)))
      << "The number of input images must be a multiple of the mini-batch.";

  CHECK(((!iterationsOpt) || (!miniBatchMode) ||
         (iterationsOpt % miniBatch == 0)))
      << "Benchmark count must be a multiple of the mini-batch.";
  CHECK(!preloadAllImages || miniBatchMode)
      << "preload-all-images can only be used with minibatch";

  const bool singleBatchRepeatedMode = repeatSingleBatchCount > 0;
  CHECK(!(streamInputFilenamesMode && singleBatchRepeatedMode))
      << "singleBatchRepeatedMode is not compatible with "
         "streamInputFilenamesMode";

  // When mini-batch mode is enabled, do not allow debug instrumentation.
  if (miniBatchMode) {
    CHECK(!instrumentDebug)
        << "The minibatch option is not compatible with debug instrumentation.";
  }

  CHECK(!preloadAllImages || (modelInputsOpt.size() == 1))
      << "Preloading all images doesn't support networks with multiple inputs.";

  CHECK(!iterationsOpt || (modelInputsOpt.size() == 1))
      << "Benchmark mode doesn't support networks with multiple inputs.";

  // Print out the model being evaluated.
  llvm::outs() << "Model: " << Loader::getModelOptPath() << "\n";
  std::mutex ioMu;
  int numErrors = 0;

  if (runAllInputsOnAllDevices) {
    if (numDevices != miniBatchThreads) {
      llvm::outs() << "Setting " << miniBatchThreads.ArgStr << " to match "
                   << numDevices.ArgStr << " (" << numDevices
                   << ") as required by " << runAllInputsOnAllDevices.ArgStr
                   << "\n";
      miniBatchThreads.getValue() = numDevices;
    }
  }

  // If preloading, then load and preprocess all images into
  // preloadedInputImageData here.
  Tensor preloadedInputImageData;
  if (preloadAllImages) {
    Loader loader;
    PreProcessInputExecutor ppImageExecutor;
    addLoaderExtensions(loader);
    ppImageExecutor.registerInputDataPreProcessingExtension(
        ppInputDataExtensions_);

    if (!inputTensorListFile.empty()) {
      loadInputImageFromFileWithType(
          inputImageFilenames_[0], &preloadedInputImageData, imageLayoutOpt[0]);
    } else {
      // Load and preprocess the image data into the preloadedInputImageData
      // Tensor.
      loadImagesAndPreprocess(inputImageFilenames_, {&preloadedInputImageData});
      ppImageExecutor.processInputTensor({&preloadedInputImageData}, 0,
                                         inputImageFilenames_[0].size(),
                                         preloadedInputImageData.dims()[0]);
    }
  }

  // Process a set of minibatches with indices [startIndex, endIndex).
  auto processImageRange = [&](size_t startIndex, size_t endIndex, size_t TID) {
    std::unique_ptr<ExecutionContext> exContext =
        glow::make_unique<ExecutionContext>();
    PlaceholderBindings &bindings = *exContext->getPlaceholderBindings();
    if (traceContext) {
      exContext->setTraceContext(
          glow::make_unique<TraceContext>(TraceLevel::STANDARD));
    }
    // If runAllInputsOnAllDevices, assign the thread with this TID to device
    // TID; e.g., TID 2 is assigned to device 2.
    Loader loader = runAllInputsOnAllDevices ? Loader(TID) : Loader();
    PostProcessExecutor ppResultExecutor;
    PreProcessInputExecutor ppImageExecutor;

    // Registering all the extensions per thread.
    addLoaderExtensions(loader);
    ppResultExecutor.registerPostProcessOutputExtensions(
        ppOutputDataExtensions_);
    ppImageExecutor.registerInputDataPreProcessingExtension(
        ppInputDataExtensions_);

    // Used to make sure we only compile once, and run only once if not
    // streaming.
    bool isFirstRun = true;

    // Perform graph profiling initialization if needed.
    // if (profilingGraph()) {
    //   loader.initGraphProfiling(
    //       bindings,
    //       miniBatch > 0 ? miniBatch : inputImageFilenames_[0].size(),
    //       inputImageFilenames_[0].size());
    // }

    // These will be set during the first run.
    llvm::StringMap<Placeholder *> iPHM;
    llvm::StringMap<Placeholder *> oPHM;
    std::vector<Placeholder *> inPHs;
    std::vector<Placeholder *> outPHs;

    size_t miniBatchIndex = startIndex;
    std::vector<Tensor> inputData(modelInputsOpt.size());
    if (preloadAllImages) {
      inputData[0] = preloadedInputImageData.getUnowned();
    }

    VecVec<std::string> inputImageBatchFilenames;
    if ((!miniBatchMode) &&
        (!streamInputFilenamesMode || singleBatchRepeatedMode)) {
      inputImageBatchFilenames = inputImageFilenames_;
    } else if (singleBatchRepeatedMode) {
      for (size_t i = 0, e = modelInputsOpt.size(); i < e; i++) {
        std::vector<std::string> names(inputImageFilenames_[0].begin(),
                                       inputImageFilenames_[0].begin() +
                                           miniBatch);
        inputImageBatchFilenames.push_back(names);
      }
    }
    if (!tracePath.empty()) {
      loader.getHostManager()->setTraceContext(
          glow::make_unique<TraceContext>(traceLevel));
      Error err = loader.getHostManager()->startDeviceTrace();
      if (err) {
        LOG(ERROR) << "Failed to start device trace.";
        // numErrors is shared with other worker threads; guard the write.
        std::lock_guard<std::mutex> lock(ioMu);
        numErrors = 1;
        return;
      }
      llvm::outs() << "Device trace started.\n";
    }

    // Pass input tensors around as array of pointers.
    std::vector<Tensor *> inputImageData;
    for (auto &data : inputData) {
      inputImageData.push_back(&data);
    }

    unsigned repeatedLoopCountRemaining = repeatSingleBatchCount;

    auto loopCond = [&]() {
      // If in stream mode then get the next image filenames if they exist,
      // otherwise exit.
      if (streamInputFilenamesMode) {
        return getNextStdinImageFilenames(inputImageBatchFilenames);
      }

      // If a single batch is going to be loaded once and repeated, then keep
      // running for repeatedLoopCountRemaining more iterations.
      if (singleBatchRepeatedMode) {
        return repeatedLoopCountRemaining-- != 0;
      }

      // If in miniBatchMode then continue as long as getNextMiniBatch finds
      // another minibatch of image filenames in [startIndex, endIndex);
      // otherwise exit.
      if (miniBatchMode) {
        return getNextMiniBatch(inputImageBatchFilenames, inputImageFilenames_,
                                miniBatchIndex, miniBatch, endIndex);
      }

      // At least enter once, e.g. to just dump a bundle.
      return isFirstRun;
    };

    while (loopCond()) {
      if (!preloadAllImages && (!singleBatchRepeatedMode || isFirstRun)) {
        // Load and process the image data into the inputImageData Tensor.
        if (!inputTensorListFile.empty()) {
          loadInputImageFromFileWithType(inputImageBatchFilenames[0],
                                         inputImageData[0], imageLayoutOpt[0]);
        } else {
          loadImagesAndPreprocess(inputImageBatchFilenames, inputImageData);
          ppImageExecutor.processInputTensor(inputImageData, startIndex,
                                             endIndex,
                                             inputImageData[0]->dims()[0]);
        }
      }

      // Note: At this point miniBatchIndex is the end index, so subtract
      // miniBatch to get the start index.
      const dim_t startMiniBatchIndex = miniBatchIndex - miniBatch;

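      // Shape to compile for: the input shape with the batch dimension
      // overridden by the minibatch size or, in benchmark mode, by the
      // iteration count.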
      ShapeVector imageShape(inputImageData[0]->getType().dims().begin(),
                             inputImageData[0]->getType().dims().end());
      if (miniBatch) {
        imageShape[0] = miniBatch;
      } else if (iterationsOpt) {
        imageShape[0] = iterationsOpt;
      }

      // If we are benchmarking, reset the image data to the batch size we
      // need.
      if (iterationsOpt) {
        auto resetTensor = [](Tensor *tensor) {
          ShapeVector imageSize(tensor->getType().dims().begin(),
                                tensor->getType().dims().end());
          imageSize[0] = miniBatch ? miniBatch : iterationsOpt;
          tensor->reset(ElemKind::FloatTy, imageSize);
        };
        std::for_each(inputImageData.begin(), inputImageData.end(),
                      resetTensor);
      }

      // If this is the first run, then we need to build and compile the model.
      if (isFirstRun) {
        isFirstRun = false;

        std::vector<TypeRef> types;
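        // When preloading, compile for a minibatch-sized slice of the
        // preloaded tensor rather than for the whole preloaded set.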
        auto preloadTy =
            Type::newShape(inputImageData[0]->getType(), imageShape);

        if (preloadAllImages) {
          types.push_back(&preloadTy);
        } else {
          // get types of all input tensors.
          for_each(inputImageData.begin(), inputImageData.end(),
                   [&](auto *t) { types.push_back(&t->getType()); });
        }

        // Build and compile the graph, then get input and output Placeholders.
        std::tie(iPHM, oPHM) =
            buildAndCompileAndGetInAndOutPair(loader, bindings, types);

        // If in bundle mode, the bundle has been saved by the above call, so we
        // can safely return.
        if (emittingBundle()) {
          LOG(INFO) << "Emit bundle mode is on. Network is compiled only.";
          return;
        }

        // Obtain input/output placeholders from the input/output maps. For
        // inputs we have a map but need an array, taken from the map in the
        // order specified by modelInputsOpt.
        for (size_t i = 0, e = modelInputsOpt.size(); i < e; i++) {
          auto it = iPHM.find(modelInputsOpt[i]);
          CHECK(it != iPHM.end())
              << "Couldn't find placeholder: " << modelInputsOpt[i];
          CHECK((*it).second) << "Placeholder in input map is NULL.";
          inPHs.push_back((*it).second);
        }
        for_each(oPHM.begin(), oPHM.end(), [&](auto &p) {
          CHECK(p.second) << "Placeholder in output map is NULL.";
          outPHs.push_back(p.second);
        });
      }

      // preloadAllImages - create a new Tensor that is an unowned slice of the
      // first (and only) input tensor, and point inputImageData at it for the
      // rest of this iteration.
      Tensor inputImageDataBatch;
      if (preloadAllImages) {
        std::vector<dim_t> imgSliceStart(imageShape.size(), 0);
        imgSliceStart[0] = startMiniBatchIndex;
        inputImageDataBatch =
            inputImageData[0]->getUnowned(imageShape, imgSliceStart);
        inputImageData[0] = &inputImageDataBatch;
      }

      // Compile done.
      CHECK(!inPHs.empty()) << "Input must be valid.";
      CHECK(!outPHs.empty()) << "Output must be valid.";
      CHECK_EQ(inPHs.size(), inputImageData.size())
          << "Number of input placeholders and tensors must match";
      for (size_t i = 0, e = inputImageData.size(); i < e; i++) {
        CHECK(inPHs[i]->dims() == inputImageData[i]->dims())
            << "New input shape does not match the compiled function: "
            << inPHs[i]->dims() << " vs " << inputImageData[i]->dims();
      }

      // Convert the raw input to fp16. This must be done every time we get new
      // image data.
      if (convertInAndOutToFp16) {
        for (auto &t : inputImageData) {
          t->convertToType(ElemKind::Float16Ty);
        }
      }

      // If we are benchmarking we are done with the while loop.
      if (iterationsOpt) {
        break;
      }

      // Minibatch inference initialization of loader extensions.
      loader.inferInitMiniBatch(bindings, startMiniBatchIndex, miniBatch);

      // About to run inference, so update the input Placeholders' backing
      // Tensors with the contents of inputImageData.
      updateInputPlaceholders(bindings, inPHs, inputImageData);

      // Perform the inference execution, updating output tensors.
      auto batchSize = inputImageData[0]->dims()[0];
      loader.runInference(exContext.get(), batchSize);
      if (traceContext) {
        traceContext->merge(exContext->getTraceContext());
      }

      // Process the output of the network. Each app can do its own
      // post-processing depending on the type of the network.
      {
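        // Serialize output processing: processOutputs writes to stdout/stderr
        // and numErrors is shared across worker threads.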
        std::lock_guard<std::mutex> lock(ioMu);
        numErrors += ppResultExecutor.processOutputs(oPHM, bindings,
                                                     inputImageBatchFilenames);
      }

      // Minibatch inference finalization of loader extensions.
      loader.inferEndMiniBatch(bindings, startMiniBatchIndex, miniBatch);
    }

    if (iterationsOpt) {
      // In benchmark mode, all image tensors are loaded up front and run at
      // once.
      UniquePtrVec<ExecutionContext> contexts =
          setupContextPool(outPHs, inPHs[0], *inputImageData[0]);

      std::string name = loader.getFunctionName();
      std::unique_ptr<llvm::Timer> restRunsTimer = nullptr;
      std::unique_ptr<llvm::Timer> firstRunsTimer = nullptr;
      std::unique_ptr<double> bestRunTime = nullptr;
      if (timeOpt) {
        if (excludedFirstWarmupRuns) {
          firstRunsTimer.reset(
              new llvm::Timer("First Runs", "First inference runs"));
          restRunsTimer.reset(
              new llvm::Timer("Rest Inferences", "Rest of the inference runs"));
        } else {
          restRunsTimer.reset(
              new llvm::Timer("Inferences", "All inference runs"));
        }
        bestRunTime.reset(new double);
        *bestRunTime = DBL_MAX;
      }
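      // One inference request per minibatch; without minibatching, a single
      // request covers all iterations.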
      unsigned requestCount = miniBatch ? iterationsOpt / miniBatch : 1;

      runBenchmark(name, loader, std::move(contexts), requestCount, warmup,
                   restRunsTimer.get(), firstRunsTimer.get(),
                   bestRunTime.get());
      if (timeOpt) {
        double wallTime = restRunsTimer->getTotalTime().getWallTime();
        llvm::outs() << llvm::formatv(
            "Average wall time per item (s): {0:f4}\n",
            wallTime / (iterationsOpt + warmup - excludedFirstWarmupRuns));
        llvm::outs() << llvm::formatv(
            "            Best wall time (s): {0:f4}\n", *bestRunTime);
      }
    }

    if (profilingGraph()) {
      loader.generateAndSerializeProfilingInfos(bindings);
    }

    if (!tracePath.empty()) {
      Error err = loader.getHostManager()->stopDeviceTrace();
      if (err) {
        LOG(ERROR) << "Failed to stop device trace.";
        // numErrors is shared with other worker threads; guard the write.
        std::lock_guard<std::mutex> lock(ioMu);
        numErrors = 1;
        return;
      }
      traceContext->merge(loader.getHostManager()->getTraceContext());
    }
  };

  // We will force single-threaded execution if:
  // - Minibatch mode and runAllInputsOnAllDevices are disabled;
  // - We are going to emit a bundle and not run inference;
  // - We are collecting an inference profile.
  // Otherwise, there can be several minibatches of equal size.
  const bool multiThreadingAllowed =
      (runAllInputsOnAllDevices || miniBatchMode) && !emittingBundle() &&
      !profilingGraph();
  const size_t numBatches =
      miniBatchMode ? inputImageFilenames_[0].size() / miniBatch : 1u;
  const size_t numThreads =
      runAllInputsOnAllDevices
          ? miniBatchThreads
          : (multiThreadingAllowed
                 ? std::min(size_t(miniBatchThreads), numBatches)
                 : 1u);
  if (miniBatchThreads > 1 && !multiThreadingAllowed) {
    llvm::outs() << "WARNING: multi-threaded execution is not possible. Make "
                    "sure that minibatch size is specified and you are not "
                    "trying to dump profile or emit bundle.\n";
  }

  llvm::outs() << "Running " << numThreads << " thread(s).\n";
  std::vector<std::thread> threads;
  threads.reserve(numThreads);
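  // Ceiling division: spread numBatches minibatches across numThreads workers.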
  const size_t miniBatchesPerThread =
      (numBatches + numThreads - 1) / numThreads;
  for (size_t i = 0; i < numThreads; i++) {
    size_t startIndex, endIndex;
    if (!runAllInputsOnAllDevices && numThreads > 1) {
      startIndex = i * miniBatchesPerThread * miniBatch;
      endIndex = std::min((i + 1) * miniBatchesPerThread * miniBatch,
                          inputImageFilenames_[0].size());
    } else {
      startIndex = 0;
      endIndex = inputImageFilenames_[0].size();
    }
    auto worker = [&processImageRange, startIndex, endIndex, i]() {
      processImageRange(startIndex, endIndex, i);
    };
    threads.push_back(std::thread(worker));
  }

  for (auto &t : threads) {
    if (t.joinable()) {
      t.join();
    }
  }

  if (!tracePath.empty()) {
    traceContext->dump(tracePath, appName_);
  }

  return numErrors;
}
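
The per-thread range computation above can be illustrated in isolation. Below
is a minimal standalone sketch with made-up values (10 images, a minibatch of
2, 3 threads), not part of the loader; it only reproduces the ceiling division
and clamping used to assign image index ranges to worker threads.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const size_t numImages = 10, miniBatch = 2, numThreads = 3;
  // Number of full minibatches (the loader checks divisibility up front).
  const size_t numBatches = numImages / miniBatch; // 5
  // Ceiling division, as in executeNetwork(): at most this many minibatches
  // per thread.
  const size_t miniBatchesPerThread =
      (numBatches + numThreads - 1) / numThreads; // ceil(5/3) == 2
  for (size_t i = 0; i < numThreads; i++) {
    const size_t startIndex = i * miniBatchesPerThread * miniBatch;
    const size_t endIndex =
        std::min((i + 1) * miniBatchesPerThread * miniBatch, numImages);
    std::printf("thread %zu: images [%zu, %zu)\n", i, startIndex, endIndex);
  }
  // Prints [0, 4), [4, 8), [8, 10): the last thread gets the remainder.
  return 0;
}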