void FusedPairwiseReduceWithComm()

in horovod/common/ops/adasum/adasum.h [338:398]


  void FusedPairwiseReduceWithComm(std::vector<TensorTableEntry>& entries,
                                   uint8_t* a, uint8_t* b,
                                   DataType horovod_datatype,
                                   std::vector<int>& tensor_counts, int layerid,
                                   Communicator_type& comm, bool isLeftNeighbor,
                                   std::vector<double>& normAndDots,
                                   HorovodGlobalState* global_state) {
    static double sqrt_double_min = std::sqrt(DBL_MIN);
    int per_element_size =
        global_state->controller->GetTypeSize(horovod_datatype);
    int bytesSoFar = 0;
    for (size_t i = 0; i < tensor_counts.size(); i++) {
      double dotProduct = 0.;
      double anormsq = 0.;
      double bnormsq = 0.;

      DispatchComputeDotAndNormSqrds(&a[bytesSoFar], &b[bytesSoFar],
                                     horovod_datatype, tensor_counts[i],
                                     dotProduct, anormsq, bnormsq, layerid);
      normAndDots[i * 3] = dotProduct;
      if (isLeftNeighbor) {
        normAndDots[i * 3 + 1] = anormsq;
        normAndDots[i * 3 + 2] = bnormsq;
      } else {
        normAndDots[i * 3 + 1] = bnormsq;
        normAndDots[i * 3 + 2] = anormsq;
      }
      bytesSoFar += tensor_counts[i] * per_element_size;
    }

    SumAllreduceWithComm(entries, (void*)normAndDots.data(),
                         3 * tensor_counts.size(), DataType::HOROVOD_FLOAT64,
                         comm, global_state);

    bytesSoFar = 0;
    for (size_t i = 0; i < tensor_counts.size(); i++) {
      double dotProduct = normAndDots[i * 3];
      double anormsq;
      double bnormsq;
      if (isLeftNeighbor) {
        anormsq = normAndDots[i * 3 + 1];
        bnormsq = normAndDots[i * 3 + 2];
      } else {
        bnormsq = normAndDots[i * 3 + 1];
        anormsq = normAndDots[i * 3 + 2];
      }

      double acoeff = 1;
      double bcoeff = 1;
      if (anormsq >= sqrt_double_min) {
        acoeff = 1.0 - dotProduct / anormsq * 0.5;
      }
      if (bnormsq >= sqrt_double_min) {
        bcoeff = 1.0 - dotProduct / bnormsq * 0.5;
      }

      DispatchScaledAdd(horovod_datatype, tensor_counts[i], acoeff,
                        &a[bytesSoFar], bcoeff, &b[bytesSoFar], layerid);
      bytesSoFar += tensor_counts[i] * per_element_size;
    }
  }