in horovod/common/ops/adasum/adasum.h [338:398]
// Pairwise reduction of two fused buffers that each hold several tensors
// back to back.
//
// `a` and `b` are byte buffers; tensor i occupies tensor_counts[i] elements
// of `horovod_datatype`, and consecutive tensors follow each other without
// gaps. The work happens in three phases:
//   1. For every tensor, compute the local dot product and the two squared
//      norms of the matching slices of `a` and `b`, and store them as a
//      triple in normAndDots[3*i .. 3*i+2].
//   2. Sum-allreduce all triples over `comm` as HOROVOD_FLOAT64 values.
//   3. For every tensor, derive one coefficient per operand from the reduced
//      triple and pass both, with the slices, to DispatchScaledAdd.
//
// Triple layout: slot 0 is the dot product; slots 1 and 2 hold the squared
// norms. `isLeftNeighbor` decides which operand's norm lands in which slot
// (left: a -> 1, b -> 2; otherwise: b -> 1, a -> 2).
// NOTE(review): presumably this makes both partners of the pair agree on
// the slot order before the allreduce — confirm against the caller.
//
// `normAndDots` is caller-provided scratch. It is only indexed here, never
// resized, so it must already hold at least 3 * tensor_counts.size() values.
//
// NOTE(review): the running byte offset is an `int`; this assumes the fused
// buffer stays below INT_MAX bytes — confirm against the fusion buffer limits.
void FusedPairwiseReduceWithComm(std::vector<TensorTableEntry>& entries,
                                 uint8_t* a, uint8_t* b,
                                 DataType horovod_datatype,
                                 std::vector<int>& tensor_counts, int layerid,
                                 Communicator_type& comm, bool isLeftNeighbor,
                                 std::vector<double>& normAndDots,
                                 HorovodGlobalState* global_state) {
  // Squared norms below this floor skip the division and keep coefficient 1.
  static double norm_floor = std::sqrt(DBL_MIN);

  const int elem_bytes =
      global_state->controller->GetTypeSize(horovod_datatype);
  const size_t num_tensors = tensor_counts.size();

  // Positions of the two squared norms inside each triple.
  const size_t a_slot = isLeftNeighbor ? 1 : 2;
  const size_t b_slot = isLeftNeighbor ? 2 : 1;

  // Phase 1: local dot product and squared norms per tensor.
  int offset = 0;
  for (size_t t = 0; t < num_tensors; ++t) {
    double dot = 0.;
    double a_sq = 0.;
    double b_sq = 0.;
    DispatchComputeDotAndNormSqrds(a + offset, b + offset, horovod_datatype,
                                   tensor_counts[t], dot, a_sq, b_sq, layerid);

    const size_t base = t * 3;
    normAndDots[base] = dot;
    normAndDots[base + a_slot] = a_sq;
    normAndDots[base + b_slot] = b_sq;

    offset += tensor_counts[t] * elem_bytes;
  }

  // Phase 2: sum every triple across the communicator in one call.
  SumAllreduceWithComm(entries, static_cast<void*>(normAndDots.data()),
                       3 * num_tensors, DataType::HOROVOD_FLOAT64, comm,
                       global_state);

  // Phase 3: turn the reduced triples into coefficients and apply them.
  offset = 0;
  for (size_t t = 0; t < num_tensors; ++t) {
    const size_t base = t * 3;
    const double dot = normAndDots[base];
    const double a_sq = normAndDots[base + a_slot];
    const double b_sq = normAndDots[base + b_slot];

    double a_coeff = (a_sq >= norm_floor) ? 1.0 - dot / a_sq * 0.5 : 1.0;
    double b_coeff = (b_sq >= norm_floor) ? 1.0 - dot / b_sq * 0.5 : 1.0;

    DispatchScaledAdd(horovod_datatype, tensor_counts[t], a_coeff, a + offset,
                      b_coeff, b + offset, layerid);

    offset += tensor_counts[t] * elem_bytes;
  }
}