in commons-statistics-inference/src/main/java/org/apache/commons/statistics/inference/OneWayAnova.java [258:338]
private static Result aov(Collection<double[]> data, double[] statistic) {
Arguments.checkCategoriesRequiredSize(data.size(), 2);
long n = 0;
for (final double[] array : data) {
n += array.length;
Arguments.checkValuesRequiredSize(array.length, 1);
}
final long dfwg = n - data.size();
if (dfwg == 0) {
throw new InferenceException(InferenceException.ZERO, "Degrees of freedom within groups");
}
// wg = within group
// bg = between group
// F = Var(bg) / Var(wg)
// Var = SS / df
// SStotal = sum((x - u)^2) = sum(x^2) - sum(x)^2/n
// = SSwg + SSbg
// Some cancellation of terms reduces the computation to 3 sums:
// SSwg = [ sum(x^2) - sum(x)^2/n ] - [ sum_g { sum(sum(x^2) - sum(x)^2/n) } ]
// SSbg = SStotal - SSwg
// = sum_g { sum(x)^2/n) } - sum(x)^2/n
// SSwg = SStotal - SSbg
// = sum(x^2) - sum_g { sum(x)^2/n) }
// Stabilize the computation by shifting all to a common mean of zero.
// This minimise the magnitude of x^2 terms.
// The terms sum(x)^2/n -> 0. Included them to capture the round-off.
final double m = StatisticUtils.mean(data);
final Sum sxx = Sum.create();
final Sum sx = Sum.create();
final Sum sg = Sum.create();
// Track if each group had the same value
boolean eachSame = true;
for (final double[] array : data) {
eachSame = eachSame && allMatch(array[0], array);
final Sum s = Sum.create();
for (final double v : array) {
final double x = v - m;
s.add(x);
// sum(x)
sx.add(x);
// sum(x^2)
sxx.add(x * x);
}
// Create the negative sum so we can subtract it via 'add'
// -sum_g { sum(x)^2/n) }
sg.add(-pow2(s.getAsDouble()) / array.length);
}
// Note: SS terms should not be negative given:
// SS = sum((x - u)^2)
// This can happen due to floating-point error in sum(x^2) - sum(x)^2/n
final double sswg = Math.max(0, sxx.add(sg).getAsDouble());
// Flip the sign back
final double ssbg = Math.max(0, -sg.add(pow2(sx.getAsDouble()) / n).getAsDouble());
final int dfbg = data.size() - 1;
// Handle edge-cases:
// Note: 0 / 0 -> NaN : x / 0 -> inf
// These are documented results and should output p=NaN or 0.
// This result will occur naturally.
// However the SS totals may not be 0.0 so we correct these cases.
final boolean allSame = eachSame && allMatch(data);
final double msbg = allSame ? 0 : ssbg / dfbg;
final double mswg = eachSame ? 0 : sswg / dfwg;
final double f = msbg / mswg;
if (statistic != null) {
statistic[0] = f;
return null;
}
final double p = FDistribution.of(dfbg, dfwg).survivalProbability(f);
// Support partitioning the variance
// ni = size of each of the groups
// nO=(1/(a−1))*(sum(ni)−(sum(ni^2)/sum(ni))
final double nO = (n - data.stream()
.mapToDouble(x -> pow2(x.length)).sum() / n) / dfbg;
return new Result(dfbg, dfwg, msbg, mswg, nO, f, p);
}