in commons-statistics-inference/src/main/java/org/apache/commons/statistics/inference/GTest.java [194:249]
public double statistic(long[][] counts) {
    // Computes the statistic for a table of observed counts as
    //   2 * [ sum_ij o_ij log(o_ij) + n log(n) - sum_i r_i log(r_i) - sum_j c_j log(c_j) ]
    // where r_i and c_j are the row and column totals and n is the grand total.

    // Validate the table shape and contents before doing any arithmetic.
    Arguments.checkCategoriesRequiredSize(counts.length, 2);
    Arguments.checkValuesRequiredSize(counts[0].length, 2);
    Arguments.checkRectangular(counts);
    Arguments.checkNonNegative(counts);

    final int rows = counts.length;
    final int cols = counts[0].length;

    // Marginal totals are held as doubles; the grand total is built from the row totals.
    final double[] rowTotals = new double[rows];
    final double[] colTotals = new double[cols];
    double total = 0;

    // Positive terms. The per-cell o * log(o) contributions are gathered in the
    // same pass that builds the marginal totals.
    final Sum positive = Sum.create();
    for (int r = 0; r < rows; r++) {
        final long[] row = counts[r];
        double rowTotal = 0;
        for (int k = 0; k < cols; k++) {
            final long cell = row[k];
            rowTotal += cell;
            colTotals[k] += cell;
            // Cells of 0 or 1 contribute nothing to sum(o * log(o)): log(1) == 0
            // and an empty cell is skipped rather than evaluated.
            if (cell > 1) {
                positive.add(cell * Math.log(cell));
            }
        }
        rowTotals[r] = rowTotal;
        // Each row is checked as soon as its total is known.
        checkNonZero(rowTotal, "Row", r);
        total += rowTotal;
    }

    // Column totals are only complete once every row has been visited.
    for (int k = 0; k < cols; k++) {
        checkNonZero(colTotals[k], "Column", k);
    }

    // The statistic is n * [ H(r) + H(c) - H(r,c) ] with H the Shannon entropy.
    // It is evaluated using an un-normalised, un-negated entropy H' so that the
    // observations never have to be converted to probabilities:
    //   H  = -sum(p * log(p))
    //   H' = n * sum(p * log(p))
    //      = n * sum(o/n * log(o/n))
    //      = n * [ sum(o/n * log(o)) - sum(o/n * log(n)) ]
    //      = sum(o * log(o)) - n log(n)
    // so the target is H'(r,c) - H'(r) - H'(c). The joint term carries -n log(n)
    // and each subtracted marginal term returns +n log(n); the net correction
    // across the three sums is (-1 + 2) * n log(n) = +n log(n).
    positive.addProduct(total, Math.log(total));

    // Negative terms: -t * log(t) for every marginal total, rows first and then
    // columns. No zero guard is applied here; the checkNonZero calls above are
    // relied upon to have rejected any zero total.
    final Sum negative = Sum.create();
    for (int r = 0; r < rows; r++) {
        final double t = rowTotals[r];
        negative.add(t * -Math.log(t));
    }
    for (int k = 0; k < cols; k++) {
        final double t = colTotals[k];
        negative.add(t * -Math.log(t));
    }

    // Combine both accumulators and apply the factor of two.
    positive.add(negative);
    return 2 * positive.getAsDouble();
}