in commons-statistics-inference/src/main/java/org/apache/commons/statistics/inference/KolmogorovSmirnovTest.java [1136:1237]
private static double twoSampleTwoSidedPStabilizedInner(long d, int n, int m, int gcd) {
// Check the computation is possible.
// Note that the possible paths is binom(m+n, n).
// However the computation is stable above this limit.
// Possible d values (requiring a unique p-value) = max(dnm) / gcd = lcm(n, m).
// Possible terms to compute <= n * size(cij)
// Simple limit based on the number of possible different p-values
if ((long) n * (m / gcd) > MAX_LCM_TWO_SAMPLE_EXACT_P) {
return -1;
}
// This could be updated to use d in [1, lcm].
// Currently it uses d in [gcd, n*m].
// Largest intermediate value is (dnm + im + n) which is within 2^63
// if n and m are 2^31-1, i = n, dnm = n*m: (2^31-1)^2 + (2^31-1)^2 + 2^31-1 < 2^63
final long dnm = d * gcd;
// Viehmann (2021): Updated for i in [0, n], j in [0, m]
// C_i,j = 1 if |i/n - j/m| >= d
// = 0 if |i/n - j/m| < d and (i=0 or j=0)
// = C_i-1,j * i/(i+j) + C_i,j-1 * j/(i+j) otherwise
// P2 = C_n,m
// Note: The python listing in Viehmann used d in [0, 1]. This uses dnm in [0, nm]
// so updates the scaling to compute the ranges. Also note that the listing uses
// dist > d or dist < -d where this uses |dist| >= d to compute P(D >= d) (non-strict inequality).
// The provided listing is explicit in the values for each j in the range.
// It can be optimized given the known start and end j for each iteration as only
// j where |i/n - j/m| < d must be processed:
// startJ where: im - jn < dnm : jn > im - dnm
// j = floor((im - dnm) / n) + 1 in [0, m]
// endJ where: jn - im >= dnm
// j = ceil((dnm + im) / n) in [0, m+1]
// First iteration with i = 0
// j = ceil(dnm / n)
int endJ = Math.min(m + 1, (int) ((dnm + n - 1) / n));
// Only require 1 array to store C_i-1,j as the startJ only ever increases
// and we update lower indices using higher ones.
// The maximum value *written* is j=m or less using j/m <= 2*d : j = ceil(2*d*m)
// Viehmann uses: size = int(2*m*d + 2) == ceil(2*d*m) + 1
// The maximum value *read* is j/m <= 2*d. This may be above m. This occurs when
// j - lastStartJ > m and C_i-1,j = 1. This can be avoided if (startJ - lastStartJ) <= 1
// which occurs if m <= n, i.e. the window only slides 0 or 1 in j for each increment i
// and we can maintain Cij as 1 larger than ceil(2*d*m) + 1.
final double[] cij = new double[Math.min(m + 1, 2 * endJ + 2)];
// Each iteration fills C_i,j with values and the remaining values are
// kept as 1 for |i/n - j/m| >= d
//assert (endJ - 1) * (long) n < dnm : "jn >= dnm for j < endJ";
for (int j = endJ; j < cij.length; j++) {
//assert j * (long) n >= dnm : "jn < dnm for j >= endJ";
cij[j] = 1;
}
int startJ = 0;
int length = endJ;
double val = -1;
long im = 0;
for (int i = 1; i <= n; i++) {
im += m;
final int lastStartJ = startJ;
// Compute C_i,j for startJ <= j < endJ
// startJ = floor((im - dnm) / n) + 1 in [0, m]
// endJ = ceil((dnm + im) / n) in [0, m+1]
startJ = im < dnm ? 0 : Math.min(m, (int) ((im - dnm) / n) + 1);
endJ = Math.min(m + 1, (int) ((dnm + im + n - 1) / n));
if (startJ >= endJ) {
// No possible paths inside the boundary
return 1;
}
//assert startJ - lastStartJ <= 1 : "startJ - lastStartJ > 1";
// Initialize previous value C_i,j-1
val = startJ == 0 ? 0 : 1;
//assert startJ == 0 || Math.abs(im - (startJ - 1) * (long) n) >= dnm : "|im - jn| < dnm for j < startJ";
//assert endJ > m || Math.abs(im - endJ * (long) n) >= dnm : "|im - jn| < dnm for j >= endJ";
for (int j = startJ; j < endJ; j++) {
//assert j == 0 || Math.abs(im - j * (long) n) < dnm : "|im - jn| >= dnm for startJ <= j < endJ";
// C_i,j = C_i-1,j * i/(i+j) + C_i,j-1 * j/(i+j)
// Note: if (j - lastStartJ) >= cij.length this creates an IOOB exception.
// In this case cij[j - lastStartJ] == 1. Only happens when m >= n.
// Fixed using c_i-1,j = (j - lastStartJ >= cij.length ? 1 : cij[j - lastStartJ]
val = (cij[j - lastStartJ] * i + val * j) / ((double) i + j);
cij[j - startJ] = val;
}
// Must keep the remaining values in C_i,j as 1 to allow
// cij[j - lastStartJ] * i == i when (j - lastStartJ) > lastLength
final int lastLength = length;
length = endJ - startJ;
for (int j = lastLength - length - 1; j >= 0; j--) {
cij[length + j] = 1;
}
}
// Return the most recently written value: C_n,m
return val;
}