in commons-rng-sampling/src/main/java/org/apache/commons/rng/sampling/distribution/FastLoadedDiceRollerDiscreteSampler.java [652:705]
private static FastLoadedDiceRollerDiscreteSampler createSampler(UniformRandomProvider rng,
long[] frequencies,
int[] indices,
long m) {
// ALGORITHM 5: PREPROCESS
// a == frequencies
// m = sum(a)
// h = leaf node count
// H = leaf node label (lH)
final int n = frequencies.length;
// k = ceil(log2(m))
final int k = 64 - Long.numberOfLeadingZeros(m - 1);
// r = a(n+1) = 2^k - m
final long r = (1L << k) - m;
// Note:
// A sparse matrix can often be used for H, as most of its entries are empty.
// This implementation uses a 1D array for efficiency at the cost of memory.
// This is limited to approximately ((2^31 - 1) / k), e.g. 34087042 when the sum of
// observations is large enough to create k=63.
// This could be handled using a 2D array. In practice a number of categories this
// large is not expected and is currently not supported.
final int[] h = new int[k];
final int[] lH = new int[checkArraySize((n + 1L) * k)];
Arrays.fill(lH, NO_LABEL);
int d;
for (int j = 0; j < k; j++) {
final int shift = (k - 1) - j;
final long bitMask = 1L << shift;
d = 0;
for (final int i : indices) {
// bool w ← (a[i] >> (k − 1) − j)) & 1
// h[j] = h[j] + w
// if w then:
if ((frequencies[i] & bitMask) != 0) {
h[j]++;
// H[d][j] = i
lH[d * k + j] = i;
d++;
}
}
// process a(n+1) without extending the input frequencies array by 1
if ((r & bitMask) != 0) {
h[j]++;
lH[d * k + j] = n;
}
}
return new FLDRSampler(rng, n, k, h, lH);
}