static UpdatingInterval createUpdatingInterval()

in commons-numbers-arrays/src/main/java/org/apache/commons/numbers/arrays/IndexSupport.java [38:158]


    static UpdatingInterval createUpdatingInterval(int[] k, int n) {
        // Note: A typical use case is to have a few indices. Thus the heuristics
        // in this method should be very fast when n is small.
        // We have a choice between a KeyUpdatingInterval which requires
        // sorted keys or a BitIndexUpdatingInterval which handles keys in any order.
        // The purpose of the heuristics is to avoid a very bad choice of data structure,
        // rather than choosing the best data structure in all situations. As long as the
        // choice is reasonable the speed will not impact a partition algorithm.

        // Simple cases
        if (n == 2) {
            if (k[0] == k[1]) {
                return newUpdatingInterval(k, 1);
            }
            if (k[1] < k[0]) {
                final int v = k[0];
                k[0] = k[1];
                k[1] = v;
            }
            return newUpdatingInterval(k, 2);
        }

        // Strategy: Must be fast on already ascending data.
        // Note: The recommended way to generate a lot of partition indices is to
        // generate in sequence.

        // n <= small:
        //   Modified insertion sort (naturally finds ascending data)
        // n > small:
        //   Look for ascending sequence and compact
        // else:
        //   Remove duplicates using an order(1) data structure and sort

        if (n <= INSERTION_SORT_SIZE) {
            final int unique = Sorting.insertionSortIndices(k, n);
            return newUpdatingInterval(k, unique);
        }

        if (isAscending(k, n)) {
            // For sorted keys the KeyUpdatingInterval is fast. It may be slower than the
            // BitIndexUpdatingInterval depending on data length but not significantly
            // slower and the difference is lost in the time taken for partitioning.
            // So always use the keys.
            final int unique = compressDuplicates(k, n);
            return newUpdatingInterval(k, unique);
        }

        // At least 20 indices that are partially unordered.

        // Find min/max to understand the range.
        int min = k[n - 1];
        int max = min;
        for (int i = n - 1; --i >= 0;) {
            min = Math.min(min, k[i]);
            max = Math.max(max, k[i]);
        }

        // Here we use a simple test based on the number of comparisons required
        // to perform the expected next/previous look-ups after a split.
        // It is expected that we can cut n keys a maximum of n-1 times.
        // Each cut requires a scan next/previous to divide the interval into two intervals:
        //
        //            cut
        //             |
        //        k1--------k2---------k3---- ... ---------kn    initial interval
        //         <--| find previous
        //    find next |-->
        //        k1        k2---------k3---- ... ---------kn    divided intervals
        //
        // An BitSet will scan from the cut location and find a match in time proportional to
        // the index density. Average density is (size / n) and the scanning covers 64
        // indices together: Order(2 * n * (size / n) / 64) = Order(size / 32)

        // Sorted keys: Sort time Order(n log(n)) : Splitting time Order(log(n)) (binary search approx)
        // Bit keys   : Sort time Order(1)        : Splitting time Order(size / 32)

        // Transition when n * n ~ size / 32
        // Benchmarking shows this is a reasonable approximation when size < 2^20.
        // The speed of the bit keys is approximately independent of n and proportional to size.
        // Large size observes degrading performance of the bit keys vs sorted keys.
        // We introduce a penalty for each 4x increase over size = 2^20.
        // n * n = size/32 * 2^log4(size / 2^20)
        // The transition point still favours the bit keys when sorted keys would be faster.
        // However the difference is held within 4x and the BitSet type structure is still fast
        // enough to be negligible against the speed of partitioning.

        // Transition point: n = sqrt(size/32)
        // size n
        // 2^10 5.66
        // 2^15 32.0
        // 2^20 181.0

        // Transition point: n = sqrt(size/32 * 2^(log4(size/2^20))))
        // size n
        // 2^22 512.0
        // 2^24 1448.2
        // 2^28 11585
        // 2^31 55108

        final int size = max - min + 1;

        // Divide by 32 is a shift of 5. This is reduced for each 4-fold size above 2^20.
        // At 2^31 the shift reduces to 0.
        int shift = 5;
        if (size > (1 << 20)) {
            // log4(size/2^20) == (log2(size) - 20) / 2
            shift -= (ceilLog2(size) - 20) >>> 1;
        }

        if ((long) n * n > (size >> shift)) {
            final BitIndexUpdatingInterval interval = new BitIndexUpdatingInterval(min, max);
            for (int i = n; --i >= 0;) {
                interval.set(k[i]);
            }
            return interval;
        }

        // Sort with a hash set to filter indices
        final int unique = Sorting.sortIndices(k, n);
        return new KeyUpdatingInterval(k, unique);
    }