private void build()

in lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java [1922:2130]


  /**
   * Recursively builds the BKD tree over the given slice of points, writing completed leaf blocks
   * to {@code out} and recording inner-node split metadata into the caller-supplied arrays.
   *
   * <p>Base case ({@code numLeaves == 1}): the slice is materialized on heap if necessary, sorted
   * along the dimension whose remaining (post-common-prefix) byte has the fewest distinct values
   * (best compression), and written as one leaf block (docIDs first, then common prefixes, then
   * packed values). Recursive case: a split dimension is chosen, the points are partitioned around
   * a selected split value via {@code radixSelector}, and both halves are built recursively.
   *
   * @param leavesOffset index of the first leaf produced by this subtree; used to address into
   *     {@code splitPackedValues}/{@code splitDimensionValues} and {@code leafBlockFPs}
   * @param numLeaves number of leaves this subtree will produce; {@code 1} means "write a leaf"
   * @param points the slice of points (writer + start/count) covered by this subtree
   * @param out output the leaf blocks are written to
   * @param radixSelector used both to partition points at inner nodes and to sort leaf points
   * @param minPackedValue minimum packed value of this subtree (index dims only); may be
   *     tightened in place when exact bounds are recomputed
   * @param maxPackedValue maximum packed value of this subtree (index dims only); may be
   *     tightened in place when exact bounds are recomputed
   * @param parentSplits per-dimension count of splits performed by ancestors; incremented before
   *     recursing and restored afterwards
   * @param splitPackedValues per-node split values, {@code config.bytesPerDim()} bytes each
   * @param splitDimensionValues per-node split dimension, one byte each
   * @param leafBlockFPs receives the file pointer of each leaf block, in leaf order
   * @param totalNumLeaves total number of leaves in the whole tree (used to skip the exact-bounds
   *     recomputation at the root, where bounds are already exact)
   * @param spareDocIds scratch buffer for leaf docIDs; must hold at least
   *     {@code config.maxPointsInLeafNode()} entries
   * @throws IOException if writing to {@code out} fails
   */
  private void build(
      int leavesOffset,
      int numLeaves,
      BKDRadixSelector.PathSlice points,
      IndexOutput out,
      BKDRadixSelector radixSelector,
      byte[] minPackedValue,
      byte[] maxPackedValue,
      int[] parentSplits,
      byte[] splitPackedValues,
      byte[] splitDimensionValues,
      PackedLongValues.Builder leafBlockFPs,
      int totalNumLeaves,
      int[] spareDocIds)
      throws IOException {

    if (numLeaves == 1) {

      // Leaf node: write block
      // We can write the block in any order so by default we write it sorted by the dimension that
      // has the
      // least number of unique bytes at commonPrefixLengths[dim], which makes compression more
      // efficient
      HeapPointWriter heapSource;
      if (points.writer() instanceof HeapPointWriter == false) {
        // Adversarial cases can cause this, e.g. merging big segments with most of the points
        // deleted
        heapSource = switchToHeap(points.writer());
      } else {
        heapSource = (HeapPointWriter) points.writer();
      }

      // Leaf sizes fit in an int by construction (at most config.maxPointsInLeafNode() points).
      int from = Math.toIntExact(points.start());
      int to = Math.toIntExact(points.start() + points.count());
      // we store common prefix on scratch; lengths land in the commonPrefixLengths field
      computeCommonPrefixLength(heapSource, scratch, from, to);

      int sortedDim = 0;
      int sortedDimCardinality = Integer.MAX_VALUE;
      // One 256-bit set per dimension that still has bytes beyond its common prefix; tracks which
      // byte values occur at the first non-prefix position.
      FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()];
      for (int dim = 0; dim < config.numDims(); ++dim) {
        if (commonPrefixLengths[dim] < config.bytesPerDim()) {
          usedBytes[dim] = new FixedBitSet(256);
        }
      }
      // Find the dimension to compress: the one with the fewest distinct byte values right after
      // the common prefix compresses best when the leaf is sorted by it.
      for (int dim = 0; dim < config.numDims(); dim++) {
        int prefix = commonPrefixLengths[dim];
        if (prefix < config.bytesPerDim()) {
          int offset = dim * config.bytesPerDim();
          for (int i = from; i < to; ++i) {
            PointValue value = heapSource.getPackedValueSlice(i);
            BytesRef packedValue = value.packedValue();
            int bucket = packedValue.bytes[packedValue.offset + offset + prefix] & 0xff;
            usedBytes[dim].set(bucket);
          }
          int cardinality = usedBytes[dim].cardinality();
          if (cardinality < sortedDimCardinality) {
            sortedDim = dim;
            sortedDimCardinality = cardinality;
          }
        }
      }

      // sort the chosen dimension (only the bytes past the shared prefix need comparing)
      radixSelector.heapRadixSort(heapSource, from, to, sortedDim, commonPrefixLengths[sortedDim]);
      // compute cardinality (number of distinct packed values), used by the values encoder
      int leafCardinality = heapSource.computeCardinality(from, to, commonPrefixLengths);

      // Save the block file pointer:
      leafBlockFPs.add(out.getFilePointer());
      // System.out.println("  write leaf block @ fp=" + out.getFilePointer());

      // Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o
      // loading the values:
      int count = to - from;
      assert count > 0 : "numLeaves=" + numLeaves + " leavesOffset=" + leavesOffset;
      assert count <= spareDocIds.length : "count=" + count + " > length=" + spareDocIds.length;
      // Write doc IDs (copied into the reusable scratch buffer in sorted-point order)
      int[] docIDs = spareDocIds;
      for (int i = 0; i < count; i++) {
        docIDs[i] = heapSource.getPackedValueSlice(from + i).docID();
      }
      writeLeafBlockDocs(out, docIDs, 0, count);

      // TODO: minor opto: we don't really have to write the actual common prefixes, because
      // BKDReader on recursing can regenerate it for us
      // from the index, much like how terms dict does so from the FST:

      // Write the common prefixes:
      writeCommonPrefixes(out, commonPrefixLengths, scratch);

      // Write the full values:
      IntFunction<BytesRef> packedValues =
          i -> heapSource.getPackedValueSlice(from + i).packedValue();
      assert valuesInOrderAndBounds(
          config, count, sortedDim, minPackedValue, maxPackedValue, packedValues, docIDs, 0);
      writeLeafBlockPackedValues(
          out, commonPrefixLengths, count, sortedDim, packedValues, leafCardinality);

    } else {
      // Inner node: partition/recurse

      final int splitDim;
      if (config.numIndexDims() == 1) {
        splitDim = 0;
      } else {
        // for dimensions > 2 we recompute the bounds for the current inner node to help the
        // algorithm choose best
        // split dimensions. Because it is an expensive operation, the frequency we recompute the
        // bounds is given
        // by SPLITS_BEFORE_EXACT_BOUNDS. (Skipped at the root, where numLeaves == totalNumLeaves
        // and the incoming bounds are already exact.)
        if (numLeaves != totalNumLeaves
            && config.numIndexDims() > 2
            && Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) {
          computePackedValueBounds(points, minPackedValue, maxPackedValue);
        }
        splitDim = split(minPackedValue, maxPackedValue, parentSplits);
      }

      assert numLeaves <= totalNumLeaves
          : "numLeaves=" + numLeaves + " totalNumLeaves=" + totalNumLeaves;

      // How many leaves will be in the left tree:
      final int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves);
      // How many points will be in the left tree (all leaves except possibly the last are full):
      final long leftCount = numLeftLeafNodes * (long) config.maxPointsInLeafNode();

      BKDRadixSelector.PathSlice[] slices = new BKDRadixSelector.PathSlice[2];

      // Bytes of splitDim shared by min and max; the selector can skip comparing them.
      final int commonPrefixLen =
          commonPrefixComparator.compare(
              minPackedValue,
              splitDim * config.bytesPerDim(),
              maxPackedValue,
              splitDim * config.bytesPerDim());

      // Partition points around the leftCount-th smallest value of splitDim; fills slices[0]
      // (left) and slices[1] (right), returning the split value.
      byte[] splitValue =
          radixSelector.select(
              points,
              slices,
              points.start(),
              points.start() + points.count(),
              points.start() + leftCount,
              splitDim,
              commonPrefixLen);

      final int rightOffset = leavesOffset + numLeftLeafNodes;
      // Inner-node metadata is addressed by (first leaf of right subtree - 1).
      final int splitValueOffset = rightOffset - 1;

      splitDimensionValues[splitValueOffset] = (byte) splitDim;
      int address = splitValueOffset * config.bytesPerDim();
      System.arraycopy(splitValue, 0, splitPackedValues, address, config.bytesPerDim());

      // Child bounds: left keeps min, gets max clamped to splitValue on splitDim; right keeps
      // max, gets min raised to splitValue on splitDim.
      byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength()];
      System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength());

      byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength()];
      System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength());

      System.arraycopy(
          splitValue,
          0,
          minSplitPackedValue,
          splitDim * config.bytesPerDim(),
          config.bytesPerDim());
      System.arraycopy(
          splitValue,
          0,
          maxSplitPackedValue,
          splitDim * config.bytesPerDim(),
          config.bytesPerDim());

      // Record this split for descendants' split-dimension heuristic; undone below.
      parentSplits[splitDim]++;
      // Recurse on left tree:
      build(
          leavesOffset,
          numLeftLeafNodes,
          slices[0],
          out,
          radixSelector,
          minPackedValue,
          maxSplitPackedValue,
          parentSplits,
          splitPackedValues,
          splitDimensionValues,
          leafBlockFPs,
          totalNumLeaves,
          spareDocIds);

      // Recurse on right tree:
      build(
          rightOffset,
          numLeaves - numLeftLeafNodes,
          slices[1],
          out,
          radixSelector,
          minSplitPackedValue,
          maxPackedValue,
          parentSplits,
          splitPackedValues,
          splitDimensionValues,
          leafBlockFPs,
          totalNumLeaves,
          spareDocIds);

      // Restore the split count so siblings see the same parent state.
      parentSplits[splitDim]--;
    }
  }