in lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java [1922:2130]
private void build(
int leavesOffset,
int numLeaves,
BKDRadixSelector.PathSlice points,
IndexOutput out,
BKDRadixSelector radixSelector,
byte[] minPackedValue,
byte[] maxPackedValue,
int[] parentSplits,
byte[] splitPackedValues,
byte[] splitDimensionValues,
PackedLongValues.Builder leafBlockFPs,
int totalNumLeaves,
int[] spareDocIds)
throws IOException {
if (numLeaves == 1) {
// Leaf node: write block
// We can write the block in any order so by default we write it sorted by the dimension that
// has the
// least number of unique bytes at commonPrefixLengths[dim], which makes compression more
// efficient
HeapPointWriter heapSource;
if (points.writer() instanceof HeapPointWriter == false) {
// Adversarial cases can cause this, e.g. merging big segments with most of the points
// deleted
heapSource = switchToHeap(points.writer());
} else {
heapSource = (HeapPointWriter) points.writer();
}
int from = Math.toIntExact(points.start());
int to = Math.toIntExact(points.start() + points.count());
// we store common prefix on scratch
computeCommonPrefixLength(heapSource, scratch, from, to);
int sortedDim = 0;
int sortedDimCardinality = Integer.MAX_VALUE;
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()];
for (int dim = 0; dim < config.numDims(); ++dim) {
if (commonPrefixLengths[dim] < config.bytesPerDim()) {
usedBytes[dim] = new FixedBitSet(256);
}
}
// Find the dimension to compress
for (int dim = 0; dim < config.numDims(); dim++) {
int prefix = commonPrefixLengths[dim];
if (prefix < config.bytesPerDim()) {
int offset = dim * config.bytesPerDim();
for (int i = from; i < to; ++i) {
PointValue value = heapSource.getPackedValueSlice(i);
BytesRef packedValue = value.packedValue();
int bucket = packedValue.bytes[packedValue.offset + offset + prefix] & 0xff;
usedBytes[dim].set(bucket);
}
int cardinality = usedBytes[dim].cardinality();
if (cardinality < sortedDimCardinality) {
sortedDim = dim;
sortedDimCardinality = cardinality;
}
}
}
// sort the chosen dimension
radixSelector.heapRadixSort(heapSource, from, to, sortedDim, commonPrefixLengths[sortedDim]);
// compute cardinality
int leafCardinality = heapSource.computeCardinality(from, to, commonPrefixLengths);
// Save the block file pointer:
leafBlockFPs.add(out.getFilePointer());
// System.out.println(" write leaf block @ fp=" + out.getFilePointer());
// Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o
// loading the values:
int count = to - from;
assert count > 0 : "numLeaves=" + numLeaves + " leavesOffset=" + leavesOffset;
assert count <= spareDocIds.length : "count=" + count + " > length=" + spareDocIds.length;
// Write doc IDs
int[] docIDs = spareDocIds;
for (int i = 0; i < count; i++) {
docIDs[i] = heapSource.getPackedValueSlice(from + i).docID();
}
writeLeafBlockDocs(out, docIDs, 0, count);
// TODO: minor opto: we don't really have to write the actual common prefixes, because
// BKDReader on recursing can regenerate it for us
// from the index, much like how terms dict does so from the FST:
// Write the common prefixes:
writeCommonPrefixes(out, commonPrefixLengths, scratch);
// Write the full values:
IntFunction<BytesRef> packedValues =
i -> heapSource.getPackedValueSlice(from + i).packedValue();
assert valuesInOrderAndBounds(
config, count, sortedDim, minPackedValue, maxPackedValue, packedValues, docIDs, 0);
writeLeafBlockPackedValues(
out, commonPrefixLengths, count, sortedDim, packedValues, leafCardinality);
} else {
// Inner node: partition/recurse
final int splitDim;
if (config.numIndexDims() == 1) {
splitDim = 0;
} else {
// for dimensions > 2 we recompute the bounds for the current inner node to help the
// algorithm choose best
// split dimensions. Because it is an expensive operation, the frequency we recompute the
// bounds is given
// by SPLITS_BEFORE_EXACT_BOUNDS.
if (numLeaves != totalNumLeaves
&& config.numIndexDims() > 2
&& Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) {
computePackedValueBounds(points, minPackedValue, maxPackedValue);
}
splitDim = split(minPackedValue, maxPackedValue, parentSplits);
}
assert numLeaves <= totalNumLeaves
: "numLeaves=" + numLeaves + " totalNumLeaves=" + totalNumLeaves;
// How many leaves will be in the left tree:
final int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves);
// How many points will be in the left tree:
final long leftCount = numLeftLeafNodes * (long) config.maxPointsInLeafNode();
BKDRadixSelector.PathSlice[] slices = new BKDRadixSelector.PathSlice[2];
final int commonPrefixLen =
commonPrefixComparator.compare(
minPackedValue,
splitDim * config.bytesPerDim(),
maxPackedValue,
splitDim * config.bytesPerDim());
byte[] splitValue =
radixSelector.select(
points,
slices,
points.start(),
points.start() + points.count(),
points.start() + leftCount,
splitDim,
commonPrefixLen);
final int rightOffset = leavesOffset + numLeftLeafNodes;
final int splitValueOffset = rightOffset - 1;
splitDimensionValues[splitValueOffset] = (byte) splitDim;
int address = splitValueOffset * config.bytesPerDim();
System.arraycopy(splitValue, 0, splitPackedValues, address, config.bytesPerDim());
byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength()];
System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength());
byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength()];
System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength());
System.arraycopy(
splitValue,
0,
minSplitPackedValue,
splitDim * config.bytesPerDim(),
config.bytesPerDim());
System.arraycopy(
splitValue,
0,
maxSplitPackedValue,
splitDim * config.bytesPerDim(),
config.bytesPerDim());
parentSplits[splitDim]++;
// Recurse on left tree:
build(
leavesOffset,
numLeftLeafNodes,
slices[0],
out,
radixSelector,
minPackedValue,
maxSplitPackedValue,
parentSplits,
splitPackedValues,
splitDimensionValues,
leafBlockFPs,
totalNumLeaves,
spareDocIds);
// Recurse on right tree:
build(
rightOffset,
numLeaves - numLeftLeafNodes,
slices[1],
out,
radixSelector,
minSplitPackedValue,
maxPackedValue,
parentSplits,
splitPackedValues,
splitDimensionValues,
leafBlockFPs,
totalNumLeaves,
spareDocIds);
parentSplits[splitDim]--;
}
}