in src/Lucene.Net/Util/Fst/FST.cs [1659:2080]
internal FST<T> Pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio)
{
// NOTE: maxDerefNodes is intentionally int: we cannot
// support > 2.1B deref nodes
// TODO: other things to try
// - renumber the nodes to get more next / better locality?
// - allow multiple input labels on an arc, so
// singular chain of inputs can take one arc (on
// wikipedia terms this could save another ~6%)
// - in the ord case, the output '1' is presumably
// very common (after NO_OUTPUT)... maybe use a bit
// for it..?
// - use spare bits in flags.... for top few labels /
// outputs / targets
if (nodeAddress is null)
{
throw new ArgumentException("this FST was not built with willPackFST=true");
}
FST.Arc<T> arc = new FST.Arc<T>();
FST.BytesReader r = GetBytesReader();
int topN = Math.Min(maxDerefNodes, inCounts.Count);
// Find top nodes with highest number of incoming arcs:
IDictionary<int, int> topNodeMap = null;
#nullable enable
// LUCENENET: Refactored PriorityQueue<T> subclass into PriorityComparer<T>
// implementation, which can be passed into ValuePriorityQueue. ValuePriorityQueue
// lives on the stack, and if the array size is small enough, we also allocate the
// array on the stack. Fallback to the array pool if it is beyond MaxStackByteLimit.
int bufferSize = PriorityQueue.GetArrayHeapSize(topN);
bool usePool = FST.NodeAndInCount.ByteSize * bufferSize > Constants.MaxStackByteLimit;
FST.NodeAndInCount?[]? arrayToReturnToPool = usePool ? ArrayPool<FST.NodeAndInCount?>.Shared.Rent(bufferSize) : null;
try
{
Span<FST.NodeAndInCount?> buffer = usePool ? arrayToReturnToPool : stackalloc FST.NodeAndInCount?[bufferSize];
var q = new ValuePriorityQueue<FST.NodeAndInCount?>(buffer, FST.NodeComparer.Default);
#nullable restore
// TODO: we could use more RAM efficient selection algo here...
FST.NodeAndInCount? bottom = null;
for (int node = 0; node < inCounts.Count; node++)
{
if (inCounts.Get(node) >= minInCountDeref)
{
if (!bottom.HasValue)
{
q.Add(new FST.NodeAndInCount(node, (int)inCounts.Get(node)));
if (q.Count == topN)
{
bottom = q.Top;
}
}
else if (inCounts.Get(node) > bottom.Value.Count)
{
q.InsertWithOverflow(new FST.NodeAndInCount(node, (int)inCounts.Get(node)));
}
}
}
// Free up RAM:
inCounts = null;
topNodeMap = new Dictionary<int, int>(q.Count); // LUCENENET: Allocate the dictionary prior to the loop
for (int downTo = q.Count - 1; downTo >= 0; downTo--)
{
FST.NodeAndInCount? n = q.Pop();
topNodeMap[n.Value.Node] = downTo; // LUCENENET: we are bound in the loop by Count, so we won't go past the end and get null here.
//System.out.println("map node=" + n.Node + " inCount=" + n.count + " to newID=" + downTo);
}
}
finally
{
if (arrayToReturnToPool is not null)
ArrayPool<FST.NodeAndInCount?>.Shared.Return(arrayToReturnToPool);
}
// +1 because node ords start at 1 (0 is reserved as stop node):
GrowableWriter newNodeAddress = new GrowableWriter(PackedInt32s.BitsRequired(this.bytes.Position), (int)(1 + nodeCount), acceptableOverheadRatio);
// Fill initial coarse guess:
for (int node = 1; node <= nodeCount; node++)
{
newNodeAddress.Set(node, 1 + this.bytes.Position - nodeAddress.Get(node));
}
int absCount;
int deltaCount;
int topCount;
int nextCount;
FST<T> fst;
// Iterate until we converge:
while (true)
{
//System.out.println("\nITER");
bool changed = false;
// for assert:
bool negDelta = false;
fst = new FST<T>(inputType, Outputs, bytes.BlockBits);
BytesStore writer = fst.bytes;
// Skip 0 byte since 0 is reserved target:
writer.WriteByte(0);
fst.arcWithOutputCount = 0;
fst.nodeCount = 0;
fst.arcCount = 0;
absCount = deltaCount = topCount = nextCount = 0;
int changedCount = 0;
long addressError = 0;
//int totWasted = 0;
// Since we re-reverse the bytes, we now write the
// nodes backwards, so that BIT_TARGET_NEXT is
// unchanged:
for (int node = (int)nodeCount; node >= 1; node--)
{
fst.nodeCount++;
long address = writer.Position;
//System.out.println(" node: " + node + " address=" + address);
if (address != newNodeAddress.Get(node))
{
addressError = address - newNodeAddress.Get(node);
//System.out.println(" change: " + (address - newNodeAddress[node]));
changed = true;
newNodeAddress.Set(node, address);
changedCount++;
}
int nodeArcCount = 0;
int bytesPerArc = 0;
bool retry = false;
// for assert:
bool anyNegDelta = false;
// Retry loop: possibly iterate more than once, if
// this is an array'd node and bytesPerArc changes:
while (true) // retry writing this node
{
//System.out.println(" cycle: retry");
ReadFirstRealTargetArc(node, arc, r);
bool useArcArray = arc.BytesPerArc != 0;
if (useArcArray)
{
// Write false first arc:
if (bytesPerArc == 0)
{
bytesPerArc = arc.BytesPerArc;
}
writer.WriteByte((byte)FST.ARCS_AS_FIXED_ARRAY);
writer.WriteVInt32(arc.NumArcs);
writer.WriteVInt32(bytesPerArc);
//System.out.println("node " + node + ": " + arc.numArcs + " arcs");
}
int maxBytesPerArc = 0;
//int wasted = 0;
while (true) // iterate over all arcs for this node
{
//System.out.println(" cycle next arc");
long arcStartPos = writer.Position;
nodeArcCount++;
sbyte flags = 0;
if (arc.IsLast)
{
flags += (sbyte)FST.BIT_LAST_ARC;
}
/*
if (!useArcArray && nodeUpto < nodes.length-1 && arc.Target == nodes[nodeUpto+1]) {
flags += BIT_TARGET_NEXT;
}
*/
if (!useArcArray && node != 1 && arc.Target == node - 1)
{
flags += (sbyte)FST.BIT_TARGET_NEXT;
if (!retry)
{
nextCount++;
}
}
if (arc.IsFinal)
{
flags += (sbyte)FST.BIT_FINAL_ARC;
if (arc.NextFinalOutput != NO_OUTPUT)
{
flags += (sbyte)FST.BIT_ARC_HAS_FINAL_OUTPUT;
}
}
else
{
if (Debugging.AssertsEnabled) Debugging.Assert(arc.NextFinalOutput == NO_OUTPUT);
}
if (!TargetHasArcs(arc))
{
flags += (sbyte)FST.BIT_STOP_NODE;
}
if (arc.Output != NO_OUTPUT)
{
flags += (sbyte)FST.BIT_ARC_HAS_OUTPUT;
}
long absPtr;
bool doWriteTarget = TargetHasArcs(arc) && (flags & FST.BIT_TARGET_NEXT) == 0;
if (doWriteTarget)
{
if (topNodeMap.TryGetValue((int)arc.Target, out int ptr))
{
absPtr = ptr;
}
else
{
absPtr = topNodeMap.Count + newNodeAddress.Get((int)arc.Target) + addressError;
}
long delta = newNodeAddress.Get((int)arc.Target) + addressError - writer.Position - 2;
if (delta < 0)
{
//System.out.println("neg: " + delta);
anyNegDelta = true;
delta = 0;
}
if (delta < absPtr)
{
flags |= FST.BIT_TARGET_DELTA;
}
}
else
{
absPtr = 0;
}
if (Debugging.AssertsEnabled) Debugging.Assert(flags != FST.ARCS_AS_FIXED_ARRAY);
writer.WriteByte((byte)flags);
fst.WriteLabel(writer, arc.Label);
if (arc.Output != NO_OUTPUT)
{
Outputs.Write(arc.Output, writer);
if (!retry)
{
fst.arcWithOutputCount++;
}
}
if (arc.NextFinalOutput != NO_OUTPUT)
{
Outputs.WriteFinalOutput(arc.NextFinalOutput, writer);
}
if (doWriteTarget)
{
long delta = newNodeAddress.Get((int)arc.Target) + addressError - writer.Position;
if (delta < 0)
{
anyNegDelta = true;
//System.out.println("neg: " + delta);
delta = 0;
}
if (Flag(flags, FST.BIT_TARGET_DELTA))
{
//System.out.println(" delta");
writer.WriteVInt64(delta);
if (!retry)
{
deltaCount++;
}
}
else
{
/*
if (ptr != null) {
System.out.println(" deref");
} else {
System.out.println(" abs");
}
*/
writer.WriteVInt64(absPtr);
if (!retry)
{
if (absPtr >= topNodeMap.Count)
{
absCount++;
}
else
{
topCount++;
}
}
}
}
if (useArcArray)
{
int arcBytes = (int)(writer.Position - arcStartPos);
//System.out.println(" " + arcBytes + " bytes");
maxBytesPerArc = Math.Max(maxBytesPerArc, arcBytes);
// NOTE: this may in fact go "backwards", if
// somehow (rarely, possibly never) we use
// more bytesPerArc in this rewrite than the
// incoming FST did... but in this case we
// will retry (below) so it's OK to ovewrite
// bytes:
//wasted += bytesPerArc - arcBytes;
writer.SkipBytes((int)(arcStartPos + bytesPerArc - writer.Position));
}
if (arc.IsLast)
{
break;
}
ReadNextRealArc(arc, r);
}
if (useArcArray)
{
if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc <= bytesPerArc))
{
// converged
//System.out.println(" bba=" + bytesPerArc + " wasted=" + wasted);
//totWasted += wasted;
break;
}
}
else
{
break;
}
//System.out.println(" retry this node maxBytesPerArc=" + maxBytesPerArc + " vs " + bytesPerArc);
// Retry:
bytesPerArc = maxBytesPerArc;
writer.Truncate(address);
nodeArcCount = 0;
retry = true;
anyNegDelta = false;
//writeNodeContinue:;
}
//writeNodeBreak:
negDelta |= anyNegDelta;
fst.arcCount += nodeArcCount;
}
if (!changed)
{
// We don't renumber the nodes (just reverse their
// order) so nodes should only point forward to
// other nodes because we only produce acyclic FSTs
// w/ nodes only pointing "forwards":
if (Debugging.AssertsEnabled) Debugging.Assert(!negDelta);
//System.out.println("TOT wasted=" + totWasted);
// Converged!
break;
}
//System.out.println(" " + changedCount + " of " + fst.nodeCount + " changed; retry");
}
long maxAddress = 0;
foreach (long key in topNodeMap.Keys)
{
maxAddress = Math.Max(maxAddress, newNodeAddress.Get((int)key));
}
PackedInt32s.Mutable nodeRefToAddressIn = PackedInt32s.GetMutable(topNodeMap.Count, PackedInt32s.BitsRequired(maxAddress), acceptableOverheadRatio);
foreach (KeyValuePair<int, int> ent in topNodeMap)
{
nodeRefToAddressIn.Set(ent.Value, newNodeAddress.Get(ent.Key));
}
fst.nodeRefToAddress = nodeRefToAddressIn;
fst.startNode = newNodeAddress.Get((int)startNode);
//System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode);
if (!JCG.EqualityComparer<T>.Default.Equals(emptyOutput, default))
{
fst.EmptyOutput = emptyOutput;
}
if (Debugging.AssertsEnabled)
{
Debugging.Assert(fst.nodeCount == nodeCount,"fst.nodeCount={0} nodeCount={1}", fst.nodeCount, nodeCount);
Debugging.Assert(fst.arcCount == arcCount);
Debugging.Assert(fst.arcWithOutputCount == arcWithOutputCount,"fst.arcWithOutputCount={0} arcWithOutputCount={1}", fst.arcWithOutputCount, arcWithOutputCount);
}
fst.bytes.Finish();
fst.CacheRootArcs();
//final int size = fst.sizeInBytes();
//System.out.println("nextCount=" + nextCount + " topCount=" + topCount + " deltaCount=" + deltaCount + " absCount=" + absCount);
return fst;
}