internal FST Pack()

in src/Lucene.Net/Util/Fst/FST.cs [1659:2080]


        internal FST<T> Pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio)
        {
            // NOTE: maxDerefNodes is intentionally int: we cannot
            // support > 2.1B deref nodes

            // TODO: other things to try
            //   - renumber the nodes to get more next / better locality?
            //   - allow multiple input labels on an arc, so
            //     singular chain of inputs can take one arc (on
            //     wikipedia terms this could save another ~6%)
            //   - in the ord case, the output '1' is presumably
            //     very common (after NO_OUTPUT)... maybe use a bit
            //     for it..?
            //   - use spare bits in flags.... for top few labels /
            //     outputs / targets

            if (nodeAddress is null)
            {
                throw new ArgumentException("this FST was not built with willPackFST=true");
            }

            FST.Arc<T> arc = new FST.Arc<T>();

            FST.BytesReader r = GetBytesReader();

            int topN = Math.Min(maxDerefNodes, inCounts.Count);

            // Find top nodes with highest number of incoming arcs:
            IDictionary<int, int> topNodeMap = null;

#nullable enable

            // LUCENENET: Refactored PriorityQueue<T> subclass into PriorityComparer<T>
            // implementation, which can be passed into ValuePriorityQueue. ValuePriorityQueue
            // lives on the stack, and if the array size is small enough, we also allocate the
            // array on the stack. Fallback to the array pool if it is beyond MaxStackByteLimit.
            int bufferSize = PriorityQueue.GetArrayHeapSize(topN);
            bool usePool = FST.NodeAndInCount.ByteSize * bufferSize > Constants.MaxStackByteLimit;
            FST.NodeAndInCount?[]? arrayToReturnToPool = usePool ? ArrayPool<FST.NodeAndInCount?>.Shared.Rent(bufferSize) : null;
            try
            {
                Span<FST.NodeAndInCount?> buffer = usePool ? arrayToReturnToPool : stackalloc FST.NodeAndInCount?[bufferSize];
                var q = new ValuePriorityQueue<FST.NodeAndInCount?>(buffer, FST.NodeComparer.Default);
#nullable restore

                // TODO: we could use more RAM efficient selection algo here...
                FST.NodeAndInCount? bottom = null;

                for (int node = 0; node < inCounts.Count; node++)
                {
                    if (inCounts.Get(node) >= minInCountDeref)
                    {
                        if (!bottom.HasValue)
                        {
                            q.Add(new FST.NodeAndInCount(node, (int)inCounts.Get(node)));
                            if (q.Count == topN)
                            {
                                bottom = q.Top;
                            }
                        }
                        else if (inCounts.Get(node) > bottom.Value.Count)
                        {
                            q.InsertWithOverflow(new FST.NodeAndInCount(node, (int)inCounts.Get(node)));
                        }
                    }
                }

                // Free up RAM:
                inCounts = null;

                topNodeMap = new Dictionary<int, int>(q.Count); // LUCENENET: Allocate the dictionary prior to the loop
                for (int downTo = q.Count - 1; downTo >= 0; downTo--)
                {
                    FST.NodeAndInCount? n = q.Pop();
                    topNodeMap[n.Value.Node] = downTo; // LUCENENET: we are bound in the loop by Count, so we won't go past the end and get null here.
                    //System.out.println("map node=" + n.Node + " inCount=" + n.count + " to newID=" + downTo);
                }
            }
            finally
            {
                if (arrayToReturnToPool is not null)
                    ArrayPool<FST.NodeAndInCount?>.Shared.Return(arrayToReturnToPool);
            }

            // +1 because node ords start at 1 (0 is reserved as stop node):
            GrowableWriter newNodeAddress = new GrowableWriter(PackedInt32s.BitsRequired(this.bytes.Position), (int)(1 + nodeCount), acceptableOverheadRatio);

            // Fill initial coarse guess:
            for (int node = 1; node <= nodeCount; node++)
            {
                newNodeAddress.Set(node, 1 + this.bytes.Position - nodeAddress.Get(node));
            }

            int absCount;
            int deltaCount;
            int topCount;
            int nextCount;

            FST<T> fst;

            // Iterate until we converge:
            while (true)
            {
                //System.out.println("\nITER");
                bool changed = false;

                // for assert:
                bool negDelta = false;

                fst = new FST<T>(inputType, Outputs, bytes.BlockBits);

                BytesStore writer = fst.bytes;

                // Skip 0 byte since 0 is reserved target:
                writer.WriteByte(0);

                fst.arcWithOutputCount = 0;
                fst.nodeCount = 0;
                fst.arcCount = 0;

                absCount = deltaCount = topCount = nextCount = 0;

                int changedCount = 0;

                long addressError = 0;

                //int totWasted = 0;

                // Since we re-reverse the bytes, we now write the
                // nodes backwards, so that BIT_TARGET_NEXT is
                // unchanged:
                for (int node = (int)nodeCount; node >= 1; node--)
                {
                    fst.nodeCount++;
                    long address = writer.Position;

                    //System.out.println("  node: " + node + " address=" + address);
                    if (address != newNodeAddress.Get(node))
                    {
                        addressError = address - newNodeAddress.Get(node);
                        //System.out.println("    change: " + (address - newNodeAddress[node]));
                        changed = true;
                        newNodeAddress.Set(node, address);
                        changedCount++;
                    }

                    int nodeArcCount = 0;
                    int bytesPerArc = 0;

                    bool retry = false;

                    // for assert:
                    bool anyNegDelta = false;

                    // Retry loop: possibly iterate more than once, if
                    // this is an array'd node and bytesPerArc changes:
                    while (true) // retry writing this node
                    {
                        //System.out.println("  cycle: retry");
                        ReadFirstRealTargetArc(node, arc, r);

                        bool useArcArray = arc.BytesPerArc != 0;
                        if (useArcArray)
                        {
                            // Write false first arc:
                            if (bytesPerArc == 0)
                            {
                                bytesPerArc = arc.BytesPerArc;
                            }
                            writer.WriteByte((byte)FST.ARCS_AS_FIXED_ARRAY);
                            writer.WriteVInt32(arc.NumArcs);
                            writer.WriteVInt32(bytesPerArc);
                            //System.out.println("node " + node + ": " + arc.numArcs + " arcs");
                        }

                        int maxBytesPerArc = 0;
                        //int wasted = 0;
                        while (true) // iterate over all arcs for this node
                        {
                            //System.out.println("    cycle next arc");

                            long arcStartPos = writer.Position;
                            nodeArcCount++;

                            sbyte flags = 0;

                            if (arc.IsLast)
                            {
                                flags += (sbyte)FST.BIT_LAST_ARC;
                            }
                            /*
                            if (!useArcArray && nodeUpto < nodes.length-1 && arc.Target == nodes[nodeUpto+1]) {
                              flags += BIT_TARGET_NEXT;
                            }
                            */
                            if (!useArcArray && node != 1 && arc.Target == node - 1)
                            {
                                flags += (sbyte)FST.BIT_TARGET_NEXT;
                                if (!retry)
                                {
                                    nextCount++;
                                }
                            }
                            if (arc.IsFinal)
                            {
                                flags += (sbyte)FST.BIT_FINAL_ARC;
                                if (arc.NextFinalOutput != NO_OUTPUT)
                                {
                                    flags += (sbyte)FST.BIT_ARC_HAS_FINAL_OUTPUT;
                                }
                            }
                            else
                            {
                                if (Debugging.AssertsEnabled) Debugging.Assert(arc.NextFinalOutput == NO_OUTPUT);
                            }
                            if (!TargetHasArcs(arc))
                            {
                                flags += (sbyte)FST.BIT_STOP_NODE;
                            }

                            if (arc.Output != NO_OUTPUT)
                            {
                                flags += (sbyte)FST.BIT_ARC_HAS_OUTPUT;
                            }

                            long absPtr;
                            bool doWriteTarget = TargetHasArcs(arc) && (flags & FST.BIT_TARGET_NEXT) == 0;
                            if (doWriteTarget)
                            {
                                if (topNodeMap.TryGetValue((int)arc.Target, out int ptr))
                                {
                                    absPtr = ptr;
                                }
                                else
                                {
                                    absPtr = topNodeMap.Count + newNodeAddress.Get((int)arc.Target) + addressError;
                                }

                                long delta = newNodeAddress.Get((int)arc.Target) + addressError - writer.Position - 2;
                                if (delta < 0)
                                {
                                    //System.out.println("neg: " + delta);
                                    anyNegDelta = true;
                                    delta = 0;
                                }

                                if (delta < absPtr)
                                {
                                    flags |= FST.BIT_TARGET_DELTA;
                                }
                            }
                            else
                            {
                                absPtr = 0;
                            }

                            if (Debugging.AssertsEnabled) Debugging.Assert(flags != FST.ARCS_AS_FIXED_ARRAY);
                            writer.WriteByte((byte)flags);

                            fst.WriteLabel(writer, arc.Label);

                            if (arc.Output != NO_OUTPUT)
                            {
                                Outputs.Write(arc.Output, writer);
                                if (!retry)
                                {
                                    fst.arcWithOutputCount++;
                                }
                            }
                            if (arc.NextFinalOutput != NO_OUTPUT)
                            {
                                Outputs.WriteFinalOutput(arc.NextFinalOutput, writer);
                            }

                            if (doWriteTarget)
                            {
                                long delta = newNodeAddress.Get((int)arc.Target) + addressError - writer.Position;
                                if (delta < 0)
                                {
                                    anyNegDelta = true;
                                    //System.out.println("neg: " + delta);
                                    delta = 0;
                                }

                                if (Flag(flags, FST.BIT_TARGET_DELTA))
                                {
                                    //System.out.println("        delta");
                                    writer.WriteVInt64(delta);
                                    if (!retry)
                                    {
                                        deltaCount++;
                                    }
                                }
                                else
                                {
                                    /*
                                    if (ptr != null) {
                                      System.out.println("        deref");
                                    } else {
                                      System.out.println("        abs");
                                    }
                                    */
                                    writer.WriteVInt64(absPtr);
                                    if (!retry)
                                    {
                                        if (absPtr >= topNodeMap.Count)
                                        {
                                            absCount++;
                                        }
                                        else
                                        {
                                            topCount++;
                                        }
                                    }
                                }
                            }

                            if (useArcArray)
                            {
                                int arcBytes = (int)(writer.Position - arcStartPos);
                                //System.out.println("  " + arcBytes + " bytes");
                                maxBytesPerArc = Math.Max(maxBytesPerArc, arcBytes);
                                // NOTE: this may in fact go "backwards", if
                                // somehow (rarely, possibly never) we use
                                // more bytesPerArc in this rewrite than the
                                // incoming FST did... but in this case we
                                // will retry (below) so it's OK to ovewrite
                                // bytes:
                                //wasted += bytesPerArc - arcBytes;
                                writer.SkipBytes((int)(arcStartPos + bytesPerArc - writer.Position));
                            }

                            if (arc.IsLast)
                            {
                                break;
                            }

                            ReadNextRealArc(arc, r);
                        }

                        if (useArcArray)
                        {
                            if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc <= bytesPerArc))
                            {
                                // converged
                                //System.out.println("  bba=" + bytesPerArc + " wasted=" + wasted);
                                //totWasted += wasted;
                                break;
                            }
                        }
                        else
                        {
                            break;
                        }

                        //System.out.println("  retry this node maxBytesPerArc=" + maxBytesPerArc + " vs " + bytesPerArc);

                        // Retry:
                        bytesPerArc = maxBytesPerArc;
                        writer.Truncate(address);
                        nodeArcCount = 0;
                        retry = true;
                        anyNegDelta = false;
                        //writeNodeContinue:;
                    }
                    //writeNodeBreak:

                    negDelta |= anyNegDelta;

                    fst.arcCount += nodeArcCount;
                }

                if (!changed)
                {
                    // We don't renumber the nodes (just reverse their
                    // order) so nodes should only point forward to
                    // other nodes because we only produce acyclic FSTs
                    // w/ nodes only pointing "forwards":
                    if (Debugging.AssertsEnabled) Debugging.Assert(!negDelta);
                    //System.out.println("TOT wasted=" + totWasted);
                    // Converged!
                    break;
                }
                //System.out.println("  " + changedCount + " of " + fst.nodeCount + " changed; retry");
            }

            long maxAddress = 0;
            foreach (long key in topNodeMap.Keys)
            {
                maxAddress = Math.Max(maxAddress, newNodeAddress.Get((int)key));
            }

            PackedInt32s.Mutable nodeRefToAddressIn = PackedInt32s.GetMutable(topNodeMap.Count, PackedInt32s.BitsRequired(maxAddress), acceptableOverheadRatio);
            foreach (KeyValuePair<int, int> ent in topNodeMap)
            {
                nodeRefToAddressIn.Set(ent.Value, newNodeAddress.Get(ent.Key));
            }
            fst.nodeRefToAddress = nodeRefToAddressIn;

            fst.startNode = newNodeAddress.Get((int)startNode);
            //System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode);

            if (!JCG.EqualityComparer<T>.Default.Equals(emptyOutput, default))
            {
                fst.EmptyOutput = emptyOutput;
            }

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(fst.nodeCount == nodeCount,"fst.nodeCount={0} nodeCount={1}", fst.nodeCount, nodeCount);
                Debugging.Assert(fst.arcCount == arcCount);
                Debugging.Assert(fst.arcWithOutputCount == arcWithOutputCount,"fst.arcWithOutputCount={0} arcWithOutputCount={1}", fst.arcWithOutputCount, arcWithOutputCount);
            }

            fst.bytes.Finish();
            fst.CacheRootArcs();

            //final int size = fst.sizeInBytes();
            //System.out.println("nextCount=" + nextCount + " topCount=" + topCount + " deltaCount=" + deltaCount + " absCount=" + absCount);

            return fst;
        }