//===- bolt/Passes/HFSortPlus.cpp - Order functions by hotness ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// hfsort+ - layout of hot functions with i-TLB cache optimization.
//
// Given an ordering of hot functions (and hence, their assignment to the
// i-TLB pages), we can divide all function calls into two categories:
// - 'short' ones that have a caller-callee distance less than a page;
// - 'long' ones where the distance exceeds a page.
// The short calls are likely to result in an i-TLB cache hit. For the long
// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
// often the page is accessed). Assuming that functions are sent to the i-TLB
// cache in a random order, the probability that a page is present in the
// cache is proportional to the number of samples corresponding to the
// functions on the page. The following algorithm detects short and long
// calls, and optimizes the expected number of cache misses for the long ones.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/HFSort.h"
#include "llvm/Support/CommandLine.h"

#include <cmath>
#include <set>
#include <vector>

#define DEBUG_TYPE "hfsort"

using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltOptCategory;

cl::opt<unsigned> ITLBPageSize("itlb-page-size",
                               cl::desc("The size of i-tlb cache page"),
                               cl::init(4096), cl::ReallyHidden,
                               cl::ZeroOrMore, cl::cat(BoltOptCategory));

cl::opt<unsigned> ITLBEntries("itlb-entries",
                              cl::desc("The number of entries in i-tlb cache"),
                              cl::init(16), cl::ReallyHidden, cl::ZeroOrMore,
                              cl::cat(BoltOptCategory));

static cl::opt<unsigned> ITLBDensity("itlb-density",
                                     cl::desc("The density of i-tlb cache"),
                                     cl::init(4096), cl::ReallyHidden,
                                     cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<double> MergeProbability(
    "merge-probability",
    cl::desc("The minimum probability of a call for merging two clusters"),
    cl::init(0.9), cl::ReallyHidden, cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<double> ArcThreshold(
    "arc-threshold",
    cl::desc("The threshold for ignoring arcs with a small relative weight"),
    cl::init(0.00000001), cl::ReallyHidden, cl::ZeroOrMore,
    cl::cat(BoltOptCategory));

} // namespace opts
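// For illustration only: with the defaults above, the model assumes 4096-byte
// i-TLB pages and a 16-entry cache. Since all five options are registered as
// llvm::cl flags, they can be overridden when invoking BOLT, e.g. this
// hypothetical invocation doubles the modeled page size and entry count:
//
//   llvm-bolt a.out -o a.bolt.out --itlb-page-size=8192 --itlb-entries=32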
namespace llvm {
namespace bolt {

using NodeId = CallGraph::NodeId;
using Arc = CallGraph::Arc;

namespace {

class Edge;
using ArcList = std::vector<const Arc *>;

// A chain (ordered sequence) of nodes (functions) in the call graph
class Chain {
public:
  Chain(const Chain &) = delete;
  Chain(Chain &&) = default;
  Chain &operator=(const Chain &) = delete;
  Chain &operator=(Chain &&) = default;

  explicit Chain(size_t Id_, NodeId Node, size_t Samples_, size_t Size_)
      : Id(Id_), Samples(Samples_), Size(Size_), Nodes(1, Node) {}

  double density() const { return static_cast<double>(Samples) / Size; }

  Edge *getEdge(Chain *Other) const {
    for (std::pair<Chain *, Edge *> It : Edges)
      if (It.first == Other)
        return It.second;
    return nullptr;
  }

  void removeEdge(Chain *Other) {
    auto It = Edges.begin();
    while (It != Edges.end()) {
      if (It->first == Other) {
        Edges.erase(It);
        return;
      }
      It++;
    }
  }

  void addEdge(Chain *Other, Edge *Edge) { Edges.emplace_back(Other, Edge); }

  void merge(Chain *Other) {
    Nodes.insert(Nodes.end(), Other->Nodes.begin(), Other->Nodes.end());
    Samples += Other->Samples;
    Size += Other->Size;
  }

  void mergeEdges(Chain *Other);

  void clear() {
    Nodes.clear();
    Edges.clear();
  }

public:
  size_t Id;
  uint64_t Samples;
  uint64_t Size;
  // Cached score for the chain
  double Score{0};
  // Cached short-calls for the chain
  double ShortCalls{0};
  // Nodes in the chain
  std::vector<NodeId> Nodes;
  // Adjacent chains and corresponding edges (lists of arcs)
  std::vector<std::pair<Chain *, Edge *>> Edges;
};
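// Worked example (illustrative numbers, not taken from real profile data): a
// chain with Samples = 1200 and Size = 300 bytes has density() = 4.0. When
// the final layout is produced in run() below, this chain is placed before
// any chain with a lower density, with ties broken by the chain Id so that
// the ordering stays deterministic across runs.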
// An edge in the call graph representing Arcs between two Chains.
// When functions are merged into chains, the edges are combined too so that
// there is always at most one edge between a pair of chains
class Edge {
public:
  Edge(const Edge &) = delete;
  Edge(Edge &&) = default;
  Edge &operator=(const Edge &) = delete;
  Edge &operator=(Edge &&) = default;

  explicit Edge(Chain *SrcChain_, Chain *DstChain_, const Arc *A)
      : SrcChain(SrcChain_), DstChain(DstChain_), Arcs(1, A) {}

  void changeEndpoint(Chain *From, Chain *To) {
    if (From == SrcChain)
      SrcChain = To;
    if (From == DstChain)
      DstChain = To;
  }

  void moveArcs(Edge *Other) {
    Arcs.insert(Arcs.end(), Other->Arcs.begin(), Other->Arcs.end());
    Other->Arcs.clear();
  }

  void setMergeGain(Chain *PredChain, double ForwardGain,
                    double BackwardGain) {
    // When forward and backward gains are the same, prioritize merging that
    // preserves the original order of the functions in the binary
    if (std::abs(ForwardGain - BackwardGain) < 1e-8) {
      if (SrcChain->Id < DstChain->Id) {
        IsGainForward = true;
        CachedGain = PredChain == SrcChain ? ForwardGain : BackwardGain;
      } else {
        IsGainForward = false;
        CachedGain = PredChain == SrcChain ? BackwardGain : ForwardGain;
      }
    } else if (ForwardGain > BackwardGain) {
      IsGainForward = PredChain == SrcChain;
      CachedGain = ForwardGain;
    } else {
      IsGainForward = PredChain != SrcChain;
      CachedGain = BackwardGain;
    }
  }

  double gain() const { return CachedGain; }

  Chain *predChain() const { return IsGainForward ? SrcChain : DstChain; }

  Chain *succChain() const { return IsGainForward ? DstChain : SrcChain; }

private:
  Chain *SrcChain{nullptr};
  Chain *DstChain{nullptr};

public:
  // Original arcs in the binary with corresponding execution counts
  ArcList Arcs;
  // Cached gain of merging the pair of chains
  double CachedGain{-1.0};
  // Since the gain of merging (Src, Dst) and (Dst, Src) might be different,
  // we store a flag indicating which of the options results in a higher gain
  bool IsGainForward;
};

void Chain::mergeEdges(Chain *Other) {
  // Update edges adjacent to chain Other
  for (auto EdgeIt : Other->Edges) {
    Chain *const DstChain = EdgeIt.first;
    Edge *const DstEdge = EdgeIt.second;
    Chain *const TargetChain = DstChain == Other ? this : DstChain;

    // Find the corresponding edge in the current chain
    Edge *CurEdge = getEdge(TargetChain);
    if (CurEdge == nullptr) {
      DstEdge->changeEndpoint(Other, this);
      this->addEdge(TargetChain, DstEdge);
      if (DstChain != this && DstChain != Other)
        DstChain->addEdge(this, DstEdge);
    } else {
      CurEdge->moveArcs(DstEdge);
    }

    // Cleanup leftover edge
    if (DstChain != Other)
      DstChain->removeEdge(Other);
  }
}
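// Illustration of the gain direction (hypothetical chains X and Y): an Edge
// constructed for an arc X -> Y has SrcChain = X and DstChain = Y. Merging
// can produce either the layout [X, Y] (the "forward" direction, where
// predChain() == X) or [Y, X] (the "backward" direction, where
// predChain() == Y). setMergeGain() caches whichever of the two gains is
// higher; near-ties (within 1e-8) are resolved so that the chain with the
// lower Id comes first, preserving the original binary order and keeping the
// pass deterministic.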
class HFSortPlus {
public:
  explicit HFSortPlus(const CallGraph &Cg) : Cg(Cg) { initialize(); }

  /// Run the algorithm and return an ordered set of function clusters.
  std::vector<Cluster> run() {
    // Pass 1
    runPassOne();

    // Pass 2
    runPassTwo();

    outs() << "BOLT-INFO: hfsort+ reduced the number of chains from "
           << Cg.numNodes() << " to " << HotChains.size() << "\n";

    // Sorting chains by density in decreasing order
    auto DensityComparator = [](const Chain *L, const Chain *R) {
      if (L->density() != R->density())
        return L->density() > R->density();
      // Making sure the comparison is deterministic
      return L->Id < R->Id;
    };
    std::stable_sort(HotChains.begin(), HotChains.end(), DensityComparator);

    // Return the set of clusters that are left, which are the ones that
    // didn't get merged (so their first func is its original func)
    std::vector<Cluster> Clusters;
    Clusters.reserve(HotChains.size());
    for (Chain *Chain : HotChains)
      Clusters.emplace_back(Cluster(Chain->Nodes, Cg));
    return Clusters;
  }

private:
  /// Initialize the set of active chains, function id to chain mapping,
  /// total number of samples and function addresses.
  void initialize() {
    OutWeight.resize(Cg.numNodes(), 0);
    InWeight.resize(Cg.numNodes(), 0);
    AllChains.reserve(Cg.numNodes());
    HotChains.reserve(Cg.numNodes());
    NodeChain.resize(Cg.numNodes(), nullptr);
    Addr.resize(Cg.numNodes(), 0);

    // Initialize chains
    for (NodeId F = 0; F < Cg.numNodes(); ++F) {
      AllChains.emplace_back(F, F, Cg.samples(F), Cg.size(F));
      HotChains.push_back(&AllChains.back());
      NodeChain[F] = &AllChains.back();
      TotalSamples += Cg.samples(F);
      for (NodeId Succ : Cg.successors(F)) {
        if (F == Succ)
          continue;
        const Arc &Arc = *Cg.findArc(F, Succ);
        OutWeight[F] += Arc.weight();
        InWeight[Succ] += Arc.weight();
      }
    }

    AllEdges.reserve(Cg.numArcs());
    for (NodeId F = 0; F < Cg.numNodes(); ++F) {
      for (NodeId Succ : Cg.successors(F)) {
        if (F == Succ)
          continue;
        const Arc &Arc = *Cg.findArc(F, Succ);
        if (Arc.weight() == 0.0 ||
            Arc.weight() / TotalSamples < opts::ArcThreshold)
          continue;

        Edge *CurEdge = NodeChain[F]->getEdge(NodeChain[Succ]);
        if (CurEdge != nullptr) {
          // This edge is already present in the graph
          assert(NodeChain[Succ]->getEdge(NodeChain[F]) != nullptr);
          CurEdge->Arcs.push_back(&Arc);
        } else {
          // This is a new edge
          AllEdges.emplace_back(NodeChain[F], NodeChain[Succ], &Arc);
          NodeChain[F]->addEdge(NodeChain[Succ], &AllEdges.back());
          NodeChain[Succ]->addEdge(NodeChain[F], &AllEdges.back());
        }
      }
    }

    for (Chain *&Chain : HotChains) {
      Chain->ShortCalls = shortCalls(Chain);
      Chain->Score = score(Chain);
    }
  }

  /// The probability that a page with a given density is not in the cache.
  ///
  /// Assume that the hot functions are called in a random order; then the
  /// probability of an i-TLB page being accessed after a function call is
  /// p = pageSamples / TotalSamples. The probability that the page is not
  /// accessed is (1 - p), and the probability that it is not in the cache
  /// (i.e., not accessed during the last kCacheEntries function calls)
  /// is (1 - p)^kCacheEntries.
  double missProbability(double ChainDensity) const {
    double PageSamples = ChainDensity * opts::ITLBDensity;

    if (PageSamples >= TotalSamples)
      return 0;

    double P = PageSamples / TotalSamples;
    return pow(1.0 - P, double(opts::ITLBEntries));
  }

  /// The expected number of calls on different i-TLB pages for an arc of the
  /// call graph with a specified weight.
  double expectedCalls(uint64_t SrcAddr, uint64_t DstAddr,
                       double Weight) const {
    uint64_t Dist = SrcAddr >= DstAddr ? SrcAddr - DstAddr : DstAddr - SrcAddr;
    if (Dist >= opts::ITLBPageSize)
      return 0;

    double D = double(Dist) / double(opts::ITLBPageSize);
    // Increasing the importance of shorter calls
    return (1.0 - D * D) * Weight;
  }

  /// The expected number of calls within a given chain with both endpoints on
  /// the same cache page.
  double shortCalls(Chain *Chain) const {
    Edge *Edge = Chain->getEdge(Chain);
    if (Edge == nullptr)
      return 0;

    double Calls = 0;
    for (const Arc *Arc : Edge->Arcs) {
      uint64_t SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset());
      uint64_t DstAddr = Addr[Arc->dst()];
      Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight());
    }
    return Calls;
  }

  /// The number of calls between the two chains with both endpoints on
  /// the same i-TLB page, assuming that a given pair of chains gets merged.
  double shortCalls(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const {
    double Calls = 0;
    for (const Arc *Arc : Edge->Arcs) {
      Chain *SrcChain = NodeChain[Arc->src()];
      uint64_t SrcAddr;
      uint64_t DstAddr;
      if (SrcChain == ChainPred) {
        SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset());
        DstAddr = Addr[Arc->dst()] + ChainPred->Size;
      } else {
        SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset()) +
                  ChainPred->Size;
        DstAddr = Addr[Arc->dst()];
      }
      Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight());
    }

    Calls += ChainPred->ShortCalls;
    Calls += ChainSucc->ShortCalls;

    return Calls;
  }

  double score(Chain *Chain) const {
    double LongCalls = Chain->Samples - Chain->ShortCalls;
    return LongCalls * missProbability(Chain->density());
  }

  /// The gain of merging two chains.
  ///
  /// We assume that the final chains are sorted by their density, and hence
  /// every chain is likely to be adjacent with chains of the same density.
  /// Thus, the 'hotness' of every chain can be estimated by density*pageSize,
  /// which is used to compute the probability of cache misses for long calls
  /// of a given chain.
  /// The result is also scaled by the size of the resulting chain in order to
  /// increase the chance of merging short chains, which is helpful for
  /// the i-cache performance.
  double mergeGain(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const {
    // Cache misses on the chains before merging
    double CurScore = ChainPred->Score + ChainSucc->Score;

    // Cache misses on the merged chain
    double LongCalls = ChainPred->Samples + ChainSucc->Samples -
                       shortCalls(ChainPred, ChainSucc, Edge);
    const double MergedSamples = ChainPred->Samples + ChainSucc->Samples;
    const double MergedSize = ChainPred->Size + ChainSucc->Size;
    double NewScore = LongCalls * missProbability(MergedSamples / MergedSize);

    double Gain = CurScore - NewScore;
    // Scale the result to increase the importance of merging short chains
    Gain /= std::min(ChainPred->Size, ChainSucc->Size);

    return Gain;
  }
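  // Worked example for the model above (illustrative numbers): suppose
  // TotalSamples = 100000 and a chain is dense enough that
  // PageSamples = ChainDensity * 4096 = 10000. Then P = 0.1 and the miss
  // probability is (1 - 0.1)^16 ~= 0.185 with the default 16 i-TLB entries,
  // i.e., roughly 18.5% of the chain's long calls are expected to miss.
  // mergeGain() compares this quantity before and after a hypothetical merge
  // and divides the difference by the smaller chain's size, so short hot
  // chains are the most attractive merge candidates.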
  /// Run the first optimization pass of the algorithm:
  /// merge chains that call each other with a high probability.
  void runPassOne() {
    // Find candidate pairs of chains for merging
    std::vector<const Arc *> ArcsToMerge;
    for (Chain *ChainPred : HotChains) {
      NodeId F = ChainPred->Nodes.back();
      for (NodeId Succ : Cg.successors(F)) {
        if (F == Succ)
          continue;

        const Arc &Arc = *Cg.findArc(F, Succ);
        if (Arc.weight() == 0.0 ||
            Arc.weight() / TotalSamples < opts::ArcThreshold)
          continue;

        const double CallsFromPred = OutWeight[F];
        const double CallsToSucc = InWeight[Succ];
        const double CallsPredSucc = Arc.weight();

        // Probability that the first chain is calling the second one
        const double ProbOut =
            CallsFromPred > 0 ? CallsPredSucc / CallsFromPred : 0;
        assert(0.0 <= ProbOut && ProbOut <= 1.0 && "incorrect out-probability");

        // Probability that the second chain is called from the first one
        const double ProbIn = CallsToSucc > 0 ? CallsPredSucc / CallsToSucc : 0;
        assert(0.0 <= ProbIn && ProbIn <= 1.0 && "incorrect in-probability");

        if (std::min(ProbOut, ProbIn) >= opts::MergeProbability)
          ArcsToMerge.push_back(&Arc);
      }
    }

    // Sort the pairs by weight in decreasing order
    std::sort(
        ArcsToMerge.begin(), ArcsToMerge.end(),
        [](const Arc *L, const Arc *R) { return L->weight() > R->weight(); });

    // Merge the pairs of chains
    for (const Arc *Arc : ArcsToMerge) {
      Chain *ChainPred = NodeChain[Arc->src()];
      Chain *ChainSucc = NodeChain[Arc->dst()];
      if (ChainPred == ChainSucc)
        continue;
      if (ChainPred->Nodes.back() == Arc->src() &&
          ChainSucc->Nodes.front() == Arc->dst())
        mergeChains(ChainPred, ChainSucc);
    }
  }

  /// Run the second optimization pass of the hfsort+ algorithm:
  /// merge pairs of chains while there is an improvement in the
  /// expected cache miss ratio.
  void runPassTwo() {
    // Creating a priority queue containing all edges ordered by the merge gain
    auto GainComparator = [](Edge *L, Edge *R) {
      if (std::abs(L->gain() - R->gain()) > 1e-8)
        return L->gain() > R->gain();
      // Making sure the comparison is deterministic
      if (L->predChain()->Id != R->predChain()->Id)
        return L->predChain()->Id < R->predChain()->Id;
      return L->succChain()->Id < R->succChain()->Id;
    };
    std::set<Edge *, decltype(GainComparator)> Queue(GainComparator);

    // Inserting the edges into the queue
    for (Chain *ChainPred : HotChains) {
      for (auto EdgeIt : ChainPred->Edges) {
        Chain *ChainSucc = EdgeIt.first;
        Edge *ChainEdge = EdgeIt.second;
        // Ignore loop edges
        if (ChainPred == ChainSucc)
          continue;
        // Ignore already processed edges
        if (ChainEdge->gain() != -1.0)
          continue;

        // Compute the gain of merging the two chains
        auto ForwardGain = mergeGain(ChainPred, ChainSucc, ChainEdge);
        auto BackwardGain = mergeGain(ChainSucc, ChainPred, ChainEdge);
        ChainEdge->setMergeGain(ChainPred, ForwardGain, BackwardGain);
        if (ChainEdge->gain() > 0.0)
          Queue.insert(ChainEdge);
      }
    }

    // Merge the chains while the gain of merging is positive
    while (!Queue.empty()) {
      // Extract the best (top) edge for merging
      Edge *BestEdge = *Queue.begin();
      Queue.erase(Queue.begin());
      Chain *BestChainPred = BestEdge->predChain();
      Chain *BestChainSucc = BestEdge->succChain();
      if (BestChainPred == BestChainSucc || BestEdge->gain() <= 0.0)
        continue;

      // Remove outdated edges
      for (std::pair<Chain *, Edge *> EdgeIt : BestChainPred->Edges)
        Queue.erase(EdgeIt.second);
      for (std::pair<Chain *, Edge *> EdgeIt : BestChainSucc->Edges)
        Queue.erase(EdgeIt.second);

      // Merge the best pair of chains
      mergeChains(BestChainPred, BestChainSucc);

      // Insert newly created edges into the queue
      for (auto EdgeIt : BestChainPred->Edges) {
        Chain *ChainSucc = EdgeIt.first;
        Edge *ChainEdge = EdgeIt.second;
        // Ignore loop edges
        if (BestChainPred == ChainSucc)
          continue;

        // Compute the gain of merging the two chains
        auto ForwardGain = mergeGain(BestChainPred, ChainSucc, ChainEdge);
        auto BackwardGain = mergeGain(ChainSucc, BestChainPred, ChainEdge);
        ChainEdge->setMergeGain(BestChainPred, ForwardGain, BackwardGain);
        if (ChainEdge->gain() > 0.0)
          Queue.insert(ChainEdge);
      }
    }
  }
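  // Illustration of the pass-one criterion above (hypothetical counts): if
  // function F makes 90 of its 100 outgoing calls to G, and those 90 calls
  // account for at least 90% of G's incoming calls, then
  // min(ProbOut, ProbIn) >= 0.9 (the default merge-probability) and the arc
  // F -> G becomes a merge candidate. In runPassTwo(), by contrast, merges
  // are driven purely by the cached gain(): every queued edge incident to a
  // merged chain caches a stale gain, so those edges are erased before the
  // merge and re-inserted afterwards with freshly computed gains; edges whose
  // gain() is still -1.0 have simply never been evaluated.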
  /// Merge chain From into chain Into and update the list of active chains.
  void mergeChains(Chain *Into, Chain *From) {
    assert(Into != From && "cannot merge a chain with itself");
    Into->merge(From);

    // Update the chains and addresses for functions merged from From
    size_t CurAddr = 0;
    for (NodeId F : Into->Nodes) {
      NodeChain[F] = Into;
      Addr[F] = CurAddr;
      CurAddr += Cg.size(F);
    }

    // Merge edges
    Into->mergeEdges(From);
    From->clear();

    // Update cached scores for the new chain
    Into->ShortCalls = shortCalls(Into);
    Into->Score = score(Into);

    // Remove chain From from the list of active chains
    auto It = std::remove(HotChains.begin(), HotChains.end(), From);
    HotChains.erase(It, HotChains.end());
  }

private:
  // The call graph
  const CallGraph &Cg;

  // All chains of functions
  std::vector<Chain> AllChains;

  // Active chains. The vector gets updated at runtime when chains are merged
  std::vector<Chain *> HotChains;

  // All edges between chains
  std::vector<Edge> AllEdges;

  // Node_id => chain
  std::vector<Chain *> NodeChain;

  // Current address of the function from the beginning of its chain
  std::vector<uint64_t> Addr;

  // Total weight of outgoing arcs for each function
  std::vector<double> OutWeight;

  // Total weight of incoming arcs for each function
  std::vector<double> InWeight;

  // The total number of samples in the graph
  double TotalSamples{0};
};

} // end anonymous namespace

std::vector<Cluster> hfsortPlus(CallGraph &Cg) {
  // It is required that the sum of incoming arc weights is not greater
  // than the number of samples for every function. Ensure that the call
  // graph obeys this property before running the algorithm.
  Cg.adjustArcWeights();
  return HFSortPlus(Cg).run();
}

} // namespace bolt
} // namespace llvm