glean/lang/clang/Derive/CxxDeclarationTargets.hs (459 lines of code) (raw):
{-
Copyright (c) Meta Platforms, Inc. and affiliates.
All rights reserved.
This source code is licensed under the BSD-style license found in the
LICENSE file in the root directory of this source tree.
-}
{-# LANGUAGE ApplicativeDo, TypeApplications #-}
module Derive.CxxDeclarationTargets
( deriveCxxDeclarationTargets
) where
import Control.Concurrent.Async (Concurrently(..), runConcurrently)
import qualified Control.Concurrent.Async as Async (withAsync, wait)
import Control.Concurrent.STM
import Control.DeepSeq
import Control.Exception
import Control.Monad
import Data.Array hiding ((!))
import Data.Array.ST
import qualified Data.HashSet as HashSet
import Data.IORef
import Data.List (foldl', sortOn)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Maybe
import Data.Proxy
import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified Data.Text.IO as Text.IO
import qualified Data.Vector as V
import Data.Vector.Unboxed ((!))
import qualified Data.Vector.Unboxed as VU
import Data.Word (Word64)
import GHC.Compact as Compact
import Util.Log (logInfo)
import Glean
import Glean.FFI (withMany)
import qualified Glean.Schema.Cxx1.Types as Cxx
import qualified Glean.Schema.Src.Types as Src
import Glean.Typed (PidOf, getPid)
import Glean.Util.Declarations ( DeclBranch, applyDeclaration, getDeclId )
import Glean.Util.PredMap (PredMap)
import qualified Glean.Util.PredMap as PredMap
import Glean.Util.Range
import Glean.Util.Time
import Derive.Common
import Derive.Types
-- -----------------------------------------------------------------------------
-- Derive function calls : make delcaration source/target map
-- The second derived pass will reverse this map
-- | a map from a file id to the declaration in the file
type Decls = PredMap Src.File [Cxx.Declaration]
type FileLines = PredMap Src.File LineOffsets
-- | Helper to pick out the declaration branch
toPredicateRef :: Cxx.Declaration -> PredicateRef
toPredicateRef = applyDeclaration (getName . mkProxy)
where
mkProxy :: a -> Proxy a
mkProxy _ = Proxy
{-# INLINE getDeclarationsOfType #-}
getDeclarationsOfType
:: forall a e. ( DeclBranch a
, SumBranches a Cxx.Declaration
, SumQuery (QueryOf a) (QueryOf Cxx.Declaration)
, HasSrcRange (KeyType a)
, Backend e )
=> PidOf a
-> e
-> Config
-> IO Decls
getDeclarationsOfType _pid = go
where
go e cfg = do
let
q :: Query a
q = maybe id limit (cfgMaxQueryFacts cfg) $
limitBytes (cfgMaxQuerySize cfg) allFacts
runQueryEach e (cfgRepo cfg) q mempty $ \newdecls b -> do
key <- case getFactKey b of
Just k -> return k
Nothing -> throwIO $ ErrorCall "internal error: getDeclarationsOfType"
let
source = srcRange key
fileId = getId $ Src.range_file source
d = injectBranch (mkFact (getId b) (Just key) Nothing :: a)
return $! PredMap.insertWith (const (d:)) fileId [d] newdecls
getDeclarations
:: Backend e
=> e
-> Config
-> (forall p. Predicate p => PidOf p)
-> IO Decls
getDeclarations e cfg somePid = do
declss <- mapM (\f -> f e cfg)
[ getDeclarationsOfType (somePid :: PidOf Cxx.FunctionDeclaration)
, getDeclarationsOfType (somePid :: PidOf Cxx.ObjcContainerDeclaration)
, getDeclarationsOfType (somePid :: PidOf Cxx.ObjcMethodDeclaration)
, getDeclarationsOfType (somePid :: PidOf Cxx.ObjcPropertyDeclaration) ]
return $! PredMap.unionsWith (++) declss
getFileLines :: Backend e => e -> Config -> IO FileLines
getFileLines e cfg = do
let
q :: Query Src.FileLines
q = maybe id limit (cfgMaxQueryFacts cfg) $
limitBytes (cfgMaxQuerySize cfg) allFacts
runQueryEach e (cfgRepo cfg) q mempty
$ \fileliness (Src.FileLines _ k) -> do
key <- case k of
Just k -> return k
Nothing -> throwIO $ ErrorCall "internal error: getFileLines"
let
fileId = getId . Src.fileLines_key_file $ key
!offsets = lengthsToLineOffsets key
return $! PredMap.insert fileId offsets fileliness
-- For building the declaration graph, there are a few algorithm choices.
--
-- Given a list of "small" ranges representing cross-references (targets)
-- and a set of "big" ranges representing function declarations (sources),
-- the function matchTargetsToSources returns a list of all pairs consisting
-- of a function declaration and the cross references to functions in it.
-- Each "small" range is represented by a pair ((begin, len), s),
-- and each "big" range is represented by a pair ((begin, len), t),
-- where in both cases begin <= begin + len is assumed.
-- | This performs the target/source matching using brute force naive
-- list traversals. The other implementations below should run
-- faster than this.
--
-- This naive implementation works in O(T * S) time
-- where T = length targets and S = length sources.
matchNaive :: [[(ByteRange, t)]] -> [(ByteRange, s)] -> [(s, [t])]
matchNaive targetss sources =
[ (s, ts)
| (rS, s) <- sources
, let ts = [ t | (rT, t) <- concat targetss, byteRangeContains rS rT ]
, not $ null ts
]
-- the below one works in time O((T+S) * log (T + S) + M)
-- where M is the size of the output. In practice slower than the naive one.
data Event t s = TargetEnds !ByteRange t
| SourceEnds !ByteRange s
data MatchesAcc t s = MatchesAcc !(Map.Map ByteRange [t]) ![(s, [t])]
-- | This performs the target/source matching with a preprocessing pass. It
-- should run faster than `matchNaive`.
--
-- Both targets and sources are pre-processed into a single event list
-- (sorted by range end, ascending).
-- Then events are processed in order. targets are accumulated into a Map,
-- and each source is associated with a suffix of the Map of targets.
matchSweepingLine :: [[(ByteRange, t)]] -> [(ByteRange, s)] -> [(s, [t])]
matchSweepingLine targetss sources = matches
where
(MatchesAcc _ matches) =
foldl' applyEvent (MatchesAcc Map.empty []) events
events = sortOn eventKey $ [ SourceEnds range s | (range, s) <- sources]
++ [ TargetEnds range t | targets <- targetss, (range, t) <- targets]
eventKey :: Event t s -> (Word64, Int)
eventKey (TargetEnds br _) = (byteRangeExclusiveEnd br, 0)
eventKey (SourceEnds br _) = (byteRangeExclusiveEnd br, 1)
applyEvent :: MatchesAcc t s
-> Event t s
-> MatchesAcc t s
applyEvent (MatchesAcc endedTargets matches) event = case event of
(TargetEnds range t) -> MatchesAcc endedTargets' matches
where
endedTargets' = Map.insertWith (const (t:)) range [t] endedTargets
(SourceEnds (ByteRange beginS _) s) -> MatchesAcc endedTargets matches'
where
ts = concatMap snd
$ takeWhile (\(ByteRange beginT _, _) -> beginT >= beginS)
$ Map.toDescList endedTargets
matches' = if null ts then matches else (s, ts) : matches
-- given a sorted vector, returns the least index i such that v!i >= a
-- or (VU.length v) if no such element
binsearchGE :: (Ord a, VU.Unbox a) => VU.Vector a -> a -> Int
binsearchGE v a = go (-1) (VU.length v)
where
-- invariant: v!lo < a && v!hi >= a
go lo hi =
if succ lo == hi
then hi
else
let mi = (lo + hi) `div` 2
in if v!mi < a
then go mi hi
else go lo mi
-- | This performs the target/source matching with pre-processed vectors and
-- ST for arrays. It should run faster than `matchNaive`.
--
-- Iterates though targets. O(S*log(S)) then
-- O(T * log(S) * overlap) where overlap is max(forall targets: number
-- of sources partially overlapping the target).
matchSourceVector :: [[(ByteRange, t)]] -> [(ByteRange, s)] -> [(s, [t])]
matchSourceVector _ [] = []
matchSourceVector [] _ = []
matchSourceVector targetss sources =
let
-- ascending by end position
sortedSources = sortOn (byteRangeExclusiveEnd . fst) sources
-- unsorted
begins = VU.fromList [begin | (ByteRange begin _, _) <- sortedSources]
-- ascending
ends = VU.fromList [byteRangeExclusiveEnd br | (br, _) <- sortedSources]
-- ascending (min of begin of ranges in tail)
beginMins = VU.fromList
$ scanr1 min [begin | (ByteRange begin _, _) <- sortedSources]
numSources = length sortedSources
sourcesForTarget :: ByteRange -> [Int]
sourcesForTarget br@(ByteRange begin _len) =
filter (\i -> begins!i <= begin) $
{- since sources are sorted, we already know that end <= ends!i -}
takeWhile (\i -> beginMins!i <= begin)
[binsearchGE ends (byteRangeExclusiveEnd br)..(numSources-1)]
targetsBySource = runSTArray $ do
arr <- newArray (0,numSources-1) []
forM_ targetss $ \targets ->
forM_ targets $ \(range, t) ->
forM_ (sourcesForTarget range) $ \i -> do
ts <- readArray arr i
writeArray arr i (t:ts)
return arr
in
[ (s, ts) | ((_, s), ts) <- zip sortedSources $ elems targetsBySource]
-- | Helper: type for @incomingQ@ inside 'deriveFunctionCalls'
type QueuePackage = (Src.File, [(IdOf Cxx.FileXRefMap, Cxx.FileXRefMap_key)])
-- | Make 'Cxx.DeclarationTargets' and then 'Cxx.DeclarationSources' facts
--
-- Works by matching target coordinates to span of source declarations. There
-- are 3 choices for this
deriveCxxDeclarationTargets
:: Backend e
=> e
-> Config
-> (Int -> ([Writer] -> IO ()) -> IO ())
-> IO ()
deriveCxxDeclarationTargets e cfg withWriters = withWriters workers $ \ writers -> do
logInfo "deriveCxxDeclarationTargets"
-- ---------------------------------------------------------------------------
-- Main queue and logging
startTimePoint <- getTimePoint -- for cfgBenchmark
-- for cfgBenchmark : total microseconds running the chosen matching algorithm
matchingTimeRef <- newIORef (0::Int)
-- For cfgDebugPrintReferences, track sparse matrix of count of references
summaryRef <- newIORef (Map.empty :: Map (PredicateRef, PredicateRef) Int)
-- Queue of one item per Src.File, ends with Nothings to shutdown workers
-- Unlimited runs of RAM. Limit of 150,000 worked with 202GB max resident.
-- Now takes limit from command line, defaults to queue limit of 10,000.
(incomingQ :: TBQueue (Maybe QueuePackage)) <-
newTBQueueIO (fromIntegral $ cfgMaxQueueSize cfg)
-- ---------------------------------------------------------------------------
-- We are limited by the loading of cxx.FileXRefMap facts, start it first
let
q :: Query Cxx.FileXRefMap
q = maybe id limit (cfgMaxQueryFacts cfg) $
limitBytes (cfgMaxQuerySize cfg) allFacts
doFoldEach = do
fileTargetCountAcc <-
runQueryEach e (cfgRepo cfg) q (Nothing, [], 0::Int, 0::Int) $
\ (!mLastFile, !targetssIn, !countIn, !nIn)
fact@(Cxx.FileXRefMap _ k) -> do
when (mod nIn 10000 == 0) $ logInfo $
"(file count, FileXRefMap count) progress: "
++ show (countIn, nIn)
let !i = getId fact
key <- case k of
Just k -> return k
Nothing -> throwIO $ ErrorCall
"internal error: deriveFunctionCalls"
let !file = Cxx.fileXRefMap_key_file key
case mLastFile of
(Just lastFile) | lastFile /= file -> do
atomically $ writeTBQueue incomingQ
(Just (lastFile, targetssIn))
return (Just file, [(i, key)], succ countIn, succ nIn)
_ ->
return (Just file, (i, key):targetssIn, countIn, succ nIn)
(countF, nF) <- case fileTargetCountAcc of
(Nothing, _empty, countIn, nIn) -> return (countIn, nIn)
(Just file, targetssIn, countIn, nIn) -> do
atomically $ writeTBQueue incomingQ (Just (file, targetssIn))
return (succ countIn, nIn)
logInfo $ "(file count, FileXRefMap count) final: " ++ show (countF, nF)
logInfo "start foldEach Cxx.FileXRefMap async"
Async.withAsync doFoldEach $ \ fileCountAsync -> do -- not indenting
-- ---------------------------------------------------------------------------
-- Load state needed for processing cxx.FileXRefMap
let done :: String -> a -> IO a -- useful for seeing timing in logs
done msg x = logInfo msg >> return x
somePid :: forall p. Predicate p => PidOf p
somePid = getPid (head writers)
(xrefs, decls, fileliness, indirects) <- do
maps <- runConcurrently $ (,,,)
<$> Concurrently (getFileXRefs e cfg >>= done "fileXRefs")
<*> Concurrently (getDeclarations e cfg somePid >>= done "declarations")
<*> Concurrently (getFileLines e cfg >>= done "fileLines")
<*> Concurrently (getIndirectTargets e cfg somePid >>= done "indirect")
logInfo "loaded predmaps, compacting"
compactMaps <- Compact.compact maps
size <- Compact.compactSize compactMaps
logInfo $ "compact complete (" ++ show size ++ " bytes)"
return (Compact.getCompact compactMaps)
logInfo $ "loaded " ++ show (PredMap.size xrefs) ++ " file xrefs"
logInfo $ "loaded " ++ show (PredMap.size decls) ++ " declarations"
logInfo $ "loaded " ++ show (PredMap.size fileliness) ++ " file liness"
logInfo $ "loaded " ++ show (PredMap.size indirects) ++ " indirect targets"
-- ---------------------------------------------------------------------------
-- Code for processing cxx.FileXRefMap
let matchTargetsToSources = case cfgMatchAlgorithm cfg of
Naive -> matchNaive
SweepingLine -> matchSweepingLine
SourceVector -> matchSourceVector
let generateCalls
:: Writer -> Src.File -> [[(ByteRange, [Cxx.Declaration])]] -> IO Int
generateCalls writer file targetss =
let
fileId = getId file
offsets = case PredMap.lookup fileId fileliness of
(Just offsets) -> offsets
_ -> error $ "no file lines for file" ++ show fileId
toSrcRange :: Cxx.Declaration -> Src.Range
toSrcRange = applyDeclaration getSrcRange
where
getSrcRange =
srcRange .
fromMaybe (error "generateCalls toSrcRange: Nothing") .
getFactKey
sources =
[ ( srcRangeToSimpleByteRange offsets srcRange, decl)
| decl <- PredMap.findWithDefault [] fileId decls
, let srcRange = toSrcRange decl
]
calls = matchTargetsToSources targetss sources
callsTargetSets =
[ (s, ts')
| (s, ts) <- calls
, let ts' = Set.fromList $ concat ts
, not $ Set.null ts'
]
in
do
forcedCallsTargetSets <-
if cfgBenchmark cfg
then do
before <- getTimePoint
fcts <- evaluate $ force callsTargetSets
elapsed <- toDiffMicros <$> getElapsedTime before
atomicModifyIORef' matchingTimeRef $ \ old -> (old+elapsed, ())
return fcts
else
evaluate $ force callsTargetSets
when (not $ cfgDryRun cfg) $
writeFacts writer $
forM_ forcedCallsTargetSets $ \(s, ts) -> do
makeFact_ @Cxx.DeclarationTargets Cxx.DeclarationTargets_key
{ declarationTargets_key_source = s
, declarationTargets_key_targets = Set.toList ts
}
when (cfgDebugPrintReferences cfg) $
forM_ forcedCallsTargetSets $ \(s, ts) ->
forM_ ts $ \t -> do
let sname = toPredicateRef s
tname = toPredicateRef t
firstPair <- atomicModifyIORef' summaryRef $ \ old ->
let summary' =
Map.insertWith (const (+1)) (sname, tname) 1 old
in (summary', Map.notMember (sname, tname) old)
when firstPair $ putStrLn $ concat
[ Text.unpack $ predicateRef_name sname
, "{", show $ getDeclId s, "} -> "
, Text.unpack $ predicateRef_name tname
, "{", show $ getDeclId t, "}" ]
let count = sum (map (Set.size . snd) forcedCallsTargetSets)
return $! count
let -- This should be called only once per 'IdOf Cxx.FileXRefMap'
oneFileXRefMap
:: (IdOf Cxx.FileXRefMap, Cxx.FileXRefMap_key)
-> [(ByteRange, [Cxx.Declaration])]
oneFileXRefMap (i, key) =
let fixed = Cxx.fileXRefMap_key_fixed key
fixedTargets =
[ (range, [decl])
| (Cxx.FixedXRef itarget spans) <- fixed
, (Cxx.XRefTarget_declaration decl)
<- mapMaybe (resolve indirects) [itarget]
, range <- relByteSpansToRanges spans
]
matched = zip variable $ V.toList externals
where
variable = Cxx.fileXRefMap_key_variable key
externals = PredMap.findWithDefault mempty i xrefs
variableTargets =
[ (range, targetDecls)
| (spans, extTargets) <- matched
, let !targetDecls = Set.toList . Set.fromList $
[ decl
| (Cxx.XRefTarget_declaration decl)
<- mapMaybe (resolve indirects)
$ HashSet.toList extTargets]
, range <- relByteSpansToRanges spans
]
newTargets = fixedTargets ++ variableTargets
in newTargets
let fileWorker writer = do
localCount <- newIORef (0::Int)
let loop = do
m <- atomically $ readTBQueue incomingQ
case m of
Nothing -> readIORef localCount
Just (file, fileXRefMaps) -> do
targetss <- forM fileXRefMaps $ \ fileXRefMap -> do
let newTargets = oneFileXRefMap fileXRefMap
_ <- evaluate (force (length newTargets))
return newTargets
count <- generateCalls writer file targetss
modifyIORef' localCount (+ count)
loop
loop
-- ---------------------------------------------------------------------------
logInfo "Launch worker threads to process cxx.FileXRefMap"
counts <- withMany Async.withAsync (map fileWorker writers) $
\ workerAsyncs -> do
logInfo "Wait for fileCountAsync forEach"
() <- Async.wait fileCountAsync
logInfo "fileCountAsync forEach done, closing worker threads"
atomically $ replicateM_ workers $ writeTBQueue incomingQ Nothing
logInfo "Waiting on Worker threads"
forM workerAsyncs Async.wait
logInfo "Worker threads closed"
logInfo $ "Per worker counts: " ++ show counts
logInfo $ "found " ++ show (sum counts) ++ " decl calls"
when (cfgBenchmark cfg) $ do
foldEachTime <- toDiffMicros <$> getElapsedTime startTimePoint
matchingTime <- readIORef matchingTimeRef
logInfo $ "foldEach time : "
++ show (fromIntegral foldEachTime / 1000000 :: Double) ++ " seconds"
logInfo $ "matching time : "
++ show (fromIntegral matchingTime / 1000000 :: Double) ++ " seconds"
when (cfgDebugPrintReferences cfg) $ do
putStrLn "x-ref count by declarations kind:"
summary <- readIORef summaryRef
forM_ (Map.toList summary) $ \((sname, tname), count) -> do
Text.IO.putStrLn $ Text.concat
[ predicateRef_name sname
, " -> "
, predicateRef_name tname
, ": "
, Text.pack (show count) ]
let predicateRefs = Set.fromList $
map fst (Map.keys summary) ++ map snd (Map.keys summary)
forM_ (Set.toList predicateRefs) $ \predicateRef ->
putStr $ "," ++ Text.unpack (predicateRef_name predicateRef)
putStrLn ""
forM_ (Set.toList predicateRefs) $ \s -> do
putStr $ Text.unpack $ predicateRef_name s
forM_ (Set.toList predicateRefs) $ \t ->
putStr $ "," ++ show (Map.findWithDefault 0 (s,t) summary)
putStrLn ""
logInfo "deriveFunctionCalls done"
where
-- Number of worker threads consuming from incomingQ and writing to Glean
workers = max 1 (cfgWorkerThreads cfg)