glean/rts/ownership.cpp (214 lines of code) (raw):
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "glean/rts/lookup.h"
#include "glean/rts/inventory.h"
#include "glean/rts/ownership.h"
#include "glean/rts/ownership/setu32.h"
#include "glean/rts/ownership/triearray.h"
#include "glean/rts/ownership/uset.h"
#include "glean/rts/timer.h"
#if __x86_64__ // AVX required
#include <folly/experimental/EliasFanoCoding.h>
#include <immintrin.h>
#else
#include "glean/rts/ownership/fallbackavx.h"
#endif
#include <folly/container/F14Map.h>
#include <folly/container/F14Set.h>
#include <folly/Hash.h>
#include <xxhash.h>
#include <algorithm>
#include <initializer_list>
#include <limits>
#include <type_traits>
namespace facebook {
namespace glean {
namespace rts {
namespace {
/**
* Fill ownership data from an iterator
*
* Takes the Unit -> FactId mapping provided by the
* OwnershipUnitIterator and populates the TrieArray with it, making
* a FactId -> Set(Unit) mapping.
*/
FOLLY_NOINLINE TrieArray<Uset> fillOwnership(
OwnershipUnitIterator* iter,
uint32_t& numUnits) {
struct Stats {
size_t units = 0;
size_t intervals = 0;
void bump(size_t curUnits, size_t is) {
units = curUnits;
const auto old_intervals = intervals;
intervals += is;
if ((old_intervals / 5000000) != (intervals / 5000000)) {
dump();
}
}
void dump() {
LOG(INFO)
<< units << " units, "
<< intervals << " intervals";
}
};
TrieArray<Uset> utrie;
uint32_t last_unit = 0;
uint32_t max_unit = 0;
Stats stats;
while(const auto d = iter->get()) {
const auto data = d.value();
CHECK_GE(data.unit,last_unit);
utrie.insert(
data.ids.begin(),
data.ids.end(),
[&](Uset * FOLLY_NULLABLE prev, uint32_t refs) {
if (prev != nullptr) {
if (prev->refs == refs) {
// Do an in-place append if possible (determined via reference
// count).
prev->exp.set.append(data.unit);
return prev;
} else {
auto entry = std::make_unique<Uset>(
SetU32(prev->exp.set, SetU32::copy_capacity), refs);
entry->exp.set.append(data.unit);
prev->refs -= refs;
return entry.release();
}
} else {
auto entry = std::make_unique<Uset>(SetU32(), refs);
entry->exp.set.append(data.unit);
return entry.release();
}
}
);
max_unit = std::max(data.unit, max_unit);
stats.bump(max_unit, data.ids.size());
last_unit = data.unit;
}
stats.dump();
numUnits = max_unit + 1;
return utrie;
}
/** Move the sets from the trie to `Usets`. */
FOLLY_NOINLINE Usets collectUsets(uint32_t numUnits, TrieArray<Uset>& utrie) {
Usets usets(numUnits);
size_t visits = 0;
utrie.foreach([&](Uset *entry) -> Uset* {
++visits;
// entry->link() caches usets.add(entry) below
Uset* p = static_cast<Uset*>(entry->link());
// visited this Uset before?
if (p == nullptr) {
p = usets.add(entry);
entry->link(p);
}
// duplicate set? Update the Trie to point to the canonical one.
if (p != entry) {
entry->drop();
return p;
} else {
return nullptr;
}
});
usets.foreach([](Uset *entry) {
entry->link(nullptr);
});
LOG(INFO)
<< visits << " visits, "
<< usets.size() << " usets";
return usets;
}
/** Transitively complete `Usets` by assigning an ownership unit to facts which
* are transitively referenced by a fact already belonging to that unit.
*
* The resulting `Usets` will contain exactly those sets (the "promoted" ones)
* which describe the ownership of at least one fact.
*/
FOLLY_NOINLINE void completeOwnership(std::vector<Uset *> &facts, Usets& usets,
const Inventory &inventory,
Lookup &lookup) {
struct Stats {
Usets& usets;
size_t total_facts = 0;
size_t owned_facts = 0;
void bumpTotal() {
++total_facts;
}
void bumpOwned() {
++owned_facts;
if (owned_facts%1000000 == 0) {
dump();
}
}
void dump() {
auto ustats = usets.statistics();
LOG(INFO)
<< owned_facts << " of " << total_facts << " facts, "
<< usets.size() << " usets, "
<< ustats.promoted << " promoted, "
<< ustats.bytes << " bytes, "
<< ustats.adds << " adds, "
<< ustats.dups << " dups";
}
};
Stats stats{usets};
std::vector<Id> refs;
Traverser tracker([&](Id id, Pid) {
refs.push_back(id);
});
if (facts.size() == 0) {
return;
}
// Iterate over facts backwards - this ensures that we get all dependencies.
const auto max_id = Id::fromWord(facts.size());
for (auto iter = lookup.enumerateBack(max_id);
auto fact = iter->get();
iter->next()) {
stats.bumpTotal();
// `set == nullptr` means that the fact doesn't have an ownership set - we
// might want to make that an error eventually?
if (auto set = facts[fact.id.toWord()]) {
// TODO: Store the set in the DB and assign it to the current fact.
usets.promote(set);
// Collect all references to facts
const auto *predicate = inventory.lookupPredicate(fact.type);
assert(predicate);
predicate->traverse(tracker, fact.clause);
// For each fact we reference, add our ownership set to what we've
// computed for it so far. Use the `link` field to avoid computing the
// same set union multiple times.
//
// TODO: Try adding a fixed-size LRU (or LFU?) cache for set unions?
std::vector<Uset *> touched;
for (const auto id : refs) {
auto& me = facts[id.toWord()];
if (me == nullptr) {
// The fact didn't have ownership info before, assign the set to it.
me = set;
usets.use(me, 1);
} else if (const auto added = static_cast<Uset *>(me->link())) {
// The fact did have ownership info and we've already computed the
// union for that particular set.
usets.use(added);
usets.drop(me);
me = added;
} else {
// Compute the union.
const auto p = usets.merge(me, set);
me->link(p);
touched.push_back(me);
me = p;
}
}
// Reset the link fields. Note that we always add 1 refcount for sets we
// store in `touched` (to avoid having dangling references there) so drop
// it here.
for (const auto p : touched) {
p->link(nullptr);
usets.drop(p);
}
touched.clear();
refs.clear();
// TODO: We can drop the current's fact ownership info here - could shrink
// the vector.
// facts.shrink(fact.id);
stats.bumpOwned();
}
}
stats.dump();
}
}
std::unique_ptr<ComputedOwnership> computeOwnership(
const Inventory& inventory,
Lookup& lookup,
OwnershipUnitIterator *iter) {
uint32_t numUnits;
auto t = makeAutoTimer("computeOwnership");
LOG(INFO) << "computing ownership";
auto utrie = fillOwnership(iter,numUnits);
t.log("fillOwnership");
auto usets = collectUsets(numUnits,utrie);
t.log("collectUsets");
// TODO: Should `completeOwnership` work with the trie rather than a
// flat vector?
auto facts = utrie.flatten();
LOG(INFO) << "completing ownership: " << facts.size() << " facts";
completeOwnership(facts, usets, inventory, lookup);
t.log("completeOwnership");
std::vector<std::pair<Id,UsetId>> factOwners;
UsetId current = INVALID_USET;
for (uint32_t i = Id::lowest().toWord(); i < facts.size(); i++) {
auto usetid = facts[i] ? facts[i]->id : INVALID_USET;
if (usetid != current) {
factOwners.push_back(std::make_pair(Id::fromWord(i), usetid));
current = usetid;
}
}
auto sets = usets.toEliasFano();
return std::make_unique<ComputedOwnership>(
usets.getFirstId(),
std::move(sets),
std::move(factOwners));
}
}
}
}