glean/rts/factset.h

/* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include "glean/rts/define.h" #include "glean/rts/densemap.h" #include "glean/rts/fact.h" #include "glean/rts/inventory.h" #include "glean/rts/substitution.h" #include "glean/rts/store.h" #include <atomic> #include <boost/intrusive/list.hpp> #include <boost/iterator/transform_iterator.hpp> #include <folly/container/F14Set.h> #include <folly/Synchronized.h> namespace facebook { namespace glean { namespace rts { // We used to have maps Id -> Fact* and pair<Id,ByteRange> -> Fact* here but // this used quite a bit of memory (almost exactly as much as the facts // themselves). Sets with folly's heterogenous key comparisons should use a // lot less memory per fact. struct FactById { using value_type = Id; static value_type get(const Fact *fact) { return fact->id(); } static value_type get(const Fact::unique_ptr& p) { return get(p.get()); } static value_type get(const value_type& x) { return x; } }; struct FactByKey { using value_type = std::pair<Pid, folly::ByteRange>; static value_type get(const Fact *fact) { return {fact->type(), fact->key()}; } static value_type get(const Fact::unique_ptr& p) { return get(p.get()); } static value_type get(const value_type& x) { return x; } }; struct FactByKeyOnly { using value_type = folly::ByteRange; static value_type get(const Fact *fact) { return fact->key(); } static value_type get(const Fact::unique_ptr& p) { return get(p.get()); } static value_type get(const value_type& x) { return x; } }; template<typename By> struct EqualBy { template<typename T, typename U> bool operator()(const T& x, const U& y) const { return By::get(x) == By::get(y); } }; template<typename By> struct HashBy { template<typename T> uint64_t operator()(const T& x) const { return folly::Hash()(By::get(x)); } }; template<typename T, typename By> using FastSetBy = folly::F14FastSet< T, folly::transparent<HashBy<By>>, folly::transparent<EqualBy<By>>>; /** * A set of facts which can be looked up by id or by key. * * Iteration through the set happens in the order in which facts have been * first inserted. * */ class FactSet final : public Define { public: explicit FactSet(Id start) : starting_id(start) , fact_memory(0) {} FactSet(FactSet&&) = default; FactSet& operator=(FactSet&&) = default; FactSet(const FactSet&) = delete; FactSet& operator=(const FactSet&) = delete; size_t size() const noexcept { return facts.size(); } bool empty() const noexcept { return facts.empty(); } // TODO: make this into an iterator over facts struct deref { const Fact& operator()(const Fact::unique_ptr& p) const { return *p; } }; using const_iterator = boost::transform_iterator< deref, std::vector<Fact::unique_ptr>::const_iterator>; const_iterator begin() const { return boost::make_transform_iterator(facts.begin(), deref()); } const_iterator end() const { return boost::make_transform_iterator(facts.end(), deref()); } /// Return iterator to the first fact with an id that's not less than the /// given id (or end() if no such fact exists). const_iterator lower_bound(Id id) const { return begin() + (id <= starting_id ? 0 : std::min(distance(starting_id, id), facts.size())); } const_iterator upper_bound(Id id) const { return begin() + (id < starting_id ? 0 : std::min(distance(starting_id, id)+1, facts.size())); } /// Return the number of bytes occupied by facts. size_t factMemory() const noexcept { return fact_memory; } // Lookup implementation Id idByKey(Pid type, folly::ByteRange key) override; Pid typeById(Id id) override; bool factById(Id id, std::function<void(Pid, Fact::Clause)> f) override; Id startingId() const override { return starting_id; } Id firstFreeId() const override { return starting_id + facts.size(); } Interval count(Pid pid) const override; std::unique_ptr<FactIterator> enumerate( Id from = Id::invalid(), Id upto = Id::invalid()) override; std::unique_ptr<FactIterator> enumerateBack( Id from = Id::invalid(), Id downto = Id::invalid()) override; /// Prefix seeks. This function can be called from multiple threads but prefix /// seeks can *not* be interleaved with modifying the FactSet. /// /// WARNING: This is currently not intended for production use as it is very /// slow. The first call for each predicate will be especially slow as it will /// need to create an index. std::unique_ptr<FactIterator> seek( Pid type, folly::ByteRange start, size_t prefix_size) override; // Define implementation Id define(Pid type, Fact::Clause, Id max_ref = Id::invalid()) override; thrift::Batch serialize() const; thrift::Batch serializeReorder(folly::Range<const uint64_t*> order) const; // Substitute all facts in the set and split it into a global and a local part // based on the substitution. Facts with Ids that are in the range of the // substitution go into the global part - they are moved from the set // to the global Store. Facts that are beyond that range (i.e., those with // id >= subst.finish()) are assigned new Ids which don't clash with the // domain of the substitution and are added to the local part which is // returned by the function. FactSet rebase( const Inventory& inventory, const Substitution& subst, Store& global) const; /// Append a set of facts. This operation is only well defined under the /// following conditions. /// /// * other.startingId() == this->firstFreeId() unless one of the FactSets /// is empty /// * the fact sets are disjoint, i.e., there are no facts that exist in /// both sets /// void append(FactSet other); /// Checks if appending a particular fact set would be well defined. bool appendable(const FactSet& other) const; bool sanityCheck() const; private: Id starting_id; std::vector<Fact::unique_ptr> facts; DenseMap<Pid, FastSetBy<const Fact *, FactByKeyOnly>> keys; size_t fact_memory; /// Index for prefix seeks. It is lazily initialised and slow as we typically /// don't do seeks on FactSets. class Index final { public: Index() : impl(nullptr) {} ~Index(); Index(Index&&) noexcept; Index& operator=(Index&&); Index(const Index&) = delete; Index& operator=(const Index&) = delete; void swap(Index&) noexcept; /// Type of index maps using map_t = std::map<folly::ByteRange, const Fact *>; /// Type of index maps with synchronised access using entry_t = folly::Synchronized<map_t>; /// Get a reference to the index map for the given predicate, creating an /// empty one if necessary entry_t& operator[](Pid pid); private: struct Impl; std::atomic<Impl *> impl; }; Index index; }; } } }

glean/rts/factset.h (170 lines of code) (raw):