include/ylt/util/concurrentqueue.h (3,258 lines of code) (raw):

/* * Copyright (c) 2023, Alibaba Group Holding Limited; * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // Provides a C++11 implementation of a multi-producer, multi-consumer lock-free // queue. An overview, including benchmark results, is provided here: // http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ // The full design is also described in excruciating detail at: // http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue // Simplified BSD license: // Copyright (c) 2013-2020, Cameron Desrochers. // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // - Redistributions of source code must retain the above copyright notice, this // list of conditions and the following disclaimer. // - Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. // Also dual-licensed under the Boost Software License (see LICENSE.md) #pragma once #if defined(__GNUC__) && !defined(__INTEL_COMPILER) // Disable -Wconversion warnings (spuriously triggered when Traits::size_t and // Traits::index_t are set to < 32 bits, causing integer promotion, causing // warnings upon assigning any computed values) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" #ifdef MCDBGQ_USE_RELACY #pragma GCC diagnostic ignored "-Wint-to-pointer-cast" #endif #endif #if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) // VS2019 with /W4 warns about constant conditional expressions but unless // /std=c++17 or higher does not support `if constexpr`, so we have no choice // but to simply disable the warning #pragma warning(push) #pragma warning(disable : 4127) // conditional expression is constant #endif #if defined(__APPLE__) #include "TargetConditionals.h" #endif #ifdef MCDBGQ_USE_RELACY #include "relacy/relacy_std.hpp" #include "relacy_shims.h" // We only use malloc/free anyway, and the delete macro messes up `= delete` // method declarations. We'll override the default trait malloc ourselves // without a macro. 
#undef new #undef delete #undef malloc #undef free #else #include <atomic> // Requires C++11. Sorry VS2010. #include <cassert> #endif #include <algorithm> #include <array> #include <climits> // for CHAR_BIT #include <cstddef> // for max_align_t #include <cstdint> #include <cstdlib> #include <limits> #include <mutex> // used for thread exit synchronization #include <thread> // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading #include <type_traits> #include <utility> // Platform-specific definitions of a numeric thread ID type and an invalid // value namespace ylt::detail::moodycamel { namespace details { template <typename thread_id_t> struct thread_id_converter { typedef thread_id_t thread_id_numeric_size_t; typedef thread_id_t thread_id_hash_t; static thread_id_hash_t prehash(thread_id_t const& x) { return x; } }; } // namespace details } // namespace ylt::detail::moodycamel #if defined(MCDBGQ_USE_RELACY) namespace ylt::detail::moodycamel { namespace details { typedef std::uint32_t thread_id_t; static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; static inline thread_id_t thread_id() { return rl::thread_index(); } } // namespace details } // namespace ylt::detail::moodycamel #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) // No sense pulling in windows.h in a header, we'll manually declare the // function we use and rely on backwards-compatibility for this not to break extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId( void); namespace ylt::detail::moodycamel { namespace details { static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); typedef std::uint32_t thread_id_t; static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used // in practice. Note that all Win32 thread IDs are presently // multiples of 4. static inline thread_id_t thread_id() { return static_cast<thread_id_t>(::GetCurrentThreadId()); } } // namespace details } // namespace ylt::detail::moodycamel #elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ (defined(__APPLE__) && TARGET_OS_IPHONE) || \ defined(MOODYCAMEL_NO_THREAD_LOCAL) namespace ylt::detail::moodycamel { namespace details { static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); typedef std::thread::id thread_id_t; static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have // one; it's only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined // anyway, which it won't be. 
static inline thread_id_t thread_id() { return std::this_thread::get_id(); } template <std::size_t> struct thread_id_size {}; template <> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; template <> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; template <> struct thread_id_converter<thread_id_t> { typedef thread_id_size<sizeof(thread_id_t)>::numeric_t thread_id_numeric_size_t; #ifndef __APPLE__ typedef std::size_t thread_id_hash_t; #else typedef thread_id_numeric_size_t thread_id_hash_t; #endif static thread_id_hash_t prehash(thread_id_t const& x) { #ifndef __APPLE__ return std::hash<std::thread::id>()(x); #else return *reinterpret_cast<thread_id_hash_t const*>(&x); #endif } }; } } #else // Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 // In order to get a numeric thread ID in a platform-independent way, we use a // thread-local static variable's address as a thread identifier :-) #if defined(__GNUC__) || defined(__INTEL_COMPILER) #define MOODYCAMEL_THREADLOCAL __thread #elif defined(_MSC_VER) #define MOODYCAMEL_THREADLOCAL __declspec(thread) #else // Assume C++11 compliant compiler #define MOODYCAMEL_THREADLOCAL thread_local #endif namespace ylt::detail::moodycamel { namespace details { typedef std::uintptr_t thread_id_t; static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus // it's not aligned. inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast<thread_id_t>(&x); } } } #endif // Constexpr if #ifndef MOODYCAMEL_CONSTEXPR_IF #if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || \ __cplusplus > 201402L #define MOODYCAMEL_CONSTEXPR_IF if constexpr #define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] #else #define MOODYCAMEL_CONSTEXPR_IF if #define MOODYCAMEL_MAYBE_UNUSED #endif #endif // Exceptions #ifndef MOODYCAMEL_EXCEPTIONS_ENABLED #if (defined(_MSC_VER) && defined(_CPPUNWIND)) || \ (defined(__GNUC__) && defined(__EXCEPTIONS)) || \ (!defined(_MSC_VER) && !defined(__GNUC__)) #define MOODYCAMEL_EXCEPTIONS_ENABLED #endif #endif #ifdef MOODYCAMEL_EXCEPTIONS_ENABLED #define MOODYCAMEL_TRY try #define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) #define MOODYCAMEL_RETHROW throw #define MOODYCAMEL_THROW(expr) throw(expr) #else #define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true) #define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false) #define MOODYCAMEL_RETHROW #define MOODYCAMEL_THROW(expr) #endif #ifndef MOODYCAMEL_NOEXCEPT #if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) #define MOODYCAMEL_NOEXCEPT #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 // VS2012's std::is_nothrow_[move_]constructible is broken and returns true when // it shouldn't :-( We have to assume *all* non-trivial constructors may throw // on VS2012! #define MOODYCAMEL_NOEXCEPT _NOEXCEPT #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ (std::is_rvalue_reference<valueType>::value && \ std::is_move_constructible<type>::value \ ? std::is_trivially_move_constructible<type>::value \ : std::is_trivially_copy_constructible<type>::value) #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ ((std::is_rvalue_reference<valueType>::value && \ std::is_move_assignable<type>::value \ ? 
std::is_trivially_move_assignable<type>::value || \ std::is_nothrow_move_assignable<type>::value \ : std::is_trivially_copy_assignable<type>::value || \ std::is_nothrow_copy_assignable<type>::value) && \ MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 #define MOODYCAMEL_NOEXCEPT _NOEXCEPT #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ (std::is_rvalue_reference<valueType>::value && \ std::is_move_constructible<type>::value \ ? std::is_trivially_move_constructible<type>::value || \ std::is_nothrow_move_constructible<type>::value \ : std::is_trivially_copy_constructible<type>::value || \ std::is_nothrow_copy_constructible<type>::value) #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ ((std::is_rvalue_reference<valueType>::value && \ std::is_move_assignable<type>::value \ ? std::is_trivially_move_assignable<type>::value || \ std::is_nothrow_move_assignable<type>::value \ : std::is_trivially_copy_assignable<type>::value || \ std::is_nothrow_copy_assignable<type>::value) && \ MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #else #define MOODYCAMEL_NOEXCEPT noexcept #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) #endif #endif #ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #ifdef MCDBGQ_USE_RELACY #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #else // VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a // crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't // support thread_local either. Finally, iOS/ARM doesn't have support for it // either, and g++/ARM allows it to compile but it's unconfirmed to actually // work #if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ (!defined(__MINGW32__) && !defined(__MINGW64__) || \ !defined(__WINPTHREADS_VERSION)) && \ (!defined(__GNUC__) || __GNUC__ > 4 || \ (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \ !defined(_M_ARM) && !defined(__aarch64__) // Assume `thread_local` is fully supported in all other C++11 // compilers/platforms #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; // years ago several users // report having problems with // it on #endif #endif #endif // VS2012 doesn't support deleted functions. // In this case, we declare the function normally but don't define it. A link // error will be generated if the function is called. 
#ifndef MOODYCAMEL_DELETE_FUNCTION #if defined(_MSC_VER) && _MSC_VER < 1800 #define MOODYCAMEL_DELETE_FUNCTION #else #define MOODYCAMEL_DELETE_FUNCTION = delete #endif #endif namespace ylt::detail::moodycamel { namespace details { #ifndef MOODYCAMEL_ALIGNAS // VS2013 doesn't support alignas or alignof, and align() requires a constant // literal #if defined(_MSC_VER) && _MSC_VER <= 1800 #define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) #define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) #define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ typename details::Vs2013Aligned<std::alignment_of<obj>::value, T>::type template <int Align, typename T> struct Vs2013Aligned {}; // default, unsupported alignment template <typename T> struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; template <typename T> struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; template <typename T> struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; template <typename T> struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; template <typename T> struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; template <typename T> struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; }; template <typename T> struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; template <typename T> struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; template <typename T> struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; #else template <typename T> struct identity { typedef T type; }; #define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) #define MOODYCAMEL_ALIGNOF(obj) alignof(obj) #define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ alignas(alignof(obj)) typename details::identity<T>::type #endif #endif } // namespace details } // namespace ylt::detail::moodycamel // TSAN can false report races in lock-free code. To enable TSAN to be used // from projects that use this one, we can apply per-function compile-time // suppression. See // https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer #define MOODYCAMEL_NO_TSAN #if defined(__has_feature) #if __has_feature(thread_sanitizer) #undef MOODYCAMEL_NO_TSAN #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) #endif // TSAN #endif // TSAN // Compiler-specific likely/unlikely hints namespace ylt::detail::moodycamel { namespace details { #if defined(__GNUC__) static inline bool(likely)(bool x) { return __builtin_expect((x), true); } static inline bool(unlikely)(bool x) { return __builtin_expect((x), false); } #else static inline bool(likely)(bool x) { return x; } static inline bool(unlikely)(bool x) { return x; } #endif } // namespace details } // namespace ylt::detail::moodycamel #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG #include "internal/concurrentqueue_internal_debug.h" #endif namespace ylt::detail::moodycamel { namespace details { template <typename T> struct const_numeric_max { static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers"); static const T value = std::numeric_limits<T>::is_signed ? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1) : static_cast<T>(-1); }; #if defined(__GLIBCXX__) typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while #else typedef std::max_align_t std_max_align_t; // Others (e.g. 
MSVC) insist it can // *only* be accessed via std:: #endif // Some platforms have incorrectly set max_align_t to a type with <8 bytes // alignment even while supporting 8-byte aligned scalar values (*cough* 32-bit // iOS). Work around this with our own union. See issue #64. typedef union { std_max_align_t x; long long y; void* z; } max_align_t; } // namespace details // Default traits for the ConcurrentQueue. To change some of the // traits without re-implementing all of them, inherit from this // struct and shadow the declarations you wish to be different; // since the traits are used as a template type parameter, the // shadowed declarations will be used where defined, and the defaults // otherwise. struct ConcurrentQueueDefaultTraits { // General-purpose size type. std::size_t is strongly recommended. typedef std::size_t size_t; // The type used for the enqueue and dequeue indices. Must be at least as // large as size_t. Should be significantly larger than the number of elements // you expect to hold at once, especially if you have a high turnover rate; // for example, on 32-bit x86, if you expect to have over a hundred million // elements or pump several million elements through your queue in a very // short space of time, using a 32-bit type *may* trigger a race condition. // A 64-bit int type is recommended in that case, and in practice will // prevent a race condition no matter the usage of the queue. Note that // whether the queue is lock-free with a 64-int type depends on the whether // std::atomic<std::uint64_t> is lock-free, which is platform-specific. typedef std::size_t index_t; // Internally, all elements are enqueued and dequeued from multi-element // blocks; this is the smallest controllable unit. If you expect few elements // but many producers, a smaller block size should be favoured. For few // producers and/or many elements, a larger block size is preferred. A sane // default is provided. Must be a power of 2. static const size_t QUEUE_BLOCK_SIZE = 32; // For explicit producers (i.e. when using a producer token), the block is // checked for being empty by iterating through a list of flags, one per // element. For large block sizes, this is too inefficient, and switching to // an atomic counter-based approach is faster. The switch is made for block // sizes strictly larger than this threshold. static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; // How many full blocks can be expected for a single explicit producer? This // should reflect that number's maximum for optimal performance. Must be a // power of 2. static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; // How many full blocks can be expected for a single implicit producer? This // should reflect that number's maximum for optimal performance. Must be a // power of 2. static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; // The initial size of the hash table mapping thread IDs to implicit // producers. Note that the hash is resized every time it becomes half full. // Must be a power of two, and either 0 or at least 1. If 0, implicit // production (using the enqueue methods without an explicit producer token) // is disabled. static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; // Controls the number of items that an explicit consumer (i.e. one with a // token) must consume before it causes all consumers to rotate and move on to // the next internal queue. 
static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; // The maximum number of elements (inclusive) that can be enqueued to a // sub-queue. Enqueue operations that would cause this limit to be surpassed // will fail. Note that this limit is enforced at the block level (for // performance reasons), i.e. it's rounded up to the nearest block size. static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value; // The number of times to spin before sleeping when waiting on a semaphore. // Recommended values are on the order of 1000-10000 unless the number of // consumer threads exceeds the number of idle cores (in which case try // 0-100). Only affects instances of the BlockingConcurrentQueue. static const int MAX_SEMA_SPINS = 10000; // Whether to recycle dynamically-allocated blocks into an internal free list // or not. If false, only pre-allocated blocks (controlled by the constructor // arguments) will be recycled, and all others will be `free`d back to the // heap. Note that blocks consumed by explicit producers are only freed on // destruction of the queue (not following destruction of the token) // regardless of this trait. static const bool RECYCLE_ALLOCATED_BLOCKS = false; #ifndef MCDBGQ_USE_RELACY // Memory allocation can be customized if needed. // malloc should return nullptr on failure, and handle alignment like // std::malloc. #if defined(malloc) || defined(free) // Gah, this is 2015, stop defining macros that break standard code already! // Work around malloc/free being special macros: static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } static inline void WORKAROUND_free(void* ptr) { return free(ptr); } static inline void*(malloc)(size_t size) { return WORKAROUND_malloc(size); } static inline void(free)(void* ptr) { return WORKAROUND_free(ptr); } #else static inline void* malloc(size_t size) { return std::malloc(size); } static inline void free(void* ptr) { return std::free(ptr); } #endif #else // Debug versions when running under the Relacy race detector (ignore // these in user code) static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } static inline void free(void* ptr) { return rl::rl_free(ptr, $); } #endif }; // When producing or consuming many elements, the most efficient way is to: // 1) Use one of the bulk-operation methods of the queue with a token // 2) Failing that, use the bulk-operation methods without a token // 3) Failing that, create a token and use that with the single-item methods // 4) Failing that, use the single-parameter methods of the queue // Having said that, don't create tokens willy-nilly -- ideally there should be // a maximum of one token per thread (of each kind). 
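// Example (illustrative, not part of the original header): customizing the
// queue via traits, as described above -- inherit from
// ConcurrentQueueDefaultTraits and shadow only the members you want to change,
// then follow the token/bulk guidance for hot paths. The numbers below are
// arbitrary placeholder choices.
//
//   struct MyTraits : public ylt::detail::moodycamel::ConcurrentQueueDefaultTraits {
//     static const size_t QUEUE_BLOCK_SIZE = 256;  // larger blocks, fewer allocations
//     typedef std::uint64_t index_t;               // 64-bit indices for high turnover
//   };
//
//   ylt::detail::moodycamel::ConcurrentQueue<int, MyTraits> q;
//   ylt::detail::moodycamel::ProducerToken ptok(q);  // ideally one per producing thread
//   int batch[4] = {1, 2, 3, 4};
//   q.enqueue_bulk(ptok, batch, 4);                  // bulk + token: the preferred path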
struct ProducerToken; struct ConsumerToken; template <typename T, typename Traits> class ConcurrentQueue; template <typename T, typename Traits> class BlockingConcurrentQueue; class ConcurrentQueueTests; namespace details { struct ConcurrentQueueProducerTypelessBase { ConcurrentQueueProducerTypelessBase* next; std::atomic<bool> inactive; ProducerToken* token; ConcurrentQueueProducerTypelessBase() : next(nullptr), inactive(false), token(nullptr) {} }; template <bool use32> struct _hash_32_or_64 { static inline std::uint32_t hash(std::uint32_t h) { // MurmurHash3 finalizer -- see // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp // Since the thread ID is already unique, all we really want to do is // propagate that uniqueness evenly across all the bits, so that we can use // a subset of the bits while reducing collisions significantly h ^= h >> 16; h *= 0x85ebca6b; h ^= h >> 13; h *= 0xc2b2ae35; return h ^ (h >> 16); } }; template <> struct _hash_32_or_64<1> { static inline std::uint64_t hash(std::uint64_t h) { h ^= h >> 33; h *= 0xff51afd7ed558ccd; h ^= h >> 33; h *= 0xc4ceb9fe1a85ec53; return h ^ (h >> 33); } }; template <std::size_t size> struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {}; static inline size_t hash_thread_id(thread_id_t id) { static_assert( sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); return static_cast<size_t>( hash_32_or_64<sizeof( thread_id_converter<thread_id_t>::thread_id_hash_t)>:: hash(thread_id_converter<thread_id_t>::prehash(id))); } template <typename T> static inline bool circular_less_than(T a, T b) { static_assert( std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "circular_less_than is intended to be used only with unsigned integer " "types"); return static_cast<T>(a - b) > static_cast<T>(static_cast<T>(1) << (static_cast<T>(sizeof(T) * CHAR_BIT - 1))); // Note: extra parens around rhs of operator<< is MSVC bug: // https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 // silencing the bug requires #pragma warning(disable: 4554) around the // calling code and has no effect when done here. 
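  // Worked example (illustrative, not part of the original header): with
  // T = std::uint32_t,
  //   circular_less_than(3u, 5u)          -> true   (3 - 5 wraps to 0xFFFFFFFE)
  //   circular_less_than(5u, 3u)          -> false  (5 - 3 == 2)
  //   circular_less_than(0xFFFFFFF0u, 5u) -> true   (indices still compare
  //                                                  correctly across wrap-around)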
} template <typename U> static inline char* align_for(char* ptr) { const std::size_t alignment = std::alignment_of<U>::value; return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment; } template <typename T> static inline T ceil_to_pow_2(T x) { static_assert( std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); // Adapted from // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 --x; x |= x >> 1; x |= x >> 2; x |= x >> 4; for (std::size_t i = 1; i < sizeof(T); i <<= 1) { x |= x >> (i << 3); } ++x; return x; } template <typename T> static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right) { T temp = std::move(left.load(std::memory_order_relaxed)); left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); right.store(std::move(temp), std::memory_order_relaxed); } template <typename T> static inline T const& nomove(T const& x) { return x; } template <bool Enable> struct nomove_if { template <typename T> static inline T const& eval(T const& x) { return x; } }; template <> struct nomove_if<false> { template <typename U> static inline auto eval(U&& x) -> decltype(std::forward<U>(x)) { return std::forward<U>(x); } }; template <typename It> static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) { return *it; } #if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \ (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) template <typename T> struct is_trivially_destructible : std::is_trivially_destructible<T> {}; #else template <typename T> struct is_trivially_destructible : std::has_trivial_destructor<T> {}; #endif #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #ifdef MCDBGQ_USE_RELACY typedef RelacyThreadExitListener ThreadExitListener; typedef RelacyThreadExitNotifier ThreadExitNotifier; #else class ThreadExitNotifier; struct ThreadExitListener { typedef void (*callback_t)(void*); callback_t callback; void* userData; ThreadExitListener* next; // reserved for use by the ThreadExitNotifier ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier }; class ThreadExitNotifier { public: static void subscribe(ThreadExitListener* listener) { auto& tlsInst = instance(); std::lock_guard<std::mutex> guard(mutex()); listener->next = tlsInst.tail; listener->chain = &tlsInst; tlsInst.tail = listener; } static void unsubscribe(ThreadExitListener* listener) { std::lock_guard<std::mutex> guard(mutex()); if (!listener->chain) { return; // race with ~ThreadExitNotifier } auto& tlsInst = *listener->chain; listener->chain = nullptr; ThreadExitListener** prev = &tlsInst.tail; for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { if (ptr == listener) { *prev = ptr->next; break; } prev = &ptr->next; } } private: ThreadExitNotifier() : tail(nullptr) {} ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; ~ThreadExitNotifier() { // This thread is about to exit, let everyone know! assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the " "preprocessor conditions such that " "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); std::lock_guard<std::mutex> guard(mutex()); for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { ptr->chain = nullptr; ptr->callback(ptr->userData); } } // Thread-local static inline ThreadExitNotifier& instance() { static thread_local ThreadExitNotifier notifier; return notifier; } static inline std::mutex& mutex() { // Must be static because the ThreadExitNotifier could be destroyed while // unsubscribe is called static std::mutex mutex; return mutex; } private: ThreadExitListener* tail; }; #endif #endif template <typename T> struct static_is_lock_free_num { enum { value = 0 }; }; template <> struct static_is_lock_free_num<signed char> { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; template <> struct static_is_lock_free_num<short> { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; template <> struct static_is_lock_free_num<int> { enum { value = ATOMIC_INT_LOCK_FREE }; }; template <> struct static_is_lock_free_num<long> { enum { value = ATOMIC_LONG_LOCK_FREE }; }; template <> struct static_is_lock_free_num<long long> { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; template <typename T> struct static_is_lock_free : static_is_lock_free_num<typename std::make_signed<T>::type> {}; template <> struct static_is_lock_free<bool> { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; template <typename U> struct static_is_lock_free<U*> { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; } // namespace details struct ProducerToken { template <typename T, typename Traits> explicit ProducerToken(ConcurrentQueue<T, Traits>& queue); template <typename T, typename Traits> explicit ProducerToken(BlockingConcurrentQueue<T, Traits>& queue); ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT : producer(other.producer) { other.producer = nullptr; if (producer != nullptr) { producer->token = this; } } inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT { swap(other); return *this; } void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT { std::swap(producer, other.producer); if (producer != nullptr) { producer->token = this; } if (other.producer != nullptr) { other.producer->token = &other; } } // A token is always valid unless: // 1) Memory allocation failed during construction // 2) It was moved via the move constructor // (Note: assignment does a swap, leaving both potentially valid) // 3) The associated queue was destroyed // Note that if valid() returns true, that only indicates // that the token is valid for use with a specific queue, // but not which one; that's up to the user to track. 
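  // Example (illustrative, not part of the original header): move construction
  // transfers validity, per note (2) above. `q` is assumed to be an existing
  // queue.
  //
  //   ProducerToken a(q);
  //   ProducerToken b(std::move(a));  // b.valid() == true, a.valid() == false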
inline bool valid() const { return producer != nullptr; } ~ProducerToken() { if (producer != nullptr) { producer->token = nullptr; producer->inactive.store(true, std::memory_order_release); } } // Disable copying and assignment ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; private: template <typename T, typename Traits> friend class ConcurrentQueue; friend class ConcurrentQueueTests; protected: details::ConcurrentQueueProducerTypelessBase* producer; }; struct ConsumerToken { template <typename T, typename Traits> explicit ConsumerToken(ConcurrentQueue<T, Traits>& q); template <typename T, typename Traits> explicit ConsumerToken(BlockingConcurrentQueue<T, Traits>& q); ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) {} inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT { swap(other); return *this; } void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT { std::swap(initialOffset, other.initialOffset); std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); std::swap(currentProducer, other.currentProducer); std::swap(desiredProducer, other.desiredProducer); } // Disable copying and assignment ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; private: template <typename T, typename Traits> friend class ConcurrentQueue; friend class ConcurrentQueueTests; private: // but shared with ConcurrentQueue std::uint32_t initialOffset; std::uint32_t lastKnownGlobalOffset; std::uint32_t itemsConsumedFromCurrent; details::ConcurrentQueueProducerTypelessBase* currentProducer; details::ConcurrentQueueProducerTypelessBase* desiredProducer; }; // Need to forward-declare this swap because it's in a namespace. 
// See // http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces template <typename T, typename Traits> inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; template <typename T, typename Traits = ConcurrentQueueDefaultTraits> class ConcurrentQueue { public: typedef ::ylt::detail::moodycamel::ProducerToken producer_token_t; typedef ::ylt::detail::moodycamel::ConsumerToken consumer_token_t; typedef typename Traits::index_t index_t; typedef typename Traits::size_t size_t; static const size_t QUEUE_BLOCK_SIZE = static_cast<size_t>(Traits::QUEUE_BLOCK_SIZE); static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE); static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE); static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast<std::uint32_t>( Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable : 4307) // + integral constant overflow (that's what // the ternary expression is for!) #pragma warning(disable : 4309) // static_cast: Truncation of constant value #endif static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < QUEUE_BLOCK_SIZE) ? details::const_numeric_max<size_t>::value : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (QUEUE_BLOCK_SIZE - 1)) / QUEUE_BLOCK_SIZE * QUEUE_BLOCK_SIZE); #ifdef _MSC_VER #pragma warning(pop) #endif static_assert(!std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value, "Traits::size_t must be an unsigned integral type"); static_assert(!std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value, "Traits::index_t must be an unsigned integral type"); static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); static_assert( (QUEUE_BLOCK_SIZE > 1) && !(QUEUE_BLOCK_SIZE & (QUEUE_BLOCK_SIZE - 1)), "Traits::QUEUE_BLOCK_SIZE must be a power of 2 (and at least 2)"); static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a " "power of 2 (and greater than 1)"); static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and " "greater than 1)"); static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and " "greater than 1)"); static_assert( (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least " "1 
(or 0 to disable implicit enqueueing)"); public: // Creates a queue with at least `capacity` element slots; note that the // actual number of elements that can be inserted without additional memory // allocation depends on the number of producers and the block size (e.g. if // the block size is equal to `capacity`, only a single block will be // allocated up-front, which means only a single producer will be able to // enqueue elements without an extra allocation -- blocks aren't shared // between producers). This method is not thread safe -- it is up to the user // to ensure that the queue is fully constructed before it starts being used // by other threads (this includes making the memory effects of construction // visible, possibly with a memory barrier). explicit ConcurrentQueue(size_t capacity = 32 * QUEUE_BLOCK_SIZE) : producerListTail(nullptr), producerCount(0), initialBlockPoolIndex(0), nextExplicitConsumerId(0), globalExplicitConsumerOffset(0) { implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); populate_initial_implicit_producer_hash(); populate_initial_block_list( capacity / QUEUE_BLOCK_SIZE + ((capacity & (QUEUE_BLOCK_SIZE - 1)) == 0 ? 0 : 1)); #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG // Track all the producers using a fully-resolved typed list for // each kind; this makes it possible to debug them starting from // the root queue object (otherwise wacky casts are needed that // don't compile in the debugger's expression evaluator). explicitProducers.store(nullptr, std::memory_order_relaxed); implicitProducers.store(nullptr, std::memory_order_relaxed); #endif } // Computes the correct amount of pre-allocated blocks for you based // on the minimum number of elements you want available at any given // time, and the maximum concurrent number of each type of producer. ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) : producerListTail(nullptr), producerCount(0), initialBlockPoolIndex(0), nextExplicitConsumerId(0), globalExplicitConsumerOffset(0) { implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); populate_initial_implicit_producer_hash(); size_t blocks = (((minCapacity + QUEUE_BLOCK_SIZE - 1) / QUEUE_BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); populate_initial_block_list(blocks); #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG explicitProducers.store(nullptr, std::memory_order_relaxed); implicitProducers.store(nullptr, std::memory_order_relaxed); #endif } // Note: The queue should not be accessed concurrently while it's // being deleted. It's up to the user to synchronize this. // This method is not thread safe. 
~ConcurrentQueue() { // Destroy producers auto ptr = producerListTail.load(std::memory_order_relaxed); while (ptr != nullptr) { auto next = ptr->next_prod(); if (ptr->token != nullptr) { ptr->token->producer = nullptr; } destroy(ptr); ptr = next; } // Destroy implicit producer hash tables MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { auto hash = implicitProducerHash.load(std::memory_order_relaxed); while (hash != nullptr) { auto prev = hash->prev; if (prev != nullptr) { // The last hash is part of this object and was // not allocated dynamically for (size_t i = 0; i != hash->capacity; ++i) { hash->entries[i].~ImplicitProducerKVP(); } hash->~ImplicitProducerHash(); (Traits::free)(hash); } hash = prev; } } // Destroy global free list auto block = freeList.head_unsafe(); while (block != nullptr) { auto next = block->freeListNext.load(std::memory_order_relaxed); if (block->dynamicallyAllocated) { destroy(block); } block = next; } // Destroy initial free list destroy_array(initialBlockPool, initialBlockPoolSize); } // Disable copying and copy assignment ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; // Moving is supported, but note that it is *not* a thread-safe operation. // Nobody can use the queue while it's being moved, and the memory effects // of that move must be propagated to other threads before they can use it. // Note: When a queue is moved, its tokens are still valid but can only be // used with the destination queue (i.e. semantically they are moved along // with the queue itself). ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT : producerListTail( other.producerListTail.load(std::memory_order_relaxed)), producerCount(other.producerCount.load(std::memory_order_relaxed)), initialBlockPoolIndex( other.initialBlockPoolIndex.load(std::memory_order_relaxed)), initialBlockPool(other.initialBlockPool), initialBlockPoolSize(other.initialBlockPoolSize), freeList(std::move(other.freeList)), nextExplicitConsumerId( other.nextExplicitConsumerId.load(std::memory_order_relaxed)), globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load( std::memory_order_relaxed)) { // Move the other one into this, and leave the other one as an empty queue implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); populate_initial_implicit_producer_hash(); swap_implicit_producer_hashes(other); other.producerListTail.store(nullptr, std::memory_order_relaxed); other.producerCount.store(0, std::memory_order_relaxed); other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG explicitProducers.store( other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); other.explicitProducers.store(nullptr, std::memory_order_relaxed); implicitProducers.store( other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); other.implicitProducers.store(nullptr, std::memory_order_relaxed); #endif other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); other.initialBlockPoolSize = 0; other.initialBlockPool = nullptr; reown_producers(); } inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT { return swap_internal(other); } // Swaps this queue's state with the other's. Not thread-safe. 
// Swapping two queues does not invalidate their tokens, however // the tokens that were created for one queue must be used with // only the swapped queue (i.e. the tokens are tied to the // queue's movable state, not the object itself). inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT { swap_internal(other); } private: ConcurrentQueue& swap_internal(ConcurrentQueue& other) { if (this == &other) { return *this; } details::swap_relaxed(producerListTail, other.producerListTail); details::swap_relaxed(producerCount, other.producerCount); details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); std::swap(initialBlockPool, other.initialBlockPool); std::swap(initialBlockPoolSize, other.initialBlockPoolSize); freeList.swap(other.freeList); details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); swap_implicit_producer_hashes(other); reown_producers(); other.reown_producers(); #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG details::swap_relaxed(explicitProducers, other.explicitProducers); details::swap_relaxed(implicitProducers, other.implicitProducers); #endif return *this; } public: // Enqueues a single item (by copying it). // Allocates memory if required. Only fails if memory allocation fails (or // implicit production is disabled because // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Thread-safe. inline bool enqueue(T const& item) { MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; else return inner_enqueue<CanAlloc>(item); } // Enqueues a single item (by moving it, if possible). // Allocates memory if required. Only fails if memory allocation fails (or // implicit production is disabled because // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Thread-safe. inline bool enqueue(T&& item) { MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; else return inner_enqueue<CanAlloc>(std::move(item)); } // Enqueues a single item (by copying it) using an explicit producer token. // Allocates memory if required. Only fails if memory allocation fails (or // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Thread-safe. inline bool enqueue(producer_token_t const& token, T const& item) { return inner_enqueue<CanAlloc>(token, item); } // Enqueues a single item (by moving it, if possible) using an explicit // producer token. Allocates memory if required. Only fails if memory // allocation fails (or Traits::MAX_SUBQUEUE_SIZE has been defined and would // be surpassed). Thread-safe. inline bool enqueue(producer_token_t const& token, T&& item) { return inner_enqueue<CanAlloc>(token, std::move(item)); } // Enqueues several items. // Allocates memory if required. Only fails if memory allocation fails (or // implicit production is disabled because // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). Note: // Use std::make_move_iterator if the elements should be moved instead of // copied. Thread-safe. template <typename It> bool enqueue_bulk(It itemFirst, size_t count) { MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; else return inner_enqueue_bulk<CanAlloc>(itemFirst, count); } // Enqueues several items using an explicit producer token. 
// Allocates memory if required. Only fails if memory allocation fails // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Note: Use std::make_move_iterator if the elements should be moved // instead of copied. // Thread-safe. template <typename It> bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) { return inner_enqueue_bulk<CanAlloc>(token, itemFirst, count); } // Enqueues a single item (by copying it). // Does not allocate memory. Fails if not enough room to enqueue (or implicit // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE // is 0). // Thread-safe. inline bool try_enqueue(T const& item) { MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; else return inner_enqueue<CannotAlloc>(item); } // Enqueues a single item (by moving it, if possible). // Does not allocate memory (except for one-time implicit producer). // Fails if not enough room to enqueue (or implicit production is // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). // Thread-safe. inline bool try_enqueue(T&& item) { MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; else return inner_enqueue<CannotAlloc>(std::move(item)); } // Enqueues a single item (by copying it) using an explicit producer token. // Does not allocate memory. Fails if not enough room to enqueue. // Thread-safe. inline bool try_enqueue(producer_token_t const& token, T const& item) { return inner_enqueue<CannotAlloc>(token, item); } // Enqueues a single item (by moving it, if possible) using an explicit // producer token. Does not allocate memory. Fails if not enough room to // enqueue. Thread-safe. inline bool try_enqueue(producer_token_t const& token, T&& item) { return inner_enqueue<CannotAlloc>(token, std::move(item)); } // Enqueues several items. // Does not allocate memory (except for one-time implicit producer). // Fails if not enough room to enqueue (or implicit production is // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). // Note: Use std::make_move_iterator if the elements should be moved // instead of copied. // Thread-safe. template <typename It> bool try_enqueue_bulk(It itemFirst, size_t count) { MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; else return inner_enqueue_bulk<CannotAlloc>(itemFirst, count); } // Enqueues several items using an explicit producer token. // Does not allocate memory. Fails if not enough room to enqueue. // Note: Use std::make_move_iterator if the elements should be moved // instead of copied. // Thread-safe. template <typename It> bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) { return inner_enqueue_bulk<CannotAlloc>(token, itemFirst, count); } // Attempts to dequeue from the queue. // Returns false if all producer streams appeared empty at the time they // were checked (so, the queue is likely but not guaranteed to be empty). // Never allocates. Thread-safe. template <typename U> bool try_dequeue(U& item) { // Instead of simply trying each producer in turn (which could cause // needless contention on the first producer), we score them heuristically. 
    size_t nonEmptyCount = 0;
    ProducerBase* best = nullptr;
    size_t bestSize = 0;
    for (auto ptr = producerListTail.load(std::memory_order_acquire);
         nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) {
      auto size = ptr->size_approx();
      if (size > 0) {
        if (size > bestSize) {
          bestSize = size;
          best = ptr;
        }
        ++nonEmptyCount;
      }
    }

    // If there was at least one non-empty queue but it appears empty at the
    // time we try to dequeue from it, we need to make sure every queue's been
    // tried
    if (nonEmptyCount > 0) {
      if ((details::likely)(best->dequeue(item))) {
        return true;
      }
      for (auto ptr = producerListTail.load(std::memory_order_acquire);
           ptr != nullptr; ptr = ptr->next_prod()) {
        if (ptr != best && ptr->dequeue(item)) {
          return true;
        }
      }
    }
    return false;
  }

  // Attempts to dequeue from the queue.
  // Returns false if all producer streams appeared empty at the time they
  // were checked (so, the queue is likely but not guaranteed to be empty).
  // This differs from the try_dequeue(item) method in that this one does
  // not attempt to reduce contention by interleaving the order that producer
  // streams are dequeued from. So, using this method can reduce overall
  // throughput under contention, but will give more predictable results in
  // single-threaded consumer scenarios. This is mostly only useful for
  // internal unit tests. Never allocates. Thread-safe.
  template <typename U>
  bool try_dequeue_non_interleaved(U& item) {
    for (auto ptr = producerListTail.load(std::memory_order_acquire);
         ptr != nullptr; ptr = ptr->next_prod()) {
      if (ptr->dequeue(item)) {
        return true;
      }
    }
    return false;
  }

  // Attempts to dequeue from the queue using an explicit consumer token.
  // Returns false if all producer streams appeared empty at the time they
  // were checked (so, the queue is likely but not guaranteed to be empty).
  // Never allocates. Thread-safe.
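  // Example (illustrative, not part of the original header): a consumer thread
  // draining the queue with its own token. `q` is an existing queue and
  // `process` is a hypothetical consumer function.
  //
  //   ylt::detail::moodycamel::ConsumerToken ctok(q);
  //   int item;
  //   while (q.try_dequeue(ctok, item)) {
  //     process(item);
  //   }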
  template <typename U>
  bool try_dequeue(consumer_token_t& token, U& item) {
    // The idea is roughly as follows:
    // Every 256 items from one producer, make everyone rotate (increase the
    // global offset) -> this means the highest-efficiency consumer dictates
    // the rotation speed of everyone else, more or less. If you see that the
    // global offset has changed, you must reset your consumption counter and
    // move to your designated place. If there are no items where you're
    // supposed to be, keep moving until you find a producer with some items.
    // If the global offset has not changed but you've run out of items to
    // consume, move over from your current position until you find a producer
    // with something in it.
    if (token.desiredProducer == nullptr ||
        token.lastKnownGlobalOffset !=
            globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
      if (!update_current_producer_after_rotation(token)) {
        return false;
      }
    }

    // If there was at least one non-empty queue but it appears empty at the
    // time we try to dequeue from it, we need to make sure every queue's been
    // tried
    if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
      if (++token.itemsConsumedFromCurrent ==
          EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
        globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
      }
      return true;
    }

    auto tail = producerListTail.load(std::memory_order_acquire);
    auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
    if (ptr == nullptr) {
      ptr = tail;
    }
    while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
      if (ptr->dequeue(item)) {
        token.currentProducer = ptr;
        token.itemsConsumedFromCurrent = 1;
        return true;
      }
      ptr = ptr->next_prod();
      if (ptr == nullptr) {
        ptr = tail;
      }
    }
    return false;
  }

  // Attempts to dequeue several elements from the queue.
  // Returns the number of items actually dequeued.
  // Returns 0 if all producer streams appeared empty at the time they
  // were checked (so, the queue is likely but not guaranteed to be empty).
  // Never allocates. Thread-safe.
  template <typename It>
  size_t try_dequeue_bulk(It itemFirst, size_t max) {
    size_t count = 0;
    for (auto ptr = producerListTail.load(std::memory_order_acquire);
         ptr != nullptr; ptr = ptr->next_prod()) {
      count += ptr->dequeue_bulk(itemFirst, max - count);
      if (count == max) {
        break;
      }
    }
    return count;
  }

  // Attempts to dequeue several elements from the queue using an explicit
  // consumer token. Returns the number of items actually dequeued. Returns 0
  // if all producer streams appeared empty at the time they were checked (so,
  // the queue is likely but not guaranteed to be empty). Never allocates.
  // Thread-safe.
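  // Example (illustrative, not part of the original header): bulk dequeue with
  // a token into a local buffer; `ctok` and `process` are as in the previous
  // example.
  //
  //   int items[64];
  //   size_t n = q.try_dequeue_bulk(ctok, items, 64);
  //   for (size_t i = 0; i != n; ++i) {
  //     process(items[i]);
  //   }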
template <typename It> size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) { if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { if (!update_current_producer_after_rotation(token)) { return 0; } } size_t count = static_cast<ProducerBase*>(token.currentProducer) ->dequeue_bulk(itemFirst, max); if (count == max) { if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); } return max; } token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count); max -= count; auto tail = producerListTail.load(std::memory_order_acquire); auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod(); if (ptr == nullptr) { ptr = tail; } while (ptr != static_cast<ProducerBase*>(token.currentProducer)) { auto dequeued = ptr->dequeue_bulk(itemFirst, max); count += dequeued; if (dequeued != 0) { token.currentProducer = ptr; token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued); } if (dequeued == max) { break; } max -= dequeued; ptr = ptr->next_prod(); if (ptr == nullptr) { ptr = tail; } } return count; } // Attempts to dequeue from a specific producer's inner queue. // If you happen to know which producer you want to dequeue from, this // is significantly faster than using the general-case try_dequeue methods. // Returns false if the producer's queue appeared empty at the time it // was checked (so, the queue is likely but not guaranteed to be empty). // Never allocates. Thread-safe. template <typename U> inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) { return static_cast<ExplicitProducer*>(producer.producer)->dequeue(item); } // Attempts to dequeue several elements from a specific producer's inner // queue. Returns the number of items actually dequeued. If you happen to know // which producer you want to dequeue from, this is significantly faster than // using the general-case try_dequeue methods. Returns 0 if the producer's // queue appeared empty at the time it was checked (so, the queue is likely // but not guaranteed to be empty). Never allocates. Thread-safe. template <typename It> inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) { return static_cast<ExplicitProducer*>(producer.producer) ->dequeue_bulk(itemFirst, max); } // Returns an estimate of the total number of elements currently in the queue. // This estimate is only accurate if the queue has completely stabilized // before it is called (i.e. all enqueue and dequeue operations have completed // and their memory effects are visible on the calling thread, and no further // operations start while this method is being called). Thread-safe. size_t size_approx() const { size_t size = 0; for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { size += ptr->size_approx(); } return size; } // Returns true if the underlying atomic variables used by // the queue are lock-free (they should be on most platforms). // Thread-safe. 
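  // Example (illustrative, not part of the original header): since
  // is_lock_free() is constexpr, lock-freedom can be checked at compile time.
  //
  //   static_assert(
  //       ylt::detail::moodycamel::ConcurrentQueue<int>::is_lock_free(),
  //       "expected lock-free atomics on this platform");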
static constexpr bool is_lock_free() { return details::static_is_lock_free<bool>::value == 2 && details::static_is_lock_free<size_t>::value == 2 && details::static_is_lock_free<std::uint32_t>::value == 2 && details::static_is_lock_free<index_t>::value == 2 && details::static_is_lock_free<void*>::value == 2 && details::static_is_lock_free<typename details::thread_id_converter< details::thread_id_t>::thread_id_numeric_size_t>::value == 2; } private: friend struct ProducerToken; friend struct ConsumerToken; struct ExplicitProducer; friend struct ExplicitProducer; struct ImplicitProducer; friend struct ImplicitProducer; friend class ConcurrentQueueTests; enum AllocationMode { CanAlloc, CannotAlloc }; /////////////////////////////// // Queue methods /////////////////////////////// template <AllocationMode canAlloc, typename U> inline bool inner_enqueue(producer_token_t const& token, U&& element) { return static_cast<ExplicitProducer*>(token.producer) ->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>( std::forward<U>(element)); } template <AllocationMode canAlloc, typename U> inline bool inner_enqueue(U&& element) { auto producer = get_or_add_implicit_producer(); return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue< canAlloc>(std::forward<U>(element)); } template <AllocationMode canAlloc, typename It> inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) { return static_cast<ExplicitProducer*>(token.producer) ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>( itemFirst, count); } template <AllocationMode canAlloc, typename It> inline bool inner_enqueue_bulk(It itemFirst, size_t count) { auto producer = get_or_add_implicit_producer(); return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer:: template enqueue_bulk<canAlloc>(itemFirst, count); } inline bool update_current_producer_after_rotation(consumer_token_t& token) { // Ah, there's been a rotation, figure out where we should be! auto tail = producerListTail.load(std::memory_order_acquire); if (token.desiredProducer == nullptr && tail == nullptr) { return false; } auto prodCount = producerCount.load(std::memory_order_relaxed); auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); if ((details::unlikely)(token.desiredProducer == nullptr)) { // Aha, first time we're dequeueing anything. 
// Figure out our local position // Note: offset is from start, not end, but we're traversing from end -- // subtract from count first std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); token.desiredProducer = tail; for (std::uint32_t i = 0; i != offset; ++i) { token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod(); if (token.desiredProducer == nullptr) { token.desiredProducer = tail; } } } std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; if (delta >= prodCount) { delta = delta % prodCount; } for (std::uint32_t i = 0; i != delta; ++i) { token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod(); if (token.desiredProducer == nullptr) { token.desiredProducer = tail; } } token.lastKnownGlobalOffset = globalOffset; token.currentProducer = token.desiredProducer; token.itemsConsumedFromCurrent = 0; return true; } /////////////////////////// // Free list /////////////////////////// template <typename N> struct FreeListNode { FreeListNode() : freeListRefs(0), freeListNext(nullptr) {} std::atomic<std::uint32_t> freeListRefs; std::atomic<N*> freeListNext; }; // A simple CAS-based lock-free free list. Not the fastest thing in the world // under heavy contention, but simple and correct (assuming nodes are never // freed until after the free list is destroyed), and fairly speedy under low // contention. template <typename N> // N must inherit FreeListNode or have the same fields // (and initialization of them) struct FreeList { FreeList() : freeListHead(nullptr) {} FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; inline void add(N* node) { #ifdef MCDBGQ_NOLOCKFREE_FREELIST debug::DebugLock lock(mutex); #endif // We know that the should-be-on-freelist bit is 0 at this point, so it's // safe to set it using a fetch_add if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { // Oh look! We were the last ones referencing this node, and we know // we want to add it to the free list, so let's do it! add_knowing_refcount_is_zero(node); } } inline N* try_get() { #ifdef MCDBGQ_NOLOCKFREE_FREELIST debug::DebugLock lock(mutex); #endif auto head = freeListHead.load(std::memory_order_acquire); while (head != nullptr) { auto prevHead = head; auto refs = head->freeListRefs.load(std::memory_order_relaxed); if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong( refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { head = freeListHead.load(std::memory_order_acquire); continue; } // Good, reference count has been incremented (it wasn't at zero), which // means we can read the next and not worry about it changing between // now and the time we do the CAS auto next = head->freeListNext.load(std::memory_order_relaxed); if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { // Yay, got the node. This means it was on the list, which means // shouldBeOnFreeList must be false no matter the refcount (because // nobody else knows it's been taken off yet, it can't have been put // back on). 
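        // (Recall the layout of freeListRefs: the low 31 bits (REFS_MASK)
        // hold the reference count and the top bit is the
        // SHOULD_BE_ON_FREELIST flag, which is why the assert below tests
        // that flag bit on its own.)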
assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); // Decrease refcount twice, once for our ref, and once for the list's // ref head->freeListRefs.fetch_sub(2, std::memory_order_release); return head; } // OK, the head must have changed on us, but we still need to decrease // the refcount we increased. Note that we don't need to release any // memory effects, but we do need to ensure that the reference count // decrement happens-after the CAS on the head. refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); if (refs == SHOULD_BE_ON_FREELIST + 1) { add_knowing_refcount_is_zero(prevHead); } } return nullptr; } // Useful for traversing the list when there's no contention (e.g. to // destroy remaining nodes) N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } private: inline void add_knowing_refcount_is_zero(N* node) { // Since the refcount is zero, and nobody can increase it once it's zero // (except us, and we run only one copy of this method per node at a time, // i.e. the single thread case), then we know we can safely change the // next pointer of the node; however, once the refcount is back above // zero, then other threads could increase it (happens under heavy // contention, when the refcount goes to zero in between a load and a // refcount increment of a node in try_get, then back up to something // non-zero, then the refcount increment is done by the other thread) -- // so, if the CAS to add the node to the actual list fails, decrease the // refcount and leave the add operation to the next thread who puts the // refcount back at zero (which could be us, hence the loop). auto head = freeListHead.load(std::memory_order_relaxed); while (true) { node->freeListNext.store(head, std::memory_order_relaxed); node->freeListRefs.store(1, std::memory_order_release); if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { // Hmm, the add failed, but we can only try again when the refcount // goes back to zero if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { continue; } } return; } } private: // Implemented like a stack, but where node order doesn't matter (nodes are // inserted out of order under contention) std::atomic<N*> freeListHead; static const std::uint32_t REFS_MASK = 0x7FFFFFFF; static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; #ifdef MCDBGQ_NOLOCKFREE_FREELIST debug::DebugMutex mutex; #endif }; /////////////////////////// // Block /////////////////////////// enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; struct Block { Block() : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true) { #ifdef MCDBGQ_TRACKMEM owner = nullptr; #endif } template <InnerQueueContext context> inline bool is_empty() const { MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && QUEUE_BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Check flags for (size_t i = 0; i < QUEUE_BLOCK_SIZE; ++i) { if (!emptyFlags[i].load(std::memory_order_relaxed)) { return false; } } // Aha, empty; make sure we have all other memory effects that happened // before the empty flags were set std::atomic_thread_fence(std::memory_order_acquire); return true; } else { // Check counter if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == QUEUE_BLOCK_SIZE) { std::atomic_thread_fence(std::memory_order_acquire); return true; } 
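      // (For instance, with a hypothetical block size of 32 the counter path
      // reports empty exactly once 32 completed dequeues have been counted;
      // the acquire fence above pairs with the release increments performed
      // in set_empty/set_many_empty.)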
assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= QUEUE_BLOCK_SIZE); return false; } } // Returns true if the block is now empty (does not apply in explicit // context) template <InnerQueueContext context> inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) { MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && QUEUE_BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Set flag assert(!emptyFlags[QUEUE_BLOCK_SIZE - 1 - static_cast<size_t>( i & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1))] .load(std::memory_order_relaxed)); emptyFlags[QUEUE_BLOCK_SIZE - 1 - static_cast<size_t>( i & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1))] .store(true, std::memory_order_release); return false; } else { // Increment counter auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); assert(prevVal < QUEUE_BLOCK_SIZE); return prevVal == QUEUE_BLOCK_SIZE - 1; } } // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping // and count > 0). Returns true if the block is now empty (does not apply in // explicit context). template <InnerQueueContext context> inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) { MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && QUEUE_BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Set flags std::atomic_thread_fence(std::memory_order_release); i = QUEUE_BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) - count + 1; for (size_t j = 0; j != count; ++j) { assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); emptyFlags[i + j].store(true, std::memory_order_relaxed); } return false; } else { // Increment counter auto prevVal = elementsCompletelyDequeued.fetch_add( count, std::memory_order_release); assert(prevVal + count <= QUEUE_BLOCK_SIZE); return prevVal + count == QUEUE_BLOCK_SIZE; } } template <InnerQueueContext context> inline void set_all_empty() { MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && QUEUE_BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Set all flags for (size_t i = 0; i != QUEUE_BLOCK_SIZE; ++i) { emptyFlags[i].store(true, std::memory_order_relaxed); } } else { // Reset counter elementsCompletelyDequeued.store(QUEUE_BLOCK_SIZE, std::memory_order_relaxed); } } template <InnerQueueContext context> inline void reset_empty() { MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && QUEUE_BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Reset flags for (size_t i = 0; i != QUEUE_BLOCK_SIZE; ++i) { emptyFlags[i].store(false, std::memory_order_relaxed); } } else { // Reset counter elementsCompletelyDequeued.store(0, std::memory_order_relaxed); } } inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast<T*>(static_cast<void*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)); } inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast<T const*>(static_cast<void const*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)); } private: static_assert(std::alignment_of<T>::value <= sizeof(T), "The queue does not support types with an alignment greater " "than their size at this time"); MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * QUEUE_BLOCK_SIZE], T) elements; public: Block* next; std::atomic<size_t> elementsCompletelyDequeued; std::atomic<bool> emptyFlags[QUEUE_BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
QUEUE_BLOCK_SIZE : 1]; public: std::atomic<std::uint32_t> freeListRefs; std::atomic<Block*> freeListNext; bool dynamicallyAllocated; // Perhaps a better name for this would be // 'isNotPartOfInitialBlockPool' #ifdef MCDBGQ_TRACKMEM void* owner; #endif }; static_assert(std::alignment_of<Block>::value >= std::alignment_of<T>::value, "Internal error: Blocks must be at least as aligned as the " "type they are wrapping"); #ifdef MCDBGQ_TRACKMEM public: struct MemStats; private: #endif /////////////////////////// // Producer base /////////////////////////// struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase { ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : tailIndex(0), headIndex(0), dequeueOptimisticCount(0), dequeueOvercommit(0), tailBlock(nullptr), isExplicit(isExplicit_), parent(parent_) {} virtual ~ProducerBase() {} template <typename U> inline bool dequeue(U& element) { if (isExplicit) { return static_cast<ExplicitProducer*>(this)->dequeue(element); } else { return static_cast<ImplicitProducer*>(this)->dequeue(element); } } template <typename It> inline size_t dequeue_bulk(It& itemFirst, size_t max) { if (isExplicit) { return static_cast<ExplicitProducer*>(this)->dequeue_bulk(itemFirst, max); } else { return static_cast<ImplicitProducer*>(this)->dequeue_bulk(itemFirst, max); } } inline ProducerBase* next_prod() const { return static_cast<ProducerBase*>(next); } inline size_t size_approx() const { auto tail = tailIndex.load(std::memory_order_relaxed); auto head = headIndex.load(std::memory_order_relaxed); return details::circular_less_than(head, tail) ? static_cast<size_t>(tail - head) : 0; } inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } protected: std::atomic<index_t> tailIndex; // Where to enqueue to next std::atomic<index_t> headIndex; // Where to dequeue from next std::atomic<index_t> dequeueOptimisticCount; std::atomic<index_t> dequeueOvercommit; Block* tailBlock; public: bool isExplicit; ConcurrentQueue* parent; protected: #ifdef MCDBGQ_TRACKMEM friend struct MemStats; #endif }; /////////////////////////// // Explicit queue /////////////////////////// struct ExplicitProducer : public ProducerBase { explicit ExplicitProducer(ConcurrentQueue* parent_) : ProducerBase(parent_, true), blockIndex(nullptr), pr_blockIndexSlotsUsed(0), pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), pr_blockIndexFront(0), pr_blockIndexEntries(nullptr), pr_blockIndexRaw(nullptr) { size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; if (poolBasedIndexSize > pr_blockIndexSize) { pr_blockIndexSize = poolBasedIndexSize; } new_block_index(0); // This creates an index with double the number of // current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE } ~ExplicitProducer() { // Destruct any elements not yet dequeued. // Since we're in the destructor, we can assume all elements // are either completely dequeued or completely not (no halfways). 
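    // Roughly: locate the partially-dequeued block (if any) via the block
    // index, destruct every element that was never dequeued, hand all owned
    // blocks back to the parent's free list, then free the chain of block
    // index allocations.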
if (this->tailBlock != nullptr) { // Note this means there must be a block index too // First find the block that's partially dequeued, if any Block* halfDequeuedBlock = nullptr; if ((this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) != 0) { // The head's not on a block boundary, meaning a block somewhere is // partially dequeued (or the head block is the tail block and was // fully dequeued, but the head/tail are still not on a boundary) size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); while (details::circular_less_than<index_t>( pr_blockIndexEntries[i].base + QUEUE_BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { i = (i + 1) & (pr_blockIndexSize - 1); } assert(details::circular_less_than<index_t>( pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); halfDequeuedBlock = pr_blockIndexEntries[i].block; } // Start at the head block (note the first line in the loop gives us the // head from the tail on the first iteration) auto block = this->tailBlock; do { block = block->next; if (block->ConcurrentQueue::Block::template is_empty< explicit_context>()) { continue; } size_t i = 0; // Offset into block if (block == halfDequeuedBlock) { i = static_cast<size_t>( this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)); } // Walk through all the items in the block; if this is the tail block, // we need to stop when we reach the tail index auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0 ? QUEUE_BLOCK_SIZE : static_cast<size_t>( this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)); while (i != QUEUE_BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { (*block)[i++]->~T(); } } while (block != this->tailBlock); } // Destroy all blocks that we own if (this->tailBlock != nullptr) { auto block = this->tailBlock; do { auto nextBlock = block->next; this->parent->add_block_to_free_list(block); block = nextBlock; } while (block != this->tailBlock); } // Destroy the block indices auto header = static_cast<BlockIndexHeader*>(pr_blockIndexRaw); while (header != nullptr) { auto prev = static_cast<BlockIndexHeader*>(header->prev); header->~BlockIndexHeader(); (Traits::free)(header); header = prev; } } template <AllocationMode allocMode, typename U> inline bool enqueue(U&& element) { index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); index_t newTailIndex = 1 + currentTailIndex; if ((currentTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0) { // We reached the end of a block, start a new one auto startBlock = this->tailBlock; auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty< explicit_context>()) { // We can re-use the block ahead of us, it's empty! this->tailBlock = this->tailBlock->next; this->tailBlock->ConcurrentQueue::Block::template reset_empty< explicit_context>(); // We'll put the block on the block index (guaranteed to be room since // we're conceptually removing the last block from it first -- except // instead of removing then adding, we can just overwrite). 
Note that // there must be a valid block index here, since even if allocation // failed in the ctor, it would have been re-attempted when adding the // first block to the queue; since there is such a block, a block // index must have been successfully allocated. } else { // Whatever head value we see here is >= the last value we saw here // (relatively), and <= its current value. Since we have the most // recent tail, the head must be // <= to it. auto head = this->headIndex.load(std::memory_order_relaxed); assert(!details::circular_less_than<index_t>(currentTailIndex, head)); if (!details::circular_less_than<index_t>( head, currentTailIndex + QUEUE_BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - QUEUE_BLOCK_SIZE < currentTailIndex - head))) { // We can't enqueue in another block because there's not enough // leeway -- the tail could surpass the head by the time the block // fills up! (Or we'll exceed the size limit, if the second part of // the condition was true.) return false; } // We're going to need a new block; check that the block index has // room if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { // Hmm, the circular block index is already full -- we'll need // to allocate a new index. Note pr_blockIndexRaw can only be // nullptr if the initial allocation failed in the constructor. MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { return false; } else if (!new_block_index(pr_blockIndexSlotsUsed)) { return false; } } // Insert a new block in the circular linked list auto newBlock = this->parent ->ConcurrentQueue::template requisition_block<allocMode>(); if (newBlock == nullptr) { return false; } #ifdef MCDBGQ_TRACKMEM newBlock->owner = this; #endif newBlock->ConcurrentQueue::Block::template reset_empty< explicit_context>(); if (this->tailBlock == nullptr) { newBlock->next = newBlock; } else { newBlock->next = this->tailBlock->next; this->tailBlock->next = newBlock; } this->tailBlock = newBlock; ++pr_blockIndexSlotsUsed; } MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) { // The constructor may throw. We want the element not to appear in the // queue in that case (without corrupting the queue): MOODYCAMEL_TRY { new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element)); } MOODYCAMEL_CATCH(...) { // Revert change to the current block, but leave the new block // available for next time pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; this->tailBlock = startBlock == nullptr ? 
this->tailBlock : startBlock; MOODYCAMEL_RETHROW; } } else { (void)startBlock; (void)originalBlockIndexSlotsUsed; } // Add block to block index auto& entry = blockIndex.load(std::memory_order_relaxed) ->entries[pr_blockIndexFront]; entry.base = currentTailIndex; entry.block = this->tailBlock; blockIndex.load(std::memory_order_relaxed) ->front.store(pr_blockIndexFront, std::memory_order_release); pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) { this->tailIndex.store(newTailIndex, std::memory_order_release); return true; } } // Enqueue new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element)); this->tailIndex.store(newTailIndex, std::memory_order_release); return true; } template <typename U> bool dequeue(U& element) { auto tail = this->tailIndex.load(std::memory_order_relaxed); auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); if (details::circular_less_than<index_t>( this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { // Might be something to dequeue, let's give it a try // Note that this if is purely for performance purposes in the common // case when the queue is empty and the values are eventually consistent // -- we may enter here spuriously. // Note that whatever the values of overcommit and tail are, they are // not going to change (unless we change them) and must be the same // value at this point (inside the if) as when the if condition was // evaluated. // We insert an acquire fence here to synchronize-with the release upon // incrementing dequeueOvercommit below. This ensures that whatever the // value we got loaded into overcommit, the load of dequeueOptisticCount // in the fetch_add below will result in a value at least as recent as // that (and therefore at least as large). Note that I believe a // compiler (signal) fence here would be sufficient due to the nature of // fetch_add (all read-modify-write operations are guaranteed to work on // the latest value in the modification order), but unfortunately that // can't be shown to be correct using only the C++11 standard. See // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case std::atomic_thread_fence(std::memory_order_acquire); // Increment optimistic counter, then check if it went over the boundary auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( 1, std::memory_order_relaxed); // Note that since dequeueOvercommit must be <= dequeueOptimisticCount // (because dequeueOvercommit is only ever incremented after // dequeueOptimisticCount -- this is enforced in the `else` block // below), and since we now have a version of dequeueOptimisticCount // that is at least as recent as overcommit (due to the release upon // incrementing dequeueOvercommit and the acquire above that // synchronizes with it), overcommit <= myDequeueCount. However, we // can't assert this since both dequeueOptimisticCount and // dequeueOvercommit may (independently) overflow; in such a case, // though, the logic still holds since the difference between the two is // maintained. // Note that we reload tail here in case it changed; it will be the same // value as before or greater, since this load is sequenced after // (happens after) the earlier load above. 
This is supported by // read-read coherency (as defined in the standard), explained here: // http://en.cppreference.com/w/cpp/atomic/memory_order tail = this->tailIndex.load(std::memory_order_acquire); if ((details::likely)(details::circular_less_than<index_t>( myDequeueCount - overcommit, tail))) { // Guaranteed to be at least one element to dequeue! // Get the index. Note that since there's guaranteed to be at least // one element, this will never exceed tail. We need to do an // acquire-release fence here since it's possible that whatever // condition got us to this point was for an earlier enqueued element // (that we already see the memory effects for), but that by the time // we increment somebody else has incremented it, and we need to see // the memory effects for *that* element, which is in such a case is // necessarily visible on the thread that incremented it in the first // place with the more current condition (they must have acquired a // tail that is at least as recent). auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); // Determine which block the element is in auto localBlockIndex = blockIndex.load(std::memory_order_acquire); auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); // We need to be careful here about subtracting and dividing because // of index wrap-around. When an index wraps, we need to preserve the // sign of the offset when dividing it by the block size (in order to // get a correct signed block count offset in all cases): auto headBase = localBlockIndex->entries[localBlockIndexHead].base; auto blockBaseIndex = index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1); auto offset = static_cast<size_t>( static_cast<typename std::make_signed<index_t>::type>( blockBaseIndex - headBase) / static_cast<typename std::make_signed<index_t>::type>( QUEUE_BLOCK_SIZE)); auto block = localBlockIndex ->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)] .block; // Dequeue auto& el = *((*block)[index]); if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { // Make sure the element is still fully dequeued and destroyed even // if the assignment throws struct Guard { Block* block; index_t index; ~Guard() { (*block)[index]->~T(); block->ConcurrentQueue::Block::template set_empty< explicit_context>(index); } } guard = {block, index}; element = std::move(el); // NOLINT } else { element = std::move(el); // NOLINT el.~T(); // NOLINT block->ConcurrentQueue::Block::template set_empty<explicit_context>( index); } return true; } else { // Wasn't anything to dequeue after all; make the effective dequeue // count eventually consistent this->dequeueOvercommit.fetch_add( 1, std::memory_order_release); // Release so that the fetch_add // on dequeueOptimisticCount is // guaranteed to happen before // this write } } return false; } template <AllocationMode allocMode, typename It> bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) { // First, we need to make sure we have enough room to enqueue all of the // elements; this means pre-allocating blocks and putting them in the // block index (but only if all the allocations succeeded). 
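    // (Illustrative numbers only: with a block size of, say, 32, a tail of 30
    // and count == 5, the bases below come out to (30+5-1)&~31 == 32 and
    // (30-1)&~31 == 0, so blockBaseDiff == 32 and exactly one extra block is
    // provisioned before any element is constructed.)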
index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); auto startBlock = this->tailBlock; auto originalBlockIndexFront = pr_blockIndexFront; auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; Block* firstAllocatedBlock = nullptr; // Figure out how many blocks we'll need to allocate, and do so size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)); index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1); if (blockBaseDiff > 0) { // Allocate as many blocks as possible from ahead while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty< explicit_context>()) { blockBaseDiff -= static_cast<index_t>(QUEUE_BLOCK_SIZE); currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE); this->tailBlock = this->tailBlock->next; firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; auto& entry = blockIndex.load(std::memory_order_relaxed) ->entries[pr_blockIndexFront]; entry.base = currentTailIndex; entry.block = this->tailBlock; pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); } // Now allocate as many blocks as necessary from the block pool while (blockBaseDiff > 0) { blockBaseDiff -= static_cast<index_t>(QUEUE_BLOCK_SIZE); currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE); auto head = this->headIndex.load(std::memory_order_relaxed); assert(!details::circular_less_than<index_t>(currentTailIndex, head)); bool full = !details::circular_less_than<index_t>( head, currentTailIndex + QUEUE_BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - QUEUE_BLOCK_SIZE < currentTailIndex - head)); if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { // Failed to allocate, undo changes (but keep injected blocks) pr_blockIndexFront = originalBlockIndexFront; pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; return false; } else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { // Failed to allocate, undo changes (but keep injected blocks) pr_blockIndexFront = originalBlockIndexFront; pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; return false; } // pr_blockIndexFront is updated inside new_block_index, so we need // to update our fallback value too (since we keep the new index // even if we later fail) originalBlockIndexFront = originalBlockIndexSlotsUsed; } // Insert a new block in the circular linked list auto newBlock = this->parent ->ConcurrentQueue::template requisition_block<allocMode>(); if (newBlock == nullptr) { pr_blockIndexFront = originalBlockIndexFront; pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; return false; } #ifdef MCDBGQ_TRACKMEM newBlock->owner = this; #endif newBlock->ConcurrentQueue::Block::template set_all_empty< explicit_context>(); if (this->tailBlock == nullptr) { newBlock->next = newBlock; } else { newBlock->next = this->tailBlock->next; this->tailBlock->next = newBlock; } this->tailBlock = newBlock; firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; ++pr_blockIndexSlotsUsed; auto& entry = blockIndex.load(std::memory_order_relaxed) ->entries[pr_blockIndexFront]; entry.base = currentTailIndex; entry.block = this->tailBlock; pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); } // Excellent, all allocations succeeded. Reset each block's emptiness // before we fill them up, and publish the new block index front auto block = firstAllocatedBlock; while (true) { block->ConcurrentQueue::Block::template reset_empty< explicit_context>(); if (block == this->tailBlock) { break; } block = block->next; } MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) { blockIndex.load(std::memory_order_relaxed) ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); } } // Enqueue, one block at a time index_t newTailIndex = startTailIndex + static_cast<index_t>(count); currentTailIndex = startTailIndex; auto endBlock = this->tailBlock; this->tailBlock = startBlock; assert((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); if ((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { this->tailBlock = firstAllocatedBlock; } while (true) { index_t stopIndex = (currentTailIndex & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) + static_cast<index_t>(QUEUE_BLOCK_SIZE); if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) { stopIndex = newTailIndex; } MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) { while (currentTailIndex != stopIndex) { new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); } } else { MOODYCAMEL_TRY { while (currentTailIndex != stopIndex) { // Must use copy constructor even if move constructor is available // because we may have to revert if there's an exception. // Sorry about the horrible templated next line, but it was the // only way to disable moving *at compile time*, which is // important because a type may only define a (noexcept) move // constructor, and so calls to the cctor will not compile, even // if they are in an if branch that will never be executed new ((*this->tailBlock)[currentTailIndex]) T( details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR( T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept( itemFirst)))>::eval(*itemFirst)); ++currentTailIndex; ++itemFirst; } } MOODYCAMEL_CATCH(...) { // Oh dear, an exception's been thrown -- destroy the elements that // were enqueued so far and revert the entire bulk operation (we'll // keep any allocated blocks in our linked list for later, though). auto constructedStopIndex = currentTailIndex; auto lastBlockEnqueued = this->tailBlock; pr_blockIndexFront = originalBlockIndexFront; pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; if (!details::is_trivially_destructible<T>::value) { auto block = startBlock; if ((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0) { block = firstAllocatedBlock; } currentTailIndex = startTailIndex; while (true) { stopIndex = (currentTailIndex & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) + static_cast<index_t>(QUEUE_BLOCK_SIZE); if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) { stopIndex = constructedStopIndex; } while (currentTailIndex != stopIndex) { (*block)[currentTailIndex++]->~T(); } if (block == lastBlockEnqueued) { break; } block = block->next; } } MOODYCAMEL_RETHROW; } } if (this->tailBlock == endBlock) { assert(currentTailIndex == newTailIndex); break; } this->tailBlock = this->tailBlock->next; } MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) { if (firstAllocatedBlock != nullptr) blockIndex.load(std::memory_order_relaxed) ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); } this->tailIndex.store(newTailIndex, std::memory_order_release); return true; } template <typename It> size_t dequeue_bulk(It& itemFirst, size_t max) { auto tail = this->tailIndex.load(std::memory_order_relaxed); auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); auto desiredCount = static_cast<size_t>( tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); if (details::circular_less_than<size_t>(0, desiredCount)) { desiredCount = desiredCount < max ? desiredCount : max; std::atomic_thread_fence(std::memory_order_acquire); auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( desiredCount, std::memory_order_relaxed); tail = this->tailIndex.load(std::memory_order_acquire); auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit)); if (details::circular_less_than<size_t>(0, actualCount)) { actualCount = desiredCount < actualCount ? desiredCount : actualCount; if (actualCount < desiredCount) { this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); } // Get the first index. Note that since there's guaranteed to be at // least actualCount elements, this will never exceed tail. auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); // Determine which block the first element is in auto localBlockIndex = blockIndex.load(std::memory_order_acquire); auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); auto headBase = localBlockIndex->entries[localBlockIndexHead].base; auto firstBlockBaseIndex = firstIndex & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1); auto offset = static_cast<size_t>( static_cast<typename std::make_signed<index_t>::type>( firstBlockBaseIndex - headBase) / static_cast<typename std::make_signed<index_t>::type>( QUEUE_BLOCK_SIZE)); auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); // Iterate the blocks and dequeue auto index = firstIndex; do { auto firstIndexInBlock = index; index_t endIndex = (index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) + static_cast<index_t>(QUEUE_BLOCK_SIZE); endIndex = details::circular_less_than<index_t>( firstIndex + static_cast<index_t>(actualCount), endIndex) ? 
firstIndex + static_cast<index_t>(actualCount) : endIndex; auto block = localBlockIndex->entries[indexIndex].block; if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { while (index != endIndex) { auto& el = *((*block)[index]); *itemFirst++ = std::move(el); el.~T(); ++index; } } else { MOODYCAMEL_TRY { while (index != endIndex) { auto& el = *((*block)[index]); *itemFirst = std::move(el); ++itemFirst; el.~T(); ++index; } } MOODYCAMEL_CATCH(...) { // It's too late to revert the dequeue, but we can make sure // that all the dequeued objects are properly destroyed and the // block index (and empty count) are properly updated before we // propagate the exception do { block = localBlockIndex->entries[indexIndex].block; while (index != endIndex) { (*block)[index++]->~T(); } block->ConcurrentQueue::Block::template set_many_empty< explicit_context>( firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock)); indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); firstIndexInBlock = index; endIndex = (index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) + static_cast<index_t>(QUEUE_BLOCK_SIZE); endIndex = details::circular_less_than<index_t>( firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex; } while (index != firstIndex + actualCount); MOODYCAMEL_RETHROW; } } block->ConcurrentQueue::Block::template set_many_empty< explicit_context>( firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock)); indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); } while (index != firstIndex + actualCount); return actualCount; } else { // Wasn't anything to dequeue after all; make the effective dequeue // count eventually consistent this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); } } return 0; } private: struct BlockIndexEntry { index_t base; Block* block; }; struct BlockIndexHeader { size_t size; std::atomic<size_t> front; // Current slot (not next, like pr_blockIndexFront) BlockIndexEntry* entries; void* prev; }; bool new_block_index(size_t numberOfFilledSlotsToExpose) { auto prevBlockSizeMask = pr_blockIndexSize - 1; // Create the new block pr_blockIndexSize <<= 1; auto newRawPtr = static_cast<char*>((Traits::malloc)( sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); if (newRawPtr == nullptr) { pr_blockIndexSize >>= 1; // Reset to allow graceful retry return false; } auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry*>( details::align_for<BlockIndexEntry>(newRawPtr + sizeof(BlockIndexHeader))); // Copy in all the old indices, if any size_t j = 0; if (pr_blockIndexSlotsUsed != 0) { auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; do { newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; i = (i + 1) & prevBlockSizeMask; } while (i != pr_blockIndexFront); } // Update everything auto header = new (newRawPtr) BlockIndexHeader; header->size = pr_blockIndexSize; header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); header->entries = newBlockIndexEntries; header->prev = pr_blockIndexRaw; // we link the new block to the old one // so we can free it later pr_blockIndexFront = j; pr_blockIndexEntries = newBlockIndexEntries; pr_blockIndexRaw = newRawPtr; blockIndex.store(header, std::memory_order_release); return true; } private: std::atomic<BlockIndexHeader*> blockIndex; // To be used by producer only -- 
// consumer must use the ones referenced by blockIndex
  size_t pr_blockIndexSlotsUsed;
  size_t pr_blockIndexSize;
  size_t pr_blockIndexFront;  // Next slot (not current)
  BlockIndexEntry* pr_blockIndexEntries;
  void* pr_blockIndexRaw;

#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
 public:
  ExplicitProducer* nextExplicitProducer;

 private:
#endif

#ifdef MCDBGQ_TRACKMEM
  friend struct MemStats;
#endif
};

//////////////////////////////////
// Implicit queue
//////////////////////////////////

struct ImplicitProducer : public ProducerBase {
  ImplicitProducer(ConcurrentQueue* parent_)
      : ProducerBase(parent_, false),
        nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
        blockIndex(nullptr) {
    new_block_index();
  }

  ~ImplicitProducer() {
    // Note that since we're in the destructor we can assume that all
    // enqueue/dequeue operations completed already; this means that all
    // undequeued elements are placed contiguously across contiguous blocks,
    // and that only the first and last remaining blocks can be only partially
    // empty (all other remaining blocks must be completely full).

#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
    // Unregister ourselves for thread termination notification
    if (!this->inactive.load(std::memory_order_relaxed)) {
      details::ThreadExitNotifier::unsubscribe(&threadExitListener);
    }
#endif

    // Destroy all remaining elements!
    auto tail = this->tailIndex.load(std::memory_order_relaxed);
    auto index = this->headIndex.load(std::memory_order_relaxed);
    Block* block = nullptr;
    assert(index == tail || details::circular_less_than(index, tail));
    bool forceFreeLastBlock =
        index != tail;  // If we enter the loop, then the last (tail) block
                        // will not be freed
    while (index != tail) {
      if ((index & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0 ||
          block == nullptr) {
        if (block != nullptr) {
          // Free the old block
          this->parent->add_block_to_free_list(block);
        }
        block = get_block_index_entry_for_index(index)->value.load(
            std::memory_order_relaxed);
      }
      ((*block)[index])->~T();
      ++index;
    }
    // Even if the queue is empty, there's still one block that's not on the
    // free list (unless the head index reached the end of it, in which case
    // the tail will be poised to create a new block).
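    // (forceFreeLastBlock matters because the loop above only returns a
    // block to the free list when it moves past it, so the block that held
    // the final undequeued elements -- the tail block -- still needs
    // recycling here even when the tail index sits exactly on a block
    // boundary.)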
if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) != 0)) { this->parent->add_block_to_free_list(this->tailBlock); } // Destroy block index auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); if (localBlockIndex != nullptr) { for (size_t i = 0; i != localBlockIndex->capacity; ++i) { localBlockIndex->index[i]->~BlockIndexEntry(); } do { auto prev = localBlockIndex->prev; localBlockIndex->~BlockIndexHeader(); (Traits::free)(localBlockIndex); localBlockIndex = prev; } while (localBlockIndex != nullptr); } } template <AllocationMode allocMode, typename U> inline bool enqueue(U&& element) { index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); index_t newTailIndex = 1 + currentTailIndex; if ((currentTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0) { // We reached the end of a block, start a new one auto head = this->headIndex.load(std::memory_order_relaxed); assert(!details::circular_less_than<index_t>(currentTailIndex, head)); if (!details::circular_less_than<index_t>( head, currentTailIndex + QUEUE_BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - QUEUE_BLOCK_SIZE < currentTailIndex - head))) { return false; } #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX debug::DebugLock lock(mutex); #endif // Find out where we'll be inserting this block in the block index BlockIndexEntry* idxEntry; if (!insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) { return false; } // Get ahold of a new block auto newBlock = this->parent ->ConcurrentQueue::template requisition_block<allocMode>(); if (newBlock == nullptr) { rewind_block_index_tail(); idxEntry->value.store(nullptr, std::memory_order_relaxed); return false; } #ifdef MCDBGQ_TRACKMEM newBlock->owner = this; #endif newBlock ->ConcurrentQueue::Block::template reset_empty<implicit_context>(); MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) { // May throw, try to insert now before we publish the fact that we // have this new block MOODYCAMEL_TRY { new ((*newBlock)[currentTailIndex]) T(std::forward<U>(element)); } MOODYCAMEL_CATCH(...) 
{ rewind_block_index_tail(); idxEntry->value.store(nullptr, std::memory_order_relaxed); this->parent->add_block_to_free_list(newBlock); MOODYCAMEL_RETHROW; } } // Insert the new block into the index idxEntry->value.store(newBlock, std::memory_order_relaxed); this->tailBlock = newBlock; MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) { this->tailIndex.store(newTailIndex, std::memory_order_release); return true; } } // Enqueue new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element)); this->tailIndex.store(newTailIndex, std::memory_order_release); return true; } template <typename U> bool dequeue(U& element) { // See ExplicitProducer::dequeue for rationale and explanation index_t tail = this->tailIndex.load(std::memory_order_relaxed); index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); if (details::circular_less_than<index_t>( this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { std::atomic_thread_fence(std::memory_order_acquire); index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add( 1, std::memory_order_relaxed); tail = this->tailIndex.load(std::memory_order_acquire); if ((details::likely)(details::circular_less_than<index_t>( myDequeueCount - overcommit, tail))) { index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); // Determine which block the element is in auto entry = get_block_index_entry_for_index(index); // Dequeue auto block = entry->value.load(std::memory_order_relaxed); auto& el = *((*block)[index]); if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX // Note: Acquiring the mutex with every dequeue instead of only when // a block is released is very sub-optimal, but it is, after all, // purely debug code. debug::DebugLock lock(producer->mutex); #endif struct Guard { Block* block; index_t index; BlockIndexEntry* entry; ConcurrentQueue* parent; ~Guard() { (*block)[index]->~T(); if (block->ConcurrentQueue::Block::template set_empty< implicit_context>(index)) { entry->value.store(nullptr, std::memory_order_relaxed); parent->add_block_to_free_list(block); } } } guard = {block, index, entry, this->parent}; element = std::move(el); // NOLINT } else { element = std::move(el); // NOLINT el.~T(); // NOLINT if (block->ConcurrentQueue::Block::template set_empty< implicit_context>(index)) { { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX debug::DebugLock lock(mutex); #endif // Add the block back into the global free pool (and remove from // block index) entry->value.store(nullptr, std::memory_order_relaxed); } this->parent->add_block_to_free_list( block); // releases the above store } } return true; } else { this->dequeueOvercommit.fetch_add(1, std::memory_order_release); } } return false; } #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable : 4706) // assignment within conditional expression #endif template <AllocationMode allocMode, typename It> bool enqueue_bulk(It itemFirst, size_t count) { // First, we need to make sure we have enough room to enqueue all of the // elements; this means pre-allocating blocks and putting them in the // block index (but only if all the allocations succeeded). 
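    // (Unlike the explicit producer, there is no circular list of spare
    // blocks to reuse here; every block this bulk operation needs comes from
    // insert_block_index_entry plus requisition_block in the loop below.)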
// Note that the tailBlock we start off with may not be owned by us any // more; this happens if it was filled up exactly to the top (setting // tailIndex to the first index of the next block which is not yet // allocated), then dequeued completely (putting it on the free list) // before we enqueue again. index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); auto startBlock = this->tailBlock; Block* firstAllocatedBlock = nullptr; auto endBlock = this->tailBlock; // Figure out how many blocks we'll need to allocate, and do so size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)); index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1); if (blockBaseDiff > 0) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX debug::DebugLock lock(mutex); #endif do { blockBaseDiff -= static_cast<index_t>(QUEUE_BLOCK_SIZE); currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE); // Find out where we'll be inserting this block in the block index BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't // always tell Block* newBlock; bool indexInserted = false; auto head = this->headIndex.load(std::memory_order_relaxed); assert(!details::circular_less_than<index_t>(currentTailIndex, head)); bool full = !details::circular_less_than<index_t>( head, currentTailIndex + QUEUE_BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - QUEUE_BLOCK_SIZE < currentTailIndex - head)); if (full || !(indexInserted = insert_block_index_entry<allocMode>( idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block< allocMode>()) == nullptr) { // Index allocation or block allocation failed; revert any other // allocations and index insertions done so far for this operation if (indexInserted) { rewind_block_index_tail(); idxEntry->value.store(nullptr, std::memory_order_relaxed); } currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1); for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE); idxEntry = get_block_index_entry_for_index(currentTailIndex); idxEntry->value.store(nullptr, std::memory_order_relaxed); rewind_block_index_tail(); } this->parent->add_blocks_to_free_list(firstAllocatedBlock); this->tailBlock = startBlock; return false; } #ifdef MCDBGQ_TRACKMEM newBlock->owner = this; #endif newBlock->ConcurrentQueue::Block::template reset_empty< implicit_context>(); newBlock->next = nullptr; // Insert the new block into the index idxEntry->value.store(newBlock, std::memory_order_relaxed); // Store the chain of blocks so that we can undo if later allocations // fail, and so that we can find the blocks when we do the actual // enqueueing if ((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { assert(this->tailBlock != nullptr); this->tailBlock->next = newBlock; } this->tailBlock = newBlock; endBlock = newBlock; firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
newBlock : firstAllocatedBlock; } while (blockBaseDiff > 0); } // Enqueue, one block at a time index_t newTailIndex = startTailIndex + static_cast<index_t>(count); currentTailIndex = startTailIndex; this->tailBlock = startBlock; assert((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); if ((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { this->tailBlock = firstAllocatedBlock; } while (true) { index_t stopIndex = (currentTailIndex & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) + static_cast<index_t>(QUEUE_BLOCK_SIZE); if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) { stopIndex = newTailIndex; } MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) { while (currentTailIndex != stopIndex) { new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); } } else { MOODYCAMEL_TRY { while (currentTailIndex != stopIndex) { new ((*this->tailBlock)[currentTailIndex]) T( details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR( T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept( itemFirst)))>::eval(*itemFirst)); ++currentTailIndex; ++itemFirst; } } MOODYCAMEL_CATCH(...) { auto constructedStopIndex = currentTailIndex; auto lastBlockEnqueued = this->tailBlock; if (!details::is_trivially_destructible<T>::value) { auto block = startBlock; if ((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0) { block = firstAllocatedBlock; } currentTailIndex = startTailIndex; while (true) { stopIndex = (currentTailIndex & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) + static_cast<index_t>(QUEUE_BLOCK_SIZE); if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) { stopIndex = constructedStopIndex; } while (currentTailIndex != stopIndex) { (*block)[currentTailIndex++]->~T(); } if (block == lastBlockEnqueued) { break; } block = block->next; } } currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1); for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE); auto idxEntry = get_block_index_entry_for_index(currentTailIndex); idxEntry->value.store(nullptr, std::memory_order_relaxed); rewind_block_index_tail(); } this->parent->add_blocks_to_free_list(firstAllocatedBlock); this->tailBlock = startBlock; MOODYCAMEL_RETHROW; } } if (this->tailBlock == endBlock) { assert(currentTailIndex == newTailIndex); break; } this->tailBlock = this->tailBlock->next; } this->tailIndex.store(newTailIndex, std::memory_order_release); return true; } #ifdef _MSC_VER #pragma warning(pop) #endif template <typename It> size_t dequeue_bulk(It& itemFirst, size_t max) { auto tail = this->tailIndex.load(std::memory_order_relaxed); auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); auto desiredCount = static_cast<size_t>( tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); if (details::circular_less_than<size_t>(0, desiredCount)) { desiredCount = desiredCount < max ? 
desiredCount : max; std::atomic_thread_fence(std::memory_order_acquire); auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( desiredCount, std::memory_order_relaxed); tail = this->tailIndex.load(std::memory_order_acquire); auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit)); if (details::circular_less_than<size_t>(0, actualCount)) { actualCount = desiredCount < actualCount ? desiredCount : actualCount; if (actualCount < desiredCount) { this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); } // Get the first index. Note that since there's guaranteed to be at // least actualCount elements, this will never exceed tail. auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); // Iterate the blocks and dequeue auto index = firstIndex; BlockIndexHeader* localBlockIndex; auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); do { auto blockStartIndex = index; index_t endIndex = (index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) + static_cast<index_t>(QUEUE_BLOCK_SIZE); endIndex = details::circular_less_than<index_t>( firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex; auto entry = localBlockIndex->index[indexIndex]; auto block = entry->value.load(std::memory_order_relaxed); if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { while (index != endIndex) { auto& el = *((*block)[index]); *itemFirst++ = std::move(el); el.~T(); ++index; } } else { MOODYCAMEL_TRY { while (index != endIndex) { auto& el = *((*block)[index]); *itemFirst = std::move(el); ++itemFirst; el.~T(); ++index; } } MOODYCAMEL_CATCH(...) { do { entry = localBlockIndex->index[indexIndex]; block = entry->value.load(std::memory_order_relaxed); while (index != endIndex) { (*block)[index++]->~T(); } if (block->ConcurrentQueue::Block::template set_many_empty< implicit_context>( blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX debug::DebugLock lock(mutex); #endif entry->value.store(nullptr, std::memory_order_relaxed); this->parent->add_block_to_free_list(block); } indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); blockStartIndex = index; endIndex = (index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) + static_cast<index_t>(QUEUE_BLOCK_SIZE); endIndex = details::circular_less_than<index_t>( firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex; } while (index != firstIndex + actualCount); MOODYCAMEL_RETHROW; } } if (block->ConcurrentQueue::Block::template set_many_empty< implicit_context>( blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) { { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX debug::DebugLock lock(mutex); #endif // Note that the set_many_empty above did a release, meaning // that anybody who acquires the block we're about to free can // use it safely since our writes (and reads!) will have // happened-before then. 
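            // (The nullptr store below can stay relaxed: as noted beside the
            // call, add_block_to_free_list performs the release that
            // publishes it.)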
entry->value.store(nullptr, std::memory_order_relaxed); } this->parent->add_block_to_free_list( block); // releases the above store } indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); } while (index != firstIndex + actualCount); return actualCount; } else { this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); } } return 0; } private: // The block size must be > 1, so any number with the low bit set is an // invalid block base index static const index_t INVALID_BLOCK_BASE = 1; struct BlockIndexEntry { std::atomic<index_t> key; std::atomic<Block*> value; }; struct BlockIndexHeader { size_t capacity; std::atomic<size_t> tail; BlockIndexEntry* entries; BlockIndexEntry** index; BlockIndexHeader* prev; }; template <AllocationMode allocMode> inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) { auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer // thread, relaxed is OK if (localBlockIndex == nullptr) { return false; // this can happen if new_block_index failed in the // constructor } size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); idxEntry = localBlockIndex->index[newTail]; if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || idxEntry->value.load(std::memory_order_relaxed) == nullptr) { idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); localBlockIndex->tail.store(newTail, std::memory_order_release); return true; } // No room in the old block index, try to allocate another one! MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { return false; } else if (!new_block_index()) { return false; } else { localBlockIndex = blockIndex.load(std::memory_order_relaxed); newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); idxEntry = localBlockIndex->index[newTail]; assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); localBlockIndex->tail.store(newTail, std::memory_order_release); return true; } } inline void rewind_block_index_tail() { auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); localBlockIndex->tail.store( (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); } inline BlockIndexEntry* get_block_index_entry_for_index( index_t index) const { BlockIndexHeader* localBlockIndex; auto idx = get_block_index_index_for_index(index, localBlockIndex); return localBlockIndex->index[idx]; } inline size_t get_block_index_index_for_index( index_t index, BlockIndexHeader*& localBlockIndex) const { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX debug::DebugLock lock(mutex); #endif index &= ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1); localBlockIndex = blockIndex.load(std::memory_order_acquire); auto tail = localBlockIndex->tail.load(std::memory_order_acquire); auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); assert(tailBase != INVALID_BLOCK_BASE); // Note: Must use division instead of shift because the index may wrap // around, causing a negative offset, whose negativity we want to preserve auto offset = static_cast<size_t>( static_cast<typename std::make_signed<index_t>::type>(index - tailBase) / static_cast<typename std::make_signed<index_t>::type>( QUEUE_BLOCK_SIZE)); size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); 
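// Worked example with illustrative numbers (not tied to any particular
// Traits configuration): if QUEUE_BLOCK_SIZE were 32, the tail entry's key
// (tailBase) were 96, and the masked index were 32, then the signed
// difference 32 - 96 = -64 divides to an offset of -2; after the unsigned
// wrap-around addition and the capacity mask, idx ends up two entries behind
// the tail -- precisely the entry whose key is 32. A shift could not express
// this negative step, which is why the signed division above is required.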
assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load( std::memory_order_relaxed) != nullptr); return idx; } bool new_block_index() { auto prev = blockIndex.load(std::memory_order_relaxed); size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; auto raw = static_cast<char*>((Traits::malloc)( sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * entryCount + std::alignment_of<BlockIndexEntry*>::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); if (raw == nullptr) { return false; } auto header = new (raw) BlockIndexHeader; auto entries = reinterpret_cast<BlockIndexEntry*>( details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader))); auto index = reinterpret_cast<BlockIndexEntry**>( details::align_for<BlockIndexEntry*>( reinterpret_cast<char*>(entries) + sizeof(BlockIndexEntry) * entryCount)); if (prev != nullptr) { auto prevTail = prev->tail.load(std::memory_order_relaxed); auto prevPos = prevTail; size_t i = 0; do { prevPos = (prevPos + 1) & (prev->capacity - 1); index[i++] = prev->index[prevPos]; } while (prevPos != prevTail); assert(i == prevCapacity); } for (size_t i = 0; i != entryCount; ++i) { new (entries + i) BlockIndexEntry; entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); index[prevCapacity + i] = entries + i; } header->prev = prev; header->entries = entries; header->index = index; header->capacity = nextBlockIndexCapacity; header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); blockIndex.store(header, std::memory_order_release); nextBlockIndexCapacity <<= 1; return true; } private: size_t nextBlockIndexCapacity; std::atomic<BlockIndexHeader*> blockIndex; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED public: details::ThreadExitListener threadExitListener; private: #endif #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG public: ImplicitProducer* nextImplicitProducer; private: #endif #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX mutable debug::DebugMutex mutex; #endif #ifdef MCDBGQ_TRACKMEM friend struct MemStats; #endif }; ////////////////////////////////// // Block pool manipulation ////////////////////////////////// void populate_initial_block_list(size_t blockCount) { initialBlockPoolSize = blockCount; if (initialBlockPoolSize == 0) { initialBlockPool = nullptr; return; } initialBlockPool = create_array<Block>(blockCount); if (initialBlockPool == nullptr) { initialBlockPoolSize = 0; } for (size_t i = 0; i < initialBlockPoolSize; ++i) { initialBlockPool[i].dynamicallyAllocated = false; } } inline Block* try_get_block_from_initial_pool() { if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { return nullptr; } auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; } inline void add_block_to_free_list(Block* block) { #ifdef MCDBGQ_TRACKMEM block->owner = nullptr; #endif if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { destroy(block); } else { freeList.add(block); } } inline void add_blocks_to_free_list(Block* block) { while (block != nullptr) { auto next = block->next; add_block_to_free_list(block); block = next; } } inline Block* try_get_block_from_free_list() { return freeList.try_get(); } // Gets a free block from one of the memory pools, or allocates a new one (if // applicable) template <AllocationMode canAlloc> Block* requisition_block() { auto block = try_get_block_from_initial_pool(); if (block != nullptr) { return block; } block = try_get_block_from_free_list(); if (block != nullptr) { return block; } MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) { return create<Block>(); } else { return nullptr; } } #ifdef MCDBGQ_TRACKMEM public: struct MemStats { size_t allocatedBlocks; size_t usedBlocks; size_t freeBlocks; size_t ownedBlocksExplicit; size_t ownedBlocksImplicit; size_t implicitProducers; size_t explicitProducers; size_t elementsEnqueued; size_t blockClassBytes; size_t queueClassBytes; size_t implicitBlockIndexBytes; size_t explicitBlockIndexBytes; friend class ConcurrentQueue; private: static MemStats getFor(ConcurrentQueue* q) { MemStats stats = {0}; stats.elementsEnqueued = q->size_approx(); auto block = q->freeList.head_unsafe(); while (block != nullptr) { ++stats.allocatedBlocks; ++stats.freeBlocks; block = block->freeListNext.load(std::memory_order_relaxed); } for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { bool implicit = dynamic_cast<ImplicitProducer*>(ptr) != nullptr; stats.implicitProducers += implicit ? 1 : 0; stats.explicitProducers += implicit ? 
0 : 1; if (implicit) { auto prod = static_cast<ImplicitProducer*>(ptr); stats.queueClassBytes += sizeof(ImplicitProducer); auto head = prod->headIndex.load(std::memory_order_relaxed); auto tail = prod->tailIndex.load(std::memory_order_relaxed); auto hash = prod->blockIndex.load(std::memory_order_relaxed); if (hash != nullptr) { for (size_t i = 0; i != hash->capacity; ++i) { if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { ++stats.allocatedBlocks; ++stats.ownedBlocksImplicit; } } stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); for (; hash != nullptr; hash = hash->prev) { stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); } } for (; details::circular_less_than<index_t>(head, tail); head += QUEUE_BLOCK_SIZE) { // auto block = prod->get_block_index_entry_for_index(head); ++stats.usedBlocks; } } else { auto prod = static_cast<ExplicitProducer*>(ptr); stats.queueClassBytes += sizeof(ExplicitProducer); auto tailBlock = prod->tailBlock; bool wasNonEmpty = false; if (tailBlock != nullptr) { auto block = tailBlock; do { ++stats.allocatedBlocks; if (!block->ConcurrentQueue::Block::template is_empty< explicit_context>() || wasNonEmpty) { ++stats.usedBlocks; wasNonEmpty = wasNonEmpty || block != tailBlock; } ++stats.ownedBlocksExplicit; block = block->next; } while (block != tailBlock); } auto index = prod->blockIndex.load(std::memory_order_relaxed); while (index != nullptr) { stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); index = static_cast<typename ExplicitProducer::BlockIndexHeader*>( index->prev); } } } auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); stats.allocatedBlocks += freeOnInitialPool; stats.freeBlocks += freeOnInitialPool; stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; stats.queueClassBytes += sizeof(ConcurrentQueue); return stats; } }; // For debugging only. Not thread-safe. MemStats getMemStats() { return MemStats::getFor(this); } private: friend struct MemStats; #endif ////////////////////////////////// // Producer list manipulation ////////////////////////////////// ProducerBase* recycle_or_create_producer(bool isExplicit) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH debug::DebugLock lock(implicitProdMutex); #endif // Try to re-use one first for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { bool expected = true; if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { // We caught one! It's been marked as activated, the caller can have // it return ptr; } } } return add_producer( isExplicit ? 
static_cast<ProducerBase*>(create<ExplicitProducer>(this)) : create<ImplicitProducer>(this)); } ProducerBase* add_producer(ProducerBase* producer) { // Handle failed memory allocation if (producer == nullptr) { return nullptr; } producerCount.fetch_add(1, std::memory_order_relaxed); // Add it to the lock-free list auto prevTail = producerListTail.load(std::memory_order_relaxed); do { producer->next = prevTail; } while (!producerListTail.compare_exchange_weak( prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG if (producer->isExplicit) { auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); do { static_cast<ExplicitProducer*>(producer)->nextExplicitProducer = prevTailExplicit; } while (!explicitProducers.compare_exchange_weak( prevTailExplicit, static_cast<ExplicitProducer*>(producer), std::memory_order_release, std::memory_order_relaxed)); } else { auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); do { static_cast<ImplicitProducer*>(producer)->nextImplicitProducer = prevTailImplicit; } while (!implicitProducers.compare_exchange_weak( prevTailImplicit, static_cast<ImplicitProducer*>(producer), std::memory_order_release, std::memory_order_relaxed)); } #endif return producer; } void reown_producers() { // After another instance is moved-into/swapped-with this one, all the // producers we stole still think their parents are the other queue. // So fix them up! for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { ptr->parent = this; } } ////////////////////////////////// // Implicit producer hash ////////////////////////////////// struct ImplicitProducerKVP { std::atomic<details::thread_id_t> key; ImplicitProducer* value; // No need for atomicity since it's only read by // the thread that sets it in the first place ImplicitProducerKVP() : value(nullptr) {} ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT { key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); value = other.value; } inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT { swap(other); return *this; } inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT { if (this != &other) { details::swap_relaxed(key, other.key); std::swap(value, other.value); } } }; template <typename XT, typename XTraits> friend void moodycamel::swap( typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&, typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; struct ImplicitProducerHash { size_t capacity; ImplicitProducerKVP* entries; ImplicitProducerHash* prev; }; inline void populate_initial_implicit_producer_hash() { MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { return; } else { implicitProducerHashCount.store(0, std::memory_order_relaxed); auto hash = &initialImplicitProducerHash; hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; hash->entries = &initialImplicitProducerHashEntries[0]; for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { initialImplicitProducerHashEntries[i].key.store( details::invalid_thread_id, std::memory_order_relaxed); } hash->prev = nullptr; implicitProducerHash.store(hash, std::memory_order_relaxed); } } void swap_implicit_producer_hashes(ConcurrentQueue& other) { MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { return; } else { // Swap (assumes our implicit producer hash is initialized) 
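// The swap proceeds in three steps: (1) swap the inline entry arrays and
// re-point each queue's embedded initial hash at its own storage, (2) swap
// the logical entry counts and the current-hash pointers, and (3) walk each
// chain of resized hash tables and patch any prev link that still refers to
// the *other* queue's embedded initial hash, so neither chain dangles into
// the moved-from object.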
initialImplicitProducerHashEntries.swap( other.initialImplicitProducerHashEntries); initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); } else { ImplicitProducerHash* hash; for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { continue; } hash->prev = &initialImplicitProducerHash; } if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); } else { ImplicitProducerHash* hash; for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { continue; } hash->prev = &other.initialImplicitProducerHash; } } } // Only fails (returns nullptr) if memory allocation fails ImplicitProducer* get_or_add_implicit_producer() { // Note that since the data is essentially thread-local (key is thread ID), // there's a reduced need for fences (memory ordering is already consistent // for any individual thread), except for the current table itself. // Start by looking for the thread ID in the current and all previous hash // tables. If it's not found, it must not be in there yet, since this same // thread would have added it previously to one of the tables that we // traversed. // Code and algorithm adapted from // http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH debug::DebugLock lock(implicitProdMutex); #endif auto id = details::thread_id(); auto hashedId = details::hash_thread_id(id); auto mainHash = implicitProducerHash.load(std::memory_order_acquire); assert( mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { // Look for the id in this hash auto index = hashedId; while (true) { // Not an infinite loop because at least one slot is free // in the hash table index &= hash->capacity - 1u; auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); if (probedKey == id) { // Found it! If we had to search several hashes deep, though, we // should lazily add it to the current main hash table to avoid the // extended search next time. Note there's guaranteed to be room in // the current hash table since every subsequent table implicitly // reserves space for all previous tables (there's only one // implicitProducerHashCount). 
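// For context, this lookup is the path taken by tokenless enqueue; a rough
// application-side sketch (comment only, assuming the enclosing
// ylt::detail::moodycamel namespace is in scope):
//
//   ConcurrentQueue<int> q;
//   q.enqueue(1);  // first tokenless enqueue from this thread: creates (or
//                  // recycles) an implicit producer and registers it here,
//                  // keyed by details::thread_id()
//   q.enqueue(2);  // later calls from the same thread resolve the producer
//                  // via the probe loop above and skip creation entirely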
auto value = hash->entries[index].value; if (hash != mainHash) { index = hashedId; while (true) { index &= mainHash->capacity - 1u; auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED auto reusable = details::invalid_thread_id2; if (mainHash->entries[index].key.compare_exchange_strong( empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || mainHash->entries[index].key.compare_exchange_strong( reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { #else if (mainHash->entries[index].key.compare_exchange_strong( empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { #endif mainHash->entries[index].value = value; break; } ++index; } } return value; } if (probedKey == details::invalid_thread_id) { break; // Not in this hash table } ++index; } } // Insert! auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); while (true) { // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set( std::memory_order_acquire)) { // We've acquired the resize lock, try to allocate a bigger hash table. // Note the acquire fence synchronizes with the release fence at the end // of this block, and hence when we reload implicitProducerHash it must // be the most recent version (it only gets changed within this locked // block). mainHash = implicitProducerHash.load(std::memory_order_acquire); if (newCount >= (mainHash->capacity >> 1)) { size_t newCapacity = mainHash->capacity << 1; while (newCount >= (newCapacity >> 1)) { newCapacity <<= 1; } auto raw = static_cast<char*>( (Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of<ImplicitProducerKVP>::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); if (raw == nullptr) { // Allocation failed implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); implicitProducerHashResizeInProgress.clear( std::memory_order_relaxed); return nullptr; } auto newHash = new (raw) ImplicitProducerHash; newHash->capacity = static_cast<size_t>(newCapacity); newHash->entries = reinterpret_cast<ImplicitProducerKVP*>( details::align_for<ImplicitProducerKVP>( raw + sizeof(ImplicitProducerHash))); for (size_t i = 0; i != newCapacity; ++i) { new (newHash->entries + i) ImplicitProducerKVP; newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); } newHash->prev = mainHash; implicitProducerHash.store(newHash, std::memory_order_release); implicitProducerHashResizeInProgress.clear(std::memory_order_release); mainHash = newHash; } else { implicitProducerHashResizeInProgress.clear(std::memory_order_release); } } // If it's < three-quarters full, add to the old one anyway so that we // don't have to wait for the next table to finish being allocated by // another thread (and if we just finished allocating above, the condition // will always be true) if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { auto producer = static_cast<ImplicitProducer*>(recycle_or_create_producer(false)); if (producer == nullptr) { implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); return nullptr; } #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; producer->threadExitListener.userData = producer; details::ThreadExitNotifier::subscribe(&producer->threadExitListener); #endif auto index = hashedId; while (true) { index &= mainHash->capacity - 1u; 
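// Linear probing with two sentinel keys: details::invalid_thread_id marks a
// slot that has never held a producer, and details::invalid_thread_id2 (only
// used when thread-local storage is available) marks a slot whose owning
// thread has exited and may be reclaimed. Either sentinel can be CAS-ed to
// our own id below; claiming a reusable slot also decrements
// implicitProducerHashCount, since that slot was already counted.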
auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED auto reusable = details::invalid_thread_id2; if (mainHash->entries[index].key.compare_exchange_strong( reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { implicitProducerHashCount.fetch_sub( 1, std::memory_order_relaxed); // already counted as a used slot mainHash->entries[index].value = producer; break; } #endif if (mainHash->entries[index].key.compare_exchange_strong( empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { mainHash->entries[index].value = producer; break; } ++index; } return producer; } // Hmm, the old hash is quite full and somebody else is busy allocating a // new one. We need to wait for the allocating thread to finish (if it // succeeds, we add, if not, we try to allocate ourselves). mainHash = implicitProducerHash.load(std::memory_order_acquire); } } #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED void implicit_producer_thread_exited(ImplicitProducer* producer) { // Remove from hash #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH debug::DebugLock lock(implicitProdMutex); #endif auto hash = implicitProducerHash.load(std::memory_order_acquire); assert(hash != nullptr); // The thread exit listener is only registered if // we were added to a hash in the first place auto id = details::thread_id(); auto hashedId = details::hash_thread_id(id); details::thread_id_t probedKey; // We need to traverse all the hashes just in case other threads aren't on // the current one yet and are trying to add an entry thinking there's a // free slot (because they reused a producer) for (; hash != nullptr; hash = hash->prev) { auto index = hashedId; do { index &= hash->capacity - 1u; probedKey = id; if (hash->entries[index].key.compare_exchange_strong( probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) { break; } ++index; } while ( probedKey != details::invalid_thread_id); // Can happen if the hash has changed // but we weren't put back in it yet, or // if we weren't added to this hash in // the first place } // Mark the queue as being recyclable producer->inactive.store(true, std::memory_order_release); } static void implicit_producer_thread_exited_callback(void* userData) { auto producer = static_cast<ImplicitProducer*>(userData); auto queue = producer->parent; queue->implicit_producer_thread_exited(producer); } #endif ////////////////////////////////// // Utility functions ////////////////////////////////// template <typename TAlign> static inline void* aligned_malloc(size_t size) { MOODYCAMEL_CONSTEXPR_IF(std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value) return (Traits::malloc)(size); else { size_t alignment = std::alignment_of<TAlign>::value; void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); if (!raw) return nullptr; char* ptr = details::align_for<TAlign>(reinterpret_cast<char*>(raw) + sizeof(void*)); *(reinterpret_cast<void**>(ptr) - 1) = raw; return ptr; } } template <typename TAlign> static inline void aligned_free(void* ptr) { MOODYCAMEL_CONSTEXPR_IF(std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value) return (Traits::free)(ptr); else(Traits::free)(ptr ? 
*(reinterpret_cast<void**>(ptr) - 1) : nullptr); } template <typename U> static inline U* create_array(size_t count) { assert(count > 0); U* p = static_cast<U*>(aligned_malloc<U>(sizeof(U) * count)); if (p == nullptr) return nullptr; for (size_t i = 0; i != count; ++i) new (p + i) U(); return p; } template <typename U> static inline void destroy_array(U* p, size_t count) { if (p != nullptr) { assert(count > 0); for (size_t i = count; i != 0;) (p + --i)->~U(); } aligned_free<U>(p); } template <typename U> static inline U* create() { void* p = aligned_malloc<U>(sizeof(U)); return p != nullptr ? new (p) U : nullptr; } template <typename U, typename A1> static inline U* create(A1&& a1) { void* p = aligned_malloc<U>(sizeof(U)); return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr; } template <typename U> static inline void destroy(U* p) { if (p != nullptr) p->~U(); aligned_free<U>(p); } private: std::atomic<ProducerBase*> producerListTail; std::atomic<std::uint32_t> producerCount; std::atomic<size_t> initialBlockPoolIndex; Block* initialBlockPool; size_t initialBlockPoolSize; #ifndef MCDBGQ_USEDEBUGFREELIST FreeList<Block> freeList; #else debug::DebugFreeList<Block> freeList; #endif std::atomic<ImplicitProducerHash*> implicitProducerHash; std::atomic<size_t> implicitProducerHashCount; // Number of slots logically used ImplicitProducerHash initialImplicitProducerHash; std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries; std::atomic_flag implicitProducerHashResizeInProgress; std::atomic<std::uint32_t> nextExplicitConsumerId; std::atomic<std::uint32_t> globalExplicitConsumerOffset; #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH debug::DebugMutex implicitProdMutex; #endif #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG std::atomic<ExplicitProducer*> explicitProducers; std::atomic<ImplicitProducer*> implicitProducers; #endif }; template <typename T, typename Traits> ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue) : producer(queue.recycle_or_create_producer(true)) { if (producer != nullptr) { producer->token = this; } } template <typename T, typename Traits> ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue) : producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue) ->recycle_or_create_producer(true)) { if (producer != nullptr) { producer->token = this; } } template <typename T, typename Traits> ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue) : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) { initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); lastKnownGlobalOffset = static_cast<std::uint32_t>(-1); } template <typename T, typename Traits> ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue) : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) { initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue) ->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); lastKnownGlobalOffset = static_cast<std::uint32_t>(-1); } template <typename T, typename Traits> inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT { a.swap(b); } inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT { a.swap(b); } inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT { a.swap(b); } template <typename T, typename Traits> inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename 
ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b)
    MOODYCAMEL_NOEXCEPT {
  a.swap(b);
}

}  // namespace ylt::detail::moodycamel

#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
#pragma warning(pop)
#endif

#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#pragma GCC diagnostic pop
#endif
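//////////////////////////////////
// Usage sketch (comment only)
//////////////////////////////////

// The example below is illustrative and is kept as a comment so that nothing
// here is compiled into the header. The include path is assumed from this
// file's location in the repository; the queue, token, and bulk-operation
// names are the public API defined above.
//
//   #include <ylt/util/concurrentqueue.h>
//
//   using namespace ylt::detail::moodycamel;
//
//   ConcurrentQueue<int> q;
//   ProducerToken ptok(q);  // binds an explicit producer (constructor above)
//   ConsumerToken ctok(q);  // per-consumer rotation state (constructor above)
//
//   int in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   q.enqueue_bulk(ptok, in, 8);  // publishes all 8 items with a single
//                                 // release store of tailIndex
//
//   int out[8];
//   size_t n = q.try_dequeue_bulk(ctok, out, 8);  // dequeues up to 8 items
//                                                 // and returns the count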