From ce3ecec7719df58333b5e61a5aef692392ad90c2 Mon Sep 17 00:00:00 2001
From: sharpener6 <1sc2l4qi@duck.com>
Date: Wed, 24 Sep 2025 19:30:01 -0400
Subject: [PATCH 1/3] Rework all GitHub workflows (#230)

- Move workflows to actions so they can be reused
- Remove the dependency on Boost
- Fix a formatting issue
- Fix formatting and import sort order

---
 .github/actions/compile-library/action.yml    |   2 +-
 examples/task_capabilities.py                 |  10 +-
 scaler/io/ymq/bytes.h                         |  11 +-
 .../io/ymq/examples/automated_echo_client.cpp |   2 +-
 scaler/io/ymq/message_connection_tcp.cpp      |   1 -
 scaler/io/ymq/pymod_ymq/async.h               |   3 +-
 scaler/io/ymq/pymod_ymq/exception.h           |   2 +-
 scaler/io/ymq/pymod_ymq/io_context.h          |   4 +-
 scaler/io/ymq/pymod_ymq/io_socket.h           |  10 +-
 scaler/io/ymq/pymod_ymq/ymq.h                 |   2 +-
 scaler/io/ymq/simple_interface.cpp            |  28 +-
 scaler/io/ymq/simple_interface.h              |   8 +-
 scaler/io/ymq/ymq.pyi                         |  14 +-
 scaler/io/ymq/ymq_test.py                     |  20 -
 .../object_storage/object_storage_server.cpp  |  14 +-
 scaler/object_storage/object_storage_server.h |   3 +-
 scripts/build.sh                              |   2 +-
 tests/CMakeLists.txt                          |   5 +-
 tests/cc_ymq/CMakeLists.txt                   |   1 +
 tests/cc_ymq/common.h                         | 496 +++++++++++++++++
 tests/cc_ymq/py_mitm/__init__.py              |   0
 tests/cc_ymq/py_mitm/main.py                  | 152 ++++++
 tests/cc_ymq/py_mitm/passthrough.py           |  23 +
 tests/cc_ymq/py_mitm/randomly_drop_packets.py |  28 +
 tests/cc_ymq/py_mitm/send_rst_to_client.py    |  48 ++
 tests/cc_ymq/py_mitm/types.py                 |  54 ++
 tests/cc_ymq/test_cc_ymq.cpp                  | 508 ++++++++++++++++++
 .../test_object_storage_server.cpp            |  28 +-
 tests/pymod_ymq/__init__.py                   |   0
 tests/pymod_ymq/test_pymod_ymq.py             | 150 ++++++
 tests/pymod_ymq/test_types.py                 |  90 ++++
 31 files changed, 1626 insertions(+), 93 deletions(-)
 delete mode 100644 scaler/io/ymq/ymq_test.py
 create mode 100644 tests/cc_ymq/CMakeLists.txt
 create mode 100644 tests/cc_ymq/common.h
 create mode 100644 tests/cc_ymq/py_mitm/__init__.py
 create mode 100644 tests/cc_ymq/py_mitm/main.py
 create mode 100644 tests/cc_ymq/py_mitm/passthrough.py
 create mode 100644 tests/cc_ymq/py_mitm/randomly_drop_packets.py
 create mode 100644 tests/cc_ymq/py_mitm/send_rst_to_client.py
 create mode 100644 tests/cc_ymq/py_mitm/types.py
 create mode 100644 tests/cc_ymq/test_cc_ymq.cpp
 create mode 100644 tests/pymod_ymq/__init__.py
 create mode 100644 tests/pymod_ymq/test_pymod_ymq.py
 create mode 100644 tests/pymod_ymq/test_types.py

diff --git a/.github/actions/compile-library/action.yml b/.github/actions/compile-library/action.yml
index 1036b1524..6b630bfb6 100644
--- a/.github/actions/compile-library/action.yml
+++ b/.github/actions/compile-library/action.yml
@@ -13,7 +13,7 @@ runs:
     if: inputs.os == 'Linux'
     shell: bash
     run: |
-      CXX=$(which g++-14) ./scripts/build.sh
+      CXX=$(which g++-14) GTEST_FILTER="-*Mitm*:*TestIncompleteIdentity*" ./scripts/build.sh

   - name: Build and test C++ Components (Windows)
     if: inputs.os == 'Windows'

diff --git a/examples/task_capabilities.py b/examples/task_capabilities.py
index 5a38f313a..e75468863 100644
--- a/examples/task_capabilities.py
+++ b/examples/task_capabilities.py
@@ -56,18 +56,12 @@ def main():

     # Submit a task that requires GPU capabilities, this will be redirected to the GPU worker.
     gpu_future = client.submit_verbose(
-        gpu_task,
-        args=(16.0,),
-        kwargs={},
-        capabilities={"gpu": 1}  # Requires a GPU capability
+        gpu_task, args=(16.0,), kwargs={}, capabilities={"gpu": 1}  # Requires a GPU capability
     )

     # Submit a task that does not require GPU capabilities, this will be routed to any available worker.
     cpu_future = client.submit_verbose(
-        cpu_task,
-        args=(16.0,),
-        kwargs={},
-        capabilities={}  # No GPU capability required
+        cpu_task, args=(16.0,), kwargs={}, capabilities={}  # No GPU capability required
     )

     # Waits for the tasks for finish

diff --git a/scaler/io/ymq/bytes.h b/scaler/io/ymq/bytes.h
index 9211bb24e..c3dacb46f 100644
--- a/scaler/io/ymq/bytes.h
+++ b/scaler/io/ymq/bytes.h
@@ -10,6 +10,7 @@
 #include

 // C++
+#include <optional>
 #include <string>

 // First-party
@@ -33,10 +34,9 @@ class Bytes {
 public:
     Bytes(char* data, size_t len): _data(datadup((uint8_t*)data, len)), _len(len) {}

-    Bytes(): _data {}, _len {} {}
+    Bytes(const std::string& s): _data(datadup((uint8_t*)s.data(), s.length())), _len(s.length()) {}

-    // For debug and convenience only
-    explicit Bytes(const std::string& str): Bytes((char*)str.c_str(), str.size()) {}
+    Bytes(): _data {}, _len {} {}

     Bytes(const Bytes& other) noexcept
     {
@@ -92,11 +92,10 @@ class Bytes {

     [[nodiscard]] constexpr bool is_null() const noexcept { return !this->_data; }

-    // debugging utility
-    std::string as_string() const
+    std::optional<std::string> as_string() const
     {
         if (is_null())
-            return "[EMPTY]";
+            return std::nullopt;

         return std::string((char*)_data, _len);
     }
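Bytes::as_string() above now returns std::optional<std::string> instead of the old "[EMPTY]" sentinel, so callers can tell a null Bytes apart from a real (possibly empty) payload. A minimal sketch of the call-site pattern, assuming Bytes sits in the scaler::ymq namespace like the rest of the library:

    #include <cstdio>
    #include "scaler/io/ymq/bytes.h"

    void print_payload(const scaler::ymq::Bytes& payload)
    {
        // std::nullopt corresponds to is_null(); a zero-length Bytes("") still
        // yields an (empty) string, since only a null data pointer is "no value"
        if (auto s = payload.as_string())
            std::printf("payload: '%s'\n", s->c_str());
        else
            std::printf("payload is null\n");
    }

This is why the echo-client fix that follows dereferences the optional with ->c_str() instead of calling .c_str() directly.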
diff --git a/scaler/io/ymq/examples/automated_echo_client.cpp b/scaler/io/ymq/examples/automated_echo_client.cpp
index 5a4ca3510..328b3e219 100644
--- a/scaler/io/ymq/examples/automated_echo_client.cpp
+++ b/scaler/io/ymq/examples/automated_echo_client.cpp
@@ -70,7 +70,7 @@ int main()
         auto future = x.get_future();
         Message msg = future.get().first;
         if (msg.payload.as_string() != longStr) {
-            printf("Checksum failed, %s\n", msg.payload.as_string().c_str());
+            printf("Checksum failed, %s\n", msg.payload.as_string()->c_str());
             exit(1);
         }
     }

diff --git a/scaler/io/ymq/message_connection_tcp.cpp b/scaler/io/ymq/message_connection_tcp.cpp
index 569fe214b..8212238f1 100644
--- a/scaler/io/ymq/message_connection_tcp.cpp
+++ b/scaler/io/ymq/message_connection_tcp.cpp
@@ -269,7 +269,6 @@ void MessageConnectionTCP::updateReadOperation()
             _pendingRecvMessageCallbacks->pop();

             recvMessageCallback({Message(std::move(address), std::move(payload)), {}});
-
         } else {
             assert(_pendingRecvMessageCallbacks->size());
             break;

diff --git a/scaler/io/ymq/pymod_ymq/async.h b/scaler/io/ymq/pymod_ymq/async.h
index 8a2e229a5..ba80d6bb3 100644
--- a/scaler/io/ymq/pymod_ymq/async.h
+++ b/scaler/io/ymq/pymod_ymq/async.h
@@ -12,7 +12,7 @@
 #include "scaler/io/ymq/pymod_ymq/ymq.h"

 // wraps an async callback that accepts a Python asyncio future
-static PyObject* async_wrapper(PyObject* self, const std::function& callback)
+static PyObject* async_wrapper(PyObject* self, const std::function&& callback)
 {
     auto state = YMQStateFromSelf(self);
     if (!state)
@@ -25,7 +25,6 @@ static PyObject* async_wrapper(PyObject* self, const std::function

diff --git a/scaler/io/ymq/pymod_ymq/exception.h b/scaler/io/ymq/pymod_ymq/exception.h
--- a/scaler/io/ymq/pymod_ymq/exception.h
+++ b/scaler/io/ymq/pymod_ymq/exception.h
@@ -5,8 +5,8 @@
 #include

 // First-party
-#include "scaler/io/ymq/pymod_ymq/ymq.h"
 #include "scaler/io/ymq/pymod_ymq/utils.h"
+#include "scaler/io/ymq/pymod_ymq/ymq.h"

 // the order of the members in the exception args tuple
 const Py_ssize_t YMQException_errorCodeIndex = 0;

diff --git a/scaler/io/ymq/pymod_ymq/io_context.h b/scaler/io/ymq/pymod_ymq/io_context.h
index 2bc9be556..e922d5388 100644
--- a/scaler/io/ymq/pymod_ymq/io_context.h
+++ b/scaler/io/ymq/pymod_ymq/io_context.h
@@ -77,8 +77,8 @@ static PyObject* PyIOContext_createIOSocket_(
     using Identity = Configuration::IOSocketIdentity;

     // note: references borrowed from args, so no need to manage their lifetime
-    PyObject* pyIdentity {};
-    PyObject* pySocketType {};
+    PyObject* pyIdentity = nullptr;
+    PyObject* pySocketType = nullptr;
     if (nargs == 1) {
         pyIdentity = args[0];
     } else if (nargs == 2) {

diff --git a/scaler/io/ymq/pymod_ymq/io_socket.h b/scaler/io/ymq/pymod_ymq/io_socket.h
index 152c785a8..898afb832 100644
--- a/scaler/io/ymq/pymod_ymq/io_socket.h
+++ b/scaler/io/ymq/pymod_ymq/io_socket.h
@@ -156,10 +156,6 @@ static PyObject* PyIOSocket_recv(PyIOSocket* self, PyObject* args)
             if (!pyMessage)
                 return YMQ_GetRaisedException();

-            // TODO: why is leaking necessary?
-            address.forget();
-            payload.forget();
-
             return (PyObject*)pyMessage.take();
         });
     } catch (...) {
@@ -221,10 +217,6 @@ static PyObject* PyIOSocket_recv_sync(PyIOSocket* self, PyObject* args)
     if (!pyMessage)
         return nullptr;

-    // TODO: why is leaking necessary?
-    address.forget();
-    payload.forget();
-
     return (PyObject*)pyMessage.take();
 }
@@ -378,7 +370,7 @@ static PyObject* PyIOSocket_socket_type_getter(PyIOSocket* self, void* closure)
     if (!state)
         return nullptr;

-    const IOSocketType socketType  = self->socket->socketType();
+    const IOSocketType socketType = self->socket->socketType();

     OwnedPyObject socketTypeIntObj = PyLong_FromLong((long)socketType);
     if (!socketTypeIntObj)

diff --git a/scaler/io/ymq/pymod_ymq/ymq.h b/scaler/io/ymq/pymod_ymq/ymq.h
index 879b019d8..b788d05dd 100644
--- a/scaler/io/ymq/pymod_ymq/ymq.h
+++ b/scaler/io/ymq/pymod_ymq/ymq.h
@@ -304,7 +304,7 @@ static int YMQ_createErrorCodeEnum(PyObject* pyModule, YMQState* state)
     // docs and examples are unfortunately scarce for this
     // for now this will work just fine
     OwnedPyObject item {};
-    while (item = PyIter_Next(*iter)) {
+    while ((item = PyIter_Next(*iter))) {
         OwnedPyObject fn = PyCMethod_New(&YMQErrorCode_explanation_def, *item, pyModule, nullptr);
         if (!fn)
             return -1;

diff --git a/scaler/io/ymq/simple_interface.cpp b/scaler/io/ymq/simple_interface.cpp
index 6acca250b..0a88f8ac0 100644
--- a/scaler/io/ymq/simple_interface.cpp
+++ b/scaler/io/ymq/simple_interface.cpp
@@ -1,6 +1,8 @@
 #include "scaler/io/ymq/simple_interface.h"

+#include <optional>
+
 namespace scaler {
 namespace ymq {

@@ -35,35 +37,41 @@ void syncConnectSocket(std::shared_ptr<IOSocket> socket, std::string address)
     connect_future.wait();
 }

-std::pair<Message, Error> syncRecvMessage(std::shared_ptr<IOSocket> socket)
+std::expected<Message, Error> syncRecvMessage(std::shared_ptr<IOSocket> socket)
 {
     auto fut = futureRecvMessage(std::move(socket));
     return fut.get();
 }

-std::expected<void, Error> syncSendMessage(std::shared_ptr<IOSocket> socket, Message message)
+std::optional<Error> syncSendMessage(std::shared_ptr<IOSocket> socket, Message message)
 {
     auto fut = futureSendMessage(std::move(socket), std::move(message));
     return fut.get();
 }

-std::future<std::pair<Message, Error>> futureRecvMessage(std::shared_ptr<IOSocket> socket)
+std::future<std::expected<Message, Error>> futureRecvMessage(std::shared_ptr<IOSocket> socket)
 {
-    auto recv_promise_ptr = std::make_unique<std::promise<std::pair<Message, Error>>>();
+    auto recv_promise_ptr = std::make_unique<std::promise<std::expected<Message, Error>>>();
     auto recv_future = recv_promise_ptr->get_future();
-    socket->recvMessage([recv_promise = std::move(recv_promise_ptr)](std::pair<Message, Error> msg) {
-        recv_promise->set_value(std::move(msg));
+    socket->recvMessage([recv_promise = std::move(recv_promise_ptr)](std::pair<Message, Error> result) {
+        if (result.second._errorCode == Error::ErrorCode::Uninit)
+            recv_promise->set_value(std::move(result.first));
+        else
+            recv_promise->set_value(std::unexpected {std::move(result.second)});
     });

     return recv_future;
 }

-std::future<std::expected<void, Error>> futureSendMessage(std::shared_ptr<IOSocket> socket, Message message)
+std::future<std::optional<Error>> futureSendMessage(std::shared_ptr<IOSocket> socket, Message message)
 {
-    auto send_promise_ptr = std::make_unique<std::promise<std::expected<void, Error>>>();
+    auto send_promise_ptr = std::make_unique<std::promise<std::optional<Error>>>();
     auto send_future = send_promise_ptr->get_future();
     socket->sendMessage(
-        std::move(message), [send_promise = std::move(send_promise_ptr)](std::expected<void, Error> msg) {
-            send_promise->set_value(std::move(msg));
+        std::move(message), [send_promise = std::move(send_promise_ptr)](std::expected<void, Error> result) {
+            if (result)
+                send_promise->set_value(std::nullopt);
+            else
+                send_promise->set_value(std::move(result.error()));
         });

     return send_future;
 }
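futureRecvMessage()/futureSendMessage() above bridge IOSocket's callback API onto std::future by moving a std::promise into the completion callback. The same technique in isolation, as a generic sketch (bridge_to_future and its parameter names are hypothetical, not part of YMQ):

    #include <future>
    #include <memory>
    #include <utility>

    // Wrap a single-shot callback registration into a std::future<T>.
    // `register_callback` must invoke the callable it is given exactly once.
    template <typename T, typename Register>
    std::future<T> bridge_to_future(Register register_callback)
    {
        // the promise lives inside the callback until it fires
        auto promise = std::make_unique<std::promise<T>>();
        auto future  = promise->get_future();

        register_callback([p = std::move(promise)](T value) { p->set_value(std::move(value)); });

        return future;
    }

The unique_ptr capture makes the lambda move-only, which works here because IOSocket's recvMessage/sendMessage accept move-only callables — exactly what the diff above relies on.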
diff --git a/scaler/io/ymq/simple_interface.h b/scaler/io/ymq/simple_interface.h
index 2f2d7a03d..9b9fa1f39 100644
--- a/scaler/io/ymq/simple_interface.h
+++ b/scaler/io/ymq/simple_interface.h
@@ -14,11 +14,11 @@ std::shared_ptr<IOSocket> syncCreateSocket(IOContext& context, IOSocketType type
 void syncBindSocket(std::shared_ptr<IOSocket> socket, std::string address);
 void syncConnectSocket(std::shared_ptr<IOSocket> socket, std::string address);

-std::pair<Message, Error> syncRecvMessage(std::shared_ptr<IOSocket> socket);
-std::expected<void, Error> syncSendMessage(std::shared_ptr<IOSocket> socket, Message message);
+std::expected<Message, Error> syncRecvMessage(std::shared_ptr<IOSocket> socket);
+std::optional<Error> syncSendMessage(std::shared_ptr<IOSocket> socket, Message message);

-std::future<std::pair<Message, Error>> futureRecvMessage(std::shared_ptr<IOSocket> socket);
-std::future<std::expected<void, Error>> futureSendMessage(std::shared_ptr<IOSocket> socket, Message message);
+std::future<std::expected<Message, Error>> futureRecvMessage(std::shared_ptr<IOSocket> socket);
+std::future<std::optional<Error>> futureSendMessage(std::shared_ptr<IOSocket> socket, Message message);

 }  // namespace ymq
 }  // namespace scaler
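For reference, a hedged sketch of consuming the reworked sync API declared above — syncSendMessage() now returns std::optional<Error> (empty on success) and syncRecvMessage() returns std::expected<Message, Error>. The socket setup and the error-code formatting are assumptions:

    #include <cstdio>
    #include "scaler/io/ymq/simple_interface.h"

    using namespace scaler::ymq;

    void ping(std::shared_ptr<IOSocket> socket)
    {
        // sending: "no error" means success
        if (auto error = syncSendMessage(socket, {.address = Bytes("server"), .payload = Bytes("ping")}))
            std::printf("send failed: %d\n", (int)error->_errorCode);

        // receiving: the expected either holds the Message or the Error
        if (auto result = syncRecvMessage(socket))
            std::printf("received %zu bytes\n", result->payload.len());
        else
            std::printf("recv failed: %d\n", (int)result.error()._errorCode);
    }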
diff --git a/scaler/io/ymq/ymq.pyi b/scaler/io/ymq/ymq.pyi
index 7444d6fad..d436fd136 100644
--- a/scaler/io/ymq/ymq.pyi
+++ b/scaler/io/ymq/ymq.pyi
@@ -1,6 +1,5 @@
 # NOTE: NOT IMPLEMENTATION, TYPE INFORMATION ONLY
 # This file contains type stubs for the Ymq Python C Extension module
-import abc
 import sys
 from collections.abc import Awaitable
 from enum import IntEnum
@@ -11,12 +10,17 @@ if sys.version_info >= (3, 12):
 else:
     Buffer = object

-class Bytes(Buffer, metaclass=abc.ABCMeta):
-    data: bytes
+class Bytes(Buffer):
+    data: bytes | None
     len: int

-    def __init__(self, data: SupportsBytes | bytes) -> None: ...
+    def __init__(self, data: Buffer | None = None) -> None: ...
     def __repr__(self) -> str: ...
+    def __len__(self) -> int: ...
+
+    # this type signature is not 100% accurate because it's implemented in C
+    # but this satisfies the type check and is good enough
+    def __buffer__(self, flags: int, /) -> memoryview: ...

 class Message:
     address: Bytes | None
@@ -99,7 +103,7 @@ class YMQException(Exception):
     code: ErrorCode
     message: str

-    def __init__(self, code: ErrorCode, message: str) -> None: ...
+    def __init__(self, /, code: ErrorCode, message: str) -> None: ...
     def __repr__(self) -> str: ...
     def __str__(self) -> str: ...

diff --git a/scaler/io/ymq/ymq_test.py b/scaler/io/ymq/ymq_test.py
deleted file mode 100644
index 9201983c7..000000000
--- a/scaler/io/ymq/ymq_test.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import asyncio
-
-import ymq
-
-
-async def main():
-    ctx = ymq.IOContext()
-    socket = await ctx.createIOSocket("ident", ymq.IOSocketType.Binder)
-    print(ctx, ";", socket)
-
-    assert socket.identity == "ident"
-    assert socket.socket_type == ymq.IOSocketType.Binder
-
-    exc = ymq.YMQException(ymq.ErrorCode.InvalidAddressFormat, "the address has an invalid format")
-    assert exc.code == ymq.ErrorCode.InvalidAddressFormat
-    assert exc.message == "the address has an invalid format"
-    assert exc.code.explanation()
-
-
-asyncio.run(main())

diff --git a/scaler/object_storage/object_storage_server.cpp b/scaler/object_storage/object_storage_server.cpp
index 22a4a9488..cd5de483e 100644
--- a/scaler/object_storage/object_storage_server.cpp
+++ b/scaler/object_storage/object_storage_server.cpp
@@ -129,14 +129,15 @@ void ObjectStorageServer::processRequests()

     std::ranges::for_each(_pendingSendMessageFuts, [](auto& fut) {
         if (fut.wait_for(0s) == std::future_status::ready) {
-            auto res = fut.get();
-            assert(res);
+            auto error = fut.get();
+            assert(!error);
         }
     });

-    auto [message, error] = ymq::syncRecvMessage(_ioSocket);
+    auto maybeMessage = ymq::syncRecvMessage(_ioSocket);

-    if (error._errorCode != ymq::Error::ErrorCode::Uninit) {
+    if (!maybeMessage) {
+        auto error = maybeMessage.error();
         if (error._errorCode == ymq::Error::ErrorCode::IOSocketStopRequested) {
             auto n = std::ranges::count_if(_pendingSendMessageFuts, [](auto& x) {
                 return x.valid() && x.wait_for(0s) == std::future_status::timeout;
@@ -163,8 +164,9 @@
         }
     }

-    const auto identity = lastMessageIdentity = message.address.as_string();
-    const auto headerOrPayload = std::move(message.payload);
+    const auto identity = *maybeMessage->address.as_string();
+    lastMessageIdentity = identity;
+    const auto headerOrPayload = std::move(maybeMessage->payload);

     auto it = identityToFullRequest.find(identity);
     if (it == identityToFullRequest.end()) {

diff --git a/scaler/object_storage/object_storage_server.h b/scaler/object_storage/object_storage_server.h
index 20ed8e738..00849950e 100644
--- a/scaler/object_storage/object_storage_server.h
+++ b/scaler/object_storage/object_storage_server.h
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include <optional>
 #include

 #include "scaler/io/ymq/configuration.h"
@@ -22,7 +23,7 @@ namespace object_storage {
 class ObjectStorageServer {
 public:
     using Identity = ymq::Configuration::IOSocketIdentity;
-    using SendMessageFuture = std::future<std::expected<void, ymq::Error>>;
+    using SendMessageFuture = std::future<std::optional<ymq::Error>>;

     ObjectStorageServer();
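processRequests() above polls _pendingSendMessageFuts with wait_for(0s): ready futures are consumed and checked, pending ones are skipped. A self-contained sketch of the same non-blocking drain — the server only inspects its futures in place, while this version additionally prunes the finished ones (Error is a stand-in type here):

    #include <cassert>
    #include <chrono>
    #include <future>
    #include <optional>
    #include <vector>

    using namespace std::chrono_literals;

    struct Error {};  // placeholder for ymq::Error

    void drain_completed(std::vector<std::future<std::optional<Error>>>& pending)
    {
        std::erase_if(pending, [](auto& fut) {
            if (!fut.valid() || fut.wait_for(0s) != std::future_status::ready)
                return false;        // still in flight: poll it again later

            auto error = fut.get();  // a future may only be get() once
            assert(!error);          // nullopt now signals a successful send
            return true;             // drop the consumed future
        });
    }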
diff --git a/scripts/build.sh b/scripts/build.sh
index 7256ea367..5fe3482d7 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -27,4 +27,4 @@ cmake --build --preset $BUILD_PRESET
 cmake --install $BUILD_DIR

 # Tests
-ctest --preset $BUILD_PRESET
+ctest --preset $BUILD_PRESET -VV

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ae6a3b116..a1f238355 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -11,6 +11,8 @@ set(BUILD_GMOCK OFF CACHE BOOL "" FORCE)
 set(BUILD_GTEST ON CACHE BOOL "" FORCE)
 FetchContent_MakeAvailable(googletest)

+find_package(Python3 COMPONENTS Development REQUIRED)
+
 # This function compiles, links, and adds a C++ test executable using Google Test.
 # It is shared by all test subdirectories.
 function(add_test_executable test_name source_file)
@@ -26,16 +28,17 @@ function(add_test_executable test_name source_file)
         CapnProto::capnp
         CapnProto::kj
         GTest::gtest_main
+        Python3::Python
     )

     add_test(NAME ${test_name} COMMAND ${test_name})
 endfunction()

-
 if(LINUX OR APPLE)
     # This directory fetches Google Test, so it must be included first.
     add_subdirectory(object_storage)

     # Add the new directory for io tests.
     add_subdirectory(io/ymq)
+    add_subdirectory(cc_ymq)
 endif()

diff --git a/tests/cc_ymq/CMakeLists.txt b/tests/cc_ymq/CMakeLists.txt
new file mode 100644
index 000000000..9f6abe371
--- /dev/null
+++ b/tests/cc_ymq/CMakeLists.txt
@@ -0,0 +1 @@
+add_test_executable(test_cc_ymq test_cc_ymq.cpp)

diff --git a/tests/cc_ymq/common.h b/tests/cc_ymq/common.h
new file mode 100644
index 000000000..42dfab9fe
--- /dev/null
+++ b/tests/cc_ymq/common.h
@@ -0,0 +1,496 @@
+#pragma once
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <sys/timerfd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <cerrno>
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <format>
+#include <functional>
+#include <iostream>
+#include <optional>
+#include <print>
+#include <ranges>
+#include <string>
+#include <system_error>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#define RETURN_FAILURE_IF_FALSE(condition) \
+    if (!(condition)) {                    \
+        return TestResult::Failure;        \
+    }
+
+using namespace std::chrono_literals;
+
+enum class TestResult : char { Success = 1, Failure = 2 };
+
+inline const char* check_localhost(const char* host)
+{
+    return std::strcmp(host, "localhost") == 0 ? "127.0.0.1" : host;
+}
+
+inline std::string format_address(std::string host, uint16_t port)
+{
+    return std::format("tcp://{}:{}", check_localhost(host.c_str()), port);
+}
+
+class OwnedFd {
+public:
+    int fd;
+
+    OwnedFd(int fd): fd(fd) {}
+
+    // move-only
+    OwnedFd(const OwnedFd&) = delete;
+    OwnedFd& operator=(const OwnedFd&) = delete;
+    OwnedFd(OwnedFd&& other) noexcept: fd(other.fd) { other.fd = 0; }
+    OwnedFd& operator=(OwnedFd&& other) noexcept
+    {
+        if (this != &other) {
+            this->fd = other.fd;
+            other.fd = 0;
+        }
+        return *this;
+    }
+
+    ~OwnedFd()
+    {
+        if (fd > 0 && close(fd) < 0)
+            std::println(std::cerr, "failed to close fd!");
+    }
+
+    size_t write(const void* data, size_t len)
+    {
+        auto n = ::write(this->fd, data, len);
+        if (n < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to write to socket");
+
+        return n;
+    }
+
+    void write_all(const char* data, size_t len)
+    {
+        for (size_t cursor = 0; cursor < len;)
+            cursor += this->write(data + cursor, len - cursor);
+    }
+
+    void write_all(std::vector<char> data) { this->write_all(data.data(), data.size()); }
+
+    size_t read(void* buffer, size_t len)
+    {
+        auto n = ::read(this->fd, buffer, len);
+        if (n < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to read from socket");
+        return n;
+    }
+
+    void read_exact(char* buffer, size_t len)
+    {
+        for (size_t cursor = 0; cursor < len;)
+            cursor += this->read(buffer + cursor, len - cursor);
+    }
+
+    operator int() { return fd; }
+};
+
+class Socket: public OwnedFd {
+public:
+    Socket(int fd): OwnedFd(fd) {}
+
+    void connect(const char* host, uint16_t port, bool nowait = false)
+    {
+        sockaddr_in addr {
+            .sin_family = AF_INET,
+            .sin_port   = htons(port),
+            .sin_addr   = {.s_addr = inet_addr(check_localhost(host))},
+            .sin_zero   = {0}};
+
+    connect:
+        if (::connect(this->fd, (sockaddr*)&addr, sizeof(addr)) < 0) {
+            if (errno == ECONNREFUSED && !nowait) {
+                std::this_thread::sleep_for(300ms);
+                goto connect;
+            }
+
+            throw std::system_error(errno, std::generic_category(), "failed to connect");
+        }
+    }
+
+    void bind(const char* host, int port)
+    {
+        sockaddr_in addr {
+            .sin_family = AF_INET,
+            .sin_port   = htons(port),
+            .sin_addr   = {.s_addr = inet_addr(check_localhost(host))},
+            .sin_zero   = {0}};
+
+        auto status = ::bind(this->fd, (sockaddr*)&addr, sizeof(addr));
+        if (status < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to bind");
+    }
+
+    void listen(int n = 32)
+    {
+        auto status = ::listen(this->fd, n);
+        if (status < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to listen on socket");
+    }
+
+    std::pair<Socket, sockaddr_in> accept(int flags = 0)
+    {
+        sockaddr_in peer_addr {};
+        socklen_t len = sizeof(peer_addr);
+        auto fd = ::accept4(this->fd, (sockaddr*)&peer_addr, &len, flags);
+        if (fd < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to accept socket");
+
+        return std::make_pair(Socket(fd), peer_addr);
+    }
+
+    void write_message(std::string message)
+    {
+        uint64_t header = message.length();
+        this->write_all((char*)&header, 8);
+        this->write_all(message.data(), message.length());
+    }
+
+    std::string read_message()
+    {
+        uint64_t header = 0;
+        this->read_exact((char*)&header, 8);
+
+        std::vector<char> buffer(header);
+        this->read_exact(buffer.data(), header);
+
+        return std::string(buffer.data(), header);
+    }
+};
+
+class TcpSocket: public Socket {
+public:
+    TcpSocket(): Socket(0)
+    {
+        this->fd = ::socket(AF_INET, SOCK_STREAM, 0);
+        if (this->fd < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to create socket");
+
+        int on = 1;
+        if (setsockopt(this->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&on, sizeof(on)) < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to set nodelay");
+
+        if (setsockopt(this->fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to set reuseaddr");
+    }
+};
+
+inline void fork_wrapper(std::function<TestResult()> fn, int timeout_secs, OwnedFd pipe_wr)
+{
+    TestResult result = TestResult::Failure;
+    try {
+        result = fn();
+    } catch (const std::exception& e) {
+        std::println(stderr, "Exception: {}", e.what());
+        result = TestResult::Failure;
+    }
+
+    pipe_wr.write_all((char*)&result, sizeof(TestResult));
+}
+
+// this function along with `wait_for_python_ready_sigwait()`
+// work together to wait on a signal from the python process
+// indicating that the tuntap interface has been created, and that the mitm is ready
+inline void wait_for_python_ready_sigblock()
+{
+    sigset_t set {};
+    int sig = 0;
+
+    if (sigemptyset(&set) < 0)
+        throw std::system_error(errno, std::generic_category(), "failed to create empty signal set");
+
+    if (sigaddset(&set, SIGUSR1) < 0)
+        throw std::system_error(errno, std::generic_category(), "failed to add sigusr1 to the signal set");
+
+    if (sigprocmask(SIG_BLOCK, &set, nullptr) < 0)
+        throw std::system_error(errno, std::generic_category(), "failed to mask sigusr1");
+
+    std::println("blocked signal...");
+}
+
+inline void wait_for_python_ready_sigwait(int timeout_secs)
+{
+    sigset_t set {};
+    siginfo_t sig {};
+
+    if (sigemptyset(&set) < 0)
+        throw std::system_error(errno, std::generic_category(), "failed to create empty signal set");
+
+    if (sigaddset(&set, SIGUSR1) < 0)
+        throw std::system_error(errno, std::generic_category(), "failed to add sigusr1 to the signal set");
+
+    std::println("waiting for python to be ready...");
+    timespec ts {.tv_sec = timeout_secs, .tv_nsec = 0};
+    if (sigtimedwait(&set, &sig, &ts) < 0)
+        throw std::system_error(errno, std::generic_category(), "failed to wait on sigusr1");
+
+    sigprocmask(SIG_UNBLOCK, &set, nullptr);
+    std::println("signal received; python is ready");
+}
+
+// run a test
+// forks and runs each of the provided closures
+// if `wait_for_python` is true, wait for SIGUSR1 after forking and executing the first closure
+inline TestResult test(
+    int timeout_secs, std::vector<std::function<TestResult()>> closures, bool wait_for_python = false)
+{
+    std::vector<std::pair<int, int>> pipes {};
+    std::vector<pid_t> pids {};
+
+    for (size_t i = 0; i < closures.size(); i++) {
+        int pipe[2] = {0};
+        if (pipe2(pipe, O_NONBLOCK) < 0) {
+            std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) {
+                close(pipe.first);
+                close(pipe.second);
+            });
+
+            throw std::system_error(errno, std::generic_category(), "failed to create pipe: ");
+        }
+        pipes.push_back(std::make_pair(pipe[0], pipe[1]));
+    }
+
+    for (size_t i = 0; i < closures.size(); i++) {
+        if (wait_for_python && i == 0)
+            wait_for_python_ready_sigblock();
+
+        auto pid = fork();
+        if (pid < 0) {
+            std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) {
+                close(pipe.first);
+                close(pipe.second);
+            });
+
+            std::for_each(pids.begin(), pids.end(), [](const auto& pid) { kill(pid, SIGKILL); });
+
+            throw std::system_error(errno, std::generic_category(), "failed to fork");
+        }
+
+        if (pid == 0) {
+            // close all pipes except our write half
+            for (size_t j = 0; j < pipes.size(); j++) {
+                if (i == j)
+                    close(pipes[i].first);
+                else {
+                    close(pipes[j].first);
+                    close(pipes[j].second);
+                }
+            }
+
+            fork_wrapper(closures[i], timeout_secs, pipes[i].second);
+            std::exit(EXIT_SUCCESS);
+        }
+
+        pids.push_back(pid);
+
+        if (wait_for_python && i == 0)
+            wait_for_python_ready_sigwait(3);
+    }
+
+    // close all write halves of the pipes
+    for (auto pipe: pipes)
+        close(pipe.second);
+
+    std::vector<pollfd> pfds {};
+
+    OwnedFd timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+    if (timerfd < 0) {
+        std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) { close(pipe.first); });
+        std::for_each(pids.begin(), pids.end(), [](const auto& pid) { kill(pid, SIGKILL); });
+
+        throw std::system_error(errno, std::generic_category(), "failed to create timerfd");
+    }
+
+    pfds.push_back({.fd = timerfd.fd, .events = POLL_IN, .revents = 0});
+    for (auto pipe: pipes)
+        pfds.push_back({
+            .fd      = pipe.first,
+            .events  = POLL_IN,
+            .revents = 0,
+        });
+
+    itimerspec spec {
+        .it_interval =
+            {
+                .tv_sec  = 0,
+                .tv_nsec = 0,
+            },
+        .it_value = {
+            .tv_sec  = timeout_secs,
+            .tv_nsec = 0,
+        }};
+
+    if (timerfd_settime(timerfd, 0, &spec, nullptr) < 0) {
+        std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) { close(pipe.first); });
+        std::for_each(pids.begin(), pids.end(), [](const auto& pid) { kill(pid, SIGKILL); });
+
+        throw std::system_error(errno, std::generic_category(), "failed to set timerfd");
+    }
+
+    std::vector<std::optional<TestResult>> results(pids.size(), std::nullopt);
+
+    for (;;) {
+        auto n = poll(pfds.data(), pfds.size(), -1);
+        if (n < 0) {
+            std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) { close(pipe.first); });
+            std::for_each(pids.begin(), pids.end(), [](const auto& pid) { kill(pid, SIGKILL); });
+
+            throw std::system_error(errno, std::generic_category(), "failed to poll: ");
+        }
+
+        for (auto& pfd: std::vector(pfds)) {
+            if (pfd.revents == 0)
+                continue;
+
+            // timed out
+            if (pfd.fd == timerfd) {
+                std::println("Timed out!");
+
+                std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) { close(pipe.first); });
+                std::for_each(pids.begin(), pids.end(), [](const auto& pid) { kill(pid, SIGKILL); });
+
+                return TestResult::Failure;
+            }
+
+            TestResult result = TestResult::Failure;
+            char buffer       = 0;
+            if (read(pfd.fd, &buffer, sizeof(TestResult)) <= 0)
+                result = TestResult::Failure;
+            else
+                result = (TestResult)buffer;
+
+            auto elem = std::find_if(pipes.begin(), pipes.end(), [fd = pfd.fd](auto pipe) { return pipe.first == fd; });
+            auto idx  = elem - pipes.begin();
+            results[idx] = result;
+
+            std::println("Process[{}] completed with {}", idx, result == TestResult::Success ? "Success" : "Failure");
+
+            // this subprocess is done, remove its pipe from the poll fds
+            pfds.erase(std::remove_if(pfds.begin(), pfds.end(), [&](auto p) { return p.fd == pfd.fd; }), pfds.end());
+
+            auto done = std::all_of(results.begin(), results.end(), [](auto result) { return result.has_value(); });
+            if (done)
+                goto end;  // justification for goto: breaks out of two levels of loop
+        }
+    }
+
+end:
+
+    std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) { close(pipe.first); });
+
+    int status = 0;
+    std::for_each(pids.begin(), pids.end(), [&status](const auto& pid) {
+        if (waitpid(pid, &status, 0) < 0)
+            std::println(stderr, "failed to wait on a subprocess");
+    });
+
+    if (std::ranges::any_of(results, [](auto x) { return x == TestResult::Failure; }))
+        return TestResult::Failure;
+
+    return TestResult::Success;
+}
+
+inline TestResult run_python(const char* path, std::vector<const wchar_t*> argv = {})
+{
+    // insert the pid at the start of the argv, this is important for signalling readiness
+    pid_t pid = getppid();
+    auto pid_ws = std::to_wstring(pid);
+    argv.insert(argv.begin(), pid_ws.c_str());
+
+    PyStatus status;
+    PyConfig config;
+    PyConfig_InitPythonConfig(&config);
+
+    status = PyConfig_SetBytesString(&config, &config.program_name, "mitm");
+    if (PyStatus_Exception(status))
+        goto exception;
+
+    status = Py_InitializeFromConfig(&config);
+    if (PyStatus_Exception(status))
+        goto exception;
+    PyConfig_Clear(&config);
+
+    argv.insert(argv.begin(), L"mitm");
+    PySys_SetArgv(argv.size(), (wchar_t**)argv.data());
+
+    {
+        auto file = fopen(path, "r");
+        if (!file)
+            throw std::system_error(errno, std::generic_category(), "failed to open python file");
+
+        PyRun_SimpleFile(file, path);
+        fclose(file);
+    }
+
+    if (Py_FinalizeEx() < 0) {
+        std::println("finalization failure");
+        return TestResult::Failure;
+    }
+
+    return TestResult::Success;
+
+exception:
+    PyConfig_Clear(&config);
+    Py_ExitStatusException(status);
+
+    return TestResult::Failure;
+}
+
+inline TestResult run_mitm(
+    std::string testcase,
+    std::string mitm_ip,
+    uint16_t mitm_port,
+    std::string remote_ip,
+    uint16_t remote_port,
+    std::vector<std::string> extra_args = {})
+{
+    // we build the args for the user to make calling the function more convenient
+    std::vector<std::string> args {
+        testcase, mitm_ip, std::to_string(mitm_port), remote_ip, std::to_string(remote_port)};
+
+    for (auto arg: extra_args)
+        args.push_back(arg);
+
+    // we need to convert to wide strings to pass to Python
+    std::vector<std::wstring> wide_args_owned {};
+
+    // the strings are ascii so we can just make them into wstrings
+    for (const auto& str: args)
+        wide_args_owned.emplace_back(str.begin(), str.end());
+
+    std::vector<const wchar_t*> wide_args {};
+    for (const auto& wstr: wide_args_owned)
+        wide_args.push_back(wstr.c_str());
+
+    return run_python("tests/cc_ymq/py_mitm/main.py", wide_args);
+}

diff --git a/tests/cc_ymq/py_mitm/__init__.py b/tests/cc_ymq/py_mitm/__init__.py
new file mode 100644
index 000000000..e69de29bb
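Socket::write_message()/read_message() above pin down the wire format the raw test clients speak to YMQ: an 8-byte length header written without byte-order conversion (so both ends are assumed little-endian, as in CI), followed by the payload; after connecting, each peer first sends its identity ("client"/"server") as an ordinary frame. A compressed sketch of one frame, with error handling and the partial-write loop left to write_all()/read_exact() above:

    #include <cstdint>
    #include <string>
    #include <unistd.h>

    // one YMQ-style frame: [u64 length][bytes...]
    void send_frame(int fd, const std::string& payload)
    {
        uint64_t header = payload.size();
        (void)::write(fd, &header, sizeof(header));         // 8-byte length prefix
        (void)::write(fd, payload.data(), payload.size());  // then the payload itself
    }

    // handshake as the raw clients below perform it:
    //   send_frame(fd, "client");           // our identity
    //   /* read the peer's identity */      // expect "server"
    //   send_frame(fd, "yi er san si wu liu");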
diff --git a/tests/cc_ymq/py_mitm/main.py b/tests/cc_ymq/py_mitm/main.py
new file mode 100644
index 000000000..edaeba569
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/main.py
@@ -0,0 +1,152 @@
+# flake8: noqa: E402
+
+"""
+This script provides a framework for running MITM test cases
+"""
+
+import argparse
+import os
+import sys
+import importlib
+import signal
+import subprocess
+
+from tests.cc_ymq.py_mitm.types import MITMProtocol, TCPConnection
+from scapy.all import IP, TCP, TunTapInterface  # type: ignore
+
+
+def echo_call(cmd: list[str]):
+    print(f"+ {' '.join(cmd)}")
+    subprocess.check_call(cmd)
+
+
+def create_tuntap_interface(iface_name: str, mitm_ip: str, remote_ip: str) -> TunTapInterface:
+    """
+    Creates a TUNTAP interface, brings it up, and adds IPs using the `ip` program
+
+    Args:
+        iface_name: The name of the TUNTAP interface, usually like `tun0`, `tun1`, etc.
+        mitm_ip: The desired ip address of the mitm. This is the ip that clients can use to connect to the mitm
+        remote_ip: The ip that routes to/from the tuntap interface.
+            packets sent to `mitm_ip` will appear to come from `remote_ip`, \
+            and conversely the tuntap interface can connect/send packets
+            to `remote_ip`, making it a suitable ip for binding a server
+
+    Returns:
+        The TUNTAP interface
+    """
+    iface = TunTapInterface(iface_name, mode="tun")
+
+    try:
+        echo_call(["sudo", "ip", "link", "set", iface_name, "up"])
+        echo_call(["sudo", "ip", "addr", "add", remote_ip, "peer", mitm_ip, "dev", iface_name])
+        print(f"[+] Interface {iface_name} up with IP {mitm_ip}")
+    except subprocess.CalledProcessError:
+        print("[!] Could not bring up interface. Run as root or set manually.")
+        raise
+
+    return iface
+
+
+def main(pid: int, mitm_ip: str, mitm_port: int, remote_ip: str, server_port: int, mitm: MITMProtocol):
+    """
+    This function serves as a framework for man in the middle implementations
+    A client connects to the MITM, then the MITM connects to a remote server
+    The MITM sits in between the client and the server, manipulating the packets sent depending on the test case
+    This function:
+    1. creates a TUNTAP interface and prepares it for MITM
+    2. handles connecting clients and connection closes
+    3. delegates additional logic to a pluggable callable, `mitm`
+    4. returns when both connections have terminated (via )
+
+    Args:
+        pid: this is the pid of the test process, used for signaling readiness \
+            we send SIGUSR1 to this process when the mitm is ready
+        mitm_ip: The desired ip address of the mitm server
+        mitm_port: The desired port of the mitm server. \
+            This is the port used to connect to the server, but the client is free to connect on any port
+        remote_ip: The desired remote ip for the TUNTAP interface. This is the only ip address \
+            reachable by the interface and is thus the src ip for clients, and the ip that the remote server \
+            must be bound to
+        server_port: The port that the remote server is bound to
+        mitm: The core logic for a MITM test case. 
This callable may maintain its own state and is responsible \ + for sending packets over the TUNTAP interface (if it doesn't, nothing will happen) + """ + + tuntap = create_tuntap_interface("tun0", mitm_ip, remote_ip) + + # signal the caller that the tuntap interface has been created + if pid > 0: + os.kill(pid, signal.SIGUSR1) + + # these track information about our connections + # we already know what to expect for the server connection, we are the connector + client_conn = None + server_conn = TCPConnection(mitm_ip, mitm_port, remote_ip, server_port) + + # tracks the state of each connection + client_sent_fin_ack = False + client_closed = False + server_sent_fin_ack = False + server_closed = False + + while True: + pkt = tuntap.recv() + if not pkt.haslayer(IP) or not pkt.haslayer(TCP): + continue + ip = pkt[IP] + tcp = pkt[TCP] + + # for a received packet, the destination ip and port are our local ip and port + # and the source ip and port will be the remote ip and port + sender = TCPConnection(pkt.dst, pkt.dport, pkt.src, pkt.sport) + + if sender == client_conn: + print(f"-> [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}") + elif sender == server_conn: + print(f"<- [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}") + + if tcp.flags == "S": # SYN from client + print("-> [S]") + print(f"[*] New connection from {ip.src}:{tcp.sport} to {ip.dst}:{tcp.dport}") + client_conn = sender + + if tcp.flags == "SA": # SYN-ACK from server + if sender == server_conn: + print(f"[*] Connection to server established: {ip.src}:{tcp.sport} to {ip.dst}:{tcp.dport}") + + if tcp.flags == "FA": # FIN-ACK + if sender == client_conn: + client_sent_fin_ack = True + if sender == server_conn: + server_sent_fin_ack = True + + if tcp.flags == "A": # ACK + if sender == client_conn and server_sent_fin_ack: + server_closed = True + if sender == server_conn and client_sent_fin_ack: + client_closed = True + + mitm.proxy(tuntap, pkt, sender, client_conn, server_conn) + + if client_closed and server_closed: + print("[*] Both connections closed") + return + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Man in the middle test framework") + parser.add_argument("pid", type=int, help="The pid of the test process, used for signaling") + parser.add_argument("testcase", type=str, help="The MITM test case module name") + parser.add_argument("mitm_ip", type=str, help="The desired ip address of the mitm server") + parser.add_argument("mitm_port", type=int, help="The desired port of the mitm server") + parser.add_argument("remote_ip", type=str, help="The desired remote ip for the TUNTAP interface") + parser.add_argument("server_port", type=int, help="The port that the remote server is bound to") + + args, unknown = parser.parse_known_args() + + # add the script's directory to path + sys.path.append(os.path.dirname(os.path.realpath(__file__))) + + # load the module dynamically + module = importlib.import_module(args.testcase) + main(args.pid, args.mitm_ip, args.mitm_port, args.remote_ip, args.server_port, module.MITM(*unknown)) diff --git a/tests/cc_ymq/py_mitm/passthrough.py b/tests/cc_ymq/py_mitm/passthrough.py new file mode 100644 index 000000000..20d8a9069 --- /dev/null +++ b/tests/cc_ymq/py_mitm/passthrough.py @@ -0,0 +1,23 @@ +""" +This MITM acts as a transparent passthrough, it simply forwards packets as they are, +minus necessary header changes to retransmit +This MITM should have no effect on the client and server, +and they should behave as if the 
MITM is not present
+"""
+
+from tests.cc_ymq.py_mitm.types import MITMProtocol, TunTapInterface, IP, TCPConnection
+
+
+class MITM(MITMProtocol):
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None:
+        if sender == client_conn:
+            tuntap.send(server_conn.rewrite(pkt))
+        elif sender == server_conn:
+            tuntap.send(client_conn.rewrite(pkt))

diff --git a/tests/cc_ymq/py_mitm/randomly_drop_packets.py b/tests/cc_ymq/py_mitm/randomly_drop_packets.py
new file mode 100644
index 000000000..a197ac3c8
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/randomly_drop_packets.py
@@ -0,0 +1,28 @@
+"""
+This MITM drops a % of packets
+"""
+
+import random
+
+from tests.cc_ymq.py_mitm.types import MITMProtocol, TunTapInterface, IP, TCPConnection
+
+
+class MITM(MITMProtocol):
+    def __init__(self, drop_pcent: str):
+        self.drop_pcent = float(drop_pcent)
+
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None:
+        if random.random() < self.drop_pcent:
+            print("[!] Dropping packet")
+            return
+
+        if sender == client_conn:
+            tuntap.send(server_conn.rewrite(pkt))
+        elif sender == server_conn:
+            tuntap.send(client_conn.rewrite(pkt))

diff --git a/tests/cc_ymq/py_mitm/send_rst_to_client.py b/tests/cc_ymq/py_mitm/send_rst_to_client.py
new file mode 100644
index 000000000..fc70355e5
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/send_rst_to_client.py
@@ -0,0 +1,48 @@
+"""
+This MITM inserts an unexpected TCP RST
+"""
+
+from tests.cc_ymq.py_mitm.types import IP, TCP, MITMProtocol, TCPConnection, TunTapInterface
+
+
+class MITM(MITMProtocol):
+    def __init__(self):
+        # count the number of psh-acks sent by the client
+        self.client_pshack_counter = 0
+
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None:
+        if sender == client_conn or client_conn is None:
+            if pkt[TCP].flags == "PA":
+                self.client_pshack_counter += 1
+
+                # on the second psh-ack, send a rst instead
+                if self.client_pshack_counter == 2:
+                    rst_pkt = IP(src=client_conn.local_ip, dst=client_conn.remote_ip) / TCP(
+                        sport=client_conn.local_port, dport=client_conn.remote_port, flags="R", seq=pkt[TCP].ack
+                    )
+                    print(f"<- [{rst_pkt[TCP].flags}] (simulated)")
+                    tuntap.send(rst_pkt)
+                    return
+
+            tuntap.send(server_conn.rewrite(pkt))
+        elif sender == server_conn:
+            tuntap.send(client_conn.rewrite(pkt))
+
+
+# client -> mitm -> server
+# server -> mitm -> client
+
+# client: 127.0.0.1:8080
+# mitm: 127.0.0.1:8081
+# server: 127.0.0.1:8081
+
+
+# client -> mitm == src = client.ip, sport = client.port ;; dst = mitm.ip, dport = mitm.port
+# mitm -> server == src = mitm.ip, sport = mitm.port ;; dst = server.ip, dport = server.port
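send_rst_to_client above forges a bare RST (its seq taken from the client's last ack) instead of forwarding the client's second PSH-ACK. Nothing special is needed on the C++ side to observe this: once the kernel accepts the RST, the next read on that socket fails. A sketch of the client-side symptom — plain POSIX behaviour, independent of YMQ:

    #include <cerrno>
    #include <unistd.h>

    // returns true if the peer (here: the mitm) reset the connection
    bool saw_rst(int fd)
    {
        char buf[256];
        ssize_t n = ::read(fd, buf, sizeof(buf));

        // n == 0 would be an orderly FIN close; a reset surfaces as ECONNRESET.
        // Recovering from this is what DISABLED_TestMitmReconnect exercises.
        return n < 0 && errno == ECONNRESET;
    }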
diff --git a/tests/cc_ymq/py_mitm/types.py b/tests/cc_ymq/py_mitm/types.py
new file mode 100644
index 000000000..4a22ee01a
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/types.py
@@ -0,0 +1,54 @@
+"""
+This is the common code for implementing man in the middle in Python
+"""
+
+import dataclasses
+from typing import Protocol
+
+from scapy.all import TunTapInterface, IP, TCP  # type: ignore
+
+
+@dataclasses.dataclass
+class TCPConnection:
+    """
+    Represents a TCP connection over the TUNTAP interface
+    local_ip and local_port are the mitm's ip and port, and
+    remote_ip and remote_port are the ip and port of the remote peer
+    """
+
+    local_ip: str
+    local_port: int
+    remote_ip: str
+    remote_port: int
+
+    def rewrite(self, pkt: IP, ack: int | None = None, data=None):
+        """
+        Rewrite a TCP/IP packet as a packet originating
+        from (local_ip, local_port) and going to (remote_ip, remote_port)
+        This function is useful for taking a packet received from one connection, and redirecting it to another
+
+        Args:
+            pkt: A scapy TCP/IP packet to rewrite
+            ack: An optional ack number to use instead of the one found in `pkt`
+            data: An optional payload to use instead of the one found in `pkt`
+
+        Returns:
+            The rewritten packet, suitable for sending over TUNTAP
+        """
+        tcp = pkt[TCP]
+
+        return (
+            IP(src=self.local_ip, dst=self.remote_ip)
+            / TCP(sport=self.local_port, dport=self.remote_port, flags=tcp.flags, seq=tcp.seq, ack=ack or tcp.ack)
+            / bytes(data or tcp.payload)
+        )
+
+
+class MITMProtocol(Protocol):
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None: ...

diff --git a/tests/cc_ymq/test_cc_ymq.cpp b/tests/cc_ymq/test_cc_ymq.cpp
new file mode 100644
index 000000000..1e7872b5c
--- /dev/null
+++ b/tests/cc_ymq/test_cc_ymq.cpp
@@ -0,0 +1,508 @@
+// this file contains the tests for the C++ interface of YMQ
+// each test case is composed of at least one client and one server, and possibly a middleman
+// the clients and servers used in these tests are defined in the first part of this file
+//
+// the men in the middle (mitm) are implemented using Python and are found in py_mitm/
+// in that directory, `main.py` is the entrypoint and framework for all the mitm,
+// and the individual mitm implementations are found in their respective files
+//
+// the test cases are at the bottom of this file, after the clients and servers
+// the documentation for each case is found on the TEST() definition
+
+#include <gtest/gtest.h>
+#include <unistd.h>
+
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <thread>
+
+#include "common.h"
+#include "scaler/io/ymq/bytes.h"
+#include "scaler/io/ymq/io_context.h"
+#include "scaler/io/ymq/simple_interface.h"
+#include "tests/cc_ymq/common.h"
+
+using namespace scaler::ymq;
+using namespace std::chrono_literals;
+
+// ━━━━━━━━━━━━━━━━━━━
+// clients and servers
+// ━━━━━━━━━━━━━━━━━━━
+
+TestResult basic_server_ymq(std::string host, uint16_t port)
+{
+    IOContext context(1);
+
+    auto socket = syncCreateSocket(context, IOSocketType::Binder, "server");
+    syncBindSocket(socket, format_address(host, port));
+
+    auto result = syncRecvMessage(socket);
+
+    RETURN_FAILURE_IF_FALSE(result.has_value());
+    RETURN_FAILURE_IF_FALSE(result->payload.as_string() == "yi er san si wu liu");
+
+    context.removeIOSocket(socket);
+
+    return TestResult::Success;
+}
+
+TestResult basic_client_ymq(std::string host, uint16_t port)
+{
+    IOContext context(1);
+
+    auto socket = syncCreateSocket(context, IOSocketType::Connector, "client");
+    syncConnectSocket(socket, format_address(host, port));
+
+    auto result = syncSendMessage(socket, {.address = Bytes("server"), .payload = Bytes("yi er san si wu liu")});
+
+    context.removeIOSocket(socket);
+
+    return TestResult::Success;
+}
+
+TestResult basic_server_raw(std::string host, uint16_t port)
+{
+    TcpSocket socket;
+
+    socket.bind(host.c_str(), port);
+    socket.listen();
+
+    auto [client, _] = socket.accept();
+
+    client.write_message("server");
+
+    auto client_identity = client.read_message();
+    RETURN_FAILURE_IF_FALSE(client_identity == "client");
+
+    auto msg = client.read_message();
+    RETURN_FAILURE_IF_FALSE(msg == "yi er san si wu 
liu"); + + return TestResult::Success; +} + +TestResult basic_client_raw(int delay, std::string host, uint16_t port) +{ + TcpSocket socket; + + socket.connect(host.c_str(), port); + socket.write_message("client"); + auto server_identity = socket.read_message(); + RETURN_FAILURE_IF_FALSE(server_identity == "server"); + socket.write_message("yi er san si wu liu"); + + if (delay) + std::this_thread::sleep_for(std::chrono::seconds(delay)); + + return TestResult::Success; +} + +TestResult server_receives_big_message(std::string host, uint16_t port) +{ + IOContext context(1); + + auto socket = syncCreateSocket(context, IOSocketType::Binder, "server"); + syncBindSocket(socket, format_address(host, port)); + auto result = syncRecvMessage(socket); + + RETURN_FAILURE_IF_FALSE(result.has_value()); + RETURN_FAILURE_IF_FALSE(result->payload.len() == 500'000'000); + + context.removeIOSocket(socket); + + return TestResult::Success; +} + +TestResult client_sends_big_message(int delay, std::string host, uint16_t port) +{ + TcpSocket socket; + + socket.connect(host.c_str(), port); + socket.write_message("client"); + auto remote_identity = socket.read_message(); + RETURN_FAILURE_IF_FALSE(remote_identity == "server"); + std::string msg(500'000'000, '.'); + socket.write_message(msg); + + if (delay) + std::this_thread::sleep_for(std::chrono::seconds(delay)); + + return TestResult::Success; +} + +TestResult reconnect_server_main(std::string host, uint16_t port) +{ + IOContext context(1); + + auto socket = syncCreateSocket(context, IOSocketType::Binder, "server"); + syncBindSocket(socket, format_address(host, port)); + auto result = syncRecvMessage(socket); + + RETURN_FAILURE_IF_FALSE(result.has_value()); + RETURN_FAILURE_IF_FALSE(result->payload.as_string() == "hello!!"); + + context.removeIOSocket(socket); + + return TestResult::Success; +} + +TestResult reconnect_client_main(std::string host, uint16_t port) +{ + IOContext context(1); + + auto socket = syncCreateSocket(context, IOSocketType::Connector, "client"); + syncConnectSocket(socket, format_address(host, port)); + auto result = syncSendMessage(socket, {.address = Bytes("server"), .payload = Bytes("hello!!")}); + + context.removeIOSocket(socket); + + return TestResult::Success; +} + +TestResult client_simulated_slow_network(const char* host, uint16_t port) +{ + TcpSocket socket; + + socket.connect(host, port); + socket.write_message("client"); + auto remote_identity = socket.read_message(); + RETURN_FAILURE_IF_FALSE(remote_identity == "server"); + + std::string message = "yi er san si wu liu"; + uint64_t header = message.length(); + + socket.write_all((char*)&header, 4); + std::this_thread::sleep_for(5s); + socket.write_all((char*)&header + 4, 4); + std::this_thread::sleep_for(3s); + socket.write_all(message.data(), header / 2); + std::this_thread::sleep_for(5s); + socket.write_all(message.data() + header / 2, header - header / 2); + std::this_thread::sleep_for(3s); + + return TestResult::Success; +} + +TestResult client_sends_incomplete_identity(const char* host, uint16_t port) +{ + // open a socket, write an incomplete identity and exit + { + TcpSocket socket; + + socket.connect(host, port); + + auto server_identity = socket.read_message(); + RETURN_FAILURE_IF_FALSE(server_identity == "server"); + + // write incomplete identity and exit + std::string identity = "client"; + uint64_t header = identity.length(); + socket.write_all((char*)&header, 8); + socket.write_all(identity.data(), identity.length() - 2); + std::this_thread::sleep_for(3s); + } + + // 
connect again and try to send a message
+    {
+        TcpSocket socket;
+
+        socket.connect(host, port);
+
+        auto server_identity = socket.read_message();
+        RETURN_FAILURE_IF_FALSE(server_identity == "server");
+
+        socket.write_message("client");
+        socket.write_message("yi er san si wu liu");
+
+        std::this_thread::sleep_for(3s);
+    }
+
+    return TestResult::Success;
+}
+
+TestResult server_receives_huge_header(const char* host, uint16_t port)
+{
+    IOContext context(1);
+
+    auto socket = syncCreateSocket(context, IOSocketType::Binder, "server");
+    syncBindSocket(socket, format_address(host, port));
+
+    auto result = syncRecvMessage(socket);
+
+    RETURN_FAILURE_IF_FALSE(result.has_value());
+    RETURN_FAILURE_IF_FALSE(result->payload.as_string() == "yi er san si wu liu");
+
+    context.removeIOSocket(socket);
+
+    return TestResult::Success;
+}
+
+TestResult client_sends_huge_header(const char* host, uint16_t port)
+{
+    TcpSocket socket;
+
+    socket.connect(host, port);
+    socket.write_message("client");
+
+    auto server_identity = socket.read_message();
+    RETURN_FAILURE_IF_FALSE(server_identity == "server");
+
+    // write the huge header
+    uint64_t header = std::numeric_limits<uint64_t>::max();
+    socket.write_all((char*)&header, 8);
+
+    // TODO: this sleep shouldn't be necessary
+    std::this_thread::sleep_for(3s);
+
+    return TestResult::Success;
+}
+
+TestResult server_receives_empty_messages(const char* host, uint16_t port)
+{
+    IOContext context(1);
+
+    auto socket = syncCreateSocket(context, IOSocketType::Binder, "server");
+    syncBindSocket(socket, format_address(host, port));
+
+    auto result = syncRecvMessage(socket);
+    RETURN_FAILURE_IF_FALSE(result.has_value());
+    RETURN_FAILURE_IF_FALSE(result->payload.as_string() == "");
+
+    auto result2 = syncRecvMessage(socket);
+    RETURN_FAILURE_IF_FALSE(result2.has_value());
+    RETURN_FAILURE_IF_FALSE(result2->payload.as_string() == "");
+
+    context.removeIOSocket(socket);
+
+    return TestResult::Success;
+}
+
+TestResult client_sends_empty_messages(std::string host, uint16_t port)
+{
+    IOContext context(1);
+
+    auto socket = syncCreateSocket(context, IOSocketType::Connector, "client");
+    syncConnectSocket(socket, format_address(host, port));
+
+    auto error = syncSendMessage(socket, Message {.address = Bytes(), .payload = Bytes()});
+    RETURN_FAILURE_IF_FALSE(!error);
+
+    auto error2 = syncSendMessage(socket, Message {.address = Bytes(), .payload = Bytes("")});
+    RETURN_FAILURE_IF_FALSE(!error2);
+
+    context.removeIOSocket(socket);
+
+    return TestResult::Success;
+}
+
+// ━━━━━━━━━━━━━
+// test cases
+// ━━━━━━━━━━━━━
+
+// this is a 'basic' test which sends a single message from a client to a server
+// in this variant, both the client and server are implemented using YMQ
+//
+// this case includes a _delay_
+// this is a thread sleep that happens after the client sends the message, to delay the close() of the socket
+// at the moment, if this delay is missing, YMQ will not shut down correctly
+TEST(CcYmqTestSuite, TestBasicYMQClientYMQServer)
+{
+    auto host = "localhost";
+    auto port = 2889;
+
+    // this is the test harness, it accepts a timeout, a list of functions to run,
+    // and an optional third argument used to coordinate the execution of python (for mitm)
+    auto result =
+        test(10, {[=] { return basic_client_ymq(host, port); }, [=] { return basic_server_ymq(host, port); }});
+
+    // test() aggregates the results across all of the provided functions
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// same as above, except YMQ's protocol is directly implemented on top of a TCP socket
+TEST(CcYmqTestSuite, TestBasicRawClientYMQServer)
+{
+    auto host = "localhost";
+    auto port = 2890;
+
+    // this is the test harness, it accepts a timeout, a list of functions to run,
+    // and an optional third argument used to coordinate the execution of python (for mitm)
+    auto result =
+        test(10, {[=] { return basic_client_raw(5, host, port); }, [=] { return basic_server_ymq(host, port); }});
+
+    // test() aggregates the results across all of the provided functions
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+TEST(CcYmqTestSuite, TestBasicRawClientRawServer)
+{
+    auto host = "localhost";
+    auto port = 2891;
+
+    // this is the test harness, it accepts a timeout, a list of functions to run,
+    // and an optional third argument used to coordinate the execution of python (for mitm)
+    auto result =
+        test(10, {[=] { return basic_client_raw(0, host, port); }, [=] { return basic_server_raw(host, port); }});
+
+    // test() aggregates the results across all of the provided functions
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// TODO: this should pass
+// this is the same as above, except that it has no delay before calling close() on the socket
+// this test hangs
+TEST(CcYmqTestSuite, DISABLED_TestBasicRawClientRawServerNoDelay)
+{
+    auto host = "localhost";
+    auto port = 2892;
+
+    auto result =
+        test(10, {[=] { return basic_client_raw(0, host, port); }, [=] { return basic_server_ymq(host, port); }});
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+TEST(CcYmqTestSuite, TestBasicDelayYMQClientRawServer)
+{
+    auto host = "localhost";
+    auto port = 2893;
+
+    // this is the test harness, it accepts a timeout, a list of functions to run,
+    // and an optional third argument used to coordinate the execution of python (for mitm)
+    auto result =
+        test(10, {[=] { return basic_client_ymq(host, port); }, [=] { return basic_server_raw(host, port); }});
+
+    // test() aggregates the results across all of the provided functions
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// in this test case, the client sends a large message to the server
+// YMQ should be able to handle this without issue
+TEST(CcYmqTestSuite, TestClientSendBigMessageToServer)
+{
+    auto host = "localhost";
+    auto port = 2894;
+
+    auto result = test(
+        10,
+        {[=] { return client_sends_big_message(5, host, port); },
+         [=] { return server_receives_big_message(host, port); }});
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// this is the no-op/passthrough man in the middle test
+// for this test case we use YMQ on both the client side and the server side
+// the client connects to the mitm, and the mitm connects to the server
+// when the mitm receives packets from the client, it forwards them to the server without changing them
+// and similarly when it receives packets from the server, it forwards them to the client
+//
+// the mitm is implemented in Python. 
we pass the name of the test case, which corresponds to the Python filename,
+// and a list of arguments, which are: mitm ip, mitm port, remote ip, remote port
+// this defines the address of the mitm, and the addresses that can connect to it
+// for more, see the python mitm files
+TEST(CcYmqTestSuite, TestMitmPassthrough)
+{
+    auto mitm_ip = "192.0.2.4";
+    auto mitm_port = 2323;
+    auto remote_ip = "192.0.2.3";
+    auto remote_port = 23571;
+
+    // the Python program must be the first and only the first function passed to test()
+    // we must also pass `true` as the third argument to ensure that Python is fully started
+    // before beginning the test
+    auto result = test(
+        20,
+        {[=] { return run_mitm("passthrough", mitm_ip, mitm_port, remote_ip, remote_port); },
+         [=] { return basic_client_ymq(mitm_ip, mitm_port); },
+         [=] { return basic_server_ymq(remote_ip, remote_port); }},
+        true);
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// this test uses the mitm to test the reconnect logic of YMQ by sending RST packets
+// this test is disabled until fixes arrive in the core
+TEST(CcYmqTestSuite, DISABLED_TestMitmReconnect)
+{
+    auto mitm_ip = "192.0.2.4";
+    auto mitm_port = 2525;
+    auto remote_ip = "192.0.2.3";
+    auto remote_port = 23575;
+
+    auto result = test(
+        10,
+        {[=] { return run_mitm("send_rst_to_client", mitm_ip, mitm_port, remote_ip, remote_port); },
+         [=] { return reconnect_client_main(mitm_ip, mitm_port); },
+         [=] { return reconnect_server_main(remote_ip, remote_port); }},
+        true);
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// TODO: Make this more reliable, and re-enable it
+// in this test, the mitm drops a random % of packets arriving from the client and server
+TEST(CcYmqTestSuite, DISABLED_TestMitmRandomlyDropPackets)
+{
+    auto mitm_ip = "192.0.2.4";
+    auto mitm_port = 2828;
+    auto remote_ip = "192.0.2.3";
+    auto remote_port = 23591;
+
+    auto result = test(
+        60,
+        {[=] { return run_mitm("randomly_drop_packets", mitm_ip, mitm_port, remote_ip, remote_port, {"0.3"}); },
+         [=] { return basic_client_ymq(mitm_ip, mitm_port); },
+         [=] { return basic_server_ymq(remote_ip, remote_port); }},
+        true);
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// in this test the client is sending a message to the server
+// but we simulate a slow network connection by sending the message in segmented chunks
+TEST(CcYmqTestSuite, TestSlowNetwork)
+{
+    auto host = "localhost";
+    auto port = 2895;
+
+    auto result = test(
+        20, {[=] { return client_simulated_slow_network(host, port); }, [=] { return basic_server_ymq(host, port); }});
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// TODO: figure out why this test fails in ci sometimes, and re-enable
+//
+// in this test, a client connects to the YMQ server but only partially sends its identity and then disconnects
+// then a new client connection is established, and this one sends a complete identity and message
+// YMQ should be able to recover from a poorly-behaved client like this
+TEST(CcYmqTestSuite, DISABLED_TestClientSendIncompleteIdentity)
+{
+    auto host = "localhost";
+    auto port = 2896;
+
+    auto result = test(
+        20,
+        {[=] { return client_sends_incomplete_identity(host, port); }, [=] { return basic_server_ymq(host, port); }});
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// TODO: this should pass
+// in this test, the client sends an unrealistically-large header
+// it is important that YMQ checks the header size before allocating memory
+// both for resilience against attacks and to guard against errors
+//
+// at the moment YMQ 
+
+// in this test, the client sends empty messages to the server
+// there are in effect two kinds of empty messages: Bytes() and Bytes("")
+// in the former case, the bytes contains a nullptr
+// in the latter case, the bytes contains a zero-length allocation
+// it's important that the behaviour of YMQ is well-defined for both of these cases
+TEST(CcYmqTestSuite, TestClientSendEmptyMessage)
+{
+    auto host = "localhost";
+    auto port = 2898;
+
+    auto result = test(
+        20,
+        {[=] { return client_sends_empty_messages(host, port); },
+         [=] { return server_receives_empty_messages(host, port); }});
+    EXPECT_EQ(result, TestResult::Success);
+}
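
For reference, the two empty flavours that comment distinguishes can be constructed directly from the Bytes API. A small sketch; it assumes, consistent with the Python-level tests later in this patch, that the zero-length copy is distinguishable from the null state via is_null():

    #include <cassert>
    #include <string>

    #include "scaler/io/ymq/bytes.h"

    void empty_message_flavours()
    {
        Bytes nullBytes;          // default-constructed: holds a nullptr
        std::string empty;        // ""
        Bytes emptyBytes(empty);  // zero-length allocation, non-null
        assert(nullBytes.is_null());
        assert(!emptyBytes.is_null());  // distinct states, both "empty"
    }
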
diff --git a/tests/object_storage/test_object_storage_server.cpp b/tests/object_storage/test_object_storage_server.cpp
index e448e99dc..626a1ce26 100644
--- a/tests/object_storage/test_object_storage_server.cpp
+++ b/tests/object_storage/test_object_storage_server.cpp
@@ -1,6 +1,5 @@
 #include 

-#include 
 #include 
 #include 
 #include 
@@ -8,7 +7,6 @@

 #include "scaler/io/ymq/io_context.h"
 #include "scaler/io/ymq/io_socket.h"
-#include "scaler/io/ymq/logging.h"
 #include "scaler/io/ymq/simple_interface.h"
 #include "scaler/object_storage/object_storage_server.h"
@@ -39,8 +37,8 @@ class ObjectStorageClient {

     void writeYMQMessage(Message message)
     {
-        auto res = syncSendMessage(ioSocket, std::move(message));
-        ASSERT_TRUE(res.has_value());
+        auto error = syncSendMessage(ioSocket, std::move(message));
+        ASSERT_TRUE(!error);
     }

     auto readYMQMessage() { return syncRecvMessage(ioSocket); }
@@ -63,17 +61,17 @@ class ObjectStorageClient {
     void readResponse(ObjectResponseHeader& header, std::optional& payload)
     {
         std::array buf {};
-        auto [message, error] = syncRecvMessage(ioSocket);
-        ASSERT_EQ(error._errorCode, Error::ErrorCode::Uninit);
+        auto result = syncRecvMessage(ioSocket);
+        ASSERT_TRUE(result.has_value());

-        memcpy(buf.begin(), message.payload.data(), CAPNP_HEADER_SIZE);
-        ASSERT_EQ(message.payload.size(), CAPNP_HEADER_SIZE);
+        memcpy(buf.begin(), result->payload.data(), CAPNP_HEADER_SIZE);
+        ASSERT_EQ(result->payload.size(), CAPNP_HEADER_SIZE);

         header = ObjectResponseHeader::fromBuffer(buf);

         if (header.payloadLength > 0) {
-            auto [message2, error2] = syncRecvMessage(ioSocket);
-            ASSERT_EQ(error2._errorCode, Error::ErrorCode::Uninit);
-            payload.emplace(message2.payload);
+            auto result2 = syncRecvMessage(ioSocket);
+            ASSERT_TRUE(result2.has_value());
+            payload.emplace(result2->payload);
         } else {
             payload.reset();
         }
@@ -530,6 +528,9 @@ TEST_F(ObjectStorageServerTest, TestClientDisconnect)
     }
 }

+// TODO: why does this not pass?
+// the message connection tcp is removed from the remote socket's list,
+// but the object is never destructed, and so the connection is never closed
 TEST_F(ObjectStorageServerTest, TestMalformedHeader)
 {
     ObjectResponseHeader responseHeader;
@@ -547,8 +548,9 @@ TEST_F(ObjectStorageServerTest, TestMalformedHeader)
     client->writeYMQMessage(std::move(message));

     // Server should disconnect before or while we are reading the response
-    auto [msg, err] = client->readYMQMessage();
-    EXPECT_EQ(err._errorCode, Error::ErrorCode::ConnectorSocketClosedByRemoteEnd);
+    auto result = client->readYMQMessage();
+    EXPECT_TRUE(!result);
+    EXPECT_EQ(result.error()._errorCode, Error::ErrorCode::ConnectorSocketClosedByRemoteEnd);
 }

 // Server must still answers to requests from other clients
diff --git a/tests/pymod_ymq/__init__.py b/tests/pymod_ymq/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/pymod_ymq/test_pymod_ymq.py b/tests/pymod_ymq/test_pymod_ymq.py
new file mode 100644
index 000000000..ee96914e6
--- /dev/null
+++ b/tests/pymod_ymq/test_pymod_ymq.py
@@ -0,0 +1,150 @@
+import asyncio
+import multiprocessing
+import multiprocessing.connection
+import unittest
+
+from scaler.io.ymq import ymq
+
+
+class TestPymodYMQ(unittest.IsolatedAsyncioTestCase):
+    async def test_basic(self):
+        ctx = ymq.IOContext()
+        binder = await ctx.createIOSocket("binder", ymq.IOSocketType.Binder)
+        self.assertEqual(binder.identity, "binder")
+        self.assertEqual(binder.socket_type, ymq.IOSocketType.Binder)
+
+        connector = await ctx.createIOSocket("connector", ymq.IOSocketType.Connector)
+        self.assertEqual(connector.identity, "connector")
+        self.assertEqual(connector.socket_type, ymq.IOSocketType.Connector)
+
+        await binder.bind("tcp://127.0.0.1:35791")
+        await connector.connect("tcp://127.0.0.1:35791")
+
+        await connector.send(ymq.Message(address=None, payload=b"payload"))
+        msg = await binder.recv()
+
+        assert msg.address is not None
+        self.assertEqual(msg.address.data, b"connector")
+        self.assertEqual(msg.payload.data, b"payload")
+
+    @unittest.skip("this test currently hangs, see comment in the code")
+    async def test_no_address(self):
+        # this test requires special care because it hangs and doesn't shut down the worker threads properly
+        # we use a subprocess to shield us from any effects
+        pipe_parent, pipe_child = multiprocessing.Pipe(duplex=False)
+
+        def test(pipe: multiprocessing.connection.Connection) -> None:
+            async def main():
+                ctx = ymq.IOContext()
+                binder = await ctx.createIOSocket("binder", ymq.IOSocketType.Binder)
+                connector = await ctx.createIOSocket("connector", ymq.IOSocketType.Connector)
+
+                await binder.bind("tcp://127.0.0.1:35791")
+                await connector.connect("tcp://127.0.0.1:35791")
+
+                try:
+                    # TODO: change to `asyncio.timeout()` in python >3.10
+                    await asyncio.wait_for(binder.send(ymq.Message(address=None, payload=b"payload")), 30)
+
+                    # TODO: solve the hang and write the rest of the test
+                    pipe.send(True)
+                except asyncio.TimeoutError:
+                    pipe.send(False)
+
+            asyncio.run(main())
+
+        p = multiprocessing.Process(target=test, args=(pipe_child,))
+        p.start()
+        result = pipe_parent.recv()
+        p.join(5)
+        if p.exitcode is None:
+            p.kill()
+
+        if not result:
+            self.fail()
+
+    async def test_routing(self):
+        ctx = ymq.IOContext()
+        binder = await ctx.createIOSocket("binder", ymq.IOSocketType.Binder)
+        connector1 = await ctx.createIOSocket("connector1", ymq.IOSocketType.Connector)
+        connector2 = await ctx.createIOSocket("connector2", ymq.IOSocketType.Connector)
+
+        await 
binder.bind("tcp://127.0.0.1:35791") + await connector1.connect("tcp://127.0.0.1:35791") + await connector2.connect("tcp://127.0.0.1:35791") + + await binder.send(ymq.Message(b"connector2", b"2")) + await binder.send(ymq.Message(b"connector1", b"1")) + + msg1 = await connector1.recv() + self.assertEqual(msg1.payload.data, b"1") + + msg2 = await connector2.recv() + self.assertEqual(msg2.payload.data, b"2") + + async def test_pingpong(self): + ctx = ymq.IOContext() + binder = await ctx.createIOSocket("binder", ymq.IOSocketType.Binder) + connector = await ctx.createIOSocket("connector", ymq.IOSocketType.Connector) + + await binder.bind("tcp://127.0.0.1:35791") + await connector.connect("tcp://127.0.0.1:35791") + + async def binder_routine(binder: ymq.IOSocket, limit: int) -> bool: + i = 0 + while i < limit: + await binder.send(ymq.Message(address=b"connector", payload=f"{i}".encode())) + msg = await binder.recv() + assert msg.payload.data is not None + + recv_i = int(msg.payload.data.decode()) + if recv_i - i > 1: + return False + i = recv_i + 1 + return True + + async def connector_routine(connector: ymq.IOSocket, limit: int) -> bool: + i = 0 + while True: + msg = await connector.recv() + assert msg.payload.data is not None + recv_i = int(msg.payload.data.decode()) + if recv_i - i > 1: + return False + i = recv_i + 1 + await connector.send(ymq.Message(address=None, payload=f"{i}".encode())) + + # when the connector sends `limit - 1`, we're done + if i >= limit - 1: + break + return True + + binder_success, connector_success = await asyncio.gather( + binder_routine(binder, 100), connector_routine(connector, 100) + ) + + if not binder_success: + self.fail("binder failed") + + if not connector_success: + self.fail("connector failed") + + async def test_big_message(self): + ctx = ymq.IOContext() + binder = await ctx.createIOSocket("binder", ymq.IOSocketType.Binder) + self.assertEqual(binder.identity, "binder") + self.assertEqual(binder.socket_type, ymq.IOSocketType.Binder) + + connector = await ctx.createIOSocket("connector", ymq.IOSocketType.Connector) + self.assertEqual(connector.identity, "connector") + self.assertEqual(connector.socket_type, ymq.IOSocketType.Connector) + + await binder.bind("tcp://127.0.0.1:35791") + await connector.connect("tcp://127.0.0.1:35791") + + for _ in range(10): + await connector.send(ymq.Message(address=None, payload=b"." * 500_000_000)) + msg = await binder.recv() + + assert msg.address is not None + self.assertEqual(msg.address.data, b"connector") + self.assertEqual(msg.payload.data, b"." 
* 500_000_000) diff --git a/tests/pymod_ymq/test_types.py b/tests/pymod_ymq/test_types.py new file mode 100644 index 000000000..e461856e0 --- /dev/null +++ b/tests/pymod_ymq/test_types.py @@ -0,0 +1,90 @@ +import unittest +from enum import IntEnum +from scaler.io.ymq import ymq +import array + + +class TestTypes(unittest.TestCase): + def test_exception(self): + # type checkers misidentify this as "unnecessary" due to the type hints file + self.assertTrue(issubclass(ymq.YMQException, Exception)) # type: ignore + + exc = ymq.YMQException(ymq.ErrorCode.CoreBug, "oh no") + self.assertEqual(exc.args, (ymq.ErrorCode.CoreBug, "oh no")) + self.assertEqual(exc.code, ymq.ErrorCode.CoreBug) + self.assertEqual(exc.message, "oh no") + + def test_interrupted_exception(self): + self.assertTrue(issubclass(ymq.YMQInterruptedException, Exception)) # type: ignore + + exc = ymq.YMQInterruptedException() + self.assertEqual(exc.args, tuple()) + + def test_error_code(self): + self.assertTrue(issubclass(ymq.ErrorCode, IntEnum)) # type: ignore + self.assertEqual( + ymq.ErrorCode.ConfigurationError.explanation(), + "An error generated by system call that's likely due to mis-configuration", + ) + + def test_bytes(self): + b = ymq.Bytes(b"data") + self.assertEqual(b.len, len(b)) + self.assertEqual(b.len, 4) + self.assertEqual(b.data, b"data") + + # would raise an exception if ymq.Bytes didn't support the buffer interface + m = memoryview(b) + self.assertTrue(m.obj is b) + self.assertEqual(m.tobytes(), b"data") + + b = ymq.Bytes() + self.assertEqual(b.len, 0) + self.assertTrue(b.data is None) + + b = ymq.Bytes(b"") + self.assertEqual(b.len, 0) + self.assertEqual(b.data, b"") + + b = ymq.Bytes(array.array("B", [115, 99, 97, 108, 101, 114])) + assert b.len == 6 + assert b.data == b"scaler" + + def test_message(self): + m = ymq.Message(b"address", b"payload") + assert m.address is not None + self.assertEqual(m.address.data, b"address") + self.assertEqual(m.payload.data, b"payload") + + m = ymq.Message(address=None, payload=ymq.Bytes(b"scaler")) + self.assertTrue(m.address is None) + self.assertEqual(m.payload.data, b"scaler") + + m = ymq.Message(b"address", payload=b"payload") + assert m.address is not None + self.assertEqual(m.address.data, b"address") + self.assertEqual(m.payload.data, b"payload") + + def test_io_context(self): + ctx = ymq.IOContext() + self.assertEqual(ctx.num_threads, 1) + + ctx = ymq.IOContext(2) + self.assertEqual(ctx.num_threads, 2) + + ctx = ymq.IOContext(num_threads=3) + self.assertEqual(ctx.num_threads, 3) + + def test_io_socket(self): + # check that we can't create io socket instances directly + self.assertRaises(TypeError, lambda: ymq.IOSocket()) + + def test_io_socket_type(self): + self.assertTrue(issubclass(ymq.IOSocketType, IntEnum)) # type: ignore + + def test_bad_socket_type(self): + ctx = ymq.IOContext() + + # TODO: should the core reject this? 
+        socket = ctx.createIOSocket_sync("identity", ymq.IOSocketType.Uninit)
+        self.assertEqual(socket.socket_type, ymq.IOSocketType.Uninit)

From 88613f90f24a510eaee96cc05acc311b70b35815 Mon Sep 17 00:00:00 2001
From: sharpener6 <1sc2l4qi@duck.com>
Date: Wed, 24 Sep 2025 19:30:01 -0400
Subject: [PATCH 2/3] Rework all github workflows (#230)

- Moved workflows to actions so they can be reused
- Removed the dependency on boost
- Fix formatting issue
- Fix the formatting and import sort orders

YMQ Python Module Fixes
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

remove panic fn
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

reorganize classes to put private members after public
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

apply suggestion
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

add comment
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

take && reference
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

rename variable
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

rename variable
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Add YMQ Tests
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

review reference counting, improve error handling, increase consistency
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

clean up includes
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

update todos
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

update todos
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Improve error integration for futures
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Improve signal handling in sync methods
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Remove temporary ymq test files
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Add basic test case
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Set socket TCP_NODELAY
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Make util fns inline
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Major refactor of ymq tests, add slow network test
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Add new tests
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Use subprocesses instead of threads for better reliability
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Relocate C++ tests and use GTest
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Add Python tests, update interface type hints as necessary
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

mitm
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

add tcp serialization and deserialization
---
 scaler/io/ymq/message_connection_tcp.cpp    |    2 +-
 scaler/io/ymq/tests/incomplete_identity.h   |   53 +
 scaler/io/ymq/third_party/concurrentqueue.h | 7130 ++++++++++---------
 scripts/lint.sh                             |   15 +
 tests/cc_ymq/common.h                       |    5 +-
 tests/cc_ymq/py_mitm/core.py                |   54 +
 tests/cc_ymq/py_mitm/drop.py                |   28 +
 tests/cc_ymq/py_mitm/rst.py                 |   48 +
 tests/cc_ymq/py_mitm/runner.py              |  157 +
 tests/pymod_ymq/config.py                   |   13 +
 10 files changed, 4105 insertions(+), 3400 deletions(-)
 create mode 100644 scaler/io/ymq/tests/incomplete_identity.h
 create mode 100755 scripts/lint.sh
 create mode 100644 tests/cc_ymq/py_mitm/core.py
 create mode 100644 tests/cc_ymq/py_mitm/drop.py
 create mode 100644 tests/cc_ymq/py_mitm/rst.py
 create mode 100644 tests/cc_ymq/py_mitm/runner.py
 create mode 100644 tests/pymod_ymq/config.py

diff --git a/scaler/io/ymq/message_connection_tcp.cpp b/scaler/io/ymq/message_connection_tcp.cpp
index 8212238f1..df2338292 100644
--- a/scaler/io/ymq/message_connection_tcp.cpp
+++ b/scaler/io/ymq/message_connection_tcp.cpp
@@ -267,7 +267,7 @@ void MessageConnectionTCP::updateReadOperation()
         _receivedReadOperations.pop();
         auto recvMessageCallback = std::move(_pendingRecvMessageCallbacks->front());
         _pendingRecvMessageCallbacks->pop();
-
+
         recvMessageCallback({Message(std::move(address), std::move(payload)), {}});
     } else {
         assert(_pendingRecvMessageCallbacks->size());
diff --git a/scaler/io/ymq/tests/incomplete_identity.h b/scaler/io/ymq/tests/incomplete_identity.h
new file mode 100644
index 000000000..e8ea4f1ed
--- /dev/null
+++ b/scaler/io/ymq/tests/incomplete_identity.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include 
+#include 
+
+#include "scaler/io/ymq/examples/common.h"
+#include "scaler/io/ymq/io_context.h"
+#include "tests/cc_ymq/common.h"
+
+void incomplete_identity_server_main()
+{
+    IOContext context(1);
+
+    auto socket = syncCreateSocket(context, IOSocketType::Binder, "server");
+    
syncBindSocket(socket, "tcp://127.0.0.1:25715"); + auto result = syncRecvMessage(socket); + + assert(result.has_value()); + assert(result->payload.as_string() == "yi er san si wu liu"); + + context.removeIOSocket(socket); +} + +void incomplete_identity_client_main() +{ + // open a socket, write an incomplete identity and exit + { + TcpSocket socket; + + socket.connect("127.0.0.1", 25715); + + auto remote_identity = socket.read_message(); + assert(remote_identity == "server"); + + // write incomplete identity and exit + std::string identity = "client"; + uint64_t header = identity.length(); + socket.write_all((char*)&header, 8); + socket.write_all(identity.data(), identity.length() - 2); + std::this_thread::sleep_for(3s); + } + + // connect again and try to send a message + { + TcpSocket socket; + socket.connect("127.0.0.1", 25715); + auto remote_identity = socket.read_message(); + assert(remote_identity == "server"); + socket.write_message("client"); + socket.write_message("yi er san si wu liu"); + std::this_thread::sleep_for(3s); + } +} diff --git a/scaler/io/ymq/third_party/concurrentqueue.h b/scaler/io/ymq/third_party/concurrentqueue.h index 2fc775400..d5498b116 100644 --- a/scaler/io/ymq/third_party/concurrentqueue.h +++ b/scaler/io/ymq/third_party/concurrentqueue.h @@ -47,7 +47,7 @@ // VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher // does not support `if constexpr`, so we have no choice but to simply disable the warning #pragma warning(push) -#pragma warning(disable: 4127) // conditional expression is constant +#pragma warning(disable : 4127) // conditional expression is constant #endif #if defined(__APPLE__) @@ -64,81 +64,114 @@ #undef malloc #undef free #else -#include // Requires C++11. Sorry VS2010. +#include // Requires C++11. Sorry VS2010. 
#include #endif -#include // for max_align_t +#include +#include +#include // for CHAR_BIT +#include // for max_align_t #include #include +#include +#include // used for thread exit synchronization +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading #include -#include #include -#include -#include // for CHAR_BIT -#include -#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading -#include // used for thread exit synchronization // Platform-specific definitions of a numeric thread ID type and an invalid value -namespace moodycamel { namespace details { - template struct thread_id_converter { - typedef thread_id_t thread_id_numeric_size_t; - typedef thread_id_t thread_id_hash_t; - static thread_id_hash_t prehash(thread_id_t const& x) { return x; } - }; -} } +namespace moodycamel { +namespace details { +template +struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } +}; +} // namespace details +} // namespace moodycamel #if defined(MCDBGQ_USE_RELACY) -namespace moodycamel { namespace details { - typedef std::uint32_t thread_id_t; - static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; - static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; - static inline thread_id_t thread_id() { return rl::thread_index(); } -} } +namespace moodycamel { +namespace details { +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; +static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; +static inline thread_id_t thread_id() +{ + return rl::thread_index(); +} +} // namespace details +} // namespace moodycamel #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) // No sense pulling in windows.h in a header, we'll manually declare the function // we use and rely on backwards-compatibility for this not to break extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); -namespace moodycamel { namespace details { - static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); - typedef std::uint32_t thread_id_t; - static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx - static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. - static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } -} } -#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL) -namespace moodycamel { namespace details { - static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); - - typedef std::thread::id thread_id_t; - static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID - - // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's - // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't - // be. 
- static inline thread_id_t thread_id() { return std::this_thread::get_id(); } - - template struct thread_id_size { }; - template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; - template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; - - template<> struct thread_id_converter { - typedef thread_id_size::numeric_t thread_id_numeric_size_t; +namespace moodycamel { +namespace details { +static_assert( + sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = + 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx +static const thread_id_t invalid_thread_id2 = + 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread + // IDs are presently multiples of 4. +static inline thread_id_t thread_id() +{ + return static_cast(::GetCurrentThreadId()); +} +} // namespace details +} // namespace moodycamel +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || \ + defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL) +namespace moodycamel { +namespace details { +static_assert( + sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, + "std::thread::id is expected to be either 4 or 8 bytes"); + +typedef std::thread::id thread_id_t; +static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + +// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's +// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't +// be. +static inline thread_id_t thread_id() +{ + return std::this_thread::get_id(); +} + +template +struct thread_id_size {}; +template <> +struct thread_id_size<4> { + typedef std::uint32_t numeric_t; +}; +template <> +struct thread_id_size<8> { + typedef std::uint64_t numeric_t; +}; + +template <> +struct thread_id_converter { + typedef thread_id_size::numeric_t thread_id_numeric_size_t; #ifndef __APPLE__ - typedef std::size_t thread_id_hash_t; + typedef std::size_t thread_id_hash_t; #else - typedef thread_id_numeric_size_t thread_id_hash_t; + typedef thread_id_numeric_size_t thread_id_hash_t; #endif - static thread_id_hash_t prehash(thread_id_t const& x) - { + static thread_id_hash_t prehash(thread_id_t const& x) + { #ifndef __APPLE__ - return std::hash()(x); + return std::hash()(x); #else - return *reinterpret_cast(&x); + return *reinterpret_cast(&x); #endif - } - }; -} } + } +}; +} +} #else // Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 // In order to get a numeric thread ID in a platform-independent way, we use a thread-local @@ -151,12 +184,19 @@ namespace moodycamel { namespace details { // Assume C++11 compliant compiler #define MOODYCAMEL_THREADLOCAL thread_local #endif -namespace moodycamel { namespace details { - typedef std::uintptr_t thread_id_t; - static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr - static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. 
- inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } -} } +namespace moodycamel { +namespace details { +typedef std::uintptr_t thread_id_t; +static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr +static const thread_id_t invalid_thread_id2 = + 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. +inline thread_id_t thread_id() +{ + static MOODYCAMEL_THREADLOCAL int x; + return reinterpret_cast(&x); +} +} +} #endif // Constexpr if @@ -172,18 +212,19 @@ namespace moodycamel { namespace details { // Exceptions #ifndef MOODYCAMEL_EXCEPTIONS_ENABLED -#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || \ + (!defined(_MSC_VER) && !defined(__GNUC__)) #define MOODYCAMEL_EXCEPTIONS_ENABLED #endif #endif #ifdef MOODYCAMEL_EXCEPTIONS_ENABLED -#define MOODYCAMEL_TRY try -#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) -#define MOODYCAMEL_RETHROW throw -#define MOODYCAMEL_THROW(expr) throw (expr) +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw(expr) #else -#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) -#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false) +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false) #define MOODYCAMEL_RETHROW #define MOODYCAMEL_THROW(expr) #endif @@ -191,21 +232,35 @@ namespace moodycamel { namespace details { #ifndef MOODYCAMEL_NOEXCEPT #if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) #define MOODYCAMEL_NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 // VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( // We have to assume *all* non-trivial constructors may throw on VS2012! #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && std::is_move_constructible::value ? \ + std::is_trivially_move_constructible::value : \ + std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && std::is_move_assignable::value ? 
\ + std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : \ + std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && std::is_move_constructible::value ? \ + std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : \ + std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && std::is_move_assignable::value ? \ + std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : \ + std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #else -#define MOODYCAMEL_NOEXCEPT noexcept -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) #endif #endif @@ -214,18 +269,24 @@ namespace moodycamel { namespace details { #ifdef MCDBGQ_USE_RELACY #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #else -// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 -// g++ <=4.7 doesn't support thread_local either. -// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work -#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: +// http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't support thread_local either. 
Finally, iOS/ARM doesn't +// have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ + (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && \ + (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ + (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && \ + !defined(__MVS__) // Assume `thread_local` is fully supported in all other C++11 compilers/platforms -#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having problems with it on +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having + // problems with it on #endif #endif #endif -// VS2012 doesn't support deleted functions. -// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is +// called. #ifndef MOODYCAMEL_DELETE_FUNCTION #if defined(_MSC_VER) && _MSC_VER < 1800 #define MOODYCAMEL_DELETE_FUNCTION @@ -234,54 +295,100 @@ namespace moodycamel { namespace details { #endif #endif -namespace moodycamel { namespace details { +namespace moodycamel { +namespace details { #ifndef MOODYCAMEL_ALIGNAS // VS2013 doesn't support alignas or alignof, and align() requires a constant literal #if defined(_MSC_VER) && _MSC_VER <= 1800 -#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) -#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) +#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) +#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) #define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned::value, T>::type - template struct Vs2013Aligned { }; // default, unsupported alignment - template struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; - template struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; - template struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; - template struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; - template struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; - template struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; }; - template struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; - template struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; - template struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; +template +struct Vs2013Aligned {}; // default, unsupported alignment +template +struct Vs2013Aligned<1, T> { + typedef __declspec(align(1)) T type; +}; +template +struct Vs2013Aligned<2, T> { + typedef __declspec(align(2)) T type; +}; +template +struct Vs2013Aligned<4, T> { + typedef __declspec(align(4)) T type; +}; +template +struct Vs2013Aligned<8, T> { + typedef __declspec(align(8)) T type; +}; +template +struct Vs2013Aligned<16, T> { + typedef __declspec(align(16)) T type; +}; +template +struct Vs2013Aligned<32, T> { + typedef __declspec(align(32)) T type; +}; +template +struct Vs2013Aligned<64, T> { + typedef __declspec(align(64)) T type; +}; +template +struct Vs2013Aligned<128, T> { + typedef __declspec(align(128)) T type; +}; +template 
+struct Vs2013Aligned<256, T> { + typedef __declspec(align(256)) T type; +}; #else - template struct identity { typedef T type; }; -#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) -#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) +template +struct identity { + typedef T type; +}; +#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) +#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) #define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity::type #endif #endif -} } - +} // namespace details +} // namespace moodycamel // TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one, // we can apply per-function compile-time suppression. // See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer #define MOODYCAMEL_NO_TSAN #if defined(__has_feature) - #if __has_feature(thread_sanitizer) - #undef MOODYCAMEL_NO_TSAN - #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) - #endif // TSAN -#endif // TSAN +#if __has_feature(thread_sanitizer) +#undef MOODYCAMEL_NO_TSAN +#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) +#endif // TSAN +#endif // TSAN // Compiler-specific likely/unlikely hints -namespace moodycamel { namespace details { +namespace moodycamel { +namespace details { #if defined(__GNUC__) - static inline bool (likely)(bool x) { return __builtin_expect((x), true); } - static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +static inline bool(likely)(bool x) +{ + return __builtin_expect((x), true); +} +static inline bool(unlikely)(bool x) +{ + return __builtin_expect((x), false); +} #else - static inline bool (likely)(bool x) { return x; } - static inline bool (unlikely)(bool x) { return x; } +static inline bool(likely)(bool x) +{ + return x; +} +static inline bool(unlikely)(bool x) +{ + return x; +} #endif -} } +} // namespace details +} // namespace moodycamel #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG #include "internal/concurrentqueue_internal_debug.h" @@ -289,28 +396,28 @@ namespace moodycamel { namespace details { namespace moodycamel { namespace details { - template - struct const_numeric_max { - static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); - static const T value = std::numeric_limits::is_signed - ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) - : static_cast(-1); - }; +template +struct const_numeric_max { + static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits::is_signed ? + (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) : + static_cast(-1); +}; #if defined(__GLIBCXX__) - typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while #else - typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: #endif - // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting - // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. 
- typedef union { - std_max_align_t x; - long long y; - void* z; - } max_align_t; -} +// Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting +// 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. +typedef union { + std_max_align_t x; + long long y; + void* z; +} max_align_t; +} // namespace details // Default traits for the ConcurrentQueue. To change some of the // traits without re-implementing all of them, inherit from this @@ -318,99 +425,96 @@ namespace details { // since the traits are used as a template type parameter, the // shadowed declarations will be used where defined, and the defaults // otherwise. -struct ConcurrentQueueDefaultTraits -{ - // General-purpose size type. std::size_t is strongly recommended. - typedef std::size_t size_t; - - // The type used for the enqueue and dequeue indices. Must be at least as - // large as size_t. Should be significantly larger than the number of elements - // you expect to hold at once, especially if you have a high turnover rate; - // for example, on 32-bit x86, if you expect to have over a hundred million - // elements or pump several million elements through your queue in a very - // short space of time, using a 32-bit type *may* trigger a race condition. - // A 64-bit int type is recommended in that case, and in practice will - // prevent a race condition no matter the usage of the queue. Note that - // whether the queue is lock-free with a 64-int type depends on the whether - // std::atomic is lock-free, which is platform-specific. - typedef std::size_t index_t; - - // Internally, all elements are enqueued and dequeued from multi-element - // blocks; this is the smallest controllable unit. If you expect few elements - // but many producers, a smaller block size should be favoured. For few producers - // and/or many elements, a larger block size is preferred. A sane default - // is provided. Must be a power of 2. - static const size_t BLOCK_SIZE = 32; - - // For explicit producers (i.e. when using a producer token), the block is - // checked for being empty by iterating through a list of flags, one per element. - // For large block sizes, this is too inefficient, and switching to an atomic - // counter-based approach is faster. The switch is made for block sizes strictly - // larger than this threshold. - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; - - // How many full blocks can be expected for a single explicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; - - // How many full blocks can be expected for a single implicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; - - // The initial size of the hash table mapping thread IDs to implicit producers. - // Note that the hash is resized every time it becomes half full. - // Must be a power of two, and either 0 or at least 1. If 0, implicit production - // (using the enqueue methods without an explicit producer token) is disabled. - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; - - // Controls the number of items that an explicit consumer (i.e. one with a token) - // must consume before it causes all consumers to rotate and move on to the next - // internal queue. 
- static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; - - // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. - // Enqueue operations that would cause this limit to be surpassed will fail. Note - // that this limit is enforced at the block level (for performance reasons), i.e. - // it's rounded up to the nearest block size. - static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; - - // The number of times to spin before sleeping when waiting on a semaphore. - // Recommended values are on the order of 1000-10000 unless the number of - // consumer threads exceeds the number of idle cores (in which case try 0-100). - // Only affects instances of the BlockingConcurrentQueue. - static const int MAX_SEMA_SPINS = 10000; - - // Whether to recycle dynamically-allocated blocks into an internal free list or - // not. If false, only pre-allocated blocks (controlled by the constructor - // arguments) will be recycled, and all others will be `free`d back to the heap. - // Note that blocks consumed by explicit producers are only freed on destruction - // of the queue (not following destruction of the token) regardless of this trait. - static const bool RECYCLE_ALLOCATED_BLOCKS = false; - - +struct ConcurrentQueueDefaultTraits { + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. 
+ static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try 0-100). + // Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + + // Whether to recycle dynamically-allocated blocks into an internal free list or + // not. If false, only pre-allocated blocks (controlled by the constructor + // arguments) will be recycled, and all others will be `free`d back to the heap. + // Note that blocks consumed by explicit producers are only freed on destruction + // of the queue (not following destruction of the token) regardless of this trait. + static const bool RECYCLE_ALLOCATED_BLOCKS = false; + #ifndef MCDBGQ_USE_RELACY - // Memory allocation can be customized if needed. - // malloc should return nullptr on failure, and handle alignment like std::malloc. + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. #if defined(malloc) || defined(free) - // Gah, this is 2015, stop defining macros that break standard code already! - // Work around malloc/free being special macros: - static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } - static inline void WORKAROUND_free(void* ptr) { return free(ptr); } - static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } - static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } + // Gah, this is 2015, stop defining macros that break standard code already! 
+ // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void*(malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void(free)(void* ptr) { return WORKAROUND_free(ptr); } #else - static inline void* malloc(size_t size) { return std::malloc(size); } - static inline void free(void* ptr) { return std::free(ptr); } + static inline void* malloc(size_t size) { return std::malloc(size); } + static inline void free(void* ptr) { return std::free(ptr); } #endif #else - // Debug versions when running under the Relacy race detector (ignore - // these in user code) - static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } - static inline void free(void* ptr) { return rl::rl_free(ptr, $); } + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void* ptr) { return rl::rl_free(ptr, $); } #endif }; - // When producing or consuming many elements, the most efficient way is to: // 1) Use one of the bulk-operation methods of the queue with a token // 2) Failing that, use the bulk-operation methods without a token @@ -421,3322 +525,3554 @@ struct ConcurrentQueueDefaultTraits struct ProducerToken; struct ConsumerToken; -template class ConcurrentQueue; -template class BlockingConcurrentQueue; +template +class ConcurrentQueue; +template +class BlockingConcurrentQueue; class ConcurrentQueueTests; +namespace details { +struct ConcurrentQueueProducerTypelessBase { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + + ConcurrentQueueProducerTypelessBase(): next(nullptr), inactive(false), token(nullptr) {} +}; + +template +struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } +}; +template <> +struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } +}; +template +struct hash_32_or_64: public _hash_32_or_64<(size > 4)> {}; + +static inline size_t hash_thread_id(thread_id_t id) +{ + static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( + thread_id_converter::prehash(id))); +} + +template +static inline bool circular_less_than(T a, T b) +{ + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << (static_cast(sizeof(T) * CHAR_BIT - 1))); + // Note: extra parens around rhs of operator<< is MSVC bug: + // https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 + // silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no 
effect when + // done here. +} + +template +static inline char* align_for(char* ptr) +{ + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; +} + +template +static inline T ceil_to_pow_2(T x) +{ + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; +} + +template +static inline void swap_relaxed(std::atomic& left, std::atomic& right) +{ + T temp = left.load(std::memory_order_relaxed); + left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); + right.store(temp, std::memory_order_relaxed); +} + +template +static inline T const& nomove(T const& x) +{ + return x; +} + +template +struct nomove_if { + template + static inline T const& eval(T const& x) + { + return x; + } +}; + +template <> +struct nomove_if { + template + static inline auto eval(U&& x) -> decltype(std::forward(x)) + { + return std::forward(x); + } +}; -namespace details +template +static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) { - struct ConcurrentQueueProducerTypelessBase - { - ConcurrentQueueProducerTypelessBase* next; - std::atomic inactive; - ProducerToken* token; - - ConcurrentQueueProducerTypelessBase() - : next(nullptr), inactive(false), token(nullptr) - { - } - }; - - template struct _hash_32_or_64 { - static inline std::uint32_t hash(std::uint32_t h) - { - // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp - // Since the thread ID is already unique, all we really want to do is propagate that - // uniqueness evenly across all the bits, so that we can use a subset of the bits while - // reducing collisions significantly - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - return h ^ (h >> 16); - } - }; - template<> struct _hash_32_or_64<1> { - static inline std::uint64_t hash(std::uint64_t h) - { - h ^= h >> 33; - h *= 0xff51afd7ed558ccd; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53; - return h ^ (h >> 33); - } - }; - template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; - - static inline size_t hash_thread_id(thread_id_t id) - { - static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); - return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( - thread_id_converter::prehash(id))); - } - - template - static inline bool circular_less_than(T a, T b) - { - static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); - return static_cast(a - b) > static_cast(static_cast(1) << (static_cast(sizeof(T) * CHAR_BIT - 1))); - // Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 - // silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here. 
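// Worked example of the circular comparison above: with T = std::uint8_t,
// circular_less_than(250, 2) is true, since static_cast<std::uint8_t>(250 - 2)
// == 248 > 128 (= 1 << 7); once the unsigned index wraps, 2 is "ahead of" 250,
// which a plain 250 < 2 would get backwards.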
- } - - template - static inline char* align_for(char* ptr) - { - const std::size_t alignment = std::alignment_of::value; - return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; - } - - template - static inline T ceil_to_pow_2(T x) - { - static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); - - // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - for (std::size_t i = 1; i < sizeof(T); i <<= 1) { - x |= x >> (i << 3); - } - ++x; - return x; - } - - template - static inline void swap_relaxed(std::atomic& left, std::atomic& right) - { - T temp = left.load(std::memory_order_relaxed); - left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); - right.store(temp, std::memory_order_relaxed); - } - - template - static inline T const& nomove(T const& x) - { - return x; - } - - template - struct nomove_if - { - template - static inline T const& eval(T const& x) - { - return x; - } - }; - - template<> - struct nomove_if - { - template - static inline auto eval(U&& x) - -> decltype(std::forward(x)) - { - return std::forward(x); - } - }; - - template - static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) - { - return *it; - } - + return *it; +} + #if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) - template struct is_trivially_destructible : std::is_trivially_destructible { }; +template +struct is_trivially_destructible: std::is_trivially_destructible {}; #else - template struct is_trivially_destructible : std::has_trivial_destructor { }; +template +struct is_trivially_destructible: std::has_trivial_destructor {}; #endif - + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #ifdef MCDBGQ_USE_RELACY - typedef RelacyThreadExitListener ThreadExitListener; - typedef RelacyThreadExitNotifier ThreadExitNotifier; +typedef RelacyThreadExitListener ThreadExitListener; +typedef RelacyThreadExitNotifier ThreadExitNotifier; #else - class ThreadExitNotifier; - - struct ThreadExitListener - { - typedef void (*callback_t)(void*); - callback_t callback; - void* userData; - - ThreadExitListener* next; // reserved for use by the ThreadExitNotifier - ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier - }; - - class ThreadExitNotifier - { - public: - static void subscribe(ThreadExitListener* listener) - { - auto& tlsInst = instance(); - std::lock_guard guard(mutex()); - listener->next = tlsInst.tail; - listener->chain = &tlsInst; - tlsInst.tail = listener; - } - - static void unsubscribe(ThreadExitListener* listener) - { - std::lock_guard guard(mutex()); - if (!listener->chain) { - return; // race with ~ThreadExitNotifier - } - auto& tlsInst = *listener->chain; - listener->chain = nullptr; - ThreadExitListener** prev = &tlsInst.tail; - for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { - if (ptr == listener) { - *prev = ptr->next; - break; - } - prev = &ptr->next; - } - } - - private: - ThreadExitNotifier() : tail(nullptr) { } - ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - - ~ThreadExitNotifier() - { - // This thread is about to exit, let everyone know! - assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); - std::lock_guard guard(mutex()); - for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { - ptr->chain = nullptr; - ptr->callback(ptr->userData); - } - } - - // Thread-local - static inline ThreadExitNotifier& instance() - { - static thread_local ThreadExitNotifier notifier; - return notifier; - } - - static inline std::mutex& mutex() - { - // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called - static std::mutex mutex; - return mutex; - } - - private: - ThreadExitListener* tail; - }; -#endif -#endif - - template struct static_is_lock_free_num { enum { value = 0 }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; - template struct static_is_lock_free : static_is_lock_free_num::type> { }; - template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; - template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; -} +class ThreadExitNotifier; +struct ThreadExitListener { + typedef void (*callback_t)(void*); + callback_t callback; + void* userData; -struct ProducerToken -{ - template - explicit ProducerToken(ConcurrentQueue& queue); - - template - explicit ProducerToken(BlockingConcurrentQueue& queue); - - ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - : producer(other.producer) - { - other.producer = nullptr; - if (producer != nullptr) { - producer->token = this; - } - } - - inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(producer, other.producer); - if (producer != nullptr) { - producer->token = this; - } - if (other.producer != nullptr) { - other.producer->token = &other; - } - } - - // A token is always valid unless: - // 1) Memory allocation failed during construction - // 2) It was moved via the move constructor - // (Note: assignment does a swap, leaving both potentially valid) - // 3) The associated queue was destroyed - // Note that if valid() returns true, that only indicates - // that the token is valid for use with a specific queue, - // but not which one; that's up to the user to track. 
- inline bool valid() const { return producer != nullptr; } - - ~ProducerToken() - { - if (producer != nullptr) { - producer->token = nullptr; - producer->inactive.store(true, std::memory_order_release); - } - } - - // Disable copying and assignment - ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - -private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - -protected: - details::ConcurrentQueueProducerTypelessBase* producer; + ThreadExitListener* next; // reserved for use by the ThreadExitNotifier + ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier }; +class ThreadExitNotifier { +public: + static void subscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + std::lock_guard guard(mutex()); + listener->next = tlsInst.tail; + listener->chain = &tlsInst; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener* listener) + { + std::lock_guard guard(mutex()); + if (!listener->chain) { + return; // race with ~ThreadExitNotifier + } + auto& tlsInst = *listener->chain; + listener->chain = nullptr; + ThreadExitListener** prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } -struct ConsumerToken -{ - template - explicit ConsumerToken(ConcurrentQueue& q); - - template - explicit ConsumerToken(BlockingConcurrentQueue& q); - - ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) - { - } - - inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(initialOffset, other.initialOffset); - std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); - std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); - std::swap(currentProducer, other.currentProducer); - std::swap(desiredProducer, other.desiredProducer); - } - - // Disable copying and assignment - ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; +private: + ThreadExitNotifier(): tail(nullptr) {} + ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert( + this == &instance() && + "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that " + "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + std::lock_guard guard(mutex()); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->chain = nullptr; + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier& instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + static inline std::mutex& mutex() + { + // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called + static std::mutex mutex; + return mutex; + } private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - -private: // but shared with ConcurrentQueue - std::uint32_t initialOffset; - std::uint32_t lastKnownGlobalOffset; - std::uint32_t itemsConsumedFromCurrent; - details::ConcurrentQueueProducerTypelessBase* currentProducer; - details::ConcurrentQueueProducerTypelessBase* desiredProducer; + ThreadExitListener* tail; }; +#endif +#endif -// Need to forward-declare this swap because it's in a namespace. -// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces -template -inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; +template +struct static_is_lock_free_num { + enum { value = 0 }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_CHAR_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_SHORT_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_INT_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_LONG_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_LLONG_LOCK_FREE }; +}; +template +struct static_is_lock_free: static_is_lock_free_num::type> {}; +template <> +struct static_is_lock_free { + enum { value = ATOMIC_BOOL_LOCK_FREE }; +}; +template +struct static_is_lock_free { + enum { value = ATOMIC_POINTER_LOCK_FREE }; +}; +} // namespace details + +struct ProducerToken { + template + explicit ProducerToken(ConcurrentQueue& queue); + + template + explicit ProducerToken(BlockingConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT: producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. 
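// A minimal sketch of the token lifecycle described above (queue element type
// and names chosen arbitrarily):
//
//     moodycamel::ConcurrentQueue<int> q;
//     moodycamel::ProducerToken ptok(q); // check ptok.valid() if allocation may fail
//     q.enqueue(ptok, 17);               // elements from one token dequeue in FIFO order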
+    inline bool valid() const { return producer != nullptr; }
+
+    ~ProducerToken()
+    {
+        if (producer != nullptr) {
+            producer->token = nullptr;
+            producer->inactive.store(true, std::memory_order_release);
+        }
+    }
+
+    // Disable copying and assignment
+    ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+    ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+private:
+    template <typename T, typename Traits>
+    friend class ConcurrentQueue;
+    friend class ConcurrentQueueTests;

-template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
-class ConcurrentQueue
-{
+protected:
+    details::ConcurrentQueueProducerTypelessBase* producer;
+};
+
+struct ConsumerToken {
+    template <typename T, typename Traits>
+    explicit ConsumerToken(ConcurrentQueue<T, Traits>& q);
+
+    template <typename T, typename Traits>
+    explicit ConsumerToken(BlockingConcurrentQueue<T, Traits>& q);
+
+    ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT: initialOffset(other.initialOffset),
+                                                              lastKnownGlobalOffset(other.lastKnownGlobalOffset),
+                                                              itemsConsumedFromCurrent(other.itemsConsumedFromCurrent),
+                                                              currentProducer(other.currentProducer),
+                                                              desiredProducer(other.desiredProducer)
+    {
+    }
+
+    inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+    {
+        swap(other);
+        return *this;
+    }
+
+    void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT
+    {
+        std::swap(initialOffset, other.initialOffset);
+        std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset);
+        std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent);
+        std::swap(currentProducer, other.currentProducer);
+        std::swap(desiredProducer, other.desiredProducer);
+    }
+
+    // Disable copying and assignment
+    ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+    ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+
+private:
+    template <typename T, typename Traits>
+    friend class ConcurrentQueue;
+    friend class ConcurrentQueueTests;
+
+private: // but shared with ConcurrentQueue
+    std::uint32_t initialOffset;
+    std::uint32_t lastKnownGlobalOffset;
+    std::uint32_t itemsConsumedFromCurrent;
+    details::ConcurrentQueueProducerTypelessBase* currentProducer;
+    details::ConcurrentQueueProducerTypelessBase* desiredProducer;
+};
+
+// Need to forward-declare this swap because it's in a namespace.
+// See
+// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
+template <typename T, typename Traits>
+inline void swap(
+    typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
+    typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT;
+
+template <typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class ConcurrentQueue {
 public:
-    typedef ::moodycamel::ProducerToken producer_token_t;
-    typedef ::moodycamel::ConsumerToken consumer_token_t;
-
-    typedef typename Traits::index_t index_t;
-    typedef typename Traits::size_t size_t;
-
-    static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);
-    static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
-    static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
-    static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
-    static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
-    static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);
+    typedef ::moodycamel::ProducerToken producer_token_t;
+    typedef ::moodycamel::ConsumerToken consumer_token_t;
+
+    typedef typename Traits::index_t index_t;
+    typedef typename Traits::size_t size_t;
+
+    static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);
+    static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD =
+        static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
+    static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
+    static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
+    static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE =
+        static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
+    static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE =
+        static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);

 #ifdef _MSC_VER
 #pragma warning(push)
-#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!)
-#pragma warning(disable: 4309) // static_cast: Truncation of constant value
+#pragma warning(disable : 4307) // + integral constant overflow (that's what the ternary expression is for!)
+#pragma warning(disable : 4309) // static_cast: Truncation of constant value
 #endif
-    static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max<size_t>::value : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE);
+    static const size_t MAX_SUBQUEUE_SIZE =
+        (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ?
+        details::const_numeric_max<size_t>::value :
+        ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE);
 #ifdef _MSC_VER
 #pragma warning(pop)
 #endif

-    static_assert(!std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value, "Traits::size_t must be an unsigned integral type");
-    static_assert(!std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value, "Traits::index_t must be an unsigned integral type");
-    static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t");
-    static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
-    static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)");
-    static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
-    static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
-    static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
-    static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)");
+    static_assert(
+        !std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value,
+        "Traits::size_t must be an unsigned integral type");
+    static_assert(
+        !std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value,
+        "Traits::index_t must be an unsigned integral type");
+    static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t");
+    static_assert(
+        (BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)),
+        "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
+    static_assert(
+        (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) &&
+            !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)),
+        "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)");
+    static_assert(
+        (EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)),
+        "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+    static_assert(
+        (IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)),
+        "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+    static_assert(
+        (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) ||
+            !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)),
+        "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
+    static_assert(
+        INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1,
+        "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)");

 public:
-    // Creates a queue with at least `capacity` element slots; note that the
-    // actual number of elements that can be inserted without additional memory
-    // allocation
depends on the number of producers and the block size (e.g. if - // the block size is equal to `capacity`, only a single block will be allocated - // up-front, which means only a single producer will be able to enqueue elements - // without an extra allocation -- blocks aren't shared between producers). - // This method is not thread safe -- it is up to the user to ensure that the - // queue is fully constructed before it starts being used by other threads (this - // includes making the memory effects of construction visible, possibly with a - // memory barrier). - explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); - + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) + : producerListTail(nullptr) + , producerCount(0) + , initialBlockPoolIndex(0) + , nextExplicitConsumerId(0) + , globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - // Track all the producers using a fully-resolved typed list for - // each kind; this makes it possible to debug them starting from - // the root queue object (otherwise wacky casts are needed that - // don't compile in the debugger's expression evaluator). - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Computes the correct amount of pre-allocated blocks for you based - // on the minimum number of elements you want available at any given - // time, and the maximum concurrent number of each type of producer. 
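// The two construction styles side by side (sizes illustrative):
//
//     moodycamel::ConcurrentQueue<int> a(1024);       // >= 1024 slots, rounded up to blocks
//     moodycamel::ConcurrentQueue<int> b(1024, 4, 2); // >= 1024 slots available with up to
//                                                     // 4 explicit + 2 implicit producers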
- ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); - populate_initial_block_list(blocks); - + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : producerListTail(nullptr) + , producerCount(0) + , initialBlockPoolIndex(0) + , nextExplicitConsumerId(0) + , globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Note: The queue should not be accessed concurrently while it's - // being deleted. It's up to the user to synchronize this. - // This method is not thread safe. - ~ConcurrentQueue() - { - // Destroy producers - auto ptr = producerListTail.load(std::memory_order_relaxed); - while (ptr != nullptr) { - auto next = ptr->next_prod(); - if (ptr->token != nullptr) { - ptr->token->producer = nullptr; - } - destroy(ptr); - ptr = next; - } - - // Destroy implicit producer hash tables - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { - auto hash = implicitProducerHash.load(std::memory_order_relaxed); - while (hash != nullptr) { - auto prev = hash->prev; - if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically - for (size_t i = 0; i != hash->capacity; ++i) { - hash->entries[i].~ImplicitProducerKVP(); - } - hash->~ImplicitProducerHash(); - (Traits::free)(hash); - } - hash = prev; - } - } - - // Destroy global free list - auto block = freeList.head_unsafe(); - while (block != nullptr) { - auto next = block->freeListNext.load(std::memory_order_relaxed); - if (block->dynamicallyAllocated) { - destroy(block); - } - block = next; - } - - // Destroy initial free list - destroy_array(initialBlockPool, initialBlockPoolSize); - } - - // Disable copying and copy assignment - ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - - // Moving is supported, but note that it is *not* a thread-safe operation. 
- // Nobody can use the queue while it's being moved, and the memory effects - // of that move must be propagated to other threads before they can use it. - // Note: When a queue is moved, its tokens are still valid but can only be - // used with the destination queue (i.e. semantically they are moved along - // with the queue itself). - ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), - producerCount(other.producerCount.load(std::memory_order_relaxed)), - initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), - initialBlockPool(other.initialBlockPool), - initialBlockPoolSize(other.initialBlockPoolSize), - freeList(std::move(other.freeList)), - nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), - globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) - { - // Move the other one into this, and leave the other one as an empty queue - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - swap_implicit_producer_hashes(other); - - other.producerListTail.store(nullptr, std::memory_order_relaxed); - other.producerCount.store(0, std::memory_order_relaxed); - other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); - other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); - + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) + { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
+ ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - - other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); - other.initialBlockPoolSize = 0; - other.initialBlockPool = nullptr; - - reown_producers(); - } - - inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - { - return swap_internal(other); - } - - // Swaps this queue's state with the other's. Not thread-safe. - // Swapping two queues does not invalidate their tokens, however - // the tokens that were created for one queue must be used with - // only the swapped queue (i.e. the tokens are tied to the - // queue's movable state, not the object itself). - inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT - { - swap_internal(other); - } - + explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT { return swap_internal(other); } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). 
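// Sketch of the token rebinding described above (not thread-safe, as noted;
// names are illustrative):
//
//     moodycamel::ConcurrentQueue<int> a, b;
//     moodycamel::ProducerToken tok(a);
//     a.swap(b);         // tokens follow the swapped state...
//     b.enqueue(tok, 1); // ...so tok is now usable only with b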
+ inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT { swap_internal(other); } + private: - ConcurrentQueue& swap_internal(ConcurrentQueue& other) - { - if (this == &other) { - return *this; - } - - details::swap_relaxed(producerListTail, other.producerListTail); - details::swap_relaxed(producerCount, other.producerCount); - details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); - std::swap(initialBlockPool, other.initialBlockPool); - std::swap(initialBlockPoolSize, other.initialBlockPoolSize); - freeList.swap(other.freeList); - details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); - details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); - - swap_implicit_producer_hashes(other); - - reown_producers(); - other.reown_producers(); - + ConcurrentQueue& swap_internal(ConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - details::swap_relaxed(explicitProducers, other.explicitProducers); - details::swap_relaxed(implicitProducers, other.implicitProducers); + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); #endif - - return *this; - } - -public: - // Enqueues a single item (by copying it). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. 
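// The allocating vs. non-allocating split, in short (q as in earlier sketches):
//
//     q.enqueue(42);               // may allocate a block; fails only on allocation failure
//     bool ok = q.try_enqueue(43); // never allocates a block (beyond the one-time
//                                  // implicit producer); false when there's no room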
- inline bool enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Allocates memory if required. Only fails if memory allocation fails (or - // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved instead of copied. - // Thread-safe. - template - bool enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails - // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - // Enqueues a single item (by copying it). - // Does not allocate memory. Fails if not enough room to enqueue (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0). - // Thread-safe. - inline bool try_enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Thread-safe. - inline bool try_enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. 
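// The std::make_move_iterator note spelled out (assuming a
// ConcurrentQueue<std::string> named qs):
//
//     std::vector<std::string> batch(8, "payload");
//     qs.enqueue_bulk(std::make_move_iterator(batch.begin()), batch.size()); // moves, not copies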
- template - bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - bool try_dequeue(U& item) - { - // Instead of simply trying each producer in turn (which could cause needless contention on the first - // producer), we score them heuristically. - size_t nonEmptyCount = 0; - ProducerBase* best = nullptr; - size_t bestSize = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { - auto size = ptr->size_approx(); - if (size > 0) { - if (size > bestSize) { - bestSize = size; - best = ptr; - } - ++nonEmptyCount; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (nonEmptyCount > 0) { - if ((details::likely)(best->dequeue(item))) { - return true; - } - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr != best && ptr->dequeue(item)) { - return true; - } - } - } - return false; - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // This differs from the try_dequeue(item) method in that this one does - // not attempt to reduce contention by interleaving the order that producer - // streams are dequeued from. So, using this method can reduce overall throughput - // under contention, but will give more predictable results in single-threaded - // consumer scenarios. This is mostly only useful for internal unit tests. - // Never allocates. Thread-safe. - template - bool try_dequeue_non_interleaved(U& item) - { - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->dequeue(item)) { - return true; - } - } - return false; - } - - // Attempts to dequeue from the queue using an explicit consumer token. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
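// Typical drain loop for the token-based dequeue documented above:
//
//     moodycamel::ConsumerToken ctok(q);
//     int item;
//     while (q.try_dequeue(ctok, item)) {
//         // process item; a false return only means the queue *appeared* empty
//     }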
- template - bool try_dequeue(consumer_token_t& token, U& item) - { - // The idea is roughly as follows: - // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less - // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place - // If there's no items where you're supposed to be, keep moving until you find a producer with some items - // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it - - if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return false; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (static_cast(token.currentProducer)->dequeue(item)) { - if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return true; - } - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - if (ptr->dequeue(item)) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = 1; - return true; - } - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return false; - } - - // Attempts to dequeue several elements from the queue. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - size_t try_dequeue_bulk(It itemFirst, size_t max) - { - size_t count = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - count += ptr->dequeue_bulk(itemFirst, max - count); - if (count == max) { - break; - } - } - return count; - } - - // Attempts to dequeue several elements from the queue using an explicit consumer token. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
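// Bulk counterpart, one call instead of a loop (buffer size illustrative):
//
//     int items[32];
//     size_t n = q.try_dequeue_bulk(items, 32); // returns how many were actually dequeued
//     for (size_t i = 0; i != n; ++i) { /* process items[i] */ }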
- template - size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) - { - if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return 0; - } - } - - size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); - if (count == max) { - if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return max; - } - token.itemsConsumedFromCurrent += static_cast(count); - max -= count; - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - auto dequeued = ptr->dequeue_bulk(itemFirst, max); - count += dequeued; - if (dequeued != 0) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = static_cast(dequeued); - } - if (dequeued == max) { - break; - } - max -= dequeued; - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return count; - } - - - - // Attempts to dequeue from a specific producer's inner queue. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns false if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) - { - return static_cast(producer.producer)->dequeue(item); - } - - // Attempts to dequeue several elements from a specific producer's inner queue. - // Returns the number of items actually dequeued. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns 0 if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) - { - return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); - } - - - // Returns an estimate of the total number of elements currently in the queue. This - // estimate is only accurate if the queue has completely stabilized before it is called - // (i.e. all enqueue and dequeue operations have completed and their memory effects are - // visible on the calling thread, and no further operations start while this method is - // being called). - // Thread-safe. - size_t size_approx() const - { - size_t size = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - size += ptr->size_approx(); - } - return size; - } - - - // Returns true if the underlying atomic variables used by - // the queue are lock-free (they should be on most platforms). - // Thread-safe. 
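// Because is_lock_free() is constexpr, the property can be checked at compile
// time; on mainstream platforms this assertion is expected to hold:
//
//     static_assert(moodycamel::ConcurrentQueue<int>::is_lock_free(),
//                   "expected lock-free atomics on this platform");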
- static constexpr bool is_lock_free() - { - return - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; - } + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) { return inner_enqueue(token, item); } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. 
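// Fastest producer path per the bulk/token guidance earlier in this header
// (names illustrative):
//
//     moodycamel::ProducerToken ptok2(q);
//     int vals[4] = {1, 2, 3, 4};
//     q.enqueue_bulk(ptok2, vals, 4); // one call, one token: least overhead per element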
+ inline bool try_enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U& item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. + size_t nonEmptyCount = 0; + ProducerBase* best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; + ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. 
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // This differs from the try_dequeue(item) method in that this one does
+    // not attempt to reduce contention by interleaving the order that producer
+    // streams are dequeued from. So, using this method can reduce overall throughput
+    // under contention, but will give more predictable results in single-threaded
+    // consumer scenarios. This is mostly only useful for internal unit tests.
+    // Never allocates. Thread-safe.
+    template<typename U>
+    bool try_dequeue_non_interleaved(U& item)
+    {
+        for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+            if (ptr->dequeue(item)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue from the queue using an explicit consumer token.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename U>
+    bool try_dequeue(consumer_token_t& token, U& item)
+    {
+        // The idea is roughly as follows:
+        // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the
+        // highest-efficiency consumer dictates the rotation speed of everyone else, more or less. If you see that
+        // the global offset has changed, you must reset your consumption counter and move to your designated place.
+        // If there are no items where you're supposed to be, keep moving until you find a producer with some items.
+        // If the global offset has not changed but you've run out of items to consume, move over from your current
+        // position until you find a producer with something in it.
+
+        if (token.desiredProducer == nullptr ||
+            token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+            if (!update_current_producer_after_rotation(token)) {
+                return false;
+            }
+        }
+
+        // If there was at least one non-empty queue but it appears empty at the time
+        // we try to dequeue from it, we need to make sure every queue's been tried
+        if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
+            if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+                globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+            }
+            return true;
+        }
+
+        auto tail = producerListTail.load(std::memory_order_acquire);
+        auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+        if (ptr == nullptr) {
+            ptr = tail;
+        }
+        while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+            if (ptr->dequeue(item)) {
+                token.currentProducer = ptr;
+                token.itemsConsumedFromCurrent = 1;
+                return true;
+            }
+            ptr = ptr->next_prod();
+            if (ptr == nullptr) {
+                ptr = tail;
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue several elements from the queue.
+    // Returns the number of items actually dequeued.
+    // Returns 0 if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename It>
+    size_t try_dequeue_bulk(It itemFirst, size_t max)
+    {
+        size_t count = 0;
+        for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+            count += ptr->dequeue_bulk(itemFirst, max - count);
+            if (count == max) {
+                break;
+            }
+        }
+        return count;
+    }
+
+    // Attempts to dequeue several elements from the queue using an explicit consumer token.
+ // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); + } + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
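+    // Illustrative compile-time check (hypothetical caller code):
+    //     static_assert(moodycamel::ConcurrentQueue<int>::is_lock_free(),
+    //                   "expected lock-free atomics on this platform");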
+    static constexpr bool is_lock_free()
+    {
+        return details::static_is_lock_free<bool>::value == 2 && details::static_is_lock_free<size_t>::value == 2 &&
+               details::static_is_lock_free<std::uint32_t>::value == 2 &&
+               details::static_is_lock_free<index_t>::value == 2 && details::static_is_lock_free<void*>::value == 2 &&
+               details::static_is_lock_free<
+                   typename details::thread_id_converter<details::thread_id_t>::thread_id_numeric_size_t>::value == 2;
+    }

 private:
-    friend struct ProducerToken;
-    friend struct ConsumerToken;
-    struct ExplicitProducer;
-    friend struct ExplicitProducer;
-    struct ImplicitProducer;
-    friend struct ImplicitProducer;
-    friend class ConcurrentQueueTests;
-
-    enum AllocationMode { CanAlloc, CannotAlloc };
-
-
-    ///////////////////////////////
-    // Queue methods
-    ///////////////////////////////
-
-    template<AllocationMode canAlloc, typename U>
-    inline bool inner_enqueue(producer_token_t const& token, U&& element)
-    {
-        return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
-    }
-
-    template<AllocationMode canAlloc, typename U>
-    inline bool inner_enqueue(U&& element)
-    {
-        auto producer = get_or_add_implicit_producer();
-        return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
-    }
-
-    template<AllocationMode canAlloc, typename It>
-    inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
-    {
-        return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);
-    }
-
-    template<AllocationMode canAlloc, typename It>
-    inline bool inner_enqueue_bulk(It itemFirst, size_t count)
-    {
-        auto producer = get_or_add_implicit_producer();
-        return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);
-    }
-
-    inline bool update_current_producer_after_rotation(consumer_token_t& token)
-    {
-        // Ah, there's been a rotation, figure out where we should be!
-        auto tail = producerListTail.load(std::memory_order_acquire);
-        if (token.desiredProducer == nullptr && tail == nullptr) {
-            return false;
-        }
-        auto prodCount = producerCount.load(std::memory_order_relaxed);
-        auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed);
-        if ((details::unlikely)(token.desiredProducer == nullptr)) {
-            // Aha, first time we're dequeueing anything.
-            // Figure out our local position
-            // Note: offset is from start, not end, but we're traversing from end -- subtract from count first
-            std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount);
-            token.desiredProducer = tail;
-            for (std::uint32_t i = 0; i != offset; ++i) {
-                token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
-                if (token.desiredProducer == nullptr) {
-                    token.desiredProducer = tail;
-                }
-            }
-        }
-
-        std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;
-        if (delta >= prodCount) {
-            delta = delta % prodCount;
-        }
-        for (std::uint32_t i = 0; i != delta; ++i) {
-            token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
-            if (token.desiredProducer == nullptr) {
-                token.desiredProducer = tail;
-            }
-        }
-
-        token.lastKnownGlobalOffset = globalOffset;
-        token.currentProducer = token.desiredProducer;
-        token.itemsConsumedFromCurrent = 0;
-        return true;
-    }
-
-
-    ///////////////////////////
-    // Free list
-    ///////////////////////////
-
-    template<typename N>
-    struct FreeListNode
-    {
-        FreeListNode() : freeListRefs(0), freeListNext(nullptr) { }
-
-        std::atomic<std::uint32_t> freeListRefs;
-        std::atomic<N*> freeListNext;
-    };
-
-    // A simple CAS-based lock-free free list.
Not the fastest thing in the world under heavy contention, but - // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly - // speedy under low contention. - template // N must inherit FreeListNode or have the same fields (and initialization of them) - struct FreeList - { - FreeList() : freeListHead(nullptr) { } - FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } - void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } - - FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - - inline void add(N* node) - { + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const& token, U&& element) + { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue(U&& element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? + false : + producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? + false : + producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. 
+ // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode { + FreeListNode(): freeListRefs(0), freeListNext(nullptr) {} + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList { + FreeList(): freeListHead(nullptr) {} + FreeList(FreeList&& other): freeListHead(other.freeListHead.load(std::memory_order_relaxed)) + { + other.freeListHead.store(nullptr, std::memory_order_relaxed); + } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to - // set it using a fetch_add - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { - // Oh look! We were the last ones referencing this node, and we know - // we want to add it to the free list, so let's do it! - add_knowing_refcount_is_zero(node); - } - } - - inline N* try_get() - { + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! 
+ add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - auto head = freeListHead.load(std::memory_order_acquire); - while (head != nullptr) { - auto prevHead = head; - auto refs = head->freeListRefs.load(std::memory_order_relaxed); - if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) { - head = freeListHead.load(std::memory_order_acquire); - continue; - } - - // Good, reference count has been incremented (it wasn't at zero), which means we can read the - // next and not worry about it changing between now and the time we do the CAS - auto next = head->freeListNext.load(std::memory_order_relaxed); - if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { - // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no - // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). - assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); - - // Decrease refcount twice, once for our ref, and once for the list's ref - head->freeListRefs.fetch_sub(2, std::memory_order_release); - return head; - } - - // OK, the head must have changed on us, but we still need to decrease the refcount we increased. - // Note that we don't need to release any memory effects, but we do need to ensure that the reference - // count decrement happens-after the CAS on the head. - refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); - if (refs == SHOULD_BE_ON_FREELIST + 1) { - add_knowing_refcount_is_zero(prevHead); - } - } - - return nullptr; - } - - // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) - N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } - - private: - inline void add_knowing_refcount_is_zero(N* node) - { - // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run - // only one copy of this method per node at a time, i.e. the single thread case), then we know - // we can safely change the next pointer of the node; however, once the refcount is back above - // zero, then other threads could increase it (happens under heavy contention, when the refcount - // goes to zero in between a load and a refcount increment of a node in try_get, then back up to - // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS - // to add the node to the actual list fails, decrease the refcount and leave the add operation to - // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
- auto head = freeListHead.load(std::memory_order_relaxed); - while (true) { - node->freeListNext.store(head, std::memory_order_relaxed); - node->freeListRefs.store(1, std::memory_order_release); - if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { - // Hmm, the add failed, but we can only try again when the refcount goes back to zero - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) { - continue; - } - } - return; - } - } - - private: - // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) - std::atomic freeListHead; - - static const std::uint32_t REFS_MASK = 0x7FFFFFFF; - static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; - + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || + !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong( + head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put + // back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
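+            // (Layout note: freeListRefs packs a 31-bit reference count (REFS_MASK) and the
+            // SHOULD_BE_ON_FREELIST flag in the top bit, so the fetch_add(SHOULD_BE_ON_FREELIST - 1)
+            // below drops our reference and raises the flag in a single atomic step.)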
+ auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong( + head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under + // contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugMutex mutex; -#endif - }; - - - /////////////////////////// - // Block - /////////////////////////// - - enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; - - struct Block - { - Block() - : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true) - { + debug::DebugMutex mutex; +#endif + }; + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block { + Block() + : next(nullptr) + , elementsCompletelyDequeued(0) + , freeListRefs(0) + , freeListNext(nullptr) + , dynamicallyAllocated(true) + { #ifdef MCDBGQ_TRACKMEM - owner = nullptr; -#endif - } - - template - inline bool is_empty() const - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Check flags - for (size_t i = 0; i < BLOCK_SIZE; ++i) { - if (!emptyFlags[i].load(std::memory_order_relaxed)) { - return false; - } - } - - // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - else { - // Check counter - if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); - return false; - } - } - - // Returns true if the block is now empty (does not apply in explicit context) - template - inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flag - assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); - emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); - return false; - } - else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); - assert(prevVal < BLOCK_SIZE); - return prevVal == BLOCK_SIZE - 1; - } - } - - // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). - // Returns true if the block is now empty (does not apply in explicit context). 
- template - inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flags - std::atomic_thread_fence(std::memory_order_release); - i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; - for (size_t j = 0; j != count; ++j) { - assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); - emptyFlags[i + j].store(true, std::memory_order_relaxed); - } - return false; - } - else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel); - assert(prevVal + count <= BLOCK_SIZE); - return prevVal + count == BLOCK_SIZE; - } - } - - template - inline void set_all_empty() - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set all flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(true, std::memory_order_relaxed); - } - } - else { - // Reset counter - elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); - } - } - - template - inline void reset_empty() - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Reset flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(false, std::memory_order_relaxed); - } - } - else { - // Reset counter - elementsCompletelyDequeued.store(0, std::memory_order_relaxed); - } - } - - inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } - inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } - - private: - static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their size at this time"); - MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; - public: - Block* next; - std::atomic elementsCompletelyDequeued; - std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; - public: - std::atomic freeListRefs; - std::atomic freeListNext; - bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' - + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else + { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load( + std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store( + true, std::memory_order_release); + return false; + } + else + { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
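+        // (As with set_empty above: small blocks in the explicit context mark per-slot
+        // emptyFlags, while all other configurations count dequeued elements in
+        // elementsCompletelyDequeued; only the counting path can report "now empty".)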
+ template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else + { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else + { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else + { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT + { + return static_cast(static_cast(elements)) + + static_cast(idx & static_cast(BLOCK_SIZE - 1)); + } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT + { + return static_cast(static_cast(elements)) + + static_cast(idx & static_cast(BLOCK_SIZE - 1)); + } + + private: + static_assert( + std::alignment_of::value <= sizeof(T), + "The queue does not support types with an alignment greater than their size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + + public: + std::atomic freeListRefs; + std::atomic freeListNext; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + #ifdef MCDBGQ_TRACKMEM - void* owner; + void* owner; #endif - }; - static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); - + }; + static_assert( + std::alignment_of::value >= std::alignment_of::value, + "Internal error: Blocks must be at least as aligned as the type they are wrapping"); #ifdef MCDBGQ_TRACKMEM public: - struct MemStats; + struct MemStats; + private: #endif - - /////////////////////////// - // Producer base - /////////////////////////// - - struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase - { - ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : - tailIndex(0), - headIndex(0), - dequeueOptimisticCount(0), - dequeueOvercommit(0), - tailBlock(nullptr), - isExplicit(isExplicit_), - parent(parent_) - { - } - - virtual ~ProducerBase() { } - - template - inline bool dequeue(U& element) - { - if (isExplicit) { - return static_cast(this)->dequeue(element); - } - else { - return static_cast(this)->dequeue(element); - } - } - - template - inline size_t dequeue_bulk(It& itemFirst, size_t max) - { - if (isExplicit) { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - else { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - } - - inline ProducerBase* next_prod() const { return static_cast(next); } - - inline size_t size_approx() const - { - auto tail = tailIndex.load(std::memory_order_relaxed); - auto head = headIndex.load(std::memory_order_relaxed); - return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; - } - - inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } - protected: - std::atomic tailIndex; // Where to enqueue to next - std::atomic headIndex; // Where to dequeue from next - - std::atomic dequeueOptimisticCount; - std::atomic dequeueOvercommit; - - Block* tailBlock; - - public: - bool isExplicit; - ConcurrentQueue* parent; - - protected: + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase: public details::ConcurrentQueueProducerTypelessBase { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) + : tailIndex(0) + , headIndex(0) + , dequeueOptimisticCount(0) + , dequeueOvercommit(0) + , tailBlock(nullptr) + , isExplicit(isExplicit_) + , parent(parent_) + { + } + + virtual ~ProducerBase() {} + + template + inline bool dequeue(U& element) + { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } else { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? 
static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - /////////////////////////// - // Explicit queue - /////////////////////////// - - struct ExplicitProducer : public ProducerBase - { - explicit ExplicitProducer(ConcurrentQueue* parent_) : - ProducerBase(parent_, true), - blockIndex(nullptr), - pr_blockIndexSlotsUsed(0), - pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), - pr_blockIndexFront(0), - pr_blockIndexEntries(nullptr), - pr_blockIndexRaw(nullptr) - { - size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; - if (poolBasedIndexSize > pr_blockIndexSize) { - pr_blockIndexSize = poolBasedIndexSize; - } - - new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE - } - - ~ExplicitProducer() - { - // Destruct any elements not yet dequeued. - // Since we're in the destructor, we can assume all elements - // are either completely dequeued or completely not (no halfways). - if (this->tailBlock != nullptr) { // Note this means there must be a block index too - // First find the block that's partially dequeued, if any - Block* halfDequeuedBlock = nullptr; - if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { - // The head's not on a block boundary, meaning a block somewhere is partially dequeued - // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) - size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); - while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { - i = (i + 1) & (pr_blockIndexSize - 1); - } - assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); - halfDequeuedBlock = pr_blockIndexEntries[i].block; - } - - // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) - auto block = this->tailBlock; - do { - block = block->next; - if (block->ConcurrentQueue::Block::template is_empty()) { - continue; - } - - size_t i = 0; // Offset into block - if (block == halfDequeuedBlock) { - i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - } - - // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index - auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { - (*block)[i++]->~T(); - } - } while (block != this->tailBlock); - } - - // Destroy all blocks that we own - if (this->tailBlock != nullptr) { - auto block = this->tailBlock; - do { - auto nextBlock = block->next; - this->parent->add_block_to_free_list(block); - block = nextBlock; - } while (block != this->tailBlock); - } - - // Destroy the block indices - auto header = static_cast(pr_blockIndexRaw); - while (header != nullptr) { - auto prev = static_cast(header->prev); - header->~BlockIndexHeader(); - (Traits::free)(header); - header = prev; - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto startBlock = this->tailBlock; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - // We can re-use the block ahead of us, it's empty! - this->tailBlock = this->tailBlock->next; - this->tailBlock->ConcurrentQueue::Block::template reset_empty(); - - // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the - // last block from it first -- except instead of removing then adding, we can just overwrite). - // Note that there must be a valid block index here, since even if allocation failed in the ctor, - // it would have been re-attempted when adding the first block to the queue; since there is such - // a block, a block index must have been successfully allocated. - } - else { - // Whatever head value we see here is >= the last value we saw here (relatively), - // and <= its current value. Since we have the most recent tail, the head must be - // <= to it. - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) - || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - // We can't enqueue in another block because there's not enough leeway -- the - // tail could surpass the head by the time the block fills up! (Or we'll exceed - // the size limit, if the second part of the condition was true.) - return false; - } - // We're going to need a new block; check that the block index has room - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { - // Hmm, the circular block index is already full -- we'll need - // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if - // the initial allocation failed in the constructor. 
- - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - return false; - } - else if (!new_block_index(pr_blockIndexSlotsUsed)) { - return false; - } - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - return false; - } + friend struct MemStats; +#endif + }; + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer: public ProducerBase { + explicit ExplicitProducer(ConcurrentQueue* parent_) + : ProducerBase(parent_, true) + , blockIndex(nullptr) + , pr_blockIndexSlotsUsed(0) + , pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1) + , pr_blockIndexFront(0) + , pr_blockIndexEntries(nullptr) + , pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. + // EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). + if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a + // boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than( + pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert( + details::circular_less_than( + pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first + // iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast( + this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach + // the tail index + auto lastValidIndex = + (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
+ BLOCK_SIZE : + static_cast( + this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + this->parent->add_block_to_free_list(block); + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && + this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing + // the last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. + } else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. 
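+                    // (allocMode is the AllocationMode template parameter of enqueue(): the
+                    // try_enqueue family instantiates this path with CannotAlloc, so it bails
+                    // out here rather than growing the block index.)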
+ + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) + { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) + { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } - else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - ++pr_blockIndexSlotsUsed; - } - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - // The constructor may throw. We want the element not to appear in the queue in - // that case (without corrupting the queue): - MOODYCAMEL_TRY { - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH (...) { - // Revert change to the current block, but leave the new block available - // for next time - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock; - MOODYCAMEL_RETHROW; - } - } - else { - (void)startBlock; - (void)originalBlockIndexSlotsUsed; - } - - // Add block to block index - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - // Might be something to dequeue, let's give it a try - - // Note that this if is purely for performance purposes in the common case when the queue is - // empty and the values are eventually consistent -- we may enter here spuriously. - - // Note that whatever the values of overcommit and tail are, they are not going to change (unless we - // change them) and must be the same value at this point (inside the if) as when the if condition was - // evaluated. - - // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. - // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in - // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). - // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all - // read-modify-write operations are guaranteed to work on the latest value in the modification order), but - // unfortunately that can't be shown to be correct using only the C++11 standard. 
- // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case - std::atomic_thread_fence(std::memory_order_acquire); - - // Increment optimistic counter, then check if it went over the boundary - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - - // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever - // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now - // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon - // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. - // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) - // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. - - // Note that we reload tail here in case it changed; it will be the same value as before or greater, since - // this load is sequenced after (happens after) the earlier load above. This is supported by read-read - // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - // Guaranteed to be at least one element to dequeue! - - // Get the index. Note that since there's guaranteed to be at least one element, this - // will never exceed tail. We need to do an acquire-release fence here since it's possible - // that whatever condition got us to this point was for an earlier enqueued element (that - // we already see the memory effects for), but that by the time we increment somebody else - // has incremented it, and we need to see the memory effects for *that* element, which is - // in such a case is necessarily visible on the thread that incremented it in the first - // place with the more current condition (they must have acquired a tail that is at least - // as recent). - auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - - // Determine which block the element is in - - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - // We need to be careful here about subtracting and dividing because of index wrap-around. 
- // When an index wraps, we need to preserve the sign of the offset when dividing it by the - // block size (in order to get a correct signed block count offset in all cases): - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); - auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; - - // Dequeue - auto& el = *((*block)[index]); - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { - // Make sure the element is still fully dequeued and destroyed even if the assignment - // throws - struct Guard { - Block* block; - index_t index; - - ~Guard() - { - (*block)[index]->~T(); - block->ConcurrentQueue::Block::template set_empty(index); - } - } guard = { block, index }; - - element = std::move(el); // NOLINT - } - else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - block->ConcurrentQueue::Block::template set_empty(index); - } - - return true; - } - else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write - } - } - - return false; - } - - template - bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - auto originalBlockIndexFront = pr_blockIndexFront; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - - Block* firstAllocatedBlock = nullptr; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { - // Allocate as many blocks as possible from ahead - while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - this->tailBlock = this->tailBlock->next; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Now allocate as many blocks as necessary from the block pool - while (blockBaseDiff > 0) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - - // pr_blockIndexFront is updated inside new_block_index, so we need to - // update our fallback value too (since we keep the new index even if we - // later fail) - originalBlockIndexFront = originalBlockIndexSlotsUsed; - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) + { + // The constructor may throw. We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY + { + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) + { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
this->tailBlock : startBlock;
+                        MOODYCAMEL_RETHROW;
+                    }
+                }
+                else
+                {
+                    (void)startBlock;
+                    (void)originalBlockIndexSlotsUsed;
+                }
+
+                // Add block to block index
+                auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+                entry.base  = currentTailIndex;
+                entry.block = this->tailBlock;
+                blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release);
+                pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+
+                MOODYCAMEL_CONSTEXPR_IF(
+                    !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element))))
+                {
+                    this->tailIndex.store(newTailIndex, std::memory_order_release);
+                    return true;
+                }
+            }
+
+            // Enqueue
+            new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+
+            this->tailIndex.store(newTailIndex, std::memory_order_release);
+            return true;
+        }
+
+        template <typename U>
+        bool dequeue(U& element)
+        {
+            auto tail = this->tailIndex.load(std::memory_order_relaxed);
+            auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+            if (details::circular_less_than<index_t>(
+                    this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+                // Might be something to dequeue, let's give it a try
+
+                // Note that this if is purely for performance purposes in the common case when the queue is
+                // empty and the values are eventually consistent -- we may enter here spuriously.
+
+                // Note that whatever the values of overcommit and tail are, they are not going to change (unless we
+                // change them) and must be the same value at this point (inside the if) as when the if condition was
+                // evaluated.
+
+                // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit
+                // below. This ensures that whatever the value we got loaded into overcommit, the load of
+                // dequeueOptimisticCount in the fetch_add below will result in a value at least as recent as that (and
+                // therefore at least as large). Note that I believe a compiler (signal) fence here would be sufficient
+                // due to the nature of fetch_add (all read-modify-write operations are guaranteed to work on the latest
+                // value in the modification order), but unfortunately that can't be shown to be correct using only the
+                // C++11 standard. See
+                // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
+                std::atomic_thread_fence(std::memory_order_acquire);
+
+                // Increment optimistic counter, then check if it went over the boundary
+                auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+
+                // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is
+                // only ever incremented after dequeueOptimisticCount -- this is enforced in the `else` block below),
+                // and since we now have a version of dequeueOptimisticCount that is at least as recent as overcommit
+                // (due to the release upon incrementing dequeueOvercommit and the acquire above that synchronizes with
+                // it), overcommit <= myDequeueCount. However, we can't assert this since both dequeueOptimisticCount
+                // and dequeueOvercommit may (independently) overflow; in such a case, though, the logic still holds
+                // since the difference between the two is maintained.
+
+                // Note that we reload tail here in case it changed; it will be the same value as before or greater,
+                // since this load is sequenced after (happens after) the earlier load above. This is supported by
+                // read-read coherency (as defined in the standard), explained here:
+                // http://en.cppreference.com/w/cpp/atomic/memory_order
+                tail = this->tailIndex.load(std::memory_order_acquire);
+                if ((details::likely)(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+                    // Guaranteed to be at least one element to dequeue!
+
+                    // Get the index. Note that since there's guaranteed to be at least one element, this
+                    // will never exceed tail. We need to do an acquire-release fence here since it's possible
+                    // that whatever condition got us to this point was for an earlier enqueued element (that
+                    // we already see the memory effects for), but that by the time we increment somebody else
+                    // has incremented it, and we need to see the memory effects for *that* element, which in
+                    // such a case is necessarily visible on the thread that incremented it in the first
+                    // place with the more current condition (they must have acquired a tail that is at least
+                    // as recent).
+                    auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+
+                    // Determine which block the element is in
+
+                    auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+                    auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+
+                    // We need to be careful here about subtracting and dividing because of index wrap-around.
+                    // When an index wraps, we need to preserve the sign of the offset when dividing it by the
+                    // block size (in order to get a correct signed block count offset in all cases):
+                    auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+                    auto blockBaseIndex = index & ~static_cast<index_t>(BLOCK_SIZE - 1);
+                    auto offset = static_cast<size_t>(
+                        static_cast<typename std::make_signed<index_t>::type>(blockBaseIndex - headBase) /
+                        static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
+                    auto block =
+                        localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block;
+
+                    // Dequeue
+                    auto& el = *((*block)[index]);
+                    if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
+                        // Make sure the element is still fully dequeued and destroyed even if the assignment
+                        // throws
+                        struct Guard {
+                            Block* block;
+                            index_t index;
+
+                            ~Guard()
+                            {
+                                (*block)[index]->~T();
+                                block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+                            }
+                        } guard = {block, index};
+
+                        element = std::move(el);  // NOLINT
+                    } else {
+                        element = std::move(el);  // NOLINT
+                        el.~T();                  // NOLINT
+                        block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+                    }
+
+                    return true;
+                } else {
+                    // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+                    this->dequeueOvercommit.fetch_add(
+                        1, std::memory_order_release);  // Release so that the fetch_add on dequeueOptimisticCount is
+                                                        // guaranteed to happen before this write
+                }
+            }
+
+            return false;
+        }
+
+        template <AllocationMode allocMode, typename It>
+        bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count)
+        {
+            // First, we need to make sure we have enough room to enqueue all of the elements;
+            // this means pre-allocating blocks and putting them in the block index (but only if
+            // all the allocations succeeded).
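
The sign-preservation note in dequeue() above is the subtlest arithmetic in this hunk, so here is a minimal standalone sketch of the same computation. It is not part of the patch: the 8-bit index type, the block size of 32, and the four-entry block index are hypothetical stand-ins chosen so the wrap is easy to see; only the cast-to-signed-before-dividing pattern mirrors the queue's code.

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    int main()
    {
        using index_t = std::uint8_t;        // stand-in for the queue's index type
        constexpr index_t BLOCK_SIZE = 32;   // power of two, as the queue requires
        const std::size_t indexSize = 4;     // circular block-index capacity
        const std::size_t head = 0;          // slot holding the head entry

        index_t headBase = 224;              // base index stored at `head`
        index_t index = 200;                 // element one block *behind* headBase

        index_t blockBase = index & static_cast<index_t>(~(BLOCK_SIZE - 1));  // == 192

        // Casting to the signed counterpart first makes 192 - 224 read as -32
        // rather than 224, so the division yields a block offset of -1:
        auto offset = static_cast<std::size_t>(
            static_cast<std::int8_t>(blockBase - headBase) / static_cast<std::int8_t>(BLOCK_SIZE));

        // The circular mask then resolves the negative offset to the right slot:
        assert(((head + offset) & (indexSize - 1)) == 3);
        return 0;
    }

Masking with (size - 1) afterwards is what lets the signed offset, stored back into an unsigned type, still land on the correct circular slot.
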
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && + this->tailBlock->next != firstAllocatedBlock && + this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) + { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) + { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + return false; + } + #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template set_all_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } - else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; - - ++pr_blockIndexSlotsUsed; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and - // publish the new block index front - auto block = firstAllocatedBlock; - while (true) { - block->ConcurrentQueue::Block::template reset_empty(); - if (block == this->tailBlock) { - break; - } - block = block->next; - } - - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - auto endBlock = this->tailBlock; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else { - MOODYCAMEL_TRY { - while (currentTailIndex != stopIndex) { - // Must use copy constructor even if move constructor is available - // because we may have to revert if there's an exception. - // Sorry about the horrible templated next line, but it was the only way - // to disable moving *at compile time*, which is important because a type - // may only define a (noexcept) move constructor, and so calls to the - // cctor will not compile, even if they are in an if branch that will never - // be executed - new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) { - // Oh dear, an exception's been thrown -- destroy the elements that - // were enqueued so far and revert the entire bulk operation (we'll keep - // any allocated blocks in our linked list for later, though). - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - if (firstAllocatedBlock != nullptr) - blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Determine which block the first element is in - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); - auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); - - // Iterate the blocks and dequeue - auto index = firstIndex; - do { - auto firstIndexInBlock = index; - index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; - auto block = localBlockIndex->entries[indexIndex].block; - if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else { - MOODYCAMEL_TRY { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH (...) { - // It's too late to revert the dequeue, but we can make sure that all - // the dequeued objects are properly destroyed and the block index - // (and empty count) are properly updated before we propagate the exception - do { - block = localBlockIndex->entries[indexIndex].block; - while (index != endIndex) { - (*block)[index++]->~T(); - } - block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - - firstIndexInBlock = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - struct BlockIndexEntry - { - index_t base; - Block* block; - }; - - struct BlockIndexHeader - { - size_t size; - std::atomic front; // Current slot (not next, like pr_blockIndexFront) - BlockIndexEntry* entries; - void* prev; - }; - - - bool new_block_index(size_t numberOfFilledSlotsToExpose) - { - auto prevBlockSizeMask = pr_blockIndexSize - 1; - - // Create the new block - pr_blockIndexSize <<= 1; - auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); - if (newRawPtr == nullptr) { - pr_blockIndexSize >>= 1; // Reset to allow graceful retry - return false; - } - - auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); - - // Copy in all the old indices, if any - size_t j = 0; - if (pr_blockIndexSlotsUsed != 0) { - auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; - do { - newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; - i = (i + 1) & prevBlockSizeMask; - } while (i != pr_blockIndexFront); - } - - // Update everything - auto header = new (newRawPtr) BlockIndexHeader; - header->size = pr_blockIndexSize; - header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); - header->entries = newBlockIndexEntries; - header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later - - pr_blockIndexFront = j; - pr_blockIndexEntries = newBlockIndexEntries; - pr_blockIndexRaw = newRawPtr; - blockIndex.store(header, std::memory_order_release); - - return true; - } - - private: - std::atomic blockIndex; - - // To be used by producer 
only -- consumer must use the ones in referenced by blockIndex - size_t pr_blockIndexSlotsUsed; - size_t pr_blockIndexSize; - size_t pr_blockIndexFront; // Next slot (not current) - BlockIndexEntry* pr_blockIndexEntries; - void* pr_blockIndexRaw; - + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) + { + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert( + (startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || + count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) + { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else + { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) + T(details::nomove_if(nullptr)) + T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) + { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). 
+ auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) + { + if (firstAllocatedBlock != nullptr) + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = + static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>(firstBlockBaseIndex - headBase) / + static_cast::type>(BLOCK_SIZE)); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + index_t endIndex = + (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) ? 
+ firstIndex + static_cast(actualCount) : + endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN( + T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } else { + MOODYCAMEL_TRY + { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) + { + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty( + firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) ? + firstIndex + static_cast(actualCount) : + endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty( + firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry { + index_t base; + Block* block; + }; + + struct BlockIndexHeader { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast( + details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To 
be used by producer only -- consumer must use the ones referenced by blockIndex
+        size_t pr_blockIndexSlotsUsed;
+        size_t pr_blockIndexSize;
+        size_t pr_blockIndexFront;  // Next slot (not current)
+        BlockIndexEntry* pr_blockIndexEntries;
+        void* pr_blockIndexRaw;
+
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-    public:
-        ExplicitProducer* nextExplicitProducer;
-    private:
+    public:
+        ExplicitProducer* nextExplicitProducer;
+
+    private:
#endif
-
+
#ifdef MCDBGQ_TRACKMEM
-        friend struct MemStats;
-#endif
-    };
-
-
-    //////////////////////////////////
-    // Implicit queue
-    //////////////////////////////////
-
-    struct ImplicitProducer : public ProducerBase
-    {
-        ImplicitProducer(ConcurrentQueue* parent_) :
-            ProducerBase(parent_, false),
-            nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
-            blockIndex(nullptr)
-        {
-            new_block_index();
-        }
-
-        ~ImplicitProducer()
-        {
-            // Note that since we're in the destructor we can assume that all enqueue/dequeue operations
-            // completed already; this means that all undequeued elements are placed contiguously across
-            // contiguous blocks, and that only the first and last remaining blocks can be only partially
-            // empty (all other remaining blocks must be completely full).
-
+        friend struct MemStats;
+#endif
+    };
+
+    //////////////////////////////////
+    // Implicit queue
+    //////////////////////////////////
+
+    struct ImplicitProducer: public ProducerBase {
+        ImplicitProducer(ConcurrentQueue* parent_)
+            : ProducerBase(parent_, false), nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), blockIndex(nullptr)
+        {
+            new_block_index();
+        }
+
+        ~ImplicitProducer()
+        {
+            // Note that since we're in the destructor we can assume that all enqueue/dequeue operations
+            // completed already; this means that all undequeued elements are placed contiguously across
+            // contiguous blocks, and that only the first and last remaining blocks can be only partially
+            // empty (all other remaining blocks must be completely full).
+
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
-            // Unregister ourselves for thread termination notification
-            if (!this->inactive.load(std::memory_order_relaxed)) {
-                details::ThreadExitNotifier::unsubscribe(&threadExitListener);
-            }
-#endif
-
-            // Destroy all remaining elements!
-            auto tail = this->tailIndex.load(std::memory_order_relaxed);
-            auto index = this->headIndex.load(std::memory_order_relaxed);
-            Block* block = nullptr;
-            assert(index == tail || details::circular_less_than<index_t>(index, tail));
-            bool forceFreeLastBlock = index != tail;  // If we enter the loop, then the last (tail) block will not be freed
-            while (index != tail) {
-                if ((index & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 || block == nullptr) {
-                    if (block != nullptr) {
-                        // Free the old block
-                        this->parent->add_block_to_free_list(block);
-                    }
-
-                    block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed);
-                }
-
-                ((*block)[index])->~T();
-                ++index;
-            }
-            // Even if the queue is empty, there's still one block that's not on the free list
-            // (unless the head index reached the end of it, in which case the tail will be poised
-            // to create a new block).
- if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { - this->parent->add_block_to_free_list(this->tailBlock); - } - - // Destroy block index - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - if (localBlockIndex != nullptr) { - for (size_t i = 0; i != localBlockIndex->capacity; ++i) { - localBlockIndex->index[i]->~BlockIndexEntry(); - } - do { - auto prev = localBlockIndex->prev; - localBlockIndex->~BlockIndexHeader(); - (Traits::free)(localBlockIndex); - localBlockIndex = prev; - } while (localBlockIndex != nullptr); - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - return false; - } + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = + index != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
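
A consumer-visible consequence of the destructor logic above is that items enqueued but never dequeued are destroyed together with the queue. A minimal usage sketch, assuming the conventional moodycamel namespace and this header's usual include name:

    #include <cassert>
    #include <memory>
    #include "concurrentqueue.h"

    int main()
    {
        auto payload = std::make_shared<int>(42);
        {
            moodycamel::ConcurrentQueue<std::shared_ptr<int>> q;
            q.enqueue(payload);                  // one undequeued element remains
            assert(payload.use_count() == 2);    // queue holds a copy
        }                                        // ~ConcurrentQueue destroys the leftover copy
        assert(payload.use_count() == 1);
        return 0;
    }
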
+ if (this->tailBlock != nullptr && + (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry; - if (!insert_block_index_entry(idxEntry, currentTailIndex)) { - return false; - } - - // Get ahold of a new block - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - return false; - } + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - // May throw, try to insert now before we publish the fact that we have this new block - MOODYCAMEL_TRY { - new ((*newBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH (...) 
{ - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(newBlock); - MOODYCAMEL_RETHROW; - } - } - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - this->tailBlock = newBlock; - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - // See ExplicitProducer::dequeue for rationale and explanation - index_t tail = this->tailIndex.load(std::memory_order_relaxed); - index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - std::atomic_thread_fence(std::memory_order_acquire); - - index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - // Determine which block the element is in - auto entry = get_block_index_entry_for_index(index); - - // Dequeue - auto block = entry->value.load(std::memory_order_relaxed); - auto& el = *((*block)[index]); - - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) + { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY + { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) 
+ { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) + { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - // Note: Acquiring the mutex with every dequeue instead of only when a block - // is released is very sub-optimal, but it is, after all, purely debug code. - debug::DebugLock lock(producer->mutex); -#endif - struct Guard { - Block* block; - index_t index; - BlockIndexEntry* entry; - ConcurrentQueue* parent; - - ~Guard() - { - (*block)[index]->~T(); - if (block->ConcurrentQueue::Block::template set_empty(index)) { - entry->value.store(nullptr, std::memory_order_relaxed); - parent->add_block_to_free_list(block); - } - } - } guard = { block, index, entry, this->parent }; - - element = std::move(el); // NOLINT - } - else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - - if (block->ConcurrentQueue::Block::template set_empty(index)) { - { + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. 
+ debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = {block, index, entry, this->parent}; + + element = std::move(el); // NOLINT + } else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty(index)) { + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Add the block back into the global free pool (and remove from block index) - entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - } - - return true; - } - else { - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); - } - } - - return false; - } - + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + } + + return true; + } else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable: 4706) // assignment within conditional expression -#endif - template - bool enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - - // Note that the tailBlock we start off with may not be owned by us any more; - // this happens if it was filled up exactly to the top (setting tailIndex to - // the first index of the next block which is not yet allocated), then dequeued - // completely (putting it on the free list) before we enqueue again. - - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - Block* firstAllocatedBlock = nullptr; - auto endBlock = this->tailBlock; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { +#pragma warning(disable : 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. 
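
For orientation, the bulk machinery in this hunk backs the public enqueue_bulk/try_dequeue_bulk calls: enqueue_bulk inserts either all of the requested items or none of them, while try_dequeue_bulk reports how many items it actually took. A small usage sketch, with the same include assumptions as above:

    #include <cassert>
    #include <cstddef>
    #include "concurrentqueue.h"

    int main()
    {
        moodycamel::ConcurrentQueue<int> q;
        int in[5] = {1, 2, 3, 4, 5};
        bool ok = q.enqueue_bulk(in, 5);               // inserts all five or none
        assert(ok);

        int out[8];
        std::size_t got = q.try_dequeue_bulk(out, 8);  // count actually dequeued
        assert(got == 5 && out[0] == 1);
        return 0;
    }
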
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - do { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell - Block* newBlock; - bool indexInserted = false; - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - - if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { - // Index allocation or block allocation failed; revert any other allocations - // and index insertions done so far for this operation - if (indexInserted) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - } - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - - return false; - } - + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry = + nullptr; // initialization here unnecessary but compiler can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + + if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || + (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == + nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + 
idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - newBlock->next = nullptr; - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - // Store the chain of blocks so that we can undo if later allocations fail, - // and so that we can find the blocks when we do the actual enqueueing - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { - assert(this->tailBlock != nullptr); - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - endBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; - } while (blockBaseDiff > 0); - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else { - MOODYCAMEL_TRY { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) 
{ - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - auto idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert( + (startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || + count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) + { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else + { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) + T(details::nomove_if(nullptr)) + T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) 
+ { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } #ifdef _MSC_VER #pragma warning(pop) #endif - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Iterate the blocks and dequeue - auto index = firstIndex; - BlockIndexHeader* localBlockIndex; - auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); - do { - auto blockStartIndex = index; - index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
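
The dequeueOptimisticCount/dequeueOvercommit bookkeeping relied on by the dequeue paths of both producer types is easier to see in isolation. Below is a single-threaded sketch of just that accounting; the member names are borrowed from the queue, the types are simplified, and the wrap-aware circular_less_than comparison is replaced by a plain < for brevity.

    #include <atomic>
    #include <cassert>
    #include <cstdint>

    int main()
    {
        std::atomic<std::uint64_t> tail{3};              // 3 elements enqueued
        std::atomic<std::uint64_t> optimisticCount{0};
        std::atomic<std::uint64_t> overcommit{0};

        auto tryDequeue = [&]() -> bool {
            // Optimistically claim an element first...
            auto my = optimisticCount.fetch_add(1, std::memory_order_relaxed);
            if (my - overcommit.load(std::memory_order_relaxed) <
                tail.load(std::memory_order_acquire)) {
                return true;                             // claimed a real element
            }
            // ...nothing there after all: undo the optimistic increment.
            overcommit.fetch_add(1, std::memory_order_release);
            return false;
        };

        int taken = 0;
        for (int i = 0; i < 5; ++i)
            taken += tryDequeue() ? 1 : 0;
        assert(taken == 3);
        // (optimisticCount - overcommit) stays an eventually-consistent
        // count of successful dequeues:
        assert(optimisticCount - overcommit == 3);
        return 0;
    }
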
firstIndex + static_cast(actualCount) : endIndex; - - auto entry = localBlockIndex->index[indexIndex]; - auto block = entry->value.load(std::memory_order_relaxed); - if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else { - MOODYCAMEL_TRY { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH (...) { - do { - entry = localBlockIndex->index[indexIndex]; - block = entry->value.load(std::memory_order_relaxed); - while (index != endIndex) { - (*block)[index++]->~T(); - } - - if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = + static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + index_t endIndex = + (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) ? + firstIndex + static_cast(actualCount) : + endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN( + T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } else { + MOODYCAMEL_TRY + { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) 
+ { + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty( + blockStartIndex, static_cast(endIndex - blockStartIndex))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - entry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(block); - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - - blockStartIndex = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { - { + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) ? + firstIndex + static_cast(actualCount) : + endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty( + blockStartIndex, static_cast(endIndex - blockStartIndex))) { + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Note that the set_many_empty above did a release, meaning that anybody who acquires the block - // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
- entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else { - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - // The block size must be > 1, so any number with the low bit set is an invalid block base index - static const index_t INVALID_BLOCK_BASE = 1; - - struct BlockIndexEntry - { - std::atomic key; - std::atomic value; - }; - - struct BlockIndexHeader - { - size_t capacity; - std::atomic tail; - BlockIndexEntry* entries; - BlockIndexEntry** index; - BlockIndexHeader* prev; - }; - - template - inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK - if (localBlockIndex == nullptr) { - return false; // this can happen if new_block_index failed in the constructor - } - size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || - idxEntry->value.load(std::memory_order_relaxed) == nullptr) { - - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - - // No room in the old block index, try to allocate another one! - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - return false; - } - else if (!new_block_index()) { - return false; - } - else { - localBlockIndex = blockIndex.load(std::memory_order_relaxed); - newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - } - - inline void rewind_block_index_tail() - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); - } - - inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const - { - BlockIndexHeader* localBlockIndex; - auto idx = get_block_index_index_for_index(index, localBlockIndex); - return localBlockIndex->index[idx]; - } - - inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const - { + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires + // the block we're about to free can use it safely since our writes (and reads!) will + // have happened-before then. 
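+                        // In other words, the release in set_many_empty pairs with the acquire
+                        // performed by whichever thread later takes this block from the free
+                        // list, so the element destructions above are visible before the block
+                        // can be reused.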
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) + { + auto localBlockIndex = + blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the constructor + } + size_t newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) + { + return false; + } + else if (!new_block_index()) + { + return false; + } + else + { + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store( + (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), + std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - index &= ~static_cast(BLOCK_SIZE - 1); - localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto tail = localBlockIndex->tail.load(std::memory_order_acquire); - auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); - assert(tailBase != INVALID_BLOCK_BASE); - // Note: Must use division instead of shift because the index may wrap around, causing a negative - // offset, whose negativity we want to preserve - auto offset = static_cast(static_cast::type>(index - tailBase) / static_cast::type>(BLOCK_SIZE)); - size_t idx = (tail + offset) & 
(localBlockIndex->capacity - 1); - assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); - return idx; - } - - bool new_block_index() - { - auto prev = blockIndex.load(std::memory_order_relaxed); - size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; - auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; - auto raw = static_cast((Traits::malloc)( - sizeof(BlockIndexHeader) + - std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + - std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); - if (raw == nullptr) { - return false; - } - - auto header = new (raw) BlockIndexHeader; - auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); - auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); - if (prev != nullptr) { - auto prevTail = prev->tail.load(std::memory_order_relaxed); - auto prevPos = prevTail; - size_t i = 0; - do { - prevPos = (prevPos + 1) & (prev->capacity - 1); - index[i++] = prev->index[prevPos]; - } while (prevPos != prevTail); - assert(i == prevCapacity); - } - for (size_t i = 0; i != entryCount; ++i) { - new (entries + i) BlockIndexEntry; - entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); - index[prevCapacity + i] = entries + i; - } - header->prev = prev; - header->entries = entries; - header->index = index; - header->capacity = nextBlockIndexCapacity; - header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); - - blockIndex.store(header, std::memory_order_release); - - nextBlockIndexCapacity <<= 1; - - return true; - } - - private: - size_t nextBlockIndexCapacity; - std::atomic blockIndex; + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast( + static_cast::type>(index - tailBase) / + static_cast::type>(BLOCK_SIZE)); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + assert( + localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && + localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? 
nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast( + (Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * entryCount + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = + reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast(details::align_for( + reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - public: - details::ThreadExitListener threadExitListener; - private: + public: + details::ThreadExitListener threadExitListener; + + private: #endif - + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ImplicitProducer* nextImplicitProducer; - private: + public: + ImplicitProducer* nextImplicitProducer; + + private: #endif #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - mutable debug::DebugMutex mutex; + mutable debug::DebugMutex mutex; #endif #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - ////////////////////////////////// - // Block pool manipulation - ////////////////////////////////// - - void populate_initial_block_list(size_t blockCount) - { - initialBlockPoolSize = blockCount; - if (initialBlockPoolSize == 0) { - initialBlockPool = nullptr; - return; - } - - initialBlockPool = create_array(blockCount); - if (initialBlockPool == nullptr) { - initialBlockPoolSize = 0; - } - for (size_t i = 0; i < initialBlockPoolSize; ++i) { - initialBlockPool[i].dynamicallyAllocated = false; - } - } - - inline Block* try_get_block_from_initial_pool() - { - if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { - return nullptr; - } - - auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); - - return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; - } - - inline void add_block_to_free_list(Block* block) - { + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { #ifdef MCDBGQ_TRACKMEM - block->owner = nullptr; -#endif - if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { - destroy(block); - } - else { - freeList.add(block); - } - } - - inline void add_blocks_to_free_list(Block* block) - { - while (block != nullptr) { - auto next = block->next; - add_block_to_free_list(block); - block = next; - } - } - - inline Block* try_get_block_from_free_list() - { - return freeList.try_get(); - } - - // Gets a free block from one of the memory pools, or allocates a new one (if applicable) - template - Block* requisition_block() - { - auto block = try_get_block_from_initial_pool(); - if (block != nullptr) { - return block; - } - - block = try_get_block_from_free_list(); - if (block != nullptr) { - return block; - } - - MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { - return create(); - } - else { - return nullptr; - } - } - + block->owner = nullptr; +#endif + if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { + destroy(block); + } else { + freeList.add(block); + } + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() { return freeList.try_get(); } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) + { + return create(); + } + else + { + return nullptr; + } + } #ifdef MCDBGQ_TRACKMEM - public: - struct MemStats { - size_t allocatedBlocks; - size_t usedBlocks; - size_t freeBlocks; - size_t ownedBlocksExplicit; - size_t ownedBlocksImplicit; - size_t implicitProducers; - size_t explicitProducers; - size_t elementsEnqueued; - size_t blockClassBytes; - size_t queueClassBytes; - size_t implicitBlockIndexBytes; - size_t explicitBlockIndexBytes; - - friend class ConcurrentQueue; - - private: - static MemStats getFor(ConcurrentQueue* q) - { - MemStats stats = { 0 }; - - stats.elementsEnqueued = q->size_approx(); - - auto block = q->freeList.head_unsafe(); - while (block != nullptr) { - ++stats.allocatedBlocks; - ++stats.freeBlocks; - block = block->freeListNext.load(std::memory_order_relaxed); - } - - for (auto ptr = 
q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - bool implicit = dynamic_cast(ptr) != nullptr; - stats.implicitProducers += implicit ? 1 : 0; - stats.explicitProducers += implicit ? 0 : 1; - - if (implicit) { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ImplicitProducer); - auto head = prod->headIndex.load(std::memory_order_relaxed); - auto tail = prod->tailIndex.load(std::memory_order_relaxed); - auto hash = prod->blockIndex.load(std::memory_order_relaxed); - if (hash != nullptr) { - for (size_t i = 0; i != hash->capacity; ++i) { - if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { - ++stats.allocatedBlocks; - ++stats.ownedBlocksImplicit; - } - } - stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); - for (; hash != nullptr; hash = hash->prev) { - stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); - } - } - for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { - //auto block = prod->get_block_index_entry_for_index(head); - ++stats.usedBlocks; - } - } - else { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ExplicitProducer); - auto tailBlock = prod->tailBlock; - bool wasNonEmpty = false; - if (tailBlock != nullptr) { - auto block = tailBlock; - do { - ++stats.allocatedBlocks; - if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { - ++stats.usedBlocks; - wasNonEmpty = wasNonEmpty || block != tailBlock; - } - ++stats.ownedBlocksExplicit; - block = block->next; - } while (block != tailBlock); - } - auto index = prod->blockIndex.load(std::memory_order_relaxed); - while (index != nullptr) { - stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); - index = static_cast(index->prev); - } - } - } - - auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); - stats.allocatedBlocks += freeOnInitialPool; - stats.freeBlocks += freeOnInitialPool; - - stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; - stats.queueClassBytes += sizeof(ConcurrentQueue); - - return stats; - } - }; - - // For debugging only. Not thread-safe. 
- MemStats getMemStats() - { - return MemStats::getFor(this); - } - private: - friend struct MemStats; -#endif - - - ////////////////////////////////// - // Producer list manipulation - ////////////////////////////////// - - ProducerBase* recycle_or_create_producer(bool isExplicit) - { +public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = {0}; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; + ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != + ImplicitProducer::INVALID_BLOCK_BASE && + hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += + sizeof(typename ImplicitProducer::BlockIndexHeader) + + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { + // auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += + sizeof(typename ExplicitProducer::BlockIndexHeader) + + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast(index->prev); + } + } + } + + auto freeOnInitialPool = + q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 
+ 0 : + q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() { return MemStats::getFor(this); } + +private: + friend struct MemStats; +#endif + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - // Try to re-use one first - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { - bool expected = true; - if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { - // We caught one! It's been marked as activated, the caller can have it - return ptr; - } - } - } - - return add_producer(isExplicit ? static_cast(create(this)) : create(this)); - } - - ProducerBase* add_producer(ProducerBase* producer) - { - // Handle failed memory allocation - if (producer == nullptr) { - return nullptr; - } - - producerCount.fetch_add(1, std::memory_order_relaxed); - - // Add it to the lock-free list - auto prevTail = producerListTail.load(std::memory_order_relaxed); - do { - producer->next = prevTail; - } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); - + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong( + expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + return ptr; + } + } + } + + return add_producer( + isExplicit ? 
static_cast(create(this)) : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak( + prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - if (producer->isExplicit) { - auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextExplicitProducer = prevTailExplicit; - } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); - } - else { - auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextImplicitProducer = prevTailImplicit; - } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); - } -#endif - - return producer; - } - - void reown_producers() - { - // After another instance is moved-into/swapped-with this one, all the - // producers we stole still think their parents are the other queue. - // So fix them up! - for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { - ptr->parent = this; - } - } - - - ////////////////////////////////// - // Implicit producer hash - ////////////////////////////////// - - struct ImplicitProducerKVP - { - std::atomic key; - ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place - - ImplicitProducerKVP() : value(nullptr) { } - - ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); - value = other.value; - } - - inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT - { - if (this != &other) { - details::swap_relaxed(key, other.key); - std::swap(value, other.value); - } - } - }; - - template - friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; - - struct ImplicitProducerHash - { - size_t capacity; - ImplicitProducerKVP* entries; - ImplicitProducerHash* prev; - }; - - inline void populate_initial_implicit_producer_hash() - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { - return; - } - else { - implicitProducerHashCount.store(0, std::memory_order_relaxed); - auto hash = &initialImplicitProducerHash; - hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; - hash->entries = &initialImplicitProducerHashEntries[0]; - for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { - initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - hash->prev = nullptr; - implicitProducerHash.store(hash, std::memory_order_relaxed); - } - } - - void swap_implicit_producer_hashes(ConcurrentQueue& other) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { - return; - } - else { - // Swap (assumes our implicit producer hash is 
initialized) - initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); - initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; - other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; - - details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); - - details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); - if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { - implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { - continue; - } - hash->prev = &initialImplicitProducerHash; - } - if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { - other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { - continue; - } - hash->prev = &other.initialImplicitProducerHash; - } - } - } - - // Only fails (returns nullptr) if memory allocation fails - ImplicitProducer* get_or_add_implicit_producer() - { - // Note that since the data is essentially thread-local (key is thread ID), - // there's a reduced need for fences (memory ordering is already consistent - // for any individual thread), except for the current table itself. - - // Start by looking for the thread ID in the current and all previous hash tables. - // If it's not found, it must not be in there yet, since this same thread would - // have added it previously to one of the tables that we traversed. - - // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table - + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak( + prevTailExplicit, + static_cast(producer), + std::memory_order_release, + std::memory_order_relaxed)); + } else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak( + prevTailImplicit, + static_cast(producer), + std::memory_order_release, + std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
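+        // The producer list itself was already transferred by the move/swap;
+        // only each producer's parent pointer is stale here.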
+ for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP { + std::atomic key; + ImplicitProducer* + value; // No need for atomicity since it's only read by the thread that sets it in the first place + + ImplicitProducerKVP(): value(nullptr) {} + + ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap( + typename ConcurrentQueue::ImplicitProducerKVP&, + typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash { + size_t capacity; + ImplicitProducerKVP* entries; + ImplicitProducerHash* prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + { + return; + } + else + { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue& other) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + { + return; + } + else + { + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); + } else { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &other.initialImplicitProducerHash; + hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); + } else { + ImplicitProducerHash* hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &initialImplicitProducerHash; + hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer* 
get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. + // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - - auto mainHash = implicitProducerHash.load(std::memory_order_acquire); - assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) - for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { - // Look for the id in this hash - auto index = hashedId; - while (true) { // Not an infinite loop because at least one slot is free in the hash table - index &= hash->capacity - 1u; - - auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); - if (probedKey == id) { - // Found it! If we had to search several hashes deep, though, we should lazily add it - // to the current main hash table to avoid the extended search next time. - // Note there's guaranteed to be room in the current hash table since every subsequent - // table implicitly reserves space for all previous tables (there's only one - // implicitProducerHashCount). - auto value = hash->entries[index].value; - if (hash != mainHash) { - index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1u; + + auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
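+                    // The table uses power-of-two capacities with linear probing: e.g. with
+                    // capacity 32, a thread whose hashedId maps to slot 7 probes 7, 8, 9, ...
+                    // (mod 32) until it finds its own id or an empty slot.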
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || - mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || + mainHash->entries[index].key.compare_exchange_strong( + reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { #else - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { -#endif - mainHash->entries[index].value = value; - break; - } - ++index; - } - } - - return value; - } - if (probedKey == details::invalid_thread_id) { - break; // Not in this hash table - } - ++index; - } - } - - // Insert! - auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); - while (true) { - // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) - if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { - // We've acquired the resize lock, try to allocate a bigger hash table. - // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when - // we reload implicitProducerHash it must be the most recent version (it only gets changed within this - // locked block). 
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - if (newCount >= (mainHash->capacity >> 1)) { - size_t newCapacity = mainHash->capacity << 1; - while (newCount >= (newCapacity >> 1)) { - newCapacity <<= 1; - } - auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); - if (raw == nullptr) { - // Allocation failed - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - return nullptr; - } - - auto newHash = new (raw) ImplicitProducerHash; - newHash->capacity = static_cast(newCapacity); - newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); - for (size_t i = 0; i != newCapacity; ++i) { - new (newHash->entries + i) ImplicitProducerKVP; - newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - newHash->prev = mainHash; - implicitProducerHash.store(newHash, std::memory_order_release); - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - mainHash = newHash; - } - else { - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - } - } - - // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table - // to finish being allocated by another thread (and if we just finished allocating above, the condition will - // always be true) - if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { - auto producer = static_cast(recycle_or_create_producer(false)); - if (producer == nullptr) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - return nullptr; - } - + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && + !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). 
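+                // The capacity check is repeated after the reload because another thread
+                // may already have grown the table before we won the resize flag.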
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + size_t newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + + std::alignment_of::value - 1 + + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast( + details::align_for(raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + auto producer = static_cast(recycle_or_create_producer(false)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; - producer->threadExitListener.userData = producer; - details::ThreadExitNotifier::subscribe(&producer->threadExitListener); -#endif - - auto index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; + producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); // already counted as a used slot - mainHash->entries[index].value = producer; - break; - } -#endif - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - mainHash->entries[index].value = producer; - break; - } - ++index; - } - return producer; - } - - // Hmm, the old hash is quite full and somebody else is busy allocating a new one. - // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, - // we try to allocate ourselves). 
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - } - } - + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong( + reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + implicitProducerHashCount.fetch_sub( + 1, std::memory_order_relaxed); // already counted as a used slot + mainHash->entries[index].value = producer; + break; + } +#endif + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - void implicit_producer_thread_exited(ImplicitProducer* producer) - { - // Remove from hash + void implicit_producer_thread_exited(ImplicitProducer* producer) + { + // Remove from hash #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - auto hash = implicitProducerHash.load(std::memory_order_acquire); - assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - details::thread_id_t probedKey; - - // We need to traverse all the hashes just in case other threads aren't on the current one yet and are - // trying to add an entry thinking there's a free slot (because they reused a producer) - for (; hash != nullptr; hash = hash->prev) { - auto index = hashedId; - do { - index &= hash->capacity - 1u; - probedKey = id; - if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) { - break; - } - ++index; - } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place - } - - // Mark the queue as being recyclable - producer->inactive.store(true, std::memory_order_release); - } - - static void implicit_producer_thread_exited_callback(void* userData) - { - auto producer = static_cast(userData); - auto queue = producer->parent; - queue->implicit_producer_thread_exited(producer); - } -#endif - - ////////////////////////////////// - // Utility functions - ////////////////////////////////// - - template - static inline void* aligned_malloc(size_t size) - { - MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) - return (Traits::malloc)(size); - else { - size_t alignment = std::alignment_of::value; - void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); - if (!raw) - return nullptr; - char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); - *(reinterpret_cast(ptr) - 1) = raw; - return ptr; - } - } - - template - static inline void aligned_free(void* ptr) - { - MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) - return (Traits::free)(ptr); - else - (Traits::free)(ptr ? 
*(reinterpret_cast(ptr) - 1) : nullptr); - } - - template - static inline U* create_array(size_t count) - { - assert(count > 0); - U* p = static_cast(aligned_malloc(sizeof(U) * count)); - if (p == nullptr) - return nullptr; - - for (size_t i = 0; i != count; ++i) - new (p + i) U(); - return p; - } - - template - static inline void destroy_array(U* p, size_t count) - { - if (p != nullptr) { - assert(count > 0); - for (size_t i = count; i != 0; ) - (p + --i)->~U(); - } - aligned_free(p); - } - - template - static inline U* create() - { - void* p = aligned_malloc(sizeof(U)); - return p != nullptr ? new (p) U : nullptr; - } - - template - static inline U* create(A1&& a1) - { - void* p = aligned_malloc(sizeof(U)); - return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; - } - - template - static inline void destroy(U* p) - { - if (p != nullptr) - p->~U(); - aligned_free(p); - } + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert( + hash != + nullptr); // The thread exit listener is only registered if we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on the current one yet and are + // trying to add an entry thinking there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1u; + probedKey = id; + if (hash->entries[index].key.compare_exchange_strong( + probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) { + break; + } + ++index; + } while (probedKey != + details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it + // yet, or if we weren't added to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void* userData) + { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline void* aligned_malloc(size_t size) + { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= std::alignment_of::value) + return (Traits::malloc)(size); + else + { + size_t alignment = std::alignment_of::value; + void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); + if (!raw) + return nullptr; + char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); + *(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + } + + template + static inline void aligned_free(void* ptr) + { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= std::alignment_of::value) + return (Traits::free)(ptr); + else(Traits::free)(ptr ? 
*(reinterpret_cast(ptr) - 1) : nullptr); + } + + template + static inline U* create_array(size_t count) + { + assert(count > 0); + U* p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) + return nullptr; + + for (size_t i = 0; i != count; ++i) + new (p + i) U(); + return p; + } + + template + static inline void destroy_array(U* p, size_t count) + { + if (p != nullptr) { + assert(count > 0); + for (size_t i = count; i != 0;) + (p + --i)->~U(); + } + aligned_free(p); + } + + template + static inline U* create() + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U* create(A1&& a1) + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) + p->~U(); + aligned_free(p); + } private: - std::atomic producerListTail; - std::atomic producerCount; - - std::atomic initialBlockPoolIndex; - Block* initialBlockPool; - size_t initialBlockPoolSize; - + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block* initialBlockPool; + size_t initialBlockPoolSize; + #ifndef MCDBGQ_USEDEBUGFREELIST - FreeList freeList; + FreeList freeList; #else - debug::DebugFreeList freeList; -#endif - - std::atomic implicitProducerHash; - std::atomic implicitProducerHashCount; // Number of slots logically used - ImplicitProducerHash initialImplicitProducerHash; - std::array initialImplicitProducerHashEntries; - std::atomic_flag implicitProducerHashResizeInProgress; - - std::atomic nextExplicitConsumerId; - std::atomic globalExplicitConsumerOffset; - + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugMutex implicitProdMutex; + debug::DebugMutex implicitProdMutex; #endif - + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - std::atomic explicitProducers; - std::atomic implicitProducers; + std::atomic explicitProducers; + std::atomic implicitProducers; #endif }; - -template -ProducerToken::ProducerToken(ConcurrentQueue& queue) - : producer(queue.recycle_or_create_producer(true)) +template +ProducerToken::ProducerToken(ConcurrentQueue& queue): producer(queue.recycle_or_create_producer(true)) { - if (producer != nullptr) { - producer->token = this; - } + if (producer != nullptr) { + producer->token = this; + } } -template +template ProducerToken::ProducerToken(BlockingConcurrentQueue& queue) - : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) + : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) { - if (producer != nullptr) { - producer->token = this; - } + if (producer != nullptr) { + producer->token = this; + } } -template +template ConsumerToken::ConsumerToken(ConcurrentQueue& queue) - : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) { - initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); - lastKnownGlobalOffset = static_cast(-1); + initialOffset = queue.nextExplicitConsumerId.fetch_add(1, 
std::memory_order_release);
-    lastKnownGlobalOffset = static_cast(-1);
+    lastKnownGlobalOffset = static_cast(-1);
 }

-template
+template
 ConsumerToken::ConsumerToken(BlockingConcurrentQueue& queue)
-    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
 {
-    initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
-    lastKnownGlobalOffset = static_cast(-1);
+    initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(
+        1, std::memory_order_release);
+    lastKnownGlobalOffset = static_cast(-1);
 }

-template
+template
 inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) MOODYCAMEL_NOEXCEPT
 {
-    a.swap(b);
+    a.swap(b);
 }

 inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
 {
-    a.swap(b);
+    a.swap(b);
 }

 inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
 {
-    a.swap(b);
+    a.swap(b);
 }

-template
-inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
+template
+inline void swap(
+    typename ConcurrentQueue::ImplicitProducerKVP& a,
+    typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
 {
-    a.swap(b);
+    a.swap(b);
 }
-}
+}  // namespace moodycamel

 #if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
 #pragma warning(pop)
diff --git a/scripts/lint.sh b/scripts/lint.sh
new file mode 100755
index 000000000..273d70043
--- /dev/null
+++ b/scripts/lint.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -euxo pipefail
+
+black .
+pflake8 .
+mypy --install-types .
+
+# disable tracing -- too verbose
+set +x
+# find all .h, .hpp, and .cpp files, excluding third-party files
+files=$(find ./scaler -path './scaler/io/ymq/third_party' -prune -o \( -name '*.cpp' -o -name '*.h' -o -name '*.hpp' \) -print0 | xargs -0)
+set -x
+
+echo "running clang format on $(wc -w <<< $files) files"
+clang-format -i -style file -- $files
diff --git a/tests/cc_ymq/common.h b/tests/cc_ymq/common.h
index 42dfab9fe..bafe5d907 100644
--- a/tests/cc_ymq/common.h
+++ b/tests/cc_ymq/common.h
@@ -30,10 +30,11 @@
 #include
 #include
 #include
-#include
+#include
+#include
+#include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/tests/cc_ymq/py_mitm/core.py b/tests/cc_ymq/py_mitm/core.py
new file mode 100644
index 000000000..4a22ee01a
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/core.py
@@ -0,0 +1,54 @@
+"""
+This is the common code for implementing a man in the middle (MITM) in Python
+"""
+
+import dataclasses
+from typing import Protocol
+from scapy.all import TunTapInterface, IP, TCP  # type: ignore
+
+
+@dataclasses.dataclass
+class TCPConnection:
+    """
+    Represents a TCP connection over the TUNTAP interface
+    local_ip and local_port are the mitm's ip and port, and
+    remote_ip and remote_port are the ip and port of the remote peer
+    """
+
+    local_ip: str
+    local_port: int
+    remote_ip: str
+    remote_port: int
+
+    def rewrite(self, pkt: IP, ack: int | None = None, data=None):
+        """
+        Rewrite a TCP/IP packet as a packet originating
+        from (local_ip, local_port) and going to (remote_ip, remote_port)
+        This function is useful for taking a packet received from one connection and redirecting it to another
+
+        Args:
+            pkt: A scapy TCP/IP packet to rewrite
+            ack: An optional ack number to use instead of the one found in `pkt`
+            data: An optional payload to use instead of the one found in `pkt`
+
+        Returns:
+            The rewritten packet, suitable for sending over TUNTAP
+        """
+        tcp = pkt[TCP]
+
+        return (
+            IP(src=self.local_ip, dst=self.remote_ip)
+            / TCP(sport=self.local_port, dport=self.remote_port, flags=tcp.flags, seq=tcp.seq, ack=ack or tcp.ack)
+            / bytes(data or tcp.payload)
+        )
+
+
+class MITMProtocol(Protocol):
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None: ...
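For orientation before the concrete test cases: the simplest possible MITMProtocol implementation forwards every packet to the other side unchanged, using TCPConnection.rewrite to swap the addresses so each peer keeps seeing the MITM as its counterpart. The following is a minimal sketch, assuming core.py above is importable; it is illustrative commentary, not a file added by this patch (the passthrough.py listed in the diffstats plays this role in the real series):

# passthrough_sketch.py -- illustrative only, not part of the patch
from core import IP, MITMProtocol, TCPConnection, TunTapInterface


class MITM(MITMProtocol):
    def proxy(
        self,
        tuntap: TunTapInterface,
        pkt: IP,
        sender: TCPConnection,
        client_conn: TCPConnection | None,
        server_conn: TCPConnection,
    ) -> None:
        # Forward client packets to the server, and server packets to the client.
        if sender == client_conn:
            tuntap.send(server_conn.rewrite(pkt))
        elif sender == server_conn and client_conn is not None:
            tuntap.send(client_conn.rewrite(pkt))

The test cases that follow are variations on this shape: drop some packets, or inject a packet of their own, before (or instead of) forwarding.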
diff --git a/tests/cc_ymq/py_mitm/drop.py b/tests/cc_ymq/py_mitm/drop.py
new file mode 100644
index 000000000..520e5bf70
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/drop.py
@@ -0,0 +1,28 @@
+"""
+This MITM drops a percentage of packets
+"""
+
+import random
+from core import MITMProtocol, TunTapInterface, IP, TCPConnection
+
+
+class MITM(MITMProtocol):
+    def __init__(self, drop_pcent: str):
+        self.drop_pcent = float(drop_pcent)
+
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None:
+        if random.random() < self.drop_pcent:
+            print("[!] Dropping packet")
+            return
+
+        if sender == client_conn:
+            tuntap.send(server_conn.rewrite(pkt))
+        elif sender == server_conn:
+            tuntap.send(client_conn.rewrite(pkt))
diff --git a/tests/cc_ymq/py_mitm/rst.py b/tests/cc_ymq/py_mitm/rst.py
new file mode 100644
index 000000000..be301d5b0
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/rst.py
@@ -0,0 +1,48 @@
+"""
+This MITM inserts an unexpected TCP RST
+"""
+
+from core import IP, TCP, MITMProtocol, TCPConnection, TunTapInterface
+
+
+class MITM(MITMProtocol):
+    def __init__(self):
+        # count the number of psh-acks sent by the client
+        self.client_pshack_counter = 0
+
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None:
+        if sender == client_conn or client_conn is None:
+            if pkt[TCP].flags == "PA":
+                self.client_pshack_counter += 1
+
+                # on the second psh-ack, send a rst instead
+                if self.client_pshack_counter == 2:
+                    rst_pkt = IP(src=client_conn.local_ip, dst=client_conn.remote_ip) / TCP(
+                        sport=client_conn.local_port, dport=client_conn.remote_port, flags="R", seq=pkt[TCP].ack
+                    )
+                    print(f"<- [{rst_pkt[TCP].flags}] (simulated)")
+                    tuntap.send(rst_pkt)
+                    return
+
+            tuntap.send(server_conn.rewrite(pkt))
+        elif sender == server_conn:
+            tuntap.send(client_conn.rewrite(pkt))
+
+
+# client -> mitm -> server
+# server -> mitm -> client
+
+# client: 127.0.0.1:8080
+# mitm:   127.0.0.1:8081
+# server: 127.0.0.1:8081
+
+
+# client -> mitm == src = client.ip, sport = client.port ;; dst = mitm.ip, dport = mitm.port
+# mitm -> server == src = mitm.ip, sport = mitm.port ;; dst = server.ip, dport = server.port
diff --git a/tests/cc_ymq/py_mitm/runner.py b/tests/cc_ymq/py_mitm/runner.py
new file mode 100644
index 000000000..f56ccb9f9
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/runner.py
@@ -0,0 +1,157 @@
+# flake8: noqa: E402
+
+"""
+This script provides a framework for running MITM test cases
+
+This script accepts the following arguments, in order:
+    1. pid: the pid of the test process, used for signaling
+    2. testcase: the MITM test case. \
+       this loads `from .testcase import MITM` where `MITM` is a class implementing `MITMProtocol`
+    3. mitm_ip: an ipv4 address for the mitm server
+    4. mitm_port: the port used to connect to the remote server
+    5. remote_ip: the desired ip of the remote side of the TUNTAP interface
+    6. server_port: the port of the remote server
+    7. *args: Additional args, if any, are passed to the constructor: `MITM(*args)`
+
+See the documentation on `main` for more
+"""
+import os
+import sys
+
+# add the script's directory to path
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+
+import importlib
+import signal
+import subprocess
+
+from core import MITMProtocol, TCPConnection
+from scapy.all import IP, TCP, TunTapInterface  # type: ignore
+
+
+def echo_call(cmd: list[str]):
+    print(f"+ {' '.join(cmd)}")
+    subprocess.check_call(cmd)
+
+
+def create_tuntap_interface(iface_name: str, mitm_ip: str, remote_ip: str) -> TunTapInterface:
+    """
+    Creates a TUNTAP interface, brings it up, and assigns IPs using the `ip` program
+
+    Args:
+        iface_name: The name of the TUNTAP interface, usually like `tun0`, `tun1`, etc.
+        mitm_ip: The desired ip address of the mitm. This is the ip that clients can use to connect to the mitm
+        remote_ip: The ip that routes to/from the tuntap interface.
+            packets sent to `mitm_ip` will appear to come from `remote_ip`,\
+            and conversely the tuntap interface can connect/send packets
+            to `remote_ip`, making it a suitable ip for binding a server
+
+    Returns:
+        The TUNTAP interface
+    """
+    iface = TunTapInterface(iface_name, mode="tun")
+
+    try:
+        echo_call(["sudo", "ip", "link", "set", iface_name, "up"])
+        echo_call(["sudo", "ip", "addr", "add", remote_ip, "peer", mitm_ip, "dev", iface_name])
+        print(f"[+] Interface {iface_name} up with IP {mitm_ip}")
+    except subprocess.CalledProcessError:
+        print("[!] Could not bring up interface. Run as root or set manually.")
+        raise
+
+    return iface
+
+
+def main(pid: int, mitm_ip: str, mitm_port: int, remote_ip: str, server_port: int, mitm: MITMProtocol):
+    """
+    This function serves as a framework for man-in-the-middle implementations
+    A client connects to the MITM, then the MITM connects to a remote server
+    The MITM sits in between the client and the server, manipulating the packets sent depending on the test case
+    This function:
+    1. creates a TUNTAP interface and prepares it for MITM
+    2. handles connecting clients and connection closes
+    3. delegates additional logic to a pluggable callable, `mitm`
+    4. returns when both connections have terminated (via their FIN-ACK/ACK close sequences)
+
+    Args:
+        pid: this is the pid of the test process, used for signaling readiness; \
+            we send SIGUSR1 to this process when the mitm is ready
+        mitm_ip: The desired ip address of the mitm server
+        mitm_port: The desired port of the mitm server. \
+            This is the port used to connect to the server, but the client is free to connect on any port
+        remote_ip: The desired remote ip for the TUNTAP interface. This is the only ip address \
+            reachable by the interface and is thus the src ip for clients, and the ip that the remote server \
+            must be bound to
+        server_port: The port that the remote server is bound to
+        mitm: The core logic for a MITM test case. This callable may maintain its own state and is responsible \
+            for sending packets over the TUNTAP interface (if it doesn't, nothing will happen)
+    """
+
+    tuntap = create_tuntap_interface("tun0", mitm_ip, remote_ip)
+
+    # signal the caller that the tuntap interface has been created
+    if pid > 0:
+        os.kill(pid, signal.SIGUSR1)
+
+    # these track information about our connections
+    # we already know what to expect for the server connection, since we are the connector
+    client_conn = None
+    server_conn = TCPConnection(mitm_ip, mitm_port, remote_ip, server_port)
+
+    # tracks the state of each connection
+    client_sent_fin_ack = False
+    client_closed = False
+    server_sent_fin_ack = False
+    server_closed = False
+
+    while True:
+        pkt = tuntap.recv()
+        if not pkt.haslayer(IP) or not pkt.haslayer(TCP):
+            continue
+        ip = pkt[IP]
+        tcp = pkt[TCP]
+
+        # for a received packet, the destination ip and port are our local ip and port
+        # and the source ip and port will be the remote ip and port
+        sender = TCPConnection(pkt.dst, pkt.dport, pkt.src, pkt.sport)
+
+        if sender == client_conn:
+            print(f"-> [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}")
+        elif sender == server_conn:
+            print(f"<- [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}")
+
+        if tcp.flags == "S":  # SYN from client
+            print("-> [S]")
+            print(f"[*] New connection from {ip.src}:{tcp.sport} to {ip.dst}:{tcp.dport}")
+            client_conn = sender
+
+        if tcp.flags == "SA":  # SYN-ACK from server
+            if sender == server_conn:
+                print(f"[*] Connection to server established: {ip.src}:{tcp.sport} to {ip.dst}:{tcp.dport}")
+
+        if tcp.flags == "FA":  # FIN-ACK
+            if sender == client_conn:
+                client_sent_fin_ack = True
+            if sender == server_conn:
+                server_sent_fin_ack = True
+
+        if tcp.flags == "A":  # ACK
+            if sender == client_conn and server_sent_fin_ack:
+                server_closed = True
+            if sender == server_conn and client_sent_fin_ack:
+                client_closed = True
+
+        mitm.proxy(tuntap, pkt, sender, client_conn, server_conn)
+
+        if client_closed and server_closed:
+            print("[*] Both connections closed")
+            return
+
+
+if __name__ == "__main__":
+    # parse the ips, ports, and test case from the command line
+    pid, testcase, mitm_ip, mitm_port, remote_ip, server_port, *args = sys.argv[1:]
+
+    # load the module dynamically
+    module = importlib.import_module(testcase)
+    main(int(pid), mitm_ip, int(mitm_port), remote_ip, int(server_port), module.MITM(*args))
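To make the calling convention concrete, here is a hedged sketch of how a test process might launch this runner for the packet-dropping case and block until the SIGUSR1 readiness signal arrives. The paths, addresses, ports, and the 0.25 drop rate are all invented for illustration; the actual tests wire this up themselves:

# launch_runner_sketch.py -- illustrative only; all values are invented
import os
import signal
import subprocess

mitm_ready = False


def _on_ready(signum, frame):
    global mitm_ready
    mitm_ready = True


signal.signal(signal.SIGUSR1, _on_ready)

# argv order: pid, testcase, mitm_ip, mitm_port, remote_ip, server_port, *args.
# "drop" is resolved via importlib, and "0.25" is forwarded to MITM("0.25").
proc = subprocess.Popen(
    ["sudo", "python", "runner.py", str(os.getpid()),
     "drop", "10.0.0.1", "8080", "10.0.0.2", "9000", "0.25"]
)

while not mitm_ready:
    signal.pause()  # woken once runner.py has brought tun0 up

# ... bind the server under test to 10.0.0.2:9000, point the client at
# 10.0.0.1:8080, run the scenario ...
proc.wait()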
diff --git a/tests/pymod_ymq/config.py b/tests/pymod_ymq/config.py
new file mode 100644
index 000000000..e3b138516
--- /dev/null
+++ b/tests/pymod_ymq/config.py
@@ -0,0 +1,13 @@
+__all__ = ["ymq"]
+
+import sys
+import os
+
+file_path = os.path.realpath(__file__)
+joined_path = os.path.join(file_path, "..", "..", "..", "scaler", "io", "ymq")
+normed_path = os.path.normpath(joined_path)
+
+sys.path.append(normed_path)
+import ymq  # noqa: E402
+
+sys.path.pop()
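This shim lets the pymod tests import the compiled ymq extension straight out of the source tree: it pushes scaler/io/ymq onto sys.path just long enough to resolve the import, then pops that entry again so later imports cannot accidentally resolve against the extension's source directory. A test module would consume it along these lines (hypothetical usage; the actual test modules are not shown in this excerpt):

# test_sketch.py -- hypothetical consumer of the config shim
from config import ymq  # re-exported by config.py via __all__


def test_extension_importable():
    # If the extension was built, the import above has already succeeded;
    # the assertion just makes the dependency explicit.
    assert ymq is not None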
From c468afc637a26294eb3f638e56876b9b3af0166a Mon Sep 17 00:00:00 2001
From: magniloquency <197707854+magniloquency@users.noreply.github.com>
Date: Mon, 15 Sep 2025 21:53:44 -0400
Subject: [PATCH 3/3] Add more tests

Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>
---
 pyproject.toml                                |    1 +
 scaler/io/ymq/message_connection_tcp.cpp      |    2 +-
 scaler/io/ymq/tests/incomplete_identity.h     |   53 -
 scaler/io/ymq/third_party/concurrentqueue.h   | 7130 ++++++++---------
 scripts/lint.sh                               |   15 -
 tests/cc_ymq/common.h                         |    5 +-
 tests/cc_ymq/py_mitm/core.py                  |   54 -
 tests/cc_ymq/py_mitm/drop.py                  |   28 -
 tests/cc_ymq/py_mitm/main.py                  |    5 +-
 tests/cc_ymq/py_mitm/passthrough.py           |    5 +-
 tests/cc_ymq/py_mitm/randomly_drop_packets.py |    5 +-
 tests/cc_ymq/py_mitm/rst.py                   |   48 -
 tests/cc_ymq/py_mitm/runner.py                |  157 -
 tests/cc_ymq/py_mitm/send_rst_to_client.py    |    5 +-
 tests/cc_ymq/py_mitm/types.py                 |    2 +-
 15 files changed, 3416 insertions(+), 4099 deletions(-)
 delete mode 100644 scaler/io/ymq/tests/incomplete_identity.h
 delete mode 100755 scripts/lint.sh
 delete mode 100644 tests/cc_ymq/py_mitm/core.py
 delete mode 100644 tests/cc_ymq/py_mitm/drop.py
 delete mode 100644 tests/cc_ymq/py_mitm/rst.py
 delete mode 100644 tests/cc_ymq/py_mitm/runner.py

diff --git a/pyproject.toml b/pyproject.toml
index 9fb667b61..a5aef39a9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dev = [
     "flake8>=7.3.0",
     "flake8-pyproject>=1.2.3",
     "mypy>=1.17.1",
+    "scapy==2.*",
 ]

 [tool.scikit-build.metadata.version]
diff --git a/scaler/io/ymq/message_connection_tcp.cpp b/scaler/io/ymq/message_connection_tcp.cpp
index df2338292..8212238f1 100644
--- a/scaler/io/ymq/message_connection_tcp.cpp
+++ b/scaler/io/ymq/message_connection_tcp.cpp
@@ -267,7 +267,7 @@ void MessageConnectionTCP::updateReadOperation()
         _receivedReadOperations.pop();
         auto recvMessageCallback = std::move(_pendingRecvMessageCallbacks->front());
         _pendingRecvMessageCallbacks->pop();
-
+
         recvMessageCallback({Message(std::move(address), std::move(payload)), {}});
     } else {
         assert(_pendingRecvMessageCallbacks->size());
diff --git a/scaler/io/ymq/tests/incomplete_identity.h b/scaler/io/ymq/tests/incomplete_identity.h
deleted file mode 100644
index e8ea4f1ed..000000000
--- a/scaler/io/ymq/tests/incomplete_identity.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#pragma once
-
-#include
-#include
-
-#include "scaler/io/ymq/examples/common.h"
-#include "scaler/io/ymq/io_context.h"
-#include "tests/cc_ymq/common.h"
-
-void incomplete_identity_server_main()
-{
-    IOContext context(1);
-
-    auto socket = syncCreateSocket(context, IOSocketType::Binder, "server");
-    syncBindSocket(socket, "tcp://127.0.0.1:25715");
-    auto result = syncRecvMessage(socket);
-
-    assert(result.has_value());
-    assert(result->payload.as_string() == "yi er san si wu liu");
-
-    context.removeIOSocket(socket);
-}
-
-void incomplete_identity_client_main()
-{
-    // open a socket, write an incomplete identity and exit
-    {
-        TcpSocket socket;
-
-        socket.connect("127.0.0.1", 25715);
-
-        auto remote_identity = socket.read_message();
-        assert(remote_identity == "server");
-
-        // write incomplete identity and exit
-        std::string identity = "client";
-        uint64_t header = identity.length();
-        socket.write_all((char*)&header, 8);
-        socket.write_all(identity.data(), identity.length() - 2);
-        std::this_thread::sleep_for(3s);
-    }
-
-    // connect again and try to send a message
-    {
-        TcpSocket socket;
-        socket.connect("127.0.0.1", 25715);
-        auto remote_identity = socket.read_message();
-        assert(remote_identity == "server");
-        socket.write_message("client");
-        socket.write_message("yi er san si wu liu");
-        std::this_thread::sleep_for(3s);
-    }
-}
diff --git a/scaler/io/ymq/third_party/concurrentqueue.h b/scaler/io/ymq/third_party/concurrentqueue.h
index d5498b116..2fc775400 100644
--- a/scaler/io/ymq/third_party/concurrentqueue.h
+++ b/scaler/io/ymq/third_party/concurrentqueue.h
@@ -47,7 +47,7 @@
 // VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher
 // does not support `if constexpr`, so we have no choice but to simply disable the warning
 #pragma warning(push)
-#pragma warning(disable : 4127) // conditional expression
is constant +#pragma warning(disable: 4127) // conditional expression is constant #endif #if defined(__APPLE__) @@ -64,114 +64,81 @@ #undef malloc #undef free #else -#include // Requires C++11. Sorry VS2010. +#include // Requires C++11. Sorry VS2010. #include #endif -#include -#include -#include // for CHAR_BIT -#include // for max_align_t +#include // for max_align_t #include #include -#include -#include // used for thread exit synchronization -#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading #include +#include #include +#include +#include // for CHAR_BIT +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading +#include // used for thread exit synchronization // Platform-specific definitions of a numeric thread ID type and an invalid value -namespace moodycamel { -namespace details { -template -struct thread_id_converter { - typedef thread_id_t thread_id_numeric_size_t; - typedef thread_id_t thread_id_hash_t; - static thread_id_hash_t prehash(thread_id_t const& x) { return x; } -}; -} // namespace details -} // namespace moodycamel +namespace moodycamel { namespace details { + template struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } + }; +} } #if defined(MCDBGQ_USE_RELACY) -namespace moodycamel { -namespace details { -typedef std::uint32_t thread_id_t; -static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; -static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; -static inline thread_id_t thread_id() -{ - return rl::thread_index(); -} -} // namespace details -} // namespace moodycamel +namespace moodycamel { namespace details { + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; + static inline thread_id_t thread_id() { return rl::thread_index(); } +} } #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) // No sense pulling in windows.h in a header, we'll manually declare the function // we use and rely on backwards-compatibility for this not to break extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); -namespace moodycamel { -namespace details { -static_assert( - sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); -typedef std::uint32_t thread_id_t; -static const thread_id_t invalid_thread_id = - 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx -static const thread_id_t invalid_thread_id2 = - 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread - // IDs are presently multiples of 4. 
-static inline thread_id_t thread_id() -{ - return static_cast(::GetCurrentThreadId()); -} -} // namespace details -} // namespace moodycamel -#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || \ - defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL) -namespace moodycamel { -namespace details { -static_assert( - sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, - "std::thread::id is expected to be either 4 or 8 bytes"); - -typedef std::thread::id thread_id_t; -static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID - -// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's -// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't -// be. -static inline thread_id_t thread_id() -{ - return std::this_thread::get_id(); -} - -template -struct thread_id_size {}; -template <> -struct thread_id_size<4> { - typedef std::uint32_t numeric_t; -}; -template <> -struct thread_id_size<8> { - typedef std::uint64_t numeric_t; -}; - -template <> -struct thread_id_converter { - typedef thread_id_size::numeric_t thread_id_numeric_size_t; +namespace moodycamel { namespace details { + static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. + static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } +} } +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL) +namespace moodycamel { namespace details { + static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); + + typedef std::thread::id thread_id_t; + static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + + // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's + // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't + // be. 
+ static inline thread_id_t thread_id() { return std::this_thread::get_id(); } + + template struct thread_id_size { }; + template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; + template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; + + template<> struct thread_id_converter { + typedef thread_id_size::numeric_t thread_id_numeric_size_t; #ifndef __APPLE__ - typedef std::size_t thread_id_hash_t; + typedef std::size_t thread_id_hash_t; #else - typedef thread_id_numeric_size_t thread_id_hash_t; + typedef thread_id_numeric_size_t thread_id_hash_t; #endif - static thread_id_hash_t prehash(thread_id_t const& x) - { + static thread_id_hash_t prehash(thread_id_t const& x) + { #ifndef __APPLE__ - return std::hash()(x); + return std::hash()(x); #else - return *reinterpret_cast(&x); + return *reinterpret_cast(&x); #endif - } -}; -} -} + } + }; +} } #else // Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 // In order to get a numeric thread ID in a platform-independent way, we use a thread-local @@ -184,19 +151,12 @@ struct thread_id_converter { // Assume C++11 compliant compiler #define MOODYCAMEL_THREADLOCAL thread_local #endif -namespace moodycamel { -namespace details { -typedef std::uintptr_t thread_id_t; -static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr -static const thread_id_t invalid_thread_id2 = - 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. -inline thread_id_t thread_id() -{ - static MOODYCAMEL_THREADLOCAL int x; - return reinterpret_cast(&x); -} -} -} +namespace moodycamel { namespace details { + typedef std::uintptr_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr + static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. + inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } +} } #endif // Constexpr if @@ -212,19 +172,18 @@ inline thread_id_t thread_id() // Exceptions #ifndef MOODYCAMEL_EXCEPTIONS_ENABLED -#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || \ - (!defined(_MSC_VER) && !defined(__GNUC__)) +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) #define MOODYCAMEL_EXCEPTIONS_ENABLED #endif #endif #ifdef MOODYCAMEL_EXCEPTIONS_ENABLED -#define MOODYCAMEL_TRY try -#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) -#define MOODYCAMEL_RETHROW throw -#define MOODYCAMEL_THROW(expr) throw(expr) +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw (expr) #else -#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true) -#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false) +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) +#define MOODYCAMEL_CATCH(...) 
else MOODYCAMEL_CONSTEXPR_IF (false) #define MOODYCAMEL_RETHROW #define MOODYCAMEL_THROW(expr) #endif @@ -232,35 +191,21 @@ inline thread_id_t thread_id() #ifndef MOODYCAMEL_NOEXCEPT #if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) #define MOODYCAMEL_NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 // VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( // We have to assume *all* non-trivial constructors may throw on VS2012! #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ - (std::is_rvalue_reference::value && std::is_move_constructible::value ? \ - std::is_trivially_move_constructible::value : \ - std::is_trivially_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ - ((std::is_rvalue_reference::value && std::is_move_assignable::value ? \ - std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : \ - std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && \ - MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ - (std::is_rvalue_reference::value && std::is_move_constructible::value ? \ - std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : \ - std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ - ((std::is_rvalue_reference::value && std::is_move_assignable::value ? \ - std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : \ - std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && \ - MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? 
std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #else -#define MOODYCAMEL_NOEXCEPT noexcept -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) #endif #endif @@ -269,24 +214,18 @@ inline thread_id_t thread_id() #ifdef MCDBGQ_USE_RELACY #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #else -// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: -// http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't support thread_local either. Finally, iOS/ARM doesn't -// have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work -#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ - (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && \ - (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ - (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && \ - !defined(__MVS__) +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 +// g++ <=4.7 doesn't support thread_local either. +// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) // Assume `thread_local` is fully supported in all other C++11 compilers/platforms -#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having - // problems with it on +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having problems with it on #endif #endif #endif -// VS2012 doesn't support deleted functions. -// In this case, we declare the function normally but don't define it. A link error will be generated if the function is -// called. +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. 
#ifndef MOODYCAMEL_DELETE_FUNCTION #if defined(_MSC_VER) && _MSC_VER < 1800 #define MOODYCAMEL_DELETE_FUNCTION @@ -295,100 +234,54 @@ inline thread_id_t thread_id() #endif #endif -namespace moodycamel { -namespace details { +namespace moodycamel { namespace details { #ifndef MOODYCAMEL_ALIGNAS // VS2013 doesn't support alignas or alignof, and align() requires a constant literal #if defined(_MSC_VER) && _MSC_VER <= 1800 -#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) -#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) +#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) +#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) #define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned::value, T>::type -template -struct Vs2013Aligned {}; // default, unsupported alignment -template -struct Vs2013Aligned<1, T> { - typedef __declspec(align(1)) T type; -}; -template -struct Vs2013Aligned<2, T> { - typedef __declspec(align(2)) T type; -}; -template -struct Vs2013Aligned<4, T> { - typedef __declspec(align(4)) T type; -}; -template -struct Vs2013Aligned<8, T> { - typedef __declspec(align(8)) T type; -}; -template -struct Vs2013Aligned<16, T> { - typedef __declspec(align(16)) T type; -}; -template -struct Vs2013Aligned<32, T> { - typedef __declspec(align(32)) T type; -}; -template -struct Vs2013Aligned<64, T> { - typedef __declspec(align(64)) T type; -}; -template -struct Vs2013Aligned<128, T> { - typedef __declspec(align(128)) T type; -}; -template -struct Vs2013Aligned<256, T> { - typedef __declspec(align(256)) T type; -}; + template struct Vs2013Aligned { }; // default, unsupported alignment + template struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; + template struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; + template struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; + template struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; + template struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; + template struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; }; + template struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; + template struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; + template struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; #else -template -struct identity { - typedef T type; -}; -#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) -#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) + template struct identity { typedef T type; }; +#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) +#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) #define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity::type #endif #endif -} // namespace details -} // namespace moodycamel +} } + // TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one, // we can apply per-function compile-time suppression. 
// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer #define MOODYCAMEL_NO_TSAN #if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#undef MOODYCAMEL_NO_TSAN -#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) -#endif // TSAN -#endif // TSAN + #if __has_feature(thread_sanitizer) + #undef MOODYCAMEL_NO_TSAN + #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) + #endif // TSAN +#endif // TSAN // Compiler-specific likely/unlikely hints -namespace moodycamel { -namespace details { +namespace moodycamel { namespace details { #if defined(__GNUC__) -static inline bool(likely)(bool x) -{ - return __builtin_expect((x), true); -} -static inline bool(unlikely)(bool x) -{ - return __builtin_expect((x), false); -} + static inline bool (likely)(bool x) { return __builtin_expect((x), true); } + static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } #else -static inline bool(likely)(bool x) -{ - return x; -} -static inline bool(unlikely)(bool x) -{ - return x; -} + static inline bool (likely)(bool x) { return x; } + static inline bool (unlikely)(bool x) { return x; } #endif -} // namespace details -} // namespace moodycamel +} } #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG #include "internal/concurrentqueue_internal_debug.h" @@ -396,28 +289,28 @@ static inline bool(unlikely)(bool x) namespace moodycamel { namespace details { -template -struct const_numeric_max { - static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); - static const T value = std::numeric_limits::is_signed ? - (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) : - static_cast(-1); -}; + template + struct const_numeric_max { + static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits::is_signed + ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) + : static_cast(-1); + }; #if defined(__GLIBCXX__) -typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while #else -typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: #endif -// Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting -// 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. -typedef union { - std_max_align_t x; - long long y; - void* z; -} max_align_t; -} // namespace details + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. + typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} // Default traits for the ConcurrentQueue. To change some of the // traits without re-implementing all of them, inherit from this @@ -425,96 +318,99 @@ typedef union { // since the traits are used as a template type parameter, the // shadowed declarations will be used where defined, and the defaults // otherwise. -struct ConcurrentQueueDefaultTraits { - // General-purpose size type. std::size_t is strongly recommended. 
- typedef std::size_t size_t; - - // The type used for the enqueue and dequeue indices. Must be at least as - // large as size_t. Should be significantly larger than the number of elements - // you expect to hold at once, especially if you have a high turnover rate; - // for example, on 32-bit x86, if you expect to have over a hundred million - // elements or pump several million elements through your queue in a very - // short space of time, using a 32-bit type *may* trigger a race condition. - // A 64-bit int type is recommended in that case, and in practice will - // prevent a race condition no matter the usage of the queue. Note that - // whether the queue is lock-free with a 64-int type depends on the whether - // std::atomic is lock-free, which is platform-specific. - typedef std::size_t index_t; - - // Internally, all elements are enqueued and dequeued from multi-element - // blocks; this is the smallest controllable unit. If you expect few elements - // but many producers, a smaller block size should be favoured. For few producers - // and/or many elements, a larger block size is preferred. A sane default - // is provided. Must be a power of 2. - static const size_t BLOCK_SIZE = 32; - - // For explicit producers (i.e. when using a producer token), the block is - // checked for being empty by iterating through a list of flags, one per element. - // For large block sizes, this is too inefficient, and switching to an atomic - // counter-based approach is faster. The switch is made for block sizes strictly - // larger than this threshold. - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; - - // How many full blocks can be expected for a single explicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; - - // How many full blocks can be expected for a single implicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; - - // The initial size of the hash table mapping thread IDs to implicit producers. - // Note that the hash is resized every time it becomes half full. - // Must be a power of two, and either 0 or at least 1. If 0, implicit production - // (using the enqueue methods without an explicit producer token) is disabled. - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; - - // Controls the number of items that an explicit consumer (i.e. one with a token) - // must consume before it causes all consumers to rotate and move on to the next - // internal queue. - static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; - - // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. - // Enqueue operations that would cause this limit to be surpassed will fail. Note - // that this limit is enforced at the block level (for performance reasons), i.e. - // it's rounded up to the nearest block size. - static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; - - // The number of times to spin before sleeping when waiting on a semaphore. - // Recommended values are on the order of 1000-10000 unless the number of - // consumer threads exceeds the number of idle cores (in which case try 0-100). - // Only affects instances of the BlockingConcurrentQueue. 
- static const int MAX_SEMA_SPINS = 10000; - - // Whether to recycle dynamically-allocated blocks into an internal free list or - // not. If false, only pre-allocated blocks (controlled by the constructor - // arguments) will be recycled, and all others will be `free`d back to the heap. - // Note that blocks consumed by explicit producers are only freed on destruction - // of the queue (not following destruction of the token) regardless of this trait. - static const bool RECYCLE_ALLOCATED_BLOCKS = false; - +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. 
Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try 0-100). + // Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + + // Whether to recycle dynamically-allocated blocks into an internal free list or + // not. If false, only pre-allocated blocks (controlled by the constructor + // arguments) will be recycled, and all others will be `free`d back to the heap. + // Note that blocks consumed by explicit producers are only freed on destruction + // of the queue (not following destruction of the token) regardless of this trait. + static const bool RECYCLE_ALLOCATED_BLOCKS = false; + + #ifndef MCDBGQ_USE_RELACY - // Memory allocation can be customized if needed. - // malloc should return nullptr on failure, and handle alignment like std::malloc. + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. #if defined(malloc) || defined(free) - // Gah, this is 2015, stop defining macros that break standard code already! - // Work around malloc/free being special macros: - static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } - static inline void WORKAROUND_free(void* ptr) { return free(ptr); } - static inline void*(malloc)(size_t size) { return WORKAROUND_malloc(size); } - static inline void(free)(void* ptr) { return WORKAROUND_free(ptr); } + // Gah, this is 2015, stop defining macros that break standard code already! 
+ // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } #else - static inline void* malloc(size_t size) { return std::malloc(size); } - static inline void free(void* ptr) { return std::free(ptr); } + static inline void* malloc(size_t size) { return std::malloc(size); } + static inline void free(void* ptr) { return std::free(ptr); } #endif #else - // Debug versions when running under the Relacy race detector (ignore - // these in user code) - static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } - static inline void free(void* ptr) { return rl::rl_free(ptr, $); } + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void* ptr) { return rl::rl_free(ptr, $); } #endif }; + // When producing or consuming many elements, the most efficient way is to: // 1) Use one of the bulk-operation methods of the queue with a token // 2) Failing that, use the bulk-operation methods without a token @@ -525,3554 +421,3322 @@ struct ConcurrentQueueDefaultTraits { struct ProducerToken; struct ConsumerToken; -template -class ConcurrentQueue; -template -class BlockingConcurrentQueue; +template class ConcurrentQueue; +template class BlockingConcurrentQueue; class ConcurrentQueueTests; -namespace details { -struct ConcurrentQueueProducerTypelessBase { - ConcurrentQueueProducerTypelessBase* next; - std::atomic inactive; - ProducerToken* token; - - ConcurrentQueueProducerTypelessBase(): next(nullptr), inactive(false), token(nullptr) {} -}; - -template -struct _hash_32_or_64 { - static inline std::uint32_t hash(std::uint32_t h) - { - // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp - // Since the thread ID is already unique, all we really want to do is propagate that - // uniqueness evenly across all the bits, so that we can use a subset of the bits while - // reducing collisions significantly - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - return h ^ (h >> 16); - } -}; -template <> -struct _hash_32_or_64<1> { - static inline std::uint64_t hash(std::uint64_t h) - { - h ^= h >> 33; - h *= 0xff51afd7ed558ccd; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53; - return h ^ (h >> 33); - } -}; -template -struct hash_32_or_64: public _hash_32_or_64<(size > 4)> {}; - -static inline size_t hash_thread_id(thread_id_t id) -{ - static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); - return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( - thread_id_converter::prehash(id))); -} - -template -static inline bool circular_less_than(T a, T b) -{ - static_assert( - std::is_integral::value && !std::numeric_limits::is_signed, - "circular_less_than is intended to be used only with unsigned integer types"); - return static_cast(a - b) > static_cast(static_cast(1) << (static_cast(sizeof(T) * CHAR_BIT - 1))); - // Note: extra parens around rhs of operator<< is MSVC bug: - // https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 - // silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no 
effect when - // done here. -} - -template -static inline char* align_for(char* ptr) -{ - const std::size_t alignment = std::alignment_of::value; - return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; -} - -template -static inline T ceil_to_pow_2(T x) -{ - static_assert( - std::is_integral::value && !std::numeric_limits::is_signed, - "ceil_to_pow_2 is intended to be used only with unsigned integer types"); - - // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - for (std::size_t i = 1; i < sizeof(T); i <<= 1) { - x |= x >> (i << 3); - } - ++x; - return x; -} - -template -static inline void swap_relaxed(std::atomic& left, std::atomic& right) -{ - T temp = left.load(std::memory_order_relaxed); - left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); - right.store(temp, std::memory_order_relaxed); -} - -template -static inline T const& nomove(T const& x) -{ - return x; -} - -template -struct nomove_if { - template - static inline T const& eval(T const& x) - { - return x; - } -}; - -template <> -struct nomove_if { - template - static inline auto eval(U&& x) -> decltype(std::forward(x)) - { - return std::forward(x); - } -}; -template -static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) +namespace details { - return *it; -} - + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) + { + } + }; + + template struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } + }; + template<> struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } + }; + template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; + + static inline size_t hash_thread_id(thread_id_t id) + { + static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( + thread_id_converter::prehash(id))); + } + + template + static inline bool circular_less_than(T a, T b) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << (static_cast(sizeof(T) * CHAR_BIT - 1))); + // Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 + // silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here. 
+ } + + template + static inline char* align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } + + template + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static inline void swap_relaxed(std::atomic& left, std::atomic& right) + { + T temp = left.load(std::memory_order_relaxed); + left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); + right.store(temp, std::memory_order_relaxed); + } + + template + static inline T const& nomove(T const& x) + { + return x; + } + + template + struct nomove_if + { + template + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if + { + template + static inline auto eval(U&& x) + -> decltype(std::forward(x)) + { + return std::forward(x); + } + }; + + template + static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) + { + return *it; + } + #if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) -template -struct is_trivially_destructible: std::is_trivially_destructible {}; + template struct is_trivially_destructible : std::is_trivially_destructible { }; #else -template -struct is_trivially_destructible: std::has_trivial_destructor {}; + template struct is_trivially_destructible : std::has_trivial_destructor { }; #endif - + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #ifdef MCDBGQ_USE_RELACY -typedef RelacyThreadExitListener ThreadExitListener; -typedef RelacyThreadExitNotifier ThreadExitNotifier; + typedef RelacyThreadExitListener ThreadExitListener; + typedef RelacyThreadExitNotifier ThreadExitNotifier; #else -class ThreadExitNotifier; - -struct ThreadExitListener { - typedef void (*callback_t)(void*); - callback_t callback; - void* userData; - - ThreadExitListener* next; // reserved for use by the ThreadExitNotifier - ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier -}; - -class ThreadExitNotifier { -public: - static void subscribe(ThreadExitListener* listener) - { - auto& tlsInst = instance(); - std::lock_guard guard(mutex()); - listener->next = tlsInst.tail; - listener->chain = &tlsInst; - tlsInst.tail = listener; - } - - static void unsubscribe(ThreadExitListener* listener) - { - std::lock_guard guard(mutex()); - if (!listener->chain) { - return; // race with ~ThreadExitNotifier - } - auto& tlsInst = *listener->chain; - listener->chain = nullptr; - ThreadExitListener** prev = &tlsInst.tail; - for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { - if (ptr == listener) { - *prev = ptr->next; - break; - } - prev = &ptr->next; - } - } - -private: - ThreadExitNotifier(): tail(nullptr) {} - ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - - ~ThreadExitNotifier() - { - // This thread is about to exit, let everyone know! - assert( - this == &instance() && - "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that " - "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); - std::lock_guard guard(mutex()); - for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { - ptr->chain = nullptr; - ptr->callback(ptr->userData); - } - } - - // Thread-local - static inline ThreadExitNotifier& instance() - { - static thread_local ThreadExitNotifier notifier; - return notifier; - } - - static inline std::mutex& mutex() - { - // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called - static std::mutex mutex; - return mutex; - } - -private: - ThreadExitListener* tail; -}; -#endif -#endif + class ThreadExitNotifier; + + struct ThreadExitListener + { + typedef void (*callback_t)(void*); + callback_t callback; + void* userData; + + ThreadExitListener* next; // reserved for use by the ThreadExitNotifier + ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier + }; + + class ThreadExitNotifier + { + public: + static void subscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + std::lock_guard guard(mutex()); + listener->next = tlsInst.tail; + listener->chain = &tlsInst; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener* listener) + { + std::lock_guard guard(mutex()); + if (!listener->chain) { + return; // race with ~ThreadExitNotifier + } + auto& tlsInst = *listener->chain; + listener->chain = nullptr; + ThreadExitListener** prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier() : tail(nullptr) { } + ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + std::lock_guard guard(mutex()); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->chain = nullptr; + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier& instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + static inline std::mutex& mutex() + { + // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called + static std::mutex mutex; + return mutex; + } + + private: + ThreadExitListener* tail; + }; +#endif +#endif + + template struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template struct static_is_lock_free : static_is_lock_free_num::type> { }; + template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; + template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} -template -struct static_is_lock_free_num { - enum { value = 0 }; -}; -template <> -struct static_is_lock_free_num { - enum { value = ATOMIC_CHAR_LOCK_FREE }; -}; -template <> -struct static_is_lock_free_num { - enum { value = ATOMIC_SHORT_LOCK_FREE }; -}; -template <> -struct static_is_lock_free_num { - enum { value = ATOMIC_INT_LOCK_FREE }; -}; -template <> -struct static_is_lock_free_num { - enum { value = ATOMIC_LONG_LOCK_FREE }; -}; -template <> -struct static_is_lock_free_num { - enum { value = ATOMIC_LLONG_LOCK_FREE }; -}; -template -struct static_is_lock_free: static_is_lock_free_num::type> {}; -template <> -struct static_is_lock_free { - enum { value = ATOMIC_BOOL_LOCK_FREE }; -}; -template -struct static_is_lock_free { - enum { value = ATOMIC_POINTER_LOCK_FREE }; -}; -} // namespace details - -struct ProducerToken { - template - explicit ProducerToken(ConcurrentQueue& queue); - - template - explicit ProducerToken(BlockingConcurrentQueue& queue); - - ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT: producer(other.producer) - { - other.producer = nullptr; - if (producer != nullptr) { - producer->token = this; - } - } - - inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(producer, other.producer); - if (producer != nullptr) { - producer->token = this; - } - if (other.producer != nullptr) { - other.producer->token = &other; - } - } - - // A token is always valid unless: - // 1) Memory allocation failed during construction - // 2) It was moved via the move constructor - // (Note: assignment does a swap, leaving both potentially valid) - // 3) The associated queue was destroyed - // Note that if valid() returns true, that only indicates - // that the token is valid for use with a specific queue, - // but not which one; that's up to the user to track. 
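A minimal sketch of the token-validity rules enumerated in the comment below (case 2, moving via the move constructor); the queue element type and values are illustrative:

    #include <cassert>
    #include <utility>
    #include "concurrentqueue.h"  // assumed include path

    int main() {
        moodycamel::ConcurrentQueue<int> q;
        moodycamel::ProducerToken t1(q);
        assert(t1.valid());

        // Moving via the move constructor invalidates the source token...
        moodycamel::ProducerToken t2(std::move(t1));
        assert(t2.valid() && !t1.valid());
        q.enqueue(t2, 7);  // ...while the moved-to token stays usable with the same queue
    }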
- inline bool valid() const { return producer != nullptr; } - - ~ProducerToken() - { - if (producer != nullptr) { - producer->token = nullptr; - producer->inactive.store(true, std::memory_order_release); - } - } - - // Disable copying and assignment - ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue& queue); + + template + explicit ProducerToken(BlockingConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + private: - template - friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + protected: - details::ConcurrentQueueProducerTypelessBase* producer; + details::ConcurrentQueueProducerTypelessBase* producer; }; -struct ConsumerToken { - template - explicit ConsumerToken(ConcurrentQueue& q); - - template - explicit ConsumerToken(BlockingConcurrentQueue& q); - - ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT: initialOffset(other.initialOffset), - lastKnownGlobalOffset(other.lastKnownGlobalOffset), - itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), - currentProducer(other.currentProducer), - desiredProducer(other.desiredProducer) - { - } - - inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(initialOffset, other.initialOffset); - std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); - std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); - std::swap(currentProducer, other.currentProducer); - std::swap(desiredProducer, other.desiredProducer); - } - - // Disable copying and assignment - ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + template + explicit ConsumerToken(BlockingConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : 
initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; private: - template - friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - -private: // but shared with ConcurrentQueue - std::uint32_t initialOffset; - std::uint32_t lastKnownGlobalOffset; - std::uint32_t itemsConsumedFromCurrent; - details::ConcurrentQueueProducerTypelessBase* currentProducer; - details::ConcurrentQueueProducerTypelessBase* desiredProducer; + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; }; // Need to forward-declare this swap because it's in a namespace. -// See -// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces -template -inline void swap( - typename ConcurrentQueue::ImplicitProducerKVP& a, - typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; - -template -class ConcurrentQueue { +// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; + + +template +class ConcurrentQueue +{ public: - typedef ::moodycamel::ProducerToken producer_token_t; - typedef ::moodycamel::ConsumerToken consumer_token_t; - - typedef typename Traits::index_t index_t; - typedef typename Traits::size_t size_t; - - static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = - static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = - static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); - static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = - static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t 
EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable : 4307) // + integral constant overflow (that's what the ternary expression is for!) -#pragma warning(disable : 4309) // static_cast: Truncation of constant value +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) +#pragma warning(disable: 4309) // static_cast: Truncation of constant value #endif - static const size_t MAX_SUBQUEUE_SIZE = - (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? - details::const_numeric_max::value : - ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); #ifdef _MSC_VER #pragma warning(pop) #endif - static_assert( - !std::numeric_limits::is_signed && std::is_integral::value, - "Traits::size_t must be an unsigned integral type"); - static_assert( - !std::numeric_limits::is_signed && std::is_integral::value, - "Traits::index_t must be an unsigned integral type"); - static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); - static_assert( - (BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), - "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); - static_assert( - (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && - !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), - "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); - static_assert( - (EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), - "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); - static_assert( - (IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), - "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); - static_assert( - (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || - !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), - "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); - static_assert( - INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, - "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as 
Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); public: - // Creates a queue with at least `capacity` element slots; note that the - // actual number of elements that can be inserted without additional memory - // allocation depends on the number of producers and the block size (e.g. if - // the block size is equal to `capacity`, only a single block will be allocated - // up-front, which means only a single producer will be able to enqueue elements - // without an extra allocation -- blocks aren't shared between producers). - // This method is not thread safe -- it is up to the user to ensure that the - // queue is fully constructed before it starts being used by other threads (this - // includes making the memory effects of construction visible, possibly with a - // memory barrier). - explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) - : producerListTail(nullptr) - , producerCount(0) - , initialBlockPoolIndex(0) - , nextExplicitConsumerId(0) - , globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); - + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). 
+ explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - // Track all the producers using a fully-resolved typed list for - // each kind; this makes it possible to debug them starting from - // the root queue object (otherwise wacky casts are needed that - // don't compile in the debugger's expression evaluator). - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Computes the correct amount of pre-allocated blocks for you based - // on the minimum number of elements you want available at any given - // time, and the maximum concurrent number of each type of producer. - ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) - : producerListTail(nullptr) - , producerCount(0) - , initialBlockPoolIndex(0) - , nextExplicitConsumerId(0) - , globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + - 2 * (maxExplicitProducers + maxImplicitProducers); - populate_initial_block_list(blocks); - + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Note: The queue should not be accessed concurrently while it's - // being deleted. It's up to the user to synchronize this. - // This method is not thread safe. 
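Plugging illustrative numbers into the block formula of the three-argument constructor above, again assuming the default BLOCK_SIZE of 32:

    #include "concurrentqueue.h"  // assumed include path

    int main() {
        // blocks = (ceil(minCapacity / BLOCK_SIZE) - 1) * (maxExplicit + 1)
        //        + 2 * (maxExplicit + maxImplicit)
        // minCapacity = 1000, maxExplicit = 2, maxImplicit = 4:
        // (ceil(1000 / 32) - 1) * 3 + 2 * 6 = 31 * 3 + 12 = 105 blocks up front.
        moodycamel::ConcurrentQueue<int> q(1000, 2, 4);
    }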
- ~ConcurrentQueue() - { - // Destroy producers - auto ptr = producerListTail.load(std::memory_order_relaxed); - while (ptr != nullptr) { - auto next = ptr->next_prod(); - if (ptr->token != nullptr) { - ptr->token->producer = nullptr; - } - destroy(ptr); - ptr = next; - } - - // Destroy implicit producer hash tables - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) - { - auto hash = implicitProducerHash.load(std::memory_order_relaxed); - while (hash != nullptr) { - auto prev = hash->prev; - if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically - for (size_t i = 0; i != hash->capacity; ++i) { - hash->entries[i].~ImplicitProducerKVP(); - } - hash->~ImplicitProducerHash(); - (Traits::free)(hash); - } - hash = prev; - } - } - - // Destroy global free list - auto block = freeList.head_unsafe(); - while (block != nullptr) { - auto next = block->freeListNext.load(std::memory_order_relaxed); - if (block->dynamicallyAllocated) { - destroy(block); - } - block = next; - } - - // Destroy initial free list - destroy_array(initialBlockPool, initialBlockPoolSize); - } - - // Disable copying and copy assignment - ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - - // Moving is supported, but note that it is *not* a thread-safe operation. - // Nobody can use the queue while it's being moved, and the memory effects - // of that move must be propagated to other threads before they can use it. - // Note: When a queue is moved, its tokens are still valid but can only be - // used with the destination queue (i.e. semantically they are moved along - // with the queue itself). - ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), - producerCount(other.producerCount.load(std::memory_order_relaxed)), - initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), - initialBlockPool(other.initialBlockPool), - initialBlockPoolSize(other.initialBlockPoolSize), - freeList(std::move(other.freeList)), - nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), - globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) - { - // Move the other one into this, and leave the other one as an empty queue - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - swap_implicit_producer_hashes(other); - - other.producerListTail.store(nullptr, std::memory_order_relaxed); - other.producerCount.store(0, std::memory_order_relaxed); - other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); - other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); - + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. 
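A sketch of the move caveat documented above: nothing may touch either queue until the move's memory effects are visible, and tokens keep working but only against the destination queue (illustrative, single-threaded by construction):

    #include <utility>
    #include "concurrentqueue.h"  // assumed include path

    int main() {
        moodycamel::ConcurrentQueue<int> a;
        moodycamel::ProducerToken tok(a);
        a.enqueue(tok, 1);

        // Not thread-safe: no other thread may use `a` or `b` during the move.
        moodycamel::ConcurrentQueue<int> b(std::move(a));

        int v;
        b.try_dequeue(v);   // tok is now semantically tied to b, not a
        b.enqueue(tok, 2);
    }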
+ ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). + ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - - other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); - other.initialBlockPoolSize = 0; - other.initialBlockPool = nullptr; - - reown_producers(); - } - - inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT { return 
swap_internal(other); } - - // Swaps this queue's state with the other's. Not thread-safe. - // Swapping two queues does not invalidate their tokens, however - // the tokens that were created for one queue must be used with - // only the swapped queue (i.e. the tokens are tied to the - // queue's movable state, not the object itself). - inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT { swap_internal(other); } - + explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + private: - ConcurrentQueue& swap_internal(ConcurrentQueue& other) - { - if (this == &other) { - return *this; - } - - details::swap_relaxed(producerListTail, other.producerListTail); - details::swap_relaxed(producerCount, other.producerCount); - details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); - std::swap(initialBlockPool, other.initialBlockPool); - std::swap(initialBlockPoolSize, other.initialBlockPoolSize); - freeList.swap(other.freeList); - details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); - details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); - - swap_implicit_producer_hashes(other); - - reown_producers(); - other.reown_producers(); - + ConcurrentQueue& swap_internal(ConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - details::swap_relaxed(explicitProducers, other.explicitProducers); - details::swap_relaxed(implicitProducers, other.implicitProducers); + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); #endif - - return *this; - } - + + return *this; + } + public: - // Enqueues a single item (by copying it). - // Allocates memory if required. 
Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const& token, T const& item) { return inner_enqueue(token, item); } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Allocates memory if required. Only fails if memory allocation fails (or - // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved instead of copied. - // Thread-safe. - template - bool enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails - // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - // Enqueues a single item (by copying it). - // Does not allocate memory. Fails if not enough room to enqueue (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0). - // Thread-safe. - inline bool try_enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Thread-safe. 
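The allocation contract spelled out in the comments above, in one sketch; sizes are illustrative, and the bulk insert moves elements via std::make_move_iterator as the note suggests:

    #include <iterator>
    #include <vector>
    #include "concurrentqueue.h"  // assumed include path

    int main() {
        moodycamel::ConcurrentQueue<int> q(64);

        q.enqueue(1);       // may allocate; fails only on allocation failure or Traits limits
        q.try_enqueue(2);   // never allocates; returns false once pre-allocated room runs out

        std::vector<std::vector<int>> rows(3);
        moodycamel::ConcurrentQueue<std::vector<int>> q2;
        // Bulk insert, moving the vectors out of `rows` instead of copying them.
        q2.enqueue_bulk(std::make_move_iterator(rows.begin()), rows.size());
    }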
- inline bool try_enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - bool try_dequeue(U& item) - { - // Instead of simply trying each producer in turn (which could cause needless contention on the first - // producer), we score them heuristically. - size_t nonEmptyCount = 0; - ProducerBase* best = nullptr; - size_t bestSize = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; - ptr = ptr->next_prod()) { - auto size = ptr->size_approx(); - if (size > 0) { - if (size > bestSize) { - bestSize = size; - best = ptr; - } - ++nonEmptyCount; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (nonEmptyCount > 0) { - if ((details::likely)(best->dequeue(item))) { - return true; - } - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr != best && ptr->dequeue(item)) { - return true; - } - } - } - return false; - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // This differs from the try_dequeue(item) method in that this one does - // not attempt to reduce contention by interleaving the order that producer - // streams are dequeued from. So, using this method can reduce overall throughput - // under contention, but will give more predictable results in single-threaded - // consumer scenarios. This is mostly only useful for internal unit tests. 
- // Never allocates. Thread-safe. - template - bool try_dequeue_non_interleaved(U& item) - { - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->dequeue(item)) { - return true; - } - } - return false; - } - - // Attempts to dequeue from the queue using an explicit consumer token. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - bool try_dequeue(consumer_token_t& token, U& item) - { - // The idea is roughly as follows: - // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the - // highest efficiency consumer dictates the rotation speed of everyone else, more or less If you see that the - // global offset has changed, you must reset your consumption counter and move to your designated place If - // there's no items where you're supposed to be, keep moving until you find a producer with some items If the - // global offset has not changed but you've run out of items to consume, move over from your current position - // until you find an producer with something in it - - if (token.desiredProducer == nullptr || - token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return false; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (static_cast(token.currentProducer)->dequeue(item)) { - if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return true; - } - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - if (ptr->dequeue(item)) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = 1; - return true; - } - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return false; - } - - // Attempts to dequeue several elements from the queue. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - size_t try_dequeue_bulk(It itemFirst, size_t max) - { - size_t count = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - count += ptr->dequeue_bulk(itemFirst, max - count); - if (count == max) { - break; - } - } - return count; - } - - // Attempts to dequeue several elements from the queue using an explicit consumer token. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
- template - size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) - { - if (token.desiredProducer == nullptr || - token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return 0; - } - } - - size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); - if (count == max) { - if ((token.itemsConsumedFromCurrent += static_cast(max)) >= - EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return max; - } - token.itemsConsumedFromCurrent += static_cast(count); - max -= count; - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - auto dequeued = ptr->dequeue_bulk(itemFirst, max); - count += dequeued; - if (dequeued != 0) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = static_cast(dequeued); - } - if (dequeued == max) { - break; - } - max -= dequeued; - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return count; - } - - // Attempts to dequeue from a specific producer's inner queue. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns false if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) - { - return static_cast(producer.producer)->dequeue(item); - } - - // Attempts to dequeue several elements from a specific producer's inner queue. - // Returns the number of items actually dequeued. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns 0 if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) - { - return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); - } - - // Returns an estimate of the total number of elements currently in the queue. This - // estimate is only accurate if the queue has completely stabilized before it is called - // (i.e. all enqueue and dequeue operations have completed and their memory effects are - // visible on the calling thread, and no further operations start while this method is - // being called). - // Thread-safe. - size_t size_approx() const - { - size_t size = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - size += ptr->size_approx(); - } - return size; - } - - // Returns true if the underlying atomic variables used by - // the queue are lock-free (they should be on most platforms). - // Thread-safe. 
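A consumer-token sketch for the bulk dequeue path above; buffer size and item counts are illustrative:

    #include <cstddef>
    #include "concurrentqueue.h"  // assumed include path

    int main() {
        moodycamel::ConcurrentQueue<int> q;
        for (int i = 0; i < 100; ++i)
            q.enqueue(i);

        moodycamel::ConsumerToken ctok(q);
        int buf[32];
        std::size_t n;
        // Each call drains up to 32 items; the token carries the rotation state
        // (itemsConsumedFromCurrent, lastKnownGlobalOffset) across calls.
        while ((n = q.try_dequeue_bulk(ctok, buf, 32)) != 0) {
            // process buf[0..n)
        }
    }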
- static constexpr bool is_lock_free() - { - return details::static_is_lock_free::value == 2 && details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && details::static_is_lock_free::value == 2 && - details::static_is_lock_free< - typename details::thread_id_converter::thread_id_numeric_size_t>::value == 2; - } + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. 
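Since is_lock_free() above is constexpr in this version of the header, it can gate compilation on platforms whose atomics are not always lock-free; a minimal sketch:

    #include "concurrentqueue.h"  // assumed include path

    // Value 2 in the ATOMIC_*_LOCK_FREE macros means "always lock-free",
    // which is what is_lock_free() requires of every atomic the queue uses.
    static_assert(moodycamel::ConcurrentQueue<int>::is_lock_free(),
                  "this target lacks always-lock-free atomics for the queue");

    int main() {}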
+ inline bool try_enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U& item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. + size_t nonEmptyCount = 0; + ProducerBase* best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. 
+ // Returns false if all producer streams appeared empty at the time they
+ // were checked (so, the queue is likely but not guaranteed to be empty).
+ // This differs from the try_dequeue(item) method in that this one does
+ // not attempt to reduce contention by interleaving the order that producer
+ // streams are dequeued from. So, using this method can reduce overall throughput
+ // under contention, but will give more predictable results in single-threaded
+ // consumer scenarios. This is mostly only useful for internal unit tests.
+ // Never allocates. Thread-safe.
+ template<typename U>
+ bool try_dequeue_non_interleaved(U& item)
+ {
+ for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+ if (ptr->dequeue(item)) {
+ return true;
+ }
+ }
+ return false;
+ }
+ 
+ // Attempts to dequeue from the queue using an explicit consumer token.
+ // Returns false if all producer streams appeared empty at the time they
+ // were checked (so, the queue is likely but not guaranteed to be empty).
+ // Never allocates. Thread-safe.
+ template<typename U>
+ bool try_dequeue(consumer_token_t& token, U& item)
+ {
+ // The idea is roughly as follows:
+ // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less
+ // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place
+ // If there are no items where you're supposed to be, keep moving until you find a producer with some items
+ // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find a producer with something in it
+ 
+ if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+ if (!update_current_producer_after_rotation(token)) {
+ return false;
+ }
+ }
+ 
+ // If there was at least one non-empty queue but it appears empty at the time
+ // we try to dequeue from it, we need to make sure every queue's been tried
+ if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
+ if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+ globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+ }
+ return true;
+ }
+ 
+ auto tail = producerListTail.load(std::memory_order_acquire);
+ auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+ if (ptr == nullptr) {
+ ptr = tail;
+ }
+ while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+ if (ptr->dequeue(item)) {
+ token.currentProducer = ptr;
+ token.itemsConsumedFromCurrent = 1;
+ return true;
+ }
+ ptr = ptr->next_prod();
+ if (ptr == nullptr) {
+ ptr = tail;
+ }
+ }
+ return false;
+ }
+ 
+ // Attempts to dequeue several elements from the queue.
+ // Returns the number of items actually dequeued.
+ // Returns 0 if all producer streams appeared empty at the time they
+ // were checked (so, the queue is likely but not guaranteed to be empty).
+ // Never allocates. Thread-safe.
+ template<typename It>
+ size_t try_dequeue_bulk(It itemFirst, size_t max)
+ {
+ size_t count = 0;
+ for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+ count += ptr->dequeue_bulk(itemFirst, max - count);
+ if (count == max) {
+ break;
+ }
+ }
+ return count;
+ }
+ 
+ // Attempts to dequeue several elements from the queue using an explicit consumer token. 
+ // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
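A sketch of the producer-scoped fast path described above, assuming the consumer knows exactly which producer it wants to drain:

    #include "concurrentqueue.h"  // assumed include path

    int main() {
        moodycamel::ConcurrentQueue<int> q;
        moodycamel::ProducerToken ptok(q);
        for (int i = 0; i < 8; ++i)
            q.enqueue(ptok, i);

        int v;
        // Skips the scan over all producers that the general try_dequeue performs.
        while (q.try_dequeue_from_producer(ptok, v)) { /* consume v */ }
    }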
+ static constexpr bool is_lock_free() + { + return + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } + private: - friend struct ProducerToken; - friend struct ConsumerToken; - struct ExplicitProducer; - friend struct ExplicitProducer; - struct ImplicitProducer; - friend struct ImplicitProducer; - friend class ConcurrentQueueTests; - - enum AllocationMode { CanAlloc, CannotAlloc }; - - /////////////////////////////// - // Queue methods - /////////////////////////////// - - template - inline bool inner_enqueue(producer_token_t const& token, U&& element) - { - return static_cast(token.producer) - ->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue(U&& element) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr ? - false : - producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return static_cast(token.producer) - ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); - } - - template - inline bool inner_enqueue_bulk(It itemFirst, size_t count) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr ? - false : - producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); - } - - inline bool update_current_producer_after_rotation(consumer_token_t& token) - { - // Ah, there's been a rotation, figure out where we should be! - auto tail = producerListTail.load(std::memory_order_acquire); - if (token.desiredProducer == nullptr && tail == nullptr) { - return false; - } - auto prodCount = producerCount.load(std::memory_order_relaxed); - auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); - if ((details::unlikely)(token.desiredProducer == nullptr)) { - // Aha, first time we're dequeueing anything. - // Figure out our local position - // Note: offset is from start, not end, but we're traversing from end -- subtract from count first - std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); - token.desiredProducer = tail; - for (std::uint32_t i = 0; i != offset; ++i) { - token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - } - - std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; - if (delta >= prodCount) { - delta = delta % prodCount; - } - for (std::uint32_t i = 0; i != delta; ++i) { - token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - - token.lastKnownGlobalOffset = globalOffset; - token.currentProducer = token.desiredProducer; - token.itemsConsumedFromCurrent = 0; - return true; - } - - /////////////////////////// - // Free list - /////////////////////////// - - template - struct FreeListNode { - FreeListNode(): freeListRefs(0), freeListNext(nullptr) {} - - std::atomic freeListRefs; - std::atomic freeListNext; - }; - - // A simple CAS-based lock-free free list. 
Not the fastest thing in the world under heavy contention, but - // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly - // speedy under low contention. - template // N must inherit FreeListNode or have the same fields (and initialization of them) - struct FreeList { - FreeList(): freeListHead(nullptr) {} - FreeList(FreeList&& other): freeListHead(other.freeListHead.load(std::memory_order_relaxed)) - { - other.freeListHead.store(nullptr, std::memory_order_relaxed); - } - void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } - - FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - - inline void add(N* node) - { + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const& token, U&& element) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue(U&& element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. 
+ // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to - // set it using a fetch_add - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { - // Oh look! We were the last ones referencing this node, and we know - // we want to add it to the free list, so let's do it! - add_knowing_refcount_is_zero(node); - } - } - - inline N* try_get() - { + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! 
+ add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - auto head = freeListHead.load(std::memory_order_acquire); - while (head != nullptr) { - auto prevHead = head; - auto refs = head->freeListRefs.load(std::memory_order_relaxed); - if ((refs & REFS_MASK) == 0 || - !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) { - head = freeListHead.load(std::memory_order_acquire); - continue; - } - - // Good, reference count has been incremented (it wasn't at zero), which means we can read the - // next and not worry about it changing between now and the time we do the CAS - auto next = head->freeListNext.load(std::memory_order_relaxed); - if (freeListHead.compare_exchange_strong( - head, next, std::memory_order_acquire, std::memory_order_relaxed)) { - // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no - // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put - // back on). - assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); - - // Decrease refcount twice, once for our ref, and once for the list's ref - head->freeListRefs.fetch_sub(2, std::memory_order_release); - return head; - } - - // OK, the head must have changed on us, but we still need to decrease the refcount we increased. - // Note that we don't need to release any memory effects, but we do need to ensure that the reference - // count decrement happens-after the CAS on the head. - refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); - if (refs == SHOULD_BE_ON_FREELIST + 1) { - add_knowing_refcount_is_zero(prevHead); - } - } - - return nullptr; - } - - // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) - N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } - - private: - inline void add_knowing_refcount_is_zero(N* node) - { - // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run - // only one copy of this method per node at a time, i.e. the single thread case), then we know - // we can safely change the next pointer of the node; however, once the refcount is back above - // zero, then other threads could increase it (happens under heavy contention, when the refcount - // goes to zero in between a load and a refcount increment of a node in try_get, then back up to - // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS - // to add the node to the actual list fails, decrease the refcount and leave the add operation to - // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
- auto head = freeListHead.load(std::memory_order_relaxed); - while (true) { - node->freeListNext.store(head, std::memory_order_relaxed); - node->freeListRefs.store(1, std::memory_order_release); - if (!freeListHead.compare_exchange_strong( - head, node, std::memory_order_release, std::memory_order_relaxed)) { - // Hmm, the add failed, but we can only try again when the refcount goes back to zero - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) { - continue; - } - } - return; - } - } - - private: - // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under - // contention) - std::atomic freeListHead; - - static const std::uint32_t REFS_MASK = 0x7FFFFFFF; - static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; - + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
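// A compressed restatement of the protocol described above, as a standalone
// sketch. The constants mirror REFS_MASK / SHOULD_BE_ON_FREELIST defined later
// in this struct; the helper names are made up for the example. Bit 31 of the
// refcount word is the should-be-on-freelist flag, bits 0..30 count references.
#include <cstdint>

constexpr std::uint32_t kRefsMask           = 0x7FFFFFFF;
constexpr std::uint32_t kShouldBeOnFreeList = 0x80000000;

// In add(): the flag is known to be clear, so fetch_add(kShouldBeOnFreeList)
// both sets it and returns the prior refcount; zero means no one else holds a
// reference and the node can be linked immediately.
inline bool may_link_immediately(std::uint32_t prevWord)
{
    return prevWord == 0;
}

// In try_get(): after fetch_sub(1) drops a reference, a prior value of
// kShouldBeOnFreeList + 1 means the count just hit zero with the flag set, so
// the calling thread owns the deferred add performed in the loop below.
inline bool owns_deferred_add(std::uint32_t prevWord)
{
    return prevWord == kShouldBeOnFreeList + 1;
}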
+ auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugMutex mutex; -#endif - }; - - /////////////////////////// - // Block - /////////////////////////// - - enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; - - struct Block { - Block() - : next(nullptr) - , elementsCompletelyDequeued(0) - , freeListRefs(0) - , freeListNext(nullptr) - , dynamicallyAllocated(true) - { + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true) + { #ifdef MCDBGQ_TRACKMEM - owner = nullptr; -#endif - } - - template - inline bool is_empty() const - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Check flags - for (size_t i = 0; i < BLOCK_SIZE; ++i) { - if (!emptyFlags[i].load(std::memory_order_relaxed)) { - return false; - } - } - - // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - else - { - // Check counter - if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); - return false; - } - } - - // Returns true if the block is now empty (does not apply in explicit context) - template - inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Set flag - assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load( - std::memory_order_relaxed)); - emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store( - true, std::memory_order_release); - return false; - } - else - { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); - assert(prevVal < BLOCK_SIZE); - return prevVal == BLOCK_SIZE - 1; - } - } - - // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). - // Returns true if the block is now empty (does not apply in explicit context). 
- template - inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Set flags - std::atomic_thread_fence(std::memory_order_release); - i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; - for (size_t j = 0; j != count; ++j) { - assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); - emptyFlags[i + j].store(true, std::memory_order_relaxed); - } - return false; - } - else - { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel); - assert(prevVal + count <= BLOCK_SIZE); - return prevVal + count == BLOCK_SIZE; - } - } - - template - inline void set_all_empty() - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Set all flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(true, std::memory_order_relaxed); - } - } - else - { - // Reset counter - elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); - } - } - - template - inline void reset_empty() - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Reset flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(false, std::memory_order_relaxed); - } - } - else - { - // Reset counter - elementsCompletelyDequeued.store(0, std::memory_order_relaxed); - } - } - - inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT - { - return static_cast(static_cast(elements)) + - static_cast(idx & static_cast(BLOCK_SIZE - 1)); - } - inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT - { - return static_cast(static_cast(elements)) + - static_cast(idx & static_cast(BLOCK_SIZE - 1)); - } - - private: - static_assert( - std::alignment_of::value <= sizeof(T), - "The queue does not support types with an alignment greater than their size at this time"); - MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; - - public: - Block* next; - std::atomic elementsCompletelyDequeued; - std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; - - public: - std::atomic freeListRefs; - std::atomic freeListNext; - bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' - + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
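// Before the bulk variant introduced by the comment above, a side-by-side
// sketch of the two emptiness-tracking strategies Block switches between.
// Names with trailing underscores are made up for the example; the logic
// mirrors is_empty() earlier in this struct.
#include <atomic>
#include <cstddef>

constexpr std::size_t kBlockSize = 32;

// Small explicit-context blocks: one flag per slot, stored with release when a
// slot is dequeued; the block is empty only once every flag reads true, after
// which an acquire fence orders the preceding element reads.
std::atomic<bool> emptyFlags_[kBlockSize];

bool flags_say_empty()
{
    for (std::size_t i = 0; i < kBlockSize; ++i) {
        if (!emptyFlags_[i].load(std::memory_order_relaxed)) {
            return false;
        }
    }
    std::atomic_thread_fence(std::memory_order_acquire);
    return true;
}

// Large blocks and implicit context: a single counter of fully dequeued
// elements; the block is empty once the counter reaches kBlockSize.
std::atomic<std::size_t> dequeuedCount_ {0};

bool counter_says_empty()
{
    if (dequeuedCount_.load(std::memory_order_relaxed) == kBlockSize) {
        std::atomic_thread_fence(std::memory_order_acquire);
        return true;
    }
    return false;
}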
+ template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + + private: + static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + public: + std::atomic freeListRefs; + std::atomic freeListNext; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + #ifdef MCDBGQ_TRACKMEM - void* owner; + void* owner; #endif - }; - static_assert( - std::alignment_of::value >= std::alignment_of::value, - "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + #ifdef MCDBGQ_TRACKMEM public: - struct MemStats; - + struct MemStats; private: #endif - - /////////////////////////// - // Producer base - /////////////////////////// - - struct ProducerBase: public details::ConcurrentQueueProducerTypelessBase { - ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) - : tailIndex(0) - , headIndex(0) - , dequeueOptimisticCount(0) - , dequeueOvercommit(0) - , tailBlock(nullptr) - , isExplicit(isExplicit_) - , parent(parent_) - { - } - - virtual ~ProducerBase() {} - - template - inline bool dequeue(U& element) - { - if (isExplicit) { - return static_cast(this)->dequeue(element); - } else { - return static_cast(this)->dequeue(element); - } - } - - template - inline size_t dequeue_bulk(It& itemFirst, size_t max) - { - if (isExplicit) { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } else { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - } - - inline ProducerBase* next_prod() const { return static_cast(next); } - - inline size_t size_approx() const - { - auto tail = tailIndex.load(std::memory_order_relaxed); - auto head = headIndex.load(std::memory_order_relaxed); - return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; - } - - inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } - - protected: - std::atomic tailIndex; // Where to enqueue to next - std::atomic headIndex; // Where to dequeue from next - - std::atomic dequeueOptimisticCount; - std::atomic dequeueOvercommit; - - Block* tailBlock; - - public: - bool isExplicit; - ConcurrentQueue* parent; - - protected: + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) + { + } + + virtual ~ProducerBase() { } + + template + inline bool dequeue(U& element) + { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } + else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + else { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? 
static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - /////////////////////////// - // Explicit queue - /////////////////////////// - - struct ExplicitProducer: public ProducerBase { - explicit ExplicitProducer(ConcurrentQueue* parent_) - : ProducerBase(parent_, true) - , blockIndex(nullptr) - , pr_blockIndexSlotsUsed(0) - , pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1) - , pr_blockIndexFront(0) - , pr_blockIndexEntries(nullptr) - , pr_blockIndexRaw(nullptr) - { - size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; - if (poolBasedIndexSize > pr_blockIndexSize) { - pr_blockIndexSize = poolBasedIndexSize; - } - - new_block_index(0); // This creates an index with double the number of current entries, i.e. - // EXPLICIT_INITIAL_INDEX_SIZE - } - - ~ExplicitProducer() - { - // Destruct any elements not yet dequeued. - // Since we're in the destructor, we can assume all elements - // are either completely dequeued or completely not (no halfways). - if (this->tailBlock != nullptr) { // Note this means there must be a block index too - // First find the block that's partially dequeued, if any - Block* halfDequeuedBlock = nullptr; - if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { - // The head's not on a block boundary, meaning a block somewhere is partially dequeued - // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a - // boundary) - size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); - while (details::circular_less_than( - pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { - i = (i + 1) & (pr_blockIndexSize - 1); - } - assert( - details::circular_less_than( - pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); - halfDequeuedBlock = pr_blockIndexEntries[i].block; - } - - // Start at the head block (note the first line in the loop gives us the head from the tail on the first - // iteration) - auto block = this->tailBlock; - do { - block = block->next; - if (block->ConcurrentQueue::Block::template is_empty()) { - continue; - } - - size_t i = 0; // Offset into block - if (block == halfDequeuedBlock) { - i = static_cast( - this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - } - - // Walk through all the items in the block; if this is the tail block, we need to stop when we reach - // the tail index - auto lastValidIndex = - (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
- BLOCK_SIZE : - static_cast( - this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { - (*block)[i++]->~T(); - } - } while (block != this->tailBlock); - } - - // Destroy all blocks that we own - if (this->tailBlock != nullptr) { - auto block = this->tailBlock; - do { - auto nextBlock = block->next; - this->parent->add_block_to_free_list(block); - block = nextBlock; - } while (block != this->tailBlock); - } - - // Destroy the block indices - auto header = static_cast(pr_blockIndexRaw); - while (header != nullptr) { - auto prev = static_cast(header->prev); - header->~BlockIndexHeader(); - (Traits::free)(header); - header = prev; - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto startBlock = this->tailBlock; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - if (this->tailBlock != nullptr && - this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - // We can re-use the block ahead of us, it's empty! - this->tailBlock = this->tailBlock->next; - this->tailBlock->ConcurrentQueue::Block::template reset_empty(); - - // We'll put the block on the block index (guaranteed to be room since we're conceptually removing - // the last block from it first -- except instead of removing then adding, we can just overwrite). - // Note that there must be a valid block index here, since even if allocation failed in the ctor, - // it would have been re-attempted when adding the first block to the queue; since there is such - // a block, a block index must have been successfully allocated. - } else { - // Whatever head value we see here is >= the last value we saw here (relatively), - // and <= its current value. Since we have the most recent tail, the head must be - // <= to it. - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || - (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && - (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - // We can't enqueue in another block because there's not enough leeway -- the - // tail could surpass the head by the time the block fills up! (Or we'll exceed - // the size limit, if the second part of the condition was true.) - return false; - } - // We're going to need a new block; check that the block index has room - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { - // Hmm, the circular block index is already full -- we'll need - // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if - // the initial allocation failed in the constructor. 
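// The allocMode knob tested just below is what separates the two public entry
// points: enqueue() runs the CanAlloc path (and may grow the block index),
// while try_enqueue() runs CannotAlloc and fails instead of allocating. A
// usage sketch; the include path is an assumption.
#include "concurrentqueue.h"

#include <cassert>

int main()
{
    // Pre-size for 64 elements so the CannotAlloc path has room to succeed.
    moodycamel::ConcurrentQueue<int> q(64);

    bool ok = q.try_enqueue(1); // CannotAlloc: fails rather than allocate
    assert(ok);

    ok = q.enqueue(2);          // CanAlloc: allocates new blocks/index if needed
    assert(ok);
    return 0;
}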
- - MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) - { - return false; - } - else if (!new_block_index(pr_blockIndexSlotsUsed)) - { - return false; - } - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - return false; - } + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). + if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + this->parent->add_block_to_free_list(block); + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. + } + else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. 
+ + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - ++pr_blockIndexSlotsUsed; - } - - MOODYCAMEL_CONSTEXPR_IF( - !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) - { - // The constructor may throw. We want the element not to appear in the queue in - // that case (without corrupting the queue): - MOODYCAMEL_TRY - { - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH(...) - { - // Revert change to the current block, but leave the new block available - // for next time - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock; - MOODYCAMEL_RETHROW; - } - } - else - { - (void)startBlock; - (void)originalBlockIndexSlotsUsed; - } - - // Add block to block index - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - - MOODYCAMEL_CONSTEXPR_IF( - !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) - { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than( - this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - // Might be something to dequeue, let's give it a try - - // Note that this if is purely for performance purposes in the common case when the queue is - // empty and the values are eventually consistent -- we may enter here spuriously. - - // Note that whatever the values of overcommit and tail are, they are not going to change (unless we - // change them) and must be the same value at this point (inside the if) as when the if condition was - // evaluated. - - // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit - // below. This ensures that whatever the value we got loaded into overcommit, the load of - // dequeueOptisticCount in the fetch_add below will result in a value at least as recent as that (and - // therefore at least as large). Note that I believe a compiler (signal) fence here would be sufficient - // due to the nature of fetch_add (all read-modify-write operations are guaranteed to work on the latest - // value in the modification order), but unfortunately that can't be shown to be correct using only the - // C++11 standard. 
See - // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case - std::atomic_thread_fence(std::memory_order_acquire); - - // Increment optimistic counter, then check if it went over the boundary - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - - // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is - // only ever incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), - // and since we now have a version of dequeueOptimisticCount that is at least as recent as overcommit - // (due to the release upon incrementing dequeueOvercommit and the acquire above that synchronizes with - // it), overcommit <= myDequeueCount. However, we can't assert this since both dequeueOptimisticCount - // and dequeueOvercommit may (independently) overflow; in such a case, though, the logic still holds - // since the difference between the two is maintained. - - // Note that we reload tail here in case it changed; it will be the same value as before or greater, - // since this load is sequenced after (happens after) the earlier load above. This is supported by - // read-read coherency (as defined in the standard), explained here: - // http://en.cppreference.com/w/cpp/atomic/memory_order - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - // Guaranteed to be at least one element to dequeue! - - // Get the index. Note that since there's guaranteed to be at least one element, this - // will never exceed tail. We need to do an acquire-release fence here since it's possible - // that whatever condition got us to this point was for an earlier enqueued element (that - // we already see the memory effects for), but that by the time we increment somebody else - // has incremented it, and we need to see the memory effects for *that* element, which is - // in such a case is necessarily visible on the thread that incremented it in the first - // place with the more current condition (they must have acquired a tail that is at least - // as recent). - auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - // Determine which block the element is in - - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - // We need to be careful here about subtracting and dividing because of index wrap-around. 
- // When an index wraps, we need to preserve the sign of the offset when dividing it by the - // block size (in order to get a correct signed block count offset in all cases): - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast( - static_cast::type>(blockBaseIndex - headBase) / - static_cast::type>(BLOCK_SIZE)); - auto block = - localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; - - // Dequeue - auto& el = *((*block)[index]); - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { - // Make sure the element is still fully dequeued and destroyed even if the assignment - // throws - struct Guard { - Block* block; - index_t index; - - ~Guard() - { - (*block)[index]->~T(); - block->ConcurrentQueue::Block::template set_empty(index); - } - } guard = {block, index}; - - element = std::move(el); // NOLINT - } else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - block->ConcurrentQueue::Block::template set_empty(index); - } - - return true; - } else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add( - 1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is - // guaranteed to happen before this write - } - } - - return false; - } - - template - bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - auto originalBlockIndexFront = pr_blockIndexFront; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - - Block* firstAllocatedBlock = nullptr; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { - // Allocate as many blocks as possible from ahead - while (blockBaseDiff > 0 && this->tailBlock != nullptr && - this->tailBlock->next != firstAllocatedBlock && - this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - this->tailBlock = this->tailBlock->next; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Now allocate as many blocks as necessary from the block pool - while (blockBaseDiff > 0) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || - (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && - (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { - MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) - { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - else if (full || !new_block_index(originalBlockIndexSlotsUsed)) - { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - - // pr_blockIndexFront is updated inside new_block_index, so we need to - // update our fallback value too (since we keep the new index even if we - // later fail) - originalBlockIndexFront = originalBlockIndexSlotsUsed; - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + // The constructor may throw. We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common case when the queue is + // empty and the values are eventually consistent -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, they are not going to change (unless we + // change them) and must be the same value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. + // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in + // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). + // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all + // read-modify-write operations are guaranteed to work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only the C++11 standard. + // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever + // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now + // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. + // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) + // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. + + // Note that we reload tail here in case it changed; it will be the same value as before or greater, since + // this load is sequenced after (happens after) the earlier load above. 
This is supported by read-read + // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least one element, this + // will never exceed tail. We need to do an acquire-release fence here since it's possible + // that whatever condition got us to this point was for an earlier enqueued element (that + // we already see the memory effects for), but that by the time we increment somebody else + // has incremented it, and we need to see the memory effects for *that* element, which is + // in such a case is necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a tail that is at least + // as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because of index wrap-around. + // When an index wraps, we need to preserve the sign of the offset when dividing it by the + // block size (in order to get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); + auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; + + // Dequeue + auto& el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even if the assignment + // throws + struct Guard { + Block* block; + index_t index; + + ~Guard() + { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty(index); + } + } guard = { block, index }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty(index); + } + + return true; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template set_all_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; - - ++pr_blockIndexSlotsUsed; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and - // publish the new block index front - auto block = firstAllocatedBlock; - while (true) { - block->ConcurrentQueue::Block::template reset_empty(); - if (block == this->tailBlock) { - break; - } - block = block->next; - } - - MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( - T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) - { - blockIndex.load(std::memory_order_relaxed) - ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - auto endBlock = this->tailBlock; - this->tailBlock = startBlock; - assert( - (startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || - count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = - (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( - T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) - { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else - { - MOODYCAMEL_TRY - { - while (currentTailIndex != stopIndex) { - // Must use copy constructor even if move constructor is available - // because we may have to revert if there's an exception. - // Sorry about the horrible templated next line, but it was the only way - // to disable moving *at compile time*, which is important because a type - // may only define a (noexcept) move constructor, and so calls to the - // cctor will not compile, even if they are in an if branch that will never - // be executed - new ((*this->tailBlock)[currentTailIndex]) - T(details::nomove_if(nullptr)) - T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH(...) - { - // Oh dear, an exception's been thrown -- destroy the elements that - // were enqueued so far and revert the entire bulk operation (we'll keep - // any allocated blocks in our linked list for later, though). - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - - MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( - T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) - { - if (firstAllocatedBlock != nullptr) - blockIndex.load(std::memory_order_relaxed) - ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = - static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Determine which block the first element is in - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast( - static_cast::type>(firstBlockBaseIndex - headBase) / - static_cast::type>(BLOCK_SIZE)); - auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); - - // Iterate the blocks and dequeue - auto index = firstIndex; - do { - auto firstIndexInBlock = index; - index_t endIndex = - (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than( - firstIndex + static_cast(actualCount), endIndex) ? 
- firstIndex + static_cast(actualCount) : - endIndex; - auto block = localBlockIndex->entries[indexIndex].block; - if (MOODYCAMEL_NOEXCEPT_ASSIGN( - T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } else { - MOODYCAMEL_TRY - { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH(...) - { - // It's too late to revert the dequeue, but we can make sure that all - // the dequeued objects are properly destroyed and the block index - // (and empty count) are properly updated before we propagate the exception - do { - block = localBlockIndex->entries[indexIndex].block; - while (index != endIndex) { - (*block)[index++]->~T(); - } - block->ConcurrentQueue::Block::template set_many_empty( - firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - - firstIndexInBlock = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than( - firstIndex + static_cast(actualCount), endIndex) ? - firstIndex + static_cast(actualCount) : - endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - block->ConcurrentQueue::Block::template set_many_empty( - firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - struct BlockIndexEntry { - index_t base; - Block* block; - }; - - struct BlockIndexHeader { - size_t size; - std::atomic front; // Current slot (not next, like pr_blockIndexFront) - BlockIndexEntry* entries; - void* prev; - }; - - bool new_block_index(size_t numberOfFilledSlotsToExpose) - { - auto prevBlockSizeMask = pr_blockIndexSize - 1; - - // Create the new block - pr_blockIndexSize <<= 1; - auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + - std::alignment_of::value - 1 + - sizeof(BlockIndexEntry) * pr_blockIndexSize)); - if (newRawPtr == nullptr) { - pr_blockIndexSize >>= 1; // Reset to allow graceful retry - return false; - } - - auto newBlockIndexEntries = reinterpret_cast( - details::align_for(newRawPtr + sizeof(BlockIndexHeader))); - - // Copy in all the old indices, if any - size_t j = 0; - if (pr_blockIndexSlotsUsed != 0) { - auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; - do { - newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; - i = (i + 1) & prevBlockSizeMask; - } while (i != pr_blockIndexFront); - } - - // Update everything - auto header = new (newRawPtr) BlockIndexHeader; - header->size = pr_blockIndexSize; - header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); - header->entries = newBlockIndexEntries; - header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later - - pr_blockIndexFront = j; - pr_blockIndexEntries = newBlockIndexEntries; - pr_blockIndexRaw = newRawPtr; - blockIndex.store(header, std::memory_order_release); - - return true; - } - - private: - std::atomic blockIndex; - - // To 
be used by producer only -- consumer must use the ones in referenced by blockIndex - size_t pr_blockIndexSlotsUsed; - size_t pr_blockIndexSize; - size_t pr_blockIndexFront; // Next slot (not current) - BlockIndexEntry* pr_blockIndexEntries; - void* pr_blockIndexRaw; - + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). 
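The rollback that follows is the classic construct-then-unwind shape: remember how far construction got, restore the producer's pre-call state, then destroy exactly the elements already built. Reduced to a plain array (a hypothetical helper; the real code below additionally rewinds the block-index state):

#include <cstddef>
#include <new>

template <typename T, typename It>
void construct_range_with_rollback(T* dst, It first, std::size_t count)
{
    std::size_t constructed = 0;
    try {
        for (; constructed < count; ++constructed, ++first)
            new (dst + constructed) T(*first); // copy, so rollback stays possible
    } catch (...) {
        // Destroy only what was actually constructed, then rethrow.
        while (constructed > 0)
            dst[--constructed].~T();
        throw;
    }
}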
+ auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + if (firstAllocatedBlock != nullptr) + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) { + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer 
only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ExplicitProducer* nextExplicitProducer; - - private: + public: + ExplicitProducer* nextExplicitProducer; + private: #endif - + #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - ////////////////////////////////// - // Implicit queue - ////////////////////////////////// - - struct ImplicitProducer: public ProducerBase { - ImplicitProducer(ConcurrentQueue* parent_) - : ProducerBase(parent_, false), nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), blockIndex(nullptr) - { - new_block_index(); - } - - ~ImplicitProducer() - { - // Note that since we're in the destructor we can assume that all enqueue/dequeue operations - // completed already; this means that all undequeued elements are placed contiguously across - // contiguous blocks, and that only the first and last remaining blocks can be only partially - // empty (all other remaining blocks must be completely full). - + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { + // Note that since we're in the destructor we can assume that all enqueue/dequeue operations + // completed already; this means that all undequeued elements are placed contiguously across + // contiguous blocks, and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - // Unregister ourselves for thread termination notification - if (!this->inactive.load(std::memory_order_relaxed)) { - details::ThreadExitNotifier::unsubscribe(&threadExitListener); - } -#endif - - // Destroy all remaining elements! - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto index = this->headIndex.load(std::memory_order_relaxed); - Block* block = nullptr; - assert(index == tail || details::circular_less_than(index, tail)); - bool forceFreeLastBlock = - index != tail; // If we enter the loop, then the last (tail) block will not be freed - while (index != tail) { - if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { - if (block != nullptr) { - // Free the old block - this->parent->add_block_to_free_list(block); - } - - block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); - } - - ((*block)[index])->~T(); - ++index; - } - // Even if the queue is empty, there's still one block that's not on the free list - // (unless the head index reached the end of it, in which case the tail will be poised - // to create a new block). 
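Viewed on its own, the destructor's sweep over the surviving [head, tail) range is a linear walk that swaps blocks on BLOCK_SIZE boundaries. A sketch under the quiescence assumption the comment above states (all names here are stand-ins):

#include <cstdint>

constexpr std::uint64_t kBlockSize = 32; // stand-in for BLOCK_SIZE (a power of two)

template <typename Destroy, typename SwitchBlock>
void drain_remaining(std::uint64_t head, std::uint64_t tail, Destroy destroy, SwitchBlock switchBlock)
{
    for (std::uint64_t i = head; i != tail; ++i) {
        if (i == head || (i & (kBlockSize - 1)) == 0)
            switchBlock(i); // entered a new block; the previous one can be freed
        destroy(i);         // run the element's destructor at index i
    }
}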
- if (this->tailBlock != nullptr && - (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { - this->parent->add_block_to_free_list(this->tailBlock); - } - - // Destroy block index - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - if (localBlockIndex != nullptr) { - for (size_t i = 0; i != localBlockIndex->capacity; ++i) { - localBlockIndex->index[i]->~BlockIndexEntry(); - } - do { - auto prev = localBlockIndex->prev; - localBlockIndex->~BlockIndexHeader(); - (Traits::free)(localBlockIndex); - localBlockIndex = prev; - } while (localBlockIndex != nullptr); - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || - (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && - (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - return false; - } + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
+ if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry; - if (!insert_block_index_entry(idxEntry, currentTailIndex)) { - return false; - } - - // Get ahold of a new block - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - return false; - } + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - - MOODYCAMEL_CONSTEXPR_IF( - !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) - { - // May throw, try to insert now before we publish the fact that we have this new block - MOODYCAMEL_TRY - { - new ((*newBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH(...) 
- { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(newBlock); - MOODYCAMEL_RETHROW; - } - } - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - this->tailBlock = newBlock; - - MOODYCAMEL_CONSTEXPR_IF( - !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) - { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - // See ExplicitProducer::dequeue for rationale and explanation - index_t tail = this->tailIndex.load(std::memory_order_relaxed); - index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than( - this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - std::atomic_thread_fence(std::memory_order_acquire); - - index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - // Determine which block the element is in - auto entry = get_block_index_entry_for_index(index); - - // Dequeue - auto block = entry->value.load(std::memory_order_relaxed); - auto& el = *((*block)[index]); - - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) 
{ + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - // Note: Acquiring the mutex with every dequeue instead of only when a block - // is released is very sub-optimal, but it is, after all, purely debug code. - debug::DebugLock lock(producer->mutex); -#endif - struct Guard { - Block* block; - index_t index; - BlockIndexEntry* entry; - ConcurrentQueue* parent; - - ~Guard() - { - (*block)[index]->~T(); - if (block->ConcurrentQueue::Block::template set_empty(index)) { - entry->value.store(nullptr, std::memory_order_relaxed); - parent->add_block_to_free_list(block); - } - } - } guard = {block, index, entry, this->parent}; - - element = std::move(el); // NOLINT - } else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - - if (block->ConcurrentQueue::Block::template set_empty(index)) { - { + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. 
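The struct Guard defined just below (a twin of the one in the explicit producer's dequeue earlier) is destructor-driven cleanup: the element must be destroyed, and its slot marked empty, even when the caller's move-assignment throws. The bare pattern, minus the block bookkeeping:

#include <utility>

// Sketch: the slot is consumed no matter how control leaves move_out().
template <typename T>
struct ConsumeGuard {
    T* element;
    ~ConsumeGuard() { element->~T(); } // runs on normal exit *and* during unwinding
};

template <typename T>
void move_out(T& slot, T& out)
{
    ConsumeGuard<T> guard{&slot};
    out = std::move(slot); // may throw; the guard still destroys the slot
}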
+ debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = { block, index, entry, this->parent }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty(index)) { + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Add the block back into the global free pool (and remove from block index) - entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - } - - return true; - } else { - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); - } - } - - return false; - } - + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + } + + return true; + } + else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable : 4706) // assignment within conditional expression -#endif - template - bool enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - - // Note that the tailBlock we start off with may not be owned by us any more; - // this happens if it was filled up exactly to the top (setting tailIndex to - // the first index of the next block which is not yet allocated), then dequeued - // completely (putting it on the free list) before we enqueue again. - - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - Block* firstAllocatedBlock = nullptr; - auto endBlock = this->tailBlock; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { +#pragma warning(disable: 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. 
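A micro-pattern worth naming before the index arithmetic below: every capacity in this file is a power of two, so wrapping an index is a single AND with capacity - 1 rather than a modulo. Standalone (a sketch):

#include <cstddef>

// For a power-of-two capacity, (i & (capacity - 1)) == (i % capacity);
// e.g. with capacity 8, slot 7 advances to slot 0.
std::size_t next_slot(std::size_t current, std::size_t capacity)
{
    return (current + 1) & (capacity - 1);
}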
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - do { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry = - nullptr; // initialization here unnecessary but compiler can't always tell - Block* newBlock; - bool indexInserted = false; - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || - (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && - (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - - if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || - (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == - nullptr) { - // Index allocation or block allocation failed; revert any other allocations - // and index insertions done so far for this operation - if (indexInserted) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - } - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - - return false; - } - + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + + if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + 
idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - newBlock->next = nullptr; - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - // Store the chain of blocks so that we can undo if later allocations fail, - // and so that we can find the blocks when we do the actual enqueueing - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || - firstAllocatedBlock != nullptr) { - assert(this->tailBlock != nullptr); - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - endBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; - } while (blockBaseDiff > 0); - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - this->tailBlock = startBlock; - assert( - (startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || - count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = - (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( - T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) - { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else - { - MOODYCAMEL_TRY - { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex]) - T(details::nomove_if(nullptr)) - T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH(...) 
- { - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - auto idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) 
{ + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } #ifdef _MSC_VER #pragma warning(pop) #endif - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = - static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Iterate the blocks and dequeue - auto index = firstIndex; - BlockIndexHeader* localBlockIndex; - auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); - do { - auto blockStartIndex = index; - index_t endIndex = - (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than( - firstIndex + static_cast(actualCount), endIndex) ? 
- firstIndex + static_cast(actualCount) : - endIndex; - - auto entry = localBlockIndex->index[indexIndex]; - auto block = entry->value.load(std::memory_order_relaxed); - if (MOODYCAMEL_NOEXCEPT_ASSIGN( - T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } else { - MOODYCAMEL_TRY - { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH(...) - { - do { - entry = localBlockIndex->index[indexIndex]; - block = entry->value.load(std::memory_order_relaxed); - while (index != endIndex) { - (*block)[index++]->~T(); - } - - if (block->ConcurrentQueue::Block::template set_many_empty( - blockStartIndex, static_cast(endIndex - blockStartIndex))) { + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) 
{ + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - entry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(block); - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - - blockStartIndex = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than( - firstIndex + static_cast(actualCount), endIndex) ? - firstIndex + static_cast(actualCount) : - endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - if (block->ConcurrentQueue::Block::template set_many_empty( - blockStartIndex, static_cast(endIndex - blockStartIndex))) { - { + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Note that the set_many_empty above did a release, meaning that anybody who acquires - // the block we're about to free can use it safely since our writes (and reads!) will - // have happened-before then. 
- entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } else { - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - // The block size must be > 1, so any number with the low bit set is an invalid block base index - static const index_t INVALID_BLOCK_BASE = 1; - - struct BlockIndexEntry { - std::atomic key; - std::atomic value; - }; - - struct BlockIndexHeader { - size_t capacity; - std::atomic tail; - BlockIndexEntry* entries; - BlockIndexEntry** index; - BlockIndexHeader* prev; - }; - - template - inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) - { - auto localBlockIndex = - blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK - if (localBlockIndex == nullptr) { - return false; // this can happen if new_block_index failed in the constructor - } - size_t newTail = - (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || - idxEntry->value.load(std::memory_order_relaxed) == nullptr) { - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - - // No room in the old block index, try to allocate another one! - MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) - { - return false; - } - else if (!new_block_index()) - { - return false; - } - else - { - localBlockIndex = blockIndex.load(std::memory_order_relaxed); - newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - } - - inline void rewind_block_index_tail() - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - localBlockIndex->tail.store( - (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), - std::memory_order_relaxed); - } - - inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const - { - BlockIndexHeader* localBlockIndex; - auto idx = get_block_index_index_for_index(index, localBlockIndex); - return localBlockIndex->index[idx]; - } - - inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const - { + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires the block + // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
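The happens-before comment above is the standard C++ release/acquire guarantee rather than anything specific to this queue. As a minimal self-contained sketch (illustrative names, not taken from this patch): writes made before a release store become visible to any thread whose acquire load observes that store.

    #include <atomic>
    #include <cassert>

    int payload = 0;                   // plain, non-atomic data
    std::atomic<bool> ready{false};

    void producer() {
        payload = 42;                                  // happens-before the release store below
        ready.store(true, std::memory_order_release);  // publish
    }

    void consumer() {
        while (!ready.load(std::memory_order_acquire)) {}  // acquire synchronizes-with the release
        assert(payload == 42);                             // guaranteed visible
    }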
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the constructor + } + size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index()) { + return false; + } + else { + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - index &= ~static_cast(BLOCK_SIZE - 1); - localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto tail = localBlockIndex->tail.load(std::memory_order_acquire); - auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); - assert(tailBase != INVALID_BLOCK_BASE); - // Note: Must use division instead of shift because the index may wrap around, causing a negative - // offset, whose negativity we want to preserve - auto offset = static_cast( - static_cast::type>(index - tailBase) / - static_cast::type>(BLOCK_SIZE)); - size_t idx = (tail + offset) & 
(localBlockIndex->capacity - 1); - assert( - localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && - localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); - return idx; - } - - bool new_block_index() - { - auto prev = blockIndex.load(std::memory_order_relaxed); - size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; - auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; - auto raw = static_cast( - (Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + - sizeof(BlockIndexEntry) * entryCount + std::alignment_of::value - 1 + - sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); - if (raw == nullptr) { - return false; - } - - auto header = new (raw) BlockIndexHeader; - auto entries = - reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); - auto index = reinterpret_cast(details::align_for( - reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); - if (prev != nullptr) { - auto prevTail = prev->tail.load(std::memory_order_relaxed); - auto prevPos = prevTail; - size_t i = 0; - do { - prevPos = (prevPos + 1) & (prev->capacity - 1); - index[i++] = prev->index[prevPos]; - } while (prevPos != prevTail); - assert(i == prevCapacity); - } - for (size_t i = 0; i != entryCount; ++i) { - new (entries + i) BlockIndexEntry; - entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); - index[prevCapacity + i] = entries + i; - } - header->prev = prev; - header->entries = entries; - header->index = index; - header->capacity = nextBlockIndexCapacity; - header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); - - blockIndex.store(header, std::memory_order_release); - - nextBlockIndexCapacity <<= 1; - - return true; - } - - private: - size_t nextBlockIndexCapacity; - std::atomic blockIndex; + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast(static_cast::type>(index - tailBase) / static_cast::type>(BLOCK_SIZE)); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? 
nextBlockIndexCapacity : prevCapacity;
+        auto raw = static_cast<char*>((Traits::malloc)(
+            sizeof(BlockIndexHeader) +
+            std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * entryCount +
+            std::alignment_of<BlockIndexEntry*>::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity));
+        if (raw == nullptr) {
+            return false;
+        }
+
+        auto header = new (raw) BlockIndexHeader;
+        auto entries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader)));
+        auto index = reinterpret_cast<BlockIndexEntry**>(details::align_for<BlockIndexEntry*>(reinterpret_cast<char*>(entries) + sizeof(BlockIndexEntry) * entryCount));
+        if (prev != nullptr) {
+            auto prevTail = prev->tail.load(std::memory_order_relaxed);
+            auto prevPos = prevTail;
+            size_t i = 0;
+            do {
+                prevPos = (prevPos + 1) & (prev->capacity - 1);
+                index[i++] = prev->index[prevPos];
+            } while (prevPos != prevTail);
+            assert(i == prevCapacity);
+        }
+        for (size_t i = 0; i != entryCount; ++i) {
+            new (entries + i) BlockIndexEntry;
+            entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed);
+            index[prevCapacity + i] = entries + i;
+        }
+        header->prev = prev;
+        header->entries = entries;
+        header->index = index;
+        header->capacity = nextBlockIndexCapacity;
+        header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed);
+
+        blockIndex.store(header, std::memory_order_release);
+
+        nextBlockIndexCapacity <<= 1;
+
+        return true;
+    }
+
+    private:
+    size_t nextBlockIndexCapacity;
+    std::atomic<BlockIndexHeader*> blockIndex;
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
-    public:
-    details::ThreadExitListener threadExitListener;
-
-    private:
+    public:
+    details::ThreadExitListener threadExitListener;
+    private:
#endif
-
+
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-    public:
-    ImplicitProducer* nextImplicitProducer;
-
-    private:
+    public:
+    ImplicitProducer* nextImplicitProducer;
+    private:
#endif
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
-    mutable debug::DebugMutex mutex;
+    mutable debug::DebugMutex mutex;
#endif
#ifdef MCDBGQ_TRACKMEM
-    friend struct MemStats;
-#endif
-    };
-
-    //////////////////////////////////
-    // Block pool manipulation
-    //////////////////////////////////
-
-    void populate_initial_block_list(size_t blockCount)
-    {
-        initialBlockPoolSize = blockCount;
-        if (initialBlockPoolSize == 0) {
-            initialBlockPool = nullptr;
-            return;
-        }
-
-        initialBlockPool = create_array<Block>(blockCount);
-        if (initialBlockPool == nullptr) {
-            initialBlockPoolSize = 0;
-        }
-        for (size_t i = 0; i < initialBlockPoolSize; ++i) {
-            initialBlockPool[i].dynamicallyAllocated = false;
-        }
-    }
-
-    inline Block* try_get_block_from_initial_pool()
-    {
-        if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) {
-            return nullptr;
-        }
-
-        auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed);
-
-        return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; - } - - inline void add_block_to_free_list(Block* block) - { + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { #ifdef MCDBGQ_TRACKMEM - block->owner = nullptr; -#endif - if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { - destroy(block); - } else { - freeList.add(block); - } - } - - inline void add_blocks_to_free_list(Block* block) - { - while (block != nullptr) { - auto next = block->next; - add_block_to_free_list(block); - block = next; - } - } - - inline Block* try_get_block_from_free_list() { return freeList.try_get(); } - - // Gets a free block from one of the memory pools, or allocates a new one (if applicable) - template - Block* requisition_block() - { - auto block = try_get_block_from_initial_pool(); - if (block != nullptr) { - return block; - } - - block = try_get_block_from_free_list(); - if (block != nullptr) { - return block; - } - - MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) - { - return create(); - } - else - { - return nullptr; - } - } + block->owner = nullptr; +#endif + if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { + destroy(block); + } + else { + freeList.add(block); + } + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { + return create(); + } + else { + return nullptr; + } + } + #ifdef MCDBGQ_TRACKMEM -public: - struct MemStats { - size_t allocatedBlocks; - size_t usedBlocks; - size_t freeBlocks; - size_t ownedBlocksExplicit; - size_t ownedBlocksImplicit; - size_t implicitProducers; - size_t explicitProducers; - size_t elementsEnqueued; - size_t blockClassBytes; - size_t queueClassBytes; - size_t implicitBlockIndexBytes; - size_t explicitBlockIndexBytes; - - friend class ConcurrentQueue; - - private: - static MemStats getFor(ConcurrentQueue* q) - { - MemStats stats = {0}; - - stats.elementsEnqueued = q->size_approx(); - - auto block = q->freeList.head_unsafe(); - while (block != nullptr) { - ++stats.allocatedBlocks; - ++stats.freeBlocks; - block = block->freeListNext.load(std::memory_order_relaxed); - } - - for (auto ptr = 
q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; - ptr = ptr->next_prod()) { - bool implicit = dynamic_cast(ptr) != nullptr; - stats.implicitProducers += implicit ? 1 : 0; - stats.explicitProducers += implicit ? 0 : 1; - - if (implicit) { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ImplicitProducer); - auto head = prod->headIndex.load(std::memory_order_relaxed); - auto tail = prod->tailIndex.load(std::memory_order_relaxed); - auto hash = prod->blockIndex.load(std::memory_order_relaxed); - if (hash != nullptr) { - for (size_t i = 0; i != hash->capacity; ++i) { - if (hash->index[i]->key.load(std::memory_order_relaxed) != - ImplicitProducer::INVALID_BLOCK_BASE && - hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { - ++stats.allocatedBlocks; - ++stats.ownedBlocksImplicit; - } - } - stats.implicitBlockIndexBytes += - hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); - for (; hash != nullptr; hash = hash->prev) { - stats.implicitBlockIndexBytes += - sizeof(typename ImplicitProducer::BlockIndexHeader) + - hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); - } - } - for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { - // auto block = prod->get_block_index_entry_for_index(head); - ++stats.usedBlocks; - } - } else { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ExplicitProducer); - auto tailBlock = prod->tailBlock; - bool wasNonEmpty = false; - if (tailBlock != nullptr) { - auto block = tailBlock; - do { - ++stats.allocatedBlocks; - if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { - ++stats.usedBlocks; - wasNonEmpty = wasNonEmpty || block != tailBlock; - } - ++stats.ownedBlocksExplicit; - block = block->next; - } while (block != tailBlock); - } - auto index = prod->blockIndex.load(std::memory_order_relaxed); - while (index != nullptr) { - stats.explicitBlockIndexBytes += - sizeof(typename ExplicitProducer::BlockIndexHeader) + - index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); - index = static_cast(index->prev); - } - } - } - - auto freeOnInitialPool = - q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? - 0 : - q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); - stats.allocatedBlocks += freeOnInitialPool; - stats.freeBlocks += freeOnInitialPool; - - stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; - stats.queueClassBytes += sizeof(ConcurrentQueue); - - return stats; - } - }; - - // For debugging only. Not thread-safe. 
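getMemStats() is only compiled in when MCDBGQ_TRACKMEM is defined and, as the comment notes, must not race with concurrent queue operations. A hedged usage sketch (include path and build flag illustrative; call only while the queue is quiescent):

    // build with -DMCDBGQ_TRACKMEM=1 before including the header
    #include "concurrentqueue.h"
    #include <cstdio>

    void dump_stats(moodycamel::ConcurrentQueue<int>& q) {
        // not thread-safe: ensure no producer/consumer is active here
        auto stats = q.getMemStats();
        std::printf("allocated=%zu used=%zu free=%zu enqueued=%zu\n",
                    stats.allocatedBlocks, stats.usedBlocks,
                    stats.freeBlocks, stats.elementsEnqueued);
    }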
- MemStats getMemStats() { return MemStats::getFor(this); } - -private: - friend struct MemStats; -#endif - - ////////////////////////////////// - // Producer list manipulation - ////////////////////////////////// - - ProducerBase* recycle_or_create_producer(bool isExplicit) - { + public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = { 0 }; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { + //auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast(index->prev); + } + } + } + + auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 
0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() + { + return MemStats::getFor(this); + } + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - // Try to re-use one first - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { - bool expected = true; - if (ptr->inactive.compare_exchange_strong( - expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { - // We caught one! It's been marked as activated, the caller can have it - return ptr; - } - } - } - - return add_producer( - isExplicit ? static_cast(create(this)) : create(this)); - } - - ProducerBase* add_producer(ProducerBase* producer) - { - // Handle failed memory allocation - if (producer == nullptr) { - return nullptr; - } - - producerCount.fetch_add(1, std::memory_order_relaxed); - - // Add it to the lock-free list - auto prevTail = producerListTail.load(std::memory_order_relaxed); - do { - producer->next = prevTail; - } while (!producerListTail.compare_exchange_weak( - prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); - + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + return ptr; + } + } + } + + return add_producer(isExplicit ? 
static_cast(create(this)) : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - if (producer->isExplicit) { - auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextExplicitProducer = prevTailExplicit; - } while (!explicitProducers.compare_exchange_weak( - prevTailExplicit, - static_cast(producer), - std::memory_order_release, - std::memory_order_relaxed)); - } else { - auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextImplicitProducer = prevTailImplicit; - } while (!implicitProducers.compare_exchange_weak( - prevTailImplicit, - static_cast(producer), - std::memory_order_release, - std::memory_order_relaxed)); - } -#endif - - return producer; - } - - void reown_producers() - { - // After another instance is moved-into/swapped-with this one, all the - // producers we stole still think their parents are the other queue. - // So fix them up! - for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { - ptr->parent = this; - } - } - - ////////////////////////////////// - // Implicit producer hash - ////////////////////////////////// - - struct ImplicitProducerKVP { - std::atomic key; - ImplicitProducer* - value; // No need for atomicity since it's only read by the thread that sets it in the first place - - ImplicitProducerKVP(): value(nullptr) {} - - ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); - value = other.value; - } - - inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT - { - if (this != &other) { - details::swap_relaxed(key, other.key); - std::swap(value, other.value); - } - } - }; - - template - friend void moodycamel::swap( - typename ConcurrentQueue::ImplicitProducerKVP&, - typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; - - struct ImplicitProducerHash { - size_t capacity; - ImplicitProducerKVP* entries; - ImplicitProducerHash* prev; - }; - - inline void populate_initial_implicit_producer_hash() - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - { - return; - } - else - { - implicitProducerHashCount.store(0, std::memory_order_relaxed); - auto hash = &initialImplicitProducerHash; - hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; - hash->entries = &initialImplicitProducerHashEntries[0]; - for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { - initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - hash->prev = nullptr; - implicitProducerHash.store(hash, std::memory_order_relaxed); - } - } - - void swap_implicit_producer_hashes(ConcurrentQueue& other) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - { - return; - } - else - { - // Swap (assumes our implicit 
producer hash is initialized) - initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); - initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; - other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; - - details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); - - details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); - if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { - implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); - } else { - ImplicitProducerHash* hash; - for (hash = implicitProducerHash.load(std::memory_order_relaxed); - hash->prev != &other.initialImplicitProducerHash; - hash = hash->prev) { - continue; - } - hash->prev = &initialImplicitProducerHash; - } - if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { - other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); - } else { - ImplicitProducerHash* hash; - for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); - hash->prev != &initialImplicitProducerHash; - hash = hash->prev) { - continue; - } - hash->prev = &other.initialImplicitProducerHash; - } - } - } - - // Only fails (returns nullptr) if memory allocation fails - ImplicitProducer* get_or_add_implicit_producer() - { - // Note that since the data is essentially thread-local (key is thread ID), - // there's a reduced need for fences (memory ordering is already consistent - // for any individual thread), except for the current table itself. - - // Start by looking for the thread ID in the current and all previous hash tables. - // If it's not found, it must not be in there yet, since this same thread would - // have added it previously to one of the tables that we traversed. - - // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table - + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } + else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
+        for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) {
+            ptr->parent = this;
+        }
+    }
+
+
+    //////////////////////////////////
+    // Implicit producer hash
+    //////////////////////////////////
+
+    struct ImplicitProducerKVP
+    {
+        std::atomic<details::thread_id_t> key;
+        ImplicitProducer* value;  // No need for atomicity since it's only read by the thread that sets it in the first place
+
+        ImplicitProducerKVP() : value(nullptr) { }
+
+        ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+        {
+            key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed);
+            value = other.value;
+        }
+
+        inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+        {
+            swap(other);
+            return *this;
+        }
+
+        inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT
+        {
+            if (this != &other) {
+                details::swap_relaxed(key, other.key);
+                std::swap(value, other.value);
+            }
+        }
+    };
+
+    template<typename XT, typename XTraits>
+    friend void moodycamel::swap(typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&, typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT;
+
+    struct ImplicitProducerHash
+    {
+        size_t capacity;
+        ImplicitProducerKVP* entries;
+        ImplicitProducerHash* prev;
+    };
+
+    inline void populate_initial_implicit_producer_hash()
+    {
+        MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
+            return;
+        }
+        else {
+            implicitProducerHashCount.store(0, std::memory_order_relaxed);
+            auto hash = &initialImplicitProducerHash;
+            hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
+            hash->entries = &initialImplicitProducerHashEntries[0];
+            for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) {
+                initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
+            }
+            hash->prev = nullptr;
+            implicitProducerHash.store(hash, std::memory_order_relaxed);
+        }
+    }
+
+    void swap_implicit_producer_hashes(ConcurrentQueue& other)
+    {
+        MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
+            return;
+        }
+        else {
+            // Swap (assumes our implicit producer hash is initialized)
+            initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries);
+            initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0];
+            other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0];
+
+            details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount);
+
+            details::swap_relaxed(implicitProducerHash, other.implicitProducerHash);
+            if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) {
+                implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed);
+            }
+            else {
+                ImplicitProducerHash* hash;
+                for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) {
+                    continue;
+                }
+                hash->prev = &initialImplicitProducerHash;
+            }
+            if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) {
+                other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed);
+            }
+            else {
+                ImplicitProducerHash* hash;
+                for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) {
+                    continue;
+                }
+                hash->prev = &other.initialImplicitProducerHash;
+            }
+        }
+    }
+
+    // Only fails (returns nullptr) if memory allocation fails
+    ImplicitProducer* 
get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. + // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - - auto mainHash = implicitProducerHash.load(std::memory_order_acquire); - assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) - for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { - // Look for the id in this hash - auto index = hashedId; - while (true) { // Not an infinite loop because at least one slot is free in the hash table - index &= hash->capacity - 1u; - - auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); - if (probedKey == id) { - // Found it! If we had to search several hashes deep, though, we should lazily add it - // to the current main hash table to avoid the extended search next time. - // Note there's guaranteed to be room in the current hash table since every subsequent - // table implicitly reserves space for all previous tables (there's only one - // implicitProducerHashCount). - auto value = hash->entries[index].value; - if (hash != mainHash) { - index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1u; + + auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
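The lookup above probes linearly, and `index &= hash->capacity - 1u` works as a cheap modulo only because every hash table here has a power-of-two capacity. A tiny sketch of that invariant (illustrative names, not from the patch):

    #include <cassert>
    #include <cstddef>

    size_t probe_next(size_t index, size_t capacity) {
        assert((capacity & (capacity - 1)) == 0);  // capacity must be a power of two
        return (index + 1) & (capacity - 1);       // equivalent to (index + 1) % capacity
    }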
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong( - empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || - mainHash->entries[index].key.compare_exchange_strong( - reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || + mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { #else - if (mainHash->entries[index].key.compare_exchange_strong( - empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { -#endif - mainHash->entries[index].value = value; - break; - } - ++index; - } - } - - return value; - } - if (probedKey == details::invalid_thread_id) { - break; // Not in this hash table - } - ++index; - } - } - - // Insert! - auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); - while (true) { - // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) - if (newCount >= (mainHash->capacity >> 1) && - !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { - // We've acquired the resize lock, try to allocate a bigger hash table. - // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when - // we reload implicitProducerHash it must be the most recent version (it only gets changed within this - // locked block). 
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - if (newCount >= (mainHash->capacity >> 1)) { - size_t newCapacity = mainHash->capacity << 1; - while (newCount >= (newCapacity >> 1)) { - newCapacity <<= 1; - } - auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + - std::alignment_of::value - 1 + - sizeof(ImplicitProducerKVP) * newCapacity)); - if (raw == nullptr) { - // Allocation failed - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - return nullptr; - } - - auto newHash = new (raw) ImplicitProducerHash; - newHash->capacity = static_cast(newCapacity); - newHash->entries = reinterpret_cast( - details::align_for(raw + sizeof(ImplicitProducerHash))); - for (size_t i = 0; i != newCapacity; ++i) { - new (newHash->entries + i) ImplicitProducerKVP; - newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - newHash->prev = mainHash; - implicitProducerHash.store(newHash, std::memory_order_release); - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - mainHash = newHash; - } else { - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - } - } - - // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table - // to finish being allocated by another thread (and if we just finished allocating above, the condition will - // always be true) - if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { - auto producer = static_cast(recycle_or_create_producer(false)); - if (producer == nullptr) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - return nullptr; - } - + if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). 
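The resize guard used here is the classic try-lock built from std::atomic_flag: test_and_set(acquire) either claims the flag or reports that another thread already holds it, and clear(release) publishes the work done while it was held. A minimal sketch of the same pattern (illustrative names, not part of this patch):

    #include <atomic>

    std::atomic_flag resizeInProgress = ATOMIC_FLAG_INIT;

    bool try_grow() {
        if (resizeInProgress.test_and_set(std::memory_order_acquire))
            return false;  // another thread is already growing the table
        // ... allocate the larger table and publish it with a release store ...
        resizeInProgress.clear(std::memory_order_release);  // release the guard
        return true;
    }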
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + size_t newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } + else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + auto producer = static_cast(recycle_or_create_producer(false)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; - producer->threadExitListener.userData = producer; - details::ThreadExitNotifier::subscribe(&producer->threadExitListener); -#endif - - auto index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; + producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong( - reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - implicitProducerHashCount.fetch_sub( - 1, std::memory_order_relaxed); // already counted as a used slot - mainHash->entries[index].value = producer; - break; - } -#endif - if (mainHash->entries[index].key.compare_exchange_strong( - empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - mainHash->entries[index].value = producer; - break; - } - ++index; - } - return producer; - } - - // Hmm, the old hash is quite full and somebody else is busy allocating a new one. - // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, - // we try to allocate ourselves). 
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - } - } - + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); // already counted as a used slot + mainHash->entries[index].value = producer; + break; + } +#endif + if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - void implicit_producer_thread_exited(ImplicitProducer* producer) - { - // Remove from hash + void implicit_producer_thread_exited(ImplicitProducer* producer) + { + // Remove from hash #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - auto hash = implicitProducerHash.load(std::memory_order_acquire); - assert( - hash != - nullptr); // The thread exit listener is only registered if we were added to a hash in the first place - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - details::thread_id_t probedKey; - - // We need to traverse all the hashes just in case other threads aren't on the current one yet and are - // trying to add an entry thinking there's a free slot (because they reused a producer) - for (; hash != nullptr; hash = hash->prev) { - auto index = hashedId; - do { - index &= hash->capacity - 1u; - probedKey = id; - if (hash->entries[index].key.compare_exchange_strong( - probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) { - break; - } - ++index; - } while (probedKey != - details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it - // yet, or if we weren't added to this hash in the first place - } - - // Mark the queue as being recyclable - producer->inactive.store(true, std::memory_order_release); - } - - static void implicit_producer_thread_exited_callback(void* userData) - { - auto producer = static_cast(userData); - auto queue = producer->parent; - queue->implicit_producer_thread_exited(producer); - } -#endif - - ////////////////////////////////// - // Utility functions - ////////////////////////////////// - - template - static inline void* aligned_malloc(size_t size) - { - MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= std::alignment_of::value) - return (Traits::malloc)(size); - else - { - size_t alignment = std::alignment_of::value; - void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); - if (!raw) - return nullptr; - char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); - *(reinterpret_cast(ptr) - 1) = raw; - return ptr; - } - } - - template - static inline void aligned_free(void* ptr) - { - MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= std::alignment_of::value) - return (Traits::free)(ptr); - else(Traits::free)(ptr ? 
*(reinterpret_cast<void**>(ptr) - 1) : nullptr);
-    }
-
-    template <typename U>
-    static inline U* create_array(size_t count)
-    {
-        assert(count > 0);
-        U* p = static_cast<U*>(aligned_malloc<U>(sizeof(U) * count));
-        if (p == nullptr)
-            return nullptr;
-
-        for (size_t i = 0; i != count; ++i)
-            new (p + i) U();
-        return p;
-    }
-
-    template <typename U>
-    static inline void destroy_array(U* p, size_t count)
-    {
-        if (p != nullptr) {
-            assert(count > 0);
-            for (size_t i = count; i != 0;)
-                (p + --i)->~U();
-        }
-        aligned_free<U>(p);
-    }
-
-    template <typename U>
-    static inline U* create()
-    {
-        void* p = aligned_malloc<U>(sizeof(U));
-        return p != nullptr ? new (p) U : nullptr;
-    }
-
-    template <typename U, typename A1>
-    static inline U* create(A1&& a1)
-    {
-        void* p = aligned_malloc<U>(sizeof(U));
-        return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
-    }
-
-    template <typename U>
-    static inline void destroy(U* p)
-    {
-        if (p != nullptr)
-            p->~U();
-        aligned_free<U>(p);
-    }
+        debug::DebugLock lock(implicitProdMutex);
+#endif
+        auto hash = implicitProducerHash.load(std::memory_order_acquire);
+        assert(hash != nullptr);  // The thread exit listener is only registered if we were added to a hash in the first place
+        auto id = details::thread_id();
+        auto hashedId = details::hash_thread_id(id);
+        details::thread_id_t probedKey;
+
+        // We need to traverse all the hashes just in case other threads aren't on the current one yet and are
+        // trying to add an entry thinking there's a free slot (because they reused a producer)
+        for (; hash != nullptr; hash = hash->prev) {
+            auto index = hashedId;
+            do {
+                index &= hash->capacity - 1u;
+                probedKey = id;
+                if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+                    break;
+                }
+                ++index;
+            } while (probedKey != details::invalid_thread_id);  // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place
+        }
+
+        // Mark the queue as being recyclable
+        producer->inactive.store(true, std::memory_order_release);
+    }
+
+    static void implicit_producer_thread_exited_callback(void* userData)
+    {
+        auto producer = static_cast<ImplicitProducer*>(userData);
+        auto queue = producer->parent;
+        queue->implicit_producer_thread_exited(producer);
+    }
+#endif
+
+    //////////////////////////////////
+    // Utility functions
+    //////////////////////////////////
+
+    template<typename TAlign>
+    static inline void* aligned_malloc(size_t size)
+    {
+        MOODYCAMEL_CONSTEXPR_IF (std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value)
+            return (Traits::malloc)(size);
+        else {
+            size_t alignment = std::alignment_of<TAlign>::value;
+            void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*));
+            if (!raw)
+                return nullptr;
+            char* ptr = details::align_for<TAlign>(reinterpret_cast<char*>(raw) + sizeof(void*));
+            *(reinterpret_cast<void**>(ptr) - 1) = raw;
+            return ptr;
+        }
+    }
+
+    template<typename TAlign>
+    static inline void aligned_free(void* ptr)
+    {
+        MOODYCAMEL_CONSTEXPR_IF (std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value)
+            return (Traits::free)(ptr);
+        else
+            (Traits::free)(ptr ? 
*(reinterpret_cast<void**>(ptr) - 1) : nullptr);
+    }
+
+    template<typename U>
+    static inline U* create_array(size_t count)
+    {
+        assert(count > 0);
+        U* p = static_cast<U*>(aligned_malloc<U>(sizeof(U) * count));
+        if (p == nullptr)
+            return nullptr;
+
+        for (size_t i = 0; i != count; ++i)
+            new (p + i) U();
+        return p;
+    }
+
+    template<typename U>
+    static inline void destroy_array(U* p, size_t count)
+    {
+        if (p != nullptr) {
+            assert(count > 0);
+            for (size_t i = count; i != 0; )
+                (p + --i)->~U();
+        }
+        aligned_free<U>(p);
+    }
+
+    template<typename U>
+    static inline U* create()
+    {
+        void* p = aligned_malloc<U>(sizeof(U));
+        return p != nullptr ? new (p) U : nullptr;
+    }
+
+    template<typename U, typename A1>
+    static inline U* create(A1&& a1)
+    {
+        void* p = aligned_malloc<U>(sizeof(U));
+        return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
+    }
+
+    template<typename U>
+    static inline void destroy(U* p)
+    {
+        if (p != nullptr)
+            p->~U();
+        aligned_free<U>(p);
+    }

private:
-    std::atomic<ProducerBase*> producerListTail;
-    std::atomic<std::uint32_t> producerCount;
-
-    std::atomic<size_t> initialBlockPoolIndex;
-    Block* initialBlockPool;
-    size_t initialBlockPoolSize;
-
+    std::atomic<ProducerBase*> producerListTail;
+    std::atomic<std::uint32_t> producerCount;
+
+    std::atomic<size_t> initialBlockPoolIndex;
+    Block* initialBlockPool;
+    size_t initialBlockPoolSize;
+
#ifndef MCDBGQ_USEDEBUGFREELIST
-    FreeList<Block> freeList;
+    FreeList<Block> freeList;
#else
-    debug::DebugFreeList<Block> freeList;
-#endif
-
-    std::atomic<ImplicitProducerHash*> implicitProducerHash;
-    std::atomic<size_t> implicitProducerHashCount;  // Number of slots logically used
-    ImplicitProducerHash initialImplicitProducerHash;
-    std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
-    std::atomic_flag implicitProducerHashResizeInProgress;
-
-    std::atomic<std::uint32_t> nextExplicitConsumerId;
-    std::atomic<std::uint32_t> globalExplicitConsumerOffset;
-
+    debug::DebugFreeList<Block> freeList;
+#endif
+
+    std::atomic<ImplicitProducerHash*> implicitProducerHash;
+    std::atomic<size_t> implicitProducerHashCount;  // Number of slots logically used
+    ImplicitProducerHash initialImplicitProducerHash;
+    std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
+    std::atomic_flag implicitProducerHashResizeInProgress;
+
+    std::atomic<std::uint32_t> nextExplicitConsumerId;
+    std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
-    debug::DebugMutex implicitProdMutex;
+    debug::DebugMutex implicitProdMutex;
#endif
-
+
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-    std::atomic<ExplicitProducer*> explicitProducers;
-    std::atomic<ImplicitProducer*> implicitProducers;
+    std::atomic<ExplicitProducer*> explicitProducers;
+    std::atomic<ImplicitProducer*> implicitProducers;
#endif
};

-template <typename T, typename Traits>
-ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue): producer(queue.recycle_or_create_producer(true))
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
+    : producer(queue.recycle_or_create_producer(true))
{
-    if (producer != nullptr) {
-        producer->token = this;
-    }
+    if (producer != nullptr) {
+        producer->token = this;
+    }
}

-template <typename T, typename Traits>
+template<typename T, typename Traits>
ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
-    : producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true))
+    : producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true))
{
-    if (producer != nullptr) {
-        producer->token = this;
-    }
+    if (producer != nullptr) {
+        producer->token = this;
+    }
}

-template <typename T, typename Traits>
+template<typename T, typename Traits>
ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
-    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
{
-    initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
-    lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
+    initialOffset = queue.nextExplicitConsumerId.fetch_add(1, 
std::memory_order_release);
+    lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
}

-template <typename T, typename Traits>
+template<typename T, typename Traits>
ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
-    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
{
-    initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(
-        1, std::memory_order_release);
-    lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
+    initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+    lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
}

-template <typename T, typename Traits>
+template<typename T, typename Traits>
inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
{
-    a.swap(b);
+    a.swap(b);
}

inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
{
-    a.swap(b);
+    a.swap(b);
}

inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
{
-    a.swap(b);
+    a.swap(b);
}

-template <typename T, typename Traits>
-inline void swap(
-    typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
-    typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
{
-    a.swap(b);
+    a.swap(b);
}

-}  // namespace moodycamel
+}

#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
#pragma warning(pop)
diff --git a/scripts/lint.sh b/scripts/lint.sh
deleted file mode 100755
index 273d70043..000000000
--- a/scripts/lint.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-set -euxo pipefail
-
-black .
-pflake8 .
-mypy --install-types .
-
-# disable tracing -- too verbose
-set +x
-# find all .h, .hpp, and .cpp files excepting third party files
-files=$(find ./scaler -path './scaler/io/ymq/third_party' -prune -o \( -name '*.cpp' -o -name '*.h' -o -name '*.hpp' \) -print0 | xargs -0)
-set -x
-
-echo "running clang format on $(wc -w <<< $files) files"
-clang-format -i -style file -- $files
diff --git a/tests/cc_ymq/common.h b/tests/cc_ymq/common.h
index bafe5d907..b361f76df 100644
--- a/tests/cc_ymq/common.h
+++ b/tests/cc_ymq/common.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <filesystem>
+#include <string>
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 #include
@@ -421,6 +423,7 @@ inline TestResult test(
     return TestResult::Success;
 }
 
+// path is relative to the directory of this source file
 inline TestResult run_python(const char* path, std::vector<std::string> argv = {})
 {
     // insert the pid at the start of the argv, this is important for signalling readiness
@@ -449,7 +452,7 @@ inline TestResult run_python(const char* path, std::vector<std::string> argv
     if (!file)
         throw std::system_error(errno, std::generic_category(), "failed to open python file");
 
-    PyRun_SimpleFile(file, path);
+    PyRun_SimpleFile(file, full_path.c_str());
     fclose(file);
 }
diff --git a/tests/cc_ymq/py_mitm/core.py b/tests/cc_ymq/py_mitm/core.py
deleted file mode 100644
index 4a22ee01a..000000000
--- a/tests/cc_ymq/py_mitm/core.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""
-This is the common code for implementing man in the middle in Python
-"""
-
-import dataclasses
-from typing import Protocol
-from scapy.all import TunTapInterface, IP, TCP  # type: ignore
-
-
-@dataclasses.dataclass
-class TCPConnection:
-    """
-    Represents a TCP connection over the TUNTAP interface
-    local_ip and local_port are the mitm's ip and port, and
-    remote_ip and remote_port are the port for the remote peer
-    """
-
-    local_ip: str
-    local_port: int
-    remote_ip: str
-    remote_port: int
-
-    def rewrite(self, pkt: IP, ack: int | None = 
-        """
-        Rewrite a TCP/IP packet as a packet originating
-        from (local_ip, local_port) and going to (remote_ip, remote_port)
-        This function is useful for taking a packet received from one connection, and redirecting it to another
-
-        Args:
-            pkt: A scapy TCP/IP packet to rewrite
-            ack: An optional ack number to use instead of the one found in `pkt`
-            data: An optional payload to use instead of the one found int `pkt`
-
-        Returns:
-            The rewritten packet, suitable for sending over TUNTAP
-        """
-        tcp = pkt[TCP]
-
-        return (
-            IP(src=self.local_ip, dst=self.remote_ip)
-            / TCP(sport=self.local_port, dport=self.remote_port, flags=tcp.flags, seq=tcp.seq, ack=ack or tcp.ack)
-            / bytes(data or tcp.payload)
-        )
-
-
-class MITMProtocol(Protocol):
-    def proxy(
-        self,
-        tuntap: TunTapInterface,
-        pkt: IP,
-        sender: TCPConnection,
-        client_conn: TCPConnection | None,
-        server_conn: TCPConnection,
-    ) -> None: ...
diff --git a/tests/cc_ymq/py_mitm/drop.py b/tests/cc_ymq/py_mitm/drop.py
deleted file mode 100644
index 520e5bf70..000000000
--- a/tests/cc_ymq/py_mitm/drop.py
+++ /dev/null
@@ -1,28 +0,0 @@
-"""
-This MITM drops a % of packets
-"""
-
-import random
-from core import MITMProtocol, TunTapInterface, IP, TCPConnection
-
-
-class MITM(MITMProtocol):
-    def __init__(self, drop_pcent: str):
-        self.drop_pcent = float(drop_pcent)
-
-    def proxy(
-        self,
-        tuntap: TunTapInterface,
-        pkt: IP,
-        sender: TCPConnection,
-        client_conn: TCPConnection | None,
-        server_conn: TCPConnection,
-    ) -> None:
-        if random.random() < self.drop_pcent:
-            print("[!] Dropping packet")
-            return
-
-        if sender == client_conn:
-            tuntap.send(server_conn.rewrite(pkt))
-        elif sender == server_conn:
-            tuntap.send(client_conn.rewrite(pkt))
diff --git a/tests/cc_ymq/py_mitm/main.py b/tests/cc_ymq/py_mitm/main.py
index edaeba569..6a97faafd 100644
--- a/tests/cc_ymq/py_mitm/main.py
+++ b/tests/cc_ymq/py_mitm/main.py
@@ -100,6 +100,9 @@ def main(pid: int, mitm_ip: str, mitm_port: int, remote_ip: str, server_port: in
         # and the source ip and port will be the remote ip and port
         sender = TCPConnection(pkt.dst, pkt.dport, pkt.src, pkt.sport)
 
+        if not mitm.proxy(tuntap, pkt, sender, client_conn, server_conn):
+            continue  # the segment was not proxied, so we can't update our internal state
+
         if sender == client_conn:
             print(f"-> [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}")
         elif sender == server_conn:
@@ -126,8 +129,6 @@ def main(pid: int, mitm_ip: str, mitm_port: int, remote_ip: str, server_port: in
         if sender == server_conn and client_sent_fin_ack:
             client_closed = True
 
-        mitm.proxy(tuntap, pkt, sender, client_conn, server_conn)
-
         if client_closed and server_closed:
             print("[*] Both connections closed")
             return
diff --git a/tests/cc_ymq/py_mitm/passthrough.py b/tests/cc_ymq/py_mitm/passthrough.py
index 20d8a9069..1dc825928 100644
--- a/tests/cc_ymq/py_mitm/passthrough.py
+++ b/tests/cc_ymq/py_mitm/passthrough.py
@@ -16,8 +16,9 @@ def proxy(
         sender: TCPConnection,
         client_conn: TCPConnection | None,
         server_conn: TCPConnection,
-    ) -> None:
-        if sender == client_conn:
+    ) -> bool:
+        if sender == client_conn or client_conn is None:
             tuntap.send(server_conn.rewrite(pkt))
         elif sender == server_conn:
             tuntap.send(client_conn.rewrite(pkt))
+        return True
diff --git a/tests/cc_ymq/py_mitm/randomly_drop_packets.py b/tests/cc_ymq/py_mitm/randomly_drop_packets.py
index a197ac3c8..d84e57ee6 100644
--- a/tests/cc_ymq/py_mitm/randomly_drop_packets.py
+++ b/tests/cc_ymq/py_mitm/randomly_drop_packets.py
@@ -17,12 +17,13 @@ def proxy(
         sender: TCPConnection,
         client_conn: TCPConnection | None,
         server_conn: TCPConnection,
-    ) -> None:
+    ) -> bool:
         if random.random() < self.drop_pcent:
             print("[!] Dropping packet")
-            return
+            return False
 
         if sender == client_conn:
             tuntap.send(server_conn.rewrite(pkt))
         elif sender == server_conn:
             tuntap.send(client_conn.rewrite(pkt))
+        return True
diff --git a/tests/cc_ymq/py_mitm/rst.py b/tests/cc_ymq/py_mitm/rst.py
deleted file mode 100644
index be301d5b0..000000000
--- a/tests/cc_ymq/py_mitm/rst.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""
-This MITM inserts an unexpected TCP RST
-"""
-
-from core import IP, TCP, MITMProtocol, TCPConnection, TunTapInterface
-
-
-class MITM(MITMProtocol):
-    def __init__(self):
-        # count the number of psh-acks sent by the client
-        self.client_pshack_counter = 0
-
-    def proxy(
-        self,
-        tuntap: TunTapInterface,
-        pkt: IP,
-        sender: TCPConnection,
-        client_conn: TCPConnection | None,
-        server_conn: TCPConnection,
-    ) -> None:
-        if sender == client_conn or client_conn is None:
-            if pkt[TCP].flags == "PA":
-                self.client_pshack_counter += 1
-
-                # on the second psh-ack, send a rst instead
-                if self.client_pshack_counter == 2:
-                    rst_pkt = IP(src=client_conn.local_ip, dst=client_conn.remote_ip) / TCP(
-                        sport=client_conn.local_port, dport=client_conn.remote_port, flags="R", seq=pkt[TCP].ack
-                    )
-                    print(f"<- [{rst_pkt[TCP].flags}] (simulated)")
-                    tuntap.send(rst_pkt)
-                    return
-
-            tuntap.send(server_conn.rewrite(pkt))
-        elif sender == server_conn:
-            tuntap.send(client_conn.rewrite(pkt))
-
-
-# client -> mitm -> server
-# server -> mitm -> client
-
-# client: 127.0.0.1:8080
-# mitm: 127.0.0.1:8081
-# server: 127.0.0.1:8081
-
-
-# client -> mitm == src = client.ip, sport = client.port ;; dst = mitm.ip, dport = mitm.port
-# mitm -> server == src = mitm.ip, sport = mitm.port ;; dst = server.ip, dport = server.port
diff --git a/tests/cc_ymq/py_mitm/runner.py b/tests/cc_ymq/py_mitm/runner.py
deleted file mode 100644
index f56ccb9f9..000000000
--- a/tests/cc_ymq/py_mitm/runner.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# flake8: noqa: E402
-
-"""
-This script provides a framework for running MITM test cases
-
-This script accepts 5 arguments in the following order:
-    1. pid: the pid of the test process, used for signaling
-    2. testcase: the MITM test case. \
-        this loads `from .testcase import MITM` where `MITM` is a class implementing `MITMProtocol`
-    3. mitm_ip: an ipv4 address for the mitm server
-    4. mitm_port: the port used to connect to the remote server
-    5. server_ip: the desired ip of the remote side of the TUNTAP interface
-    6. server_port: the port of the remote server
-    7. *args: Additional args, if any are passed to the constructor: `MITM(*args)`
-
-See the documentation on `main` for more
-"""
-import os
-import sys
-
-# add the script's directory to path
-sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-
-import importlib
-import signal
-import subprocess
-
-from core import MITMProtocol, TCPConnection
-from scapy.all import IP, TCP, TunTapInterface  # type: ignore
-
-
-def echo_call(cmd: list[str]):
-    print(f"+ {' '.join(cmd)}")
-    subprocess.check_call(cmd)
-
-
-def create_tuntap_interface(iface_name: str, mitm_ip: str, remote_ip: str) -> TunTapInterface:
-    """
-    Creates a TUNTAP interface and sets brings it up and adds ips using the `ip` program
-
-    Args:
-        iface_name: The name of the TUNTAP interface, usually like `tun0`, `tun1`, etc.
-        mitm_ip: The desired ip address of the mitm. This is the ip that clients can use to connect to the mitm
-        remote_ip: The ip that routes to/from the tuntap interface.
-            packets sent to `mitm_ip` will appear to come from `remote_ip`,\
-            and conversely the tuntap interface can connect/send packets
-            to `remote_ip`, making it a suitable ip for binding a server
-
-    Returns:
-        The TUNTAP interface
-    """
-    iface = TunTapInterface(iface_name, mode="tun")
-
-    try:
-        echo_call(["sudo", "ip", "link", "set", iface_name, "up"])
-        echo_call(["sudo", "ip", "addr", "add", remote_ip, "peer", mitm_ip, "dev", iface_name])
-        print(f"[+] Interface {iface_name} up with IP {mitm_ip}")
-    except subprocess.CalledProcessError:
-        print("[!] Could not bring up interface. Run as root or set manually.")
-        raise
-
-    return iface
-
-
-def main(pid: int, mitm_ip: str, mitm_port: int, remote_ip: str, server_port: int, mitm: MITMProtocol):
-    """
-    This function serves as a framework for man in the middle implementations
-    A client connects to the MITM, then the MITM connects to a remote server
-    The MITM sits inbetween the client and the server, manipulating the packets sent depending on the test case
-    This function:
-        1. creates a TUNTAP interface and prepares it for MITM
-        2. handles connecting clients and handling connection closes
-        3. delegates additional logic to a pluggable callable, `mitm`
-        4. returns when both connections have terminated (via )
-
-    Args:
-        pid: this is the pid of the test process, used for signaling readiness \
-            we send SIGUSR1 to this process when the mitm is ready
-        mitm_ip: The desired ip address of the mitm server
-        mitm_port: The desired port of the mitm server. \
-            This is the port used to connect to the server, but the client is free to connect on any port
-        remote_ip: The desired remote ip for the TUNTAP interface. This is the only ip address \
-            reachable by the interface and is thus the src ip for clients, and the ip that the remote server \
-            must be bound to
-        server_port: The port that the remote server is bound to
-        mitm: The core logic for a MITM test case. This callable may maintain its own state and is responsible \
-            for sending packets over the TUNTAP interface (if it doesn't, nothing will happen)
-    """
-
-    tuntap = create_tuntap_interface("tun0", mitm_ip, remote_ip)
-
-    # signal the caller that the tuntap interface has been created
-    if pid > 0:
-        os.kill(pid, signal.SIGUSR1)
-
-    # these track information about our connections
-    # we already know what to expect for the server connection, we are the connector
-    client_conn = None
-    server_conn = TCPConnection(mitm_ip, mitm_port, remote_ip, server_port)
-
-    # tracks the state of each connection
-    client_sent_fin_ack = False
-    client_closed = False
-    server_sent_fin_ack = False
-    server_closed = False
-
-    while True:
-        pkt = tuntap.recv()
-        if not pkt.haslayer(IP) or not pkt.haslayer(TCP):
-            continue
-        ip = pkt[IP]
-        tcp = pkt[TCP]
-
-        # for a received packet, the destination ip and port are our local ip and port
-        # and the source ip and port will be the remote ip and port
-        sender = TCPConnection(pkt.dst, pkt.dport, pkt.src, pkt.sport)
-
-        if sender == client_conn:
-            print(f"-> [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}")
-        elif sender == server_conn:
-            print(f"<- [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}")
-
-        if tcp.flags == "S":  # SYN from client
-            print("-> [S]")
-            print(f"[*] New connection from {ip.src}:{tcp.sport} to {ip.dst}:{tcp.dport}")
-            client_conn = sender
-
-        if tcp.flags == "SA":  # SYN-ACK from server
-            if sender == server_conn:
-                print(f"[*] Connection to server established: {ip.src}:{tcp.sport} to {ip.dst}:{tcp.dport}")
-
-        if tcp.flags == "FA":  # FIN-ACK
-            if sender == client_conn:
-                client_sent_fin_ack = True
-            if sender == server_conn:
-                server_sent_fin_ack = True
-
-        if tcp.flags == "A":  # ACK
-            if sender == client_conn and server_sent_fin_ack:
-                server_closed = True
-            if sender == server_conn and client_sent_fin_ack:
-                client_closed = True
-
-        mitm.proxy(tuntap, pkt, sender, client_conn, server_conn)
-
-        if client_closed and server_closed:
-            print("[*] Both connections closed")
-            return
-
-
-if __name__ == "__main__":
-    # parse the ips, ports, and test case from the command line
-    pid, testcase, mitm_ip, mitm_port, remote_ip, server_port, *args = sys.argv[1:]
-
-    # load the module dynamically
-    module = importlib.import_module(testcase)
-    main(int(pid), mitm_ip, int(mitm_port), remote_ip, int(server_port), module.MITM(*args))
diff --git a/tests/cc_ymq/py_mitm/send_rst_to_client.py b/tests/cc_ymq/py_mitm/send_rst_to_client.py
index fc70355e5..2d3922105 100644
--- a/tests/cc_ymq/py_mitm/send_rst_to_client.py
+++ b/tests/cc_ymq/py_mitm/send_rst_to_client.py
@@ -17,7 +17,7 @@ def proxy(
         sender: TCPConnection,
         client_conn: TCPConnection | None,
         server_conn: TCPConnection,
-    ) -> None:
+    ) -> bool:
         if sender == client_conn or client_conn is None:
             if pkt[TCP].flags == "PA":
                 self.client_pshack_counter += 1
@@ -29,11 +29,12 @@ def proxy(
                     )
                     print(f"<- [{rst_pkt[TCP].flags}] (simulated)")
                     tuntap.send(rst_pkt)
-                    return
+                    return True
 
             tuntap.send(server_conn.rewrite(pkt))
         elif sender == server_conn:
             tuntap.send(client_conn.rewrite(pkt))
+        return True
 
 
 # client -> mitm -> server
diff --git a/tests/cc_ymq/py_mitm/types.py b/tests/cc_ymq/py_mitm/types.py
index 4a22ee01a..ae20fd053 100644
--- a/tests/cc_ymq/py_mitm/types.py
+++ b/tests/cc_ymq/py_mitm/types.py
@@ -51,4 +51,4 @@ def proxy(
         sender: TCPConnection,
         client_conn: TCPConnection | None,
         server_conn: TCPConnection,
-    ) -> None: ...
+    ) -> bool: ...
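
Reviewer note on the revised MITMProtocol contract: proxy() now returns a bool, True when the
segment was forwarded or deliberately replaced (as in send_rst_to_client.py) and False when it
was swallowed (as in randomly_drop_packets.py), and main.py only advances its SYN/FIN/ACK
connection tracking when proxy() reports success. A minimal sketch of a new test case under this
contract follows; the module name forward_everything.py is hypothetical, and the sibling-module
import is assumed to follow the style of the existing py_mitm test cases:

    # forward_everything.py -- hypothetical example, not part of this patch
    from types import IP, MITMProtocol, TCPConnection, TunTapInterface  # sibling py_mitm module

    class MITM(MITMProtocol):
        """Forwards every segment unchanged; a template for new py_mitm test cases."""

        def proxy(
            self,
            tuntap: TunTapInterface,
            pkt: IP,
            sender: TCPConnection,
            client_conn: TCPConnection | None,
            server_conn: TCPConnection,
        ) -> bool:
            # rewrite toward the server when the client (or a not-yet-seen peer)
            # sent the segment, and toward the client otherwise
            if sender == client_conn or client_conn is None:
                tuntap.send(server_conn.rewrite(pkt))
            elif sender == server_conn:
                tuntap.send(client_conn.rewrite(pkt))
            # True tells main.py it may update its FIN/ACK state for this segment;
            # a dropping MITM would return False instead
            return True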