From ce3ecec7719df58333b5e61a5aef692392ad90c2 Mon Sep 17 00:00:00 2001
From: sharpener6 <1sc2l4qi@duck.com>
Date: Wed, 24 Sep 2025 19:30:01 -0400
Subject: [PATCH 1/3] Rework all GitHub workflows (#230)

- Move workflows to actions so they can be reused
- Remove the dependency on Boost
- Fix a formatting issue
- Fix formatting and import sort order

---
 .github/actions/compile-library/action.yml    |   2 +-
 examples/task_capabilities.py                 |  10 +-
 scaler/io/ymq/bytes.h                         |  11 +-
 .../io/ymq/examples/automated_echo_client.cpp |   2 +-
 scaler/io/ymq/message_connection_tcp.cpp      |   1 -
 scaler/io/ymq/pymod_ymq/async.h               |   3 +-
 scaler/io/ymq/pymod_ymq/exception.h           |   2 +-
 scaler/io/ymq/pymod_ymq/io_context.h          |   4 +-
 scaler/io/ymq/pymod_ymq/io_socket.h           |  10 +-
 scaler/io/ymq/pymod_ymq/ymq.h                 |   2 +-
 scaler/io/ymq/simple_interface.cpp            |  28 +-
 scaler/io/ymq/simple_interface.h              |   8 +-
 scaler/io/ymq/ymq.pyi                         |  14 +-
 scaler/io/ymq/ymq_test.py                     |  20 -
 .../object_storage/object_storage_server.cpp  |  14 +-
 scaler/object_storage/object_storage_server.h |   3 +-
 scripts/build.sh                              |   2 +-
 tests/CMakeLists.txt                          |   5 +-
 tests/cc_ymq/CMakeLists.txt                   |   1 +
 tests/cc_ymq/common.h                         | 496 +++++++++++++++++
 tests/cc_ymq/py_mitm/__init__.py              |   0
 tests/cc_ymq/py_mitm/main.py                  | 152 ++++++
 tests/cc_ymq/py_mitm/passthrough.py           |  23 +
 tests/cc_ymq/py_mitm/randomly_drop_packets.py |  28 +
 tests/cc_ymq/py_mitm/send_rst_to_client.py    |  48 ++
 tests/cc_ymq/py_mitm/types.py                 |  54 ++
 tests/cc_ymq/test_cc_ymq.cpp                  | 508 ++++++++++++++++++
 .../test_object_storage_server.cpp            |  28 +-
 tests/pymod_ymq/__init__.py                   |   0
 tests/pymod_ymq/test_pymod_ymq.py             | 150 ++++++
 tests/pymod_ymq/test_types.py                 |  90 ++++
 31 files changed, 1626 insertions(+), 93 deletions(-)
 delete mode 100644 scaler/io/ymq/ymq_test.py
 create mode 100644 tests/cc_ymq/CMakeLists.txt
 create mode 100644 tests/cc_ymq/common.h
 create mode 100644 tests/cc_ymq/py_mitm/__init__.py
 create mode 100644 tests/cc_ymq/py_mitm/main.py
 create mode 100644 tests/cc_ymq/py_mitm/passthrough.py
 create mode 100644 tests/cc_ymq/py_mitm/randomly_drop_packets.py
 create mode 100644 tests/cc_ymq/py_mitm/send_rst_to_client.py
 create mode 100644 tests/cc_ymq/py_mitm/types.py
 create mode 100644 tests/cc_ymq/test_cc_ymq.cpp
 create mode 100644 tests/pymod_ymq/__init__.py
 create mode 100644 tests/pymod_ymq/test_pymod_ymq.py
 create mode 100644 tests/pymod_ymq/test_types.py

diff --git a/.github/actions/compile-library/action.yml b/.github/actions/compile-library/action.yml
index 1036b1524..6b630bfb6 100644
--- a/.github/actions/compile-library/action.yml
+++ b/.github/actions/compile-library/action.yml
@@ -13,7 +13,7 @@ runs:
     if: inputs.os == 'Linux'
     shell: bash
     run: |
-      CXX=$(which g++-14) ./scripts/build.sh
+      CXX=$(which g++-14) GTEST_FILTER="-*Mitm*:*TestIncompleteIdentity*" ./scripts/build.sh

   - name: Build and test C++ Components (Windows)
     if: inputs.os == 'Windows'

diff --git a/examples/task_capabilities.py b/examples/task_capabilities.py
index 5a38f313a..e75468863 100644
--- a/examples/task_capabilities.py
+++ b/examples/task_capabilities.py
@@ -56,18 +56,12 @@ def main():

     # Submit a task that requires GPU capabilities, this will be redirected to the GPU worker.
     gpu_future = client.submit_verbose(
-        gpu_task,
-        args=(16.0,),
-        kwargs={},
-        capabilities={"gpu": 1}  # Requires a GPU capability
+        gpu_task, args=(16.0,), kwargs={}, capabilities={"gpu": 1}  # Requires a GPU capability
     )

     # Submit a task that does not require GPU capabilities, this will be routed to any available worker.
     cpu_future = client.submit_verbose(
-        cpu_task,
-        args=(16.0,),
-        kwargs={},
-        capabilities={}  # No GPU capability required
+        cpu_task, args=(16.0,), kwargs={}, capabilities={}  # No GPU capability required
     )

     # Waits for the tasks for finish

diff --git a/scaler/io/ymq/bytes.h b/scaler/io/ymq/bytes.h
index 9211bb24e..c3dacb46f 100644
--- a/scaler/io/ymq/bytes.h
+++ b/scaler/io/ymq/bytes.h
@@ -10,6 +10,7 @@
 #include

 // C++
+#include <optional>
 #include <string>

 // First-party
@@ -33,10 +34,9 @@ class Bytes {
 public:
     Bytes(char* data, size_t len): _data(datadup((uint8_t*)data, len)), _len(len) {}

-    Bytes(): _data {}, _len {} {}
+    Bytes(const std::string& s): _data(datadup((uint8_t*)s.data(), s.length())), _len(s.length()) {}

-    // For debug and convenience only
-    explicit Bytes(const std::string& str): Bytes((char*)str.c_str(), str.size()) {}
+    Bytes(): _data {}, _len {} {}

     Bytes(const Bytes& other) noexcept
     {
@@ -92,11 +92,10 @@ class Bytes {

     [[nodiscard]] constexpr bool is_null() const noexcept { return !this->_data; }

-    // debugging utility
-    std::string as_string() const
+    std::optional<std::string> as_string() const
     {
         if (is_null())
-            return "[EMPTY]";
+            return std::nullopt;

         return std::string((char*)_data, _len);
     }
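Bytes::as_string() above now returns std::optional<std::string> instead of the old "[EMPTY]" sentinel, so callers can tell a null Bytes apart from a real (possibly empty) payload. A minimal sketch of the call-site pattern, assuming Bytes sits in the scaler::ymq namespace like the rest of the library:

    #include <cstdio>
    #include "scaler/io/ymq/bytes.h"

    void print_payload(const scaler::ymq::Bytes& payload)
    {
        // std::nullopt corresponds to is_null(); a zero-length Bytes("") still
        // yields an (empty) string, since only a null data pointer is "no value"
        if (auto s = payload.as_string())
            std::printf("payload: '%s'\n", s->c_str());
        else
            std::printf("payload is null\n");
    }

This is why the echo-client fix that follows dereferences the optional with ->c_str() instead of calling .c_str() directly.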
diff --git a/scaler/io/ymq/examples/automated_echo_client.cpp b/scaler/io/ymq/examples/automated_echo_client.cpp
index 5a4ca3510..328b3e219 100644
--- a/scaler/io/ymq/examples/automated_echo_client.cpp
+++ b/scaler/io/ymq/examples/automated_echo_client.cpp
@@ -70,7 +70,7 @@ int main()
         auto future = x.get_future();
         Message msg = future.get().first;
         if (msg.payload.as_string() != longStr) {
-            printf("Checksum failed, %s\n", msg.payload.as_string().c_str());
+            printf("Checksum failed, %s\n", msg.payload.as_string()->c_str());
             exit(1);
         }
     }

diff --git a/scaler/io/ymq/message_connection_tcp.cpp b/scaler/io/ymq/message_connection_tcp.cpp
index 569fe214b..8212238f1 100644
--- a/scaler/io/ymq/message_connection_tcp.cpp
+++ b/scaler/io/ymq/message_connection_tcp.cpp
@@ -269,7 +269,6 @@ void MessageConnectionTCP::updateReadOperation()
             _pendingRecvMessageCallbacks->pop();

             recvMessageCallback({Message(std::move(address), std::move(payload)), {}});
-
         } else {
             assert(_pendingRecvMessageCallbacks->size());
             break;

diff --git a/scaler/io/ymq/pymod_ymq/async.h b/scaler/io/ymq/pymod_ymq/async.h
index 8a2e229a5..ba80d6bb3 100644
--- a/scaler/io/ymq/pymod_ymq/async.h
+++ b/scaler/io/ymq/pymod_ymq/async.h
@@ -12,7 +12,7 @@
 #include "scaler/io/ymq/pymod_ymq/ymq.h"

 // wraps an async callback that accepts a Python asyncio future
-static PyObject* async_wrapper(PyObject* self, const std::function& callback)
+static PyObject* async_wrapper(PyObject* self, const std::function&& callback)
 {
     auto state = YMQStateFromSelf(self);
     if (!state)
@@ -25,7 +25,6 @@ static PyObject* async_wrapper(PyObject* self, const std::function

diff --git a/scaler/io/ymq/pymod_ymq/exception.h b/scaler/io/ymq/pymod_ymq/exception.h
--- a/scaler/io/ymq/pymod_ymq/exception.h
+++ b/scaler/io/ymq/pymod_ymq/exception.h
@@ -5,8 +5,8 @@
 #include

 // First-party
-#include "scaler/io/ymq/pymod_ymq/ymq.h"
 #include "scaler/io/ymq/pymod_ymq/utils.h"
+#include "scaler/io/ymq/pymod_ymq/ymq.h"

 // the order of the members in the exception args tuple
 const Py_ssize_t YMQException_errorCodeIndex = 0;

diff --git a/scaler/io/ymq/pymod_ymq/io_context.h b/scaler/io/ymq/pymod_ymq/io_context.h
index 2bc9be556..e922d5388 100644
--- a/scaler/io/ymq/pymod_ymq/io_context.h
+++ b/scaler/io/ymq/pymod_ymq/io_context.h
@@ -77,8 +77,8 @@ static PyObject* PyIOContext_createIOSocket_(
     using Identity = Configuration::IOSocketIdentity;

     // note: references borrowed from args, so no need to manage their lifetime
-    PyObject* pyIdentity {};
-    PyObject* pySocketType {};
+    PyObject* pyIdentity = nullptr;
+    PyObject* pySocketType = nullptr;
     if (nargs == 1) {
         pyIdentity = args[0];
     } else if (nargs == 2) {

diff --git a/scaler/io/ymq/pymod_ymq/io_socket.h b/scaler/io/ymq/pymod_ymq/io_socket.h
index 152c785a8..898afb832 100644
--- a/scaler/io/ymq/pymod_ymq/io_socket.h
+++ b/scaler/io/ymq/pymod_ymq/io_socket.h
@@ -156,10 +156,6 @@ static PyObject* PyIOSocket_recv(PyIOSocket* self, PyObject* args)
             if (!pyMessage)
                 return YMQ_GetRaisedException();

-            // TODO: why is leaking necessary?
-            address.forget();
-            payload.forget();
-
             return (PyObject*)pyMessage.take();
         });
     } catch (...) {
@@ -221,10 +217,6 @@ static PyObject* PyIOSocket_recv_sync(PyIOSocket* self, PyObject* args)
     if (!pyMessage)
         return nullptr;

-    // TODO: why is leaking necessary?
-    address.forget();
-    payload.forget();
-
     return (PyObject*)pyMessage.take();
 }
@@ -378,7 +370,7 @@ static PyObject* PyIOSocket_socket_type_getter(PyIOSocket* self, void* closure)
     if (!state)
         return nullptr;

-    const IOSocketType socketType  = self->socket->socketType();
+    const IOSocketType socketType = self->socket->socketType();

     OwnedPyObject socketTypeIntObj = PyLong_FromLong((long)socketType);
     if (!socketTypeIntObj)

diff --git a/scaler/io/ymq/pymod_ymq/ymq.h b/scaler/io/ymq/pymod_ymq/ymq.h
index 879b019d8..b788d05dd 100644
--- a/scaler/io/ymq/pymod_ymq/ymq.h
+++ b/scaler/io/ymq/pymod_ymq/ymq.h
@@ -304,7 +304,7 @@ static int YMQ_createErrorCodeEnum(PyObject* pyModule, YMQState* state)
     // docs and examples are unfortunately scarce for this
     // for now this will work just fine
     OwnedPyObject item {};
-    while (item = PyIter_Next(*iter)) {
+    while ((item = PyIter_Next(*iter))) {
         OwnedPyObject fn = PyCMethod_New(&YMQErrorCode_explanation_def, *item, pyModule, nullptr);
         if (!fn)
             return -1;

diff --git a/scaler/io/ymq/simple_interface.cpp b/scaler/io/ymq/simple_interface.cpp
index 6acca250b..0a88f8ac0 100644
--- a/scaler/io/ymq/simple_interface.cpp
+++ b/scaler/io/ymq/simple_interface.cpp
@@ -1,6 +1,8 @@
 #include "scaler/io/ymq/simple_interface.h"

+#include <optional>
+
 namespace scaler {
 namespace ymq {

@@ -35,35 +37,41 @@ void syncConnectSocket(std::shared_ptr<IOSocket> socket, std::string address)
     connect_future.wait();
 }

-std::pair<Message, Error> syncRecvMessage(std::shared_ptr<IOSocket> socket)
+std::expected<Message, Error> syncRecvMessage(std::shared_ptr<IOSocket> socket)
 {
     auto fut = futureRecvMessage(std::move(socket));
     return fut.get();
 }

-std::expected<void, Error> syncSendMessage(std::shared_ptr<IOSocket> socket, Message message)
+std::optional<Error> syncSendMessage(std::shared_ptr<IOSocket> socket, Message message)
 {
     auto fut = futureSendMessage(std::move(socket), std::move(message));
     return fut.get();
 }

-std::future<std::pair<Message, Error>> futureRecvMessage(std::shared_ptr<IOSocket> socket)
+std::future<std::expected<Message, Error>> futureRecvMessage(std::shared_ptr<IOSocket> socket)
 {
-    auto recv_promise_ptr = std::make_unique<std::promise<std::pair<Message, Error>>>();
+    auto recv_promise_ptr = std::make_unique<std::promise<std::expected<Message, Error>>>();
     auto recv_future = recv_promise_ptr->get_future();
-    socket->recvMessage([recv_promise = std::move(recv_promise_ptr)](std::pair<Message, Error> msg) {
-        recv_promise->set_value(std::move(msg));
+    socket->recvMessage([recv_promise = std::move(recv_promise_ptr)](std::pair<Message, Error> result) {
+        if (result.second._errorCode == Error::ErrorCode::Uninit)
+            recv_promise->set_value(std::move(result.first));
+        else
+            recv_promise->set_value(std::unexpected {std::move(result.second)});
     });

     return recv_future;
 }

-std::future<std::expected<void, Error>> futureSendMessage(std::shared_ptr<IOSocket> socket, Message message)
+std::future<std::optional<Error>> futureSendMessage(std::shared_ptr<IOSocket> socket, Message message)
 {
-    auto send_promise_ptr = std::make_unique<std::promise<std::expected<void, Error>>>();
+    auto send_promise_ptr = std::make_unique<std::promise<std::optional<Error>>>();
     auto send_future = send_promise_ptr->get_future();
     socket->sendMessage(
-        std::move(message), [send_promise = std::move(send_promise_ptr)](std::expected<void, Error> msg) {
-            send_promise->set_value(std::move(msg));
+        std::move(message), [send_promise = std::move(send_promise_ptr)](std::expected<void, Error> result) {
+            if (result)
+                send_promise->set_value(std::nullopt);
+            else
+                send_promise->set_value(std::move(result.error()));
         });

     return send_future;
 }
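futureRecvMessage()/futureSendMessage() above bridge IOSocket's callback API onto std::future by moving a std::promise into the completion callback. The same technique in isolation, as a generic sketch (bridge_to_future and its parameter names are hypothetical, not part of YMQ):

    #include <future>
    #include <memory>
    #include <utility>

    // Wrap a single-shot callback registration into a std::future<T>.
    // `register_callback` must invoke the callable it is given exactly once.
    template <typename T, typename Register>
    std::future<T> bridge_to_future(Register register_callback)
    {
        // the promise lives inside the callback until it fires
        auto promise = std::make_unique<std::promise<T>>();
        auto future  = promise->get_future();

        register_callback([p = std::move(promise)](T value) { p->set_value(std::move(value)); });

        return future;
    }

The unique_ptr capture makes the lambda move-only, which works here because IOSocket's recvMessage/sendMessage accept move-only callables — exactly what the diff above relies on.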
diff --git a/scaler/io/ymq/simple_interface.h b/scaler/io/ymq/simple_interface.h
index 2f2d7a03d..9b9fa1f39 100644
--- a/scaler/io/ymq/simple_interface.h
+++ b/scaler/io/ymq/simple_interface.h
@@ -14,11 +14,11 @@ std::shared_ptr<IOSocket> syncCreateSocket(IOContext& context, IOSocketType type
 void syncBindSocket(std::shared_ptr<IOSocket> socket, std::string address);
 void syncConnectSocket(std::shared_ptr<IOSocket> socket, std::string address);

-std::pair<Message, Error> syncRecvMessage(std::shared_ptr<IOSocket> socket);
-std::expected<void, Error> syncSendMessage(std::shared_ptr<IOSocket> socket, Message message);
+std::expected<Message, Error> syncRecvMessage(std::shared_ptr<IOSocket> socket);
+std::optional<Error> syncSendMessage(std::shared_ptr<IOSocket> socket, Message message);

-std::future<std::pair<Message, Error>> futureRecvMessage(std::shared_ptr<IOSocket> socket);
-std::future<std::expected<void, Error>> futureSendMessage(std::shared_ptr<IOSocket> socket, Message message);
+std::future<std::expected<Message, Error>> futureRecvMessage(std::shared_ptr<IOSocket> socket);
+std::future<std::optional<Error>> futureSendMessage(std::shared_ptr<IOSocket> socket, Message message);

 }  // namespace ymq
 }  // namespace scaler
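For reference, a hedged sketch of consuming the reworked sync API declared above — syncSendMessage() now returns std::optional<Error> (empty on success) and syncRecvMessage() returns std::expected<Message, Error>. The socket setup and the error-code formatting are assumptions:

    #include <cstdio>
    #include "scaler/io/ymq/simple_interface.h"

    using namespace scaler::ymq;

    void ping(std::shared_ptr<IOSocket> socket)
    {
        // sending: "no error" means success
        if (auto error = syncSendMessage(socket, {.address = Bytes("server"), .payload = Bytes("ping")}))
            std::printf("send failed: %d\n", (int)error->_errorCode);

        // receiving: the expected either holds the Message or the Error
        if (auto result = syncRecvMessage(socket))
            std::printf("received %zu bytes\n", result->payload.len());
        else
            std::printf("recv failed: %d\n", (int)result.error()._errorCode);
    }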
diff --git a/scaler/io/ymq/ymq.pyi b/scaler/io/ymq/ymq.pyi
index 7444d6fad..d436fd136 100644
--- a/scaler/io/ymq/ymq.pyi
+++ b/scaler/io/ymq/ymq.pyi
@@ -1,6 +1,5 @@
 # NOTE: NOT IMPLEMENTATION, TYPE INFORMATION ONLY
 # This file contains type stubs for the Ymq Python C Extension module
-import abc
 import sys
 from collections.abc import Awaitable
 from enum import IntEnum
@@ -11,12 +10,17 @@ if sys.version_info >= (3, 12):
 else:
     Buffer = object

-class Bytes(Buffer, metaclass=abc.ABCMeta):
-    data: bytes
+class Bytes(Buffer):
+    data: bytes | None
     len: int

-    def __init__(self, data: SupportsBytes | bytes) -> None: ...
+    def __init__(self, data: Buffer | None = None) -> None: ...
     def __repr__(self) -> str: ...
+    def __len__(self) -> int: ...
+
+    # this type signature is not 100% accurate because it's implemented in C
+    # but this satisfies the type check and is good enough
+    def __buffer__(self, flags: int, /) -> memoryview: ...

 class Message:
     address: Bytes | None
@@ -99,7 +103,7 @@ class YMQException(Exception):
     code: ErrorCode
     message: str

-    def __init__(self, code: ErrorCode, message: str) -> None: ...
+    def __init__(self, /, code: ErrorCode, message: str) -> None: ...
     def __repr__(self) -> str: ...
     def __str__(self) -> str: ...

diff --git a/scaler/io/ymq/ymq_test.py b/scaler/io/ymq/ymq_test.py
deleted file mode 100644
index 9201983c7..000000000
--- a/scaler/io/ymq/ymq_test.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import asyncio
-
-import ymq
-
-
-async def main():
-    ctx = ymq.IOContext()
-    socket = await ctx.createIOSocket("ident", ymq.IOSocketType.Binder)
-    print(ctx, ";", socket)
-
-    assert socket.identity == "ident"
-    assert socket.socket_type == ymq.IOSocketType.Binder
-
-    exc = ymq.YMQException(ymq.ErrorCode.InvalidAddressFormat, "the address has an invalid format")
-    assert exc.code == ymq.ErrorCode.InvalidAddressFormat
-    assert exc.message == "the address has an invalid format"
-    assert exc.code.explanation()
-
-
-asyncio.run(main())

diff --git a/scaler/object_storage/object_storage_server.cpp b/scaler/object_storage/object_storage_server.cpp
index 22a4a9488..cd5de483e 100644
--- a/scaler/object_storage/object_storage_server.cpp
+++ b/scaler/object_storage/object_storage_server.cpp
@@ -129,14 +129,15 @@ void ObjectStorageServer::processRequests()

     std::ranges::for_each(_pendingSendMessageFuts, [](auto& fut) {
         if (fut.wait_for(0s) == std::future_status::ready) {
-            auto res = fut.get();
-            assert(res);
+            auto error = fut.get();
+            assert(!error);
         }
     });

-    auto [message, error] = ymq::syncRecvMessage(_ioSocket);
+    auto maybeMessage = ymq::syncRecvMessage(_ioSocket);

-    if (error._errorCode != ymq::Error::ErrorCode::Uninit) {
+    if (!maybeMessage) {
+        auto error = maybeMessage.error();
         if (error._errorCode == ymq::Error::ErrorCode::IOSocketStopRequested) {
             auto n = std::ranges::count_if(_pendingSendMessageFuts, [](auto& x) {
                 return x.valid() && x.wait_for(0s) == std::future_status::timeout;
@@ -163,8 +164,9 @@
         }
     }

-    const auto identity = lastMessageIdentity = message.address.as_string();
-    const auto headerOrPayload = std::move(message.payload);
+    const auto identity = *maybeMessage->address.as_string();
+    lastMessageIdentity = identity;
+    const auto headerOrPayload = std::move(maybeMessage->payload);

     auto it = identityToFullRequest.find(identity);
     if (it == identityToFullRequest.end()) {

diff --git a/scaler/object_storage/object_storage_server.h b/scaler/object_storage/object_storage_server.h
index 20ed8e738..00849950e 100644
--- a/scaler/object_storage/object_storage_server.h
+++ b/scaler/object_storage/object_storage_server.h
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include <optional>
 #include

 #include "scaler/io/ymq/configuration.h"
@@ -22,7 +23,7 @@ namespace object_storage {
 class ObjectStorageServer {
 public:
     using Identity = ymq::Configuration::IOSocketIdentity;
-    using SendMessageFuture = std::future<std::expected<void, ymq::Error>>;
+    using SendMessageFuture = std::future<std::optional<ymq::Error>>;

     ObjectStorageServer();
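processRequests() above polls _pendingSendMessageFuts with wait_for(0s): ready futures are consumed and checked, pending ones are skipped. A self-contained sketch of the same non-blocking drain — the server only inspects its futures in place, while this version additionally prunes the finished ones (Error is a stand-in type here):

    #include <cassert>
    #include <chrono>
    #include <future>
    #include <optional>
    #include <vector>

    using namespace std::chrono_literals;

    struct Error {};  // placeholder for ymq::Error

    void drain_completed(std::vector<std::future<std::optional<Error>>>& pending)
    {
        std::erase_if(pending, [](auto& fut) {
            if (!fut.valid() || fut.wait_for(0s) != std::future_status::ready)
                return false;        // still in flight: poll it again later

            auto error = fut.get();  // a future may only be get() once
            assert(!error);          // nullopt now signals a successful send
            return true;             // drop the consumed future
        });
    }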
diff --git a/scripts/build.sh b/scripts/build.sh
index 7256ea367..5fe3482d7 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -27,4 +27,4 @@ cmake --build --preset $BUILD_PRESET
 cmake --install $BUILD_DIR

 # Tests
-ctest --preset $BUILD_PRESET
+ctest --preset $BUILD_PRESET -VV

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ae6a3b116..a1f238355 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -11,6 +11,8 @@ set(BUILD_GMOCK OFF CACHE BOOL "" FORCE)
 set(BUILD_GTEST ON CACHE BOOL "" FORCE)
 FetchContent_MakeAvailable(googletest)

+find_package(Python3 COMPONENTS Development REQUIRED)
+
 # This function compiles, links, and adds a C++ test executable using Google Test.
 # It is shared by all test subdirectories.
 function(add_test_executable test_name source_file)
@@ -26,16 +28,17 @@ function(add_test_executable test_name source_file)
         CapnProto::capnp
         CapnProto::kj
         GTest::gtest_main
+        Python3::Python
     )

     add_test(NAME ${test_name} COMMAND ${test_name})
 endfunction()

-
 if(LINUX OR APPLE)
     # This directory fetches Google Test, so it must be included first.
     add_subdirectory(object_storage)

     # Add the new directory for io tests.
     add_subdirectory(io/ymq)
+    add_subdirectory(cc_ymq)
 endif()

diff --git a/tests/cc_ymq/CMakeLists.txt b/tests/cc_ymq/CMakeLists.txt
new file mode 100644
index 000000000..9f6abe371
--- /dev/null
+++ b/tests/cc_ymq/CMakeLists.txt
@@ -0,0 +1 @@
+add_test_executable(test_cc_ymq test_cc_ymq.cpp)

diff --git a/tests/cc_ymq/common.h b/tests/cc_ymq/common.h
new file mode 100644
index 000000000..42dfab9fe
--- /dev/null
+++ b/tests/cc_ymq/common.h
@@ -0,0 +1,496 @@
+#pragma once
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <sys/timerfd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <cerrno>
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <format>
+#include <functional>
+#include <iostream>
+#include <optional>
+#include <print>
+#include <ranges>
+#include <string>
+#include <system_error>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#define RETURN_FAILURE_IF_FALSE(condition) \
+    if (!(condition)) {                    \
+        return TestResult::Failure;        \
+    }
+
+using namespace std::chrono_literals;
+
+enum class TestResult : char { Success = 1, Failure = 2 };
+
+inline const char* check_localhost(const char* host)
+{
+    return std::strcmp(host, "localhost") == 0 ? "127.0.0.1" : host;
+}
+
+inline std::string format_address(std::string host, uint16_t port)
+{
+    return std::format("tcp://{}:{}", check_localhost(host.c_str()), port);
+}
+
+class OwnedFd {
+public:
+    int fd;
+
+    OwnedFd(int fd): fd(fd) {}
+
+    // move-only
+    OwnedFd(const OwnedFd&) = delete;
+    OwnedFd& operator=(const OwnedFd&) = delete;
+    OwnedFd(OwnedFd&& other) noexcept: fd(other.fd) { other.fd = 0; }
+    OwnedFd& operator=(OwnedFd&& other) noexcept
+    {
+        if (this != &other) {
+            this->fd = other.fd;
+            other.fd = 0;
+        }
+        return *this;
+    }
+
+    ~OwnedFd()
+    {
+        if (fd > 0 && close(fd) < 0)
+            std::println(std::cerr, "failed to close fd!");
+    }
+
+    size_t write(const void* data, size_t len)
+    {
+        auto n = ::write(this->fd, data, len);
+        if (n < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to write to socket");
+
+        return n;
+    }
+
+    void write_all(const char* data, size_t len)
+    {
+        for (size_t cursor = 0; cursor < len;)
+            cursor += this->write(data + cursor, len - cursor);
+    }
+
+    void write_all(std::vector<char> data) { this->write_all(data.data(), data.size()); }
+
+    size_t read(void* buffer, size_t len)
+    {
+        auto n = ::read(this->fd, buffer, len);
+        if (n < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to read from socket");
+        return n;
+    }
+
+    void read_exact(char* buffer, size_t len)
+    {
+        for (size_t cursor = 0; cursor < len;)
+            cursor += this->read(buffer + cursor, len - cursor);
+    }
+
+    operator int() { return fd; }
+};
+
+class Socket: public OwnedFd {
+public:
+    Socket(int fd): OwnedFd(fd) {}
+
+    void connect(const char* host, uint16_t port, bool nowait = false)
+    {
+        sockaddr_in addr {
+            .sin_family = AF_INET,
+            .sin_port   = htons(port),
+            .sin_addr   = {.s_addr = inet_addr(check_localhost(host))},
+            .sin_zero   = {0}};
+
+    connect:
+        if (::connect(this->fd, (sockaddr*)&addr, sizeof(addr)) < 0) {
+            if (errno == ECONNREFUSED && !nowait) {
+                std::this_thread::sleep_for(300ms);
+                goto connect;
+            }
+
+            throw std::system_error(errno, std::generic_category(), "failed to connect");
+        }
+    }
+
+    void bind(const char* host, int port)
+    {
+        sockaddr_in addr {
+            .sin_family = AF_INET,
+            .sin_port   = htons(port),
+            .sin_addr   = {.s_addr = inet_addr(check_localhost(host))},
+            .sin_zero   = {0}};
+
+        auto status = ::bind(this->fd, (sockaddr*)&addr, sizeof(addr));
+        if (status < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to bind");
+    }
+
+    void listen(int n = 32)
+    {
+        auto status = ::listen(this->fd, n);
+        if (status < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to listen on socket");
+    }
+
+    std::pair<Socket, sockaddr_in> accept(int flags = 0)
+    {
+        sockaddr_in peer_addr {};
+        socklen_t len = sizeof(peer_addr);
+        auto fd = ::accept4(this->fd, (sockaddr*)&peer_addr, &len, flags);
+        if (fd < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to accept socket");
+
+        return std::make_pair(Socket(fd), peer_addr);
+    }
+
+    void write_message(std::string message)
+    {
+        uint64_t header = message.length();
+        this->write_all((char*)&header, 8);
+        this->write_all(message.data(), message.length());
+    }
+
+    std::string read_message()
+    {
+        uint64_t header = 0;
+        this->read_exact((char*)&header, 8);
+
+        std::vector<char> buffer(header);
+        this->read_exact(buffer.data(), header);
+
+        return std::string(buffer.data(), header);
+    }
+};
+
+class TcpSocket: public Socket {
+public:
+    TcpSocket(): Socket(0)
+    {
+        this->fd = ::socket(AF_INET, SOCK_STREAM, 0);
+        if (this->fd < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to create socket");
+
+        int on = 1;
+        if (setsockopt(this->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&on, sizeof(on)) < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to set nodelay");
+
+        if (setsockopt(this->fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0)
+            throw std::system_error(errno, std::generic_category(), "failed to set reuseaddr");
+    }
+};
+
+inline void fork_wrapper(std::function<TestResult()> fn, int timeout_secs, OwnedFd pipe_wr)
+{
+    TestResult result = TestResult::Failure;
+    try {
+        result = fn();
+    } catch (const std::exception& e) {
+        std::println(stderr, "Exception: {}", e.what());
+        result = TestResult::Failure;
+    }
+
+    pipe_wr.write_all((char*)&result, sizeof(TestResult));
+}
+
+// this function along with `wait_for_python_ready_sigwait()`
+// work together to wait on a signal from the python process
+// indicating that the tuntap interface has been created, and that the mitm is ready
+inline void wait_for_python_ready_sigblock()
+{
+    sigset_t set {};
+    int sig = 0;
+
+    if (sigemptyset(&set) < 0)
+        throw std::system_error(errno, std::generic_category(), "failed to create empty signal set");
+
+    if (sigaddset(&set, SIGUSR1) < 0)
+        throw std::system_error(errno, std::generic_category(), "failed to add sigusr1 to the signal set");
+
+    if (sigprocmask(SIG_BLOCK, &set, nullptr) < 0)
+        throw std::system_error(errno, std::generic_category(), "failed to mask sigusr1");
+
+    std::println("blocked signal...");
+}
+
+inline void wait_for_python_ready_sigwait(int timeout_secs)
+{
+    sigset_t set {};
+    siginfo_t sig {};
+
+    if (sigemptyset(&set) < 0)
+        throw std::system_error(errno, std::generic_category(), "failed to create empty signal set");
+
+    if (sigaddset(&set, SIGUSR1) < 0)
+        throw std::system_error(errno, std::generic_category(), "failed to add sigusr1 to the signal set");
+
+    std::println("waiting for python to be ready...");
+    timespec ts {.tv_sec = timeout_secs, .tv_nsec = 0};
+    if (sigtimedwait(&set, &sig, &ts) < 0)
+        throw std::system_error(errno, std::generic_category(), "failed to wait on sigusr1");
+
+    sigprocmask(SIG_UNBLOCK, &set, nullptr);
+    std::println("signal received; python is ready");
+}
+
+// run a test
+// forks and runs each of the provided closures
+// if `wait_for_python` is true, wait for SIGUSR1 after forking and executing the first closure
+inline TestResult test(
+    int timeout_secs, std::vector<std::function<TestResult()>> closures, bool wait_for_python = false)
+{
+    std::vector<std::pair<int, int>> pipes {};
+    std::vector<pid_t> pids {};
+
+    for (size_t i = 0; i < closures.size(); i++) {
+        int pipe[2] = {0};
+        if (pipe2(pipe, O_NONBLOCK) < 0) {
+            std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) {
+                close(pipe.first);
+                close(pipe.second);
+            });
+
+            throw std::system_error(errno, std::generic_category(), "failed to create pipe: ");
+        }
+        pipes.push_back(std::make_pair(pipe[0], pipe[1]));
+    }
+
+    for (size_t i = 0; i < closures.size(); i++) {
+        if (wait_for_python && i == 0)
+            wait_for_python_ready_sigblock();
+
+        auto pid = fork();
+        if (pid < 0) {
+            std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) {
+                close(pipe.first);
+                close(pipe.second);
+            });
+
+            std::for_each(pids.begin(), pids.end(), [](const auto& pid) { kill(pid, SIGKILL); });
+
+            throw std::system_error(errno, std::generic_category(), "failed to fork");
+        }
+
+        if (pid == 0) {
+            // close all pipes except our write half
+            for (size_t j = 0; j < pipes.size(); j++) {
+                if (i == j)
+                    close(pipes[i].first);
+                else {
+                    close(pipes[j].first);
+                    close(pipes[j].second);
+                }
+            }
+
+            fork_wrapper(closures[i], timeout_secs, pipes[i].second);
+            std::exit(EXIT_SUCCESS);
+        }
+
+        pids.push_back(pid);
+
+        if (wait_for_python && i == 0)
+            wait_for_python_ready_sigwait(3);
+    }
+
+    // close all write halves of the pipes
+    for (auto pipe: pipes)
+        close(pipe.second);
+
+    std::vector<pollfd> pfds {};
+
+    OwnedFd timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+    if (timerfd < 0) {
+        std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) { close(pipe.first); });
+        std::for_each(pids.begin(), pids.end(), [](const auto& pid) { kill(pid, SIGKILL); });
+
+        throw std::system_error(errno, std::generic_category(), "failed to create timerfd");
+    }
+
+    pfds.push_back({.fd = timerfd.fd, .events = POLL_IN, .revents = 0});
+    for (auto pipe: pipes)
+        pfds.push_back({
+            .fd      = pipe.first,
+            .events  = POLL_IN,
+            .revents = 0,
+        });
+
+    itimerspec spec {
+        .it_interval =
+            {
+                .tv_sec  = 0,
+                .tv_nsec = 0,
+            },
+        .it_value = {
+            .tv_sec  = timeout_secs,
+            .tv_nsec = 0,
+        }};
+
+    if (timerfd_settime(timerfd, 0, &spec, nullptr) < 0) {
+        std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) { close(pipe.first); });
+        std::for_each(pids.begin(), pids.end(), [](const auto& pid) { kill(pid, SIGKILL); });
+
+        throw std::system_error(errno, std::generic_category(), "failed to set timerfd");
+    }
+
+    std::vector<std::optional<TestResult>> results(pids.size(), std::nullopt);
+
+    for (;;) {
+        auto n = poll(pfds.data(), pfds.size(), -1);
+        if (n < 0) {
+            std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) { close(pipe.first); });
+            std::for_each(pids.begin(), pids.end(), [](const auto& pid) { kill(pid, SIGKILL); });
+
+            throw std::system_error(errno, std::generic_category(), "failed to poll: ");
+        }
+
+        for (auto& pfd: std::vector(pfds)) {
+            if (pfd.revents == 0)
+                continue;
+
+            // timed out
+            if (pfd.fd == timerfd) {
+                std::println("Timed out!");
+
+                std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) { close(pipe.first); });
+                std::for_each(pids.begin(), pids.end(), [](const auto& pid) { kill(pid, SIGKILL); });
+
+                return TestResult::Failure;
+            }
+
+            TestResult result = TestResult::Failure;
+            char buffer       = 0;
+            if (read(pfd.fd, &buffer, sizeof(TestResult)) <= 0)
+                result = TestResult::Failure;
+            else
+                result = (TestResult)buffer;
+
+            auto elem = std::find_if(pipes.begin(), pipes.end(), [fd = pfd.fd](auto pipe) { return pipe.first == fd; });
+            auto idx  = elem - pipes.begin();
+            results[idx] = result;
+
+            std::println("Process[{}] completed with {}", idx, result == TestResult::Success ? "Success" : "Failure");
+
+            // this subprocess is done, remove its pipe from the poll fds
+            pfds.erase(std::remove_if(pfds.begin(), pfds.end(), [&](auto p) { return p.fd == pfd.fd; }), pfds.end());
+
+            auto done = std::all_of(results.begin(), results.end(), [](auto result) { return result.has_value(); });
+            if (done)
+                goto end;  // justification for goto: breaks out of two levels of loop
+        }
+    }
+
+end:
+
+    std::for_each(pipes.begin(), pipes.end(), [](const auto& pipe) { close(pipe.first); });
+
+    int status = 0;
+    std::for_each(pids.begin(), pids.end(), [&status](const auto& pid) {
+        if (waitpid(pid, &status, 0) < 0)
+            std::println(stderr, "failed to wait on a subprocess");
+    });
+
+    if (std::ranges::any_of(results, [](auto x) { return x == TestResult::Failure; }))
+        return TestResult::Failure;
+
+    return TestResult::Success;
+}
+
+inline TestResult run_python(const char* path, std::vector<const wchar_t*> argv = {})
+{
+    // insert the pid at the start of the argv, this is important for signalling readiness
+    pid_t pid = getppid();
+    auto pid_ws = std::to_wstring(pid);
+    argv.insert(argv.begin(), pid_ws.c_str());
+
+    PyStatus status;
+    PyConfig config;
+    PyConfig_InitPythonConfig(&config);
+
+    status = PyConfig_SetBytesString(&config, &config.program_name, "mitm");
+    if (PyStatus_Exception(status))
+        goto exception;
+
+    status = Py_InitializeFromConfig(&config);
+    if (PyStatus_Exception(status))
+        goto exception;
+    PyConfig_Clear(&config);
+
+    argv.insert(argv.begin(), L"mitm");
+    PySys_SetArgv(argv.size(), (wchar_t**)argv.data());
+
+    {
+        auto file = fopen(path, "r");
+        if (!file)
+            throw std::system_error(errno, std::generic_category(), "failed to open python file");
+
+        PyRun_SimpleFile(file, path);
+        fclose(file);
+    }
+
+    if (Py_FinalizeEx() < 0) {
+        std::println("finalization failure");
+        return TestResult::Failure;
+    }
+
+    return TestResult::Success;
+
+exception:
+    PyConfig_Clear(&config);
+    Py_ExitStatusException(status);
+
+    return TestResult::Failure;
+}
+
+inline TestResult run_mitm(
+    std::string testcase,
+    std::string mitm_ip,
+    uint16_t mitm_port,
+    std::string remote_ip,
+    uint16_t remote_port,
+    std::vector<std::string> extra_args = {})
+{
+    // we build the args for the user to make calling the function more convenient
+    std::vector<std::string> args {
+        testcase, mitm_ip, std::to_string(mitm_port), remote_ip, std::to_string(remote_port)};
+
+    for (auto arg: extra_args)
+        args.push_back(arg);
+
+    // we need to convert to wide strings to pass to Python
+    std::vector<std::wstring> wide_args_owned {};
+
+    // the strings are ascii so we can just make them into wstrings
+    for (const auto& str: args)
+        wide_args_owned.emplace_back(str.begin(), str.end());
+
+    std::vector<const wchar_t*> wide_args {};
+    for (const auto& wstr: wide_args_owned)
+        wide_args.push_back(wstr.c_str());
+
+    return run_python("tests/cc_ymq/py_mitm/main.py", wide_args);
+}

diff --git a/tests/cc_ymq/py_mitm/__init__.py b/tests/cc_ymq/py_mitm/__init__.py
new file mode 100644
index 000000000..e69de29bb
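Socket::write_message()/read_message() above pin down the wire format the raw test clients speak to YMQ: an 8-byte length header written without byte-order conversion (so both ends are assumed little-endian, as in CI), followed by the payload; after connecting, each peer first sends its identity ("client"/"server") as an ordinary frame. A compressed sketch of one frame, with error handling and the partial-write loop left to write_all()/read_exact() above:

    #include <cstdint>
    #include <string>
    #include <unistd.h>

    // one YMQ-style frame: [u64 length][bytes...]
    void send_frame(int fd, const std::string& payload)
    {
        uint64_t header = payload.size();
        (void)::write(fd, &header, sizeof(header));         // 8-byte length prefix
        (void)::write(fd, payload.data(), payload.size());  // then the payload itself
    }

    // handshake as the raw clients below perform it:
    //   send_frame(fd, "client");           // our identity
    //   /* read the peer's identity */      // expect "server"
    //   send_frame(fd, "yi er san si wu liu");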
diff --git a/tests/cc_ymq/py_mitm/main.py b/tests/cc_ymq/py_mitm/main.py
new file mode 100644
index 000000000..edaeba569
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/main.py
@@ -0,0 +1,152 @@
+# flake8: noqa: E402
+
+"""
+This script provides a framework for running MITM test cases
+"""
+
+import argparse
+import os
+import sys
+import importlib
+import signal
+import subprocess
+
+from tests.cc_ymq.py_mitm.types import MITMProtocol, TCPConnection
+from scapy.all import IP, TCP, TunTapInterface  # type: ignore
+
+
+def echo_call(cmd: list[str]):
+    print(f"+ {' '.join(cmd)}")
+    subprocess.check_call(cmd)
+
+
+def create_tuntap_interface(iface_name: str, mitm_ip: str, remote_ip: str) -> TunTapInterface:
+    """
+    Creates a TUNTAP interface, brings it up, and adds IPs using the `ip` program
+
+    Args:
+        iface_name: The name of the TUNTAP interface, usually like `tun0`, `tun1`, etc.
+        mitm_ip: The desired ip address of the mitm. This is the ip that clients can use to connect to the mitm
+        remote_ip: The ip that routes to/from the tuntap interface.
+            packets sent to `mitm_ip` will appear to come from `remote_ip`, \
+            and conversely the tuntap interface can connect/send packets
+            to `remote_ip`, making it a suitable ip for binding a server
+
+    Returns:
+        The TUNTAP interface
+    """
+    iface = TunTapInterface(iface_name, mode="tun")
+
+    try:
+        echo_call(["sudo", "ip", "link", "set", iface_name, "up"])
+        echo_call(["sudo", "ip", "addr", "add", remote_ip, "peer", mitm_ip, "dev", iface_name])
+        print(f"[+] Interface {iface_name} up with IP {mitm_ip}")
+    except subprocess.CalledProcessError:
+        print("[!] Could not bring up interface. Run as root or set manually.")
+        raise
+
+    return iface
+
+
+def main(pid: int, mitm_ip: str, mitm_port: int, remote_ip: str, server_port: int, mitm: MITMProtocol):
+    """
+    This function serves as a framework for man in the middle implementations
+    A client connects to the MITM, then the MITM connects to a remote server
+    The MITM sits in between the client and the server, manipulating the packets sent depending on the test case
+    This function:
+    1. creates a TUNTAP interface and prepares it for MITM
+    2. handles connecting clients and connection closes
+    3. delegates additional logic to a pluggable callable, `mitm`
+    4. returns when both connections have terminated (via )
+
+    Args:
+        pid: this is the pid of the test process, used for signaling readiness \
+            we send SIGUSR1 to this process when the mitm is ready
+        mitm_ip: The desired ip address of the mitm server
+        mitm_port: The desired port of the mitm server. \
+            This is the port used to connect to the server, but the client is free to connect on any port
+        remote_ip: The desired remote ip for the TUNTAP interface. This is the only ip address \
+            reachable by the interface and is thus the src ip for clients, and the ip that the remote server \
+            must be bound to
+        server_port: The port that the remote server is bound to
+        mitm: The core logic for a MITM test case. 
This callable may maintain its own state and is responsible \ + for sending packets over the TUNTAP interface (if it doesn't, nothing will happen) + """ + + tuntap = create_tuntap_interface("tun0", mitm_ip, remote_ip) + + # signal the caller that the tuntap interface has been created + if pid > 0: + os.kill(pid, signal.SIGUSR1) + + # these track information about our connections + # we already know what to expect for the server connection, we are the connector + client_conn = None + server_conn = TCPConnection(mitm_ip, mitm_port, remote_ip, server_port) + + # tracks the state of each connection + client_sent_fin_ack = False + client_closed = False + server_sent_fin_ack = False + server_closed = False + + while True: + pkt = tuntap.recv() + if not pkt.haslayer(IP) or not pkt.haslayer(TCP): + continue + ip = pkt[IP] + tcp = pkt[TCP] + + # for a received packet, the destination ip and port are our local ip and port + # and the source ip and port will be the remote ip and port + sender = TCPConnection(pkt.dst, pkt.dport, pkt.src, pkt.sport) + + if sender == client_conn: + print(f"-> [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}") + elif sender == server_conn: + print(f"<- [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}") + + if tcp.flags == "S": # SYN from client + print("-> [S]") + print(f"[*] New connection from {ip.src}:{tcp.sport} to {ip.dst}:{tcp.dport}") + client_conn = sender + + if tcp.flags == "SA": # SYN-ACK from server + if sender == server_conn: + print(f"[*] Connection to server established: {ip.src}:{tcp.sport} to {ip.dst}:{tcp.dport}") + + if tcp.flags == "FA": # FIN-ACK + if sender == client_conn: + client_sent_fin_ack = True + if sender == server_conn: + server_sent_fin_ack = True + + if tcp.flags == "A": # ACK + if sender == client_conn and server_sent_fin_ack: + server_closed = True + if sender == server_conn and client_sent_fin_ack: + client_closed = True + + mitm.proxy(tuntap, pkt, sender, client_conn, server_conn) + + if client_closed and server_closed: + print("[*] Both connections closed") + return + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Man in the middle test framework") + parser.add_argument("pid", type=int, help="The pid of the test process, used for signaling") + parser.add_argument("testcase", type=str, help="The MITM test case module name") + parser.add_argument("mitm_ip", type=str, help="The desired ip address of the mitm server") + parser.add_argument("mitm_port", type=int, help="The desired port of the mitm server") + parser.add_argument("remote_ip", type=str, help="The desired remote ip for the TUNTAP interface") + parser.add_argument("server_port", type=int, help="The port that the remote server is bound to") + + args, unknown = parser.parse_known_args() + + # add the script's directory to path + sys.path.append(os.path.dirname(os.path.realpath(__file__))) + + # load the module dynamically + module = importlib.import_module(args.testcase) + main(args.pid, args.mitm_ip, args.mitm_port, args.remote_ip, args.server_port, module.MITM(*unknown)) diff --git a/tests/cc_ymq/py_mitm/passthrough.py b/tests/cc_ymq/py_mitm/passthrough.py new file mode 100644 index 000000000..20d8a9069 --- /dev/null +++ b/tests/cc_ymq/py_mitm/passthrough.py @@ -0,0 +1,23 @@ +""" +This MITM acts as a transparent passthrough, it simply forwards packets as they are, +minus necessary header changes to retransmit +This MITM should have no effect on the client and server, +and they should behave as if the 
MITM is not present
+"""
+
+from tests.cc_ymq.py_mitm.types import MITMProtocol, TunTapInterface, IP, TCPConnection
+
+
+class MITM(MITMProtocol):
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None:
+        if sender == client_conn:
+            tuntap.send(server_conn.rewrite(pkt))
+        elif sender == server_conn:
+            tuntap.send(client_conn.rewrite(pkt))

diff --git a/tests/cc_ymq/py_mitm/randomly_drop_packets.py b/tests/cc_ymq/py_mitm/randomly_drop_packets.py
new file mode 100644
index 000000000..a197ac3c8
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/randomly_drop_packets.py
@@ -0,0 +1,28 @@
+"""
+This MITM drops a % of packets
+"""
+
+import random
+
+from tests.cc_ymq.py_mitm.types import MITMProtocol, TunTapInterface, IP, TCPConnection
+
+
+class MITM(MITMProtocol):
+    def __init__(self, drop_pcent: str):
+        self.drop_pcent = float(drop_pcent)
+
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None:
+        if random.random() < self.drop_pcent:
+            print("[!] Dropping packet")
+            return
+
+        if sender == client_conn:
+            tuntap.send(server_conn.rewrite(pkt))
+        elif sender == server_conn:
+            tuntap.send(client_conn.rewrite(pkt))

diff --git a/tests/cc_ymq/py_mitm/send_rst_to_client.py b/tests/cc_ymq/py_mitm/send_rst_to_client.py
new file mode 100644
index 000000000..fc70355e5
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/send_rst_to_client.py
@@ -0,0 +1,48 @@
+"""
+This MITM inserts an unexpected TCP RST
+"""
+
+from tests.cc_ymq.py_mitm.types import IP, TCP, MITMProtocol, TCPConnection, TunTapInterface
+
+
+class MITM(MITMProtocol):
+    def __init__(self):
+        # count the number of psh-acks sent by the client
+        self.client_pshack_counter = 0
+
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None:
+        if sender == client_conn or client_conn is None:
+            if pkt[TCP].flags == "PA":
+                self.client_pshack_counter += 1
+
+                # on the second psh-ack, send a rst instead
+                if self.client_pshack_counter == 2:
+                    rst_pkt = IP(src=client_conn.local_ip, dst=client_conn.remote_ip) / TCP(
+                        sport=client_conn.local_port, dport=client_conn.remote_port, flags="R", seq=pkt[TCP].ack
+                    )
+                    print(f"<- [{rst_pkt[TCP].flags}] (simulated)")
+                    tuntap.send(rst_pkt)
+                    return
+
+            tuntap.send(server_conn.rewrite(pkt))
+        elif sender == server_conn:
+            tuntap.send(client_conn.rewrite(pkt))
+
+
+# client -> mitm -> server
+# server -> mitm -> client
+
+# client: 127.0.0.1:8080
+# mitm: 127.0.0.1:8081
+# server: 127.0.0.1:8081
+
+
+# client -> mitm == src = client.ip, sport = client.port ;; dst = mitm.ip, dport = mitm.port
+# mitm -> server == src = mitm.ip, sport = mitm.port ;; dst = server.ip, dport = server.port
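send_rst_to_client above forges a bare RST (its seq taken from the client's last ack) instead of forwarding the client's second PSH-ACK. Nothing special is needed on the C++ side to observe this: once the kernel accepts the RST, the next read on that socket fails. A sketch of the client-side symptom — plain POSIX behaviour, independent of YMQ:

    #include <cerrno>
    #include <unistd.h>

    // returns true if the peer (here: the mitm) reset the connection
    bool saw_rst(int fd)
    {
        char buf[256];
        ssize_t n = ::read(fd, buf, sizeof(buf));

        // n == 0 would be an orderly FIN close; a reset surfaces as ECONNRESET.
        // Recovering from this is what DISABLED_TestMitmReconnect exercises.
        return n < 0 && errno == ECONNRESET;
    }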
diff --git a/tests/cc_ymq/py_mitm/types.py b/tests/cc_ymq/py_mitm/types.py
new file mode 100644
index 000000000..4a22ee01a
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/types.py
@@ -0,0 +1,54 @@
+"""
+This is the common code for implementing man in the middle in Python
+"""
+
+import dataclasses
+from typing import Protocol
+
+from scapy.all import TunTapInterface, IP, TCP  # type: ignore
+
+
+@dataclasses.dataclass
+class TCPConnection:
+    """
+    Represents a TCP connection over the TUNTAP interface
+    local_ip and local_port are the mitm's ip and port, and
+    remote_ip and remote_port are the ip and port of the remote peer
+    """
+
+    local_ip: str
+    local_port: int
+    remote_ip: str
+    remote_port: int
+
+    def rewrite(self, pkt: IP, ack: int | None = None, data=None):
+        """
+        Rewrite a TCP/IP packet as a packet originating
+        from (local_ip, local_port) and going to (remote_ip, remote_port)
+        This function is useful for taking a packet received from one connection, and redirecting it to another
+
+        Args:
+            pkt: A scapy TCP/IP packet to rewrite
+            ack: An optional ack number to use instead of the one found in `pkt`
+            data: An optional payload to use instead of the one found in `pkt`
+
+        Returns:
+            The rewritten packet, suitable for sending over TUNTAP
+        """
+        tcp = pkt[TCP]
+
+        return (
+            IP(src=self.local_ip, dst=self.remote_ip)
+            / TCP(sport=self.local_port, dport=self.remote_port, flags=tcp.flags, seq=tcp.seq, ack=ack or tcp.ack)
+            / bytes(data or tcp.payload)
+        )
+
+
+class MITMProtocol(Protocol):
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None: ...

diff --git a/tests/cc_ymq/test_cc_ymq.cpp b/tests/cc_ymq/test_cc_ymq.cpp
new file mode 100644
index 000000000..1e7872b5c
--- /dev/null
+++ b/tests/cc_ymq/test_cc_ymq.cpp
@@ -0,0 +1,508 @@
+// this file contains the tests for the C++ interface of YMQ
+// each test case is composed of at least one client and one server, and possibly a middleman
+// the clients and servers used in these tests are defined in the first part of this file
+//
+// the men in the middle (mitm) are implemented using Python and are found in py_mitm/
+// in that directory, `main.py` is the entrypoint and framework for all the mitm,
+// and the individual mitm implementations are found in their respective files
+//
+// the test cases are at the bottom of this file, after the clients and servers
+// the documentation for each case is found on the TEST() definition
+
+#include <gtest/gtest.h>
+#include <unistd.h>
+
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <thread>
+
+#include "common.h"
+#include "scaler/io/ymq/bytes.h"
+#include "scaler/io/ymq/io_context.h"
+#include "scaler/io/ymq/simple_interface.h"
+#include "tests/cc_ymq/common.h"
+
+using namespace scaler::ymq;
+using namespace std::chrono_literals;
+
+// ━━━━━━━━━━━━━━━━━━━
+// clients and servers
+// ━━━━━━━━━━━━━━━━━━━
+
+TestResult basic_server_ymq(std::string host, uint16_t port)
+{
+    IOContext context(1);
+
+    auto socket = syncCreateSocket(context, IOSocketType::Binder, "server");
+    syncBindSocket(socket, format_address(host, port));
+
+    auto result = syncRecvMessage(socket);
+
+    RETURN_FAILURE_IF_FALSE(result.has_value());
+    RETURN_FAILURE_IF_FALSE(result->payload.as_string() == "yi er san si wu liu");
+
+    context.removeIOSocket(socket);
+
+    return TestResult::Success;
+}
+
+TestResult basic_client_ymq(std::string host, uint16_t port)
+{
+    IOContext context(1);
+
+    auto socket = syncCreateSocket(context, IOSocketType::Connector, "client");
+    syncConnectSocket(socket, format_address(host, port));
+
+    auto result = syncSendMessage(socket, {.address = Bytes("server"), .payload = Bytes("yi er san si wu liu")});
+
+    context.removeIOSocket(socket);
+
+    return TestResult::Success;
+}
+
+TestResult basic_server_raw(std::string host, uint16_t port)
+{
+    TcpSocket socket;
+
+    socket.bind(host.c_str(), port);
+    socket.listen();
+
+    auto [client, _] = socket.accept();
+
+    client.write_message("server");
+
+    auto client_identity = client.read_message();
+    RETURN_FAILURE_IF_FALSE(client_identity == "client");
+
+    auto msg = client.read_message();
+    RETURN_FAILURE_IF_FALSE(msg == "yi er san si wu 
liu"); + + return TestResult::Success; +} + +TestResult basic_client_raw(int delay, std::string host, uint16_t port) +{ + TcpSocket socket; + + socket.connect(host.c_str(), port); + socket.write_message("client"); + auto server_identity = socket.read_message(); + RETURN_FAILURE_IF_FALSE(server_identity == "server"); + socket.write_message("yi er san si wu liu"); + + if (delay) + std::this_thread::sleep_for(std::chrono::seconds(delay)); + + return TestResult::Success; +} + +TestResult server_receives_big_message(std::string host, uint16_t port) +{ + IOContext context(1); + + auto socket = syncCreateSocket(context, IOSocketType::Binder, "server"); + syncBindSocket(socket, format_address(host, port)); + auto result = syncRecvMessage(socket); + + RETURN_FAILURE_IF_FALSE(result.has_value()); + RETURN_FAILURE_IF_FALSE(result->payload.len() == 500'000'000); + + context.removeIOSocket(socket); + + return TestResult::Success; +} + +TestResult client_sends_big_message(int delay, std::string host, uint16_t port) +{ + TcpSocket socket; + + socket.connect(host.c_str(), port); + socket.write_message("client"); + auto remote_identity = socket.read_message(); + RETURN_FAILURE_IF_FALSE(remote_identity == "server"); + std::string msg(500'000'000, '.'); + socket.write_message(msg); + + if (delay) + std::this_thread::sleep_for(std::chrono::seconds(delay)); + + return TestResult::Success; +} + +TestResult reconnect_server_main(std::string host, uint16_t port) +{ + IOContext context(1); + + auto socket = syncCreateSocket(context, IOSocketType::Binder, "server"); + syncBindSocket(socket, format_address(host, port)); + auto result = syncRecvMessage(socket); + + RETURN_FAILURE_IF_FALSE(result.has_value()); + RETURN_FAILURE_IF_FALSE(result->payload.as_string() == "hello!!"); + + context.removeIOSocket(socket); + + return TestResult::Success; +} + +TestResult reconnect_client_main(std::string host, uint16_t port) +{ + IOContext context(1); + + auto socket = syncCreateSocket(context, IOSocketType::Connector, "client"); + syncConnectSocket(socket, format_address(host, port)); + auto result = syncSendMessage(socket, {.address = Bytes("server"), .payload = Bytes("hello!!")}); + + context.removeIOSocket(socket); + + return TestResult::Success; +} + +TestResult client_simulated_slow_network(const char* host, uint16_t port) +{ + TcpSocket socket; + + socket.connect(host, port); + socket.write_message("client"); + auto remote_identity = socket.read_message(); + RETURN_FAILURE_IF_FALSE(remote_identity == "server"); + + std::string message = "yi er san si wu liu"; + uint64_t header = message.length(); + + socket.write_all((char*)&header, 4); + std::this_thread::sleep_for(5s); + socket.write_all((char*)&header + 4, 4); + std::this_thread::sleep_for(3s); + socket.write_all(message.data(), header / 2); + std::this_thread::sleep_for(5s); + socket.write_all(message.data() + header / 2, header - header / 2); + std::this_thread::sleep_for(3s); + + return TestResult::Success; +} + +TestResult client_sends_incomplete_identity(const char* host, uint16_t port) +{ + // open a socket, write an incomplete identity and exit + { + TcpSocket socket; + + socket.connect(host, port); + + auto server_identity = socket.read_message(); + RETURN_FAILURE_IF_FALSE(server_identity == "server"); + + // write incomplete identity and exit + std::string identity = "client"; + uint64_t header = identity.length(); + socket.write_all((char*)&header, 8); + socket.write_all(identity.data(), identity.length() - 2); + std::this_thread::sleep_for(3s); + } + + // 
connect again and try to send a message
+    {
+        TcpSocket socket;
+
+        socket.connect(host, port);
+
+        auto server_identity = socket.read_message();
+        RETURN_FAILURE_IF_FALSE(server_identity == "server");
+
+        socket.write_message("client");
+        socket.write_message("yi er san si wu liu");
+
+        std::this_thread::sleep_for(3s);
+    }
+
+    return TestResult::Success;
+}
+
+TestResult server_receives_huge_header(const char* host, uint16_t port)
+{
+    IOContext context(1);
+
+    auto socket = syncCreateSocket(context, IOSocketType::Binder, "server");
+    syncBindSocket(socket, format_address(host, port));
+
+    auto result = syncRecvMessage(socket);
+
+    RETURN_FAILURE_IF_FALSE(result.has_value());
+    RETURN_FAILURE_IF_FALSE(result->payload.as_string() == "yi er san si wu liu");
+
+    context.removeIOSocket(socket);
+
+    return TestResult::Success;
+}
+
+TestResult client_sends_huge_header(const char* host, uint16_t port)
+{
+    TcpSocket socket;
+
+    socket.connect(host, port);
+    socket.write_message("client");
+
+    auto server_identity = socket.read_message();
+    RETURN_FAILURE_IF_FALSE(server_identity == "server");
+
+    // write the huge header
+    uint64_t header = std::numeric_limits<uint64_t>::max();
+    socket.write_all((char*)&header, 8);
+
+    // TODO: this sleep shouldn't be necessary
+    std::this_thread::sleep_for(3s);
+
+    return TestResult::Success;
+}
+
+TestResult server_receives_empty_messages(const char* host, uint16_t port)
+{
+    IOContext context(1);
+
+    auto socket = syncCreateSocket(context, IOSocketType::Binder, "server");
+    syncBindSocket(socket, format_address(host, port));
+
+    auto result = syncRecvMessage(socket);
+    RETURN_FAILURE_IF_FALSE(result.has_value());
+    RETURN_FAILURE_IF_FALSE(result->payload.as_string() == "");
+
+    auto result2 = syncRecvMessage(socket);
+    RETURN_FAILURE_IF_FALSE(result2.has_value());
+    RETURN_FAILURE_IF_FALSE(result2->payload.as_string() == "");
+
+    context.removeIOSocket(socket);
+
+    return TestResult::Success;
+}
+
+TestResult client_sends_empty_messages(std::string host, uint16_t port)
+{
+    IOContext context(1);
+
+    auto socket = syncCreateSocket(context, IOSocketType::Connector, "client");
+    syncConnectSocket(socket, format_address(host, port));
+
+    auto error = syncSendMessage(socket, Message {.address = Bytes(), .payload = Bytes()});
+    RETURN_FAILURE_IF_FALSE(!error);
+
+    auto error2 = syncSendMessage(socket, Message {.address = Bytes(), .payload = Bytes("")});
+    RETURN_FAILURE_IF_FALSE(!error2);
+
+    context.removeIOSocket(socket);
+
+    return TestResult::Success;
+}
+
+// ━━━━━━━━━━━━━
+// test cases
+// ━━━━━━━━━━━━━
+
+// this is a 'basic' test which sends a single message from a client to a server
+// in this variant, both the client and server are implemented using YMQ
+//
+// this case includes a _delay_
+// this is a thread sleep that happens after the client sends the message, to delay the close() of the socket
+// at the moment, if this delay is missing, YMQ will not shut down correctly
+TEST(CcYmqTestSuite, TestBasicYMQClientYMQServer)
+{
+    auto host = "localhost";
+    auto port = 2889;
+
+    // this is the test harness, it accepts a timeout, a list of functions to run,
+    // and an optional third argument used to coordinate the execution of python (for mitm)
+    auto result =
+        test(10, {[=] { return basic_client_ymq(host, port); }, [=] { return basic_server_ymq(host, port); }});
+
+    // test() aggregates the results across all of the provided functions
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// same as above, except YMQ's protocol is directly implemented on top of a TCP socket
+TEST(CcYmqTestSuite, TestBasicRawClientYMQServer)
+{
+    auto host = "localhost";
+    auto port = 2890;
+
+    // this is the test harness, it accepts a timeout, a list of functions to run,
+    // and an optional third argument used to coordinate the execution of python (for mitm)
+    auto result =
+        test(10, {[=] { return basic_client_raw(5, host, port); }, [=] { return basic_server_ymq(host, port); }});
+
+    // test() aggregates the results across all of the provided functions
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+TEST(CcYmqTestSuite, TestBasicRawClientRawServer)
+{
+    auto host = "localhost";
+    auto port = 2891;
+
+    // this is the test harness, it accepts a timeout, a list of functions to run,
+    // and an optional third argument used to coordinate the execution of python (for mitm)
+    auto result =
+        test(10, {[=] { return basic_client_raw(0, host, port); }, [=] { return basic_server_raw(host, port); }});
+
+    // test() aggregates the results across all of the provided functions
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// TODO: this should pass
+// this is the same as above, except that it has no delay before calling close() on the socket
+// this test hangs
+TEST(CcYmqTestSuite, DISABLED_TestBasicRawClientRawServerNoDelay)
+{
+    auto host = "localhost";
+    auto port = 2892;
+
+    auto result =
+        test(10, {[=] { return basic_client_raw(0, host, port); }, [=] { return basic_server_ymq(host, port); }});
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+TEST(CcYmqTestSuite, TestBasicDelayYMQClientRawServer)
+{
+    auto host = "localhost";
+    auto port = 2893;
+
+    // this is the test harness, it accepts a timeout, a list of functions to run,
+    // and an optional third argument used to coordinate the execution of python (for mitm)
+    auto result =
+        test(10, {[=] { return basic_client_ymq(host, port); }, [=] { return basic_server_raw(host, port); }});
+
+    // test() aggregates the results across all of the provided functions
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// in this test case, the client sends a large message to the server
+// YMQ should be able to handle this without issue
+TEST(CcYmqTestSuite, TestClientSendBigMessageToServer)
+{
+    auto host = "localhost";
+    auto port = 2894;
+
+    auto result = test(
+        10,
+        {[=] { return client_sends_big_message(5, host, port); },
+         [=] { return server_receives_big_message(host, port); }});
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// this is the no-op/passthrough man in the middle test
+// for this test case we use YMQ on both the client side and the server side
+// the client connects to the mitm, and the mitm connects to the server
+// when the mitm receives packets from the client, it forwards them to the server without changing them
+// and similarly when it receives packets from the server, it forwards them to the client
+//
+// the mitm is implemented in Python. 
we pass the name of the test case, which corresponds to the Python filename,
+// and a list of arguments, which are: mitm ip, mitm port, remote ip, remote port
+// this defines the address of the mitm, and the addresses that can connect to it
+// for more, see the python mitm files
+TEST(CcYmqTestSuite, TestMitmPassthrough)
+{
+    auto mitm_ip = "192.0.2.4";
+    auto mitm_port = 2323;
+    auto remote_ip = "192.0.2.3";
+    auto remote_port = 23571;
+
+    // the Python program must be the first and only the first function passed to test()
+    // we must also pass `true` as the third argument to ensure that Python is fully started
+    // before beginning the test
+    auto result = test(
+        20,
+        {[=] { return run_mitm("passthrough", mitm_ip, mitm_port, remote_ip, remote_port); },
+         [=] { return basic_client_ymq(mitm_ip, mitm_port); },
+         [=] { return basic_server_ymq(remote_ip, remote_port); }},
+        true);
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// this test uses the mitm to test the reconnect logic of YMQ by sending RST packets
+// this test is disabled until fixes arrive in the core
+TEST(CcYmqTestSuite, DISABLED_TestMitmReconnect)
+{
+    auto mitm_ip = "192.0.2.4";
+    auto mitm_port = 2525;
+    auto remote_ip = "192.0.2.3";
+    auto remote_port = 23575;
+
+    auto result = test(
+        10,
+        {[=] { return run_mitm("send_rst_to_client", mitm_ip, mitm_port, remote_ip, remote_port); },
+         [=] { return reconnect_client_main(mitm_ip, mitm_port); },
+         [=] { return reconnect_server_main(remote_ip, remote_port); }},
+        true);
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// TODO: Make this more reliable, and re-enable it
+// in this test, the mitm drops a random % of packets arriving from the client and server
+TEST(CcYmqTestSuite, DISABLED_TestMitmRandomlyDropPackets)
+{
+    auto mitm_ip = "192.0.2.4";
+    auto mitm_port = 2828;
+    auto remote_ip = "192.0.2.3";
+    auto remote_port = 23591;
+
+    auto result = test(
+        60,
+        {[=] { return run_mitm("randomly_drop_packets", mitm_ip, mitm_port, remote_ip, remote_port, {"0.3"}); },
+         [=] { return basic_client_ymq(mitm_ip, mitm_port); },
+         [=] { return basic_server_ymq(remote_ip, remote_port); }},
+        true);
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// in this test the client is sending a message to the server
+// but we simulate a slow network connection by sending the message in segmented chunks
+TEST(CcYmqTestSuite, TestSlowNetwork)
+{
+    auto host = "localhost";
+    auto port = 2895;
+
+    auto result = test(
+        20, {[=] { return client_simulated_slow_network(host, port); }, [=] { return basic_server_ymq(host, port); }});
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// TODO: figure out why this test fails in ci sometimes, and re-enable
+//
+// in this test, a client connects to the YMQ server but only partially sends its identity and then disconnects
+// then a new client connection is established, and this one sends a complete identity and message
+// YMQ should be able to recover from a poorly-behaved client like this
+TEST(CcYmqTestSuite, DISABLED_TestClientSendIncompleteIdentity)
+{
+    auto host = "localhost";
+    auto port = 2896;
+
+    auto result = test(
+        20,
+        {[=] { return client_sends_incomplete_identity(host, port); }, [=] { return basic_server_ymq(host, port); }});
+    EXPECT_EQ(result, TestResult::Success);
+}
+
+// TODO: this should pass
+// in this test, the client sends an unrealistically-large header
+// it is important that YMQ checks the header size before allocating memory
+// both for resilience against attacks and to guard against errors
+//
+// at the moment YMQ 
+
+// in this test, the client sends empty messages to the server
+// there are in effect two kinds of empty messages: Bytes() and Bytes("")
+// in the former case, the bytes contains a nullptr
+// in the latter case, the bytes contains a zero-length allocation
+// it's important that the behaviour of YMQ is well-defined for both of these cases
+TEST(CcYmqTestSuite, TestClientSendEmptyMessage)
+{
+    auto host = "localhost";
+    auto port = 2898;
+
+    auto result = test(
+        20,
+        {[=] { return client_sends_empty_messages(host, port); },
+         [=] { return server_receives_empty_messages(host, port); }});
+    EXPECT_EQ(result, TestResult::Success);
+}
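
For reference, the two empty flavours that comment distinguishes can be constructed directly from the Bytes API. A small sketch; it assumes, consistent with the Python-level tests later in this patch, that the zero-length copy is distinguishable from the null state via is_null():

    #include <cassert>
    #include <string>

    #include "scaler/io/ymq/bytes.h"

    void empty_message_flavours()
    {
        Bytes nullBytes;          // default-constructed: holds a nullptr
        std::string empty;        // ""
        Bytes emptyBytes(empty);  // zero-length allocation, non-null
        assert(nullBytes.is_null());
        assert(!emptyBytes.is_null());  // distinct states, both "empty"
    }
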
diff --git a/tests/object_storage/test_object_storage_server.cpp b/tests/object_storage/test_object_storage_server.cpp
index e448e99dc..626a1ce26 100644
--- a/tests/object_storage/test_object_storage_server.cpp
+++ b/tests/object_storage/test_object_storage_server.cpp
@@ -1,6 +1,5 @@
 #include 

-#include 
 #include 
 #include 
 #include 
@@ -8,7 +7,6 @@

 #include "scaler/io/ymq/io_context.h"
 #include "scaler/io/ymq/io_socket.h"
-#include "scaler/io/ymq/logging.h"
 #include "scaler/io/ymq/simple_interface.h"
 #include "scaler/object_storage/object_storage_server.h"
@@ -39,8 +37,8 @@ class ObjectStorageClient {

     void writeYMQMessage(Message message)
     {
-        auto res = syncSendMessage(ioSocket, std::move(message));
-        ASSERT_TRUE(res.has_value());
+        auto error = syncSendMessage(ioSocket, std::move(message));
+        ASSERT_TRUE(!error);
     }

     auto readYMQMessage() { return syncRecvMessage(ioSocket); }
@@ -63,17 +61,17 @@ class ObjectStorageClient {
     void readResponse(ObjectResponseHeader& header, std::optional& payload)
     {
         std::array buf {};
-        auto [message, error] = syncRecvMessage(ioSocket);
-        ASSERT_EQ(error._errorCode, Error::ErrorCode::Uninit);
+        auto result = syncRecvMessage(ioSocket);
+        ASSERT_TRUE(result.has_value());

-        memcpy(buf.begin(), message.payload.data(), CAPNP_HEADER_SIZE);
-        ASSERT_EQ(message.payload.size(), CAPNP_HEADER_SIZE);
+        memcpy(buf.begin(), result->payload.data(), CAPNP_HEADER_SIZE);
+        ASSERT_EQ(result->payload.size(), CAPNP_HEADER_SIZE);

         header = ObjectResponseHeader::fromBuffer(buf);

         if (header.payloadLength > 0) {
-            auto [message2, error2] = syncRecvMessage(ioSocket);
-            ASSERT_EQ(error2._errorCode, Error::ErrorCode::Uninit);
-            payload.emplace(message2.payload);
+            auto result2 = syncRecvMessage(ioSocket);
+            ASSERT_TRUE(result2.has_value());
+            payload.emplace(result2->payload);
         } else {
             payload.reset();
         }
@@ -530,6 +528,9 @@ TEST_F(ObjectStorageServerTest, TestClientDisconnect)
     }
 }

+// TODO: why does this not pass?
+// the message connection tcp is removed from the remote socket's list,
+// but the object is never destructed, and so the connection is never closed
 TEST_F(ObjectStorageServerTest, TestMalformedHeader)
 {
     ObjectResponseHeader responseHeader;
@@ -547,8 +548,9 @@ TEST_F(ObjectStorageServerTest, TestMalformedHeader)
     client->writeYMQMessage(std::move(message));

     // Server should disconnect before or while we are reading the response
-    auto [msg, err] = client->readYMQMessage();
-    EXPECT_EQ(err._errorCode, Error::ErrorCode::ConnectorSocketClosedByRemoteEnd);
+    auto result = client->readYMQMessage();
+    EXPECT_TRUE(!result);
+    EXPECT_EQ(result.error()._errorCode, Error::ErrorCode::ConnectorSocketClosedByRemoteEnd);
 }

 // Server must still answers to requests from other clients
diff --git a/tests/pymod_ymq/__init__.py b/tests/pymod_ymq/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/pymod_ymq/test_pymod_ymq.py b/tests/pymod_ymq/test_pymod_ymq.py
new file mode 100644
index 000000000..ee96914e6
--- /dev/null
+++ b/tests/pymod_ymq/test_pymod_ymq.py
@@ -0,0 +1,150 @@
+import asyncio
+import multiprocessing
+import multiprocessing.connection
+import unittest
+
+from scaler.io.ymq import ymq
+
+
+class TestPymodYMQ(unittest.IsolatedAsyncioTestCase):
+    async def test_basic(self):
+        ctx = ymq.IOContext()
+        binder = await ctx.createIOSocket("binder", ymq.IOSocketType.Binder)
+        self.assertEqual(binder.identity, "binder")
+        self.assertEqual(binder.socket_type, ymq.IOSocketType.Binder)
+
+        connector = await ctx.createIOSocket("connector", ymq.IOSocketType.Connector)
+        self.assertEqual(connector.identity, "connector")
+        self.assertEqual(connector.socket_type, ymq.IOSocketType.Connector)
+
+        await binder.bind("tcp://127.0.0.1:35791")
+        await connector.connect("tcp://127.0.0.1:35791")
+
+        await connector.send(ymq.Message(address=None, payload=b"payload"))
+        msg = await binder.recv()
+
+        assert msg.address is not None
+        self.assertEqual(msg.address.data, b"connector")
+        self.assertEqual(msg.payload.data, b"payload")
+
+    @unittest.skip("this test currently hangs, see comment in the code")
+    async def test_no_address(self):
+        # this test requires special care because it hangs and doesn't shut down the worker threads properly
+        # we use a subprocess to shield us from any effects
+        pipe_parent, pipe_child = multiprocessing.Pipe(duplex=False)
+
+        def test(pipe: multiprocessing.connection.Connection) -> None:
+            async def main():
+                ctx = ymq.IOContext()
+                binder = await ctx.createIOSocket("binder", ymq.IOSocketType.Binder)
+                connector = await ctx.createIOSocket("connector", ymq.IOSocketType.Connector)
+
+                await binder.bind("tcp://127.0.0.1:35791")
+                await connector.connect("tcp://127.0.0.1:35791")
+
+                try:
+                    # TODO: change to `asyncio.timeout()` in python >3.10
+                    await asyncio.wait_for(binder.send(ymq.Message(address=None, payload=b"payload")), 30)
+
+                    # TODO: solve the hang and write the rest of the test
+                    pipe.send(True)
+                except asyncio.TimeoutError:
+                    pipe.send(False)
+
+            asyncio.run(main())
+
+        p = multiprocessing.Process(target=test, args=(pipe_child,))
+        p.start()
+        result = pipe_parent.recv()
+        p.join(5)
+        if p.exitcode is None:
+            p.kill()
+
+        if not result:
+            self.fail()
+
+    async def test_routing(self):
+        ctx = ymq.IOContext()
+        binder = await ctx.createIOSocket("binder", ymq.IOSocketType.Binder)
+        connector1 = await ctx.createIOSocket("connector1", ymq.IOSocketType.Connector)
+        connector2 = await ctx.createIOSocket("connector2", ymq.IOSocketType.Connector)
+
+        await 
binder.bind("tcp://127.0.0.1:35791") + await connector1.connect("tcp://127.0.0.1:35791") + await connector2.connect("tcp://127.0.0.1:35791") + + await binder.send(ymq.Message(b"connector2", b"2")) + await binder.send(ymq.Message(b"connector1", b"1")) + + msg1 = await connector1.recv() + self.assertEqual(msg1.payload.data, b"1") + + msg2 = await connector2.recv() + self.assertEqual(msg2.payload.data, b"2") + + async def test_pingpong(self): + ctx = ymq.IOContext() + binder = await ctx.createIOSocket("binder", ymq.IOSocketType.Binder) + connector = await ctx.createIOSocket("connector", ymq.IOSocketType.Connector) + + await binder.bind("tcp://127.0.0.1:35791") + await connector.connect("tcp://127.0.0.1:35791") + + async def binder_routine(binder: ymq.IOSocket, limit: int) -> bool: + i = 0 + while i < limit: + await binder.send(ymq.Message(address=b"connector", payload=f"{i}".encode())) + msg = await binder.recv() + assert msg.payload.data is not None + + recv_i = int(msg.payload.data.decode()) + if recv_i - i > 1: + return False + i = recv_i + 1 + return True + + async def connector_routine(connector: ymq.IOSocket, limit: int) -> bool: + i = 0 + while True: + msg = await connector.recv() + assert msg.payload.data is not None + recv_i = int(msg.payload.data.decode()) + if recv_i - i > 1: + return False + i = recv_i + 1 + await connector.send(ymq.Message(address=None, payload=f"{i}".encode())) + + # when the connector sends `limit - 1`, we're done + if i >= limit - 1: + break + return True + + binder_success, connector_success = await asyncio.gather( + binder_routine(binder, 100), connector_routine(connector, 100) + ) + + if not binder_success: + self.fail("binder failed") + + if not connector_success: + self.fail("connector failed") + + async def test_big_message(self): + ctx = ymq.IOContext() + binder = await ctx.createIOSocket("binder", ymq.IOSocketType.Binder) + self.assertEqual(binder.identity, "binder") + self.assertEqual(binder.socket_type, ymq.IOSocketType.Binder) + + connector = await ctx.createIOSocket("connector", ymq.IOSocketType.Connector) + self.assertEqual(connector.identity, "connector") + self.assertEqual(connector.socket_type, ymq.IOSocketType.Connector) + + await binder.bind("tcp://127.0.0.1:35791") + await connector.connect("tcp://127.0.0.1:35791") + + for _ in range(10): + await connector.send(ymq.Message(address=None, payload=b"." * 500_000_000)) + msg = await binder.recv() + + assert msg.address is not None + self.assertEqual(msg.address.data, b"connector") + self.assertEqual(msg.payload.data, b"." 
* 500_000_000) diff --git a/tests/pymod_ymq/test_types.py b/tests/pymod_ymq/test_types.py new file mode 100644 index 000000000..e461856e0 --- /dev/null +++ b/tests/pymod_ymq/test_types.py @@ -0,0 +1,90 @@ +import unittest +from enum import IntEnum +from scaler.io.ymq import ymq +import array + + +class TestTypes(unittest.TestCase): + def test_exception(self): + # type checkers misidentify this as "unnecessary" due to the type hints file + self.assertTrue(issubclass(ymq.YMQException, Exception)) # type: ignore + + exc = ymq.YMQException(ymq.ErrorCode.CoreBug, "oh no") + self.assertEqual(exc.args, (ymq.ErrorCode.CoreBug, "oh no")) + self.assertEqual(exc.code, ymq.ErrorCode.CoreBug) + self.assertEqual(exc.message, "oh no") + + def test_interrupted_exception(self): + self.assertTrue(issubclass(ymq.YMQInterruptedException, Exception)) # type: ignore + + exc = ymq.YMQInterruptedException() + self.assertEqual(exc.args, tuple()) + + def test_error_code(self): + self.assertTrue(issubclass(ymq.ErrorCode, IntEnum)) # type: ignore + self.assertEqual( + ymq.ErrorCode.ConfigurationError.explanation(), + "An error generated by system call that's likely due to mis-configuration", + ) + + def test_bytes(self): + b = ymq.Bytes(b"data") + self.assertEqual(b.len, len(b)) + self.assertEqual(b.len, 4) + self.assertEqual(b.data, b"data") + + # would raise an exception if ymq.Bytes didn't support the buffer interface + m = memoryview(b) + self.assertTrue(m.obj is b) + self.assertEqual(m.tobytes(), b"data") + + b = ymq.Bytes() + self.assertEqual(b.len, 0) + self.assertTrue(b.data is None) + + b = ymq.Bytes(b"") + self.assertEqual(b.len, 0) + self.assertEqual(b.data, b"") + + b = ymq.Bytes(array.array("B", [115, 99, 97, 108, 101, 114])) + assert b.len == 6 + assert b.data == b"scaler" + + def test_message(self): + m = ymq.Message(b"address", b"payload") + assert m.address is not None + self.assertEqual(m.address.data, b"address") + self.assertEqual(m.payload.data, b"payload") + + m = ymq.Message(address=None, payload=ymq.Bytes(b"scaler")) + self.assertTrue(m.address is None) + self.assertEqual(m.payload.data, b"scaler") + + m = ymq.Message(b"address", payload=b"payload") + assert m.address is not None + self.assertEqual(m.address.data, b"address") + self.assertEqual(m.payload.data, b"payload") + + def test_io_context(self): + ctx = ymq.IOContext() + self.assertEqual(ctx.num_threads, 1) + + ctx = ymq.IOContext(2) + self.assertEqual(ctx.num_threads, 2) + + ctx = ymq.IOContext(num_threads=3) + self.assertEqual(ctx.num_threads, 3) + + def test_io_socket(self): + # check that we can't create io socket instances directly + self.assertRaises(TypeError, lambda: ymq.IOSocket()) + + def test_io_socket_type(self): + self.assertTrue(issubclass(ymq.IOSocketType, IntEnum)) # type: ignore + + def test_bad_socket_type(self): + ctx = ymq.IOContext() + + # TODO: should the core reject this? 
+        socket = ctx.createIOSocket_sync("identity", ymq.IOSocketType.Uninit)
+        self.assertEqual(socket.socket_type, ymq.IOSocketType.Uninit)

From 88613f90f24a510eaee96cc05acc311b70b35815 Mon Sep 17 00:00:00 2001
From: sharpener6 <1sc2l4qi@duck.com>
Date: Wed, 24 Sep 2025 19:30:01 -0400
Subject: [PATCH 2/3] Rework all github workflows (#230)

- Moved workflows to actions so they can be reused
- Removed the dependency on boost
- Fix formatting issue
- Fix the formatting and import sort orders

YMQ Python Module Fixes
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

remove panic fn
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

reorganize classes to put private members after public
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

apply suggestion
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

add comment
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

take && reference
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

rename variable
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

rename variable
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Add YMQ Tests
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

review reference counting, improve error handling, increase consistency
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

clean up includes
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

update todos
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

update todos
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Improve error integration for futures
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Improve signal handling in sync methods
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Remove temporary ymq test files
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Add basic test case
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Set socket TCP_NODELAY
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Make util fns inline
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Major refactor of ymq tests, add slow network test
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Add new tests
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Use subprocesses instead of threads for better reliability
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Relocate C++ tests and use GTest
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

Add Python tests, update interface type hints as necessary
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

mitm
Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>

add tcp serialization and deserialization
---
 scaler/io/ymq/message_connection_tcp.cpp    |    2 +-
 scaler/io/ymq/tests/incomplete_identity.h   |   53 +
 scaler/io/ymq/third_party/concurrentqueue.h | 7130 ++++++++++---------
 scripts/lint.sh                             |   15 +
 tests/cc_ymq/common.h                       |    5 +-
 tests/cc_ymq/py_mitm/core.py                |   54 +
 tests/cc_ymq/py_mitm/drop.py                |   28 +
 tests/cc_ymq/py_mitm/rst.py                 |   48 +
 tests/cc_ymq/py_mitm/runner.py              |  157 +
 tests/pymod_ymq/config.py                   |   13 +
 10 files changed, 4105 insertions(+), 3400 deletions(-)
 create mode 100644 scaler/io/ymq/tests/incomplete_identity.h
 create mode 100755 scripts/lint.sh
 create mode 100644 tests/cc_ymq/py_mitm/core.py
 create mode 100644 tests/cc_ymq/py_mitm/drop.py
 create mode 100644 tests/cc_ymq/py_mitm/rst.py
 create mode 100644 tests/cc_ymq/py_mitm/runner.py
 create mode 100644 tests/pymod_ymq/config.py

diff --git a/scaler/io/ymq/message_connection_tcp.cpp b/scaler/io/ymq/message_connection_tcp.cpp
index 8212238f1..df2338292 100644
--- a/scaler/io/ymq/message_connection_tcp.cpp
+++ b/scaler/io/ymq/message_connection_tcp.cpp
@@ -267,7 +267,7 @@ void MessageConnectionTCP::updateReadOperation()
         _receivedReadOperations.pop();
         auto recvMessageCallback = std::move(_pendingRecvMessageCallbacks->front());
         _pendingRecvMessageCallbacks->pop();
-
+
         recvMessageCallback({Message(std::move(address), std::move(payload)), {}});
     } else {
         assert(_pendingRecvMessageCallbacks->size());
diff --git a/scaler/io/ymq/tests/incomplete_identity.h b/scaler/io/ymq/tests/incomplete_identity.h
new file mode 100644
index 000000000..e8ea4f1ed
--- /dev/null
+++ b/scaler/io/ymq/tests/incomplete_identity.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include 
+#include 
+
+#include "scaler/io/ymq/examples/common.h"
+#include "scaler/io/ymq/io_context.h"
+#include "tests/cc_ymq/common.h"
+
+void incomplete_identity_server_main()
+{
+    IOContext context(1);
+
+    auto socket = syncCreateSocket(context, IOSocketType::Binder, "server");
+    
syncBindSocket(socket, "tcp://127.0.0.1:25715"); + auto result = syncRecvMessage(socket); + + assert(result.has_value()); + assert(result->payload.as_string() == "yi er san si wu liu"); + + context.removeIOSocket(socket); +} + +void incomplete_identity_client_main() +{ + // open a socket, write an incomplete identity and exit + { + TcpSocket socket; + + socket.connect("127.0.0.1", 25715); + + auto remote_identity = socket.read_message(); + assert(remote_identity == "server"); + + // write incomplete identity and exit + std::string identity = "client"; + uint64_t header = identity.length(); + socket.write_all((char*)&header, 8); + socket.write_all(identity.data(), identity.length() - 2); + std::this_thread::sleep_for(3s); + } + + // connect again and try to send a message + { + TcpSocket socket; + socket.connect("127.0.0.1", 25715); + auto remote_identity = socket.read_message(); + assert(remote_identity == "server"); + socket.write_message("client"); + socket.write_message("yi er san si wu liu"); + std::this_thread::sleep_for(3s); + } +} diff --git a/scaler/io/ymq/third_party/concurrentqueue.h b/scaler/io/ymq/third_party/concurrentqueue.h index 2fc775400..d5498b116 100644 --- a/scaler/io/ymq/third_party/concurrentqueue.h +++ b/scaler/io/ymq/third_party/concurrentqueue.h @@ -47,7 +47,7 @@ // VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher // does not support `if constexpr`, so we have no choice but to simply disable the warning #pragma warning(push) -#pragma warning(disable: 4127) // conditional expression is constant +#pragma warning(disable : 4127) // conditional expression is constant #endif #if defined(__APPLE__) @@ -64,81 +64,114 @@ #undef malloc #undef free #else -#include // Requires C++11. Sorry VS2010. +#include // Requires C++11. Sorry VS2010. 
#include #endif -#include // for max_align_t +#include +#include +#include // for CHAR_BIT +#include // for max_align_t #include #include +#include +#include // used for thread exit synchronization +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading #include -#include #include -#include -#include // for CHAR_BIT -#include -#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading -#include // used for thread exit synchronization // Platform-specific definitions of a numeric thread ID type and an invalid value -namespace moodycamel { namespace details { - template struct thread_id_converter { - typedef thread_id_t thread_id_numeric_size_t; - typedef thread_id_t thread_id_hash_t; - static thread_id_hash_t prehash(thread_id_t const& x) { return x; } - }; -} } +namespace moodycamel { +namespace details { +template +struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } +}; +} // namespace details +} // namespace moodycamel #if defined(MCDBGQ_USE_RELACY) -namespace moodycamel { namespace details { - typedef std::uint32_t thread_id_t; - static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; - static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; - static inline thread_id_t thread_id() { return rl::thread_index(); } -} } +namespace moodycamel { +namespace details { +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; +static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; +static inline thread_id_t thread_id() +{ + return rl::thread_index(); +} +} // namespace details +} // namespace moodycamel #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) // No sense pulling in windows.h in a header, we'll manually declare the function // we use and rely on backwards-compatibility for this not to break extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); -namespace moodycamel { namespace details { - static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); - typedef std::uint32_t thread_id_t; - static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx - static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. - static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } -} } -#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL) -namespace moodycamel { namespace details { - static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); - - typedef std::thread::id thread_id_t; - static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID - - // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's - // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't - // be. 
- static inline thread_id_t thread_id() { return std::this_thread::get_id(); } - - template struct thread_id_size { }; - template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; - template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; - - template<> struct thread_id_converter { - typedef thread_id_size::numeric_t thread_id_numeric_size_t; +namespace moodycamel { +namespace details { +static_assert( + sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = + 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx +static const thread_id_t invalid_thread_id2 = + 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread + // IDs are presently multiples of 4. +static inline thread_id_t thread_id() +{ + return static_cast(::GetCurrentThreadId()); +} +} // namespace details +} // namespace moodycamel +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || \ + defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL) +namespace moodycamel { +namespace details { +static_assert( + sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, + "std::thread::id is expected to be either 4 or 8 bytes"); + +typedef std::thread::id thread_id_t; +static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + +// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's +// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't +// be. +static inline thread_id_t thread_id() +{ + return std::this_thread::get_id(); +} + +template +struct thread_id_size {}; +template <> +struct thread_id_size<4> { + typedef std::uint32_t numeric_t; +}; +template <> +struct thread_id_size<8> { + typedef std::uint64_t numeric_t; +}; + +template <> +struct thread_id_converter { + typedef thread_id_size::numeric_t thread_id_numeric_size_t; #ifndef __APPLE__ - typedef std::size_t thread_id_hash_t; + typedef std::size_t thread_id_hash_t; #else - typedef thread_id_numeric_size_t thread_id_hash_t; + typedef thread_id_numeric_size_t thread_id_hash_t; #endif - static thread_id_hash_t prehash(thread_id_t const& x) - { + static thread_id_hash_t prehash(thread_id_t const& x) + { #ifndef __APPLE__ - return std::hash()(x); + return std::hash()(x); #else - return *reinterpret_cast(&x); + return *reinterpret_cast(&x); #endif - } - }; -} } + } +}; +} +} #else // Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 // In order to get a numeric thread ID in a platform-independent way, we use a thread-local @@ -151,12 +184,19 @@ namespace moodycamel { namespace details { // Assume C++11 compliant compiler #define MOODYCAMEL_THREADLOCAL thread_local #endif -namespace moodycamel { namespace details { - typedef std::uintptr_t thread_id_t; - static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr - static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. 
- inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } -} } +namespace moodycamel { +namespace details { +typedef std::uintptr_t thread_id_t; +static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr +static const thread_id_t invalid_thread_id2 = + 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. +inline thread_id_t thread_id() +{ + static MOODYCAMEL_THREADLOCAL int x; + return reinterpret_cast(&x); +} +} +} #endif // Constexpr if @@ -172,18 +212,19 @@ namespace moodycamel { namespace details { // Exceptions #ifndef MOODYCAMEL_EXCEPTIONS_ENABLED -#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || \ + (!defined(_MSC_VER) && !defined(__GNUC__)) #define MOODYCAMEL_EXCEPTIONS_ENABLED #endif #endif #ifdef MOODYCAMEL_EXCEPTIONS_ENABLED -#define MOODYCAMEL_TRY try -#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) -#define MOODYCAMEL_RETHROW throw -#define MOODYCAMEL_THROW(expr) throw (expr) +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw(expr) #else -#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) -#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false) +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false) #define MOODYCAMEL_RETHROW #define MOODYCAMEL_THROW(expr) #endif @@ -191,21 +232,35 @@ namespace moodycamel { namespace details { #ifndef MOODYCAMEL_NOEXCEPT #if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) #define MOODYCAMEL_NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 // VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( // We have to assume *all* non-trivial constructors may throw on VS2012! #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && std::is_move_constructible::value ? \ + std::is_trivially_move_constructible::value : \ + std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && std::is_move_assignable::value ? 
\ + std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : \ + std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && std::is_move_constructible::value ? \ + std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : \ + std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && std::is_move_assignable::value ? \ + std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : \ + std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #else -#define MOODYCAMEL_NOEXCEPT noexcept -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) #endif #endif @@ -214,18 +269,24 @@ namespace moodycamel { namespace details { #ifdef MCDBGQ_USE_RELACY #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #else -// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 -// g++ <=4.7 doesn't support thread_local either. -// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work -#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: +// http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't support thread_local either. 
Finally, iOS/ARM doesn't +// have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ + (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && \ + (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ + (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && \ + !defined(__MVS__) // Assume `thread_local` is fully supported in all other C++11 compilers/platforms -#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having problems with it on +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having + // problems with it on #endif #endif #endif -// VS2012 doesn't support deleted functions. -// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is +// called. #ifndef MOODYCAMEL_DELETE_FUNCTION #if defined(_MSC_VER) && _MSC_VER < 1800 #define MOODYCAMEL_DELETE_FUNCTION @@ -234,54 +295,100 @@ namespace moodycamel { namespace details { #endif #endif -namespace moodycamel { namespace details { +namespace moodycamel { +namespace details { #ifndef MOODYCAMEL_ALIGNAS // VS2013 doesn't support alignas or alignof, and align() requires a constant literal #if defined(_MSC_VER) && _MSC_VER <= 1800 -#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) -#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) +#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) +#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) #define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned::value, T>::type - template struct Vs2013Aligned { }; // default, unsupported alignment - template struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; - template struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; - template struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; - template struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; - template struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; - template struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; }; - template struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; - template struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; - template struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; +template +struct Vs2013Aligned {}; // default, unsupported alignment +template +struct Vs2013Aligned<1, T> { + typedef __declspec(align(1)) T type; +}; +template +struct Vs2013Aligned<2, T> { + typedef __declspec(align(2)) T type; +}; +template +struct Vs2013Aligned<4, T> { + typedef __declspec(align(4)) T type; +}; +template +struct Vs2013Aligned<8, T> { + typedef __declspec(align(8)) T type; +}; +template +struct Vs2013Aligned<16, T> { + typedef __declspec(align(16)) T type; +}; +template +struct Vs2013Aligned<32, T> { + typedef __declspec(align(32)) T type; +}; +template +struct Vs2013Aligned<64, T> { + typedef __declspec(align(64)) T type; +}; +template +struct Vs2013Aligned<128, T> { + typedef __declspec(align(128)) T type; +}; +template 
+struct Vs2013Aligned<256, T> { + typedef __declspec(align(256)) T type; +}; #else - template struct identity { typedef T type; }; -#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) -#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) +template +struct identity { + typedef T type; +}; +#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) +#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) #define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity::type #endif #endif -} } - +} // namespace details +} // namespace moodycamel // TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one, // we can apply per-function compile-time suppression. // See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer #define MOODYCAMEL_NO_TSAN #if defined(__has_feature) - #if __has_feature(thread_sanitizer) - #undef MOODYCAMEL_NO_TSAN - #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) - #endif // TSAN -#endif // TSAN +#if __has_feature(thread_sanitizer) +#undef MOODYCAMEL_NO_TSAN +#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) +#endif // TSAN +#endif // TSAN // Compiler-specific likely/unlikely hints -namespace moodycamel { namespace details { +namespace moodycamel { +namespace details { #if defined(__GNUC__) - static inline bool (likely)(bool x) { return __builtin_expect((x), true); } - static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +static inline bool(likely)(bool x) +{ + return __builtin_expect((x), true); +} +static inline bool(unlikely)(bool x) +{ + return __builtin_expect((x), false); +} #else - static inline bool (likely)(bool x) { return x; } - static inline bool (unlikely)(bool x) { return x; } +static inline bool(likely)(bool x) +{ + return x; +} +static inline bool(unlikely)(bool x) +{ + return x; +} #endif -} } +} // namespace details +} // namespace moodycamel #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG #include "internal/concurrentqueue_internal_debug.h" @@ -289,28 +396,28 @@ namespace moodycamel { namespace details { namespace moodycamel { namespace details { - template - struct const_numeric_max { - static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); - static const T value = std::numeric_limits::is_signed - ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) - : static_cast(-1); - }; +template +struct const_numeric_max { + static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits::is_signed ? + (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) : + static_cast(-1); +}; #if defined(__GLIBCXX__) - typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while #else - typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: #endif - // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting - // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. 
- typedef union { - std_max_align_t x; - long long y; - void* z; - } max_align_t; -} +// Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting +// 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. +typedef union { + std_max_align_t x; + long long y; + void* z; +} max_align_t; +} // namespace details // Default traits for the ConcurrentQueue. To change some of the // traits without re-implementing all of them, inherit from this @@ -318,99 +425,96 @@ namespace details { // since the traits are used as a template type parameter, the // shadowed declarations will be used where defined, and the defaults // otherwise. -struct ConcurrentQueueDefaultTraits -{ - // General-purpose size type. std::size_t is strongly recommended. - typedef std::size_t size_t; - - // The type used for the enqueue and dequeue indices. Must be at least as - // large as size_t. Should be significantly larger than the number of elements - // you expect to hold at once, especially if you have a high turnover rate; - // for example, on 32-bit x86, if you expect to have over a hundred million - // elements or pump several million elements through your queue in a very - // short space of time, using a 32-bit type *may* trigger a race condition. - // A 64-bit int type is recommended in that case, and in practice will - // prevent a race condition no matter the usage of the queue. Note that - // whether the queue is lock-free with a 64-int type depends on the whether - // std::atomic is lock-free, which is platform-specific. - typedef std::size_t index_t; - - // Internally, all elements are enqueued and dequeued from multi-element - // blocks; this is the smallest controllable unit. If you expect few elements - // but many producers, a smaller block size should be favoured. For few producers - // and/or many elements, a larger block size is preferred. A sane default - // is provided. Must be a power of 2. - static const size_t BLOCK_SIZE = 32; - - // For explicit producers (i.e. when using a producer token), the block is - // checked for being empty by iterating through a list of flags, one per element. - // For large block sizes, this is too inefficient, and switching to an atomic - // counter-based approach is faster. The switch is made for block sizes strictly - // larger than this threshold. - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; - - // How many full blocks can be expected for a single explicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; - - // How many full blocks can be expected for a single implicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; - - // The initial size of the hash table mapping thread IDs to implicit producers. - // Note that the hash is resized every time it becomes half full. - // Must be a power of two, and either 0 or at least 1. If 0, implicit production - // (using the enqueue methods without an explicit producer token) is disabled. - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; - - // Controls the number of items that an explicit consumer (i.e. one with a token) - // must consume before it causes all consumers to rotate and move on to the next - // internal queue. 
- static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; - - // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. - // Enqueue operations that would cause this limit to be surpassed will fail. Note - // that this limit is enforced at the block level (for performance reasons), i.e. - // it's rounded up to the nearest block size. - static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; - - // The number of times to spin before sleeping when waiting on a semaphore. - // Recommended values are on the order of 1000-10000 unless the number of - // consumer threads exceeds the number of idle cores (in which case try 0-100). - // Only affects instances of the BlockingConcurrentQueue. - static const int MAX_SEMA_SPINS = 10000; - - // Whether to recycle dynamically-allocated blocks into an internal free list or - // not. If false, only pre-allocated blocks (controlled by the constructor - // arguments) will be recycled, and all others will be `free`d back to the heap. - // Note that blocks consumed by explicit producers are only freed on destruction - // of the queue (not following destruction of the token) regardless of this trait. - static const bool RECYCLE_ALLOCATED_BLOCKS = false; - - +struct ConcurrentQueueDefaultTraits { + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. 
+ static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try 0-100). + // Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + + // Whether to recycle dynamically-allocated blocks into an internal free list or + // not. If false, only pre-allocated blocks (controlled by the constructor + // arguments) will be recycled, and all others will be `free`d back to the heap. + // Note that blocks consumed by explicit producers are only freed on destruction + // of the queue (not following destruction of the token) regardless of this trait. + static const bool RECYCLE_ALLOCATED_BLOCKS = false; + #ifndef MCDBGQ_USE_RELACY - // Memory allocation can be customized if needed. - // malloc should return nullptr on failure, and handle alignment like std::malloc. + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. #if defined(malloc) || defined(free) - // Gah, this is 2015, stop defining macros that break standard code already! - // Work around malloc/free being special macros: - static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } - static inline void WORKAROUND_free(void* ptr) { return free(ptr); } - static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } - static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } + // Gah, this is 2015, stop defining macros that break standard code already! 
+ // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void*(malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void(free)(void* ptr) { return WORKAROUND_free(ptr); } #else - static inline void* malloc(size_t size) { return std::malloc(size); } - static inline void free(void* ptr) { return std::free(ptr); } + static inline void* malloc(size_t size) { return std::malloc(size); } + static inline void free(void* ptr) { return std::free(ptr); } #endif #else - // Debug versions when running under the Relacy race detector (ignore - // these in user code) - static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } - static inline void free(void* ptr) { return rl::rl_free(ptr, $); } + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void* ptr) { return rl::rl_free(ptr, $); } #endif }; - // When producing or consuming many elements, the most efficient way is to: // 1) Use one of the bulk-operation methods of the queue with a token // 2) Failing that, use the bulk-operation methods without a token @@ -421,3322 +525,3554 @@ struct ConcurrentQueueDefaultTraits struct ProducerToken; struct ConsumerToken; -template class ConcurrentQueue; -template class BlockingConcurrentQueue; +template +class ConcurrentQueue; +template +class BlockingConcurrentQueue; class ConcurrentQueueTests; +namespace details { +struct ConcurrentQueueProducerTypelessBase { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + + ConcurrentQueueProducerTypelessBase(): next(nullptr), inactive(false), token(nullptr) {} +}; + +template +struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } +}; +template <> +struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } +}; +template +struct hash_32_or_64: public _hash_32_or_64<(size > 4)> {}; + +static inline size_t hash_thread_id(thread_id_t id) +{ + static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( + thread_id_converter::prehash(id))); +} + +template +static inline bool circular_less_than(T a, T b) +{ + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << (static_cast(sizeof(T) * CHAR_BIT - 1))); + // Note: extra parens around rhs of operator<< is MSVC bug: + // https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 + // silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no 
effect when + // done here. +} + +template +static inline char* align_for(char* ptr) +{ + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; +} + +template +static inline T ceil_to_pow_2(T x) +{ + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; +} + +template +static inline void swap_relaxed(std::atomic& left, std::atomic& right) +{ + T temp = left.load(std::memory_order_relaxed); + left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); + right.store(temp, std::memory_order_relaxed); +} + +template +static inline T const& nomove(T const& x) +{ + return x; +} + +template +struct nomove_if { + template + static inline T const& eval(T const& x) + { + return x; + } +}; + +template <> +struct nomove_if { + template + static inline auto eval(U&& x) -> decltype(std::forward(x)) + { + return std::forward(x); + } +}; -namespace details +template +static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) { - struct ConcurrentQueueProducerTypelessBase - { - ConcurrentQueueProducerTypelessBase* next; - std::atomic inactive; - ProducerToken* token; - - ConcurrentQueueProducerTypelessBase() - : next(nullptr), inactive(false), token(nullptr) - { - } - }; - - template struct _hash_32_or_64 { - static inline std::uint32_t hash(std::uint32_t h) - { - // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp - // Since the thread ID is already unique, all we really want to do is propagate that - // uniqueness evenly across all the bits, so that we can use a subset of the bits while - // reducing collisions significantly - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - return h ^ (h >> 16); - } - }; - template<> struct _hash_32_or_64<1> { - static inline std::uint64_t hash(std::uint64_t h) - { - h ^= h >> 33; - h *= 0xff51afd7ed558ccd; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53; - return h ^ (h >> 33); - } - }; - template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; - - static inline size_t hash_thread_id(thread_id_t id) - { - static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); - return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( - thread_id_converter::prehash(id))); - } - - template - static inline bool circular_less_than(T a, T b) - { - static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); - return static_cast(a - b) > static_cast(static_cast(1) << (static_cast(sizeof(T) * CHAR_BIT - 1))); - // Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 - // silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here. 
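// Worked example of the circular comparison above: with T = std::uint8_t,
// circular_less_than(250, 2) is true, since static_cast<std::uint8_t>(250 - 2)
// == 248 > 128 (= 1 << 7); once the unsigned index wraps, 2 is "ahead of" 250,
// which a plain 250 < 2 would get backwards.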
- } - - template - static inline char* align_for(char* ptr) - { - const std::size_t alignment = std::alignment_of::value; - return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; - } - - template - static inline T ceil_to_pow_2(T x) - { - static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); - - // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - for (std::size_t i = 1; i < sizeof(T); i <<= 1) { - x |= x >> (i << 3); - } - ++x; - return x; - } - - template - static inline void swap_relaxed(std::atomic& left, std::atomic& right) - { - T temp = left.load(std::memory_order_relaxed); - left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); - right.store(temp, std::memory_order_relaxed); - } - - template - static inline T const& nomove(T const& x) - { - return x; - } - - template - struct nomove_if - { - template - static inline T const& eval(T const& x) - { - return x; - } - }; - - template<> - struct nomove_if - { - template - static inline auto eval(U&& x) - -> decltype(std::forward(x)) - { - return std::forward(x); - } - }; - - template - static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) - { - return *it; - } - + return *it; +} + #if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) - template struct is_trivially_destructible : std::is_trivially_destructible { }; +template +struct is_trivially_destructible: std::is_trivially_destructible {}; #else - template struct is_trivially_destructible : std::has_trivial_destructor { }; +template +struct is_trivially_destructible: std::has_trivial_destructor {}; #endif - + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #ifdef MCDBGQ_USE_RELACY - typedef RelacyThreadExitListener ThreadExitListener; - typedef RelacyThreadExitNotifier ThreadExitNotifier; +typedef RelacyThreadExitListener ThreadExitListener; +typedef RelacyThreadExitNotifier ThreadExitNotifier; #else - class ThreadExitNotifier; - - struct ThreadExitListener - { - typedef void (*callback_t)(void*); - callback_t callback; - void* userData; - - ThreadExitListener* next; // reserved for use by the ThreadExitNotifier - ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier - }; - - class ThreadExitNotifier - { - public: - static void subscribe(ThreadExitListener* listener) - { - auto& tlsInst = instance(); - std::lock_guard guard(mutex()); - listener->next = tlsInst.tail; - listener->chain = &tlsInst; - tlsInst.tail = listener; - } - - static void unsubscribe(ThreadExitListener* listener) - { - std::lock_guard guard(mutex()); - if (!listener->chain) { - return; // race with ~ThreadExitNotifier - } - auto& tlsInst = *listener->chain; - listener->chain = nullptr; - ThreadExitListener** prev = &tlsInst.tail; - for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { - if (ptr == listener) { - *prev = ptr->next; - break; - } - prev = &ptr->next; - } - } - - private: - ThreadExitNotifier() : tail(nullptr) { } - ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - - ~ThreadExitNotifier() - { - // This thread is about to exit, let everyone know! - assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); - std::lock_guard guard(mutex()); - for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { - ptr->chain = nullptr; - ptr->callback(ptr->userData); - } - } - - // Thread-local - static inline ThreadExitNotifier& instance() - { - static thread_local ThreadExitNotifier notifier; - return notifier; - } - - static inline std::mutex& mutex() - { - // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called - static std::mutex mutex; - return mutex; - } - - private: - ThreadExitListener* tail; - }; -#endif -#endif - - template struct static_is_lock_free_num { enum { value = 0 }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; - template struct static_is_lock_free : static_is_lock_free_num::type> { }; - template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; - template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; -} +class ThreadExitNotifier; +struct ThreadExitListener { + typedef void (*callback_t)(void*); + callback_t callback; + void* userData; -struct ProducerToken -{ - template - explicit ProducerToken(ConcurrentQueue& queue); - - template - explicit ProducerToken(BlockingConcurrentQueue& queue); - - ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - : producer(other.producer) - { - other.producer = nullptr; - if (producer != nullptr) { - producer->token = this; - } - } - - inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(producer, other.producer); - if (producer != nullptr) { - producer->token = this; - } - if (other.producer != nullptr) { - other.producer->token = &other; - } - } - - // A token is always valid unless: - // 1) Memory allocation failed during construction - // 2) It was moved via the move constructor - // (Note: assignment does a swap, leaving both potentially valid) - // 3) The associated queue was destroyed - // Note that if valid() returns true, that only indicates - // that the token is valid for use with a specific queue, - // but not which one; that's up to the user to track. 
- inline bool valid() const { return producer != nullptr; } - - ~ProducerToken() - { - if (producer != nullptr) { - producer->token = nullptr; - producer->inactive.store(true, std::memory_order_release); - } - } - - // Disable copying and assignment - ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - -private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - -protected: - details::ConcurrentQueueProducerTypelessBase* producer; + ThreadExitListener* next; // reserved for use by the ThreadExitNotifier + ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier }; +class ThreadExitNotifier { +public: + static void subscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + std::lock_guard guard(mutex()); + listener->next = tlsInst.tail; + listener->chain = &tlsInst; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener* listener) + { + std::lock_guard guard(mutex()); + if (!listener->chain) { + return; // race with ~ThreadExitNotifier + } + auto& tlsInst = *listener->chain; + listener->chain = nullptr; + ThreadExitListener** prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } -struct ConsumerToken -{ - template - explicit ConsumerToken(ConcurrentQueue& q); - - template - explicit ConsumerToken(BlockingConcurrentQueue& q); - - ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) - { - } - - inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(initialOffset, other.initialOffset); - std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); - std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); - std::swap(currentProducer, other.currentProducer); - std::swap(desiredProducer, other.desiredProducer); - } - - // Disable copying and assignment - ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; +private: + ThreadExitNotifier(): tail(nullptr) {} + ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert( + this == &instance() && + "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that " + "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + std::lock_guard guard(mutex()); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->chain = nullptr; + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier& instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + static inline std::mutex& mutex() + { + // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called + static std::mutex mutex; + return mutex; + } private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - -private: // but shared with ConcurrentQueue - std::uint32_t initialOffset; - std::uint32_t lastKnownGlobalOffset; - std::uint32_t itemsConsumedFromCurrent; - details::ConcurrentQueueProducerTypelessBase* currentProducer; - details::ConcurrentQueueProducerTypelessBase* desiredProducer; + ThreadExitListener* tail; }; +#endif +#endif -// Need to forward-declare this swap because it's in a namespace. -// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces -template -inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; +template +struct static_is_lock_free_num { + enum { value = 0 }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_CHAR_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_SHORT_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_INT_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_LONG_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_LLONG_LOCK_FREE }; +}; +template +struct static_is_lock_free: static_is_lock_free_num::type> {}; +template <> +struct static_is_lock_free { + enum { value = ATOMIC_BOOL_LOCK_FREE }; +}; +template +struct static_is_lock_free { + enum { value = ATOMIC_POINTER_LOCK_FREE }; +}; +} // namespace details + +struct ProducerToken { + template + explicit ProducerToken(ConcurrentQueue& queue); + + template + explicit ProducerToken(BlockingConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT: producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. 
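// A minimal sketch of the token lifecycle described above (queue element type
// and names chosen arbitrarily):
//
//     moodycamel::ConcurrentQueue<int> q;
//     moodycamel::ProducerToken ptok(q); // check ptok.valid() if allocation may fail
//     q.enqueue(ptok, 17);               // elements from one token dequeue in FIFO order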
+    inline bool valid() const { return producer != nullptr; }
+
+    ~ProducerToken()
+    {
+        if (producer != nullptr) {
+            producer->token = nullptr;
+            producer->inactive.store(true, std::memory_order_release);
+        }
+    }
+
+    // Disable copying and assignment
+    ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+    ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+private:
+    template <typename T, typename Traits>
+    friend class ConcurrentQueue;
+    friend class ConcurrentQueueTests;

-template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
-class ConcurrentQueue
-{
+protected:
+    details::ConcurrentQueueProducerTypelessBase* producer;
+};
+
+struct ConsumerToken {
+    template <typename T, typename Traits>
+    explicit ConsumerToken(ConcurrentQueue<T, Traits>& q);
+
+    template <typename T, typename Traits>
+    explicit ConsumerToken(BlockingConcurrentQueue<T, Traits>& q);
+
+    ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT: initialOffset(other.initialOffset),
+                                                              lastKnownGlobalOffset(other.lastKnownGlobalOffset),
+                                                              itemsConsumedFromCurrent(other.itemsConsumedFromCurrent),
+                                                              currentProducer(other.currentProducer),
+                                                              desiredProducer(other.desiredProducer)
+    {
+    }
+
+    inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+    {
+        swap(other);
+        return *this;
+    }
+
+    void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT
+    {
+        std::swap(initialOffset, other.initialOffset);
+        std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset);
+        std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent);
+        std::swap(currentProducer, other.currentProducer);
+        std::swap(desiredProducer, other.desiredProducer);
+    }
+
+    // Disable copying and assignment
+    ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+    ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+
+private:
+    template <typename T, typename Traits>
+    friend class ConcurrentQueue;
+    friend class ConcurrentQueueTests;
+
+private: // but shared with ConcurrentQueue
+    std::uint32_t initialOffset;
+    std::uint32_t lastKnownGlobalOffset;
+    std::uint32_t itemsConsumedFromCurrent;
+    details::ConcurrentQueueProducerTypelessBase* currentProducer;
+    details::ConcurrentQueueProducerTypelessBase* desiredProducer;
+};
+
+// Need to forward-declare this swap because it's in a namespace.
+// See
+// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
+template <typename T, typename Traits>
+inline void swap(
+    typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
+    typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT;
+
+template <typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class ConcurrentQueue {
 public:
-    typedef ::moodycamel::ProducerToken producer_token_t;
-    typedef ::moodycamel::ConsumerToken consumer_token_t;
-
-    typedef typename Traits::index_t index_t;
-    typedef typename Traits::size_t size_t;
-
-    static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);
-    static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
-    static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
-    static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
-    static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
-    static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);
+    typedef ::moodycamel::ProducerToken producer_token_t;
+    typedef ::moodycamel::ConsumerToken consumer_token_t;
+
+    typedef typename Traits::index_t index_t;
+    typedef typename Traits::size_t size_t;
+
+    static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);
+    static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD =
+        static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
+    static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
+    static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
+    static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE =
+        static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
+    static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE =
+        static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);

 #ifdef _MSC_VER
 #pragma warning(push)
-#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!)
-#pragma warning(disable: 4309) // static_cast: Truncation of constant value
+#pragma warning(disable : 4307) // + integral constant overflow (that's what the ternary expression is for!)
+#pragma warning(disable : 4309) // static_cast: Truncation of constant value
 #endif
-    static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max<size_t>::value : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE);
+    static const size_t MAX_SUBQUEUE_SIZE =
+        (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ?
+        details::const_numeric_max<size_t>::value :
+        ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE);
 #ifdef _MSC_VER
 #pragma warning(pop)
 #endif

-    static_assert(!std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value, "Traits::size_t must be an unsigned integral type");
-    static_assert(!std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value, "Traits::index_t must be an unsigned integral type");
-    static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t");
-    static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
-    static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)");
-    static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
-    static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
-    static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
-    static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)");
+    static_assert(
+        !std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value,
+        "Traits::size_t must be an unsigned integral type");
+    static_assert(
+        !std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value,
+        "Traits::index_t must be an unsigned integral type");
+    static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t");
+    static_assert(
+        (BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)),
+        "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
+    static_assert(
+        (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) &&
+            !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)),
+        "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)");
+    static_assert(
+        (EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)),
+        "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+    static_assert(
+        (IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)),
+        "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+    static_assert(
+        (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) ||
+            !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)),
+        "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
+    static_assert(
+        INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1,
+        "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)");

 public:
-    // Creates a queue with at least `capacity` element slots; note that the
-    // actual number of elements that can be inserted without additional memory
-    // allocation
depends on the number of producers and the block size (e.g. if - // the block size is equal to `capacity`, only a single block will be allocated - // up-front, which means only a single producer will be able to enqueue elements - // without an extra allocation -- blocks aren't shared between producers). - // This method is not thread safe -- it is up to the user to ensure that the - // queue is fully constructed before it starts being used by other threads (this - // includes making the memory effects of construction visible, possibly with a - // memory barrier). - explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); - + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) + : producerListTail(nullptr) + , producerCount(0) + , initialBlockPoolIndex(0) + , nextExplicitConsumerId(0) + , globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - // Track all the producers using a fully-resolved typed list for - // each kind; this makes it possible to debug them starting from - // the root queue object (otherwise wacky casts are needed that - // don't compile in the debugger's expression evaluator). - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Computes the correct amount of pre-allocated blocks for you based - // on the minimum number of elements you want available at any given - // time, and the maximum concurrent number of each type of producer. 
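// The two construction styles side by side (sizes illustrative):
//
//     moodycamel::ConcurrentQueue<int> a(1024);       // >= 1024 slots, rounded up to blocks
//     moodycamel::ConcurrentQueue<int> b(1024, 4, 2); // >= 1024 slots available with up to
//                                                     // 4 explicit + 2 implicit producers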
- ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); - populate_initial_block_list(blocks); - + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : producerListTail(nullptr) + , producerCount(0) + , initialBlockPoolIndex(0) + , nextExplicitConsumerId(0) + , globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Note: The queue should not be accessed concurrently while it's - // being deleted. It's up to the user to synchronize this. - // This method is not thread safe. - ~ConcurrentQueue() - { - // Destroy producers - auto ptr = producerListTail.load(std::memory_order_relaxed); - while (ptr != nullptr) { - auto next = ptr->next_prod(); - if (ptr->token != nullptr) { - ptr->token->producer = nullptr; - } - destroy(ptr); - ptr = next; - } - - // Destroy implicit producer hash tables - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { - auto hash = implicitProducerHash.load(std::memory_order_relaxed); - while (hash != nullptr) { - auto prev = hash->prev; - if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically - for (size_t i = 0; i != hash->capacity; ++i) { - hash->entries[i].~ImplicitProducerKVP(); - } - hash->~ImplicitProducerHash(); - (Traits::free)(hash); - } - hash = prev; - } - } - - // Destroy global free list - auto block = freeList.head_unsafe(); - while (block != nullptr) { - auto next = block->freeListNext.load(std::memory_order_relaxed); - if (block->dynamicallyAllocated) { - destroy(block); - } - block = next; - } - - // Destroy initial free list - destroy_array(initialBlockPool, initialBlockPoolSize); - } - - // Disable copying and copy assignment - ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - - // Moving is supported, but note that it is *not* a thread-safe operation. 
- // Nobody can use the queue while it's being moved, and the memory effects - // of that move must be propagated to other threads before they can use it. - // Note: When a queue is moved, its tokens are still valid but can only be - // used with the destination queue (i.e. semantically they are moved along - // with the queue itself). - ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), - producerCount(other.producerCount.load(std::memory_order_relaxed)), - initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), - initialBlockPool(other.initialBlockPool), - initialBlockPoolSize(other.initialBlockPoolSize), - freeList(std::move(other.freeList)), - nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), - globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) - { - // Move the other one into this, and leave the other one as an empty queue - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - swap_implicit_producer_hashes(other); - - other.producerListTail.store(nullptr, std::memory_order_relaxed); - other.producerCount.store(0, std::memory_order_relaxed); - other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); - other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); - + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) + { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
+ ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - - other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); - other.initialBlockPoolSize = 0; - other.initialBlockPool = nullptr; - - reown_producers(); - } - - inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - { - return swap_internal(other); - } - - // Swaps this queue's state with the other's. Not thread-safe. - // Swapping two queues does not invalidate their tokens, however - // the tokens that were created for one queue must be used with - // only the swapped queue (i.e. the tokens are tied to the - // queue's movable state, not the object itself). - inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT - { - swap_internal(other); - } - + explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT { return swap_internal(other); } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). 
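// Sketch of the token rebinding described above (not thread-safe, as noted;
// names are illustrative):
//
//     moodycamel::ConcurrentQueue<int> a, b;
//     moodycamel::ProducerToken tok(a);
//     a.swap(b);         // tokens follow the swapped state...
//     b.enqueue(tok, 1); // ...so tok is now usable only with b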
+ inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT { swap_internal(other); } + private: - ConcurrentQueue& swap_internal(ConcurrentQueue& other) - { - if (this == &other) { - return *this; - } - - details::swap_relaxed(producerListTail, other.producerListTail); - details::swap_relaxed(producerCount, other.producerCount); - details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); - std::swap(initialBlockPool, other.initialBlockPool); - std::swap(initialBlockPoolSize, other.initialBlockPoolSize); - freeList.swap(other.freeList); - details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); - details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); - - swap_implicit_producer_hashes(other); - - reown_producers(); - other.reown_producers(); - + ConcurrentQueue& swap_internal(ConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - details::swap_relaxed(explicitProducers, other.explicitProducers); - details::swap_relaxed(implicitProducers, other.implicitProducers); + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); #endif - - return *this; - } - -public: - // Enqueues a single item (by copying it). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. 
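// The allocating vs. non-allocating split, in short (q as in earlier sketches):
//
//     q.enqueue(42);               // may allocate a block; fails only on allocation failure
//     bool ok = q.try_enqueue(43); // never allocates a block (beyond the one-time
//                                  // implicit producer); false when there's no room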
- inline bool enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Allocates memory if required. Only fails if memory allocation fails (or - // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved instead of copied. - // Thread-safe. - template - bool enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails - // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - // Enqueues a single item (by copying it). - // Does not allocate memory. Fails if not enough room to enqueue (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0). - // Thread-safe. - inline bool try_enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Thread-safe. - inline bool try_enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. 
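// The std::make_move_iterator note spelled out (assuming a
// ConcurrentQueue<std::string> named qs):
//
//     std::vector<std::string> batch(8, "payload");
//     qs.enqueue_bulk(std::make_move_iterator(batch.begin()), batch.size()); // moves, not copies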
- template - bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - bool try_dequeue(U& item) - { - // Instead of simply trying each producer in turn (which could cause needless contention on the first - // producer), we score them heuristically. - size_t nonEmptyCount = 0; - ProducerBase* best = nullptr; - size_t bestSize = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { - auto size = ptr->size_approx(); - if (size > 0) { - if (size > bestSize) { - bestSize = size; - best = ptr; - } - ++nonEmptyCount; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (nonEmptyCount > 0) { - if ((details::likely)(best->dequeue(item))) { - return true; - } - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr != best && ptr->dequeue(item)) { - return true; - } - } - } - return false; - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // This differs from the try_dequeue(item) method in that this one does - // not attempt to reduce contention by interleaving the order that producer - // streams are dequeued from. So, using this method can reduce overall throughput - // under contention, but will give more predictable results in single-threaded - // consumer scenarios. This is mostly only useful for internal unit tests. - // Never allocates. Thread-safe. - template - bool try_dequeue_non_interleaved(U& item) - { - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->dequeue(item)) { - return true; - } - } - return false; - } - - // Attempts to dequeue from the queue using an explicit consumer token. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
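// Typical drain loop for the token-based dequeue documented above:
//
//     moodycamel::ConsumerToken ctok(q);
//     int item;
//     while (q.try_dequeue(ctok, item)) {
//         // process item; a false return only means the queue *appeared* empty
//     }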
- template - bool try_dequeue(consumer_token_t& token, U& item) - { - // The idea is roughly as follows: - // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less - // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place - // If there's no items where you're supposed to be, keep moving until you find a producer with some items - // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it - - if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return false; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (static_cast(token.currentProducer)->dequeue(item)) { - if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return true; - } - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - if (ptr->dequeue(item)) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = 1; - return true; - } - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return false; - } - - // Attempts to dequeue several elements from the queue. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - size_t try_dequeue_bulk(It itemFirst, size_t max) - { - size_t count = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - count += ptr->dequeue_bulk(itemFirst, max - count); - if (count == max) { - break; - } - } - return count; - } - - // Attempts to dequeue several elements from the queue using an explicit consumer token. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
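// Bulk counterpart, one call instead of a loop (buffer size illustrative):
//
//     int items[32];
//     size_t n = q.try_dequeue_bulk(items, 32); // returns how many were actually dequeued
//     for (size_t i = 0; i != n; ++i) { /* process items[i] */ }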
- template - size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) - { - if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return 0; - } - } - - size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); - if (count == max) { - if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return max; - } - token.itemsConsumedFromCurrent += static_cast(count); - max -= count; - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - auto dequeued = ptr->dequeue_bulk(itemFirst, max); - count += dequeued; - if (dequeued != 0) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = static_cast(dequeued); - } - if (dequeued == max) { - break; - } - max -= dequeued; - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return count; - } - - - - // Attempts to dequeue from a specific producer's inner queue. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns false if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) - { - return static_cast(producer.producer)->dequeue(item); - } - - // Attempts to dequeue several elements from a specific producer's inner queue. - // Returns the number of items actually dequeued. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns 0 if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) - { - return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); - } - - - // Returns an estimate of the total number of elements currently in the queue. This - // estimate is only accurate if the queue has completely stabilized before it is called - // (i.e. all enqueue and dequeue operations have completed and their memory effects are - // visible on the calling thread, and no further operations start while this method is - // being called). - // Thread-safe. - size_t size_approx() const - { - size_t size = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - size += ptr->size_approx(); - } - return size; - } - - - // Returns true if the underlying atomic variables used by - // the queue are lock-free (they should be on most platforms). - // Thread-safe. 
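// Because is_lock_free() is constexpr, the property can be checked at compile
// time; on mainstream platforms this assertion is expected to hold:
//
//     static_assert(moodycamel::ConcurrentQueue<int>::is_lock_free(),
//                   "expected lock-free atomics on this platform");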
- static constexpr bool is_lock_free() - { - return - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; - } + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) { return inner_enqueue(token, item); } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. 
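// Fastest producer path per the bulk/token guidance earlier in this header
// (names illustrative):
//
//     moodycamel::ProducerToken ptok2(q);
//     int vals[4] = {1, 2, 3, 4};
//     q.enqueue_bulk(ptok2, vals, 4); // one call, one token: least overhead per element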
+ inline bool try_enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U& item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. + size_t nonEmptyCount = 0; + ProducerBase* best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; + ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. 
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // This differs from the try_dequeue(item) method in that this one does
+    // not attempt to reduce contention by interleaving the order that producer
+    // streams are dequeued from. So, using this method can reduce overall throughput
+    // under contention, but will give more predictable results in single-threaded
+    // consumer scenarios. This is mostly only useful for internal unit tests.
+    // Never allocates. Thread-safe.
+    template<typename U>
+    bool try_dequeue_non_interleaved(U& item)
+    {
+        for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+            if (ptr->dequeue(item)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue from the queue using an explicit consumer token.
+    // Returns false if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename U>
+    bool try_dequeue(consumer_token_t& token, U& item)
+    {
+        // The idea is roughly as follows:
+        // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the
+        // highest-efficiency consumer dictates the rotation speed of everyone else, more or less. If you see that
+        // the global offset has changed, you must reset your consumption counter and move to your designated place.
+        // If there are no items where you're supposed to be, keep moving until you find a producer with some items.
+        // If the global offset has not changed but you've run out of items to consume, move over from your current
+        // position until you find a producer with something in it.
+
+        if (token.desiredProducer == nullptr ||
+            token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+            if (!update_current_producer_after_rotation(token)) {
+                return false;
+            }
+        }
+
+        // If there was at least one non-empty queue but it appears empty at the time
+        // we try to dequeue from it, we need to make sure every queue's been tried
+        if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
+            if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+                globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+            }
+            return true;
+        }
+
+        auto tail = producerListTail.load(std::memory_order_acquire);
+        auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+        if (ptr == nullptr) {
+            ptr = tail;
+        }
+        while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+            if (ptr->dequeue(item)) {
+                token.currentProducer = ptr;
+                token.itemsConsumedFromCurrent = 1;
+                return true;
+            }
+            ptr = ptr->next_prod();
+            if (ptr == nullptr) {
+                ptr = tail;
+            }
+        }
+        return false;
+    }
+
+    // Attempts to dequeue several elements from the queue.
+    // Returns the number of items actually dequeued.
+    // Returns 0 if all producer streams appeared empty at the time they
+    // were checked (so, the queue is likely but not guaranteed to be empty).
+    // Never allocates. Thread-safe.
+    template<typename It>
+    size_t try_dequeue_bulk(It itemFirst, size_t max)
+    {
+        size_t count = 0;
+        for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+            count += ptr->dequeue_bulk(itemFirst, max - count);
+            if (count == max) {
+                break;
+            }
+        }
+        return count;
+    }
+
+    // Attempts to dequeue several elements from the queue using an explicit consumer token.
+ // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); + } + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
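+    // Illustrative compile-time check (hypothetical caller code):
+    //     static_assert(moodycamel::ConcurrentQueue<int>::is_lock_free(),
+    //                   "expected lock-free atomics on this platform");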
+    static constexpr bool is_lock_free()
+    {
+        return details::static_is_lock_free<bool>::value == 2 && details::static_is_lock_free<size_t>::value == 2 &&
+               details::static_is_lock_free<std::uint32_t>::value == 2 &&
+               details::static_is_lock_free<index_t>::value == 2 && details::static_is_lock_free<void*>::value == 2 &&
+               details::static_is_lock_free<
+                   typename details::thread_id_converter<details::thread_id_t>::thread_id_numeric_size_t>::value == 2;
+    }

 private:
-    friend struct ProducerToken;
-    friend struct ConsumerToken;
-    struct ExplicitProducer;
-    friend struct ExplicitProducer;
-    struct ImplicitProducer;
-    friend struct ImplicitProducer;
-    friend class ConcurrentQueueTests;
-
-    enum AllocationMode { CanAlloc, CannotAlloc };
-
-
-    ///////////////////////////////
-    // Queue methods
-    ///////////////////////////////
-
-    template<AllocationMode canAlloc, typename U>
-    inline bool inner_enqueue(producer_token_t const& token, U&& element)
-    {
-        return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
-    }
-
-    template<AllocationMode canAlloc, typename U>
-    inline bool inner_enqueue(U&& element)
-    {
-        auto producer = get_or_add_implicit_producer();
-        return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
-    }
-
-    template<AllocationMode canAlloc, typename It>
-    inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
-    {
-        return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);
-    }
-
-    template<AllocationMode canAlloc, typename It>
-    inline bool inner_enqueue_bulk(It itemFirst, size_t count)
-    {
-        auto producer = get_or_add_implicit_producer();
-        return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);
-    }
-
-    inline bool update_current_producer_after_rotation(consumer_token_t& token)
-    {
-        // Ah, there's been a rotation, figure out where we should be!
-        auto tail = producerListTail.load(std::memory_order_acquire);
-        if (token.desiredProducer == nullptr && tail == nullptr) {
-            return false;
-        }
-        auto prodCount = producerCount.load(std::memory_order_relaxed);
-        auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed);
-        if ((details::unlikely)(token.desiredProducer == nullptr)) {
-            // Aha, first time we're dequeueing anything.
-            // Figure out our local position
-            // Note: offset is from start, not end, but we're traversing from end -- subtract from count first
-            std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount);
-            token.desiredProducer = tail;
-            for (std::uint32_t i = 0; i != offset; ++i) {
-                token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
-                if (token.desiredProducer == nullptr) {
-                    token.desiredProducer = tail;
-                }
-            }
-        }
-
-        std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;
-        if (delta >= prodCount) {
-            delta = delta % prodCount;
-        }
-        for (std::uint32_t i = 0; i != delta; ++i) {
-            token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
-            if (token.desiredProducer == nullptr) {
-                token.desiredProducer = tail;
-            }
-        }
-
-        token.lastKnownGlobalOffset = globalOffset;
-        token.currentProducer = token.desiredProducer;
-        token.itemsConsumedFromCurrent = 0;
-        return true;
-    }
-
-
-    ///////////////////////////
-    // Free list
-    ///////////////////////////
-
-    template<typename N>
-    struct FreeListNode
-    {
-        FreeListNode() : freeListRefs(0), freeListNext(nullptr) { }
-
-        std::atomic<std::uint32_t> freeListRefs;
-        std::atomic<N*> freeListNext;
-    };
-
-    // A simple CAS-based lock-free free list.
Not the fastest thing in the world under heavy contention, but - // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly - // speedy under low contention. - template // N must inherit FreeListNode or have the same fields (and initialization of them) - struct FreeList - { - FreeList() : freeListHead(nullptr) { } - FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } - void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } - - FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - - inline void add(N* node) - { + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const& token, U&& element) + { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue(U&& element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? + false : + producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? + false : + producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. 
+ // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode { + FreeListNode(): freeListRefs(0), freeListNext(nullptr) {} + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList { + FreeList(): freeListHead(nullptr) {} + FreeList(FreeList&& other): freeListHead(other.freeListHead.load(std::memory_order_relaxed)) + { + other.freeListHead.store(nullptr, std::memory_order_relaxed); + } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to - // set it using a fetch_add - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { - // Oh look! We were the last ones referencing this node, and we know - // we want to add it to the free list, so let's do it! - add_knowing_refcount_is_zero(node); - } - } - - inline N* try_get() - { + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! 
+ add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - auto head = freeListHead.load(std::memory_order_acquire); - while (head != nullptr) { - auto prevHead = head; - auto refs = head->freeListRefs.load(std::memory_order_relaxed); - if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) { - head = freeListHead.load(std::memory_order_acquire); - continue; - } - - // Good, reference count has been incremented (it wasn't at zero), which means we can read the - // next and not worry about it changing between now and the time we do the CAS - auto next = head->freeListNext.load(std::memory_order_relaxed); - if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { - // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no - // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). - assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); - - // Decrease refcount twice, once for our ref, and once for the list's ref - head->freeListRefs.fetch_sub(2, std::memory_order_release); - return head; - } - - // OK, the head must have changed on us, but we still need to decrease the refcount we increased. - // Note that we don't need to release any memory effects, but we do need to ensure that the reference - // count decrement happens-after the CAS on the head. - refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); - if (refs == SHOULD_BE_ON_FREELIST + 1) { - add_knowing_refcount_is_zero(prevHead); - } - } - - return nullptr; - } - - // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) - N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } - - private: - inline void add_knowing_refcount_is_zero(N* node) - { - // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run - // only one copy of this method per node at a time, i.e. the single thread case), then we know - // we can safely change the next pointer of the node; however, once the refcount is back above - // zero, then other threads could increase it (happens under heavy contention, when the refcount - // goes to zero in between a load and a refcount increment of a node in try_get, then back up to - // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS - // to add the node to the actual list fails, decrease the refcount and leave the add operation to - // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
- auto head = freeListHead.load(std::memory_order_relaxed); - while (true) { - node->freeListNext.store(head, std::memory_order_relaxed); - node->freeListRefs.store(1, std::memory_order_release); - if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { - // Hmm, the add failed, but we can only try again when the refcount goes back to zero - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) { - continue; - } - } - return; - } - } - - private: - // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) - std::atomic freeListHead; - - static const std::uint32_t REFS_MASK = 0x7FFFFFFF; - static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; - + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || + !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong( + head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put + // back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
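+            // (Layout note: freeListRefs packs a 31-bit reference count (REFS_MASK) and the
+            // SHOULD_BE_ON_FREELIST flag in the top bit, so the fetch_add(SHOULD_BE_ON_FREELIST - 1)
+            // below drops our reference and raises the flag in a single atomic step.)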
+ auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong( + head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under + // contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugMutex mutex; -#endif - }; - - - /////////////////////////// - // Block - /////////////////////////// - - enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; - - struct Block - { - Block() - : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true) - { + debug::DebugMutex mutex; +#endif + }; + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block { + Block() + : next(nullptr) + , elementsCompletelyDequeued(0) + , freeListRefs(0) + , freeListNext(nullptr) + , dynamicallyAllocated(true) + { #ifdef MCDBGQ_TRACKMEM - owner = nullptr; -#endif - } - - template - inline bool is_empty() const - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Check flags - for (size_t i = 0; i < BLOCK_SIZE; ++i) { - if (!emptyFlags[i].load(std::memory_order_relaxed)) { - return false; - } - } - - // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - else { - // Check counter - if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); - return false; - } - } - - // Returns true if the block is now empty (does not apply in explicit context) - template - inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flag - assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); - emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); - return false; - } - else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); - assert(prevVal < BLOCK_SIZE); - return prevVal == BLOCK_SIZE - 1; - } - } - - // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). - // Returns true if the block is now empty (does not apply in explicit context). 
- template - inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flags - std::atomic_thread_fence(std::memory_order_release); - i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; - for (size_t j = 0; j != count; ++j) { - assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); - emptyFlags[i + j].store(true, std::memory_order_relaxed); - } - return false; - } - else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel); - assert(prevVal + count <= BLOCK_SIZE); - return prevVal + count == BLOCK_SIZE; - } - } - - template - inline void set_all_empty() - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set all flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(true, std::memory_order_relaxed); - } - } - else { - // Reset counter - elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); - } - } - - template - inline void reset_empty() - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Reset flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(false, std::memory_order_relaxed); - } - } - else { - // Reset counter - elementsCompletelyDequeued.store(0, std::memory_order_relaxed); - } - } - - inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } - inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } - - private: - static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their size at this time"); - MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; - public: - Block* next; - std::atomic elementsCompletelyDequeued; - std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; - public: - std::atomic freeListRefs; - std::atomic freeListNext; - bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' - + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else + { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load( + std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store( + true, std::memory_order_release); + return false; + } + else + { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
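+        // (As with set_empty above: small blocks in the explicit context mark per-slot
+        // emptyFlags, while all other configurations count dequeued elements in
+        // elementsCompletelyDequeued; only the counting path can report "now empty".)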
+ template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else + { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else + { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else + { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT + { + return static_cast(static_cast(elements)) + + static_cast(idx & static_cast(BLOCK_SIZE - 1)); + } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT + { + return static_cast(static_cast(elements)) + + static_cast(idx & static_cast(BLOCK_SIZE - 1)); + } + + private: + static_assert( + std::alignment_of::value <= sizeof(T), + "The queue does not support types with an alignment greater than their size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + + public: + std::atomic freeListRefs; + std::atomic freeListNext; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + #ifdef MCDBGQ_TRACKMEM - void* owner; + void* owner; #endif - }; - static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); - + }; + static_assert( + std::alignment_of::value >= std::alignment_of::value, + "Internal error: Blocks must be at least as aligned as the type they are wrapping"); #ifdef MCDBGQ_TRACKMEM public: - struct MemStats; + struct MemStats; + private: #endif - - /////////////////////////// - // Producer base - /////////////////////////// - - struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase - { - ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : - tailIndex(0), - headIndex(0), - dequeueOptimisticCount(0), - dequeueOvercommit(0), - tailBlock(nullptr), - isExplicit(isExplicit_), - parent(parent_) - { - } - - virtual ~ProducerBase() { } - - template - inline bool dequeue(U& element) - { - if (isExplicit) { - return static_cast(this)->dequeue(element); - } - else { - return static_cast(this)->dequeue(element); - } - } - - template - inline size_t dequeue_bulk(It& itemFirst, size_t max) - { - if (isExplicit) { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - else { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - } - - inline ProducerBase* next_prod() const { return static_cast(next); } - - inline size_t size_approx() const - { - auto tail = tailIndex.load(std::memory_order_relaxed); - auto head = headIndex.load(std::memory_order_relaxed); - return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; - } - - inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } - protected: - std::atomic tailIndex; // Where to enqueue to next - std::atomic headIndex; // Where to dequeue from next - - std::atomic dequeueOptimisticCount; - std::atomic dequeueOvercommit; - - Block* tailBlock; - - public: - bool isExplicit; - ConcurrentQueue* parent; - - protected: + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase: public details::ConcurrentQueueProducerTypelessBase { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) + : tailIndex(0) + , headIndex(0) + , dequeueOptimisticCount(0) + , dequeueOvercommit(0) + , tailBlock(nullptr) + , isExplicit(isExplicit_) + , parent(parent_) + { + } + + virtual ~ProducerBase() {} + + template + inline bool dequeue(U& element) + { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } else { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? 
static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - /////////////////////////// - // Explicit queue - /////////////////////////// - - struct ExplicitProducer : public ProducerBase - { - explicit ExplicitProducer(ConcurrentQueue* parent_) : - ProducerBase(parent_, true), - blockIndex(nullptr), - pr_blockIndexSlotsUsed(0), - pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), - pr_blockIndexFront(0), - pr_blockIndexEntries(nullptr), - pr_blockIndexRaw(nullptr) - { - size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; - if (poolBasedIndexSize > pr_blockIndexSize) { - pr_blockIndexSize = poolBasedIndexSize; - } - - new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE - } - - ~ExplicitProducer() - { - // Destruct any elements not yet dequeued. - // Since we're in the destructor, we can assume all elements - // are either completely dequeued or completely not (no halfways). - if (this->tailBlock != nullptr) { // Note this means there must be a block index too - // First find the block that's partially dequeued, if any - Block* halfDequeuedBlock = nullptr; - if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { - // The head's not on a block boundary, meaning a block somewhere is partially dequeued - // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) - size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); - while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { - i = (i + 1) & (pr_blockIndexSize - 1); - } - assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); - halfDequeuedBlock = pr_blockIndexEntries[i].block; - } - - // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) - auto block = this->tailBlock; - do { - block = block->next; - if (block->ConcurrentQueue::Block::template is_empty()) { - continue; - } - - size_t i = 0; // Offset into block - if (block == halfDequeuedBlock) { - i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - } - - // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index - auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { - (*block)[i++]->~T(); - } - } while (block != this->tailBlock); - } - - // Destroy all blocks that we own - if (this->tailBlock != nullptr) { - auto block = this->tailBlock; - do { - auto nextBlock = block->next; - this->parent->add_block_to_free_list(block); - block = nextBlock; - } while (block != this->tailBlock); - } - - // Destroy the block indices - auto header = static_cast(pr_blockIndexRaw); - while (header != nullptr) { - auto prev = static_cast(header->prev); - header->~BlockIndexHeader(); - (Traits::free)(header); - header = prev; - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto startBlock = this->tailBlock; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - // We can re-use the block ahead of us, it's empty! - this->tailBlock = this->tailBlock->next; - this->tailBlock->ConcurrentQueue::Block::template reset_empty(); - - // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the - // last block from it first -- except instead of removing then adding, we can just overwrite). - // Note that there must be a valid block index here, since even if allocation failed in the ctor, - // it would have been re-attempted when adding the first block to the queue; since there is such - // a block, a block index must have been successfully allocated. - } - else { - // Whatever head value we see here is >= the last value we saw here (relatively), - // and <= its current value. Since we have the most recent tail, the head must be - // <= to it. - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) - || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - // We can't enqueue in another block because there's not enough leeway -- the - // tail could surpass the head by the time the block fills up! (Or we'll exceed - // the size limit, if the second part of the condition was true.) - return false; - } - // We're going to need a new block; check that the block index has room - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { - // Hmm, the circular block index is already full -- we'll need - // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if - // the initial allocation failed in the constructor. 
- - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - return false; - } - else if (!new_block_index(pr_blockIndexSlotsUsed)) { - return false; - } - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - return false; - } + friend struct MemStats; +#endif + }; + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer: public ProducerBase { + explicit ExplicitProducer(ConcurrentQueue* parent_) + : ProducerBase(parent_, true) + , blockIndex(nullptr) + , pr_blockIndexSlotsUsed(0) + , pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1) + , pr_blockIndexFront(0) + , pr_blockIndexEntries(nullptr) + , pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. + // EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). + if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a + // boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than( + pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert( + details::circular_less_than( + pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first + // iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast( + this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach + // the tail index + auto lastValidIndex = + (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
+ BLOCK_SIZE : + static_cast( + this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + this->parent->add_block_to_free_list(block); + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && + this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing + // the last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. + } else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. 
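+                    // (allocMode is the AllocationMode template parameter of enqueue(): the
+                    // try_enqueue family instantiates this path with CannotAlloc, so it bails
+                    // out here rather than growing the block index.)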
+ + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) + { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) + { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } - else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - ++pr_blockIndexSlotsUsed; - } - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - // The constructor may throw. We want the element not to appear in the queue in - // that case (without corrupting the queue): - MOODYCAMEL_TRY { - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH (...) { - // Revert change to the current block, but leave the new block available - // for next time - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock; - MOODYCAMEL_RETHROW; - } - } - else { - (void)startBlock; - (void)originalBlockIndexSlotsUsed; - } - - // Add block to block index - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - // Might be something to dequeue, let's give it a try - - // Note that this if is purely for performance purposes in the common case when the queue is - // empty and the values are eventually consistent -- we may enter here spuriously. - - // Note that whatever the values of overcommit and tail are, they are not going to change (unless we - // change them) and must be the same value at this point (inside the if) as when the if condition was - // evaluated. - - // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. - // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in - // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). - // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all - // read-modify-write operations are guaranteed to work on the latest value in the modification order), but - // unfortunately that can't be shown to be correct using only the C++11 standard. 
- // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case - std::atomic_thread_fence(std::memory_order_acquire); - - // Increment optimistic counter, then check if it went over the boundary - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - - // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever - // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now - // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon - // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. - // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) - // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. - - // Note that we reload tail here in case it changed; it will be the same value as before or greater, since - // this load is sequenced after (happens after) the earlier load above. This is supported by read-read - // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - // Guaranteed to be at least one element to dequeue! - - // Get the index. Note that since there's guaranteed to be at least one element, this - // will never exceed tail. We need to do an acquire-release fence here since it's possible - // that whatever condition got us to this point was for an earlier enqueued element (that - // we already see the memory effects for), but that by the time we increment somebody else - // has incremented it, and we need to see the memory effects for *that* element, which is - // in such a case is necessarily visible on the thread that incremented it in the first - // place with the more current condition (they must have acquired a tail that is at least - // as recent). - auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - - // Determine which block the element is in - - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - // We need to be careful here about subtracting and dividing because of index wrap-around. 
- // When an index wraps, we need to preserve the sign of the offset when dividing it by the - // block size (in order to get a correct signed block count offset in all cases): - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); - auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; - - // Dequeue - auto& el = *((*block)[index]); - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { - // Make sure the element is still fully dequeued and destroyed even if the assignment - // throws - struct Guard { - Block* block; - index_t index; - - ~Guard() - { - (*block)[index]->~T(); - block->ConcurrentQueue::Block::template set_empty(index); - } - } guard = { block, index }; - - element = std::move(el); // NOLINT - } - else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - block->ConcurrentQueue::Block::template set_empty(index); - } - - return true; - } - else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write - } - } - - return false; - } - - template - bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - auto originalBlockIndexFront = pr_blockIndexFront; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - - Block* firstAllocatedBlock = nullptr; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { - // Allocate as many blocks as possible from ahead - while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - this->tailBlock = this->tailBlock->next; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Now allocate as many blocks as necessary from the block pool - while (blockBaseDiff > 0) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - - // pr_blockIndexFront is updated inside new_block_index, so we need to - // update our fallback value too (since we keep the new index even if we - // later fail) - originalBlockIndexFront = originalBlockIndexSlotsUsed; - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) + { + // The constructor may throw. We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY + { + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) + { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
this->tailBlock : startBlock;
+                        MOODYCAMEL_RETHROW;
+                    }
+                }
+                else
+                {
+                    (void)startBlock;
+                    (void)originalBlockIndexSlotsUsed;
+                }
+
+                // Add block to block index
+                auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+                entry.base  = currentTailIndex;
+                entry.block = this->tailBlock;
+                blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release);
+                pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+
+                MOODYCAMEL_CONSTEXPR_IF(
+                    !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element))))
+                {
+                    this->tailIndex.store(newTailIndex, std::memory_order_release);
+                    return true;
+                }
+            }
+
+            // Enqueue
+            new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+
+            this->tailIndex.store(newTailIndex, std::memory_order_release);
+            return true;
+        }
+
+        template <typename U>
+        bool dequeue(U& element)
+        {
+            auto tail = this->tailIndex.load(std::memory_order_relaxed);
+            auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+            if (details::circular_less_than<index_t>(
+                    this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+                // Might be something to dequeue, let's give it a try
+
+                // Note that this if is purely for performance purposes in the common case when the queue is
+                // empty and the values are eventually consistent -- we may enter here spuriously.
+
+                // Note that whatever the values of overcommit and tail are, they are not going to change (unless we
+                // change them) and must be the same value at this point (inside the if) as when the if condition was
+                // evaluated.
+
+                // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit
+                // below. This ensures that whatever the value we got loaded into overcommit, the load of
+                // dequeueOptimisticCount in the fetch_add below will result in a value at least as recent as that (and
+                // therefore at least as large). Note that I believe a compiler (signal) fence here would be sufficient
+                // due to the nature of fetch_add (all read-modify-write operations are guaranteed to work on the latest
+                // value in the modification order), but unfortunately that can't be shown to be correct using only the
+                // C++11 standard. See
+                // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
+                std::atomic_thread_fence(std::memory_order_acquire);
+
+                // Increment optimistic counter, then check if it went over the boundary
+                auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+
+                // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is
+                // only ever incremented after dequeueOptimisticCount -- this is enforced in the `else` block below),
+                // and since we now have a version of dequeueOptimisticCount that is at least as recent as overcommit
+                // (due to the release upon incrementing dequeueOvercommit and the acquire above that synchronizes with
+                // it), overcommit <= myDequeueCount. However, we can't assert this since both dequeueOptimisticCount
+                // and dequeueOvercommit may (independently) overflow; in such a case, though, the logic still holds
+                // since the difference between the two is maintained.
+
+                // Note that we reload tail here in case it changed; it will be the same value as before or greater,
+                // since this load is sequenced after (happens after) the earlier load above. This is supported by
+                // read-read coherency (as defined in the standard), explained here:
+                // http://en.cppreference.com/w/cpp/atomic/memory_order
+                tail = this->tailIndex.load(std::memory_order_acquire);
+                if ((details::likely)(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+                    // Guaranteed to be at least one element to dequeue!
+
+                    // Get the index. Note that since there's guaranteed to be at least one element, this
+                    // will never exceed tail. We need to do an acquire-release fence here since it's possible
+                    // that whatever condition got us to this point was for an earlier enqueued element (that
+                    // we already see the memory effects for), but that by the time we increment somebody else
+                    // has incremented it, and we need to see the memory effects for *that* element, which in
+                    // such a case is necessarily visible on the thread that incremented it in the first
+                    // place with the more current condition (they must have acquired a tail that is at least
+                    // as recent).
+                    auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+
+                    // Determine which block the element is in
+
+                    auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+                    auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+
+                    // We need to be careful here about subtracting and dividing because of index wrap-around.
+                    // When an index wraps, we need to preserve the sign of the offset when dividing it by the
+                    // block size (in order to get a correct signed block count offset in all cases):
+                    auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+                    auto blockBaseIndex = index & ~static_cast<index_t>(BLOCK_SIZE - 1);
+                    auto offset = static_cast<size_t>(
+                        static_cast<typename std::make_signed<index_t>::type>(blockBaseIndex - headBase) /
+                        static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
+                    auto block =
+                        localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block;
+
+                    // Dequeue
+                    auto& el = *((*block)[index]);
+                    if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
+                        // Make sure the element is still fully dequeued and destroyed even if the assignment
+                        // throws
+                        struct Guard {
+                            Block* block;
+                            index_t index;
+
+                            ~Guard()
+                            {
+                                (*block)[index]->~T();
+                                block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+                            }
+                        } guard = {block, index};
+
+                        element = std::move(el);  // NOLINT
+                    } else {
+                        element = std::move(el);  // NOLINT
+                        el.~T();                  // NOLINT
+                        block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+                    }
+
+                    return true;
+                } else {
+                    // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+                    this->dequeueOvercommit.fetch_add(
+                        1, std::memory_order_release);  // Release so that the fetch_add on dequeueOptimisticCount is
+                                                        // guaranteed to happen before this write
+                }
+            }
+
+            return false;
+        }
+
+        template <AllocationMode allocMode, typename It>
+        bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count)
+        {
+            // First, we need to make sure we have enough room to enqueue all of the elements;
+            // this means pre-allocating blocks and putting them in the block index (but only if
+            // all the allocations succeeded).
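
The sign-preservation note in dequeue() above is the subtlest arithmetic in this hunk, so here is a minimal standalone sketch of the same computation. It is not part of the patch: the 8-bit index type, the block size of 32, and the four-entry block index are hypothetical stand-ins chosen so the wrap is easy to see; only the cast-to-signed-before-dividing pattern mirrors the queue's code.

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    int main()
    {
        using index_t = std::uint8_t;        // stand-in for the queue's index type
        constexpr index_t BLOCK_SIZE = 32;   // power of two, as the queue requires
        const std::size_t indexSize = 4;     // circular block-index capacity
        const std::size_t head = 0;          // slot holding the head entry

        index_t headBase = 224;              // base index stored at `head`
        index_t index = 200;                 // element one block *behind* headBase

        index_t blockBase = index & static_cast<index_t>(~(BLOCK_SIZE - 1));  // == 192

        // Casting to the signed counterpart first makes 192 - 224 read as -32
        // rather than 224, so the division yields a block offset of -1:
        auto offset = static_cast<std::size_t>(
            static_cast<std::int8_t>(blockBase - headBase) / static_cast<std::int8_t>(BLOCK_SIZE));

        // The circular mask then resolves the negative offset to the right slot:
        assert(((head + offset) & (indexSize - 1)) == 3);
        return 0;
    }

Masking with (size - 1) afterwards is what lets the signed offset, stored back into an unsigned type, still land on the correct circular slot.
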
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && + this->tailBlock->next != firstAllocatedBlock && + this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) + { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) + { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + return false; + } + #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template set_all_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } - else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; - - ++pr_blockIndexSlotsUsed; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and - // publish the new block index front - auto block = firstAllocatedBlock; - while (true) { - block->ConcurrentQueue::Block::template reset_empty(); - if (block == this->tailBlock) { - break; - } - block = block->next; - } - - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - auto endBlock = this->tailBlock; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else { - MOODYCAMEL_TRY { - while (currentTailIndex != stopIndex) { - // Must use copy constructor even if move constructor is available - // because we may have to revert if there's an exception. - // Sorry about the horrible templated next line, but it was the only way - // to disable moving *at compile time*, which is important because a type - // may only define a (noexcept) move constructor, and so calls to the - // cctor will not compile, even if they are in an if branch that will never - // be executed - new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) { - // Oh dear, an exception's been thrown -- destroy the elements that - // were enqueued so far and revert the entire bulk operation (we'll keep - // any allocated blocks in our linked list for later, though). - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - if (firstAllocatedBlock != nullptr) - blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Determine which block the first element is in - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); - auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); - - // Iterate the blocks and dequeue - auto index = firstIndex; - do { - auto firstIndexInBlock = index; - index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; - auto block = localBlockIndex->entries[indexIndex].block; - if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else { - MOODYCAMEL_TRY { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH (...) { - // It's too late to revert the dequeue, but we can make sure that all - // the dequeued objects are properly destroyed and the block index - // (and empty count) are properly updated before we propagate the exception - do { - block = localBlockIndex->entries[indexIndex].block; - while (index != endIndex) { - (*block)[index++]->~T(); - } - block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - - firstIndexInBlock = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - struct BlockIndexEntry - { - index_t base; - Block* block; - }; - - struct BlockIndexHeader - { - size_t size; - std::atomic front; // Current slot (not next, like pr_blockIndexFront) - BlockIndexEntry* entries; - void* prev; - }; - - - bool new_block_index(size_t numberOfFilledSlotsToExpose) - { - auto prevBlockSizeMask = pr_blockIndexSize - 1; - - // Create the new block - pr_blockIndexSize <<= 1; - auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); - if (newRawPtr == nullptr) { - pr_blockIndexSize >>= 1; // Reset to allow graceful retry - return false; - } - - auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); - - // Copy in all the old indices, if any - size_t j = 0; - if (pr_blockIndexSlotsUsed != 0) { - auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; - do { - newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; - i = (i + 1) & prevBlockSizeMask; - } while (i != pr_blockIndexFront); - } - - // Update everything - auto header = new (newRawPtr) BlockIndexHeader; - header->size = pr_blockIndexSize; - header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); - header->entries = newBlockIndexEntries; - header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later - - pr_blockIndexFront = j; - pr_blockIndexEntries = newBlockIndexEntries; - pr_blockIndexRaw = newRawPtr; - blockIndex.store(header, std::memory_order_release); - - return true; - } - - private: - std::atomic blockIndex; - - // To be used by producer 
only -- consumer must use the ones in referenced by blockIndex - size_t pr_blockIndexSlotsUsed; - size_t pr_blockIndexSize; - size_t pr_blockIndexFront; // Next slot (not current) - BlockIndexEntry* pr_blockIndexEntries; - void* pr_blockIndexRaw; - + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) + { + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert( + (startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || + count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) + { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else + { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) + T(details::nomove_if(nullptr)) + T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) + { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). 
+ auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) + { + if (firstAllocatedBlock != nullptr) + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = + static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>(firstBlockBaseIndex - headBase) / + static_cast::type>(BLOCK_SIZE)); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + index_t endIndex = + (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) ? 
+ firstIndex + static_cast(actualCount) : + endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN( + T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } else { + MOODYCAMEL_TRY + { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) + { + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty( + firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) ? + firstIndex + static_cast(actualCount) : + endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty( + firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry { + index_t base; + Block* block; + }; + + struct BlockIndexHeader { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast( + details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To 
be used by producer only -- consumer must use the ones referenced by blockIndex
+        size_t pr_blockIndexSlotsUsed;
+        size_t pr_blockIndexSize;
+        size_t pr_blockIndexFront;  // Next slot (not current)
+        BlockIndexEntry* pr_blockIndexEntries;
+        void* pr_blockIndexRaw;
+
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-    public:
-        ExplicitProducer* nextExplicitProducer;
-    private:
+    public:
+        ExplicitProducer* nextExplicitProducer;
+
+    private:
#endif
-
+
#ifdef MCDBGQ_TRACKMEM
-        friend struct MemStats;
-#endif
-    };
-
-
-    //////////////////////////////////
-    // Implicit queue
-    //////////////////////////////////
-
-    struct ImplicitProducer : public ProducerBase
-    {
-        ImplicitProducer(ConcurrentQueue* parent_) :
-            ProducerBase(parent_, false),
-            nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
-            blockIndex(nullptr)
-        {
-            new_block_index();
-        }
-
-        ~ImplicitProducer()
-        {
-            // Note that since we're in the destructor we can assume that all enqueue/dequeue operations
-            // completed already; this means that all undequeued elements are placed contiguously across
-            // contiguous blocks, and that only the first and last remaining blocks can be only partially
-            // empty (all other remaining blocks must be completely full).
-
+        friend struct MemStats;
+#endif
+    };
+
+    //////////////////////////////////
+    // Implicit queue
+    //////////////////////////////////
+
+    struct ImplicitProducer: public ProducerBase {
+        ImplicitProducer(ConcurrentQueue* parent_)
+            : ProducerBase(parent_, false), nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), blockIndex(nullptr)
+        {
+            new_block_index();
+        }
+
+        ~ImplicitProducer()
+        {
+            // Note that since we're in the destructor we can assume that all enqueue/dequeue operations
+            // completed already; this means that all undequeued elements are placed contiguously across
+            // contiguous blocks, and that only the first and last remaining blocks can be only partially
+            // empty (all other remaining blocks must be completely full).
+
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
-            // Unregister ourselves for thread termination notification
-            if (!this->inactive.load(std::memory_order_relaxed)) {
-                details::ThreadExitNotifier::unsubscribe(&threadExitListener);
-            }
-#endif
-
-            // Destroy all remaining elements!
-            auto tail = this->tailIndex.load(std::memory_order_relaxed);
-            auto index = this->headIndex.load(std::memory_order_relaxed);
-            Block* block = nullptr;
-            assert(index == tail || details::circular_less_than<index_t>(index, tail));
-            bool forceFreeLastBlock = index != tail;  // If we enter the loop, then the last (tail) block will not be freed
-            while (index != tail) {
-                if ((index & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 || block == nullptr) {
-                    if (block != nullptr) {
-                        // Free the old block
-                        this->parent->add_block_to_free_list(block);
-                    }
-
-                    block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed);
-                }
-
-                ((*block)[index])->~T();
-                ++index;
-            }
-            // Even if the queue is empty, there's still one block that's not on the free list
-            // (unless the head index reached the end of it, in which case the tail will be poised
-            // to create a new block).
- if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { - this->parent->add_block_to_free_list(this->tailBlock); - } - - // Destroy block index - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - if (localBlockIndex != nullptr) { - for (size_t i = 0; i != localBlockIndex->capacity; ++i) { - localBlockIndex->index[i]->~BlockIndexEntry(); - } - do { - auto prev = localBlockIndex->prev; - localBlockIndex->~BlockIndexHeader(); - (Traits::free)(localBlockIndex); - localBlockIndex = prev; - } while (localBlockIndex != nullptr); - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - return false; - } + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = + index != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
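
A consumer-visible consequence of the destructor logic above is that items enqueued but never dequeued are destroyed together with the queue. A minimal usage sketch, assuming the conventional moodycamel namespace and this header's usual include name:

    #include <cassert>
    #include <memory>
    #include "concurrentqueue.h"

    int main()
    {
        auto payload = std::make_shared<int>(42);
        {
            moodycamel::ConcurrentQueue<std::shared_ptr<int>> q;
            q.enqueue(payload);                  // one undequeued element remains
            assert(payload.use_count() == 2);    // queue holds a copy
        }                                        // ~ConcurrentQueue destroys the leftover copy
        assert(payload.use_count() == 1);
        return 0;
    }
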
+ if (this->tailBlock != nullptr && + (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry; - if (!insert_block_index_entry(idxEntry, currentTailIndex)) { - return false; - } - - // Get ahold of a new block - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - return false; - } + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - // May throw, try to insert now before we publish the fact that we have this new block - MOODYCAMEL_TRY { - new ((*newBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH (...) 
{ - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(newBlock); - MOODYCAMEL_RETHROW; - } - } - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - this->tailBlock = newBlock; - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - // See ExplicitProducer::dequeue for rationale and explanation - index_t tail = this->tailIndex.load(std::memory_order_relaxed); - index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - std::atomic_thread_fence(std::memory_order_acquire); - - index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - // Determine which block the element is in - auto entry = get_block_index_entry_for_index(index); - - // Dequeue - auto block = entry->value.load(std::memory_order_relaxed); - auto& el = *((*block)[index]); - - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) + { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY + { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) 
+ { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) + { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - // Note: Acquiring the mutex with every dequeue instead of only when a block - // is released is very sub-optimal, but it is, after all, purely debug code. - debug::DebugLock lock(producer->mutex); -#endif - struct Guard { - Block* block; - index_t index; - BlockIndexEntry* entry; - ConcurrentQueue* parent; - - ~Guard() - { - (*block)[index]->~T(); - if (block->ConcurrentQueue::Block::template set_empty(index)) { - entry->value.store(nullptr, std::memory_order_relaxed); - parent->add_block_to_free_list(block); - } - } - } guard = { block, index, entry, this->parent }; - - element = std::move(el); // NOLINT - } - else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - - if (block->ConcurrentQueue::Block::template set_empty(index)) { - { + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. 
+ debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = {block, index, entry, this->parent}; + + element = std::move(el); // NOLINT + } else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty(index)) { + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Add the block back into the global free pool (and remove from block index) - entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - } - - return true; - } - else { - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); - } - } - - return false; - } - + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + } + + return true; + } else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable: 4706) // assignment within conditional expression -#endif - template - bool enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - - // Note that the tailBlock we start off with may not be owned by us any more; - // this happens if it was filled up exactly to the top (setting tailIndex to - // the first index of the next block which is not yet allocated), then dequeued - // completely (putting it on the free list) before we enqueue again. - - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - Block* firstAllocatedBlock = nullptr; - auto endBlock = this->tailBlock; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { +#pragma warning(disable : 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. 
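
For orientation, the bulk machinery in this hunk backs the public enqueue_bulk/try_dequeue_bulk calls: enqueue_bulk inserts either all of the requested items or none of them, while try_dequeue_bulk reports how many items it actually took. A small usage sketch, with the same include assumptions as above:

    #include <cassert>
    #include <cstddef>
    #include "concurrentqueue.h"

    int main()
    {
        moodycamel::ConcurrentQueue<int> q;
        int in[5] = {1, 2, 3, 4, 5};
        bool ok = q.enqueue_bulk(in, 5);               // inserts all five or none
        assert(ok);

        int out[8];
        std::size_t got = q.try_dequeue_bulk(out, 8);  // count actually dequeued
        assert(got == 5 && out[0] == 1);
        return 0;
    }
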
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - do { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell - Block* newBlock; - bool indexInserted = false; - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - - if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { - // Index allocation or block allocation failed; revert any other allocations - // and index insertions done so far for this operation - if (indexInserted) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - } - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - - return false; - } - + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry = + nullptr; // initialization here unnecessary but compiler can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + + if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || + (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == + nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + 
idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - newBlock->next = nullptr; - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - // Store the chain of blocks so that we can undo if later allocations fail, - // and so that we can find the blocks when we do the actual enqueueing - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { - assert(this->tailBlock != nullptr); - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - endBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; - } while (blockBaseDiff > 0); - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else { - MOODYCAMEL_TRY { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) 
{ - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - auto idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert( + (startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || + count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) + { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else + { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) + T(details::nomove_if(nullptr)) + T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) 
+ { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } #ifdef _MSC_VER #pragma warning(pop) #endif - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Iterate the blocks and dequeue - auto index = firstIndex; - BlockIndexHeader* localBlockIndex; - auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); - do { - auto blockStartIndex = index; - index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
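
The dequeueOptimisticCount/dequeueOvercommit bookkeeping relied on by the dequeue paths of both producer types is easier to see in isolation. Below is a single-threaded sketch of just that accounting; the member names are borrowed from the queue, the types are simplified, and the wrap-aware circular_less_than comparison is replaced by a plain < for brevity.

    #include <atomic>
    #include <cassert>
    #include <cstdint>

    int main()
    {
        std::atomic<std::uint64_t> tail{3};              // 3 elements enqueued
        std::atomic<std::uint64_t> optimisticCount{0};
        std::atomic<std::uint64_t> overcommit{0};

        auto tryDequeue = [&]() -> bool {
            // Optimistically claim an element first...
            auto my = optimisticCount.fetch_add(1, std::memory_order_relaxed);
            if (my - overcommit.load(std::memory_order_relaxed) <
                tail.load(std::memory_order_acquire)) {
                return true;                             // claimed a real element
            }
            // ...nothing there after all: undo the optimistic increment.
            overcommit.fetch_add(1, std::memory_order_release);
            return false;
        };

        int taken = 0;
        for (int i = 0; i < 5; ++i)
            taken += tryDequeue() ? 1 : 0;
        assert(taken == 3);
        // (optimisticCount - overcommit) stays an eventually-consistent
        // count of successful dequeues:
        assert(optimisticCount - overcommit == 3);
        return 0;
    }
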
firstIndex + static_cast(actualCount) : endIndex; - - auto entry = localBlockIndex->index[indexIndex]; - auto block = entry->value.load(std::memory_order_relaxed); - if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else { - MOODYCAMEL_TRY { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH (...) { - do { - entry = localBlockIndex->index[indexIndex]; - block = entry->value.load(std::memory_order_relaxed); - while (index != endIndex) { - (*block)[index++]->~T(); - } - - if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = + static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + index_t endIndex = + (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) ? + firstIndex + static_cast(actualCount) : + endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN( + T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } else { + MOODYCAMEL_TRY + { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) 
+ { + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty( + blockStartIndex, static_cast(endIndex - blockStartIndex))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - entry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(block); - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - - blockStartIndex = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { - { + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) ? + firstIndex + static_cast(actualCount) : + endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty( + blockStartIndex, static_cast(endIndex - blockStartIndex))) { + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Note that the set_many_empty above did a release, meaning that anybody who acquires the block - // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
- entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else { - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - // The block size must be > 1, so any number with the low bit set is an invalid block base index - static const index_t INVALID_BLOCK_BASE = 1; - - struct BlockIndexEntry - { - std::atomic key; - std::atomic value; - }; - - struct BlockIndexHeader - { - size_t capacity; - std::atomic tail; - BlockIndexEntry* entries; - BlockIndexEntry** index; - BlockIndexHeader* prev; - }; - - template - inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK - if (localBlockIndex == nullptr) { - return false; // this can happen if new_block_index failed in the constructor - } - size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || - idxEntry->value.load(std::memory_order_relaxed) == nullptr) { - - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - - // No room in the old block index, try to allocate another one! - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - return false; - } - else if (!new_block_index()) { - return false; - } - else { - localBlockIndex = blockIndex.load(std::memory_order_relaxed); - newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - } - - inline void rewind_block_index_tail() - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); - } - - inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const - { - BlockIndexHeader* localBlockIndex; - auto idx = get_block_index_index_for_index(index, localBlockIndex); - return localBlockIndex->index[idx]; - } - - inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const - { + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires + // the block we're about to free can use it safely since our writes (and reads!) will + // have happened-before then. 
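+                        // In other words, the release in set_many_empty pairs with the acquire
+                        // performed by whichever thread later takes this block from the free
+                        // list, so the element destructions above are visible before the block
+                        // can be reused.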
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) + { + auto localBlockIndex = + blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the constructor + } + size_t newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) + { + return false; + } + else if (!new_block_index()) + { + return false; + } + else + { + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store( + (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), + std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - index &= ~static_cast(BLOCK_SIZE - 1); - localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto tail = localBlockIndex->tail.load(std::memory_order_acquire); - auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); - assert(tailBase != INVALID_BLOCK_BASE); - // Note: Must use division instead of shift because the index may wrap around, causing a negative - // offset, whose negativity we want to preserve - auto offset = static_cast(static_cast::type>(index - tailBase) / static_cast::type>(BLOCK_SIZE)); - size_t idx = (tail + offset) & 
(localBlockIndex->capacity - 1); - assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); - return idx; - } - - bool new_block_index() - { - auto prev = blockIndex.load(std::memory_order_relaxed); - size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; - auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; - auto raw = static_cast((Traits::malloc)( - sizeof(BlockIndexHeader) + - std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + - std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); - if (raw == nullptr) { - return false; - } - - auto header = new (raw) BlockIndexHeader; - auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); - auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); - if (prev != nullptr) { - auto prevTail = prev->tail.load(std::memory_order_relaxed); - auto prevPos = prevTail; - size_t i = 0; - do { - prevPos = (prevPos + 1) & (prev->capacity - 1); - index[i++] = prev->index[prevPos]; - } while (prevPos != prevTail); - assert(i == prevCapacity); - } - for (size_t i = 0; i != entryCount; ++i) { - new (entries + i) BlockIndexEntry; - entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); - index[prevCapacity + i] = entries + i; - } - header->prev = prev; - header->entries = entries; - header->index = index; - header->capacity = nextBlockIndexCapacity; - header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); - - blockIndex.store(header, std::memory_order_release); - - nextBlockIndexCapacity <<= 1; - - return true; - } - - private: - size_t nextBlockIndexCapacity; - std::atomic blockIndex; + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast( + static_cast::type>(index - tailBase) / + static_cast::type>(BLOCK_SIZE)); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + assert( + localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && + localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? 
nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast( + (Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * entryCount + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = + reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast(details::align_for( + reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - public: - details::ThreadExitListener threadExitListener; - private: + public: + details::ThreadExitListener threadExitListener; + + private: #endif - + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ImplicitProducer* nextImplicitProducer; - private: + public: + ImplicitProducer* nextImplicitProducer; + + private: #endif #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - mutable debug::DebugMutex mutex; + mutable debug::DebugMutex mutex; #endif #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - ////////////////////////////////// - // Block pool manipulation - ////////////////////////////////// - - void populate_initial_block_list(size_t blockCount) - { - initialBlockPoolSize = blockCount; - if (initialBlockPoolSize == 0) { - initialBlockPool = nullptr; - return; - } - - initialBlockPool = create_array(blockCount); - if (initialBlockPool == nullptr) { - initialBlockPoolSize = 0; - } - for (size_t i = 0; i < initialBlockPoolSize; ++i) { - initialBlockPool[i].dynamicallyAllocated = false; - } - } - - inline Block* try_get_block_from_initial_pool() - { - if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { - return nullptr; - } - - auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); - - return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; - } - - inline void add_block_to_free_list(Block* block) - { + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { #ifdef MCDBGQ_TRACKMEM - block->owner = nullptr; -#endif - if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { - destroy(block); - } - else { - freeList.add(block); - } - } - - inline void add_blocks_to_free_list(Block* block) - { - while (block != nullptr) { - auto next = block->next; - add_block_to_free_list(block); - block = next; - } - } - - inline Block* try_get_block_from_free_list() - { - return freeList.try_get(); - } - - // Gets a free block from one of the memory pools, or allocates a new one (if applicable) - template - Block* requisition_block() - { - auto block = try_get_block_from_initial_pool(); - if (block != nullptr) { - return block; - } - - block = try_get_block_from_free_list(); - if (block != nullptr) { - return block; - } - - MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { - return create(); - } - else { - return nullptr; - } - } - + block->owner = nullptr; +#endif + if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { + destroy(block); + } else { + freeList.add(block); + } + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() { return freeList.try_get(); } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) + { + return create(); + } + else + { + return nullptr; + } + } #ifdef MCDBGQ_TRACKMEM - public: - struct MemStats { - size_t allocatedBlocks; - size_t usedBlocks; - size_t freeBlocks; - size_t ownedBlocksExplicit; - size_t ownedBlocksImplicit; - size_t implicitProducers; - size_t explicitProducers; - size_t elementsEnqueued; - size_t blockClassBytes; - size_t queueClassBytes; - size_t implicitBlockIndexBytes; - size_t explicitBlockIndexBytes; - - friend class ConcurrentQueue; - - private: - static MemStats getFor(ConcurrentQueue* q) - { - MemStats stats = { 0 }; - - stats.elementsEnqueued = q->size_approx(); - - auto block = q->freeList.head_unsafe(); - while (block != nullptr) { - ++stats.allocatedBlocks; - ++stats.freeBlocks; - block = block->freeListNext.load(std::memory_order_relaxed); - } - - for (auto ptr = 
q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - bool implicit = dynamic_cast(ptr) != nullptr; - stats.implicitProducers += implicit ? 1 : 0; - stats.explicitProducers += implicit ? 0 : 1; - - if (implicit) { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ImplicitProducer); - auto head = prod->headIndex.load(std::memory_order_relaxed); - auto tail = prod->tailIndex.load(std::memory_order_relaxed); - auto hash = prod->blockIndex.load(std::memory_order_relaxed); - if (hash != nullptr) { - for (size_t i = 0; i != hash->capacity; ++i) { - if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { - ++stats.allocatedBlocks; - ++stats.ownedBlocksImplicit; - } - } - stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); - for (; hash != nullptr; hash = hash->prev) { - stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); - } - } - for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { - //auto block = prod->get_block_index_entry_for_index(head); - ++stats.usedBlocks; - } - } - else { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ExplicitProducer); - auto tailBlock = prod->tailBlock; - bool wasNonEmpty = false; - if (tailBlock != nullptr) { - auto block = tailBlock; - do { - ++stats.allocatedBlocks; - if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { - ++stats.usedBlocks; - wasNonEmpty = wasNonEmpty || block != tailBlock; - } - ++stats.ownedBlocksExplicit; - block = block->next; - } while (block != tailBlock); - } - auto index = prod->blockIndex.load(std::memory_order_relaxed); - while (index != nullptr) { - stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); - index = static_cast(index->prev); - } - } - } - - auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); - stats.allocatedBlocks += freeOnInitialPool; - stats.freeBlocks += freeOnInitialPool; - - stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; - stats.queueClassBytes += sizeof(ConcurrentQueue); - - return stats; - } - }; - - // For debugging only. Not thread-safe. 
- MemStats getMemStats() - { - return MemStats::getFor(this); - } - private: - friend struct MemStats; -#endif - - - ////////////////////////////////// - // Producer list manipulation - ////////////////////////////////// - - ProducerBase* recycle_or_create_producer(bool isExplicit) - { +public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = {0}; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; + ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != + ImplicitProducer::INVALID_BLOCK_BASE && + hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += + sizeof(typename ImplicitProducer::BlockIndexHeader) + + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { + // auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += + sizeof(typename ExplicitProducer::BlockIndexHeader) + + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast(index->prev); + } + } + } + + auto freeOnInitialPool = + q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 
+ 0 : + q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() { return MemStats::getFor(this); } + +private: + friend struct MemStats; +#endif + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - // Try to re-use one first - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { - bool expected = true; - if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { - // We caught one! It's been marked as activated, the caller can have it - return ptr; - } - } - } - - return add_producer(isExplicit ? static_cast(create(this)) : create(this)); - } - - ProducerBase* add_producer(ProducerBase* producer) - { - // Handle failed memory allocation - if (producer == nullptr) { - return nullptr; - } - - producerCount.fetch_add(1, std::memory_order_relaxed); - - // Add it to the lock-free list - auto prevTail = producerListTail.load(std::memory_order_relaxed); - do { - producer->next = prevTail; - } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); - + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong( + expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + return ptr; + } + } + } + + return add_producer( + isExplicit ? 
static_cast(create(this)) : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak( + prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - if (producer->isExplicit) { - auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextExplicitProducer = prevTailExplicit; - } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); - } - else { - auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextImplicitProducer = prevTailImplicit; - } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); - } -#endif - - return producer; - } - - void reown_producers() - { - // After another instance is moved-into/swapped-with this one, all the - // producers we stole still think their parents are the other queue. - // So fix them up! - for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { - ptr->parent = this; - } - } - - - ////////////////////////////////// - // Implicit producer hash - ////////////////////////////////// - - struct ImplicitProducerKVP - { - std::atomic key; - ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place - - ImplicitProducerKVP() : value(nullptr) { } - - ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); - value = other.value; - } - - inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT - { - if (this != &other) { - details::swap_relaxed(key, other.key); - std::swap(value, other.value); - } - } - }; - - template - friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; - - struct ImplicitProducerHash - { - size_t capacity; - ImplicitProducerKVP* entries; - ImplicitProducerHash* prev; - }; - - inline void populate_initial_implicit_producer_hash() - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { - return; - } - else { - implicitProducerHashCount.store(0, std::memory_order_relaxed); - auto hash = &initialImplicitProducerHash; - hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; - hash->entries = &initialImplicitProducerHashEntries[0]; - for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { - initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - hash->prev = nullptr; - implicitProducerHash.store(hash, std::memory_order_relaxed); - } - } - - void swap_implicit_producer_hashes(ConcurrentQueue& other) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { - return; - } - else { - // Swap (assumes our implicit producer hash is 
initialized) - initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); - initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; - other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; - - details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); - - details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); - if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { - implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { - continue; - } - hash->prev = &initialImplicitProducerHash; - } - if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { - other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { - continue; - } - hash->prev = &other.initialImplicitProducerHash; - } - } - } - - // Only fails (returns nullptr) if memory allocation fails - ImplicitProducer* get_or_add_implicit_producer() - { - // Note that since the data is essentially thread-local (key is thread ID), - // there's a reduced need for fences (memory ordering is already consistent - // for any individual thread), except for the current table itself. - - // Start by looking for the thread ID in the current and all previous hash tables. - // If it's not found, it must not be in there yet, since this same thread would - // have added it previously to one of the tables that we traversed. - - // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table - + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak( + prevTailExplicit, + static_cast(producer), + std::memory_order_release, + std::memory_order_relaxed)); + } else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak( + prevTailImplicit, + static_cast(producer), + std::memory_order_release, + std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
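+        // The producer list itself was already transferred by the move/swap;
+        // only each producer's parent pointer is stale here.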
+ for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP { + std::atomic key; + ImplicitProducer* + value; // No need for atomicity since it's only read by the thread that sets it in the first place + + ImplicitProducerKVP(): value(nullptr) {} + + ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap( + typename ConcurrentQueue::ImplicitProducerKVP&, + typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash { + size_t capacity; + ImplicitProducerKVP* entries; + ImplicitProducerHash* prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + { + return; + } + else + { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue& other) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + { + return; + } + else + { + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); + } else { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &other.initialImplicitProducerHash; + hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); + } else { + ImplicitProducerHash* hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &initialImplicitProducerHash; + hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer* 
get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. + // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - - auto mainHash = implicitProducerHash.load(std::memory_order_acquire); - assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) - for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { - // Look for the id in this hash - auto index = hashedId; - while (true) { // Not an infinite loop because at least one slot is free in the hash table - index &= hash->capacity - 1u; - - auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); - if (probedKey == id) { - // Found it! If we had to search several hashes deep, though, we should lazily add it - // to the current main hash table to avoid the extended search next time. - // Note there's guaranteed to be room in the current hash table since every subsequent - // table implicitly reserves space for all previous tables (there's only one - // implicitProducerHashCount). - auto value = hash->entries[index].value; - if (hash != mainHash) { - index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1u; + + auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
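+                    // The table uses power-of-two capacities with linear probing: e.g. with
+                    // capacity 32, a thread whose hashedId maps to slot 7 probes 7, 8, 9, ...
+                    // (mod 32) until it finds its own id or an empty slot.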
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || - mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || + mainHash->entries[index].key.compare_exchange_strong( + reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { #else - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { -#endif - mainHash->entries[index].value = value; - break; - } - ++index; - } - } - - return value; - } - if (probedKey == details::invalid_thread_id) { - break; // Not in this hash table - } - ++index; - } - } - - // Insert! - auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); - while (true) { - // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) - if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { - // We've acquired the resize lock, try to allocate a bigger hash table. - // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when - // we reload implicitProducerHash it must be the most recent version (it only gets changed within this - // locked block). 
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - if (newCount >= (mainHash->capacity >> 1)) { - size_t newCapacity = mainHash->capacity << 1; - while (newCount >= (newCapacity >> 1)) { - newCapacity <<= 1; - } - auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); - if (raw == nullptr) { - // Allocation failed - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - return nullptr; - } - - auto newHash = new (raw) ImplicitProducerHash; - newHash->capacity = static_cast(newCapacity); - newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); - for (size_t i = 0; i != newCapacity; ++i) { - new (newHash->entries + i) ImplicitProducerKVP; - newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - newHash->prev = mainHash; - implicitProducerHash.store(newHash, std::memory_order_release); - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - mainHash = newHash; - } - else { - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - } - } - - // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table - // to finish being allocated by another thread (and if we just finished allocating above, the condition will - // always be true) - if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { - auto producer = static_cast(recycle_or_create_producer(false)); - if (producer == nullptr) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - return nullptr; - } - + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && + !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). 
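+                // The capacity check is repeated after the reload because another thread
+                // may already have grown the table before we won the resize flag.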
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + size_t newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + + std::alignment_of::value - 1 + + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast( + details::align_for(raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + auto producer = static_cast(recycle_or_create_producer(false)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; - producer->threadExitListener.userData = producer; - details::ThreadExitNotifier::subscribe(&producer->threadExitListener); -#endif - - auto index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; + producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); // already counted as a used slot - mainHash->entries[index].value = producer; - break; - } -#endif - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - mainHash->entries[index].value = producer; - break; - } - ++index; - } - return producer; - } - - // Hmm, the old hash is quite full and somebody else is busy allocating a new one. - // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, - // we try to allocate ourselves). 
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - } - } - + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong( + reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + implicitProducerHashCount.fetch_sub( + 1, std::memory_order_relaxed); // already counted as a used slot + mainHash->entries[index].value = producer; + break; + } +#endif + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - void implicit_producer_thread_exited(ImplicitProducer* producer) - { - // Remove from hash + void implicit_producer_thread_exited(ImplicitProducer* producer) + { + // Remove from hash #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - auto hash = implicitProducerHash.load(std::memory_order_acquire); - assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - details::thread_id_t probedKey; - - // We need to traverse all the hashes just in case other threads aren't on the current one yet and are - // trying to add an entry thinking there's a free slot (because they reused a producer) - for (; hash != nullptr; hash = hash->prev) { - auto index = hashedId; - do { - index &= hash->capacity - 1u; - probedKey = id; - if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) { - break; - } - ++index; - } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place - } - - // Mark the queue as being recyclable - producer->inactive.store(true, std::memory_order_release); - } - - static void implicit_producer_thread_exited_callback(void* userData) - { - auto producer = static_cast(userData); - auto queue = producer->parent; - queue->implicit_producer_thread_exited(producer); - } -#endif - - ////////////////////////////////// - // Utility functions - ////////////////////////////////// - - template - static inline void* aligned_malloc(size_t size) - { - MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) - return (Traits::malloc)(size); - else { - size_t alignment = std::alignment_of::value; - void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); - if (!raw) - return nullptr; - char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); - *(reinterpret_cast(ptr) - 1) = raw; - return ptr; - } - } - - template - static inline void aligned_free(void* ptr) - { - MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) - return (Traits::free)(ptr); - else - (Traits::free)(ptr ? 
*(reinterpret_cast(ptr) - 1) : nullptr); - } - - template - static inline U* create_array(size_t count) - { - assert(count > 0); - U* p = static_cast(aligned_malloc(sizeof(U) * count)); - if (p == nullptr) - return nullptr; - - for (size_t i = 0; i != count; ++i) - new (p + i) U(); - return p; - } - - template - static inline void destroy_array(U* p, size_t count) - { - if (p != nullptr) { - assert(count > 0); - for (size_t i = count; i != 0; ) - (p + --i)->~U(); - } - aligned_free(p); - } - - template - static inline U* create() - { - void* p = aligned_malloc(sizeof(U)); - return p != nullptr ? new (p) U : nullptr; - } - - template - static inline U* create(A1&& a1) - { - void* p = aligned_malloc(sizeof(U)); - return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; - } - - template - static inline void destroy(U* p) - { - if (p != nullptr) - p->~U(); - aligned_free(p); - } + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert( + hash != + nullptr); // The thread exit listener is only registered if we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on the current one yet and are + // trying to add an entry thinking there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1u; + probedKey = id; + if (hash->entries[index].key.compare_exchange_strong( + probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) { + break; + } + ++index; + } while (probedKey != + details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it + // yet, or if we weren't added to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void* userData) + { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline void* aligned_malloc(size_t size) + { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= std::alignment_of::value) + return (Traits::malloc)(size); + else + { + size_t alignment = std::alignment_of::value; + void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); + if (!raw) + return nullptr; + char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); + *(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + } + + template + static inline void aligned_free(void* ptr) + { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= std::alignment_of::value) + return (Traits::free)(ptr); + else(Traits::free)(ptr ? 
*(reinterpret_cast(ptr) - 1) : nullptr); + } + + template + static inline U* create_array(size_t count) + { + assert(count > 0); + U* p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) + return nullptr; + + for (size_t i = 0; i != count; ++i) + new (p + i) U(); + return p; + } + + template + static inline void destroy_array(U* p, size_t count) + { + if (p != nullptr) { + assert(count > 0); + for (size_t i = count; i != 0;) + (p + --i)->~U(); + } + aligned_free(p); + } + + template + static inline U* create() + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U* create(A1&& a1) + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) + p->~U(); + aligned_free(p); + } private: - std::atomic producerListTail; - std::atomic producerCount; - - std::atomic initialBlockPoolIndex; - Block* initialBlockPool; - size_t initialBlockPoolSize; - + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block* initialBlockPool; + size_t initialBlockPoolSize; + #ifndef MCDBGQ_USEDEBUGFREELIST - FreeList freeList; + FreeList freeList; #else - debug::DebugFreeList freeList; -#endif - - std::atomic implicitProducerHash; - std::atomic implicitProducerHashCount; // Number of slots logically used - ImplicitProducerHash initialImplicitProducerHash; - std::array initialImplicitProducerHashEntries; - std::atomic_flag implicitProducerHashResizeInProgress; - - std::atomic nextExplicitConsumerId; - std::atomic globalExplicitConsumerOffset; - + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugMutex implicitProdMutex; + debug::DebugMutex implicitProdMutex; #endif - + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - std::atomic explicitProducers; - std::atomic implicitProducers; + std::atomic explicitProducers; + std::atomic implicitProducers; #endif }; - -template -ProducerToken::ProducerToken(ConcurrentQueue& queue) - : producer(queue.recycle_or_create_producer(true)) +template +ProducerToken::ProducerToken(ConcurrentQueue& queue): producer(queue.recycle_or_create_producer(true)) { - if (producer != nullptr) { - producer->token = this; - } + if (producer != nullptr) { + producer->token = this; + } } -template +template ProducerToken::ProducerToken(BlockingConcurrentQueue& queue) - : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) + : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) { - if (producer != nullptr) { - producer->token = this; - } + if (producer != nullptr) { + producer->token = this; + } } -template +template ConsumerToken::ConsumerToken(ConcurrentQueue& queue) - : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) { - initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); - lastKnownGlobalOffset = static_cast(-1); + initialOffset = queue.nextExplicitConsumerId.fetch_add(1, 
std::memory_order_release);
-    lastKnownGlobalOffset = static_cast(-1);
+    lastKnownGlobalOffset = static_cast(-1);
 }

-template
+template
 ConsumerToken::ConsumerToken(BlockingConcurrentQueue& queue)
-    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
 {
-    initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
-    lastKnownGlobalOffset = static_cast(-1);
+    initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(
+        1, std::memory_order_release);
+    lastKnownGlobalOffset = static_cast(-1);
 }

-template
+template
 inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) MOODYCAMEL_NOEXCEPT
 {
-    a.swap(b);
+    a.swap(b);
 }

 inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
 {
-    a.swap(b);
+    a.swap(b);
 }

 inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
 {
-    a.swap(b);
+    a.swap(b);
 }

-template
-inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
+template
+inline void swap(
+    typename ConcurrentQueue::ImplicitProducerKVP& a,
+    typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
 {
-    a.swap(b);
+    a.swap(b);
 }
-}
+}  // namespace moodycamel

 #if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
 #pragma warning(pop)
diff --git a/scripts/lint.sh b/scripts/lint.sh
new file mode 100755
index 000000000..273d70043
--- /dev/null
+++ b/scripts/lint.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -euxo pipefail
+
+black .
+pflake8 .
+mypy --install-types .
+
+# disable tracing -- too verbose
+set +x
+# find all .h, .hpp, and .cpp files, excluding third-party files
+files=$(find ./scaler -path './scaler/io/ymq/third_party' -prune -o \( -name '*.cpp' -o -name '*.h' -o -name '*.hpp' \) -print0 | xargs -0)
+set -x
+
+echo "running clang format on $(wc -w <<< $files) files"
+clang-format -i -style file -- $files
diff --git a/tests/cc_ymq/common.h b/tests/cc_ymq/common.h
index 42dfab9fe..bafe5d907 100644
--- a/tests/cc_ymq/common.h
+++ b/tests/cc_ymq/common.h
@@ -30,10 +30,11 @@
 #include
 #include
 #include
-#include
+#include
+#include
+#include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/tests/cc_ymq/py_mitm/core.py b/tests/cc_ymq/py_mitm/core.py
new file mode 100644
index 000000000..4a22ee01a
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/core.py
@@ -0,0 +1,54 @@
+"""
+This is the common code for implementing a man in the middle (MITM) in Python
+"""
+
+import dataclasses
+from typing import Protocol
+from scapy.all import TunTapInterface, IP, TCP  # type: ignore
+
+
+@dataclasses.dataclass
+class TCPConnection:
+    """
+    Represents a TCP connection over the TUNTAP interface
+    local_ip and local_port are the mitm's ip and port, and
+    remote_ip and remote_port are the ip and port of the remote peer
+    """
+
+    local_ip: str
+    local_port: int
+    remote_ip: str
+    remote_port: int
+
+    def rewrite(self, pkt: IP, ack: int | None = None, data=None):
+        """
+        Rewrite a TCP/IP packet as a packet originating
+        from (local_ip, local_port) and going to (remote_ip, remote_port)
+        This function is useful for taking a packet received from one connection and redirecting it to another
+
+        Args:
+            pkt: A scapy TCP/IP packet to rewrite
+            ack: An optional ack number to use instead of the one found in `pkt`
+            data: An optional payload to use instead of the one found in `pkt`
+
+        Returns:
+            The rewritten packet, suitable for sending over TUNTAP
+        """
+        tcp = pkt[TCP]
+
+        return (
+            IP(src=self.local_ip, dst=self.remote_ip)
+            / TCP(sport=self.local_port, dport=self.remote_port, flags=tcp.flags, seq=tcp.seq, ack=ack or tcp.ack)
+            / bytes(data or tcp.payload)
+        )
+
+
+class MITMProtocol(Protocol):
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None: ...
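For orientation before the concrete test cases: the simplest possible MITMProtocol implementation forwards every packet to the other side unchanged, using TCPConnection.rewrite to swap the addresses so each peer keeps seeing the MITM as its counterpart. The following is a minimal sketch, assuming core.py above is importable; it is illustrative commentary, not a file added by this patch (the passthrough.py listed in the diffstats plays this role in the real series):

# passthrough_sketch.py -- illustrative only, not part of the patch
from core import IP, MITMProtocol, TCPConnection, TunTapInterface


class MITM(MITMProtocol):
    def proxy(
        self,
        tuntap: TunTapInterface,
        pkt: IP,
        sender: TCPConnection,
        client_conn: TCPConnection | None,
        server_conn: TCPConnection,
    ) -> None:
        # Forward client packets to the server, and server packets to the client.
        if sender == client_conn:
            tuntap.send(server_conn.rewrite(pkt))
        elif sender == server_conn and client_conn is not None:
            tuntap.send(client_conn.rewrite(pkt))

The test cases that follow are variations on this shape: drop some packets, or inject a packet of their own, before (or instead of) forwarding.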
diff --git a/tests/cc_ymq/py_mitm/drop.py b/tests/cc_ymq/py_mitm/drop.py
new file mode 100644
index 000000000..520e5bf70
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/drop.py
@@ -0,0 +1,28 @@
+"""
+This MITM drops a percentage of packets
+"""
+
+import random
+from core import MITMProtocol, TunTapInterface, IP, TCPConnection
+
+
+class MITM(MITMProtocol):
+    def __init__(self, drop_pcent: str):
+        self.drop_pcent = float(drop_pcent)
+
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None:
+        if random.random() < self.drop_pcent:
+            print("[!] Dropping packet")
+            return
+
+        if sender == client_conn:
+            tuntap.send(server_conn.rewrite(pkt))
+        elif sender == server_conn:
+            tuntap.send(client_conn.rewrite(pkt))
diff --git a/tests/cc_ymq/py_mitm/rst.py b/tests/cc_ymq/py_mitm/rst.py
new file mode 100644
index 000000000..be301d5b0
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/rst.py
@@ -0,0 +1,48 @@
+"""
+This MITM inserts an unexpected TCP RST
+"""
+
+from core import IP, TCP, MITMProtocol, TCPConnection, TunTapInterface
+
+
+class MITM(MITMProtocol):
+    def __init__(self):
+        # count the number of psh-acks sent by the client
+        self.client_pshack_counter = 0
+
+    def proxy(
+        self,
+        tuntap: TunTapInterface,
+        pkt: IP,
+        sender: TCPConnection,
+        client_conn: TCPConnection | None,
+        server_conn: TCPConnection,
+    ) -> None:
+        if sender == client_conn or client_conn is None:
+            if pkt[TCP].flags == "PA":
+                self.client_pshack_counter += 1
+
+                # on the second psh-ack, send a rst instead
+                if self.client_pshack_counter == 2:
+                    rst_pkt = IP(src=client_conn.local_ip, dst=client_conn.remote_ip) / TCP(
+                        sport=client_conn.local_port, dport=client_conn.remote_port, flags="R", seq=pkt[TCP].ack
+                    )
+                    print(f"<- [{rst_pkt[TCP].flags}] (simulated)")
+                    tuntap.send(rst_pkt)
+                    return
+
+            tuntap.send(server_conn.rewrite(pkt))
+        elif sender == server_conn:
+            tuntap.send(client_conn.rewrite(pkt))
+
+
+# client -> mitm -> server
+# server -> mitm -> client
+
+# client: 127.0.0.1:8080
+# mitm:   127.0.0.1:8081
+# server: 127.0.0.1:8081
+
+
+# client -> mitm == src = client.ip, sport = client.port ;; dst = mitm.ip, dport = mitm.port
+# mitm -> server == src = mitm.ip, sport = mitm.port ;; dst = server.ip, dport = server.port
diff --git a/tests/cc_ymq/py_mitm/runner.py b/tests/cc_ymq/py_mitm/runner.py
new file mode 100644
index 000000000..f56ccb9f9
--- /dev/null
+++ b/tests/cc_ymq/py_mitm/runner.py
@@ -0,0 +1,157 @@
+# flake8: noqa: E402
+
+"""
+This script provides a framework for running MITM test cases
+
+This script accepts the following arguments, in order:
+    1. pid: the pid of the test process, used for signaling
+    2. testcase: the MITM test case. \
+       this loads `from .testcase import MITM` where `MITM` is a class implementing `MITMProtocol`
+    3. mitm_ip: an ipv4 address for the mitm server
+    4. mitm_port: the port used to connect to the remote server
+    5. remote_ip: the desired ip of the remote side of the TUNTAP interface
+    6. server_port: the port of the remote server
+    7. *args: Additional args, if any, are passed to the constructor: `MITM(*args)`
+
+See the documentation on `main` for more
+"""
+import os
+import sys
+
+# add the script's directory to path
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+
+import importlib
+import signal
+import subprocess
+
+from core import MITMProtocol, TCPConnection
+from scapy.all import IP, TCP, TunTapInterface  # type: ignore
+
+
+def echo_call(cmd: list[str]):
+    print(f"+ {' '.join(cmd)}")
+    subprocess.check_call(cmd)
+
+
+def create_tuntap_interface(iface_name: str, mitm_ip: str, remote_ip: str) -> TunTapInterface:
+    """
+    Creates a TUNTAP interface, brings it up, and assigns IPs using the `ip` program
+
+    Args:
+        iface_name: The name of the TUNTAP interface, usually like `tun0`, `tun1`, etc.
+        mitm_ip: The desired ip address of the mitm. This is the ip that clients can use to connect to the mitm
+        remote_ip: The ip that routes to/from the tuntap interface.
+            packets sent to `mitm_ip` will appear to come from `remote_ip`,\
+            and conversely the tuntap interface can connect/send packets
+            to `remote_ip`, making it a suitable ip for binding a server
+
+    Returns:
+        The TUNTAP interface
+    """
+    iface = TunTapInterface(iface_name, mode="tun")
+
+    try:
+        echo_call(["sudo", "ip", "link", "set", iface_name, "up"])
+        echo_call(["sudo", "ip", "addr", "add", remote_ip, "peer", mitm_ip, "dev", iface_name])
+        print(f"[+] Interface {iface_name} up with IP {mitm_ip}")
+    except subprocess.CalledProcessError:
+        print("[!] Could not bring up interface. Run as root or set manually.")
+        raise
+
+    return iface
+
+
+def main(pid: int, mitm_ip: str, mitm_port: int, remote_ip: str, server_port: int, mitm: MITMProtocol):
+    """
+    This function serves as a framework for man-in-the-middle implementations
+    A client connects to the MITM, then the MITM connects to a remote server
+    The MITM sits in between the client and the server, manipulating the packets sent depending on the test case
+    This function:
+    1. creates a TUNTAP interface and prepares it for MITM
+    2. handles connecting clients and connection closes
+    3. delegates additional logic to a pluggable callable, `mitm`
+    4. returns when both connections have terminated (via their FIN-ACK/ACK close sequences)
+
+    Args:
+        pid: this is the pid of the test process, used for signaling readiness; \
+            we send SIGUSR1 to this process when the mitm is ready
+        mitm_ip: The desired ip address of the mitm server
+        mitm_port: The desired port of the mitm server. \
+            This is the port used to connect to the server, but the client is free to connect on any port
+        remote_ip: The desired remote ip for the TUNTAP interface. This is the only ip address \
+            reachable by the interface and is thus the src ip for clients, and the ip that the remote server \
+            must be bound to
+        server_port: The port that the remote server is bound to
+        mitm: The core logic for a MITM test case. This callable may maintain its own state and is responsible \
+            for sending packets over the TUNTAP interface (if it doesn't, nothing will happen)
+    """
+
+    tuntap = create_tuntap_interface("tun0", mitm_ip, remote_ip)
+
+    # signal the caller that the tuntap interface has been created
+    if pid > 0:
+        os.kill(pid, signal.SIGUSR1)
+
+    # these track information about our connections
+    # we already know what to expect for the server connection, since we are the connector
+    client_conn = None
+    server_conn = TCPConnection(mitm_ip, mitm_port, remote_ip, server_port)
+
+    # tracks the state of each connection
+    client_sent_fin_ack = False
+    client_closed = False
+    server_sent_fin_ack = False
+    server_closed = False
+
+    while True:
+        pkt = tuntap.recv()
+        if not pkt.haslayer(IP) or not pkt.haslayer(TCP):
+            continue
+        ip = pkt[IP]
+        tcp = pkt[TCP]
+
+        # for a received packet, the destination ip and port are our local ip and port
+        # and the source ip and port will be the remote ip and port
+        sender = TCPConnection(pkt.dst, pkt.dport, pkt.src, pkt.sport)
+
+        if sender == client_conn:
+            print(f"-> [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}")
+        elif sender == server_conn:
+            print(f"<- [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}")
+
+        if tcp.flags == "S":  # SYN from client
+            print("-> [S]")
+            print(f"[*] New connection from {ip.src}:{tcp.sport} to {ip.dst}:{tcp.dport}")
+            client_conn = sender
+
+        if tcp.flags == "SA":  # SYN-ACK from server
+            if sender == server_conn:
+                print(f"[*] Connection to server established: {ip.src}:{tcp.sport} to {ip.dst}:{tcp.dport}")
+
+        if tcp.flags == "FA":  # FIN-ACK
+            if sender == client_conn:
+                client_sent_fin_ack = True
+            if sender == server_conn:
+                server_sent_fin_ack = True
+
+        if tcp.flags == "A":  # ACK
+            if sender == client_conn and server_sent_fin_ack:
+                server_closed = True
+            if sender == server_conn and client_sent_fin_ack:
+                client_closed = True
+
+        mitm.proxy(tuntap, pkt, sender, client_conn, server_conn)
+
+        if client_closed and server_closed:
+            print("[*] Both connections closed")
+            return
+
+
+if __name__ == "__main__":
+    # parse the ips, ports, and test case from the command line
+    pid, testcase, mitm_ip, mitm_port, remote_ip, server_port, *args = sys.argv[1:]
+
+    # load the module dynamically
+    module = importlib.import_module(testcase)
+    main(int(pid), mitm_ip, int(mitm_port), remote_ip, int(server_port), module.MITM(*args))
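To make the calling convention concrete, here is a hedged sketch of how a test process might launch this runner for the packet-dropping case and block until the SIGUSR1 readiness signal arrives. The paths, addresses, ports, and the 0.25 drop rate are all invented for illustration; the actual tests wire this up themselves:

# launch_runner_sketch.py -- illustrative only; all values are invented
import os
import signal
import subprocess

mitm_ready = False


def _on_ready(signum, frame):
    global mitm_ready
    mitm_ready = True


signal.signal(signal.SIGUSR1, _on_ready)

# argv order: pid, testcase, mitm_ip, mitm_port, remote_ip, server_port, *args.
# "drop" is resolved via importlib, and "0.25" is forwarded to MITM("0.25").
proc = subprocess.Popen(
    ["sudo", "python", "runner.py", str(os.getpid()),
     "drop", "10.0.0.1", "8080", "10.0.0.2", "9000", "0.25"]
)

while not mitm_ready:
    signal.pause()  # woken once runner.py has brought tun0 up

# ... bind the server under test to 10.0.0.2:9000, point the client at
# 10.0.0.1:8080, run the scenario ...
proc.wait()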
diff --git a/tests/pymod_ymq/config.py b/tests/pymod_ymq/config.py
new file mode 100644
index 000000000..e3b138516
--- /dev/null
+++ b/tests/pymod_ymq/config.py
@@ -0,0 +1,13 @@
+__all__ = ["ymq"]
+
+import sys
+import os
+
+file_path = os.path.realpath(__file__)
+joined_path = os.path.join(file_path, "..", "..", "..", "scaler", "io", "ymq")
+normed_path = os.path.normpath(joined_path)
+
+sys.path.append(normed_path)
+import ymq  # noqa: E402
+
+sys.path.pop()
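This shim lets the pymod tests import the compiled ymq extension straight out of the source tree: it pushes scaler/io/ymq onto sys.path just long enough to resolve the import, then pops that entry again so later imports cannot accidentally resolve against the extension's source directory. A test module would consume it along these lines (hypothetical usage; the actual test modules are not shown in this excerpt):

# test_sketch.py -- hypothetical consumer of the config shim
from config import ymq  # re-exported by config.py via __all__


def test_extension_importable():
    # If the extension was built, the import above has already succeeded;
    # the assertion just makes the dependency explicit.
    assert ymq is not None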
From c468afc637a26294eb3f638e56876b9b3af0166a Mon Sep 17 00:00:00 2001
From: magniloquency <197707854+magniloquency@users.noreply.github.com>
Date: Mon, 15 Sep 2025 21:53:44 -0400
Subject: [PATCH 3/3] Add more tests

Signed-off-by: magniloquency <197707854+magniloquency@users.noreply.github.com>
---
 pyproject.toml                                |    1 +
 scaler/io/ymq/message_connection_tcp.cpp      |    2 +-
 scaler/io/ymq/tests/incomplete_identity.h     |   53 -
 scaler/io/ymq/third_party/concurrentqueue.h   | 7130 ++++++++---------
 scripts/lint.sh                               |   15 -
 tests/cc_ymq/common.h                         |    5 +-
 tests/cc_ymq/py_mitm/core.py                  |   54 -
 tests/cc_ymq/py_mitm/drop.py                  |   28 -
 tests/cc_ymq/py_mitm/main.py                  |    5 +-
 tests/cc_ymq/py_mitm/passthrough.py           |    5 +-
 tests/cc_ymq/py_mitm/randomly_drop_packets.py |    5 +-
 tests/cc_ymq/py_mitm/rst.py                   |   48 -
 tests/cc_ymq/py_mitm/runner.py                |  157 -
 tests/cc_ymq/py_mitm/send_rst_to_client.py    |    5 +-
 tests/cc_ymq/py_mitm/types.py                 |    2 +-
 15 files changed, 3416 insertions(+), 4099 deletions(-)
 delete mode 100644 scaler/io/ymq/tests/incomplete_identity.h
 delete mode 100755 scripts/lint.sh
 delete mode 100644 tests/cc_ymq/py_mitm/core.py
 delete mode 100644 tests/cc_ymq/py_mitm/drop.py
 delete mode 100644 tests/cc_ymq/py_mitm/rst.py
 delete mode 100644 tests/cc_ymq/py_mitm/runner.py

diff --git a/pyproject.toml b/pyproject.toml
index 9fb667b61..a5aef39a9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dev = [
     "flake8>=7.3.0",
     "flake8-pyproject>=1.2.3",
     "mypy>=1.17.1",
+    "scapy==2.*",
 ]

 [tool.scikit-build.metadata.version]
diff --git a/scaler/io/ymq/message_connection_tcp.cpp b/scaler/io/ymq/message_connection_tcp.cpp
index df2338292..8212238f1 100644
--- a/scaler/io/ymq/message_connection_tcp.cpp
+++ b/scaler/io/ymq/message_connection_tcp.cpp
@@ -267,7 +267,7 @@ void MessageConnectionTCP::updateReadOperation()
         _receivedReadOperations.pop();
         auto recvMessageCallback = std::move(_pendingRecvMessageCallbacks->front());
         _pendingRecvMessageCallbacks->pop();
-
+
         recvMessageCallback({Message(std::move(address), std::move(payload)), {}});
     } else {
         assert(_pendingRecvMessageCallbacks->size());
diff --git a/scaler/io/ymq/tests/incomplete_identity.h b/scaler/io/ymq/tests/incomplete_identity.h
deleted file mode 100644
index e8ea4f1ed..000000000
--- a/scaler/io/ymq/tests/incomplete_identity.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#pragma once
-
-#include
-#include
-
-#include "scaler/io/ymq/examples/common.h"
-#include "scaler/io/ymq/io_context.h"
-#include "tests/cc_ymq/common.h"
-
-void incomplete_identity_server_main()
-{
-    IOContext context(1);
-
-    auto socket = syncCreateSocket(context, IOSocketType::Binder, "server");
-    syncBindSocket(socket, "tcp://127.0.0.1:25715");
-    auto result = syncRecvMessage(socket);
-
-    assert(result.has_value());
-    assert(result->payload.as_string() == "yi er san si wu liu");
-
-    context.removeIOSocket(socket);
-}
-
-void incomplete_identity_client_main()
-{
-    // open a socket, write an incomplete identity and exit
-    {
-        TcpSocket socket;
-
-        socket.connect("127.0.0.1", 25715);
-
-        auto remote_identity = socket.read_message();
-        assert(remote_identity == "server");
-
-        // write incomplete identity and exit
-        std::string identity = "client";
-        uint64_t header = identity.length();
-        socket.write_all((char*)&header, 8);
-        socket.write_all(identity.data(), identity.length() - 2);
-        std::this_thread::sleep_for(3s);
-    }
-
-    // connect again and try to send a message
-    {
-        TcpSocket socket;
-        socket.connect("127.0.0.1", 25715);
-        auto remote_identity = socket.read_message();
-        assert(remote_identity == "server");
-        socket.write_message("client");
-        socket.write_message("yi er san si wu liu");
-        std::this_thread::sleep_for(3s);
-    }
-}
diff --git a/scaler/io/ymq/third_party/concurrentqueue.h b/scaler/io/ymq/third_party/concurrentqueue.h
index d5498b116..2fc775400 100644
--- a/scaler/io/ymq/third_party/concurrentqueue.h
+++ b/scaler/io/ymq/third_party/concurrentqueue.h
@@ -47,7 +47,7 @@
 // VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher
 // does not support `if constexpr`, so we have no choice but to simply disable the warning
 #pragma warning(push)
-#pragma warning(disable : 4127) // conditional expression
is constant +#pragma warning(disable: 4127) // conditional expression is constant #endif #if defined(__APPLE__) @@ -64,114 +64,81 @@ #undef malloc #undef free #else -#include // Requires C++11. Sorry VS2010. +#include // Requires C++11. Sorry VS2010. #include #endif -#include -#include -#include // for CHAR_BIT -#include // for max_align_t +#include // for max_align_t #include #include -#include -#include // used for thread exit synchronization -#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading #include +#include #include +#include +#include // for CHAR_BIT +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading +#include // used for thread exit synchronization // Platform-specific definitions of a numeric thread ID type and an invalid value -namespace moodycamel { -namespace details { -template -struct thread_id_converter { - typedef thread_id_t thread_id_numeric_size_t; - typedef thread_id_t thread_id_hash_t; - static thread_id_hash_t prehash(thread_id_t const& x) { return x; } -}; -} // namespace details -} // namespace moodycamel +namespace moodycamel { namespace details { + template struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } + }; +} } #if defined(MCDBGQ_USE_RELACY) -namespace moodycamel { -namespace details { -typedef std::uint32_t thread_id_t; -static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; -static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; -static inline thread_id_t thread_id() -{ - return rl::thread_index(); -} -} // namespace details -} // namespace moodycamel +namespace moodycamel { namespace details { + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; + static inline thread_id_t thread_id() { return rl::thread_index(); } +} } #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) // No sense pulling in windows.h in a header, we'll manually declare the function // we use and rely on backwards-compatibility for this not to break extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); -namespace moodycamel { -namespace details { -static_assert( - sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); -typedef std::uint32_t thread_id_t; -static const thread_id_t invalid_thread_id = - 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx -static const thread_id_t invalid_thread_id2 = - 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread - // IDs are presently multiples of 4. 
-static inline thread_id_t thread_id() -{ - return static_cast(::GetCurrentThreadId()); -} -} // namespace details -} // namespace moodycamel -#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || \ - defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL) -namespace moodycamel { -namespace details { -static_assert( - sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, - "std::thread::id is expected to be either 4 or 8 bytes"); - -typedef std::thread::id thread_id_t; -static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID - -// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's -// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't -// be. -static inline thread_id_t thread_id() -{ - return std::this_thread::get_id(); -} - -template -struct thread_id_size {}; -template <> -struct thread_id_size<4> { - typedef std::uint32_t numeric_t; -}; -template <> -struct thread_id_size<8> { - typedef std::uint64_t numeric_t; -}; - -template <> -struct thread_id_converter { - typedef thread_id_size::numeric_t thread_id_numeric_size_t; +namespace moodycamel { namespace details { + static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. + static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } +} } +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL) +namespace moodycamel { namespace details { + static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); + + typedef std::thread::id thread_id_t; + static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + + // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's + // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't + // be. 
+ static inline thread_id_t thread_id() { return std::this_thread::get_id(); } + + template struct thread_id_size { }; + template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; + template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; + + template<> struct thread_id_converter { + typedef thread_id_size::numeric_t thread_id_numeric_size_t; #ifndef __APPLE__ - typedef std::size_t thread_id_hash_t; + typedef std::size_t thread_id_hash_t; #else - typedef thread_id_numeric_size_t thread_id_hash_t; + typedef thread_id_numeric_size_t thread_id_hash_t; #endif - static thread_id_hash_t prehash(thread_id_t const& x) - { + static thread_id_hash_t prehash(thread_id_t const& x) + { #ifndef __APPLE__ - return std::hash()(x); + return std::hash()(x); #else - return *reinterpret_cast(&x); + return *reinterpret_cast(&x); #endif - } -}; -} -} + } + }; +} } #else // Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 // In order to get a numeric thread ID in a platform-independent way, we use a thread-local @@ -184,19 +151,12 @@ struct thread_id_converter { // Assume C++11 compliant compiler #define MOODYCAMEL_THREADLOCAL thread_local #endif -namespace moodycamel { -namespace details { -typedef std::uintptr_t thread_id_t; -static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr -static const thread_id_t invalid_thread_id2 = - 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. -inline thread_id_t thread_id() -{ - static MOODYCAMEL_THREADLOCAL int x; - return reinterpret_cast(&x); -} -} -} +namespace moodycamel { namespace details { + typedef std::uintptr_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr + static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. + inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } +} } #endif // Constexpr if @@ -212,19 +172,18 @@ inline thread_id_t thread_id() // Exceptions #ifndef MOODYCAMEL_EXCEPTIONS_ENABLED -#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || \ - (!defined(_MSC_VER) && !defined(__GNUC__)) +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) #define MOODYCAMEL_EXCEPTIONS_ENABLED #endif #endif #ifdef MOODYCAMEL_EXCEPTIONS_ENABLED -#define MOODYCAMEL_TRY try -#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) -#define MOODYCAMEL_RETHROW throw -#define MOODYCAMEL_THROW(expr) throw(expr) +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw (expr) #else -#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true) -#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false) +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) +#define MOODYCAMEL_CATCH(...) 
else MOODYCAMEL_CONSTEXPR_IF (false) #define MOODYCAMEL_RETHROW #define MOODYCAMEL_THROW(expr) #endif @@ -232,35 +191,21 @@ inline thread_id_t thread_id() #ifndef MOODYCAMEL_NOEXCEPT #if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) #define MOODYCAMEL_NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 // VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( // We have to assume *all* non-trivial constructors may throw on VS2012! #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ - (std::is_rvalue_reference::value && std::is_move_constructible::value ? \ - std::is_trivially_move_constructible::value : \ - std::is_trivially_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ - ((std::is_rvalue_reference::value && std::is_move_assignable::value ? \ - std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : \ - std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && \ - MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ - (std::is_rvalue_reference::value && std::is_move_constructible::value ? \ - std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : \ - std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ - ((std::is_rvalue_reference::value && std::is_move_assignable::value ? \ - std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : \ - std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && \ - MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? 
std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #else -#define MOODYCAMEL_NOEXCEPT noexcept -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) #endif #endif @@ -269,24 +214,18 @@ inline thread_id_t thread_id() #ifdef MCDBGQ_USE_RELACY #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #else -// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: -// http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't support thread_local either. Finally, iOS/ARM doesn't -// have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work -#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ - (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && \ - (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ - (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && \ - !defined(__MVS__) +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 +// g++ <=4.7 doesn't support thread_local either. +// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) // Assume `thread_local` is fully supported in all other C++11 compilers/platforms -#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having - // problems with it on +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having problems with it on #endif #endif #endif -// VS2012 doesn't support deleted functions. -// In this case, we declare the function normally but don't define it. A link error will be generated if the function is -// called. +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. 
#ifndef MOODYCAMEL_DELETE_FUNCTION #if defined(_MSC_VER) && _MSC_VER < 1800 #define MOODYCAMEL_DELETE_FUNCTION @@ -295,100 +234,54 @@ inline thread_id_t thread_id() #endif #endif -namespace moodycamel { -namespace details { +namespace moodycamel { namespace details { #ifndef MOODYCAMEL_ALIGNAS // VS2013 doesn't support alignas or alignof, and align() requires a constant literal #if defined(_MSC_VER) && _MSC_VER <= 1800 -#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) -#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) +#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) +#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) #define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned::value, T>::type -template -struct Vs2013Aligned {}; // default, unsupported alignment -template -struct Vs2013Aligned<1, T> { - typedef __declspec(align(1)) T type; -}; -template -struct Vs2013Aligned<2, T> { - typedef __declspec(align(2)) T type; -}; -template -struct Vs2013Aligned<4, T> { - typedef __declspec(align(4)) T type; -}; -template -struct Vs2013Aligned<8, T> { - typedef __declspec(align(8)) T type; -}; -template -struct Vs2013Aligned<16, T> { - typedef __declspec(align(16)) T type; -}; -template -struct Vs2013Aligned<32, T> { - typedef __declspec(align(32)) T type; -}; -template -struct Vs2013Aligned<64, T> { - typedef __declspec(align(64)) T type; -}; -template -struct Vs2013Aligned<128, T> { - typedef __declspec(align(128)) T type; -}; -template -struct Vs2013Aligned<256, T> { - typedef __declspec(align(256)) T type; -}; + template struct Vs2013Aligned { }; // default, unsupported alignment + template struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; + template struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; + template struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; + template struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; + template struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; + template struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; }; + template struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; + template struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; + template struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; #else -template -struct identity { - typedef T type; -}; -#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) -#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) + template struct identity { typedef T type; }; +#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) +#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) #define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity::type #endif #endif -} // namespace details -} // namespace moodycamel +} } + // TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one, // we can apply per-function compile-time suppression. 
// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer #define MOODYCAMEL_NO_TSAN #if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#undef MOODYCAMEL_NO_TSAN -#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) -#endif // TSAN -#endif // TSAN + #if __has_feature(thread_sanitizer) + #undef MOODYCAMEL_NO_TSAN + #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) + #endif // TSAN +#endif // TSAN // Compiler-specific likely/unlikely hints -namespace moodycamel { -namespace details { +namespace moodycamel { namespace details { #if defined(__GNUC__) -static inline bool(likely)(bool x) -{ - return __builtin_expect((x), true); -} -static inline bool(unlikely)(bool x) -{ - return __builtin_expect((x), false); -} + static inline bool (likely)(bool x) { return __builtin_expect((x), true); } + static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } #else -static inline bool(likely)(bool x) -{ - return x; -} -static inline bool(unlikely)(bool x) -{ - return x; -} + static inline bool (likely)(bool x) { return x; } + static inline bool (unlikely)(bool x) { return x; } #endif -} // namespace details -} // namespace moodycamel +} } #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG #include "internal/concurrentqueue_internal_debug.h" @@ -396,28 +289,28 @@ static inline bool(unlikely)(bool x) namespace moodycamel { namespace details { -template -struct const_numeric_max { - static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); - static const T value = std::numeric_limits::is_signed ? - (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) : - static_cast(-1); -}; + template + struct const_numeric_max { + static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits::is_signed + ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) + : static_cast(-1); + }; #if defined(__GLIBCXX__) -typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while #else -typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: #endif -// Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting -// 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. -typedef union { - std_max_align_t x; - long long y; - void* z; -} max_align_t; -} // namespace details + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. + typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} // Default traits for the ConcurrentQueue. To change some of the // traits without re-implementing all of them, inherit from this @@ -425,96 +318,99 @@ typedef union { // since the traits are used as a template type parameter, the // shadowed declarations will be used where defined, and the defaults // otherwise. -struct ConcurrentQueueDefaultTraits { - // General-purpose size type. std::size_t is strongly recommended. 
- typedef std::size_t size_t; - - // The type used for the enqueue and dequeue indices. Must be at least as - // large as size_t. Should be significantly larger than the number of elements - // you expect to hold at once, especially if you have a high turnover rate; - // for example, on 32-bit x86, if you expect to have over a hundred million - // elements or pump several million elements through your queue in a very - // short space of time, using a 32-bit type *may* trigger a race condition. - // A 64-bit int type is recommended in that case, and in practice will - // prevent a race condition no matter the usage of the queue. Note that - // whether the queue is lock-free with a 64-int type depends on the whether - // std::atomic is lock-free, which is platform-specific. - typedef std::size_t index_t; - - // Internally, all elements are enqueued and dequeued from multi-element - // blocks; this is the smallest controllable unit. If you expect few elements - // but many producers, a smaller block size should be favoured. For few producers - // and/or many elements, a larger block size is preferred. A sane default - // is provided. Must be a power of 2. - static const size_t BLOCK_SIZE = 32; - - // For explicit producers (i.e. when using a producer token), the block is - // checked for being empty by iterating through a list of flags, one per element. - // For large block sizes, this is too inefficient, and switching to an atomic - // counter-based approach is faster. The switch is made for block sizes strictly - // larger than this threshold. - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; - - // How many full blocks can be expected for a single explicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; - - // How many full blocks can be expected for a single implicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; - - // The initial size of the hash table mapping thread IDs to implicit producers. - // Note that the hash is resized every time it becomes half full. - // Must be a power of two, and either 0 or at least 1. If 0, implicit production - // (using the enqueue methods without an explicit producer token) is disabled. - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; - - // Controls the number of items that an explicit consumer (i.e. one with a token) - // must consume before it causes all consumers to rotate and move on to the next - // internal queue. - static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; - - // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. - // Enqueue operations that would cause this limit to be surpassed will fail. Note - // that this limit is enforced at the block level (for performance reasons), i.e. - // it's rounded up to the nearest block size. - static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; - - // The number of times to spin before sleeping when waiting on a semaphore. - // Recommended values are on the order of 1000-10000 unless the number of - // consumer threads exceeds the number of idle cores (in which case try 0-100). - // Only affects instances of the BlockingConcurrentQueue. 
- static const int MAX_SEMA_SPINS = 10000; - - // Whether to recycle dynamically-allocated blocks into an internal free list or - // not. If false, only pre-allocated blocks (controlled by the constructor - // arguments) will be recycled, and all others will be `free`d back to the heap. - // Note that blocks consumed by explicit producers are only freed on destruction - // of the queue (not following destruction of the token) regardless of this trait. - static const bool RECYCLE_ALLOCATED_BLOCKS = false; - +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. 
Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try 0-100). + // Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + + // Whether to recycle dynamically-allocated blocks into an internal free list or + // not. If false, only pre-allocated blocks (controlled by the constructor + // arguments) will be recycled, and all others will be `free`d back to the heap. + // Note that blocks consumed by explicit producers are only freed on destruction + // of the queue (not following destruction of the token) regardless of this trait. + static const bool RECYCLE_ALLOCATED_BLOCKS = false; + + #ifndef MCDBGQ_USE_RELACY - // Memory allocation can be customized if needed. - // malloc should return nullptr on failure, and handle alignment like std::malloc. + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. #if defined(malloc) || defined(free) - // Gah, this is 2015, stop defining macros that break standard code already! - // Work around malloc/free being special macros: - static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } - static inline void WORKAROUND_free(void* ptr) { return free(ptr); } - static inline void*(malloc)(size_t size) { return WORKAROUND_malloc(size); } - static inline void(free)(void* ptr) { return WORKAROUND_free(ptr); } + // Gah, this is 2015, stop defining macros that break standard code already! 
+ // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } #else - static inline void* malloc(size_t size) { return std::malloc(size); } - static inline void free(void* ptr) { return std::free(ptr); } + static inline void* malloc(size_t size) { return std::malloc(size); } + static inline void free(void* ptr) { return std::free(ptr); } #endif #else - // Debug versions when running under the Relacy race detector (ignore - // these in user code) - static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } - static inline void free(void* ptr) { return rl::rl_free(ptr, $); } + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void* ptr) { return rl::rl_free(ptr, $); } #endif }; + // When producing or consuming many elements, the most efficient way is to: // 1) Use one of the bulk-operation methods of the queue with a token // 2) Failing that, use the bulk-operation methods without a token @@ -525,3554 +421,3322 @@ struct ConcurrentQueueDefaultTraits { struct ProducerToken; struct ConsumerToken; -template -class ConcurrentQueue; -template -class BlockingConcurrentQueue; +template class ConcurrentQueue; +template class BlockingConcurrentQueue; class ConcurrentQueueTests; -namespace details { -struct ConcurrentQueueProducerTypelessBase { - ConcurrentQueueProducerTypelessBase* next; - std::atomic inactive; - ProducerToken* token; - - ConcurrentQueueProducerTypelessBase(): next(nullptr), inactive(false), token(nullptr) {} -}; - -template -struct _hash_32_or_64 { - static inline std::uint32_t hash(std::uint32_t h) - { - // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp - // Since the thread ID is already unique, all we really want to do is propagate that - // uniqueness evenly across all the bits, so that we can use a subset of the bits while - // reducing collisions significantly - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - return h ^ (h >> 16); - } -}; -template <> -struct _hash_32_or_64<1> { - static inline std::uint64_t hash(std::uint64_t h) - { - h ^= h >> 33; - h *= 0xff51afd7ed558ccd; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53; - return h ^ (h >> 33); - } -}; -template -struct hash_32_or_64: public _hash_32_or_64<(size > 4)> {}; - -static inline size_t hash_thread_id(thread_id_t id) -{ - static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); - return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( - thread_id_converter::prehash(id))); -} - -template -static inline bool circular_less_than(T a, T b) -{ - static_assert( - std::is_integral::value && !std::numeric_limits::is_signed, - "circular_less_than is intended to be used only with unsigned integer types"); - return static_cast(a - b) > static_cast(static_cast(1) << (static_cast(sizeof(T) * CHAR_BIT - 1))); - // Note: extra parens around rhs of operator<< is MSVC bug: - // https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 - // silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no 
effect when - // done here. -} - -template -static inline char* align_for(char* ptr) -{ - const std::size_t alignment = std::alignment_of::value; - return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; -} - -template -static inline T ceil_to_pow_2(T x) -{ - static_assert( - std::is_integral::value && !std::numeric_limits::is_signed, - "ceil_to_pow_2 is intended to be used only with unsigned integer types"); - - // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - for (std::size_t i = 1; i < sizeof(T); i <<= 1) { - x |= x >> (i << 3); - } - ++x; - return x; -} - -template -static inline void swap_relaxed(std::atomic& left, std::atomic& right) -{ - T temp = left.load(std::memory_order_relaxed); - left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); - right.store(temp, std::memory_order_relaxed); -} - -template -static inline T const& nomove(T const& x) -{ - return x; -} - -template -struct nomove_if { - template - static inline T const& eval(T const& x) - { - return x; - } -}; - -template <> -struct nomove_if { - template - static inline auto eval(U&& x) -> decltype(std::forward(x)) - { - return std::forward(x); - } -}; -template -static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) +namespace details { - return *it; -} - + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) + { + } + }; + + template struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } + }; + template<> struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } + }; + template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; + + static inline size_t hash_thread_id(thread_id_t id) + { + static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( + thread_id_converter::prehash(id))); + } + + template + static inline bool circular_less_than(T a, T b) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << (static_cast(sizeof(T) * CHAR_BIT - 1))); + // Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 + // silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here. 
+ } + + template + static inline char* align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } + + template + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static inline void swap_relaxed(std::atomic& left, std::atomic& right) + { + T temp = left.load(std::memory_order_relaxed); + left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); + right.store(temp, std::memory_order_relaxed); + } + + template + static inline T const& nomove(T const& x) + { + return x; + } + + template + struct nomove_if + { + template + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if + { + template + static inline auto eval(U&& x) + -> decltype(std::forward(x)) + { + return std::forward(x); + } + }; + + template + static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) + { + return *it; + } + #if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) -template -struct is_trivially_destructible: std::is_trivially_destructible {}; + template struct is_trivially_destructible : std::is_trivially_destructible { }; #else -template -struct is_trivially_destructible: std::has_trivial_destructor {}; + template struct is_trivially_destructible : std::has_trivial_destructor { }; #endif - + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #ifdef MCDBGQ_USE_RELACY -typedef RelacyThreadExitListener ThreadExitListener; -typedef RelacyThreadExitNotifier ThreadExitNotifier; + typedef RelacyThreadExitListener ThreadExitListener; + typedef RelacyThreadExitNotifier ThreadExitNotifier; #else -class ThreadExitNotifier; - -struct ThreadExitListener { - typedef void (*callback_t)(void*); - callback_t callback; - void* userData; - - ThreadExitListener* next; // reserved for use by the ThreadExitNotifier - ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier -}; - -class ThreadExitNotifier { -public: - static void subscribe(ThreadExitListener* listener) - { - auto& tlsInst = instance(); - std::lock_guard guard(mutex()); - listener->next = tlsInst.tail; - listener->chain = &tlsInst; - tlsInst.tail = listener; - } - - static void unsubscribe(ThreadExitListener* listener) - { - std::lock_guard guard(mutex()); - if (!listener->chain) { - return; // race with ~ThreadExitNotifier - } - auto& tlsInst = *listener->chain; - listener->chain = nullptr; - ThreadExitListener** prev = &tlsInst.tail; - for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { - if (ptr == listener) { - *prev = ptr->next; - break; - } - prev = &ptr->next; - } - } - -private: - ThreadExitNotifier(): tail(nullptr) {} - ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - - ~ThreadExitNotifier() - { - // This thread is about to exit, let everyone know! - assert( - this == &instance() && - "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that " - "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); - std::lock_guard guard(mutex()); - for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { - ptr->chain = nullptr; - ptr->callback(ptr->userData); - } - } - - // Thread-local - static inline ThreadExitNotifier& instance() - { - static thread_local ThreadExitNotifier notifier; - return notifier; - } - - static inline std::mutex& mutex() - { - // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called - static std::mutex mutex; - return mutex; - } - -private: - ThreadExitListener* tail; -}; -#endif -#endif + class ThreadExitNotifier; + + struct ThreadExitListener + { + typedef void (*callback_t)(void*); + callback_t callback; + void* userData; + + ThreadExitListener* next; // reserved for use by the ThreadExitNotifier + ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier + }; + + class ThreadExitNotifier + { + public: + static void subscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + std::lock_guard guard(mutex()); + listener->next = tlsInst.tail; + listener->chain = &tlsInst; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener* listener) + { + std::lock_guard guard(mutex()); + if (!listener->chain) { + return; // race with ~ThreadExitNotifier + } + auto& tlsInst = *listener->chain; + listener->chain = nullptr; + ThreadExitListener** prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier() : tail(nullptr) { } + ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + std::lock_guard guard(mutex()); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->chain = nullptr; + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier& instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + static inline std::mutex& mutex() + { + // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called + static std::mutex mutex; + return mutex; + } + + private: + ThreadExitListener* tail; + }; +#endif +#endif + + template struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template struct static_is_lock_free : static_is_lock_free_num::type> { }; + template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; + template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} -template -struct static_is_lock_free_num { - enum { value = 0 }; -}; -template <> -struct static_is_lock_free_num { - enum { value = ATOMIC_CHAR_LOCK_FREE }; -}; -template <> -struct static_is_lock_free_num { - enum { value = ATOMIC_SHORT_LOCK_FREE }; -}; -template <> -struct static_is_lock_free_num { - enum { value = ATOMIC_INT_LOCK_FREE }; -}; -template <> -struct static_is_lock_free_num { - enum { value = ATOMIC_LONG_LOCK_FREE }; -}; -template <> -struct static_is_lock_free_num { - enum { value = ATOMIC_LLONG_LOCK_FREE }; -}; -template -struct static_is_lock_free: static_is_lock_free_num::type> {}; -template <> -struct static_is_lock_free { - enum { value = ATOMIC_BOOL_LOCK_FREE }; -}; -template -struct static_is_lock_free { - enum { value = ATOMIC_POINTER_LOCK_FREE }; -}; -} // namespace details - -struct ProducerToken { - template - explicit ProducerToken(ConcurrentQueue& queue); - - template - explicit ProducerToken(BlockingConcurrentQueue& queue); - - ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT: producer(other.producer) - { - other.producer = nullptr; - if (producer != nullptr) { - producer->token = this; - } - } - - inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(producer, other.producer); - if (producer != nullptr) { - producer->token = this; - } - if (other.producer != nullptr) { - other.producer->token = &other; - } - } - - // A token is always valid unless: - // 1) Memory allocation failed during construction - // 2) It was moved via the move constructor - // (Note: assignment does a swap, leaving both potentially valid) - // 3) The associated queue was destroyed - // Note that if valid() returns true, that only indicates - // that the token is valid for use with a specific queue, - // but not which one; that's up to the user to track. 
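A minimal sketch of the token-validity rules enumerated in the comment below (case 2, moving via the move constructor); the queue element type and values are illustrative:

    #include <cassert>
    #include <utility>
    #include "concurrentqueue.h"  // assumed include path

    int main() {
        moodycamel::ConcurrentQueue<int> q;
        moodycamel::ProducerToken t1(q);
        assert(t1.valid());

        // Moving via the move constructor invalidates the source token...
        moodycamel::ProducerToken t2(std::move(t1));
        assert(t2.valid() && !t1.valid());
        q.enqueue(t2, 7);  // ...while the moved-to token stays usable with the same queue
    }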
- inline bool valid() const { return producer != nullptr; } - - ~ProducerToken() - { - if (producer != nullptr) { - producer->token = nullptr; - producer->inactive.store(true, std::memory_order_release); - } - } - - // Disable copying and assignment - ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue& queue); + + template + explicit ProducerToken(BlockingConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + private: - template - friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + protected: - details::ConcurrentQueueProducerTypelessBase* producer; + details::ConcurrentQueueProducerTypelessBase* producer; }; -struct ConsumerToken { - template - explicit ConsumerToken(ConcurrentQueue& q); - - template - explicit ConsumerToken(BlockingConcurrentQueue& q); - - ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT: initialOffset(other.initialOffset), - lastKnownGlobalOffset(other.lastKnownGlobalOffset), - itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), - currentProducer(other.currentProducer), - desiredProducer(other.desiredProducer) - { - } - - inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(initialOffset, other.initialOffset); - std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); - std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); - std::swap(currentProducer, other.currentProducer); - std::swap(desiredProducer, other.desiredProducer); - } - - // Disable copying and assignment - ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + template + explicit ConsumerToken(BlockingConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : 
initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; private: - template - friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - -private: // but shared with ConcurrentQueue - std::uint32_t initialOffset; - std::uint32_t lastKnownGlobalOffset; - std::uint32_t itemsConsumedFromCurrent; - details::ConcurrentQueueProducerTypelessBase* currentProducer; - details::ConcurrentQueueProducerTypelessBase* desiredProducer; + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; }; // Need to forward-declare this swap because it's in a namespace. -// See -// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces -template -inline void swap( - typename ConcurrentQueue::ImplicitProducerKVP& a, - typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; - -template -class ConcurrentQueue { +// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; + + +template +class ConcurrentQueue +{ public: - typedef ::moodycamel::ProducerToken producer_token_t; - typedef ::moodycamel::ConsumerToken consumer_token_t; - - typedef typename Traits::index_t index_t; - typedef typename Traits::size_t size_t; - - static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = - static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = - static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); - static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = - static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t 
EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable : 4307) // + integral constant overflow (that's what the ternary expression is for!) -#pragma warning(disable : 4309) // static_cast: Truncation of constant value +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) +#pragma warning(disable: 4309) // static_cast: Truncation of constant value #endif - static const size_t MAX_SUBQUEUE_SIZE = - (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? - details::const_numeric_max::value : - ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); #ifdef _MSC_VER #pragma warning(pop) #endif - static_assert( - !std::numeric_limits::is_signed && std::is_integral::value, - "Traits::size_t must be an unsigned integral type"); - static_assert( - !std::numeric_limits::is_signed && std::is_integral::value, - "Traits::index_t must be an unsigned integral type"); - static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); - static_assert( - (BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), - "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); - static_assert( - (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && - !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), - "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); - static_assert( - (EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), - "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); - static_assert( - (IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), - "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); - static_assert( - (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || - !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), - "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); - static_assert( - INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, - "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as 
Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); public: - // Creates a queue with at least `capacity` element slots; note that the - // actual number of elements that can be inserted without additional memory - // allocation depends on the number of producers and the block size (e.g. if - // the block size is equal to `capacity`, only a single block will be allocated - // up-front, which means only a single producer will be able to enqueue elements - // without an extra allocation -- blocks aren't shared between producers). - // This method is not thread safe -- it is up to the user to ensure that the - // queue is fully constructed before it starts being used by other threads (this - // includes making the memory effects of construction visible, possibly with a - // memory barrier). - explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) - : producerListTail(nullptr) - , producerCount(0) - , initialBlockPoolIndex(0) - , nextExplicitConsumerId(0) - , globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); - + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). 
+ explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - // Track all the producers using a fully-resolved typed list for - // each kind; this makes it possible to debug them starting from - // the root queue object (otherwise wacky casts are needed that - // don't compile in the debugger's expression evaluator). - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Computes the correct amount of pre-allocated blocks for you based - // on the minimum number of elements you want available at any given - // time, and the maximum concurrent number of each type of producer. - ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) - : producerListTail(nullptr) - , producerCount(0) - , initialBlockPoolIndex(0) - , nextExplicitConsumerId(0) - , globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + - 2 * (maxExplicitProducers + maxImplicitProducers); - populate_initial_block_list(blocks); - + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Note: The queue should not be accessed concurrently while it's - // being deleted. It's up to the user to synchronize this. - // This method is not thread safe. 
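Plugging illustrative numbers into the block formula of the three-argument constructor above, again assuming the default BLOCK_SIZE of 32:

    #include "concurrentqueue.h"  // assumed include path

    int main() {
        // blocks = (ceil(minCapacity / BLOCK_SIZE) - 1) * (maxExplicit + 1)
        //        + 2 * (maxExplicit + maxImplicit)
        // minCapacity = 1000, maxExplicit = 2, maxImplicit = 4:
        // (ceil(1000 / 32) - 1) * 3 + 2 * 6 = 31 * 3 + 12 = 105 blocks up front.
        moodycamel::ConcurrentQueue<int> q(1000, 2, 4);
    }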
- ~ConcurrentQueue() - { - // Destroy producers - auto ptr = producerListTail.load(std::memory_order_relaxed); - while (ptr != nullptr) { - auto next = ptr->next_prod(); - if (ptr->token != nullptr) { - ptr->token->producer = nullptr; - } - destroy(ptr); - ptr = next; - } - - // Destroy implicit producer hash tables - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) - { - auto hash = implicitProducerHash.load(std::memory_order_relaxed); - while (hash != nullptr) { - auto prev = hash->prev; - if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically - for (size_t i = 0; i != hash->capacity; ++i) { - hash->entries[i].~ImplicitProducerKVP(); - } - hash->~ImplicitProducerHash(); - (Traits::free)(hash); - } - hash = prev; - } - } - - // Destroy global free list - auto block = freeList.head_unsafe(); - while (block != nullptr) { - auto next = block->freeListNext.load(std::memory_order_relaxed); - if (block->dynamicallyAllocated) { - destroy(block); - } - block = next; - } - - // Destroy initial free list - destroy_array(initialBlockPool, initialBlockPoolSize); - } - - // Disable copying and copy assignment - ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - - // Moving is supported, but note that it is *not* a thread-safe operation. - // Nobody can use the queue while it's being moved, and the memory effects - // of that move must be propagated to other threads before they can use it. - // Note: When a queue is moved, its tokens are still valid but can only be - // used with the destination queue (i.e. semantically they are moved along - // with the queue itself). - ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), - producerCount(other.producerCount.load(std::memory_order_relaxed)), - initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), - initialBlockPool(other.initialBlockPool), - initialBlockPoolSize(other.initialBlockPoolSize), - freeList(std::move(other.freeList)), - nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), - globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) - { - // Move the other one into this, and leave the other one as an empty queue - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - swap_implicit_producer_hashes(other); - - other.producerListTail.store(nullptr, std::memory_order_relaxed); - other.producerCount.store(0, std::memory_order_relaxed); - other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); - other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); - + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. 
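A sketch of the move caveat documented above: nothing may touch either queue until the move's memory effects are visible, and tokens keep working but only against the destination queue (illustrative, single-threaded by construction):

    #include <utility>
    #include "concurrentqueue.h"  // assumed include path

    int main() {
        moodycamel::ConcurrentQueue<int> a;
        moodycamel::ProducerToken tok(a);
        a.enqueue(tok, 1);

        // Not thread-safe: no other thread may use `a` or `b` during the move.
        moodycamel::ConcurrentQueue<int> b(std::move(a));

        int v;
        b.try_dequeue(v);   // tok is now semantically tied to b, not a
        b.enqueue(tok, 2);
    }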
+ ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). + ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - - other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); - other.initialBlockPoolSize = 0; - other.initialBlockPool = nullptr; - - reown_producers(); - } - - inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT { return 
swap_internal(other); } - - // Swaps this queue's state with the other's. Not thread-safe. - // Swapping two queues does not invalidate their tokens, however - // the tokens that were created for one queue must be used with - // only the swapped queue (i.e. the tokens are tied to the - // queue's movable state, not the object itself). - inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT { swap_internal(other); } - + explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + private: - ConcurrentQueue& swap_internal(ConcurrentQueue& other) - { - if (this == &other) { - return *this; - } - - details::swap_relaxed(producerListTail, other.producerListTail); - details::swap_relaxed(producerCount, other.producerCount); - details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); - std::swap(initialBlockPool, other.initialBlockPool); - std::swap(initialBlockPoolSize, other.initialBlockPoolSize); - freeList.swap(other.freeList); - details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); - details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); - - swap_implicit_producer_hashes(other); - - reown_producers(); - other.reown_producers(); - + ConcurrentQueue& swap_internal(ConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - details::swap_relaxed(explicitProducers, other.explicitProducers); - details::swap_relaxed(implicitProducers, other.implicitProducers); + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); #endif - - return *this; - } - + + return *this; + } + public: - // Enqueues a single item (by copying it). - // Allocates memory if required. 
Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const& token, T const& item) { return inner_enqueue(token, item); } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Allocates memory if required. Only fails if memory allocation fails (or - // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved instead of copied. - // Thread-safe. - template - bool enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails - // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - // Enqueues a single item (by copying it). - // Does not allocate memory. Fails if not enough room to enqueue (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0). - // Thread-safe. - inline bool try_enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Thread-safe. 
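The allocation contract spelled out in the comments above, in one sketch; sizes are illustrative, and the bulk insert moves elements via std::make_move_iterator as the note suggests:

    #include <iterator>
    #include <vector>
    #include "concurrentqueue.h"  // assumed include path

    int main() {
        moodycamel::ConcurrentQueue<int> q(64);

        q.enqueue(1);       // may allocate; fails only on allocation failure or Traits limits
        q.try_enqueue(2);   // never allocates; returns false once pre-allocated room runs out

        std::vector<std::vector<int>> rows(3);
        moodycamel::ConcurrentQueue<std::vector<int>> q2;
        // Bulk insert, moving the vectors out of `rows` instead of copying them.
        q2.enqueue_bulk(std::make_move_iterator(rows.begin()), rows.size());
    }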
- inline bool try_enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - bool try_dequeue(U& item) - { - // Instead of simply trying each producer in turn (which could cause needless contention on the first - // producer), we score them heuristically. - size_t nonEmptyCount = 0; - ProducerBase* best = nullptr; - size_t bestSize = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; - ptr = ptr->next_prod()) { - auto size = ptr->size_approx(); - if (size > 0) { - if (size > bestSize) { - bestSize = size; - best = ptr; - } - ++nonEmptyCount; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (nonEmptyCount > 0) { - if ((details::likely)(best->dequeue(item))) { - return true; - } - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr != best && ptr->dequeue(item)) { - return true; - } - } - } - return false; - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // This differs from the try_dequeue(item) method in that this one does - // not attempt to reduce contention by interleaving the order that producer - // streams are dequeued from. So, using this method can reduce overall throughput - // under contention, but will give more predictable results in single-threaded - // consumer scenarios. This is mostly only useful for internal unit tests. 
- // Never allocates. Thread-safe. - template - bool try_dequeue_non_interleaved(U& item) - { - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->dequeue(item)) { - return true; - } - } - return false; - } - - // Attempts to dequeue from the queue using an explicit consumer token. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - bool try_dequeue(consumer_token_t& token, U& item) - { - // The idea is roughly as follows: - // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the - // highest efficiency consumer dictates the rotation speed of everyone else, more or less If you see that the - // global offset has changed, you must reset your consumption counter and move to your designated place If - // there's no items where you're supposed to be, keep moving until you find a producer with some items If the - // global offset has not changed but you've run out of items to consume, move over from your current position - // until you find an producer with something in it - - if (token.desiredProducer == nullptr || - token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return false; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (static_cast(token.currentProducer)->dequeue(item)) { - if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return true; - } - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - if (ptr->dequeue(item)) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = 1; - return true; - } - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return false; - } - - // Attempts to dequeue several elements from the queue. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - size_t try_dequeue_bulk(It itemFirst, size_t max) - { - size_t count = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - count += ptr->dequeue_bulk(itemFirst, max - count); - if (count == max) { - break; - } - } - return count; - } - - // Attempts to dequeue several elements from the queue using an explicit consumer token. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
- template - size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) - { - if (token.desiredProducer == nullptr || - token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return 0; - } - } - - size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); - if (count == max) { - if ((token.itemsConsumedFromCurrent += static_cast(max)) >= - EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return max; - } - token.itemsConsumedFromCurrent += static_cast(count); - max -= count; - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - auto dequeued = ptr->dequeue_bulk(itemFirst, max); - count += dequeued; - if (dequeued != 0) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = static_cast(dequeued); - } - if (dequeued == max) { - break; - } - max -= dequeued; - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return count; - } - - // Attempts to dequeue from a specific producer's inner queue. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns false if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) - { - return static_cast(producer.producer)->dequeue(item); - } - - // Attempts to dequeue several elements from a specific producer's inner queue. - // Returns the number of items actually dequeued. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns 0 if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) - { - return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); - } - - // Returns an estimate of the total number of elements currently in the queue. This - // estimate is only accurate if the queue has completely stabilized before it is called - // (i.e. all enqueue and dequeue operations have completed and their memory effects are - // visible on the calling thread, and no further operations start while this method is - // being called). - // Thread-safe. - size_t size_approx() const - { - size_t size = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - size += ptr->size_approx(); - } - return size; - } - - // Returns true if the underlying atomic variables used by - // the queue are lock-free (they should be on most platforms). - // Thread-safe. 
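A consumer-token sketch for the bulk dequeue path above; buffer size and item counts are illustrative:

    #include <cstddef>
    #include "concurrentqueue.h"  // assumed include path

    int main() {
        moodycamel::ConcurrentQueue<int> q;
        for (int i = 0; i < 100; ++i)
            q.enqueue(i);

        moodycamel::ConsumerToken ctok(q);
        int buf[32];
        std::size_t n;
        // Each call drains up to 32 items; the token carries the rotation state
        // (itemsConsumedFromCurrent, lastKnownGlobalOffset) across calls.
        while ((n = q.try_dequeue_bulk(ctok, buf, 32)) != 0) {
            // process buf[0..n)
        }
    }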
- static constexpr bool is_lock_free() - { - return details::static_is_lock_free::value == 2 && details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && details::static_is_lock_free::value == 2 && - details::static_is_lock_free< - typename details::thread_id_converter::thread_id_numeric_size_t>::value == 2; - } + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. 
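Since is_lock_free() above is constexpr in this version of the header, it can gate compilation on platforms whose atomics are not always lock-free; a minimal sketch:

    #include "concurrentqueue.h"  // assumed include path

    // Value 2 in the ATOMIC_*_LOCK_FREE macros means "always lock-free",
    // which is what is_lock_free() requires of every atomic the queue uses.
    static_assert(moodycamel::ConcurrentQueue<int>::is_lock_free(),
                  "this target lacks always-lock-free atomics for the queue");

    int main() {}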
+ inline bool try_enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U& item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. + size_t nonEmptyCount = 0; + ProducerBase* best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. 
+ // Returns false if all producer streams appeared empty at the time they
+ // were checked (so, the queue is likely but not guaranteed to be empty).
+ // This differs from the try_dequeue(item) method in that this one does
+ // not attempt to reduce contention by interleaving the order that producer
+ // streams are dequeued from. So, using this method can reduce overall throughput
+ // under contention, but will give more predictable results in single-threaded
+ // consumer scenarios. This is mostly only useful for internal unit tests.
+ // Never allocates. Thread-safe.
+ template<typename U>
+ bool try_dequeue_non_interleaved(U& item)
+ {
+ for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+ if (ptr->dequeue(item)) {
+ return true;
+ }
+ }
+ return false;
+ }
+ 
+ // Attempts to dequeue from the queue using an explicit consumer token.
+ // Returns false if all producer streams appeared empty at the time they
+ // were checked (so, the queue is likely but not guaranteed to be empty).
+ // Never allocates. Thread-safe.
+ template<typename U>
+ bool try_dequeue(consumer_token_t& token, U& item)
+ {
+ // The idea is roughly as follows:
+ // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less
+ // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place
+ // If there are no items where you're supposed to be, keep moving until you find a producer with some items
+ // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find a producer with something in it
+ 
+ if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+ if (!update_current_producer_after_rotation(token)) {
+ return false;
+ }
+ }
+ 
+ // If there was at least one non-empty queue but it appears empty at the time
+ // we try to dequeue from it, we need to make sure every queue's been tried
+ if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
+ if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+ globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+ }
+ return true;
+ }
+ 
+ auto tail = producerListTail.load(std::memory_order_acquire);
+ auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+ if (ptr == nullptr) {
+ ptr = tail;
+ }
+ while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+ if (ptr->dequeue(item)) {
+ token.currentProducer = ptr;
+ token.itemsConsumedFromCurrent = 1;
+ return true;
+ }
+ ptr = ptr->next_prod();
+ if (ptr == nullptr) {
+ ptr = tail;
+ }
+ }
+ return false;
+ }
+ 
+ // Attempts to dequeue several elements from the queue.
+ // Returns the number of items actually dequeued.
+ // Returns 0 if all producer streams appeared empty at the time they
+ // were checked (so, the queue is likely but not guaranteed to be empty).
+ // Never allocates. Thread-safe.
+ template<typename It>
+ size_t try_dequeue_bulk(It itemFirst, size_t max)
+ {
+ size_t count = 0;
+ for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+ count += ptr->dequeue_bulk(itemFirst, max - count);
+ if (count == max) {
+ break;
+ }
+ }
+ return count;
+ }
+ 
+ // Attempts to dequeue several elements from the queue using an explicit consumer token. 
+ // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
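A sketch of the producer-scoped fast path described above, assuming the consumer knows exactly which producer it wants to drain:

    #include "concurrentqueue.h"  // assumed include path

    int main() {
        moodycamel::ConcurrentQueue<int> q;
        moodycamel::ProducerToken ptok(q);
        for (int i = 0; i < 8; ++i)
            q.enqueue(ptok, i);

        int v;
        // Skips the scan over all producers that the general try_dequeue performs.
        while (q.try_dequeue_from_producer(ptok, v)) { /* consume v */ }
    }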
+ static constexpr bool is_lock_free() + { + return + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } + private: - friend struct ProducerToken; - friend struct ConsumerToken; - struct ExplicitProducer; - friend struct ExplicitProducer; - struct ImplicitProducer; - friend struct ImplicitProducer; - friend class ConcurrentQueueTests; - - enum AllocationMode { CanAlloc, CannotAlloc }; - - /////////////////////////////// - // Queue methods - /////////////////////////////// - - template - inline bool inner_enqueue(producer_token_t const& token, U&& element) - { - return static_cast(token.producer) - ->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue(U&& element) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr ? - false : - producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return static_cast(token.producer) - ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); - } - - template - inline bool inner_enqueue_bulk(It itemFirst, size_t count) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr ? - false : - producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); - } - - inline bool update_current_producer_after_rotation(consumer_token_t& token) - { - // Ah, there's been a rotation, figure out where we should be! - auto tail = producerListTail.load(std::memory_order_acquire); - if (token.desiredProducer == nullptr && tail == nullptr) { - return false; - } - auto prodCount = producerCount.load(std::memory_order_relaxed); - auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); - if ((details::unlikely)(token.desiredProducer == nullptr)) { - // Aha, first time we're dequeueing anything. - // Figure out our local position - // Note: offset is from start, not end, but we're traversing from end -- subtract from count first - std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); - token.desiredProducer = tail; - for (std::uint32_t i = 0; i != offset; ++i) { - token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - } - - std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; - if (delta >= prodCount) { - delta = delta % prodCount; - } - for (std::uint32_t i = 0; i != delta; ++i) { - token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - - token.lastKnownGlobalOffset = globalOffset; - token.currentProducer = token.desiredProducer; - token.itemsConsumedFromCurrent = 0; - return true; - } - - /////////////////////////// - // Free list - /////////////////////////// - - template - struct FreeListNode { - FreeListNode(): freeListRefs(0), freeListNext(nullptr) {} - - std::atomic freeListRefs; - std::atomic freeListNext; - }; - - // A simple CAS-based lock-free free list. 
Not the fastest thing in the world under heavy contention, but - // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly - // speedy under low contention. - template // N must inherit FreeListNode or have the same fields (and initialization of them) - struct FreeList { - FreeList(): freeListHead(nullptr) {} - FreeList(FreeList&& other): freeListHead(other.freeListHead.load(std::memory_order_relaxed)) - { - other.freeListHead.store(nullptr, std::memory_order_relaxed); - } - void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } - - FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - - inline void add(N* node) - { + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const& token, U&& element) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue(U&& element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. 
+ // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to - // set it using a fetch_add - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { - // Oh look! We were the last ones referencing this node, and we know - // we want to add it to the free list, so let's do it! - add_knowing_refcount_is_zero(node); - } - } - - inline N* try_get() - { + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! 
+ add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - auto head = freeListHead.load(std::memory_order_acquire); - while (head != nullptr) { - auto prevHead = head; - auto refs = head->freeListRefs.load(std::memory_order_relaxed); - if ((refs & REFS_MASK) == 0 || - !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) { - head = freeListHead.load(std::memory_order_acquire); - continue; - } - - // Good, reference count has been incremented (it wasn't at zero), which means we can read the - // next and not worry about it changing between now and the time we do the CAS - auto next = head->freeListNext.load(std::memory_order_relaxed); - if (freeListHead.compare_exchange_strong( - head, next, std::memory_order_acquire, std::memory_order_relaxed)) { - // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no - // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put - // back on). - assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); - - // Decrease refcount twice, once for our ref, and once for the list's ref - head->freeListRefs.fetch_sub(2, std::memory_order_release); - return head; - } - - // OK, the head must have changed on us, but we still need to decrease the refcount we increased. - // Note that we don't need to release any memory effects, but we do need to ensure that the reference - // count decrement happens-after the CAS on the head. - refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); - if (refs == SHOULD_BE_ON_FREELIST + 1) { - add_knowing_refcount_is_zero(prevHead); - } - } - - return nullptr; - } - - // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) - N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } - - private: - inline void add_knowing_refcount_is_zero(N* node) - { - // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run - // only one copy of this method per node at a time, i.e. the single thread case), then we know - // we can safely change the next pointer of the node; however, once the refcount is back above - // zero, then other threads could increase it (happens under heavy contention, when the refcount - // goes to zero in between a load and a refcount increment of a node in try_get, then back up to - // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS - // to add the node to the actual list fails, decrease the refcount and leave the add operation to - // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
- auto head = freeListHead.load(std::memory_order_relaxed); - while (true) { - node->freeListNext.store(head, std::memory_order_relaxed); - node->freeListRefs.store(1, std::memory_order_release); - if (!freeListHead.compare_exchange_strong( - head, node, std::memory_order_release, std::memory_order_relaxed)) { - // Hmm, the add failed, but we can only try again when the refcount goes back to zero - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) { - continue; - } - } - return; - } - } - - private: - // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under - // contention) - std::atomic freeListHead; - - static const std::uint32_t REFS_MASK = 0x7FFFFFFF; - static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; - + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
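// A compressed restatement of the protocol described above, as a standalone
// sketch. The constants mirror REFS_MASK / SHOULD_BE_ON_FREELIST defined later
// in this struct; the helper names are made up for the example. Bit 31 of the
// refcount word is the should-be-on-freelist flag, bits 0..30 count references.
#include <cstdint>

constexpr std::uint32_t kRefsMask           = 0x7FFFFFFF;
constexpr std::uint32_t kShouldBeOnFreeList = 0x80000000;

// In add(): the flag is known to be clear, so fetch_add(kShouldBeOnFreeList)
// both sets it and returns the prior refcount; zero means no one else holds a
// reference and the node can be linked immediately.
inline bool may_link_immediately(std::uint32_t prevWord)
{
    return prevWord == 0;
}

// In try_get(): after fetch_sub(1) drops a reference, a prior value of
// kShouldBeOnFreeList + 1 means the count just hit zero with the flag set, so
// the calling thread owns the deferred add performed in the loop below.
inline bool owns_deferred_add(std::uint32_t prevWord)
{
    return prevWord == kShouldBeOnFreeList + 1;
}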
+ auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugMutex mutex; -#endif - }; - - /////////////////////////// - // Block - /////////////////////////// - - enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; - - struct Block { - Block() - : next(nullptr) - , elementsCompletelyDequeued(0) - , freeListRefs(0) - , freeListNext(nullptr) - , dynamicallyAllocated(true) - { + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true) + { #ifdef MCDBGQ_TRACKMEM - owner = nullptr; -#endif - } - - template - inline bool is_empty() const - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Check flags - for (size_t i = 0; i < BLOCK_SIZE; ++i) { - if (!emptyFlags[i].load(std::memory_order_relaxed)) { - return false; - } - } - - // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - else - { - // Check counter - if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); - return false; - } - } - - // Returns true if the block is now empty (does not apply in explicit context) - template - inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Set flag - assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load( - std::memory_order_relaxed)); - emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store( - true, std::memory_order_release); - return false; - } - else - { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); - assert(prevVal < BLOCK_SIZE); - return prevVal == BLOCK_SIZE - 1; - } - } - - // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). - // Returns true if the block is now empty (does not apply in explicit context). 
- template - inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Set flags - std::atomic_thread_fence(std::memory_order_release); - i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; - for (size_t j = 0; j != count; ++j) { - assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); - emptyFlags[i + j].store(true, std::memory_order_relaxed); - } - return false; - } - else - { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel); - assert(prevVal + count <= BLOCK_SIZE); - return prevVal + count == BLOCK_SIZE; - } - } - - template - inline void set_all_empty() - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Set all flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(true, std::memory_order_relaxed); - } - } - else - { - // Reset counter - elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); - } - } - - template - inline void reset_empty() - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Reset flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(false, std::memory_order_relaxed); - } - } - else - { - // Reset counter - elementsCompletelyDequeued.store(0, std::memory_order_relaxed); - } - } - - inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT - { - return static_cast(static_cast(elements)) + - static_cast(idx & static_cast(BLOCK_SIZE - 1)); - } - inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT - { - return static_cast(static_cast(elements)) + - static_cast(idx & static_cast(BLOCK_SIZE - 1)); - } - - private: - static_assert( - std::alignment_of::value <= sizeof(T), - "The queue does not support types with an alignment greater than their size at this time"); - MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; - - public: - Block* next; - std::atomic elementsCompletelyDequeued; - std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; - - public: - std::atomic freeListRefs; - std::atomic freeListNext; - bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' - + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
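// Before the bulk variant introduced by the comment above, a side-by-side
// sketch of the two emptiness-tracking strategies Block switches between.
// Names with trailing underscores are made up for the example; the logic
// mirrors is_empty() earlier in this struct.
#include <atomic>
#include <cstddef>

constexpr std::size_t kBlockSize = 32;

// Small explicit-context blocks: one flag per slot, stored with release when a
// slot is dequeued; the block is empty only once every flag reads true, after
// which an acquire fence orders the preceding element reads.
std::atomic<bool> emptyFlags_[kBlockSize];

bool flags_say_empty()
{
    for (std::size_t i = 0; i < kBlockSize; ++i) {
        if (!emptyFlags_[i].load(std::memory_order_relaxed)) {
            return false;
        }
    }
    std::atomic_thread_fence(std::memory_order_acquire);
    return true;
}

// Large blocks and implicit context: a single counter of fully dequeued
// elements; the block is empty once the counter reaches kBlockSize.
std::atomic<std::size_t> dequeuedCount_ {0};

bool counter_says_empty()
{
    if (dequeuedCount_.load(std::memory_order_relaxed) == kBlockSize) {
        std::atomic_thread_fence(std::memory_order_acquire);
        return true;
    }
    return false;
}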
+ template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + + private: + static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + public: + std::atomic freeListRefs; + std::atomic freeListNext; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + #ifdef MCDBGQ_TRACKMEM - void* owner; + void* owner; #endif - }; - static_assert( - std::alignment_of::value >= std::alignment_of::value, - "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + #ifdef MCDBGQ_TRACKMEM public: - struct MemStats; - + struct MemStats; private: #endif - - /////////////////////////// - // Producer base - /////////////////////////// - - struct ProducerBase: public details::ConcurrentQueueProducerTypelessBase { - ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) - : tailIndex(0) - , headIndex(0) - , dequeueOptimisticCount(0) - , dequeueOvercommit(0) - , tailBlock(nullptr) - , isExplicit(isExplicit_) - , parent(parent_) - { - } - - virtual ~ProducerBase() {} - - template - inline bool dequeue(U& element) - { - if (isExplicit) { - return static_cast(this)->dequeue(element); - } else { - return static_cast(this)->dequeue(element); - } - } - - template - inline size_t dequeue_bulk(It& itemFirst, size_t max) - { - if (isExplicit) { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } else { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - } - - inline ProducerBase* next_prod() const { return static_cast(next); } - - inline size_t size_approx() const - { - auto tail = tailIndex.load(std::memory_order_relaxed); - auto head = headIndex.load(std::memory_order_relaxed); - return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; - } - - inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } - - protected: - std::atomic tailIndex; // Where to enqueue to next - std::atomic headIndex; // Where to dequeue from next - - std::atomic dequeueOptimisticCount; - std::atomic dequeueOvercommit; - - Block* tailBlock; - - public: - bool isExplicit; - ConcurrentQueue* parent; - - protected: + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) + { + } + + virtual ~ProducerBase() { } + + template + inline bool dequeue(U& element) + { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } + else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + else { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? 
static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - /////////////////////////// - // Explicit queue - /////////////////////////// - - struct ExplicitProducer: public ProducerBase { - explicit ExplicitProducer(ConcurrentQueue* parent_) - : ProducerBase(parent_, true) - , blockIndex(nullptr) - , pr_blockIndexSlotsUsed(0) - , pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1) - , pr_blockIndexFront(0) - , pr_blockIndexEntries(nullptr) - , pr_blockIndexRaw(nullptr) - { - size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; - if (poolBasedIndexSize > pr_blockIndexSize) { - pr_blockIndexSize = poolBasedIndexSize; - } - - new_block_index(0); // This creates an index with double the number of current entries, i.e. - // EXPLICIT_INITIAL_INDEX_SIZE - } - - ~ExplicitProducer() - { - // Destruct any elements not yet dequeued. - // Since we're in the destructor, we can assume all elements - // are either completely dequeued or completely not (no halfways). - if (this->tailBlock != nullptr) { // Note this means there must be a block index too - // First find the block that's partially dequeued, if any - Block* halfDequeuedBlock = nullptr; - if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { - // The head's not on a block boundary, meaning a block somewhere is partially dequeued - // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a - // boundary) - size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); - while (details::circular_less_than( - pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { - i = (i + 1) & (pr_blockIndexSize - 1); - } - assert( - details::circular_less_than( - pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); - halfDequeuedBlock = pr_blockIndexEntries[i].block; - } - - // Start at the head block (note the first line in the loop gives us the head from the tail on the first - // iteration) - auto block = this->tailBlock; - do { - block = block->next; - if (block->ConcurrentQueue::Block::template is_empty()) { - continue; - } - - size_t i = 0; // Offset into block - if (block == halfDequeuedBlock) { - i = static_cast( - this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - } - - // Walk through all the items in the block; if this is the tail block, we need to stop when we reach - // the tail index - auto lastValidIndex = - (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
- BLOCK_SIZE : - static_cast( - this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { - (*block)[i++]->~T(); - } - } while (block != this->tailBlock); - } - - // Destroy all blocks that we own - if (this->tailBlock != nullptr) { - auto block = this->tailBlock; - do { - auto nextBlock = block->next; - this->parent->add_block_to_free_list(block); - block = nextBlock; - } while (block != this->tailBlock); - } - - // Destroy the block indices - auto header = static_cast(pr_blockIndexRaw); - while (header != nullptr) { - auto prev = static_cast(header->prev); - header->~BlockIndexHeader(); - (Traits::free)(header); - header = prev; - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto startBlock = this->tailBlock; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - if (this->tailBlock != nullptr && - this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - // We can re-use the block ahead of us, it's empty! - this->tailBlock = this->tailBlock->next; - this->tailBlock->ConcurrentQueue::Block::template reset_empty(); - - // We'll put the block on the block index (guaranteed to be room since we're conceptually removing - // the last block from it first -- except instead of removing then adding, we can just overwrite). - // Note that there must be a valid block index here, since even if allocation failed in the ctor, - // it would have been re-attempted when adding the first block to the queue; since there is such - // a block, a block index must have been successfully allocated. - } else { - // Whatever head value we see here is >= the last value we saw here (relatively), - // and <= its current value. Since we have the most recent tail, the head must be - // <= to it. - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || - (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && - (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - // We can't enqueue in another block because there's not enough leeway -- the - // tail could surpass the head by the time the block fills up! (Or we'll exceed - // the size limit, if the second part of the condition was true.) - return false; - } - // We're going to need a new block; check that the block index has room - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { - // Hmm, the circular block index is already full -- we'll need - // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if - // the initial allocation failed in the constructor. 
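// The allocMode knob tested just below is what separates the two public entry
// points: enqueue() runs the CanAlloc path (and may grow the block index),
// while try_enqueue() runs CannotAlloc and fails instead of allocating. A
// usage sketch; the include path is an assumption.
#include "concurrentqueue.h"

#include <cassert>

int main()
{
    // Pre-size for 64 elements so the CannotAlloc path has room to succeed.
    moodycamel::ConcurrentQueue<int> q(64);

    bool ok = q.try_enqueue(1); // CannotAlloc: fails rather than allocate
    assert(ok);

    ok = q.enqueue(2);          // CanAlloc: allocates new blocks/index if needed
    assert(ok);
    return 0;
}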
- - MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) - { - return false; - } - else if (!new_block_index(pr_blockIndexSlotsUsed)) - { - return false; - } - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - return false; - } + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). + if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + this->parent->add_block_to_free_list(block); + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. + } + else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. 
+ + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - ++pr_blockIndexSlotsUsed; - } - - MOODYCAMEL_CONSTEXPR_IF( - !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) - { - // The constructor may throw. We want the element not to appear in the queue in - // that case (without corrupting the queue): - MOODYCAMEL_TRY - { - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH(...) - { - // Revert change to the current block, but leave the new block available - // for next time - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock; - MOODYCAMEL_RETHROW; - } - } - else - { - (void)startBlock; - (void)originalBlockIndexSlotsUsed; - } - - // Add block to block index - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - - MOODYCAMEL_CONSTEXPR_IF( - !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) - { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than( - this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - // Might be something to dequeue, let's give it a try - - // Note that this if is purely for performance purposes in the common case when the queue is - // empty and the values are eventually consistent -- we may enter here spuriously. - - // Note that whatever the values of overcommit and tail are, they are not going to change (unless we - // change them) and must be the same value at this point (inside the if) as when the if condition was - // evaluated. - - // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit - // below. This ensures that whatever the value we got loaded into overcommit, the load of - // dequeueOptisticCount in the fetch_add below will result in a value at least as recent as that (and - // therefore at least as large). Note that I believe a compiler (signal) fence here would be sufficient - // due to the nature of fetch_add (all read-modify-write operations are guaranteed to work on the latest - // value in the modification order), but unfortunately that can't be shown to be correct using only the - // C++11 standard. 
See - // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case - std::atomic_thread_fence(std::memory_order_acquire); - - // Increment optimistic counter, then check if it went over the boundary - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - - // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is - // only ever incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), - // and since we now have a version of dequeueOptimisticCount that is at least as recent as overcommit - // (due to the release upon incrementing dequeueOvercommit and the acquire above that synchronizes with - // it), overcommit <= myDequeueCount. However, we can't assert this since both dequeueOptimisticCount - // and dequeueOvercommit may (independently) overflow; in such a case, though, the logic still holds - // since the difference between the two is maintained. - - // Note that we reload tail here in case it changed; it will be the same value as before or greater, - // since this load is sequenced after (happens after) the earlier load above. This is supported by - // read-read coherency (as defined in the standard), explained here: - // http://en.cppreference.com/w/cpp/atomic/memory_order - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - // Guaranteed to be at least one element to dequeue! - - // Get the index. Note that since there's guaranteed to be at least one element, this - // will never exceed tail. We need to do an acquire-release fence here since it's possible - // that whatever condition got us to this point was for an earlier enqueued element (that - // we already see the memory effects for), but that by the time we increment somebody else - // has incremented it, and we need to see the memory effects for *that* element, which is - // in such a case is necessarily visible on the thread that incremented it in the first - // place with the more current condition (they must have acquired a tail that is at least - // as recent). - auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - // Determine which block the element is in - - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - // We need to be careful here about subtracting and dividing because of index wrap-around. 
- // When an index wraps, we need to preserve the sign of the offset when dividing it by the - // block size (in order to get a correct signed block count offset in all cases): - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast( - static_cast::type>(blockBaseIndex - headBase) / - static_cast::type>(BLOCK_SIZE)); - auto block = - localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; - - // Dequeue - auto& el = *((*block)[index]); - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { - // Make sure the element is still fully dequeued and destroyed even if the assignment - // throws - struct Guard { - Block* block; - index_t index; - - ~Guard() - { - (*block)[index]->~T(); - block->ConcurrentQueue::Block::template set_empty(index); - } - } guard = {block, index}; - - element = std::move(el); // NOLINT - } else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - block->ConcurrentQueue::Block::template set_empty(index); - } - - return true; - } else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add( - 1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is - // guaranteed to happen before this write - } - } - - return false; - } - - template - bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - auto originalBlockIndexFront = pr_blockIndexFront; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - - Block* firstAllocatedBlock = nullptr; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { - // Allocate as many blocks as possible from ahead - while (blockBaseDiff > 0 && this->tailBlock != nullptr && - this->tailBlock->next != firstAllocatedBlock && - this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - this->tailBlock = this->tailBlock->next; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Now allocate as many blocks as necessary from the block pool - while (blockBaseDiff > 0) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || - (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && - (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { - MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) - { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - else if (full || !new_block_index(originalBlockIndexSlotsUsed)) - { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - - // pr_blockIndexFront is updated inside new_block_index, so we need to - // update our fallback value too (since we keep the new index even if we - // later fail) - originalBlockIndexFront = originalBlockIndexSlotsUsed; - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + // The constructor may throw. We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common case when the queue is + // empty and the values are eventually consistent -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, they are not going to change (unless we + // change them) and must be the same value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. + // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in + // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). + // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all + // read-modify-write operations are guaranteed to work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only the C++11 standard. + // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever + // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now + // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. + // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) + // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. + + // Note that we reload tail here in case it changed; it will be the same value as before or greater, since + // this load is sequenced after (happens after) the earlier load above. 
This is supported by read-read + // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least one element, this + // will never exceed tail. We need to do an acquire-release fence here since it's possible + // that whatever condition got us to this point was for an earlier enqueued element (that + // we already see the memory effects for), but that by the time we increment somebody else + // has incremented it, and we need to see the memory effects for *that* element, which is + // in such a case is necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a tail that is at least + // as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because of index wrap-around. + // When an index wraps, we need to preserve the sign of the offset when dividing it by the + // block size (in order to get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); + auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; + + // Dequeue + auto& el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even if the assignment + // throws + struct Guard { + Block* block; + index_t index; + + ~Guard() + { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty(index); + } + } guard = { block, index }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty(index); + } + + return true; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template set_all_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; - - ++pr_blockIndexSlotsUsed; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and - // publish the new block index front - auto block = firstAllocatedBlock; - while (true) { - block->ConcurrentQueue::Block::template reset_empty(); - if (block == this->tailBlock) { - break; - } - block = block->next; - } - - MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( - T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) - { - blockIndex.load(std::memory_order_relaxed) - ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - auto endBlock = this->tailBlock; - this->tailBlock = startBlock; - assert( - (startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || - count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = - (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( - T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) - { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else - { - MOODYCAMEL_TRY - { - while (currentTailIndex != stopIndex) { - // Must use copy constructor even if move constructor is available - // because we may have to revert if there's an exception. - // Sorry about the horrible templated next line, but it was the only way - // to disable moving *at compile time*, which is important because a type - // may only define a (noexcept) move constructor, and so calls to the - // cctor will not compile, even if they are in an if branch that will never - // be executed - new ((*this->tailBlock)[currentTailIndex]) - T(details::nomove_if(nullptr)) - T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH(...) - { - // Oh dear, an exception's been thrown -- destroy the elements that - // were enqueued so far and revert the entire bulk operation (we'll keep - // any allocated blocks in our linked list for later, though). - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - - MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( - T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) - { - if (firstAllocatedBlock != nullptr) - blockIndex.load(std::memory_order_relaxed) - ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = - static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Determine which block the first element is in - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast( - static_cast::type>(firstBlockBaseIndex - headBase) / - static_cast::type>(BLOCK_SIZE)); - auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); - - // Iterate the blocks and dequeue - auto index = firstIndex; - do { - auto firstIndexInBlock = index; - index_t endIndex = - (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than( - firstIndex + static_cast(actualCount), endIndex) ? 
- firstIndex + static_cast(actualCount) : - endIndex; - auto block = localBlockIndex->entries[indexIndex].block; - if (MOODYCAMEL_NOEXCEPT_ASSIGN( - T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } else { - MOODYCAMEL_TRY - { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH(...) - { - // It's too late to revert the dequeue, but we can make sure that all - // the dequeued objects are properly destroyed and the block index - // (and empty count) are properly updated before we propagate the exception - do { - block = localBlockIndex->entries[indexIndex].block; - while (index != endIndex) { - (*block)[index++]->~T(); - } - block->ConcurrentQueue::Block::template set_many_empty( - firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - - firstIndexInBlock = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than( - firstIndex + static_cast(actualCount), endIndex) ? - firstIndex + static_cast(actualCount) : - endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - block->ConcurrentQueue::Block::template set_many_empty( - firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - struct BlockIndexEntry { - index_t base; - Block* block; - }; - - struct BlockIndexHeader { - size_t size; - std::atomic front; // Current slot (not next, like pr_blockIndexFront) - BlockIndexEntry* entries; - void* prev; - }; - - bool new_block_index(size_t numberOfFilledSlotsToExpose) - { - auto prevBlockSizeMask = pr_blockIndexSize - 1; - - // Create the new block - pr_blockIndexSize <<= 1; - auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + - std::alignment_of::value - 1 + - sizeof(BlockIndexEntry) * pr_blockIndexSize)); - if (newRawPtr == nullptr) { - pr_blockIndexSize >>= 1; // Reset to allow graceful retry - return false; - } - - auto newBlockIndexEntries = reinterpret_cast( - details::align_for(newRawPtr + sizeof(BlockIndexHeader))); - - // Copy in all the old indices, if any - size_t j = 0; - if (pr_blockIndexSlotsUsed != 0) { - auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; - do { - newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; - i = (i + 1) & prevBlockSizeMask; - } while (i != pr_blockIndexFront); - } - - // Update everything - auto header = new (newRawPtr) BlockIndexHeader; - header->size = pr_blockIndexSize; - header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); - header->entries = newBlockIndexEntries; - header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later - - pr_blockIndexFront = j; - pr_blockIndexEntries = newBlockIndexEntries; - pr_blockIndexRaw = newRawPtr; - blockIndex.store(header, std::memory_order_release); - - return true; - } - - private: - std::atomic blockIndex; - - // To 
be used by producer only -- consumer must use the ones in referenced by blockIndex - size_t pr_blockIndexSlotsUsed; - size_t pr_blockIndexSize; - size_t pr_blockIndexFront; // Next slot (not current) - BlockIndexEntry* pr_blockIndexEntries; - void* pr_blockIndexRaw; - + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). 
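The rollback that follows is the classic construct-then-unwind shape: remember how far construction got, restore the producer's pre-call state, then destroy exactly the elements already built. Reduced to a plain array (a hypothetical helper; the real code below additionally rewinds the block-index state):

#include <cstddef>
#include <new>

template <typename T, typename It>
void construct_range_with_rollback(T* dst, It first, std::size_t count)
{
    std::size_t constructed = 0;
    try {
        for (; constructed < count; ++constructed, ++first)
            new (dst + constructed) T(*first); // copy, so rollback stays possible
    } catch (...) {
        // Destroy only what was actually constructed, then rethrow.
        while (constructed > 0)
            dst[--constructed].~T();
        throw;
    }
}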
+ auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + if (firstAllocatedBlock != nullptr) + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) { + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer 
only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ExplicitProducer* nextExplicitProducer; - - private: + public: + ExplicitProducer* nextExplicitProducer; + private: #endif - + #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - ////////////////////////////////// - // Implicit queue - ////////////////////////////////// - - struct ImplicitProducer: public ProducerBase { - ImplicitProducer(ConcurrentQueue* parent_) - : ProducerBase(parent_, false), nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), blockIndex(nullptr) - { - new_block_index(); - } - - ~ImplicitProducer() - { - // Note that since we're in the destructor we can assume that all enqueue/dequeue operations - // completed already; this means that all undequeued elements are placed contiguously across - // contiguous blocks, and that only the first and last remaining blocks can be only partially - // empty (all other remaining blocks must be completely full). - + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { + // Note that since we're in the destructor we can assume that all enqueue/dequeue operations + // completed already; this means that all undequeued elements are placed contiguously across + // contiguous blocks, and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - // Unregister ourselves for thread termination notification - if (!this->inactive.load(std::memory_order_relaxed)) { - details::ThreadExitNotifier::unsubscribe(&threadExitListener); - } -#endif - - // Destroy all remaining elements! - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto index = this->headIndex.load(std::memory_order_relaxed); - Block* block = nullptr; - assert(index == tail || details::circular_less_than(index, tail)); - bool forceFreeLastBlock = - index != tail; // If we enter the loop, then the last (tail) block will not be freed - while (index != tail) { - if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { - if (block != nullptr) { - // Free the old block - this->parent->add_block_to_free_list(block); - } - - block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); - } - - ((*block)[index])->~T(); - ++index; - } - // Even if the queue is empty, there's still one block that's not on the free list - // (unless the head index reached the end of it, in which case the tail will be poised - // to create a new block). 
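Viewed on its own, the destructor's sweep over the surviving [head, tail) range is a linear walk that swaps blocks on BLOCK_SIZE boundaries. A sketch under the quiescence assumption the comment above states (all names here are stand-ins):

#include <cstdint>

constexpr std::uint64_t kBlockSize = 32; // stand-in for BLOCK_SIZE (a power of two)

template <typename Destroy, typename SwitchBlock>
void drain_remaining(std::uint64_t head, std::uint64_t tail, Destroy destroy, SwitchBlock switchBlock)
{
    for (std::uint64_t i = head; i != tail; ++i) {
        if (i == head || (i & (kBlockSize - 1)) == 0)
            switchBlock(i); // entered a new block; the previous one can be freed
        destroy(i);         // run the element's destructor at index i
    }
}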
- if (this->tailBlock != nullptr && - (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { - this->parent->add_block_to_free_list(this->tailBlock); - } - - // Destroy block index - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - if (localBlockIndex != nullptr) { - for (size_t i = 0; i != localBlockIndex->capacity; ++i) { - localBlockIndex->index[i]->~BlockIndexEntry(); - } - do { - auto prev = localBlockIndex->prev; - localBlockIndex->~BlockIndexHeader(); - (Traits::free)(localBlockIndex); - localBlockIndex = prev; - } while (localBlockIndex != nullptr); - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || - (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && - (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - return false; - } + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
+ if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry; - if (!insert_block_index_entry(idxEntry, currentTailIndex)) { - return false; - } - - // Get ahold of a new block - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - return false; - } + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - - MOODYCAMEL_CONSTEXPR_IF( - !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) - { - // May throw, try to insert now before we publish the fact that we have this new block - MOODYCAMEL_TRY - { - new ((*newBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH(...) 
- { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(newBlock); - MOODYCAMEL_RETHROW; - } - } - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - this->tailBlock = newBlock; - - MOODYCAMEL_CONSTEXPR_IF( - !MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) - { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - // See ExplicitProducer::dequeue for rationale and explanation - index_t tail = this->tailIndex.load(std::memory_order_relaxed); - index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than( - this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - std::atomic_thread_fence(std::memory_order_acquire); - - index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - // Determine which block the element is in - auto entry = get_block_index_entry_for_index(index); - - // Dequeue - auto block = entry->value.load(std::memory_order_relaxed); - auto& el = *((*block)[index]); - - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) 
{ + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - // Note: Acquiring the mutex with every dequeue instead of only when a block - // is released is very sub-optimal, but it is, after all, purely debug code. - debug::DebugLock lock(producer->mutex); -#endif - struct Guard { - Block* block; - index_t index; - BlockIndexEntry* entry; - ConcurrentQueue* parent; - - ~Guard() - { - (*block)[index]->~T(); - if (block->ConcurrentQueue::Block::template set_empty(index)) { - entry->value.store(nullptr, std::memory_order_relaxed); - parent->add_block_to_free_list(block); - } - } - } guard = {block, index, entry, this->parent}; - - element = std::move(el); // NOLINT - } else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - - if (block->ConcurrentQueue::Block::template set_empty(index)) { - { + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. 
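The struct Guard defined just below (a twin of the one in the explicit producer's dequeue earlier) is destructor-driven cleanup: the element must be destroyed, and its slot marked empty, even when the caller's move-assignment throws. The bare pattern, minus the block bookkeeping:

#include <utility>

// Sketch: the slot is consumed no matter how control leaves move_out().
template <typename T>
struct ConsumeGuard {
    T* element;
    ~ConsumeGuard() { element->~T(); } // runs on normal exit *and* during unwinding
};

template <typename T>
void move_out(T& slot, T& out)
{
    ConsumeGuard<T> guard{&slot};
    out = std::move(slot); // may throw; the guard still destroys the slot
}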
+ debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = { block, index, entry, this->parent }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty(index)) { + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Add the block back into the global free pool (and remove from block index) - entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - } - - return true; - } else { - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); - } - } - - return false; - } - + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + } + + return true; + } + else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable : 4706) // assignment within conditional expression -#endif - template - bool enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - - // Note that the tailBlock we start off with may not be owned by us any more; - // this happens if it was filled up exactly to the top (setting tailIndex to - // the first index of the next block which is not yet allocated), then dequeued - // completely (putting it on the free list) before we enqueue again. - - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - Block* firstAllocatedBlock = nullptr; - auto endBlock = this->tailBlock; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { +#pragma warning(disable: 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. 
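A micro-pattern worth naming before the index arithmetic below: every capacity in this file is a power of two, so wrapping an index is a single AND with capacity - 1 rather than a modulo. Standalone (a sketch):

#include <cstddef>

// For a power-of-two capacity, (i & (capacity - 1)) == (i % capacity);
// e.g. with capacity 8, slot 7 advances to slot 0.
std::size_t next_slot(std::size_t current, std::size_t capacity)
{
    return (current + 1) & (capacity - 1);
}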
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - do { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry = - nullptr; // initialization here unnecessary but compiler can't always tell - Block* newBlock; - bool indexInserted = false; - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || - (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && - (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - - if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || - (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == - nullptr) { - // Index allocation or block allocation failed; revert any other allocations - // and index insertions done so far for this operation - if (indexInserted) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - } - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - - return false; - } - + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + + if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + 
idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - newBlock->next = nullptr; - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - // Store the chain of blocks so that we can undo if later allocations fail, - // and so that we can find the blocks when we do the actual enqueueing - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || - firstAllocatedBlock != nullptr) { - assert(this->tailBlock != nullptr); - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - endBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; - } while (blockBaseDiff > 0); - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - this->tailBlock = startBlock; - assert( - (startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || - count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = - (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( - T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) - { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else - { - MOODYCAMEL_TRY - { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex]) - T(details::nomove_if(nullptr)) - T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH(...) 
- { - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - auto idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) 
{ + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } #ifdef _MSC_VER #pragma warning(pop) #endif - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = - static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Iterate the blocks and dequeue - auto index = firstIndex; - BlockIndexHeader* localBlockIndex; - auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); - do { - auto blockStartIndex = index; - index_t endIndex = - (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than( - firstIndex + static_cast(actualCount), endIndex) ? 
- firstIndex + static_cast(actualCount) : - endIndex; - - auto entry = localBlockIndex->index[indexIndex]; - auto block = entry->value.load(std::memory_order_relaxed); - if (MOODYCAMEL_NOEXCEPT_ASSIGN( - T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } else { - MOODYCAMEL_TRY - { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH(...) - { - do { - entry = localBlockIndex->index[indexIndex]; - block = entry->value.load(std::memory_order_relaxed); - while (index != endIndex) { - (*block)[index++]->~T(); - } - - if (block->ConcurrentQueue::Block::template set_many_empty( - blockStartIndex, static_cast(endIndex - blockStartIndex))) { + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) 
{ + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - entry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(block); - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - - blockStartIndex = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than( - firstIndex + static_cast(actualCount), endIndex) ? - firstIndex + static_cast(actualCount) : - endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - if (block->ConcurrentQueue::Block::template set_many_empty( - blockStartIndex, static_cast(endIndex - blockStartIndex))) { - { + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Note that the set_many_empty above did a release, meaning that anybody who acquires - // the block we're about to free can use it safely since our writes (and reads!) will - // have happened-before then. 
- entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } else { - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - // The block size must be > 1, so any number with the low bit set is an invalid block base index - static const index_t INVALID_BLOCK_BASE = 1; - - struct BlockIndexEntry { - std::atomic key; - std::atomic value; - }; - - struct BlockIndexHeader { - size_t capacity; - std::atomic tail; - BlockIndexEntry* entries; - BlockIndexEntry** index; - BlockIndexHeader* prev; - }; - - template - inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) - { - auto localBlockIndex = - blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK - if (localBlockIndex == nullptr) { - return false; // this can happen if new_block_index failed in the constructor - } - size_t newTail = - (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || - idxEntry->value.load(std::memory_order_relaxed) == nullptr) { - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - - // No room in the old block index, try to allocate another one! - MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) - { - return false; - } - else if (!new_block_index()) - { - return false; - } - else - { - localBlockIndex = blockIndex.load(std::memory_order_relaxed); - newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - } - - inline void rewind_block_index_tail() - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - localBlockIndex->tail.store( - (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), - std::memory_order_relaxed); - } - - inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const - { - BlockIndexHeader* localBlockIndex; - auto idx = get_block_index_index_for_index(index, localBlockIndex); - return localBlockIndex->index[idx]; - } - - inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const - { + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires the block + // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
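The happens-before comment above is the standard C++ release/acquire guarantee rather than anything specific to this queue. As a minimal self-contained sketch (illustrative names, not taken from this patch): writes made before a release store become visible to any thread whose acquire load observes that store.

    #include <atomic>
    #include <cassert>

    int payload = 0;                   // plain, non-atomic data
    std::atomic<bool> ready{false};

    void producer() {
        payload = 42;                                  // happens-before the release store below
        ready.store(true, std::memory_order_release);  // publish
    }

    void consumer() {
        while (!ready.load(std::memory_order_acquire)) {}  // acquire synchronizes-with the release
        assert(payload == 42);                             // guaranteed visible
    }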
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the constructor + } + size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index()) { + return false; + } + else { + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - index &= ~static_cast(BLOCK_SIZE - 1); - localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto tail = localBlockIndex->tail.load(std::memory_order_acquire); - auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); - assert(tailBase != INVALID_BLOCK_BASE); - // Note: Must use division instead of shift because the index may wrap around, causing a negative - // offset, whose negativity we want to preserve - auto offset = static_cast( - static_cast::type>(index - tailBase) / - static_cast::type>(BLOCK_SIZE)); - size_t idx = (tail + offset) & 
(localBlockIndex->capacity - 1); - assert( - localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && - localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); - return idx; - } - - bool new_block_index() - { - auto prev = blockIndex.load(std::memory_order_relaxed); - size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; - auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; - auto raw = static_cast( - (Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + - sizeof(BlockIndexEntry) * entryCount + std::alignment_of::value - 1 + - sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); - if (raw == nullptr) { - return false; - } - - auto header = new (raw) BlockIndexHeader; - auto entries = - reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); - auto index = reinterpret_cast(details::align_for( - reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); - if (prev != nullptr) { - auto prevTail = prev->tail.load(std::memory_order_relaxed); - auto prevPos = prevTail; - size_t i = 0; - do { - prevPos = (prevPos + 1) & (prev->capacity - 1); - index[i++] = prev->index[prevPos]; - } while (prevPos != prevTail); - assert(i == prevCapacity); - } - for (size_t i = 0; i != entryCount; ++i) { - new (entries + i) BlockIndexEntry; - entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); - index[prevCapacity + i] = entries + i; - } - header->prev = prev; - header->entries = entries; - header->index = index; - header->capacity = nextBlockIndexCapacity; - header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); - - blockIndex.store(header, std::memory_order_release); - - nextBlockIndexCapacity <<= 1; - - return true; - } - - private: - size_t nextBlockIndexCapacity; - std::atomic blockIndex; + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast(static_cast::type>(index - tailBase) / static_cast::type>(BLOCK_SIZE)); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? 
nextBlockIndexCapacity : prevCapacity;
+        auto raw = static_cast<char*>((Traits::malloc)(
+            sizeof(BlockIndexHeader) +
+            std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * entryCount +
+            std::alignment_of<BlockIndexEntry*>::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity));
+        if (raw == nullptr) {
+            return false;
+        }
+
+        auto header = new (raw) BlockIndexHeader;
+        auto entries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader)));
+        auto index = reinterpret_cast<BlockIndexEntry**>(details::align_for<BlockIndexEntry*>(reinterpret_cast<char*>(entries) + sizeof(BlockIndexEntry) * entryCount));
+        if (prev != nullptr) {
+            auto prevTail = prev->tail.load(std::memory_order_relaxed);
+            auto prevPos = prevTail;
+            size_t i = 0;
+            do {
+                prevPos = (prevPos + 1) & (prev->capacity - 1);
+                index[i++] = prev->index[prevPos];
+            } while (prevPos != prevTail);
+            assert(i == prevCapacity);
+        }
+        for (size_t i = 0; i != entryCount; ++i) {
+            new (entries + i) BlockIndexEntry;
+            entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed);
+            index[prevCapacity + i] = entries + i;
+        }
+        header->prev = prev;
+        header->entries = entries;
+        header->index = index;
+        header->capacity = nextBlockIndexCapacity;
+        header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed);
+
+        blockIndex.store(header, std::memory_order_release);
+
+        nextBlockIndexCapacity <<= 1;
+
+        return true;
+    }
+
+    private:
+    size_t nextBlockIndexCapacity;
+    std::atomic<BlockIndexHeader*> blockIndex;
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
-    public:
-    details::ThreadExitListener threadExitListener;
-
-    private:
+    public:
+    details::ThreadExitListener threadExitListener;
+    private:
#endif
-
+
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-    public:
-    ImplicitProducer* nextImplicitProducer;
-
-    private:
+    public:
+    ImplicitProducer* nextImplicitProducer;
+    private:
#endif
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
-    mutable debug::DebugMutex mutex;
+    mutable debug::DebugMutex mutex;
#endif
#ifdef MCDBGQ_TRACKMEM
-    friend struct MemStats;
-#endif
-    };
-
-    //////////////////////////////////
-    // Block pool manipulation
-    //////////////////////////////////
-
-    void populate_initial_block_list(size_t blockCount)
-    {
-        initialBlockPoolSize = blockCount;
-        if (initialBlockPoolSize == 0) {
-            initialBlockPool = nullptr;
-            return;
-        }
-
-        initialBlockPool = create_array<Block>(blockCount);
-        if (initialBlockPool == nullptr) {
-            initialBlockPoolSize = 0;
-        }
-        for (size_t i = 0; i < initialBlockPoolSize; ++i) {
-            initialBlockPool[i].dynamicallyAllocated = false;
-        }
-    }
-
-    inline Block* try_get_block_from_initial_pool()
-    {
-        if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) {
-            return nullptr;
-        }
-
-        auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed);
-
-        return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; - } - - inline void add_block_to_free_list(Block* block) - { + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { #ifdef MCDBGQ_TRACKMEM - block->owner = nullptr; -#endif - if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { - destroy(block); - } else { - freeList.add(block); - } - } - - inline void add_blocks_to_free_list(Block* block) - { - while (block != nullptr) { - auto next = block->next; - add_block_to_free_list(block); - block = next; - } - } - - inline Block* try_get_block_from_free_list() { return freeList.try_get(); } - - // Gets a free block from one of the memory pools, or allocates a new one (if applicable) - template - Block* requisition_block() - { - auto block = try_get_block_from_initial_pool(); - if (block != nullptr) { - return block; - } - - block = try_get_block_from_free_list(); - if (block != nullptr) { - return block; - } - - MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) - { - return create(); - } - else - { - return nullptr; - } - } + block->owner = nullptr; +#endif + if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { + destroy(block); + } + else { + freeList.add(block); + } + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { + return create(); + } + else { + return nullptr; + } + } + #ifdef MCDBGQ_TRACKMEM -public: - struct MemStats { - size_t allocatedBlocks; - size_t usedBlocks; - size_t freeBlocks; - size_t ownedBlocksExplicit; - size_t ownedBlocksImplicit; - size_t implicitProducers; - size_t explicitProducers; - size_t elementsEnqueued; - size_t blockClassBytes; - size_t queueClassBytes; - size_t implicitBlockIndexBytes; - size_t explicitBlockIndexBytes; - - friend class ConcurrentQueue; - - private: - static MemStats getFor(ConcurrentQueue* q) - { - MemStats stats = {0}; - - stats.elementsEnqueued = q->size_approx(); - - auto block = q->freeList.head_unsafe(); - while (block != nullptr) { - ++stats.allocatedBlocks; - ++stats.freeBlocks; - block = block->freeListNext.load(std::memory_order_relaxed); - } - - for (auto ptr = 
q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; - ptr = ptr->next_prod()) { - bool implicit = dynamic_cast(ptr) != nullptr; - stats.implicitProducers += implicit ? 1 : 0; - stats.explicitProducers += implicit ? 0 : 1; - - if (implicit) { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ImplicitProducer); - auto head = prod->headIndex.load(std::memory_order_relaxed); - auto tail = prod->tailIndex.load(std::memory_order_relaxed); - auto hash = prod->blockIndex.load(std::memory_order_relaxed); - if (hash != nullptr) { - for (size_t i = 0; i != hash->capacity; ++i) { - if (hash->index[i]->key.load(std::memory_order_relaxed) != - ImplicitProducer::INVALID_BLOCK_BASE && - hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { - ++stats.allocatedBlocks; - ++stats.ownedBlocksImplicit; - } - } - stats.implicitBlockIndexBytes += - hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); - for (; hash != nullptr; hash = hash->prev) { - stats.implicitBlockIndexBytes += - sizeof(typename ImplicitProducer::BlockIndexHeader) + - hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); - } - } - for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { - // auto block = prod->get_block_index_entry_for_index(head); - ++stats.usedBlocks; - } - } else { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ExplicitProducer); - auto tailBlock = prod->tailBlock; - bool wasNonEmpty = false; - if (tailBlock != nullptr) { - auto block = tailBlock; - do { - ++stats.allocatedBlocks; - if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { - ++stats.usedBlocks; - wasNonEmpty = wasNonEmpty || block != tailBlock; - } - ++stats.ownedBlocksExplicit; - block = block->next; - } while (block != tailBlock); - } - auto index = prod->blockIndex.load(std::memory_order_relaxed); - while (index != nullptr) { - stats.explicitBlockIndexBytes += - sizeof(typename ExplicitProducer::BlockIndexHeader) + - index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); - index = static_cast(index->prev); - } - } - } - - auto freeOnInitialPool = - q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? - 0 : - q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); - stats.allocatedBlocks += freeOnInitialPool; - stats.freeBlocks += freeOnInitialPool; - - stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; - stats.queueClassBytes += sizeof(ConcurrentQueue); - - return stats; - } - }; - - // For debugging only. Not thread-safe. 
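getMemStats() is only compiled in when MCDBGQ_TRACKMEM is defined and, as the comment notes, must not race with concurrent queue operations. A hedged usage sketch (include path and build flag illustrative; call only while the queue is quiescent):

    // build with -DMCDBGQ_TRACKMEM=1 before including the header
    #include "concurrentqueue.h"
    #include <cstdio>

    void dump_stats(moodycamel::ConcurrentQueue<int>& q) {
        // not thread-safe: ensure no producer/consumer is active here
        auto stats = q.getMemStats();
        std::printf("allocated=%zu used=%zu free=%zu enqueued=%zu\n",
                    stats.allocatedBlocks, stats.usedBlocks,
                    stats.freeBlocks, stats.elementsEnqueued);
    }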
- MemStats getMemStats() { return MemStats::getFor(this); } - -private: - friend struct MemStats; -#endif - - ////////////////////////////////// - // Producer list manipulation - ////////////////////////////////// - - ProducerBase* recycle_or_create_producer(bool isExplicit) - { + public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = { 0 }; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { + //auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast(index->prev); + } + } + } + + auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 
0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() + { + return MemStats::getFor(this); + } + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - // Try to re-use one first - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { - bool expected = true; - if (ptr->inactive.compare_exchange_strong( - expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { - // We caught one! It's been marked as activated, the caller can have it - return ptr; - } - } - } - - return add_producer( - isExplicit ? static_cast(create(this)) : create(this)); - } - - ProducerBase* add_producer(ProducerBase* producer) - { - // Handle failed memory allocation - if (producer == nullptr) { - return nullptr; - } - - producerCount.fetch_add(1, std::memory_order_relaxed); - - // Add it to the lock-free list - auto prevTail = producerListTail.load(std::memory_order_relaxed); - do { - producer->next = prevTail; - } while (!producerListTail.compare_exchange_weak( - prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); - + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + return ptr; + } + } + } + + return add_producer(isExplicit ? 
static_cast(create(this)) : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - if (producer->isExplicit) { - auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextExplicitProducer = prevTailExplicit; - } while (!explicitProducers.compare_exchange_weak( - prevTailExplicit, - static_cast(producer), - std::memory_order_release, - std::memory_order_relaxed)); - } else { - auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextImplicitProducer = prevTailImplicit; - } while (!implicitProducers.compare_exchange_weak( - prevTailImplicit, - static_cast(producer), - std::memory_order_release, - std::memory_order_relaxed)); - } -#endif - - return producer; - } - - void reown_producers() - { - // After another instance is moved-into/swapped-with this one, all the - // producers we stole still think their parents are the other queue. - // So fix them up! - for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { - ptr->parent = this; - } - } - - ////////////////////////////////// - // Implicit producer hash - ////////////////////////////////// - - struct ImplicitProducerKVP { - std::atomic key; - ImplicitProducer* - value; // No need for atomicity since it's only read by the thread that sets it in the first place - - ImplicitProducerKVP(): value(nullptr) {} - - ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); - value = other.value; - } - - inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT - { - if (this != &other) { - details::swap_relaxed(key, other.key); - std::swap(value, other.value); - } - } - }; - - template - friend void moodycamel::swap( - typename ConcurrentQueue::ImplicitProducerKVP&, - typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; - - struct ImplicitProducerHash { - size_t capacity; - ImplicitProducerKVP* entries; - ImplicitProducerHash* prev; - }; - - inline void populate_initial_implicit_producer_hash() - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - { - return; - } - else - { - implicitProducerHashCount.store(0, std::memory_order_relaxed); - auto hash = &initialImplicitProducerHash; - hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; - hash->entries = &initialImplicitProducerHashEntries[0]; - for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { - initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - hash->prev = nullptr; - implicitProducerHash.store(hash, std::memory_order_relaxed); - } - } - - void swap_implicit_producer_hashes(ConcurrentQueue& other) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - { - return; - } - else - { - // Swap (assumes our implicit 
producer hash is initialized) - initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); - initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; - other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; - - details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); - - details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); - if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { - implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); - } else { - ImplicitProducerHash* hash; - for (hash = implicitProducerHash.load(std::memory_order_relaxed); - hash->prev != &other.initialImplicitProducerHash; - hash = hash->prev) { - continue; - } - hash->prev = &initialImplicitProducerHash; - } - if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { - other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); - } else { - ImplicitProducerHash* hash; - for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); - hash->prev != &initialImplicitProducerHash; - hash = hash->prev) { - continue; - } - hash->prev = &other.initialImplicitProducerHash; - } - } - } - - // Only fails (returns nullptr) if memory allocation fails - ImplicitProducer* get_or_add_implicit_producer() - { - // Note that since the data is essentially thread-local (key is thread ID), - // there's a reduced need for fences (memory ordering is already consistent - // for any individual thread), except for the current table itself. - - // Start by looking for the thread ID in the current and all previous hash tables. - // If it's not found, it must not be in there yet, since this same thread would - // have added it previously to one of the tables that we traversed. - - // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table - + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } + else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
+        for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) {
+            ptr->parent = this;
+        }
+    }
+
+
+    //////////////////////////////////
+    // Implicit producer hash
+    //////////////////////////////////
+
+    struct ImplicitProducerKVP
+    {
+        std::atomic<details::thread_id_t> key;
+        ImplicitProducer* value;  // No need for atomicity since it's only read by the thread that sets it in the first place
+
+        ImplicitProducerKVP() : value(nullptr) { }
+
+        ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+        {
+            key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed);
+            value = other.value;
+        }
+
+        inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+        {
+            swap(other);
+            return *this;
+        }
+
+        inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT
+        {
+            if (this != &other) {
+                details::swap_relaxed(key, other.key);
+                std::swap(value, other.value);
+            }
+        }
+    };
+
+    template<typename XT, typename XTraits>
+    friend void moodycamel::swap(typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&, typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT;
+
+    struct ImplicitProducerHash
+    {
+        size_t capacity;
+        ImplicitProducerKVP* entries;
+        ImplicitProducerHash* prev;
+    };
+
+    inline void populate_initial_implicit_producer_hash()
+    {
+        MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
+            return;
+        }
+        else {
+            implicitProducerHashCount.store(0, std::memory_order_relaxed);
+            auto hash = &initialImplicitProducerHash;
+            hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
+            hash->entries = &initialImplicitProducerHashEntries[0];
+            for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) {
+                initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
+            }
+            hash->prev = nullptr;
+            implicitProducerHash.store(hash, std::memory_order_relaxed);
+        }
+    }
+
+    void swap_implicit_producer_hashes(ConcurrentQueue& other)
+    {
+        MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
+            return;
+        }
+        else {
+            // Swap (assumes our implicit producer hash is initialized)
+            initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries);
+            initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0];
+            other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0];
+
+            details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount);
+
+            details::swap_relaxed(implicitProducerHash, other.implicitProducerHash);
+            if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) {
+                implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed);
+            }
+            else {
+                ImplicitProducerHash* hash;
+                for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) {
+                    continue;
+                }
+                hash->prev = &initialImplicitProducerHash;
+            }
+            if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) {
+                other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed);
+            }
+            else {
+                ImplicitProducerHash* hash;
+                for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) {
+                    continue;
+                }
+                hash->prev = &other.initialImplicitProducerHash;
+            }
+        }
+    }
+
+    // Only fails (returns nullptr) if memory allocation fails
+    ImplicitProducer* 
get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. + // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - - auto mainHash = implicitProducerHash.load(std::memory_order_acquire); - assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) - for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { - // Look for the id in this hash - auto index = hashedId; - while (true) { // Not an infinite loop because at least one slot is free in the hash table - index &= hash->capacity - 1u; - - auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); - if (probedKey == id) { - // Found it! If we had to search several hashes deep, though, we should lazily add it - // to the current main hash table to avoid the extended search next time. - // Note there's guaranteed to be room in the current hash table since every subsequent - // table implicitly reserves space for all previous tables (there's only one - // implicitProducerHashCount). - auto value = hash->entries[index].value; - if (hash != mainHash) { - index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1u; + + auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
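The lookup above probes linearly, and `index &= hash->capacity - 1u` works as a cheap modulo only because every hash table here has a power-of-two capacity. A tiny sketch of that invariant (illustrative names, not from the patch):

    #include <cassert>
    #include <cstddef>

    size_t probe_next(size_t index, size_t capacity) {
        assert((capacity & (capacity - 1)) == 0);  // capacity must be a power of two
        return (index + 1) & (capacity - 1);       // equivalent to (index + 1) % capacity
    }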
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong( - empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || - mainHash->entries[index].key.compare_exchange_strong( - reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || + mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { #else - if (mainHash->entries[index].key.compare_exchange_strong( - empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { -#endif - mainHash->entries[index].value = value; - break; - } - ++index; - } - } - - return value; - } - if (probedKey == details::invalid_thread_id) { - break; // Not in this hash table - } - ++index; - } - } - - // Insert! - auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); - while (true) { - // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) - if (newCount >= (mainHash->capacity >> 1) && - !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { - // We've acquired the resize lock, try to allocate a bigger hash table. - // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when - // we reload implicitProducerHash it must be the most recent version (it only gets changed within this - // locked block). 
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - if (newCount >= (mainHash->capacity >> 1)) { - size_t newCapacity = mainHash->capacity << 1; - while (newCount >= (newCapacity >> 1)) { - newCapacity <<= 1; - } - auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + - std::alignment_of::value - 1 + - sizeof(ImplicitProducerKVP) * newCapacity)); - if (raw == nullptr) { - // Allocation failed - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - return nullptr; - } - - auto newHash = new (raw) ImplicitProducerHash; - newHash->capacity = static_cast(newCapacity); - newHash->entries = reinterpret_cast( - details::align_for(raw + sizeof(ImplicitProducerHash))); - for (size_t i = 0; i != newCapacity; ++i) { - new (newHash->entries + i) ImplicitProducerKVP; - newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - newHash->prev = mainHash; - implicitProducerHash.store(newHash, std::memory_order_release); - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - mainHash = newHash; - } else { - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - } - } - - // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table - // to finish being allocated by another thread (and if we just finished allocating above, the condition will - // always be true) - if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { - auto producer = static_cast(recycle_or_create_producer(false)); - if (producer == nullptr) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - return nullptr; - } - + if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). 
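The resize guard used here is the classic try-lock built from std::atomic_flag: test_and_set(acquire) either claims the flag or reports that another thread already holds it, and clear(release) publishes the work done while it was held. A minimal sketch of the same pattern (illustrative names, not part of this patch):

    #include <atomic>

    std::atomic_flag resizeInProgress = ATOMIC_FLAG_INIT;

    bool try_grow() {
        if (resizeInProgress.test_and_set(std::memory_order_acquire))
            return false;  // another thread is already growing the table
        // ... allocate the larger table and publish it with a release store ...
        resizeInProgress.clear(std::memory_order_release);  // release the guard
        return true;
    }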
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + size_t newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } + else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + auto producer = static_cast(recycle_or_create_producer(false)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; - producer->threadExitListener.userData = producer; - details::ThreadExitNotifier::subscribe(&producer->threadExitListener); -#endif - - auto index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; + producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong( - reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - implicitProducerHashCount.fetch_sub( - 1, std::memory_order_relaxed); // already counted as a used slot - mainHash->entries[index].value = producer; - break; - } -#endif - if (mainHash->entries[index].key.compare_exchange_strong( - empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - mainHash->entries[index].value = producer; - break; - } - ++index; - } - return producer; - } - - // Hmm, the old hash is quite full and somebody else is busy allocating a new one. - // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, - // we try to allocate ourselves). 
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - } - } - + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); // already counted as a used slot + mainHash->entries[index].value = producer; + break; + } +#endif + if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - void implicit_producer_thread_exited(ImplicitProducer* producer) - { - // Remove from hash + void implicit_producer_thread_exited(ImplicitProducer* producer) + { + // Remove from hash #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - auto hash = implicitProducerHash.load(std::memory_order_acquire); - assert( - hash != - nullptr); // The thread exit listener is only registered if we were added to a hash in the first place - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - details::thread_id_t probedKey; - - // We need to traverse all the hashes just in case other threads aren't on the current one yet and are - // trying to add an entry thinking there's a free slot (because they reused a producer) - for (; hash != nullptr; hash = hash->prev) { - auto index = hashedId; - do { - index &= hash->capacity - 1u; - probedKey = id; - if (hash->entries[index].key.compare_exchange_strong( - probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) { - break; - } - ++index; - } while (probedKey != - details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it - // yet, or if we weren't added to this hash in the first place - } - - // Mark the queue as being recyclable - producer->inactive.store(true, std::memory_order_release); - } - - static void implicit_producer_thread_exited_callback(void* userData) - { - auto producer = static_cast(userData); - auto queue = producer->parent; - queue->implicit_producer_thread_exited(producer); - } -#endif - - ////////////////////////////////// - // Utility functions - ////////////////////////////////// - - template - static inline void* aligned_malloc(size_t size) - { - MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= std::alignment_of::value) - return (Traits::malloc)(size); - else - { - size_t alignment = std::alignment_of::value; - void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); - if (!raw) - return nullptr; - char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); - *(reinterpret_cast(ptr) - 1) = raw; - return ptr; - } - } - - template - static inline void aligned_free(void* ptr) - { - MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= std::alignment_of::value) - return (Traits::free)(ptr); - else(Traits::free)(ptr ? 
*(reinterpret_cast<void**>(ptr) - 1) : nullptr);
-    }
-
-    template <typename U>
-    static inline U* create_array(size_t count)
-    {
-        assert(count > 0);
-        U* p = static_cast<U*>(aligned_malloc<U>(sizeof(U) * count));
-        if (p == nullptr)
-            return nullptr;
-
-        for (size_t i = 0; i != count; ++i)
-            new (p + i) U();
-        return p;
-    }
-
-    template <typename U>
-    static inline void destroy_array(U* p, size_t count)
-    {
-        if (p != nullptr) {
-            assert(count > 0);
-            for (size_t i = count; i != 0;)
-                (p + --i)->~U();
-        }
-        aligned_free<U>(p);
-    }
-
-    template <typename U>
-    static inline U* create()
-    {
-        void* p = aligned_malloc<U>(sizeof(U));
-        return p != nullptr ? new (p) U : nullptr;
-    }
-
-    template <typename U, typename A1>
-    static inline U* create(A1&& a1)
-    {
-        void* p = aligned_malloc<U>(sizeof(U));
-        return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
-    }
-
-    template <typename U>
-    static inline void destroy(U* p)
-    {
-        if (p != nullptr)
-            p->~U();
-        aligned_free<U>(p);
-    }
+        debug::DebugLock lock(implicitProdMutex);
+#endif
+        auto hash = implicitProducerHash.load(std::memory_order_acquire);
+        assert(hash != nullptr);  // The thread exit listener is only registered if we were added to a hash in the first place
+        auto id = details::thread_id();
+        auto hashedId = details::hash_thread_id(id);
+        details::thread_id_t probedKey;
+
+        // We need to traverse all the hashes just in case other threads aren't on the current one yet and are
+        // trying to add an entry thinking there's a free slot (because they reused a producer)
+        for (; hash != nullptr; hash = hash->prev) {
+            auto index = hashedId;
+            do {
+                index &= hash->capacity - 1u;
+                probedKey = id;
+                if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+                    break;
+                }
+                ++index;
+            } while (probedKey != details::invalid_thread_id);  // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place
+        }
+
+        // Mark the queue as being recyclable
+        producer->inactive.store(true, std::memory_order_release);
+    }
+
+    static void implicit_producer_thread_exited_callback(void* userData)
+    {
+        auto producer = static_cast<ImplicitProducer*>(userData);
+        auto queue = producer->parent;
+        queue->implicit_producer_thread_exited(producer);
+    }
+#endif
+
+    //////////////////////////////////
+    // Utility functions
+    //////////////////////////////////
+
+    template<typename TAlign>
+    static inline void* aligned_malloc(size_t size)
+    {
+        MOODYCAMEL_CONSTEXPR_IF (std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value)
+            return (Traits::malloc)(size);
+        else {
+            size_t alignment = std::alignment_of<TAlign>::value;
+            void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*));
+            if (!raw)
+                return nullptr;
+            char* ptr = details::align_for<TAlign>(reinterpret_cast<char*>(raw) + sizeof(void*));
+            *(reinterpret_cast<void**>(ptr) - 1) = raw;
+            return ptr;
+        }
+    }
+
+    template<typename TAlign>
+    static inline void aligned_free(void* ptr)
+    {
+        MOODYCAMEL_CONSTEXPR_IF (std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value)
+            return (Traits::free)(ptr);
+        else
+            (Traits::free)(ptr ? 
*(reinterpret_cast<void**>(ptr) - 1) : nullptr);
+    }
+
+    template<typename U>
+    static inline U* create_array(size_t count)
+    {
+        assert(count > 0);
+        U* p = static_cast<U*>(aligned_malloc<U>(sizeof(U) * count));
+        if (p == nullptr)
+            return nullptr;
+
+        for (size_t i = 0; i != count; ++i)
+            new (p + i) U();
+        return p;
+    }
+
+    template<typename U>
+    static inline void destroy_array(U* p, size_t count)
+    {
+        if (p != nullptr) {
+            assert(count > 0);
+            for (size_t i = count; i != 0; )
+                (p + --i)->~U();
+        }
+        aligned_free<U>(p);
+    }
+
+    template<typename U>
+    static inline U* create()
+    {
+        void* p = aligned_malloc<U>(sizeof(U));
+        return p != nullptr ? new (p) U : nullptr;
+    }
+
+    template<typename U, typename A1>
+    static inline U* create(A1&& a1)
+    {
+        void* p = aligned_malloc<U>(sizeof(U));
+        return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
+    }
+
+    template<typename U>
+    static inline void destroy(U* p)
+    {
+        if (p != nullptr)
+            p->~U();
+        aligned_free<U>(p);
+    }

private:
-    std::atomic<ProducerBase*> producerListTail;
-    std::atomic<std::uint32_t> producerCount;
-
-    std::atomic<size_t> initialBlockPoolIndex;
-    Block* initialBlockPool;
-    size_t initialBlockPoolSize;
-
+    std::atomic<ProducerBase*> producerListTail;
+    std::atomic<std::uint32_t> producerCount;
+
+    std::atomic<size_t> initialBlockPoolIndex;
+    Block* initialBlockPool;
+    size_t initialBlockPoolSize;
+
#ifndef MCDBGQ_USEDEBUGFREELIST
-    FreeList<Block> freeList;
+    FreeList<Block> freeList;
#else
-    debug::DebugFreeList<Block> freeList;
-#endif
-
-    std::atomic<ImplicitProducerHash*> implicitProducerHash;
-    std::atomic<size_t> implicitProducerHashCount;  // Number of slots logically used
-    ImplicitProducerHash initialImplicitProducerHash;
-    std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
-    std::atomic_flag implicitProducerHashResizeInProgress;
-
-    std::atomic<std::uint32_t> nextExplicitConsumerId;
-    std::atomic<std::uint32_t> globalExplicitConsumerOffset;
-
+    debug::DebugFreeList<Block> freeList;
+#endif
+
+    std::atomic<ImplicitProducerHash*> implicitProducerHash;
+    std::atomic<size_t> implicitProducerHashCount;  // Number of slots logically used
+    ImplicitProducerHash initialImplicitProducerHash;
+    std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
+    std::atomic_flag implicitProducerHashResizeInProgress;
+
+    std::atomic<std::uint32_t> nextExplicitConsumerId;
+    std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
-    debug::DebugMutex implicitProdMutex;
+    debug::DebugMutex implicitProdMutex;
#endif
-
+
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
-    std::atomic<ExplicitProducer*> explicitProducers;
-    std::atomic<ImplicitProducer*> implicitProducers;
+    std::atomic<ExplicitProducer*> explicitProducers;
+    std::atomic<ImplicitProducer*> implicitProducers;
#endif
};

-template <typename T, typename Traits>
-ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue): producer(queue.recycle_or_create_producer(true))
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
+    : producer(queue.recycle_or_create_producer(true))
{
-    if (producer != nullptr) {
-        producer->token = this;
-    }
+    if (producer != nullptr) {
+        producer->token = this;
+    }
}

-template <typename T, typename Traits>
+template<typename T, typename Traits>
ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
-    : producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true))
+    : producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true))
{
-    if (producer != nullptr) {
-        producer->token = this;
-    }
+    if (producer != nullptr) {
+        producer->token = this;
+    }
}

-template <typename T, typename Traits>
+template<typename T, typename Traits>
ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
-    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
{
-    initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
-    lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
+    initialOffset = queue.nextExplicitConsumerId.fetch_add(1, 
std::memory_order_release);
+    lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
}

-template <typename T, typename Traits>
+template<typename T, typename Traits>
ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
-    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+    : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
{
-    initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(
-        1, std::memory_order_release);
-    lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
+    initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+    lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
}

-template <typename T, typename Traits>
+template<typename T, typename Traits>
inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
{
-    a.swap(b);
+    a.swap(b);
}

inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
{
-    a.swap(b);
+    a.swap(b);
}

inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
{
-    a.swap(b);
+    a.swap(b);
}

-template <typename T, typename Traits>
-inline void swap(
-    typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
-    typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
{
-    a.swap(b);
+    a.swap(b);
}

-}  // namespace moodycamel
+}

#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
#pragma warning(pop)
diff --git a/scripts/lint.sh b/scripts/lint.sh
deleted file mode 100755
index 273d70043..000000000
--- a/scripts/lint.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-set -euxo pipefail
-
-black .
-pflake8 .
-mypy --install-types .
-
-# disable tracing -- too verbose
-set +x
-# find all .h, .hpp, and .cpp files excepting third party files
-files=$(find ./scaler -path './scaler/io/ymq/third_party' -prune -o \( -name '*.cpp' -o -name '*.h' -o -name '*.hpp' \) -print0 | xargs -0)
-set -x
-
-echo "running clang format on $(wc -w <<< $files) files"
-clang-format -i -style file -- $files
diff --git a/tests/cc_ymq/common.h b/tests/cc_ymq/common.h
index bafe5d907..b361f76df 100644
--- a/tests/cc_ymq/common.h
+++ b/tests/cc_ymq/common.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <filesystem>
+#include <string>
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 #include
@@ -421,6 +423,7 @@ inline TestResult test(
     return TestResult::Success;
 }
 
+// path is relative to the directory of this source file
 inline TestResult run_python(const char* path, std::vector<std::string> argv = {})
 {
     // insert the pid at the start of the argv, this is important for signalling readiness
@@ -449,7 +452,7 @@ inline TestResult run_python(const char* path, std::vector<std::string> argv
     if (!file)
         throw std::system_error(errno, std::generic_category(), "failed to open python file");
 
-    PyRun_SimpleFile(file, path);
+    PyRun_SimpleFile(file, full_path.c_str());
     fclose(file);
 }
diff --git a/tests/cc_ymq/py_mitm/core.py b/tests/cc_ymq/py_mitm/core.py
deleted file mode 100644
index 4a22ee01a..000000000
--- a/tests/cc_ymq/py_mitm/core.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""
-This is the common code for implementing man in the middle in Python
-"""
-
-import dataclasses
-from typing import Protocol
-from scapy.all import TunTapInterface, IP, TCP  # type: ignore
-
-
-@dataclasses.dataclass
-class TCPConnection:
-    """
-    Represents a TCP connection over the TUNTAP interface
-    local_ip and local_port are the mitm's ip and port, and
-    remote_ip and remote_port are the port for the remote peer
-    """
-
-    local_ip: str
-    local_port: int
-    remote_ip: str
-    remote_port: int
-
-    def rewrite(self, pkt: IP, ack: int | None = 
-        """
-        Rewrite a TCP/IP packet as a packet originating
-        from (local_ip, local_port) and going to (remote_ip, remote_port)
-        This function is useful for taking a packet received from one connection, and redirecting it to another
-
-        Args:
-            pkt: A scapy TCP/IP packet to rewrite
-            ack: An optional ack number to use instead of the one found in `pkt`
-            data: An optional payload to use instead of the one found int `pkt`
-
-        Returns:
-            The rewritten packet, suitable for sending over TUNTAP
-        """
-        tcp = pkt[TCP]
-
-        return (
-            IP(src=self.local_ip, dst=self.remote_ip)
-            / TCP(sport=self.local_port, dport=self.remote_port, flags=tcp.flags, seq=tcp.seq, ack=ack or tcp.ack)
-            / bytes(data or tcp.payload)
-        )
-
-
-class MITMProtocol(Protocol):
-    def proxy(
-        self,
-        tuntap: TunTapInterface,
-        pkt: IP,
-        sender: TCPConnection,
-        client_conn: TCPConnection | None,
-        server_conn: TCPConnection,
-    ) -> None: ...
diff --git a/tests/cc_ymq/py_mitm/drop.py b/tests/cc_ymq/py_mitm/drop.py
deleted file mode 100644
index 520e5bf70..000000000
--- a/tests/cc_ymq/py_mitm/drop.py
+++ /dev/null
@@ -1,28 +0,0 @@
-"""
-This MITM drops a % of packets
-"""
-
-import random
-from core import MITMProtocol, TunTapInterface, IP, TCPConnection
-
-
-class MITM(MITMProtocol):
-    def __init__(self, drop_pcent: str):
-        self.drop_pcent = float(drop_pcent)
-
-    def proxy(
-        self,
-        tuntap: TunTapInterface,
-        pkt: IP,
-        sender: TCPConnection,
-        client_conn: TCPConnection | None,
-        server_conn: TCPConnection,
-    ) -> None:
-        if random.random() < self.drop_pcent:
-            print("[!] Dropping packet")
-            return
-
-        if sender == client_conn:
-            tuntap.send(server_conn.rewrite(pkt))
-        elif sender == server_conn:
-            tuntap.send(client_conn.rewrite(pkt))
diff --git a/tests/cc_ymq/py_mitm/main.py b/tests/cc_ymq/py_mitm/main.py
index edaeba569..6a97faafd 100644
--- a/tests/cc_ymq/py_mitm/main.py
+++ b/tests/cc_ymq/py_mitm/main.py
@@ -100,6 +100,9 @@ def main(pid: int, mitm_ip: str, mitm_port: int, remote_ip: str, server_port: in
         # and the source ip and port will be the remote ip and port
         sender = TCPConnection(pkt.dst, pkt.dport, pkt.src, pkt.sport)
 
+        if not mitm.proxy(tuntap, pkt, sender, client_conn, server_conn):
+            continue  # the segment was not proxied, so we can't update our internal state
+
         if sender == client_conn:
             print(f"-> [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}")
         elif sender == server_conn:
@@ -126,8 +129,6 @@ def main(pid: int, mitm_ip: str, mitm_port: int, remote_ip: str, server_port: in
         if sender == server_conn and client_sent_fin_ack:
             client_closed = True
 
-        mitm.proxy(tuntap, pkt, sender, client_conn, server_conn)
-
         if client_closed and server_closed:
             print("[*] Both connections closed")
             return
diff --git a/tests/cc_ymq/py_mitm/passthrough.py b/tests/cc_ymq/py_mitm/passthrough.py
index 20d8a9069..1dc825928 100644
--- a/tests/cc_ymq/py_mitm/passthrough.py
+++ b/tests/cc_ymq/py_mitm/passthrough.py
@@ -16,8 +16,9 @@ def proxy(
         sender: TCPConnection,
         client_conn: TCPConnection | None,
         server_conn: TCPConnection,
-    ) -> None:
-        if sender == client_conn:
+    ) -> bool:
+        if sender == client_conn or client_conn is None:
             tuntap.send(server_conn.rewrite(pkt))
         elif sender == server_conn:
             tuntap.send(client_conn.rewrite(pkt))
+        return True
diff --git a/tests/cc_ymq/py_mitm/randomly_drop_packets.py b/tests/cc_ymq/py_mitm/randomly_drop_packets.py
index a197ac3c8..d84e57ee6 100644
--- a/tests/cc_ymq/py_mitm/randomly_drop_packets.py
+++ b/tests/cc_ymq/py_mitm/randomly_drop_packets.py
@@ -17,12 +17,13 @@ def proxy(
         sender: TCPConnection,
         client_conn: TCPConnection | None,
         server_conn: TCPConnection,
-    ) -> None:
+    ) -> bool:
         if random.random() < self.drop_pcent:
             print("[!] Dropping packet")
-            return
+            return False
 
         if sender == client_conn:
             tuntap.send(server_conn.rewrite(pkt))
         elif sender == server_conn:
             tuntap.send(client_conn.rewrite(pkt))
+        return True
diff --git a/tests/cc_ymq/py_mitm/rst.py b/tests/cc_ymq/py_mitm/rst.py
deleted file mode 100644
index be301d5b0..000000000
--- a/tests/cc_ymq/py_mitm/rst.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""
-This MITM inserts an unexpected TCP RST
-"""
-
-from core import IP, TCP, MITMProtocol, TCPConnection, TunTapInterface
-
-
-class MITM(MITMProtocol):
-    def __init__(self):
-        # count the number of psh-acks sent by the client
-        self.client_pshack_counter = 0
-
-    def proxy(
-        self,
-        tuntap: TunTapInterface,
-        pkt: IP,
-        sender: TCPConnection,
-        client_conn: TCPConnection | None,
-        server_conn: TCPConnection,
-    ) -> None:
-        if sender == client_conn or client_conn is None:
-            if pkt[TCP].flags == "PA":
-                self.client_pshack_counter += 1
-
-                # on the second psh-ack, send a rst instead
-                if self.client_pshack_counter == 2:
-                    rst_pkt = IP(src=client_conn.local_ip, dst=client_conn.remote_ip) / TCP(
-                        sport=client_conn.local_port, dport=client_conn.remote_port, flags="R", seq=pkt[TCP].ack
-                    )
-                    print(f"<- [{rst_pkt[TCP].flags}] (simulated)")
-                    tuntap.send(rst_pkt)
-                    return
-
-            tuntap.send(server_conn.rewrite(pkt))
-        elif sender == server_conn:
-            tuntap.send(client_conn.rewrite(pkt))
-
-
-# client -> mitm -> server
-# server -> mitm -> client
-
-# client: 127.0.0.1:8080
-# mitm: 127.0.0.1:8081
-# server: 127.0.0.1:8081
-
-
-# client -> mitm == src = client.ip, sport = client.port ;; dst = mitm.ip, dport = mitm.port
-# mitm -> server == src = mitm.ip, sport = mitm.port ;; dst = server.ip, dport = server.port
diff --git a/tests/cc_ymq/py_mitm/runner.py b/tests/cc_ymq/py_mitm/runner.py
deleted file mode 100644
index f56ccb9f9..000000000
--- a/tests/cc_ymq/py_mitm/runner.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# flake8: noqa: E402
-
-"""
-This script provides a framework for running MITM test cases
-
-This script accepts 5 arguments in the following order:
-    1. pid: the pid of the test process, used for signaling
-    2. testcase: the MITM test case. \
-        this loads `from .testcase import MITM` where `MITM` is a class implementing `MITMProtocol`
-    3. mitm_ip: an ipv4 address for the mitm server
-    4. mitm_port: the port used to connect to the remote server
-    5. server_ip: the desired ip of the remote side of the TUNTAP interface
-    6. server_port: the port of the remote server
-    7. *args: Additional args, if any are passed to the constructor: `MITM(*args)`
-
-See the documentation on `main` for more
-"""
-import os
-import sys
-
-# add the script's directory to path
-sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-
-import importlib
-import signal
-import subprocess
-
-from core import MITMProtocol, TCPConnection
-from scapy.all import IP, TCP, TunTapInterface  # type: ignore
-
-
-def echo_call(cmd: list[str]):
-    print(f"+ {' '.join(cmd)}")
-    subprocess.check_call(cmd)
-
-
-def create_tuntap_interface(iface_name: str, mitm_ip: str, remote_ip: str) -> TunTapInterface:
-    """
-    Creates a TUNTAP interface and sets brings it up and adds ips using the `ip` program
-
-    Args:
-        iface_name: The name of the TUNTAP interface, usually like `tun0`, `tun1`, etc.
-        mitm_ip: The desired ip address of the mitm. This is the ip that clients can use to connect to the mitm
-        remote_ip: The ip that routes to/from the tuntap interface.
-            packets sent to `mitm_ip` will appear to come from `remote_ip`,\
-            and conversely the tuntap interface can connect/send packets
-            to `remote_ip`, making it a suitable ip for binding a server
-
-    Returns:
-        The TUNTAP interface
-    """
-    iface = TunTapInterface(iface_name, mode="tun")
-
-    try:
-        echo_call(["sudo", "ip", "link", "set", iface_name, "up"])
-        echo_call(["sudo", "ip", "addr", "add", remote_ip, "peer", mitm_ip, "dev", iface_name])
-        print(f"[+] Interface {iface_name} up with IP {mitm_ip}")
-    except subprocess.CalledProcessError:
-        print("[!] Could not bring up interface. Run as root or set manually.")
-        raise
-
-    return iface
-
-
-def main(pid: int, mitm_ip: str, mitm_port: int, remote_ip: str, server_port: int, mitm: MITMProtocol):
-    """
-    This function serves as a framework for man in the middle implementations
-    A client connects to the MITM, then the MITM connects to a remote server
-    The MITM sits inbetween the client and the server, manipulating the packets sent depending on the test case
-    This function:
-        1. creates a TUNTAP interface and prepares it for MITM
-        2. handles connecting clients and handling connection closes
-        3. delegates additional logic to a pluggable callable, `mitm`
-        4. returns when both connections have terminated (via )
-
-    Args:
-        pid: this is the pid of the test process, used for signaling readiness \
-            we send SIGUSR1 to this process when the mitm is ready
-        mitm_ip: The desired ip address of the mitm server
-        mitm_port: The desired port of the mitm server. \
-            This is the port used to connect to the server, but the client is free to connect on any port
-        remote_ip: The desired remote ip for the TUNTAP interface. This is the only ip address \
-            reachable by the interface and is thus the src ip for clients, and the ip that the remote server \
-            must be bound to
-        server_port: The port that the remote server is bound to
-        mitm: The core logic for a MITM test case. This callable may maintain its own state and is responsible \
-            for sending packets over the TUNTAP interface (if it doesn't, nothing will happen)
-    """
-
-    tuntap = create_tuntap_interface("tun0", mitm_ip, remote_ip)
-
-    # signal the caller that the tuntap interface has been created
-    if pid > 0:
-        os.kill(pid, signal.SIGUSR1)
-
-    # these track information about our connections
-    # we already know what to expect for the server connection, we are the connector
-    client_conn = None
-    server_conn = TCPConnection(mitm_ip, mitm_port, remote_ip, server_port)
-
-    # tracks the state of each connection
-    client_sent_fin_ack = False
-    client_closed = False
-    server_sent_fin_ack = False
-    server_closed = False
-
-    while True:
-        pkt = tuntap.recv()
-        if not pkt.haslayer(IP) or not pkt.haslayer(TCP):
-            continue
-        ip = pkt[IP]
-        tcp = pkt[TCP]
-
-        # for a received packet, the destination ip and port are our local ip and port
-        # and the source ip and port will be the remote ip and port
-        sender = TCPConnection(pkt.dst, pkt.dport, pkt.src, pkt.sport)
-
-        if sender == client_conn:
-            print(f"-> [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}")
-        elif sender == server_conn:
-            print(f"<- [{tcp.flags}]{(': ' + str(bytes(tcp.payload))) if tcp.payload else ''}")
-
-        if tcp.flags == "S":  # SYN from client
-            print("-> [S]")
-            print(f"[*] New connection from {ip.src}:{tcp.sport} to {ip.dst}:{tcp.dport}")
-            client_conn = sender
-
-        if tcp.flags == "SA":  # SYN-ACK from server
-            if sender == server_conn:
-                print(f"[*] Connection to server established: {ip.src}:{tcp.sport} to {ip.dst}:{tcp.dport}")
-
-        if tcp.flags == "FA":  # FIN-ACK
-            if sender == client_conn:
-                client_sent_fin_ack = True
-            if sender == server_conn:
-                server_sent_fin_ack = True
-
-        if tcp.flags == "A":  # ACK
-            if sender == client_conn and server_sent_fin_ack:
-                server_closed = True
-            if sender == server_conn and client_sent_fin_ack:
-                client_closed = True
-
-        mitm.proxy(tuntap, pkt, sender, client_conn, server_conn)
-
-        if client_closed and server_closed:
-            print("[*] Both connections closed")
-            return
-
-
-if __name__ == "__main__":
-    # parse the ips, ports, and test case from the command line
-    pid, testcase, mitm_ip, mitm_port, remote_ip, server_port, *args = sys.argv[1:]
-
-    # load the module dynamically
-    module = importlib.import_module(testcase)
-    main(int(pid), mitm_ip, int(mitm_port), remote_ip, int(server_port), module.MITM(*args))
diff --git a/tests/cc_ymq/py_mitm/send_rst_to_client.py b/tests/cc_ymq/py_mitm/send_rst_to_client.py
index fc70355e5..2d3922105 100644
--- a/tests/cc_ymq/py_mitm/send_rst_to_client.py
+++ b/tests/cc_ymq/py_mitm/send_rst_to_client.py
@@ -17,7 +17,7 @@ def proxy(
         sender: TCPConnection,
         client_conn: TCPConnection | None,
         server_conn: TCPConnection,
-    ) -> None:
+    ) -> bool:
         if sender == client_conn or client_conn is None:
             if pkt[TCP].flags == "PA":
                 self.client_pshack_counter += 1
@@ -29,11 +29,12 @@ def proxy(
                     )
                     print(f"<- [{rst_pkt[TCP].flags}] (simulated)")
                     tuntap.send(rst_pkt)
-                    return
+                    return True
 
             tuntap.send(server_conn.rewrite(pkt))
         elif sender == server_conn:
             tuntap.send(client_conn.rewrite(pkt))
+        return True
 
 
 # client -> mitm -> server
diff --git a/tests/cc_ymq/py_mitm/types.py b/tests/cc_ymq/py_mitm/types.py
index 4a22ee01a..ae20fd053 100644
--- a/tests/cc_ymq/py_mitm/types.py
+++ b/tests/cc_ymq/py_mitm/types.py
@@ -51,4 +51,4 @@ def proxy(
         sender: TCPConnection,
         client_conn: TCPConnection | None,
         server_conn: TCPConnection,
-    ) -> None: ...
+    ) -> bool: ...
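
Reviewer note on the revised MITMProtocol contract: proxy() now returns a bool, True when the
segment was forwarded or deliberately replaced (as in send_rst_to_client.py) and False when it
was swallowed (as in randomly_drop_packets.py), and main.py only advances its SYN/FIN/ACK
connection tracking when proxy() reports success. A minimal sketch of a new test case under this
contract follows; the module name forward_everything.py is hypothetical, and the sibling-module
import is assumed to follow the style of the existing py_mitm test cases:

    # forward_everything.py -- hypothetical example, not part of this patch
    from types import IP, MITMProtocol, TCPConnection, TunTapInterface  # sibling py_mitm module

    class MITM(MITMProtocol):
        """Forwards every segment unchanged; a template for new py_mitm test cases."""

        def proxy(
            self,
            tuntap: TunTapInterface,
            pkt: IP,
            sender: TCPConnection,
            client_conn: TCPConnection | None,
            server_conn: TCPConnection,
        ) -> bool:
            # rewrite toward the server when the client (or a not-yet-seen peer)
            # sent the segment, and toward the client otherwise
            if sender == client_conn or client_conn is None:
                tuntap.send(server_conn.rewrite(pkt))
            elif sender == server_conn:
                tuntap.send(client_conn.rewrite(pkt))
            # True tells main.py it may update its FIN/ACK state for this segment;
            # a dropping MITM would return False instead
            return True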