From 1c1a596b6ec7d36a76fae78d752dda8943f3c724 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Mon, 22 Apr 2024 22:14:32 +0000 Subject: [PATCH] update --- cmake/CMakeLists.txt | 5 +- cmake/deps.txt | 1 - .../external/onnxruntime_external_deps.cmake | 18 -- cmake/onnxruntime_mlas.cmake | 2 +- cmake/onnxruntime_providers_cann.cmake | 2 +- cmake/onnxruntime_providers_cuda.cmake | 2 - cmake/onnxruntime_providers_dnnl.cmake | 2 - cmake/onnxruntime_providers_migraphx.cmake | 2 +- cmake/onnxruntime_providers_rocm.cmake | 1 - cmake/onnxruntime_providers_tensorrt.cmake | 3 +- cmake/onnxruntime_unittests.cmake | 13 +- cmake/onnxruntime_webassembly.cmake | 4 +- include/onnxruntime/core/platform/Barrier.h | 69 +------ .../platform/EigenNonBlockingThreadPool.h | 40 ++-- include/onnxruntime/core/platform/ort_mutex.h | 186 +----------------- onnxruntime/contrib_ops/cuda/fused_conv.cc | 2 +- onnxruntime/contrib_ops/rocm/fused_conv.cc | 4 +- onnxruntime/core/common/logging/logging.cc | 4 +- onnxruntime/core/common/profiler.cc | 4 +- onnxruntime/core/common/threadpool.cc | 5 +- onnxruntime/core/framework/bfc_arena.cc | 16 +- .../core/framework/execution_providers.h | 1 + .../framework/kernel_type_str_resolver.cc | 2 +- .../core/framework/mem_pattern_planner.h | 8 +- .../framework/model_metadef_id_generator.cc | 2 +- onnxruntime/core/framework/random_generator.h | 4 +- onnxruntime/core/framework/session_state.cc | 10 +- onnxruntime/core/framework/tuning_context.h | 2 +- .../core/framework/tuning_context_impl.h | 14 +- onnxruntime/core/graph/schema_registry.cc | 2 +- onnxruntime/core/platform/posix/ort_mutex.cc | 42 ---- .../core/platform/windows/logging/etw_sink.cc | 14 +- .../core/platform/windows/telemetry.cc | 16 +- .../providers/cann/cann_execution_provider.cc | 2 +- .../core/providers/cpu/generator/random.cc | 10 +- .../providers/cpu/text/string_normalizer.cc | 1 + .../core/providers/cuda/cuda_allocator.cc | 4 +- .../providers/cuda/cuda_execution_provider.cc | 6 +- onnxruntime/core/providers/cuda/nn/conv.cc | 2 +- .../core/providers/cuda/nn/conv_transpose.cc | 2 +- .../providers/cuda/nvtx_profile_context.h | 4 +- .../core/providers/migraphx/hip_allocator.cc | 4 +- .../migraphx/migraphx_execution_provider.cc | 2 +- .../core/providers/qnn/builder/qnn_model.cc | 2 +- .../providers/qnn/qnn_execution_provider.cc | 8 +- onnxruntime/core/providers/rocm/nn/conv.cc | 2 +- .../core/providers/rocm/nn/conv_transpose.cc | 2 +- .../core/providers/rocm/rocm_allocator.cc | 4 +- .../providers/rocm/rocm_execution_provider.cc | 6 +- .../tensorrt/tensorrt_execution_provider.cc | 10 +- .../tensorrt_execution_provider_custom_ops.cc | 2 +- onnxruntime/core/session/inference_session.cc | 41 ++-- onnxruntime/core/session/inference_session.h | 2 +- onnxruntime/core/session/ort_env.cc | 6 +- onnxruntime/core/session/ort_env.h | 2 +- onnxruntime/test/onnx/TestCase.cc | 6 +- onnxruntime/test/onnx/TestResultStat.h | 12 +- .../test/perftest/performance_runner.cc | 16 +- .../test/perftest/performance_runner.h | 7 +- onnxruntime/test/platform/barrier_test.cc | 72 ------- onnxruntime/test/platform/threadpool_test.cc | 12 +- 61 files changed, 185 insertions(+), 566 deletions(-) delete mode 100644 onnxruntime/core/platform/posix/ort_mutex.cc delete mode 100644 onnxruntime/test/platform/barrier_test.cc diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 1795052953d8c..fd3ee3cdf96fa 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1015,9 +1015,7 @@ function(onnxruntime_set_compile_flags target_name) if 
(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") #external/protobuf/src/google/protobuf/arena.h:445:18: error: unused parameter 'p' target_compile_options(${target_name} PRIVATE "-Wno-unused-parameter") - endif() - target_compile_definitions(${target_name} PUBLIC -DNSYNC_ATOMIC_CPP11) - onnxruntime_add_include_to_target(${target_name} nsync::nsync_cpp) + endif() endif() foreach(ORT_FLAG ${ORT_PROVIDER_FLAGS}) target_compile_definitions(${target_name} PRIVATE ${ORT_FLAG}) @@ -1640,7 +1638,6 @@ if (WIN32) list(APPEND onnxruntime_EXTERNAL_LIBRARIES advapi32) endif() else() - list(APPEND onnxruntime_EXTERNAL_LIBRARIES nsync::nsync_cpp) list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${ICONV_LIB} ${CMAKE_DL_LIBS} Threads::Threads) endif() diff --git a/cmake/deps.txt b/cmake/deps.txt index d0f4551671681..57b78e5896be8 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -27,7 +27,6 @@ flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494 fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1 google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.8.3.zip;bf9870756ee3f8d2d3b346b24ee3600a41c74d3d -google_nsync;https://github.com/google/nsync/archive/refs/tags/1.26.0.zip;5e7c00ef6bf5b787386fc040067903ec774e2752 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034 googlexnnpack;https://github.com/google/XNNPACK/archive/0da379fc4808f9601faef392352018c741c0f297.zip;663883491e380b628e0a5b162b5f2658032fae73 json;https://github.com/nlohmann/json/archive/refs/tags/v3.10.5.zip;f257f8dc27c5b8c085dc887b40cddd18ae1f725c diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 8839dbc8fda4f..8fd7c21565c68 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -80,14 +80,6 @@ if (onnxruntime_BUILD_BENCHMARKS) ) endif() -if (NOT WIN32) - FetchContent_Declare( - google_nsync - URL ${DEP_URL_google_nsync} - URL_HASH SHA1=${DEP_SHA1_google_nsync} - FIND_PACKAGE_ARGS NAMES nsync - ) -endif() list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/external) FetchContent_Declare( @@ -340,16 +332,6 @@ if (onnxruntime_BUILD_BENCHMARKS) onnxruntime_fetchcontent_makeavailable(google_benchmark) endif() -if (NOT WIN32) - #nsync tests failed on Mac Build - set(NSYNC_ENABLE_TESTS OFF CACHE BOOL "" FORCE) - onnxruntime_fetchcontent_makeavailable(google_nsync) - if (google_nsync_SOURCE_DIR) - add_library(nsync::nsync_cpp ALIAS nsync_cpp) - target_include_directories(nsync_cpp PUBLIC ${google_nsync_SOURCE_DIR}/public) - endif() -endif() - if(onnxruntime_USE_CUDA) FetchContent_Declare( GSL diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index f7103c3b00a37..924d6ebdddbef 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -681,7 +681,7 @@ if (NOT onnxruntime_ORT_MINIMAL_BUILD) target_link_libraries(onnxruntime_mlas_q4dq PRIVATE cpuinfo) endif() if(NOT WIN32) - target_link_libraries(onnxruntime_mlas_q4dq PRIVATE nsync::nsync_cpp ${CMAKE_DL_LIBS}) + target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${CMAKE_DL_LIBS}) endif() if (CMAKE_SYSTEM_NAME STREQUAL "Android") 
target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${android_shared_libs}) diff --git a/cmake/onnxruntime_providers_cann.cmake b/cmake/onnxruntime_providers_cann.cmake index 0e26f7ee3a57b..2b82379ed66a9 100644 --- a/cmake/onnxruntime_providers_cann.cmake +++ b/cmake/onnxruntime_providers_cann.cmake @@ -21,7 +21,7 @@ onnxruntime_add_include_to_target(onnxruntime_providers_cann onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface) add_dependencies(onnxruntime_providers_cann onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) - target_link_libraries(onnxruntime_providers_cann PRIVATE ascendcl acl_op_compiler fmk_onnx_parser nsync::nsync_cpp ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED}) + target_link_libraries(onnxruntime_providers_cann PRIVATE ascendcl acl_op_compiler fmk_onnx_parser ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED}) target_link_directories(onnxruntime_providers_cann PRIVATE ${onnxruntime_CANN_HOME}/lib64) target_include_directories(onnxruntime_providers_cann PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${onnxruntime_CANN_HOME} ${onnxruntime_CANN_HOME}/include) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 1346a9ce968c6..6a9871e8824a7 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -265,10 +265,8 @@ if(APPLE) set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/cuda/exported_symbols.lst") - target_link_libraries(${target} PRIVATE nsync::nsync_cpp) elseif(UNIX) set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/cuda/version_script.lds -Xlinker --gc-sections") - target_link_libraries(${target} PRIVATE nsync::nsync_cpp) elseif(WIN32) set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/cuda/symbols.def") else() diff --git a/cmake/onnxruntime_providers_dnnl.cmake b/cmake/onnxruntime_providers_dnnl.cmake index f2965728524b7..9e5a7eed44fff 100644 --- a/cmake/onnxruntime_providers_dnnl.cmake +++ b/cmake/onnxruntime_providers_dnnl.cmake @@ -41,10 +41,8 @@ INSTALL_RPATH "@loader_path" BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH FALSE) - target_link_libraries(onnxruntime_providers_dnnl PRIVATE nsync::nsync_cpp) elseif(UNIX) set_property(TARGET onnxruntime_providers_dnnl APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/dnnl/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\$ORIGIN") - target_link_libraries(onnxruntime_providers_dnnl PRIVATE nsync::nsync_cpp) elseif(WIN32) set_property(TARGET onnxruntime_providers_dnnl APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/dnnl/symbols.def") else() diff --git a/cmake/onnxruntime_providers_migraphx.cmake b/cmake/onnxruntime_providers_migraphx.cmake index 91ac66a40721d..53fbde42bdfde 100644 --- a/cmake/onnxruntime_providers_migraphx.cmake +++ b/cmake/onnxruntime_providers_migraphx.cmake @@ -49,7 +49,7 @@ target_compile_options(onnxruntime_providers_migraphx PRIVATE -Wno-error=sign-compare) set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker 
--version-script=${ONNXRUNTIME_ROOT}/core/providers/migraphx/version_script.lds -Xlinker --gc-sections") - target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp stdc++fs) + target_link_libraries(onnxruntime_providers_migraphx PRIVATE stdc++fs) include(CheckLibraryExists) check_library_exists(migraphx::c "migraphx_program_run_async" "/opt/rocm/migraphx/lib" HAS_STREAM_SYNC) diff --git a/cmake/onnxruntime_providers_rocm.cmake b/cmake/onnxruntime_providers_rocm.cmake index b66268291579c..86ef4d6a0215d 100644 --- a/cmake/onnxruntime_providers_rocm.cmake +++ b/cmake/onnxruntime_providers_rocm.cmake @@ -208,7 +208,6 @@ if(UNIX) set_property(TARGET onnxruntime_providers_rocm APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/rocm/version_script.lds -Xlinker --gc-sections") - target_link_libraries(onnxruntime_providers_rocm PRIVATE nsync::nsync_cpp) else() message(FATAL_ERROR "onnxruntime_providers_rocm unknown platform, need to specify shared library exports for it") endif() diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index 15ffc29e79ff4..4f4567dcb9aef 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -131,11 +131,10 @@ if(APPLE) set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/tensorrt/exported_symbols.lst") - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE nsync::nsync_cpp) elseif(UNIX) set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/tensorrt/version_script.lds -Xlinker --gc-sections") - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE nsync::nsync_cpp stdc++fs) + target_link_libraries(onnxruntime_providers_tensorrt PRIVATE stdc++fs) elseif(WIN32) set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/tensorrt/symbols.def") else() diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 1b9a7c9b5163b..ee83c5d57b290 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -713,7 +713,6 @@ if(MSVC) else() target_compile_definitions(onnxruntime_test_utils PUBLIC -DNSYNC_ATOMIC_CPP11) target_include_directories(onnxruntime_test_utils PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}) - onnxruntime_add_include_to_target(onnxruntime_test_utils nsync::nsync_cpp) endif() if (onnxruntime_USE_NCCL) target_include_directories(onnxruntime_test_utils PRIVATE ${NCCL_INCLUDE_DIRS}) @@ -749,7 +748,6 @@ if(NOT IOS) else() target_compile_definitions(onnx_test_runner_common PUBLIC -DNSYNC_ATOMIC_CPP11) target_include_directories(onnx_test_runner_common PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}) - onnxruntime_add_include_to_target(onnx_test_runner_common nsync::nsync_cpp) endif() if (MSVC AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) #TODO: fix the warnings, they are dangerous @@ -1127,7 +1125,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) # "Global initializer calls a non-constexpr function." BENCHMARK_CAPTURE macro needs this. 
target_compile_options(onnxruntime_mlas_benchmark PRIVATE /wd26426) else() - target_link_libraries(onnxruntime_mlas_benchmark PRIVATE nsync::nsync_cpp ${CMAKE_DL_LIBS}) + target_link_libraries(onnxruntime_mlas_benchmark PRIVATE ${CMAKE_DL_LIBS}) endif() if (CPUINFO_SUPPORTED AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") target_link_libraries(onnxruntime_mlas_benchmark PRIVATE cpuinfo) @@ -1200,7 +1198,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) ${onnxruntime_EXTERNAL_LIBRARIES} ${GETOPT_LIB_WIDE} ${SYS_PATH_LIB} ${CMAKE_DL_LIBS}) if(NOT WIN32) - list(APPEND onnxruntime_perf_test_libs nsync::nsync_cpp) if(onnxruntime_USE_SNPE) list(APPEND onnxruntime_perf_test_libs onnxruntime_providers_snpe) endif() @@ -1238,7 +1235,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) # test inference using shared lib set(onnxruntime_shared_lib_test_LIBS onnxruntime_mocked_allocator onnxruntime_test_utils onnxruntime_common onnx_proto) if(NOT WIN32) - list(APPEND onnxruntime_shared_lib_test_LIBS nsync::nsync_cpp) if(onnxruntime_USE_SNPE) list(APPEND onnxruntime_shared_lib_test_LIBS onnxruntime_providers_snpe) endif() @@ -1383,7 +1379,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) target_link_libraries(onnxruntime_mlas_test PRIVATE cpuinfo) endif() if(NOT WIN32) - target_link_libraries(onnxruntime_mlas_test PRIVATE nsync::nsync_cpp ${CMAKE_DL_LIBS}) + target_link_libraries(onnxruntime_mlas_test PRIVATE ${CMAKE_DL_LIBS}) endif() if (CMAKE_SYSTEM_NAME STREQUAL "Android") target_link_libraries(onnxruntime_mlas_test PRIVATE ${android_shared_libs}) @@ -1556,9 +1552,6 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") ${ONNXRUNTIME_CUSTOM_OP_REGISTRATION_TEST_SRC_DIR}/test_registercustomops.cc) set(onnxruntime_customopregistration_test_LIBS custom_op_library onnxruntime_common onnxruntime_test_utils) - if (NOT WIN32) - list(APPEND onnxruntime_customopregistration_test_LIBS nsync::nsync_cpp) - endif() if (CPUINFO_SUPPORTED AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") list(APPEND onnxruntime_customopregistration_test_LIBS cpuinfo) endif() @@ -1683,7 +1676,7 @@ if (onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" set(onnxruntime_logging_apis_test_LIBS onnxruntime_common onnxruntime_test_utils) if(NOT WIN32) - list(APPEND onnxruntime_logging_apis_test_LIBS nsync::nsync_cpp ${CMAKE_DL_LIBS}) + list(APPEND onnxruntime_logging_apis_test_LIBS ${CMAKE_DL_LIBS}) endif() AddTest(DYN diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 3678dbac6937d..662a5e83fd85d 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -97,7 +97,7 @@ target_compile_options(onnx PRIVATE -Wno-unused-parameter -Wno-unused-variable) if (onnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB) bundle_static_library(onnxruntime_webassembly - nsync::nsync_cpp + ${PROTOBUF_LIB} onnx onnx_proto @@ -174,7 +174,7 @@ else() endif() target_link_libraries(onnxruntime_webassembly PRIVATE - nsync::nsync_cpp + ${PROTOBUF_LIB} onnx onnx_proto diff --git a/include/onnxruntime/core/platform/Barrier.h b/include/onnxruntime/core/platform/Barrier.h index 915cfc50953ed..16c367f4580fd 100644 --- a/include/onnxruntime/core/platform/Barrier.h +++ b/include/onnxruntime/core/platform/Barrier.h @@ -6,76 +6,15 @@ // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-
+#pragma once
 #include <assert.h>
 #include "core/common/spin_pause.h"
 #include "core/platform/ort_mutex.h"
-
-#include <mutex>
-#include <atomic>
+#include <absl/synchronization/mutex.h>
+#include <absl/synchronization/notification.h>
 
 namespace onnxruntime {
-class Barrier {
- public:
-  explicit Barrier(unsigned int count, bool spin = false)
-      : state_(count << 1), notified_(false), spin_(spin) {
-    assert(((count << 1) >> 1) == count);
-  }
-#ifdef NDEBUG
-  ~Barrier() = default;
-#else
-  ~Barrier() {
-    assert((state_ >> 1) == 0);
-  }
-#endif
-
-  void Notify(unsigned int c = 1) {
-    unsigned int delta = c << 1;
-    unsigned int v = state_.fetch_sub(delta, std::memory_order_acq_rel) - delta;
-    if (v != 1) {
-      // Clear the lowest bit (waiter flag) and check that the original state
-      // value was not zero. If it was zero, it means that notify was called
-      // more times than the original count.
-      assert(((v + delta) & ~1) != 0);
-      return;  // either count has not dropped to 0, or waiter is not waiting
-    }
-    std::unique_lock<OrtMutex> l(mu_);
-    assert(!notified_);
-    notified_ = true;
-    cv_.notify_all();
-  }
+using Notification = absl::Notification;
 
-  void Wait() {
-    if (spin_) {
-      while ((state_ >> 1) != 0) {
-        onnxruntime::concurrency::SpinPause();
-      }
-    } else {
-      unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
-      if ((v >> 1) == 0)
-        return;
-      std::unique_lock<OrtMutex> l(mu_);
-      while (!notified_) {
-        cv_.wait(l);
-      }
-    }
-  }
-
- private:
-  OrtMutex mu_;
-  OrtCondVar cv_;
-  std::atomic<unsigned int> state_;  // low bit is waiter flag
-  bool notified_;
-  const bool spin_;
-};
-
-// Notification is an object that allows a user to to wait for another
-// thread to signal a notification that an event has occurred.
-//
-// Multiple threads can wait on the same Notification object,
-// but only one caller must call Notify() on the object.
-struct Notification : Barrier {
-  Notification() : Barrier(1){};
-};
 }  // namespace onnxruntime
diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
index f9b694efb936f..eb849c5e97571 100644
--- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
+++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
@@ -460,7 +460,7 @@ class RunQueue {
 #ifdef USE_LOCK_FREE_QUEUE
     std::lock_guard<OrtSpinLock> mtx(spin_lock_);
 #else
-    std::lock_guard<OrtMutex> lock(mutex_);
+    absl::MutexLock lock(&mutex_);
 #endif
     unsigned back = back_.load(std::memory_order_relaxed);
     Elem& e = array_[(back - 1) & kMask];
@@ -484,7 +484,7 @@ class RunQueue {
 #ifdef USE_LOCK_FREE_QUEUE
     std::lock_guard<OrtSpinLock> mtx(spin_lock_);
 #else
-    std::lock_guard<OrtMutex> lock(mutex_);
+    absl::MutexLock lock(&mutex_);
 #endif
     unsigned back = back_.load(std::memory_order_relaxed);
     w_idx = (back - 1) & kMask;
@@ -509,7 +509,7 @@ class RunQueue {
 #ifdef USE_LOCK_FREE_QUEUE
     std::lock_guard<OrtSpinLock> mtx(spin_lock_);
 #else
-    std::lock_guard<OrtMutex> lock(mutex_);
+    absl::MutexLock lock(&mutex_);
 #endif
     unsigned back;
     Elem* e;
@@ -555,7 +555,7 @@ class RunQueue {
 #ifdef USE_LOCK_FREE_QUEUE
     std::lock_guard<OrtSpinLock> mtx(spin_lock_);
 #else
-    std::lock_guard<OrtMutex> lock(mutex_);
+    absl::MutexLock lock(&mutex_);
 #endif
     Elem& e = array_[w_idx];
     ElemState s = e.state.load(std::memory_order_relaxed);
@@ -1440,17 +1440,22 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
     ThreadStatus seen = GetStatus();
     if (seen == ThreadStatus::Blocking ||
         seen == ThreadStatus::Blocked) {
-      std::unique_lock<OrtMutex> lk(mutex);
-      // Blocking state exists only transiently during the SetBlock() method
-      // while holding the lock. We may observe it at the start of this
-      // function, but after acquiring the lock then the target thread
-      // will either be blocked or not.
-      seen = status.load(std::memory_order_relaxed);
-      assert(seen != ThreadStatus::Blocking);
-      if (seen == ThreadStatus::Blocked) {
-        status.store(ThreadStatus::Waking, std::memory_order_relaxed);
-        lk.unlock();
-        cv.notify_one();
+      bool sig = false;
+      {
+        absl::MutexLock lk(&mutex);
+        // Blocking state exists only transiently during the SetBlock() method
+        // while holding the lock. We may observe it at the start of this
+        // function, but after acquiring the lock then the target thread
+        // will either be blocked or not.
+        seen = status.load(std::memory_order_relaxed);
+        assert(seen != ThreadStatus::Blocking);
+        if (seen == ThreadStatus::Blocked) {
+          status.store(ThreadStatus::Waking, std::memory_order_relaxed);
+          sig = true;
+        }
+      }
+      if (sig) {
+        cv.Signal();
       }
     }
   }
@@ -1470,17 +1475,18 @@
 
   void SetBlocked(std::function<bool()> should_block,
                   std::function<void()> post_block) {
-    std::unique_lock<OrtMutex> lk(mutex);
+    mutex.Lock();
     assert(GetStatus() == ThreadStatus::Spinning);
     status.store(ThreadStatus::Blocking, std::memory_order_relaxed);
     if (should_block()) {
       status.store(ThreadStatus::Blocked, std::memory_order_relaxed);
       do {
-        cv.wait(lk);
+        cv.Wait(&mutex);
      } while (status.load(std::memory_order_relaxed) == ThreadStatus::Blocked);
      post_block();
     }
     status.store(ThreadStatus::Spinning, std::memory_order_relaxed);
+    mutex.Unlock();
   }
 
  private:
diff --git a/include/onnxruntime/core/platform/ort_mutex.h b/include/onnxruntime/core/platform/ort_mutex.h
index e24665f51423d..d128aaea3622a 100644
--- a/include/onnxruntime/core/platform/ort_mutex.h
+++ b/include/onnxruntime/core/platform/ort_mutex.h
@@ -2,188 +2,8 @@
 // Licensed under the MIT License.
 
 #pragma once
-#ifdef _WIN32
-#include <Windows.h>
-#include <mutex>
+#include <absl/synchronization/mutex.h>
 namespace onnxruntime {
-// Q: Why OrtMutex is better than std::mutex
-// A: OrtMutex supports static initialization but std::mutex doesn't. Static initialization helps us prevent the "static
-// initialization order problem".
-
-// Q: Why std::mutex can't make it?
-// A: VC runtime has to support Windows XP at ABI level. But we don't have such requirement.
-
-// Q: Is OrtMutex faster than std::mutex?
-// A: Sure
-
-class OrtMutex {
- private:
-  SRWLOCK data_ = SRWLOCK_INIT;
-
- public:
-  constexpr OrtMutex() = default;
-  // SRW locks do not need to be explicitly destroyed.
-  ~OrtMutex() = default;
-  OrtMutex(const OrtMutex&) = delete;
-  OrtMutex& operator=(const OrtMutex&) = delete;
-  void lock() { AcquireSRWLockExclusive(native_handle()); }
-  bool try_lock() noexcept { return TryAcquireSRWLockExclusive(native_handle()) == TRUE; }
-  void unlock() noexcept { ReleaseSRWLockExclusive(native_handle()); }
-  using native_handle_type = SRWLOCK*;
-
-  __forceinline native_handle_type native_handle() { return &data_; }
-};
-
-class OrtCondVar {
-  CONDITION_VARIABLE native_cv_object = CONDITION_VARIABLE_INIT;
-
- public:
-  constexpr OrtCondVar() noexcept = default;
-  ~OrtCondVar() = default;
-
-  OrtCondVar(const OrtCondVar&) = delete;
-  OrtCondVar& operator=(const OrtCondVar&) = delete;
-
-  void notify_one() noexcept { WakeConditionVariable(&native_cv_object); }
-  void notify_all() noexcept { WakeAllConditionVariable(&native_cv_object); }
-
-  void wait(std::unique_lock<OrtMutex>& lk) {
-    if (SleepConditionVariableSRW(&native_cv_object, lk.mutex()->native_handle(), INFINITE, 0) != TRUE) {
-      std::terminate();
-    }
-  }
-  template <class _Predicate>
-  void wait(std::unique_lock<OrtMutex>& __lk, _Predicate __pred);
-
-  /**
-   * returns cv_status::timeout if the wait terminates when Rel_time has elapsed. Otherwise, the method returns
-   * cv_status::no_timeout.
-   * @param cond_mutex A unique_lock<OrtMutex> object.
-   * @param rel_time A chrono::duration object that specifies the amount of time before the thread wakes up.
-   * @return returns cv_status::timeout if the wait terminates when Rel_time has elapsed. Otherwise, the method returns
-   * cv_status::no_timeout
-   */
-  template <class Rep, class Period>
-  std::cv_status wait_for(std::unique_lock<OrtMutex>& cond_mutex, const std::chrono::duration<Rep, Period>& rel_time);
-  using native_handle_type = CONDITION_VARIABLE*;
-
-  native_handle_type native_handle() { return &native_cv_object; }
-
- private:
-  void timed_wait_impl(std::unique_lock<OrtMutex>& __lk,
-                       std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>);
-};
-
-template <class _Predicate>
-void OrtCondVar::wait(std::unique_lock<OrtMutex>& __lk, _Predicate __pred) {
-  while (!__pred()) wait(__lk);
-}
-
-template <class Rep, class Period>
-std::cv_status OrtCondVar::wait_for(std::unique_lock<OrtMutex>& cond_mutex,
-                                    const std::chrono::duration<Rep, Period>& rel_time) {
-  // TODO: is it possible to use nsync_from_time_point_ ?
-  using namespace std::chrono;
-  if (rel_time <= duration<Rep, Period>::zero())
-    return std::cv_status::timeout;
-  using SystemTimePointFloat = time_point<system_clock, duration<long double, std::nano> >;
-  using SystemTimePoint = time_point<system_clock>;
-  SystemTimePointFloat max_time = SystemTimePoint::max();
-  steady_clock::time_point steady_now = steady_clock::now();
-  system_clock::time_point system_now = system_clock::now();
-  if (max_time - rel_time > system_now) {
-    nanoseconds remain = duration_cast<nanoseconds>(rel_time);
-    if (remain < rel_time)
-      ++remain;
-    timed_wait_impl(cond_mutex, system_now + remain);
-  } else
-    timed_wait_impl(cond_mutex, SystemTimePoint::max());
-  return steady_clock::now() - steady_now < rel_time ? std::cv_status::no_timeout : std::cv_status::timeout;
-}
-}  // namespace onnxruntime
-#else
-#include "nsync.h"
-#include <mutex>               //for unique_lock
-#include <condition_variable>  //for cv_status
-namespace onnxruntime {
-
-class OrtMutex {
-  nsync::nsync_mu data_ = NSYNC_MU_INIT;
-
- public:
-  constexpr OrtMutex() = default;
-  ~OrtMutex() = default;
-  OrtMutex(const OrtMutex&) = delete;
-  OrtMutex& operator=(const OrtMutex&) = delete;
-
-  void lock() { nsync::nsync_mu_lock(&data_); }
-  bool try_lock() noexcept { return nsync::nsync_mu_trylock(&data_) == 0; }
-  void unlock() noexcept { nsync::nsync_mu_unlock(&data_); }
-
-  using native_handle_type = nsync::nsync_mu*;
-  native_handle_type native_handle() { return &data_; }
-};
-
-class OrtCondVar {
-  nsync::nsync_cv native_cv_object = NSYNC_CV_INIT;
-
- public:
-  constexpr OrtCondVar() noexcept = default;
-
-  ~OrtCondVar() = default;
-  OrtCondVar(const OrtCondVar&) = delete;
-  OrtCondVar& operator=(const OrtCondVar&) = delete;
-
-  void notify_one() noexcept { nsync::nsync_cv_signal(&native_cv_object); }
-  void notify_all() noexcept { nsync::nsync_cv_broadcast(&native_cv_object); }
-
-  void wait(std::unique_lock<OrtMutex>& lk);
-  template <class _Predicate>
-  void wait(std::unique_lock<OrtMutex>& __lk, _Predicate __pred);
-
-  /**
-   * returns cv_status::timeout if the wait terminates when Rel_time has elapsed. Otherwise, the method returns
-   * cv_status::no_timeout.
-   * @param cond_mutex A unique_lock<OrtMutex> object.
-   * @param rel_time A chrono::duration object that specifies the amount of time before the thread wakes up.
-   * @return returns cv_status::timeout if the wait terminates when Rel_time has elapsed. Otherwise, the method returns
-   * cv_status::no_timeout
-   */
-  template <class Rep, class Period>
-  std::cv_status wait_for(std::unique_lock<OrtMutex>& cond_mutex, const std::chrono::duration<Rep, Period>& rel_time);
-  using native_handle_type = nsync::nsync_cv*;
-  native_handle_type native_handle() { return &native_cv_object; }
-
- private:
-  void timed_wait_impl(std::unique_lock<OrtMutex>& __lk,
-                       std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>);
-};
-
-template <class _Predicate>
-void OrtCondVar::wait(std::unique_lock<OrtMutex>& __lk, _Predicate __pred) {
-  while (!__pred()) wait(__lk);
-}
-
-template <class Rep, class Period>
-std::cv_status OrtCondVar::wait_for(std::unique_lock<OrtMutex>& cond_mutex,
-                                    const std::chrono::duration<Rep, Period>& rel_time) {
-  // TODO: is it possible to use nsync_from_time_point_ ?
-  using namespace std::chrono;
-  if (rel_time <= duration<Rep, Period>::zero())
-    return std::cv_status::timeout;
-  using SystemTimePointFloat = time_point<system_clock, duration<long double, std::nano> >;
-  using SystemTimePoint = time_point<system_clock>;
-  SystemTimePointFloat max_time = SystemTimePoint::max();
-  steady_clock::time_point steady_now = steady_clock::now();
-  system_clock::time_point system_now = system_clock::now();
-  if (max_time - rel_time > system_now) {
-    nanoseconds remain = duration_cast<nanoseconds>(rel_time);
-    if (remain < rel_time)
-      ++remain;
-    timed_wait_impl(cond_mutex, system_now + remain);
-  } else
-    timed_wait_impl(cond_mutex, SystemTimePoint::max());
-  return steady_clock::now() - steady_now < rel_time ? std::cv_status::no_timeout : std::cv_status::timeout;
-}
+using OrtMutex = absl::Mutex;
+using OrtCondVar = absl::CondVar;
 };  // namespace onnxruntime
-#endif
diff --git a/onnxruntime/contrib_ops/cuda/fused_conv.cc b/onnxruntime/contrib_ops/cuda/fused_conv.cc
index e126f8bcb3d11..c731ef942d58b 100644
--- a/onnxruntime/contrib_ops/cuda/fused_conv.cc
+++ b/onnxruntime/contrib_ops/cuda/fused_conv.cc
@@ -33,7 +33,7 @@ class FusedConv : public onnxruntime::cuda::Conv {
   }
 
   Status ComputeInternal(OpKernelContext* context) const override {
-    std::lock_guard<OrtMutex> lock(Base::s_.mutex);
+    absl::MutexLock lock(&Base::s_.mutex);
     auto cudnnHandle = this->GetCudnnHandle(context);
     ORT_RETURN_IF_ERROR(Base::UpdateState(context, true));
     if (Base::s_.Y->Shape().Size() == 0) {
diff --git a/onnxruntime/contrib_ops/rocm/fused_conv.cc b/onnxruntime/contrib_ops/rocm/fused_conv.cc
index 63804f79a32fb..0a326a3c71a50 100644
--- a/onnxruntime/contrib_ops/rocm/fused_conv.cc
+++ b/onnxruntime/contrib_ops/rocm/fused_conv.cc
@@ -144,7 +144,7 @@ class FusedConv : public onnxruntime::rocm::Conv {
   }
 
   Status ComputeInternal(OpKernelContext* context) const override {
-    std::lock_guard<OrtMutex> lock(Base::s_.mutex);
+    absl::MutexLock lock(&Base::s_.mutex);
     ORT_RETURN_IF_ERROR(Base::UpdateState(context, true));
 
     if (Base::s_.Y->Shape().Size() == 0) {
@@ -351,7 +351,7 @@ class FusedConv : public onnxruntime::rocm::Conv {
   FusionPlanCacheItem& FindOrCreateFusionPlanCache(HashKey key,
                                                    std::function factory) {
-    std::lock_guard<OrtMutex> lock(mutex);
+    absl::MutexLock lock(&mutex);
     auto iter = cache_directory_.find(key);
     if (iter == cache_directory_.end()) {
       cache_directory_[key].fusion = std::make_unique();
diff --git a/onnxruntime/core/common/logging/logging.cc b/onnxruntime/core/common/logging/logging.cc
index eac9a7fa08081..9b6e32a4b74aa 100644
--- a/onnxruntime/core/common/logging/logging.cc
+++ b/onnxruntime/core/common/logging/logging.cc
@@ -102,7 +102,7 @@ LoggingManager::LoggingManager(std::unique_ptr<ISink> sink, Severity default_min
 
   // lock mutex to create instance, and enable logging
   // this matches the mutex usage in Shutdown
-  std::lock_guard<OrtMutex> guard(DefaultLoggerMutex());
+  absl::MutexLock guard(&DefaultLoggerMutex());
 
   if (DefaultLoggerManagerInstance().load() != nullptr) {
     ORT_THROW("Only one instance of LoggingManager created with InstanceType::Default can exist at any point in time.");
@@ -122,7 +122,7 @@ LoggingManager::LoggingManager(std::unique_ptr<ISink> sink, Severity default_min
 LoggingManager::~LoggingManager() {
   if (owns_default_logger_) {
     // lock mutex to reset DefaultLoggerManagerInstance() and free default logger from this instance.
-    std::lock_guard<OrtMutex> guard(DefaultLoggerMutex());
+    absl::MutexLock guard(&DefaultLoggerMutex());
 #if ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
     DefaultLoggerManagerInstance().store(nullptr, std::memory_order_release);
 #else
diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc
index 71bca6ef3b582..856cc87e403b5 100644
--- a/onnxruntime/core/common/profiler.cc
+++ b/onnxruntime/core/common/profiler.cc
@@ -85,7 +85,7 @@ void Profiler::EndTimeAndRecordEvent(EventCategory category,
     custom_logger_->SendProfileEvent(event);
   } else {
     // TODO: sync_gpu if needed.
-    std::lock_guard<OrtMutex> lock(mutex_);
+    absl::MutexLock lock(&mutex_);
     if (events_.size() < max_num_events_) {
       events_.emplace_back(std::move(event));
     } else {
@@ -115,7 +115,7 @@ std::string Profiler::EndProfiling() {
     LOGS(*session_logger_, INFO) << "Writing profiler data to file " << profile_stream_file_;
   }
 
-  std::lock_guard<OrtMutex> lock(mutex_);
+  absl::MutexLock lock(&mutex_);
   profile_stream_ << "[\n";
 
   for (const auto& ep_profiler : ep_profilers_) {
diff --git a/onnxruntime/core/common/threadpool.cc b/onnxruntime/core/common/threadpool.cc
index 10e117267e14b..0abc50274933c 100644
--- a/onnxruntime/core/common/threadpool.cc
+++ b/onnxruntime/core/common/threadpool.cc
@@ -15,11 +15,14 @@ limitations under the License.
 
 #include
 #include
-
+#ifdef _WIN32
+#include <Windows.h>
+#endif
 #include "core/platform/threadpool.h"
 #include "core/common/common.h"
 #include "core/common/cpuid_info.h"
 #include "core/common/eigen_common_wrapper.h"
+
 #include "core/platform/EigenNonBlockingThreadPool.h"
 #include "core/platform/ort_mutex.h"
 #if !defined(ORT_MINIMAL_BUILD)
diff --git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc
index 13f9656ae0595..6a10437fdea8a 100644
--- a/onnxruntime/core/framework/bfc_arena.cc
+++ b/onnxruntime/core/framework/bfc_arena.cc
@@ -276,7 +276,7 @@ void* BFCArena::Reserve(size_t size) {
   if (size == 0)
     return nullptr;
 
-  std::lock_guard<OrtMutex> lock(lock_);
+  absl::MutexLock lock(&lock_);
   LOGS_DEFAULT(INFO) << "Reserving memory in BFCArena for " << device_allocator_->Info().name
                      << " size: " << size;
 
@@ -293,7 +293,7 @@
 }
 
 size_t BFCArena::RequestedSize(const void* ptr) {
-  std::lock_guard<OrtMutex> lock(lock_);
+  absl::MutexLock lock(&lock_);
   BFCArena::ChunkHandle h = region_manager_.get_handle(ptr);
   ORT_ENFORCE(h != kInvalidChunkHandle);
   BFCArena::Chunk* c = ChunkFromHandle(h);
@@ -301,7 +301,7 @@
 }
 
 size_t BFCArena::AllocatedSize(const void* ptr) {
-  std::lock_guard<OrtMutex> lock(lock_);
+  absl::MutexLock lock(&lock_);
   BFCArena::ChunkHandle h = region_manager_.get_handle(ptr);
   ORT_ENFORCE(h != kInvalidChunkHandle);
   BFCArena::Chunk* c = ChunkFromHandle(h);
@@ -325,7 +325,7 @@ void* BFCArena::AllocateRawInternal(size_t num_bytes,
   // The BFC allocator tries to find the best fit first.
   BinNum bin_num = BinNumForSize(rounded_bytes);
-  std::lock_guard<OrtMutex> lock(lock_);
+  absl::MutexLock lock(&lock_);
   // search for a valid chunk
   auto* chunk = FindChunkPtr(bin_num, rounded_bytes,
@@ -377,7 +377,7 @@ void* BFCArena::AllocateRawInternal(size_t num_bytes,
 }
 
 void BFCArena::GetStats(AllocatorStats* stats) {
-  std::lock_guard<OrtMutex> lock(lock_);
+  absl::MutexLock lock(&lock_);
   *stats = stats_;
 }
 
@@ -496,7 +496,7 @@ void BFCArena::Free(void* p) {
   if (p == nullptr) {
     return;
   }
-  std::lock_guard<OrtMutex> lock(lock_);
+  absl::MutexLock lock(&lock_);
   auto it = reserved_chunks_.find(p);
   if (it != reserved_chunks_.end()) {
     device_allocator_->Free(it->first);
@@ -509,7 +509,7 @@
 }
 
 Status BFCArena::Shrink() {
-  std::lock_guard<OrtMutex> lock(lock_);
+  absl::MutexLock lock(&lock_);
   auto num_regions = region_manager_.regions().size();
   std::vector<void*> region_ptrs;
   std::vector<size_t> region_sizes;
@@ -807,7 +807,7 @@ void BFCArena::DumpMemoryLog(size_t num_bytes) {
 }
 #ifdef ORT_ENABLE_STREAM
 void BFCArena::ResetChunkOnTargetStream(Stream* target_stream, bool coalesce_flag) {
-  std::lock_guard<OrtMutex> lock(lock_);
+  absl::MutexLock lock(&lock_);
 
   for (const auto& region : region_manager_.regions()) {
     ChunkHandle region_begin_chunk = region_manager_.get_handle(region.ptr());
diff --git a/onnxruntime/core/framework/execution_providers.h b/onnxruntime/core/framework/execution_providers.h
index dc45cad692b6e..ff9044997139b 100644
--- a/onnxruntime/core/framework/execution_providers.h
+++ b/onnxruntime/core/framework/execution_providers.h
@@ -13,6 +13,7 @@
 #include "core/common/logging/logging.h"
 #ifdef _WIN32
 #include
+#include
 #include
 #include "core/platform/tracing.h"
 #include "core/platform/windows/telemetry.h"
diff --git a/onnxruntime/core/framework/kernel_type_str_resolver.cc b/onnxruntime/core/framework/kernel_type_str_resolver.cc
index 732029d408c6b..0f4e4ffe6a415 100644
--- a/onnxruntime/core/framework/kernel_type_str_resolver.cc
+++ b/onnxruntime/core/framework/kernel_type_str_resolver.cc
@@ -264,7 +264,7 @@ void KernelTypeStrResolver::Merge(KernelTypeStrResolver src) {
 Status OpSchemaKernelTypeStrResolver::ResolveKernelTypeStr(
     const Node& node, std::string_view kernel_type_str,
     gsl::span<const std::string_view>& resolved_args) const {
-  std::lock_guard<OrtMutex> lock{resolver_mutex_};
+  absl::MutexLock lock(&resolver_mutex_);
   ORT_RETURN_IF_ERROR(resolver_.RegisterNodeOpSchema(node));
   ORT_RETURN_IF_ERROR(resolver_.ResolveKernelTypeStr(node, kernel_type_str, resolved_args));
   return Status::OK();
 }
diff --git a/onnxruntime/core/framework/mem_pattern_planner.h b/onnxruntime/core/framework/mem_pattern_planner.h
index f4db5d9f1c75f..a70801db1db74 100644
--- a/onnxruntime/core/framework/mem_pattern_planner.h
+++ b/onnxruntime/core/framework/mem_pattern_planner.h
@@ -68,7 +68,7 @@ class MemPatternPlanner {
 
   void TraceAllocation(int ml_value_idx, const AllocPlanPerValue::ProgramCounter& counter, size_t size) {
     ORT_ENFORCE(using_counters_);
-    std::lock_guard<OrtMutex> lock(lock_);
+    absl::MutexLock lock(&lock_);
 
     if (size == 0) {
       allocs_.emplace_back(ml_value_idx, MemoryBlock(0, 0));
@@ -133,7 +133,7 @@
 
   void TraceAllocation(int ml_value_idx, size_t size) {
     ORT_ENFORCE(!using_counters_);
-    std::lock_guard<OrtMutex> lock(lock_);
+    absl::MutexLock lock(&lock_);
 
     if (size == 0) {
       allocs_.emplace_back(ml_value_idx, MemoryBlock(0, 0));
@@ -190,7 +190,7 @@
   }
 
   void TraceFree(int ml_value_index) {
-    std::lock_guard<OrtMutex> lock(lock_);
+    absl::MutexLock lock(&lock_);
 
     for (auto it = blocks_.begin(); it != blocks_.end(); it++) {
       if
(allocs_[*it].index_ == ml_value_index) { @@ -201,7 +201,7 @@ class MemPatternPlanner { } MemoryPattern GenerateMemPattern() const { - std::lock_guard lock(lock_); + absl::MutexLock lock(&lock_); #ifdef ENABLE_TRAINING if (using_counters_) { diff --git a/onnxruntime/core/framework/model_metadef_id_generator.cc b/onnxruntime/core/framework/model_metadef_id_generator.cc index e51c6ebc29975..2a79b185073f6 100644 --- a/onnxruntime/core/framework/model_metadef_id_generator.cc +++ b/onnxruntime/core/framework/model_metadef_id_generator.cc @@ -12,7 +12,7 @@ int ModelMetadefIdGenerator::GenerateId(const onnxruntime::GraphViewer& graph_vi // if the EP is shared across multiple sessions there's a very small potential for concurrency issues. // use a lock when generating an id to be paranoid static OrtMutex mutex; - std::lock_guard lock(mutex); + absl::MutexLock lock(&mutex); model_hash = 0; // find the top level graph diff --git a/onnxruntime/core/framework/random_generator.h b/onnxruntime/core/framework/random_generator.h index 39f31b2f9af8a..8db7942769e75 100644 --- a/onnxruntime/core/framework/random_generator.h +++ b/onnxruntime/core/framework/random_generator.h @@ -57,7 +57,7 @@ class PhiloxGenerator { * Resets the seed and offset. */ void SetSeed(uint64_t seed) { - std::lock_guard lock(mutex_); + absl::MutexLock lock(&mutex_); seed_ = seed; offset_ = 0; } @@ -66,7 +66,7 @@ class PhiloxGenerator { * Gets the seed and offset pair, incrementing the offset by the specified count. */ std::pair NextPhiloxSeeds(uint64_t count) { - std::lock_guard lock(mutex_); + absl::MutexLock lock(&mutex_); auto seeds = std::make_pair(seed_, offset_); offset_ += count; return seeds; diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 6244d426450a2..9cbce08067b29 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -516,7 +516,7 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap l(prepacked_weights_container_->mutex_); + absl::MutexLock l(&prepacked_weights_container_->mutex_); return prepacked_constant_weights(true); } else { return prepacked_constant_weights(false); @@ -773,7 +773,7 @@ const MemoryPatternGroup* SessionState::GetMemoryPatternGroup( const InlinedHashMap*& out_inferred_shapes) const { out_inferred_shapes = nullptr; int64_t key = CalculateMemoryPatternsKey(tensor_inputs); - std::lock_guard lock(mem_patterns_lock_); + absl::MutexLock lock(&mem_patterns_lock_); auto it = mem_patterns_.find(key); if (it == mem_patterns_.end()) { #ifdef ENABLE_TRAINING @@ -849,7 +849,7 @@ Status SessionState::UpdateMemoryPatternGroupCache(gsl::span ten MemoryPatternGroup mem_patterns) const { int64_t key = CalculateMemoryPatternsKey(tensor_inputs); - std::lock_guard lock(mem_patterns_lock_); + absl::MutexLock lock(&mem_patterns_lock_); // Do not update if present, as the pointer to the existing one is cached mem_patterns_.emplace(key, std::move(mem_patterns)); return Status::OK(); @@ -1585,7 +1585,7 @@ static void BindToDeviceStream(const SequentialExecutionPlan& execution_plan, std::unique_ptr SessionState::AcquireDeviceStreamCollection() const { if (has_device_stream_enabled_ep_) { - std::lock_guard lock(device_stream_pool_mutex_); + absl::MutexLock lock(&device_stream_pool_mutex_); if (!device_stream_pool_.empty()) { auto device_stream = std::move(device_stream_pool_.back()); device_stream_pool_.pop_back(); @@ -1604,7 +1604,7 @@ std::unique_ptr SessionState::AcquireDeviceStreamCollect 
void SessionState::RecycleDeviceStreamCollection(std::unique_ptr device_stream_collection) const { // if no need to reuse the device stream, don't perform the recycle if (has_device_stream_enabled_ep_) { - std::lock_guard lock(device_stream_pool_mutex_); + absl::MutexLock lock(&device_stream_pool_mutex_); device_stream_pool_.push_back(std::move(device_stream_collection)); } else { device_stream_collection.reset(nullptr); diff --git a/onnxruntime/core/framework/tuning_context.h b/onnxruntime/core/framework/tuning_context.h index aae70d85814bc..34faf69ba8aef 100644 --- a/onnxruntime/core/framework/tuning_context.h +++ b/onnxruntime/core/framework/tuning_context.h @@ -77,7 +77,7 @@ class TuningResultsManager { void Clear(); private: - mutable OrtMutex lock_; + mutable absl::Mutex lock_; std::unordered_map results_; }; diff --git a/onnxruntime/core/framework/tuning_context_impl.h b/onnxruntime/core/framework/tuning_context_impl.h index 402a3c0a691e0..eefb0b6c1cfd1 100644 --- a/onnxruntime/core/framework/tuning_context_impl.h +++ b/onnxruntime/core/framework/tuning_context_impl.h @@ -38,7 +38,7 @@ Status ITuningContext::LoadTuningResults(const TuningResults& tr) { } KernelMap TuningResultsManager::Lookup(const std::string& op_signature) const { - std::scoped_lock l{lock_}; + absl::MutexLock l(&lock_); auto it = results_.find(op_signature); if (it == results_.cend()) { return {}; @@ -48,7 +48,7 @@ KernelMap TuningResultsManager::Lookup(const std::string& op_signature) const { // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) int TuningResultsManager::Lookup(const std::string& op_signature, const std::string& params_signature) const { - std::scoped_lock l{lock_}; + absl::MutexLock l(&lock_); auto kernel_map_it = results_.find(op_signature); if (kernel_map_it == results_.cend()) { return -1; @@ -81,7 +81,7 @@ inline void AddImpl(const std::string& op_signature, } void TuningResultsManager::Add(const std::string& op_signature, const std::string& params_signature, int best_id) { - std::scoped_lock l{lock_}; + absl::MutexLock l(&lock_); auto it = results_.find(op_signature); if (it == results_.end()) { @@ -93,7 +93,7 @@ void TuningResultsManager::Add(const std::string& op_signature, const std::strin // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) void TuningResultsManager::Delete(const std::string& op_signature, const std::string& params_signature) { - std::scoped_lock l{lock_}; + absl::MutexLock l(&lock_); auto it = results_.find(op_signature); if (it == results_.end()) { @@ -110,7 +110,7 @@ void TuningResultsManager::Delete(const std::string& op_signature, const std::st } std::unordered_map TuningResultsManager::Dump() const { - std::scoped_lock l{lock_}; + absl::MutexLock l(&lock_); return results_; } @@ -133,14 +133,14 @@ void DisjointMergeImpl( } void TuningResultsManager::Load(const std::unordered_map& results_to_load) { - std::scoped_lock l{lock_}; + absl::MutexLock l(&lock_); for (const auto& [op_signature, kernel_map] : results_to_load) { DisjointMergeImpl(op_signature, kernel_map, results_); } } void TuningResultsManager::DisjointMerge(const std::string& op_signature, const KernelMap& kernel_map) { - std::scoped_lock l{lock_}; + absl::MutexLock l(&lock_); DisjointMergeImpl(op_signature, kernel_map, results_); } diff --git a/onnxruntime/core/graph/schema_registry.cc b/onnxruntime/core/graph/schema_registry.cc index 4dc714bd8af79..c3b6ef8bd5a03 100644 --- a/onnxruntime/core/graph/schema_registry.cc +++ b/onnxruntime/core/graph/schema_registry.cc @@ -10,7 +10,7 @@ common::Status 
OnnxRuntimeOpSchemaRegistry::SetBaselineAndOpsetVersionForDomain(
     const std::string& domain,
     int baseline_opset_version,
     int opset_version) {
-  std::lock_guard<OrtMutex> lock(mutex_);
+  absl::MutexLock lock(&mutex_);
   auto it = domain_version_range_map_.find(domain);
 
   if (domain_version_range_map_.end() != it) {
diff --git a/onnxruntime/core/platform/posix/ort_mutex.cc b/onnxruntime/core/platform/posix/ort_mutex.cc
deleted file mode 100644
index e124ce168085f..0000000000000
--- a/onnxruntime/core/platform/posix/ort_mutex.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "core/common/common.h"
-#include "core/platform/ort_mutex.h"
-#include <assert.h>
-#include <stdexcept>
-#include <system_error>
-
-namespace onnxruntime {
-void OrtCondVar::timed_wait_impl(std::unique_lock<OrtMutex>& lk,
-                                 std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> tp) {
-  using namespace std::chrono;
-#ifndef NDEBUG
-  if (!lk.owns_lock())
-    ORT_THROW("condition_variable::timed wait: mutex not locked");
-#endif
-  nanoseconds d = tp.time_since_epoch();
-  timespec abs_deadline;
-  seconds s = duration_cast<seconds>(d);
-  using ts_sec = decltype(abs_deadline.tv_sec);
-  constexpr ts_sec ts_sec_max = std::numeric_limits<ts_sec>::max();
-  if (s.count() < ts_sec_max) {
-    abs_deadline.tv_sec = static_cast<ts_sec>(s.count());
-    abs_deadline.tv_nsec = static_cast<decltype(abs_deadline.tv_nsec)>((d - s).count());
-  } else {
-    abs_deadline.tv_sec = ts_sec_max;
-    abs_deadline.tv_nsec = 999999999;
-  }
-  nsync::nsync_cv_wait_with_deadline(&native_cv_object, lk.mutex()->native_handle(), abs_deadline, nullptr);
-}
-
-void OrtCondVar::wait(std::unique_lock<OrtMutex>& lk) {
-#ifndef NDEBUG
-  if (!lk.owns_lock()) {
-    ORT_THROW("OrtCondVar wait failed: mutex not locked");
-  }
-#endif
-  nsync::nsync_cv_wait(&native_cv_object, lk.mutex()->native_handle());
-}
-
-}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.cc b/onnxruntime/core/platform/windows/logging/etw_sink.cc
index 5fb7f7a65161d..5a6936857d811 100644
--- a/onnxruntime/core/platform/windows/logging/etw_sink.cc
+++ b/onnxruntime/core/platform/windows/logging/etw_sink.cc
@@ -65,12 +65,12 @@ EtwRegistrationManager& EtwRegistrationManager::Instance() {
 }
 
 bool EtwRegistrationManager::IsEnabled() const {
-  std::lock_guard<OrtMutex> lock(provider_change_mutex_);
+  absl::MutexLock lock(&provider_change_mutex_);
   return is_enabled_;
 }
 
 UCHAR EtwRegistrationManager::Level() const {
-  std::lock_guard<OrtMutex> lock(provider_change_mutex_);
+  absl::MutexLock lock(&provider_change_mutex_);
   return level_;
 }
 
@@ -94,7 +94,7 @@ Severity EtwRegistrationManager::MapLevelToSeverity() {
 }
 
 ULONGLONG EtwRegistrationManager::Keyword() const {
-  std::lock_guard<OrtMutex> lock(provider_change_mutex_);
+  absl::MutexLock lock(&provider_change_mutex_);
   return keyword_;
 }
 
@@ -103,7 +103,7 @@ HRESULT EtwRegistrationManager::Status() const {
 }
 
 void EtwRegistrationManager::RegisterInternalCallback(const EtwInternalCallback& callback) {
-  std::lock_guard<OrtMutex> lock(callbacks_mutex_);
+  absl::MutexLock lock(&callbacks_mutex_);
   callbacks_.push_back(callback);
 }
 
@@ -117,7 +117,7 @@ void NTAPI EtwRegistrationManager::ORT_TL_EtwEnableCallback(
     _In_opt_ PVOID CallbackContext) {
   auto& manager = EtwRegistrationManager::Instance();
   {
-    std::lock_guard<OrtMutex> lock(manager.provider_change_mutex_);
+    absl::MutexLock lock(&manager.provider_change_mutex_);
     manager.is_enabled_ = (IsEnabled != 0);
     manager.level_ = Level;
     manager.keyword_ = MatchAnyKeyword;
@@ -134,7 +134,7 @@ EtwRegistrationManager::EtwRegistrationManager() {
 
 void
EtwRegistrationManager::LazyInitialize() { if (!initialized_) { - std::lock_guard lock(init_mutex_); + absl::MutexLock lock(&init_mutex_); if (!initialized_) { // Double-check locking pattern initialized_ = true; etw_status_ = ::TraceLoggingRegisterEx(etw_provider_handle, ORT_TL_EtwEnableCallback, nullptr); @@ -148,7 +148,7 @@ void EtwRegistrationManager::LazyInitialize() { void EtwRegistrationManager::InvokeCallbacks(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword, ULONGLONG MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext) { - std::lock_guard lock(callbacks_mutex_); + absl::MutexLock lock(&callbacks_mutex_); for (const auto& callback : callbacks_) { callback(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext); } diff --git a/onnxruntime/core/platform/windows/telemetry.cc b/onnxruntime/core/platform/windows/telemetry.cc index 654281d526e4d..3992f513d60cb 100644 --- a/onnxruntime/core/platform/windows/telemetry.cc +++ b/onnxruntime/core/platform/windows/telemetry.cc @@ -68,7 +68,7 @@ std::vector WindowsTelemetry::callbacks_; OrtMutex WindowsTelemetry::callbacks_mutex_; WindowsTelemetry::WindowsTelemetry() { - std::lock_guard lock(mutex_); + absl::MutexLock lock(&mutex_); if (global_register_count_ == 0) { // TraceLoggingRegister is fancy in that you can only register once GLOBALLY for the whole process HRESULT hr = TraceLoggingRegisterEx(telemetry_provider_handle, ORT_TL_EtwEnableCallback, nullptr); @@ -79,7 +79,7 @@ WindowsTelemetry::WindowsTelemetry() { } WindowsTelemetry::~WindowsTelemetry() { - std::lock_guard lock(mutex_); + absl::MutexLock lock(&mutex_); if (global_register_count_ > 0) { global_register_count_ -= 1; if (global_register_count_ == 0) { @@ -89,17 +89,17 @@ WindowsTelemetry::~WindowsTelemetry() { } bool WindowsTelemetry::IsEnabled() const { - std::lock_guard lock(provider_change_mutex_); + absl::MutexLock lock(&provider_change_mutex_); return enabled_; } UCHAR WindowsTelemetry::Level() const { - std::lock_guard lock(provider_change_mutex_); + absl::MutexLock lock(&provider_change_mutex_); return level_; } UINT64 WindowsTelemetry::Keyword() const { - std::lock_guard lock(provider_change_mutex_); + absl::MutexLock lock(&provider_change_mutex_); return keyword_; } @@ -108,7 +108,7 @@ UINT64 WindowsTelemetry::Keyword() const { // } void WindowsTelemetry::RegisterInternalCallback(const EtwInternalCallback& callback) { - std::lock_guard lock(callbacks_mutex_); + absl::MutexLock lock(&callbacks_mutex_); callbacks_.push_back(callback); } @@ -120,7 +120,7 @@ void NTAPI WindowsTelemetry::ORT_TL_EtwEnableCallback( _In_ ULONGLONG MatchAllKeyword, _In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData, _In_opt_ PVOID CallbackContext) { - std::lock_guard lock(provider_change_mutex_); + absl::MutexLock lock(&provider_change_mutex_); enabled_ = (IsEnabled != 0); level_ = Level; keyword_ = MatchAnyKeyword; @@ -131,7 +131,7 @@ void NTAPI WindowsTelemetry::ORT_TL_EtwEnableCallback( void WindowsTelemetry::InvokeCallbacks(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword, ULONGLONG MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext) { - std::lock_guard lock(callbacks_mutex_); + absl::MutexLock lock(&callbacks_mutex_); for (const auto& callback : callbacks_) { callback(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext); } diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.cc 
b/onnxruntime/core/providers/cann/cann_execution_provider.cc index 9a242919665bb..34233e3353ef2 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.cc +++ b/onnxruntime/core/providers/cann/cann_execution_provider.cc @@ -1389,7 +1389,7 @@ Status CANNExecutionProvider::Compile(const std::vector& fuse if (modelIDs_.find(filename) != modelIDs_.end()) { modelID = modelIDs_[filename]; } else { - std::lock_guard lock(g_mutex); + absl::MutexLock lock(&g_mutex); if (cann::FileExist(filename_with_suffix)) { CANN_RETURN_IF_ERROR(aclmdlLoadFromFile(filename_with_suffix.c_str(), &modelID)); diff --git a/onnxruntime/core/providers/cpu/generator/random.cc b/onnxruntime/core/providers/cpu/generator/random.cc index dfa27f1f44d5a..211e655368faa 100644 --- a/onnxruntime/core/providers/cpu/generator/random.cc +++ b/onnxruntime/core/providers/cpu/generator/random.cc @@ -138,7 +138,7 @@ static TensorProto::DataType InferDataType(const Tensor& tensor); Status RandomNormal::Compute(OpKernelContext* ctx) const { Tensor& Y = *ctx->Output(0, shape_); - std::lock_guard l(generator_mutex_); + absl::MutexLock l(&generator_mutex_); auto status = RandomNormalCompute(mean_, scale_, generator_, dtype_, Y); return status; @@ -147,7 +147,7 @@ Status RandomNormal::Compute(OpKernelContext* ctx) const { Status RandomUniform::Compute(OpKernelContext* ctx) const { Tensor& Y = *ctx->Output(0, shape_); - std::lock_guard l(generator_mutex_); + absl::MutexLock l(&generator_mutex_); auto status = RandomUniformCompute(low_, high_, generator_, dtype_, Y); return status; @@ -169,7 +169,7 @@ Status RandomNormalLike::Compute(OpKernelContext* ctx) const { "Could not infer data type from input tensor with data type ", X.DataType()); - std::lock_guard l(generator_mutex_); + absl::MutexLock l(&generator_mutex_); status = RandomNormalCompute(mean_, scale_, generator_, dtype, *Y); return status; @@ -190,7 +190,7 @@ Status RandomUniformLike::Compute(OpKernelContext* ctx) const { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Could not infer data type from input tensor with data type ", X.DataType()); - std::lock_guard l(generator_mutex_); + absl::MutexLock l(&generator_mutex_); status = RandomUniformCompute(low_, high_, generator_, dtype, *Y); return status; @@ -310,7 +310,7 @@ Status Multinomial::Compute(OpKernelContext* ctx) const { Tensor* Y = ctx->Output(0, {batch_size, num_samples_}); Status status = Status::OK(); - std::lock_guard l(generator_mutex_); + absl::MutexLock l(&generator_mutex_); switch (output_dtype_) { case TensorProto::INT32: { status = MultinomialCompute(ctx, X, batch_size, num_classes, num_samples_, generator_, *Y); diff --git a/onnxruntime/core/providers/cpu/text/string_normalizer.cc b/onnxruntime/core/providers/cpu/text/string_normalizer.cc index 32de3105d627d..bf6e34a04957d 100644 --- a/onnxruntime/core/providers/cpu/text/string_normalizer.cc +++ b/onnxruntime/core/providers/cpu/text/string_normalizer.cc @@ -9,6 +9,7 @@ #ifdef _MSC_VER #include +#include #endif // _MSC_VER #include diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.cc b/onnxruntime/core/providers/cuda/cuda_allocator.cc index 314aa1062f1b0..9987b0ff4c9fd 100644 --- a/onnxruntime/core/providers/cuda/cuda_allocator.cc +++ b/onnxruntime/core/providers/cuda/cuda_allocator.cc @@ -69,7 +69,7 @@ void* CUDAExternalAllocator::Alloc(size_t size) { void CUDAExternalAllocator::Free(void* p) { free_(p); - std::lock_guard lock(lock_); + absl::MutexLock lock(&lock_); auto it = reserved_.find(p); if (it != reserved_.end()) { reserved_.erase(it); @@ 
-80,7 +80,7 @@ void CUDAExternalAllocator::Free(void* p) { void* CUDAExternalAllocator::Reserve(size_t size) { void* p = Alloc(size); if (!p) return nullptr; - std::lock_guard lock(lock_); + absl::MutexLock lock(&lock_); ORT_ENFORCE(reserved_.find(p) == reserved_.end()); reserved_.insert(p); return p; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 05d9f3b5a1e8f..4b3ca533b6845 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -322,7 +322,7 @@ DataLayout CUDAExecutionProvider::GetPreferredLayout() const { CUDAExecutionProvider::~CUDAExecutionProvider() { // clean up thread local context caches { - std::lock_guard lock(context_state_.mutex); + absl::MutexLock lock(&context_state_.mutex); for (const auto& cache_weak : context_state_.caches_to_update_on_destruction) { const auto cache = cache_weak.lock(); if (!cache) continue; @@ -367,7 +367,7 @@ CUDAExecutionProvider::PerThreadContext& CUDAExecutionProvider::GetPerThreadCont // get context and update cache std::shared_ptr context; { - std::lock_guard lock(context_state_.mutex); + absl::MutexLock lock(&context_state_.mutex); // get or create a context if (context_state_.retired_context_pool.empty()) { @@ -404,7 +404,7 @@ void CUDAExecutionProvider::ReleasePerThreadContext() const { ORT_ENFORCE(cached_context); { - std::lock_guard lock(context_state_.mutex); + absl::MutexLock lock(&context_state_.mutex); context_state_.active_contexts.erase(cached_context); context_state_.retired_context_pool.push_back(cached_context); } diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index e05786248cbcf..30498f0f37167 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -443,7 +443,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) template Status Conv::ComputeInternal(OpKernelContext* context) const { - std::lock_guard lock(s_.mutex); + absl::MutexLock lock(&s_.mutex); ORT_RETURN_IF_ERROR(UpdateState(context)); if (s_.Y->Shape().Size() == 0) { return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc index 939b9959af818..d2202a19324c6 100644 --- a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc @@ -107,7 +107,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dy } { - std::lock_guard lock(s_.mutex); + absl::MutexLock lock(&s_.mutex); // CUDNN_CONFIG_RETURN_IF_ERROR(cudnnSetStream(CudnnHandle(), Stream(context))); // TODO: add a global cache if need to handle cases for multiple frames running simultaneously with different batch_size bool input_dims_changed = (s_.last_x_dims.AsShapeVector() != x_dims); diff --git a/onnxruntime/core/providers/cuda/nvtx_profile_context.h b/onnxruntime/core/providers/cuda/nvtx_profile_context.h index e2e3be07bd474..cf408d3171547 100644 --- a/onnxruntime/core/providers/cuda/nvtx_profile_context.h +++ b/onnxruntime/core/providers/cuda/nvtx_profile_context.h @@ -25,14 +25,14 @@ class Context { // Return tag for the specified thread. // If the thread's tag doesn't exist, this function returns an empty string. 
diff --git a/onnxruntime/core/providers/cuda/nvtx_profile_context.h b/onnxruntime/core/providers/cuda/nvtx_profile_context.h
index e2e3be07bd474..cf408d3171547 100644
--- a/onnxruntime/core/providers/cuda/nvtx_profile_context.h
+++ b/onnxruntime/core/providers/cuda/nvtx_profile_context.h
@@ -25,14 +25,14 @@ class Context {
   // Return tag for the specified thread.
   // If the thread's tag doesn't exist, this function returns an empty string.
   std::string GetThreadTagOrDefault(const std::thread::id& thread_id) {
-    const std::lock_guard<OrtMutex> lock(mtx_);
+    absl::MutexLock lock(&mtx_);
     return thread_tag_[thread_id];
   }
 
   // Set tag for the specified thread.
   void SetThreadTag(
       const std::thread::id& thread_id, const std::string& tag) {
-    const std::lock_guard<OrtMutex> lock(mtx_);
+    absl::MutexLock lock(&mtx_);
     thread_tag_[thread_id] = tag;
   }
 
diff --git a/onnxruntime/core/providers/migraphx/hip_allocator.cc b/onnxruntime/core/providers/migraphx/hip_allocator.cc
index 53f10e318e65f..e1166fa6aadb1 100644
--- a/onnxruntime/core/providers/migraphx/hip_allocator.cc
+++ b/onnxruntime/core/providers/migraphx/hip_allocator.cc
@@ -51,7 +51,7 @@ void* HIPExternalAllocator::Alloc(size_t size) {
 
 void HIPExternalAllocator::Free(void* p) {
   free_(p);
-  std::lock_guard<OrtMutex> lock(lock_);
+  absl::MutexLock lock(&lock_);
   auto it = reserved_.find(p);
   if (it != reserved_.end()) {
     reserved_.erase(it);
@@ -62,7 +62,7 @@ void HIPExternalAllocator::Free(void* p) {
 void* HIPExternalAllocator::Reserve(size_t size) {
   void* p = Alloc(size);
   if (!p) return nullptr;
-  std::lock_guard<OrtMutex> lock(lock_);
+  absl::MutexLock lock(&lock_);
   ORT_ENFORCE(reserved_.find(p) == reserved_.end());
   reserved_.insert(p);
   return p;
diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
index 50782569ee80a..c98f881de6bc0 100644
--- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
+++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
@@ -1324,7 +1324,7 @@ Status MIGraphXExecutionProvider::Compile(const std::vector&
 
       {
         // lock to avoid race condition
-        std::lock_guard<OrtMutex> lock(*(mgx_state->mgx_mu_ptr));
+        absl::MutexLock lock(mgx_state->mgx_mu_ptr);
 
 #ifdef MIGRAPHX_STREAM_SYNC
         void* rocm_stream;
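When the state object already holds a Mutex* (as mgx_mu_ptr does above, and tensorrt_mu_ptr does later), the conversion goes the other way: the dereference is dropped because absl::MutexLock wants the pointer itself. A sketch of that shape, with hypothetical stand-in names:

    #include "absl/synchronization/mutex.h"

    struct ComputeState {
      absl::Mutex* mu_ptr;  // owned elsewhere; shared across compute calls
    };

    void Compute(ComputeState* state) {
      // Before: std::lock_guard<OrtMutex> lock(*(state->mu_ptr));
      absl::MutexLock lock(state->mu_ptr);  // no dereference needed
      // ... critical section ...
    }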
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc
index 5db89e2c1af49..f2437eebb58c4 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc
@@ -272,7 +272,7 @@ Status QnnModel::ExecuteGraph(const Ort::KernelContext& context) {
   {
     // Acquire mutex before calling graphExecute and profiling APIs to support calling session.Run()
     // from multiple threads.
-    std::lock_guard<OrtMutex> lock(graph_exec_mutex_);
+    absl::MutexLock lock(&graph_exec_mutex_);
     execute_status = qnn_interface.graphExecute(graph_info_->Graph(),
                                                 qnn_inputs.data(),
                                                 static_cast(qnn_inputs.size()),
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index 235ea45cd4dde..f6486a20f737a 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -31,7 +31,7 @@ static std::unique_ptr>> s_run_on_unload_;
 
 void RunOnUnload(std::function function) {
   OrtMutex mutex;
-  std::lock_guard<OrtMutex> guard(mutex);
+  absl::MutexLock guard(&mutex);
   if (!s_run_on_unload_) {
     s_run_on_unload_ = std::make_unique>>();
   }
@@ -339,7 +339,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
 
 QNNExecutionProvider::~QNNExecutionProvider() {
   // clean up thread local context caches
-  std::lock_guard<OrtMutex> lock(context_state_.mutex);
+  absl::MutexLock lock(&context_state_.mutex);
   for (const auto& cache_weak : context_state_.caches_to_update_on_destruction) {
     const auto cache = cache_weak.lock();
     if (!cache) continue;
@@ -835,7 +835,7 @@ QNNExecutionProvider::PerThreadContext& QNNExecutionProvider::GetPerThreadContex
   // get context and update cache
   std::shared_ptr context;
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    absl::MutexLock lock(&context_state_.mutex);
 
     // get or create a context
     if (context_state_.retired_context_pool.empty()) {
@@ -869,7 +869,7 @@ void QNNExecutionProvider::ReleasePerThreadContext() const {
   ORT_ENFORCE(cached_context);
 
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    absl::MutexLock lock(&context_state_.mutex);
     context_state_.active_contexts.erase(cached_context);
     context_state_.retired_context_pool.push_back(cached_context);
   }
diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
index 6214ec7bc0ea3..331376cd8975a 100644
--- a/onnxruntime/core/providers/rocm/nn/conv.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
@@ -324,7 +324,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected)
 
 template
 Status Conv::ComputeInternal(OpKernelContext* context) const {
-  std::lock_guard<OrtMutex> lock(s_.mutex);
+  absl::MutexLock lock(&s_.mutex);
   ORT_RETURN_IF_ERROR(UpdateState(context));
   if (s_.Y->Shape().Size() == 0) {
     return Status::OK();
diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
index 7447113fdf847..66cb8695766b2 100644
--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
@@ -66,7 +66,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dy
   }
 
   {
-    std::lock_guard<OrtMutex> lock(s_.mutex);
+    absl::MutexLock lock(&s_.mutex);
     // TODO: add a global cache if need to handle cases for multiple frames running simultaneously with different batch_size
     bool input_dims_changed = (s_.last_x_dims.AsShapeVector() != x_dims);
     bool w_dims_changed = (s_.last_w_dims.AsShapeVector() != w_dims);
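The destructor/GetPerThreadContext/ReleasePerThreadContext trio above recurs almost verbatim in the CUDA, QNN, ROCm and TensorRT providers: a pool of per-thread contexts checked out and retired under a single mutex. Roughly, in simplified form (PerThreadContext is a placeholder here, and the real code additionally consults a thread-local cache):

    #include <memory>
    #include <set>
    #include <vector>
    #include "absl/synchronization/mutex.h"

    struct PerThreadContext {};  // stand-in for the provider's real context type

    struct ContextState {
      absl::Mutex mutex;
      std::vector<std::shared_ptr<PerThreadContext>> retired_context_pool;
      std::set<std::shared_ptr<PerThreadContext>> active_contexts;
    };

    std::shared_ptr<PerThreadContext> Acquire(ContextState& s) {
      absl::MutexLock lock(&s.mutex);
      std::shared_ptr<PerThreadContext> context;
      if (s.retired_context_pool.empty()) {
        context = std::make_shared<PerThreadContext>();  // first use on this thread
      } else {
        context = s.retired_context_pool.back();  // reuse a retired context
        s.retired_context_pool.pop_back();
      }
      s.active_contexts.insert(context);
      return context;
    }

    void Release(ContextState& s, std::shared_ptr<PerThreadContext> context) {
      absl::MutexLock lock(&s.mutex);
      s.active_contexts.erase(context);
      s.retired_context_pool.push_back(std::move(context));
    }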
diff --git a/onnxruntime/core/providers/rocm/rocm_allocator.cc b/onnxruntime/core/providers/rocm/rocm_allocator.cc
index 8645b791d4b0f..cfca8ea1d736c 100644
--- a/onnxruntime/core/providers/rocm/rocm_allocator.cc
+++ b/onnxruntime/core/providers/rocm/rocm_allocator.cc
@@ -69,7 +69,7 @@ void* ROCMExternalAllocator::Alloc(size_t size) {
 
 void ROCMExternalAllocator::Free(void* p) {
   free_(p);
-  std::lock_guard<OrtMutex> lock(lock_);
+  absl::MutexLock lock(&lock_);
   auto it = reserved_.find(p);
   if (it != reserved_.end()) {
     reserved_.erase(it);
@@ -80,7 +80,7 @@ void ROCMExternalAllocator::Free(void* p) {
 void* ROCMExternalAllocator::Reserve(size_t size) {
   void* p = Alloc(size);
   if (!p) return nullptr;
-  std::lock_guard<OrtMutex> lock(lock_);
+  absl::MutexLock lock(&lock_);
   ORT_ENFORCE(reserved_.find(p) == reserved_.end());
   reserved_.insert(p);
   return p;
diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
index 87daaeea969ac..704e0b5ab26c9 100644
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -273,7 +273,7 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in
 ROCMExecutionProvider::~ROCMExecutionProvider() {
   // clean up thread local context caches
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    absl::MutexLock lock(&context_state_.mutex);
     for (const auto& cache_weak : context_state_.caches_to_update_on_destruction) {
       const auto cache = cache_weak.lock();
       if (!cache) continue;
@@ -308,7 +308,7 @@ ROCMExecutionProvider::PerThreadContext& ROCMExecutionProvider::GetPerThreadCont
   // get context and update cache
   std::shared_ptr context;
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    absl::MutexLock lock(&context_state_.mutex);
 
     // get or create a context
     if (context_state_.retired_context_pool.empty()) {
@@ -341,7 +341,7 @@ void ROCMExecutionProvider::ReleasePerThreadContext() const {
   ORT_ENFORCE(cached_context);
 
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    absl::MutexLock lock(&context_state_.mutex);
     context_state_.active_contexts.erase(cached_context);
     context_state_.retired_context_pool.push_back(cached_context);
   }
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index f33e9a968ce95..29863b81d82a8 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1165,7 +1165,7 @@ void TensorrtExecutionProvider::ReleasePerThreadContext() const {
   ORT_ENFORCE(cached_context);
 
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    absl::MutexLock lock(&context_state_.mutex);
     context_state_.active_contexts.erase(cached_context);
     context_state_.retired_context_pool.push_back(cached_context);
   }
@@ -1187,7 +1187,7 @@ TensorrtExecutionProvider::PerThreadContext& TensorrtExecutionProvider::GetPerTh
   // get context and update cache
   std::shared_ptr context;
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    absl::MutexLock lock(&context_state_.mutex);
 
     // get or create a context
     if (context_state_.retired_context_pool.empty()) {
@@ -1644,7 +1644,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
 TensorrtExecutionProvider::~TensorrtExecutionProvider() {
   // clean up thread local context caches
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    absl::MutexLock lock(&context_state_.mutex);
     for (const auto& cache_weak : context_state_.caches_to_update_on_destruction) {
       const auto cache = cache_weak.lock();
       if (!cache) continue;
@@ -3078,7 +3078,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
     // The whole compute_function should be considered the critical section where multiple threads may update kernel function state, access one builder,
     // create/serialize/save engine, save profile and serialize/save timing cache. Therefore, those operations should be synchronized across different threads when ORT is using multithreading.
     // More details here, https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
-    std::lock_guard<OrtMutex> lock(*(trt_state->tensorrt_mu_ptr));
+    absl::MutexLock lock(trt_state->tensorrt_mu_ptr);
     const std::unordered_map& input_indexes = (trt_state->input_info)[0];
     const std::unordered_map& output_indexes = (trt_state->output_info)[0];
     const std::unordered_map& output_types = (trt_state->output_info)[1];
@@ -3653,7 +3653,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
     // The whole compute_function should be considered the critical section.
     // More details here, https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
-    std::lock_guard<OrtMutex> lock(*(trt_state->tensorrt_mu_ptr));
+    absl::MutexLock lock(trt_state->tensorrt_mu_ptr);
     const std::unordered_map& input_indexes = (trt_state->input_info)[0];
     const std::unordered_map& output_indexes = (trt_state->output_info)[0];
 
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc
index 58a1afd005563..2b10ff50013a7 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc
@@ -29,7 +29,7 @@ common::Status CreateTensorRTCustomOpDomainList(std::vector&
   static std::unique_ptr custom_op_domain = std::make_unique();
   static std::vector> created_custom_op_list;
   static OrtMutex mutex;
-  std::lock_guard<OrtMutex> lock(mutex);
+  absl::MutexLock lock(&mutex);
   if (custom_op_domain->domain_ != "" && custom_op_domain->custom_ops_.size() > 0) {
     domain_list.push_back(custom_op_domain.get());
     return Status::OK();
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index c1cd21570a6a4..172ace00e780d 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -366,7 +366,7 @@ void InferenceSession::ConstructorCommon(const SessionOptions& session_options,
   session_id_ = global_session_id_.fetch_add(1);
 
 #ifdef _WIN32
-  std::lock_guard<OrtMutex> lock(active_sessions_mutex_);
+  absl::MutexLock lock(&active_sessions_mutex_);
   active_sessions_[global_session_id_++] = this;
 
   // Register callback for ETW capture state (rundown)
@@ -675,7 +675,7 @@ InferenceSession::~InferenceSession() {
 
   // Unregister the session
 #ifdef _WIN32
-  std::lock_guard<OrtMutex> lock(active_sessions_mutex_);
+  absl::MutexLock lock(&active_sessions_mutex_);
 #endif
   active_sessions_.erase(global_session_id_);
 
@@ -693,7 +693,7 @@ common::Status InferenceSession::RegisterExecutionProvider(const std::shared_ptr
     return Status(common::ONNXRUNTIME, common::FAIL, "Received nullptr for exec provider");
   }
 
-  std::lock_guard<OrtMutex> l(session_mutex_);
+  absl::MutexLock l(&session_mutex_);
 
   if (is_inited_) {
     // adding an EP is pointless as the graph as already been partitioned so no nodes will be assigned to
@@ -816,7 +816,7 @@ common::Status InferenceSession::RegisterGraphTransformer(
     return Status(common::ONNXRUNTIME, common::FAIL, "Received nullptr for graph transformer");
   }
 
-  std::lock_guard<OrtMutex> l(session_mutex_);
+  absl::MutexLock l(&session_mutex_);
 
   if (is_inited_) {
     // adding a transformer now is pointless as the graph as already been transformed
@@ -882,7 +882,7 @@ common::Status InferenceSession::LoadWithLoader(std::function
-  std::lock_guard<OrtMutex> l(session_mutex_);
+  absl::MutexLock l(&session_mutex_);
   if (is_model_loaded_) {  // already loaded
     LOGS(*session_logger_, ERROR) << "This session already contains a loaded model.";
     return common::Status(common::ONNXRUNTIME, common::MODEL_LOADED, "This session already contains a loaded model.");
@@ -1336,7 +1336,7 @@ Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len
 Status InferenceSession::LoadOrtModelWithLoader(std::function load_ort_format_model_bytes) {
   static_assert(FLATBUFFERS_LITTLEENDIAN, "ORT format only supports little-endian machines");
-  std::lock_guard<OrtMutex> l(session_mutex_);
+  absl::MutexLock l(&session_mutex_);
   if (is_model_loaded_) {  // already loaded
     Status status(common::ONNXRUNTIME, common::MODEL_LOADED, "This session already contains a loaded model.");
@@ -1460,7 +1460,7 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function load_ort
 }
 
 bool InferenceSession::IsInitialized() const {
-  std::lock_guard<OrtMutex> l(session_mutex_);
+  absl::MutexLock l(&session_mutex_);
   return is_inited_;
 }
 
@@ -1606,7 +1606,7 @@ common::Status InferenceSession::Initialize() {
     bool have_cpu_ep = false;
 
     {
-      std::lock_guard<OrtMutex> initial_guard(session_mutex_);
+      absl::MutexLock initial_guard(&session_mutex_);
 
       if (!is_model_loaded_) {
         LOGS(*session_logger_, ERROR) << "Model was not loaded";
@@ -1644,7 +1644,7 @@ common::Status InferenceSession::Initialize() {
     }
 
     // re-acquire mutex
-    std::lock_guard<OrtMutex> l(session_mutex_);
+    absl::MutexLock l(&session_mutex_);
 
 #if !defined(DISABLE_EXTERNAL_INITIALIZERS) && !defined(ORT_MINIMAL_BUILD)
     if (!session_options_.external_initializers.empty()) {
@@ -2489,10 +2489,7 @@ Status InferenceSession::Run(const RunOptions& run_options,
   std::unique_ptr owned_run_logger;
   const auto& run_logger = CreateLoggerForRun(run_options, owned_run_logger);
 
-  std::optional<std::lock_guard<OrtMutex>> sequential_run_lock;
-  if (is_concurrent_run_supported_ == false) {
-    sequential_run_lock.emplace(session_mutex_);
-  }
+  absl::MutexLockMaybe sequential_run_lock(is_concurrent_run_supported_ ? nullptr : &session_mutex_);
 
   // info all execution providers InferenceSession:Run started
   // TODO: only call OnRunStart for all providers in-use
@@ -2741,7 +2738,7 @@ common::Status InferenceSession::Run(const RunOptions& run_options, const NameML
 
 std::pair InferenceSession::GetModelMetadata() const {
   {
-    std::lock_guard<OrtMutex> l(session_mutex_);
+    absl::MutexLock l(&session_mutex_);
     if (!is_model_loaded_) {
       LOGS(*session_logger_, ERROR) << "Model was not loaded";
       return std::make_pair(common::Status(common::ONNXRUNTIME, common::FAIL, "Model was not loaded."), nullptr);
@@ -2753,7 +2750,7 @@ std::pair InferenceSession::GetModelMetada
 
 std::pair InferenceSession::GetModelInputs() const {
   {
-    std::lock_guard<OrtMutex> l(session_mutex_);
+    absl::MutexLock l(&session_mutex_);
     if (!is_model_loaded_) {
       LOGS(*session_logger_, ERROR) << "Model was not loaded";
       return std::make_pair(common::Status(common::ONNXRUNTIME, common::FAIL, "Model was not loaded."), nullptr);
@@ -2766,7 +2763,7 @@ std::pair InferenceSession::GetModelInputs(
 
 std::pair InferenceSession::GetOverridableInitializers() const {
   {
-    std::lock_guard<OrtMutex> l(session_mutex_);
+    absl::MutexLock l(&session_mutex_);
     if (!is_model_loaded_) {
       LOGS(*session_logger_, ERROR) << "Model was not loaded";
       return std::make_pair(common::Status(common::ONNXRUNTIME, common::FAIL, "Model was not loaded."), nullptr);
@@ -2779,7 +2776,7 @@ std::pair InferenceSession::GetOverridableI
 
 std::pair InferenceSession::GetModelOutputs() const {
   {
-    std::lock_guard<OrtMutex> l(session_mutex_);
+    absl::MutexLock l(&session_mutex_);
     if (!is_model_loaded_) {
       LOGS(*session_logger_, ERROR) << "Model was not loaded";
       return std::make_pair(common::Status(common::ONNXRUNTIME, common::FAIL, "Model was not loaded."), nullptr);
@@ -2791,7 +2788,7 @@ std::pair InferenceSession::GetModelOutput
 
 common::Status InferenceSession::NewIOBinding(std::unique_ptr* io_binding) {
   {
-    std::lock_guard<OrtMutex> l(session_mutex_);
+    absl::MutexLock l(&session_mutex_);
     if (!is_inited_) {
       LOGS(*session_logger_, ERROR) << "Session was not initialized";
       return common::Status(common::ONNXRUNTIME, common::FAIL, "Session not initialized.");
@@ -3140,9 +3137,9 @@ common::Status InferenceSession::AddPredefinedTransformers(
 
 common::Status InferenceSession::WaitForNotification(Notification* p_executor_done, int64_t timeout_in_ms) {
   if (timeout_in_ms > 0) {
-    ORT_NOT_IMPLEMENTED(__FUNCTION__, "timeout_in_ms >0 is not supported");  // TODO
-  }
-  p_executor_done->Wait();
+    p_executor_done->WaitForNotificationWithTimeout(absl::Milliseconds(timeout_in_ms));
+  } else
+    p_executor_done->WaitForNotification();
 
   return Status::OK();
 }
@@ -3169,7 +3166,7 @@ IOBinding* SessionIOBinding::Get() {
 
 #ifdef _WIN32
 void InferenceSession::LogAllSessions() {
-  std::lock_guard<OrtMutex> lock(active_sessions_mutex_);
+  absl::MutexLock lock(&active_sessions_mutex_);
   for (const auto& session_pair : active_sessions_) {
     InferenceSession* session = session_pair.second;
     TraceSessionOptions(session->session_options_, true);
diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h
index 3038c8d22ec80..f0f69ea3a835d 100644
--- a/onnxruntime/core/session/inference_session.h
+++ b/onnxruntime/core/session/inference_session.h
@@ -28,6 +28,7 @@
 #include "core/optimizer/graph_transformer_mgr.h"
 #include "core/optimizer/insert_cast_transformer.h"
 #include "core/platform/ort_mutex.h"
+#include "core/platform/Barrier.h"
 #ifdef ENABLE_LANGUAGE_INTEROP_OPS
 #include "core/language_interop_ops/language_interop_ops.h"
 #endif
@@ -46,7 +47,6 @@ class Environment;
 class GraphTransformer;
 class IExecutionProvider;
 class IOBinding;
-struct Notification;
 
 #ifdef ENABLE_TRAINING
 struct PartialGraphExecutionState;
diff --git a/onnxruntime/core/session/ort_env.cc b/onnxruntime/core/session/ort_env.cc
index 331f1db26a029..d09bba794be2c 100644
--- a/onnxruntime/core/session/ort_env.cc
+++ b/onnxruntime/core/session/ort_env.cc
@@ -19,7 +19,7 @@ using namespace onnxruntime::logging;
 
 std::unique_ptr OrtEnv::p_instance_;
 int OrtEnv::ref_count_ = 0;
-onnxruntime::OrtMutex OrtEnv::m_;
+absl::Mutex OrtEnv::m_;
 
 OrtEnv::OrtEnv(std::unique_ptr value1)
     : value_(std::move(value1)) {
@@ -35,7 +35,7 @@ OrtEnv::~OrtEnv() {
 OrtEnv* OrtEnv::GetInstance(const OrtEnv::LoggingManagerConstructionInfo& lm_info,
                             onnxruntime::common::Status& status,
                             const OrtThreadingOptions* tp_options) {
-  std::lock_guard<OrtMutex> lock(m_);
+  absl::MutexLock lock(&m_);
   if (!p_instance_) {
     std::unique_ptr lmgr;
     std::string name = lm_info.logid;
@@ -76,7 +76,7 @@ void OrtEnv::Release(OrtEnv* env_ptr) {
   if (!env_ptr) {
     return;
   }
-  std::lock_guard<OrtMutex> lock(m_);
+  absl::MutexLock lock(&m_);
   ORT_ENFORCE(env_ptr == p_instance_.get());  // sanity check
   --ref_count_;
   if (ref_count_ == 0) {
diff --git a/onnxruntime/core/session/ort_env.h b/onnxruntime/core/session/ort_env.h
index 444134d0612e9..f41a316c11a05 100644
--- a/onnxruntime/core/session/ort_env.h
+++ b/onnxruntime/core/session/ort_env.h
@@ -67,7 +67,7 @@ struct OrtEnv {
 
  private:
   static std::unique_ptr p_instance_;
-  static onnxruntime::OrtMutex m_;
+  static absl::Mutex m_;
   static int ref_count_;
 
   std::unique_ptr value_;
diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc
index e12e9401413be..16dd6c0a4f889 100644
--- a/onnxruntime/test/onnx/TestCase.cc
+++ b/onnxruntime/test/onnx/TestCase.cc
@@ -293,7 +293,7 @@ class OnnxTestCase : public ITestCase {
   std::vector test_data_dirs_;
 
   std::string GetDatasetDebugInfoString(size_t dataset_id) const override {
-    std::lock_guard<OrtMutex> l(m_);
+    absl::MutexLock l(&m_);
     if (dataset_id < debuginfo_strings_.size()) {
       return debuginfo_strings_[dataset_id];
     }
@@ -488,7 +488,7 @@ void OnnxTestCase::LoadTestData(size_t id, onnxruntime::test::HeapBuffer& b,
   if (st.IsOK()) {  // has an all-in-one input file
     std::ostringstream oss;
     {
-      std::lock_guard<OrtMutex> l(m_);
+      absl::MutexLock l(&m_);
       oss << debuginfo_strings_[id];
     }
     ORT_TRY {
@@ -503,7 +503,7 @@ void OnnxTestCase::LoadTestData(size_t id, onnxruntime::test::HeapBuffer& b,
     }
 
     {
-      std::lock_guard<OrtMutex> l(m_);
+      absl::MutexLock l(&m_);
       debuginfo_strings_[id] = oss.str();
     }
     return;
diff --git a/onnxruntime/test/onnx/TestResultStat.h b/onnxruntime/test/onnx/TestResultStat.h
index 5bfc04c3cd577..895843ac01415 100644
--- a/onnxruntime/test/onnx/TestResultStat.h
+++ b/onnxruntime/test/onnx/TestResultStat.h
@@ -7,7 +7,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 
@@ -26,22 +26,22 @@ class TestResultStat {
   TestResultStat() : succeeded(0), not_implemented(0), load_model_failed(0), throwed_exception(0), result_differs(0), skipped(0), invalid_graph(0) {}
 
   void AddNotImplementedKernels(const std::string& s) {
-    std::lock_guard<OrtMutex> l(m_);
+    absl::MutexLock l(&m_);
     not_implemented_kernels.insert(s);
   }
 
   void AddFailedKernels(const std::string& s) {
-    std::lock_guard<OrtMutex> l(m_);
+    absl::MutexLock l(&m_);
     failed_kernels.insert(s);
   }
 
   void AddFailedTest(const std::pair& p) {
-    std::lock_guard<OrtMutex> l(m_);
+    absl::MutexLock l(&m_);
     failed_test_cases.insert(p);
   }
 
   const std::set>& GetFailedTest() const {
-    std::lock_guard<OrtMutex> l(m_);
+    absl::MutexLock l(&m_);
     return failed_test_cases;
   }
 
@@ -74,7 +74,7 @@ class TestResultStat {
   }
 
 private:
-  mutable onnxruntime::OrtMutex m_;
+  mutable absl::Mutex m_;
   std::unordered_set not_implemented_kernels;
   std::unordered_set failed_kernels;
   std::set> failed_test_cases;  // pairs of test name and version
diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc
index 08d77008dc25c..faf0c34193717 100644
--- a/onnxruntime/test/perftest/performance_runner.cc
+++ b/onnxruntime/test/perftest/performance_runner.cc
@@ -189,8 +189,8 @@ Status PerformanceRunner::RunParallelDuration() {
   // TODO: Make each thread enqueue a new worker.
   auto tpool = GetDefaultThreadPool(Env::Default());
   std::atomic counter = {0};
-  OrtMutex m;
-  OrtCondVar cv;
+  std::mutex m;
+  std::condition_variable cv;
 
   auto start = std::chrono::high_resolution_clock::now();
   auto end = start;
@@ -206,7 +206,7 @@ Status PerformanceRunner::RunParallelDuration() {
       if (!status.IsOK())
         std::cerr << status.ErrorMessage();
       // Simplified version of Eigen::Barrier
-      std::lock_guard<OrtMutex> lg(m);
+      std::lock_guard<std::mutex> lg(m);
       counter--;
       cv.notify_all();
     });
@@ -216,7 +216,7 @@ Status PerformanceRunner::RunParallelDuration() {
   } while (duration_seconds.count() < performance_test_config_.run_config.duration_in_seconds);
 
   // Join
  std::unique_lock<std::mutex> lock(m);
-  std::unique_lock<OrtMutex> lock(m);
+  std::unique_lock<std::mutex> lock(m);
   cv.wait(lock, [&counter]() { return counter == 0; });
 
   return Status::OK();
@@ -228,8 +228,8 @@ Status PerformanceRunner::ForkJoinRepeat() {
 
   // create a threadpool with one thread per concurrent request
   auto tpool = std::make_unique(run_config.concurrent_session_runs);
   std::atomic counter{0}, requests{0};
-  OrtMutex m;
-  OrtCondVar cv;
+  std::mutex m;
+  std::condition_variable cv;
 
   // Fork
   for (size_t i = 0; i != run_config.concurrent_session_runs; ++i) {
@@ -242,14 +242,14 @@ Status PerformanceRunner::ForkJoinRepeat() {
       }
 
       // Simplified version of Eigen::Barrier
-      std::lock_guard<OrtMutex> lg(m);
+      std::lock_guard<std::mutex> lg(m);
       counter--;
       cv.notify_all();
     });
   }
 
   // Join
-  std::unique_lock<OrtMutex> lock(m);
+  std::unique_lock<std::mutex> lock(m);
   cv.wait(lock, [&counter]() { return counter == 0; });
 
   return Status::OK();
diff --git a/onnxruntime/test/perftest/performance_runner.h b/onnxruntime/test/perftest/performance_runner.h
index cb1cb661550a7..71afad916e8ca 100644
--- a/onnxruntime/test/perftest/performance_runner.h
+++ b/onnxruntime/test/perftest/performance_runner.h
@@ -14,7 +14,8 @@
 #include
 #include
 #include
-#include
+#include
+#include
 #include
 #include "test_configuration.h"
 #include "heap_buffer.h"
@@ -75,7 +76,7 @@ class PerformanceRunner {
     ORT_RETURN_IF_ERROR(status);
 
     if (!isWarmup) {
-      std::lock_guard<OrtMutex> guard(results_mutex_);
+      std::lock_guard<std::mutex> guard(results_mutex_);
       performance_result_.time_costs.emplace_back(duration_seconds.count());
       performance_result_.total_time_cost += duration_seconds.count();
       if (performance_test_config_.run_config.f_verbose) {
@@ -116,7 +117,7 @@ class PerformanceRunner {
   onnxruntime::test::HeapBuffer b_;
   std::unique_ptr test_case_;
 
-  OrtMutex results_mutex_;
+  std::mutex results_mutex_;
 };
 }  // namespace perftest
 }  // namespace onnxruntime
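The perf-test files above are the one place this patch moves to plain std::mutex/std::condition_variable rather than absl. The "simplified version of Eigen::Barrier" those comments refer to is just a counting latch; pulled out into a standalone sketch (names invented here, not part of the patch), the fork/join logic looks like this:

    #include <condition_variable>
    #include <mutex>

    class SimpleCountdown {
     public:
      explicit SimpleCountdown(int count) : counter_(count) {}

      void Done() {  // called by each worker when it finishes
        std::lock_guard<std::mutex> lg(m_);
        if (--counter_ == 0) cv_.notify_all();
      }

      void Wait() {  // called by the joining thread
        std::unique_lock<std::mutex> lock(m_);
        cv_.wait(lock, [this] { return counter_ == 0; });  // guards against spurious wakeups
      }

     private:
      std::mutex m_;
      std::condition_variable cv_;
      int counter_;
    };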
-
-#include "core/platform/Barrier.h"
-#include "core/platform/threadpool.h"
-
-#include "gtest/gtest.h"
-
-#include
-#include
-
-namespace {
-
-static void TestBarrier(int num_threads, uint64_t per_thread_count, bool spin) {
-  std::atomic counter{0};
-  onnxruntime::Barrier barrier(num_threads, spin);
-
-  std::vector threads;
-  for (auto i = 0; i < num_threads + 1; i++) {
-    threads.push_back(std::thread([&, i] {
-      if (i > 0) {
-        // Worker thread; increment the shared counter then
-        // notify the barrier.
-        for (uint64_t j = 0; j < per_thread_count; j++) {
-          counter++;
-        }
-        barrier.Notify();
-      } else {
-        // Main thread; wait on the barrier, and then check the count seen.
-        barrier.Wait();
-        ASSERT_EQ(counter, per_thread_count * num_threads);
-      }
-    }));
-  }
-
-  // Wait for the threads to finish
-  for (auto& t : threads) {
-    t.join();
-  }
-}
-
-}  // namespace
-
-namespace onnxruntime {
-
-constexpr uint64_t count = 1000000ull;
-
-TEST(BarrierTest, TestBarrier_0Workers_Spin) {
-  TestBarrier(0, count, true);
-}
-
-TEST(BarrierTest, TestBarrier_0Workers_Block) {
-  TestBarrier(0, count, false);
-}
-
-TEST(BarrierTest, TestBarrier_1Worker_Spin) {
-  TestBarrier(1, count, true);
-}
-
-TEST(BarrierTest, TestBarrier_1Worker_Block) {
-  TestBarrier(1, count, false);
-}
-
-TEST(BarrierTest, TestBarrier_4Workers_Spin) {
-  TestBarrier(4, count, true);
-}
-
-TEST(BarrierTest, TestBarrier_4Workers_Block) {
-  TestBarrier(4, count, false);
-}
-
-}  // namespace onnxruntime
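With the custom spin/block Barrier tests deleted, the remaining waits in the test suite are renamed from Wait() to WaitForNotification() below. Assuming the replacement Notification behaves like absl::Notification (which the renamed calls and the absl::Milliseconds timeout in InferenceSession::WaitForNotification above suggest, though Barrier.h may define its own compatible type), typical usage looks like:

    #include <thread>
    #include "absl/synchronization/notification.h"
    #include "absl/time/time.h"

    void Example() {
      absl::Notification done;
      std::thread worker([&] {
        // ... do work ...
        done.Notify();  // releases all current and future waiters
      });
      // Bounded wait; returns false if the timeout expires before Notify().
      if (!done.WaitForNotificationWithTimeout(absl::Milliseconds(500))) {
        done.WaitForNotification();  // fall back to an unbounded wait
      }
      worker.join();
    }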
diff --git a/onnxruntime/test/platform/threadpool_test.cc b/onnxruntime/test/platform/threadpool_test.cc
index 9b3eac1088a47..11e69a82b03ae 100644
--- a/onnxruntime/test/platform/threadpool_test.cc
+++ b/onnxruntime/test/platform/threadpool_test.cc
@@ -38,7 +38,7 @@ std::unique_ptr CreateTestData(int num) {
 }
 
 void IncrementElement(TestData& test_data, ptrdiff_t i) {
-  std::lock_guard<OrtMutex> lock(test_data.mutex);
+  absl::MutexLock lock(&test_data.mutex);
   test_data.data[i]++;
 }
 
@@ -84,7 +84,7 @@ void TestBatchParallelFor(const std::string& name, int num_threads, int num_task
   });
   ValidateTestData(*test_data);
 }
-
+#if 0
 void TestConcurrentParallelFor(const std::string& name, int num_threads, int num_concurrent, int num_tasks,
                                int dynamic_block_base = 0, bool mock_hybrid = false) {
   // Test running multiple concurrent loops over the same thread pool. This aims to provoke a
   // more diverse mix of interleavings than with a single loop running at a time.
@@ -123,7 +123,7 @@ void TestConcurrentParallelFor(const std::string& name, int num_threads, int num
                              dynamic_block_base, mock_hybrid);
     }
   }
-
+#endif
 void TestBurstScheduling(const std::string& name, int num_tasks) {
   // Test submitting a burst of functions for executing. The aim is to provoke cases such
   // as the thread pool's work queues being full.
@@ -275,7 +275,7 @@ TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_100_Batch) {
 TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_81_Task_20_Batch) {
   TestBatchParallelFor("TestBatchParallelFor_2_Thread_81_Task_20_Batch", 2, 81, 20);
 }
-
+#if 0
 TEST(ThreadPoolTest, TestConcurrentParallelFor_0Thread_1Conc_0Tasks) {
   TestConcurrentParallelFor("TestConcurrentParallelFor_0Thread_1Conc_0Tasks", 0, 1, 0);
 }
@@ -415,7 +415,7 @@ TEST(ThreadPoolTest, TestConcurrentParallelFor_4Thread_4Conc_1MTasks_dynamic_blo
 TEST(ThreadPoolTest, TestConcurrentParallelFor_4Thread_4Conc_1MTasks_dynamic_block_base_128_hybrid) {
   TestConcurrentParallelFor("TestConcurrentParallelFor_4Thread_4Conc_1MTasks_dynamic_block_base_128", 4, 4, 1000000, 128, true);
 }
-
+#endif
 TEST(ThreadPoolTest, TestBurstScheduling_0Tasks) {
   TestBurstScheduling("TestBurstScheduling_0Tasks", 0);
 }
@@ -536,7 +536,7 @@ TEST(ThreadPoolTest, TestStackSize) {
     }
     n.Notify();
   });
-  n.Wait();
+  n.WaitForNotification();
   if (has_thread_limit_info) ASSERT_EQ(high_limit - low_limit, to.stack_size);
 }
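Finally, the InferenceSession::Run hunk earlier collapses the optional-lock dance into absl::MutexLockMaybe, which accepts a null Mutex* and then locks nothing. A condensed sketch of that pattern (function and parameter names here are placeholders, not the session API):

    #include "absl/synchronization/mutex.h"

    void RunSerializedIfNeeded(bool concurrent_run_supported, absl::Mutex* session_mu) {
      // Locks session_mu only when concurrent runs are unsupported;
      // passing nullptr makes the guard a no-op for the whole scope.
      absl::MutexLockMaybe sequential_run_lock(
          concurrent_run_supported ? nullptr : session_mu);
      // ... execute the model ...
    }

This replaces the previous std::optional<std::lock_guard<OrtMutex>> plus emplace() idiom with a single declaration, keeping the conditional serialization of Run() in one expression.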