From c11edb8346b8e726880dff134caacb25d26c3ea4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 16 Jan 2025 23:27:08 +0000 Subject: [PATCH] fix(profiling): reset all profiling c++ mutexes on fork [backport 2.19] (#11993) Backport d855c4a28824c15fd3afdbbe89315808efafdf07 from #11768 to 2.19. I'm not sure why it took so long to surface this defect, but it turns out that stack v2 can deadlock applications because not all mutices are reset. The repro in #11762 appears to be pretty durable. I need to investigate it a bit more in order to distill it down into a native stress test we can use moving forward. In practice, this patch suppresses the noted behavior in the repro. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) Co-authored-by: David Sanchez <838104+sanchda@users.noreply.github.com> --- .../profiling/dd_wrapper/include/uploader_builder.hpp | 2 ++ .../datadog/profiling/dd_wrapper/src/code_provenance.cpp | 3 +-- .../datadog/profiling/dd_wrapper/src/ddup_interface.cpp | 1 + .../internal/datadog/profiling/dd_wrapper/src/profile.cpp | 2 +- .../internal/datadog/profiling/dd_wrapper/src/sample.cpp | 1 - .../internal/datadog/profiling/dd_wrapper/src/uploader.cpp | 3 ++- .../datadog/profiling/dd_wrapper/src/uploader_builder.cpp | 7 +++++++ .../internal/datadog/profiling/stack_v2/src/sampler.cpp | 5 +++++ .../datadog/profiling/stack_v2/src/thread_span_links.cpp | 4 +--- .../fix-profiling-native-mutices-62440b5a3d9d6c4b.yaml | 5 +++++ 10 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 releasenotes/notes/fix-profiling-native-mutices-62440b5a3d9d6c4b.yaml diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/include/uploader_builder.hpp b/ddtrace/internal/datadog/profiling/dd_wrapper/include/uploader_builder.hpp index 62ee6aad853..7077096c744 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/include/uploader_builder.hpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/include/uploader_builder.hpp @@ -43,6 +43,8 @@ class UploaderBuilder static void set_output_filename(std::string_view _output_filename); static std::variant build(); + + static void postfork_child(); }; } // namespace Datadog diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/code_provenance.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/code_provenance.cpp index 0a4a49a4ce5..f3147cd2034 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/code_provenance.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/code_provenance.cpp @@ -14,9 +14,8 @@ namespace Datadog { void Datadog::CodeProvenance::postfork_child() { - get_instance().mtx.~mutex(); // Destroy the mutex + // NB placement-new to re-init and leak the mutex because doing anything else is UB new (&get_instance().mtx) std::mutex(); // Recreate the mutex - get_instance().reset(); // Reset the state } void diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/ddup_interface.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/ddup_interface.cpp index 5d3ef356c2a..9b52cbcaf6d 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/ddup_interface.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/ddup_interface.cpp @@ -24,6 +24,7 @@ ddup_postfork_child() Datadog::Uploader::postfork_child(); Datadog::SampleManager::postfork_child(); Datadog::CodeProvenance::postfork_child(); + Datadog::UploaderBuilder::postfork_child(); } void diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/profile.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/profile.cpp index 083ad1a655d..860f9c7cd3e 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/profile.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/profile.cpp @@ -203,6 +203,6 @@ Datadog::Profile::collect(const ddog_prof_Sample& sample, int64_t endtime_ns) void Datadog::Profile::postfork_child() { - profile_mtx.unlock(); + new (&profile_mtx) std::mutex(); cycle_buffers(); } diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp index 1e7ca1b0217..4483a021803 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp @@ -408,7 +408,6 @@ Datadog::Sample::push_absolute_ns(int64_t _timestamp_ns) return true; } - bool Datadog::Sample::push_monotonic_ns(int64_t _monotonic_ns) { diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader.cpp index 375c2e09e9e..325771946d8 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader.cpp @@ -175,5 +175,6 @@ Datadog::Uploader::postfork_parent() void Datadog::Uploader::postfork_child() { - unlock(); + // NB placement-new to re-init and leak the mutex because doing anything else is UB + new (&upload_lock) std::mutex(); } diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader_builder.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader_builder.cpp index 0661b7f217f..8ff5d45e7c2 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader_builder.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader_builder.cpp @@ -186,3 +186,10 @@ Datadog::UploaderBuilder::build() return Datadog::Uploader{ output_filename, ddog_exporter }; } + +void +Datadog::UploaderBuilder::postfork_child() +{ + // NB placement-new to re-init and leak the mutex because doing anything else is UB + new (&tag_mutex) std::mutex(); +} diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp b/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp index c05ae45477e..7ad9ad692b2 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp @@ -67,6 +67,11 @@ _stack_v2_atfork_child() // so we don't even reveal this function to the user _set_pid(getpid()); ThreadSpanLinks::postfork_child(); + + // `thread_info_map_lock` and `task_link_map_lock` are global locks held in echion + // NB placement-new to re-init and leak the mutex because doing anything else is UB + new (&thread_info_map_lock) std::mutex; + new (&task_link_map_lock) std::mutex; } __attribute__((constructor)) void diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/thread_span_links.cpp b/ddtrace/internal/datadog/profiling/stack_v2/src/thread_span_links.cpp index c777ff8a510..6be43a04a42 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/thread_span_links.cpp +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/thread_span_links.cpp @@ -53,10 +53,8 @@ ThreadSpanLinks::reset() void ThreadSpanLinks::postfork_child() { - // Explicitly destroy and reconstruct the mutex to avoid undefined behavior - get_instance().mtx.~mutex(); + // NB placement-new to re-init and leak the mutex because doing anything else is UB new (&get_instance().mtx) std::mutex(); - get_instance().reset(); } diff --git a/releasenotes/notes/fix-profiling-native-mutices-62440b5a3d9d6c4b.yaml b/releasenotes/notes/fix-profiling-native-mutices-62440b5a3d9d6c4b.yaml new file mode 100644 index 00000000000..40167a974c3 --- /dev/null +++ b/releasenotes/notes/fix-profiling-native-mutices-62440b5a3d9d6c4b.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + profiling: Fixes a bug where profiling mutexes were not cleared on fork in the child process. This could + cause deadlocks in certain configurations.