diff --git a/README.md b/README.md index 7d39612..de9efbc 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ By default, Coz works for C and C++ programs. It has been ported or has wrappers for several other languages, listed below: | Language | Link | -| ----------- | ----------- +| ----------- | ----------- | Java | JCoz: https://github.com/Decave/JCoz| | Go | Cozgo: https://github.com/urjitbhatia/cozgo| | Swift | Swift Coz: https://github.com/funcmike/swift-coz | @@ -89,6 +89,10 @@ To run your program with Coz, you will need to build it with debug information ( Once you have your program built with debug information, you can run it with Coz using the command `coz run {coz options} --- {program name and arguments}`. But, to produce a useful profile you need to decide which part(s) of the application you want to speed up by specifying one or more progress points. +If your program uses `jemalloc`, `tcmalloc`, or another custom memory allocator, +and it crashes or deadlocks during profiling, try passing the `--with-alloc-shims` +option to coz. + ### Profiling Modes Coz departs from conventional profiling by making it possible to view the effect of optimizations on both throughput and latency. To profile throughput, you must specify a progress point. To profile latency, you must specify a pair of progress points. 
diff --git a/coz b/coz
index b195eca..d66affc 100755
--- a/coz
+++ b/coz
@@ -47,21 +47,22 @@ def _coz_run(args):
 
   # Find coz
   coz_prefix = dirname(realpath(sys.argv[0]))
+  default_lib_name = 'libcoz.so'
 
   # Candidate runtime library locations
   library_locations = [
     # Check for library adjacent to this script
-    os.path.join(coz_prefix, '..', 'lib64', 'libcoz.so'),
-    os.path.join(coz_prefix, '..', 'lib', 'libcoz.so'),
+    os.path.join(coz_prefix, '..', 'lib64', default_lib_name),
+    os.path.join(coz_prefix, '..', 'lib', default_lib_name),
 
     # Check for library under the coz-profiler subdirectory
-    os.path.join(coz_prefix, '..', 'lib64', 'coz-profiler', 'libcoz.so'),
-    os.path.join(coz_prefix, '..', 'lib', 'coz-profiler', 'libcoz.so'),
+    os.path.join(coz_prefix, '..', 'lib64', 'coz-profiler', default_lib_name),
+    os.path.join(coz_prefix, '..', 'lib', 'coz-profiler', default_lib_name),
 
     # Local library under development directory
-    os.path.join('libcoz', 'libcoz.so'), # Local library during development
-    os.path.join(coz_prefix, 'libcoz', 'libcoz.so'),
-    os.path.join(coz_prefix, 'build', 'libcoz', 'libcoz.so'),
+    os.path.join('libcoz', default_lib_name), # Local library during development
+    os.path.join(coz_prefix, 'libcoz', default_lib_name),
+    os.path.join(coz_prefix, 'build', 'libcoz', default_lib_name),
   ]
 
   # Find the first library location that exists
@@ -83,6 +84,14 @@ def _coz_run(args):
   else:
     env['LD_PRELOAD'] = coz_runtime
 
+  if args.with_alloc_shims:
+    # The shim library is installed next to the runtime. Derive its path from
+    # the runtime's directory; replacing text in the whole path would also
+    # rewrite any directory whose name contains the runtime's file name.
+    cozallocshims = os.path.join(dirname(coz_runtime), 'libcozallocshims.so')
+    # Make cozallocshims first library to get loaded.
+    env['LD_PRELOAD'] = cozallocshims + ':' + env['LD_PRELOAD']
+
   if len(args.binary_scope) > 0:
     env['COZ_BINARY_SCOPE'] = '\t'.join(args.binary_scope)
   else:
@@ -185,6 +194,12 @@ _run_parser.add_argument('--fixed-speedup',
                          type=int, choices=list(range(0, 101)), default=None,
                          help='Evaluate optimizations of a specific amount')
 
+_run_parser.add_argument('--with-alloc-shims',
+                         action='store_true', default=False,
+                         help='Use shims for memory allocation functions (malloc). '
+                              'Useful as a proxy on top of libraries like jemalloc or '
+                              'tcmalloc, to avoid problems caused by them.')
+
 # Use defaults to recover handler function and parser object from parser output
 _run_parser.set_defaults(func=_coz_run, parser=_run_parser)
 
diff --git a/libcoz/CMakeLists.txt b/libcoz/CMakeLists.txt
index a3a9ea9..cbb067d 100644
--- a/libcoz/CMakeLists.txt
+++ b/libcoz/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(sources
   ${PROJECT_SOURCE_DIR}/include/coz.h
+  alloc_shims.h
   inspect.cpp
   inspect.h
   libcoz.cpp
@@ -24,6 +25,18 @@ target_include_directories(coz
         $<INSTALL_INTERFACE:include>)
 target_link_libraries(coz PRIVATE ${CMAKE_DL_LIBS} rt Threads::Threads libelfin::libelfin)
 
+# Optional allocation shims, preloaded in front of libcoz by
+# `coz run --with-alloc-shims`.
+add_library(cozallocshims MODULE alloc_shims.cpp alloc_shims.h)
+if(CONAN_PACKAGE_VERSION)
+  set_target_properties(cozallocshims PROPERTIES VERSION ${CONAN_PACKAGE_VERSION})
+endif()
+target_include_directories(cozallocshims
+    PUBLIC
+        $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
+        $<INSTALL_INTERFACE:include>)
+target_link_libraries(cozallocshims PRIVATE ${CMAKE_DL_LIBS})
+
 add_library(coz-instrumentation INTERFACE)
 target_include_directories(coz-instrumentation
     INTERFACE
@@ -32,7 +45,7 @@ target_include_directories(coz-instrumentation
 target_link_libraries(coz-instrumentation INTERFACE -Wl,--push-state,--no-as-needed ${CMAKE_DL_LIBS} -Wl,--pop-state)
 
 if(INSTALL_COZ)
-    install(TARGETS coz
+    install(TARGETS coz cozallocshims
         EXPORT coz-profilerTargets
         ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
         LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
diff --git a/libcoz/alloc_shims.cpp b/libcoz/alloc_shims.cpp
new file mode 100644
index 0000000..935d97c
--- /dev/null
+++ b/libcoz/alloc_shims.cpp
@@ -0,0 +1,238 @@
+// Allocation shims for coz, built as libcozallocshims.so.
+//
+// The library interposes malloc, calloc and free. Calls are normally forwarded
+// to the next definition in lookup order (dlsym(RTLD_NEXT, ...)). While one
+// thread is resolving symbols -- in lazy_init() below, or in libcoz between the
+// two coz_* entry points at the end of this file -- that thread is served from
+// a small static pool instead of the real allocator.
+
+#include <dlfcn.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <mutex>
+
+#include "perf.h"
+#include "ccutil/spinlock.h"
+
+// Set once the real allocation functions have been resolved.
+static std::atomic_bool initialized{false};
+// Serializes the one-time resolution done by lazy_init().
+static spinlock init_lock;
+// Held from coz_lock_and_set_dummy_alloc_shims() until
+// coz_restore_real_alloc_shims_and_unlock().
+static spinlock mode_lock;
+// Id of the thread that switched the shims to the dummy allocator, 0 if none.
+static std::atomic<pid_t> thread_using_shim{0};
+
+static constexpr size_t pool_alignment = alignof(std::max_align_t);
+static constexpr size_t memory_pool_size = 1000 * pool_alignment;
+alignas(std::max_align_t) static char memory_pool[memory_pool_size];
+
+static void lazy_init();
+
+// Initial target of malloc(): resolve the real functions, then dispatch again.
+static void* first_malloc(size_t size) {
+  lazy_init();
+  return malloc(size);
+}
+
+// Initial target of calloc(): resolve the real functions, then dispatch again.
+static void* first_calloc(size_t nmemb, size_t size) {
+  lazy_init();
+  return calloc(nmemb, size);
+}
+
+// What malloc() and calloc() currently dispatch to.
+static void* (*in_use_malloc)(size_t size) = first_malloc;
+static void* (*in_use_calloc)(size_t nmemb, size_t size) = first_calloc;
+
+// The forwarded-to implementations, filled in by find_real_functions().
+static void* (*real_malloc)(size_t size) = nullptr;
+static void (*real_free)(void* ptr) = nullptr;
+static void* (*real_calloc)(size_t nmemb, size_t size) = nullptr;
+
+// True when the calling thread is the one that enabled the dummy allocator.
+static bool calling_thread_owns_dummy_mode() {
+  return gettid() == thread_using_shim.load();
+}
+
+// Spins until lazy_init() has published the real functions. Reaching this
+// with initialization still pending means another thread is in the middle of
+// it, so the wait is expected to be short and rare.
+static void wait_for_real_functions() {
+  while (!initialized.load(std::memory_order_acquire)) {
+    /* busy wait */
+  }
+}
+
+// True when ptr points into memory_pool.
+static bool is_pool_pointer(const void* ptr) {
+  const auto address = reinterpret_cast<std::uintptr_t>(ptr);
+  const auto pool_begin = reinterpret_cast<std::uintptr_t>(memory_pool);
+  return address >= pool_begin && address - pool_begin < memory_pool_size;
+}
+
+// Bump allocator over memory_pool; blocks are never reused. It only has to
+// cover what symbol resolution allocates, so it stays deliberately simple, but
+// it still returns aligned, non-overlapping blocks. Aborts when the pool is
+// exhausted.
+static void* pool_allocate(size_t size) {
+  static size_t used = 0;
+
+  // Hand out at least one unit, so every block is distinct and lies inside the
+  // pool, which is how free() recognizes it.
+  if (size == 0)
+    size = 1;
+
+  // `used` is always a multiple of pool_alignment, so a request that fits
+  // before rounding also fits after it. Checking first keeps the rounding from
+  // wrapping around for huge sizes and never forms a pointer past the pool.
+  if (size > memory_pool_size - used)
+    abort();
+
+  void* result = memory_pool + used;
+  used += (size + pool_alignment - 1) & ~(pool_alignment - 1);
+  return result;
+}
+
+static void* dummy_malloc(size_t size) {
+  // The pool is reserved for the thread that enabled dummy mode. Any other
+  // thread that still ends up here gets the real allocator.
+  if (!calling_thread_owns_dummy_mode()) {
+    wait_for_real_functions();
+    return real_malloc(size);
+  }
+
+  return pool_allocate(size);
+}
+
+static void* dummy_calloc(size_t nmemb, size_t size) {
+  if (!calling_thread_owns_dummy_mode()) {
+    wait_for_real_functions();
+    return real_calloc(nmemb, size);
+  }
+
+  // Same contract as calloc: fail instead of allocating a wrapped-around size.
+  if (size != 0 && nmemb > SIZE_MAX / size)
+    return nullptr;
+
+  const size_t bytes = nmemb * size;
+  void* ptr = pool_allocate(bytes);
+  memset(ptr, 0, bytes);
+  return ptr;
+}
+
+static void set_dummy_allocs_impl() {
+  // Publish the owner first: if another thread ends up in a dummy function, it
+  // has to be able to tell that it is in the wrong place.
+  thread_using_shim.store(gettid());
+  in_use_malloc = dummy_malloc;
+  in_use_calloc = dummy_calloc;
+}
+
+static void restore_real_allocs_impl() {
+  in_use_malloc = real_malloc;
+  in_use_calloc = real_calloc;
+  thread_using_shim.store(0);
+}
+
+// Looks up the next definition of `name`; reports and aborts if there is none.
+template <typename Function>
+static void resolve_or_abort(Function& target, const char* name) {
+  target = reinterpret_cast<Function>(dlsym(RTLD_NEXT, name));
+  if (!target) {
+    fprintf(stderr, "Failed to find real %s!\n", name);
+    abort();
+  }
+}
+
+static void find_real_functions() {
+  // The caller has enabled the dummy allocator, so it is safe to call dlsym.
+  resolve_or_abort(real_malloc, "malloc");
+  resolve_or_abort(real_free, "free");
+  resolve_or_abort(real_calloc, "calloc");
+}
+
+static void lazy_init() {
+  // First check to improve performance, avoid locking if unnecessary.
+  if (initialized.load(std::memory_order_acquire))
+    return;
+
+  // Released on every return path, including the early one below. Leaving the
+  // lock held there would block every later caller forever.
+  std::lock_guard<spinlock> guard(init_lock);
+
+  // Another check, this time to make sure no one finished the initialization
+  // between the first check and our acquisition of the lock.
+  if (initialized.load(std::memory_order_relaxed))
+    return;
+
+  // Allocations could be made by libdl while we search for real functions.
+  // Prepare dummy allocs for this (first_malloc is no longer needed, we are
+  // already initializing).
+  set_dummy_allocs_impl();
+  find_real_functions();
+
+  // Now that we have real functions, use them by default.
+  restore_real_allocs_impl();
+
+  initialized.store(true, std::memory_order_release);
+}
+
+extern "C" {
+void coz_lock_and_set_dummy_alloc_shims() {
+  lazy_init();
+
+  // Only one thread at a time may resolve symbols. That is what makes it
+  // possible to serve the dummy allocator to the resolving thread alone.
+  // Waiting for this lock should be extremely rare: symbols are usually
+  // resolved from one thread, and each one only once per run.
+  mode_lock.lock();
+  set_dummy_allocs_impl();
+}
+
+void coz_restore_real_alloc_shims_and_unlock() {
+  restore_real_allocs_impl();
+  mode_lock.unlock();
+}
+
+void* malloc(size_t size) {
+  // Outside of symbol resolution this is a direct call to the real
+  // implementation, which keeps the overhead of the shim minimal.
+  return in_use_malloc(size);
+}
+
+void free(void* ptr) {
+  // Null ptrs are ignored anyway.
+  if (!ptr)
+    return;
+
+  // Pool blocks are never reused, so there is nothing to release.
+  if (is_pool_pointer(ptr))
+    return;
+
+  // free() may run before any malloc()/calloc() call triggered initialization.
+  // Resolve the real functions first, unless this thread is the one resolving
+  // them right now: re-entering lazy_init() would spin on init_lock forever.
+  if (!initialized.load(std::memory_order_acquire) &&
+      !calling_thread_owns_dummy_mode()) {
+    lazy_init();
+  }
+
+  // Still unset only while this thread is resolving the functions; in that
+  // case there is nothing to pass the block to, so it is leaked.
+  if (real_free)
+    real_free(ptr);
+}
+
+void* calloc(size_t nmemb, size_t size) {
+  return in_use_calloc(nmemb, size);
+}
+
+}
diff --git a/libcoz/alloc_shims.h b/libcoz/alloc_shims.h
new file mode 100644
index 0000000..dd88d1c
--- /dev/null
+++ b/libcoz/alloc_shims.h
@@ -0,0 +1,11 @@
+
+#if !defined(CAUSAL_RUNTIME_ALLOC_SHIMS_H)
+#define CAUSAL_RUNTIME_ALLOC_SHIMS_H
+
+// Entry points of the optional allocation shim library. They are declared weak,
+// so their addresses are null unless that library is loaded; callers must test
+// them before calling.
+extern "C" void coz_lock_and_set_dummy_alloc_shims() __attribute__((weak));
+extern "C" void coz_restore_real_alloc_shims_and_unlock() __attribute__((weak));
+
+#endif
diff --git a/libcoz/real.cpp b/libcoz/real.cpp
index 8e7c063..8360484 100644
--- a/libcoz/real.cpp
+++ b/libcoz/real.cpp
@@ -13,6 +13,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "alloc_shims.h"
+
 static bool resolving = false; //< Set to true while symbol resolution is in progress
 static bool in_dlopen = false; //< Set to true while dlopen is running
 static void* pthread_handle = NULL; //< The `dlopen` handle to libpthread
@@ -20,7 +22,11 @@ static void* pthread_handle = NULL; //< The `dlopen` handle to libpthread
 #define GET_SYMBOL_HANDLE(name, handle) \
   decltype(::name)* real_##name = nullptr; \
   while(!__atomic_exchange_n(&resolving, true, __ATOMIC_ACQ_REL)) {} \
+  if (coz_lock_and_set_dummy_alloc_shims) \
+    coz_lock_and_set_dummy_alloc_shims(); \
   uintptr_t addr = reinterpret_cast<uintptr_t>(dlsym(handle, #name)); \
+  if (coz_restore_real_alloc_shims_and_unlock) \
+    coz_restore_real_alloc_shims_and_unlock(); \
   memcpy(&real_##name, &addr, sizeof(uintptr_t)); \
   if(real_##name) { \
     memcpy(&real::name, &addr, sizeof(uintptr_t)); \