Skip to content

Commit

Permalink
Add alloc shims library
Browse files Browse the repository at this point in the history
  • Loading branch information
Marko Ivanovich authored and kormang committed Nov 28, 2022
1 parent 0067ae7 commit ab5747f
Show file tree
Hide file tree
Showing 6 changed files with 241 additions and 9 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ By default, Coz works for C and C++ programs. It has been ported or
has wrappers for several other languages, listed below:

| Language | Link |
| ----------- | -----------
| ----------- | -----------
| Java | JCoz: https://github.com/Decave/JCoz|
| Go | Cozgo: https://github.com/urjitbhatia/cozgo|
| Swift | Swift Coz: https://github.com/funcmike/swift-coz |
Expand Down Expand Up @@ -89,6 +89,10 @@ To run your program with Coz, you will need to build it with debug information (

Once you have your program built with debug information, you can run it with Coz using the command `coz run {coz options} --- {program name and arguments}`. But, to produce a useful profile you need to decide which part(s) of the application you want to speed up by specifying one or more progress points.

If your program uses a custom memory allocator such as `jemalloc` or `tcmalloc`,
and it crashes or deadlocks during profiling, try passing the
`--with-alloc-shims` option to `coz run`.

### Profiling Modes
Coz departs from conventional profiling by making it possible to view the effect of optimizations on both throughput and latency. To profile throughput, you must specify a progress point. To profile latency, you must specify a pair of progress points.

Expand Down
26 changes: 19 additions & 7 deletions coz
Original file line number Diff line number Diff line change
Expand Up @@ -47,21 +47,22 @@ def _coz_run(args):

# Find coz
coz_prefix = dirname(realpath(sys.argv[0]))
default_lib_name = 'libcoz.so'

# Candidate runtime library locations
library_locations = [
# Check for library adjacent to this script
os.path.join(coz_prefix, '..', 'lib64', 'libcoz.so'),
os.path.join(coz_prefix, '..', 'lib', 'libcoz.so'),
os.path.join(coz_prefix, '..', 'lib64', default_lib_name),
os.path.join(coz_prefix, '..', 'lib', default_lib_name),

# Check for library under the coz-profiler subdirectory
os.path.join(coz_prefix, '..', 'lib64', 'coz-profiler', 'libcoz.so'),
os.path.join(coz_prefix, '..', 'lib', 'coz-profiler', 'libcoz.so'),
os.path.join(coz_prefix, '..', 'lib64', 'coz-profiler', default_lib_name),
os.path.join(coz_prefix, '..', 'lib', 'coz-profiler', default_lib_name),

# Local library under development directory
os.path.join('libcoz', 'libcoz.so'), # Local library during development
os.path.join(coz_prefix, 'libcoz', 'libcoz.so'),
os.path.join(coz_prefix, 'build', 'libcoz', 'libcoz.so'),
os.path.join('libcoz', default_lib_name), # Local library during development
os.path.join(coz_prefix, 'libcoz', default_lib_name),
os.path.join(coz_prefix, 'build', 'libcoz', default_lib_name),
]

# Find the first library location that exists
Expand All @@ -83,6 +84,11 @@ def _coz_run(args):
else:
env['LD_PRELOAD'] = coz_runtime

if args.with_alloc_shims:
cozallocshims = coz_runtime.replace(default_lib_name, 'libcozallocshims.so')
# Make cozallocshims first library to get loaded.
env['LD_PRELOAD'] = cozallocshims + ':' + env['LD_PRELOAD']

if len(args.binary_scope) > 0:
env['COZ_BINARY_SCOPE'] = '\t'.join(args.binary_scope)
else:
Expand Down Expand Up @@ -185,6 +191,12 @@ _run_parser.add_argument('--fixed-speedup',
type=int, choices=list(range(0, 101)), default=None,
help='Evaluate optimizations of a specific amount')

_run_parser.add_argument('--with-alloc-shims',
action='store_true', default=False,
help='Use shims for memory allocation functions (malloc). '
'Useful as a proxy on top of libraries like jemalloc or '
'tcmalloc, to avoid problems caused by them.')

# Use defaults to recover handler function and parser object from parser output
_run_parser.set_defaults(func=_coz_run, parser=_run_parser)

Expand Down
13 changes: 12 additions & 1 deletion libcoz/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
set(sources
${PROJECT_SOURCE_DIR}/include/coz.h
alloc_shims.h
inspect.cpp
inspect.h
libcoz.cpp
Expand All @@ -24,6 +25,16 @@ target_include_directories(coz
$<INSTALL_INTERFACE:include>)
target_link_libraries(coz PRIVATE ${CMAKE_DL_LIBS} rt Threads::Threads libelfin::libelfin)

add_library(cozallocshims MODULE alloc_shims.cpp alloc_shims.h)
if(CONAN_PACKAGE_VERSION)
set_target_properties(cozallocshims PROPERTIES VERSION ${CONAN_PACKAGE_VERSION})
endif()
target_include_directories(cozallocshims
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>)
target_link_libraries(cozallocshims PRIVATE ${CMAKE_DL_LIBS})

add_library(coz-instrumentation INTERFACE)
target_include_directories(coz-instrumentation
INTERFACE
Expand All @@ -32,7 +43,7 @@ target_include_directories(coz-instrumentation
target_link_libraries(coz-instrumentation INTERFACE -Wl,--push-state,--no-as-needed ${CMAKE_DL_LIBS} -Wl,--pop-state)

if(INSTALL_COZ)
install(TARGETS coz
install(TARGETS coz cozallocshims
EXPORT coz-profilerTargets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
Expand Down
191 changes: 191 additions & 0 deletions libcoz/alloc_shims.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@

#include <dlfcn.h>
#include <string.h>
#include <unistd.h>

#include <atomic>
#include <cstdlib>
#include <cstddef>
#include <cstdint>
#include <cstdio>

#include "perf.h"
#include "ccutil/spinlock.h"

// Set to true by lazy_init() once the real allocator entry points have been
// resolved and installed.
static std::atomic_bool initialized{false};
// Held by lazy_init() while the one-time symbol resolution runs.
static spinlock init_lock;
// Held between coz_lock_and_set_dummy_alloc_shims() and
// coz_restore_real_alloc_shims_and_unlock().
static spinlock mode_lock;
// Thread id (gettid) of the thread that switched the dummy allocators on;
// reset to 0 when the real allocators are restored.
static std::atomic<pid_t> thread_using_shim{0};

// Size in bytes of the static pool that dummy_malloc() carves blocks from.
static constexpr size_t memory_pool_size = 1000 * alignof(std::max_align_t);
// Backing storage for dummy_malloc(); free() ignores pointers that fall inside
// this array, so pool memory is never reclaimed.
alignas(std::max_align_t) static char memory_pool[memory_pool_size];

// Resolves the real allocator functions on first use (defined below).
static void lazy_init();

// Initial target of in_use_malloc: runs the one-time initialisation and then
// re-enters malloc(), which by then dispatches through the updated pointer.
static void* first_malloc(size_t size) {
  lazy_init();
  void* const block = malloc(size);
  return block;
}

// Initial target of in_use_calloc: runs the one-time initialisation and then
// re-enters calloc(), which by then dispatches through the updated pointer.
static void* first_calloc(size_t nmemb, size_t size) {
  lazy_init();
  void* const block = calloc(nmemb, size);
  return block;
}

// Implementations the exported malloc()/calloc() currently forward to. They
// start at the first_* bootstrap functions and are later switched between the
// dummy_* and real_* implementations.
static void* (*in_use_malloc)(size_t size) = first_malloc;
static void* (*in_use_calloc)(size_t nmemb, size_t size) = first_calloc;

// Underlying allocator entry points, filled in by find_real_functions() via
// dlsym(RTLD_NEXT, ...). They remain null until that has run.
static void* (*real_malloc)(size_t size) = nullptr;
static void (*real_free)(void* ptr) = nullptr;
static void* (*real_calloc)(size_t nmemb, size_t size) = nullptr;

// Minimal bump allocator used while real symbols are being resolved.
//
// Only the thread that switched the dummy allocators on is served from the
// static pool; any other thread is forwarded to the real malloc. Pool
// exhaustion is fatal: the function aborts rather than returning null.
//
// @param size  number of bytes requested
// @return      pointer aligned for std::max_align_t (pool path), or whatever
//              the real malloc returns (other threads)
static void* dummy_malloc(size_t size) {
  // We use dummy malloc only in the thread requesting it, during resolution of
  // real symbols. Other threads should use the real malloc during that time.
  if (gettid() != thread_using_shim.load()) {
    // The only case in which real_malloc is nullptr is when another thread is
    // looking it up right now (initialization is in progress there). That is
    // highly unlikely, but we had better be sure. Busy waiting is the simplest
    // thing to do, since it should almost never happen.
    while (!initialized.load()) {
      /* busy wait */
    }
    return real_malloc(size);
  }

  // Dummy malloc is used only while coz resolves real symbols. For that case
  // we need neither a sophisticated allocation algorithm nor a lot of memory,
  // but the basics must hold: aligned addresses and non-overlapping buffers.

  static char* first_unallocated{memory_pool};

  constexpr size_t alignment = alignof(std::max_align_t);
  const size_t remaining =
      static_cast<size_t>(&memory_pool[memory_pool_size] - first_unallocated);

  // Reject oversized requests BEFORE rounding and before moving the pointer.
  // Rounding a size close to SIZE_MAX wraps around to a tiny value, which
  // would otherwise slip through the bounds check and hand out a block far
  // smaller than requested.
  if (size > remaining)
    abort();

  // Round up to a multiple of the alignment so every block stays aligned.
  // This cannot overflow because size <= remaining <= memory_pool_size, and it
  // cannot exceed `remaining` because the pool size and all previous
  // allocations are multiples of the alignment as well.
  size = (size + alignment - 1) & ~(alignment - 1);

  char* result = first_unallocated;
  first_unallocated += size;

  return result;
}

// calloc counterpart of dummy_malloc(): allocates nmemb * size bytes and
// zero-fills them.
//
// @param nmemb  number of elements
// @param size   size of each element in bytes
// @return       zeroed block, or nullptr if nmemb * size overflows size_t or
//               the underlying allocation returned nullptr
static void* dummy_calloc(size_t nmemb, size_t size) {
  // Refuse requests whose byte count does not fit in size_t; multiplying
  // blindly would wrap around and yield a block that is too small.
  if (size != 0 && nmemb > SIZE_MAX / size)
    return nullptr;

  const size_t total = nmemb * size;
  void* ptr = dummy_malloc(total);

  // dummy_malloc() forwards to the real malloc for threads that do not own the
  // shim, and that call may fail; do not memset a null pointer.
  if (ptr != nullptr)
    memset(ptr, 0, total);

  return ptr;
}

// Routes malloc()/calloc() to the dummy implementations and records the
// calling thread as the one allowed to use them.
static void set_dummy_allocs_impl() {
  // Publish the owner first: if another thread ends up in dummy_malloc, it
  // must be able to tell that it is in the wrong place.
  const pid_t owner = gettid();
  thread_using_shim.store(owner);

  in_use_malloc = dummy_malloc;
  in_use_calloc = dummy_calloc;
}

// Routes malloc()/calloc() back to the real implementations and clears the
// record of which thread was using the dummy allocators.
static void restore_real_allocs_impl() {
  in_use_malloc = real_malloc;
  in_use_calloc = real_calloc;
  // Cleared last, after the dispatch pointers have been switched back.
  thread_using_shim.store(0);
}

// Looks up the next definitions of malloc, free and calloc in library search
// order and stores them in real_malloc / real_free / real_calloc. Aborts the
// process with a message on stderr if any of them cannot be found.
//
// lazy_init() switches the dummy allocators on before calling this, so
// allocations made by dlsym itself are served from the static pool.
static void find_real_functions() {
  // Resolve one symbol, or report the failure and terminate.
  const auto resolve_or_abort = [](const char* symbol,
                                   const char* failure_message) -> void* {
    void* const address = dlsym(RTLD_NEXT, symbol);
    if (address == nullptr) {
      fputs(failure_message, stderr);
      abort();
    }
    return address;
  };

  real_malloc = reinterpret_cast<void* (*)(size_t)>(
      resolve_or_abort("malloc", "Failed to find real malloc!\n"));

  real_free = reinterpret_cast<void (*)(void*)>(
      resolve_or_abort("free", "Failed to find real free!\n"));

  real_calloc = reinterpret_cast<void* (*)(size_t, size_t)>(
      resolve_or_abort("calloc", "Failed to find real calloc!\n"));
}

// One-time initialisation: resolves the real allocator functions and installs
// them as the active implementations. Safe to call repeatedly and from several
// threads; only the first caller does the work.
static void lazy_init() {
  // First check to improve performance: avoid locking if unnecessary.
  if (initialized.load(std::memory_order_acquire))
    return;

  init_lock.lock();

  // Second check, this time under the lock, in case another thread finished
  // the initialisation between the first check and our acquiring the lock.
  // The lock MUST be released on this path too; otherwise every further thread
  // waiting on init_lock would spin forever.
  if (initialized.load(std::memory_order_relaxed)) {
    init_lock.unlock();
    return;
  }

  // Allocations could be made by libdl while we search for real functions.
  // Prepare dummy allocs for this (first_malloc is no longer needed, we are
  // already initializing).
  set_dummy_allocs_impl();

  // Now find real functions.
  find_real_functions();

  // Now that we have real functions, use them by default.
  restore_real_allocs_impl();

  initialized.store(true, std::memory_order_release);
  init_lock.unlock();
}

extern "C" {
// Switches malloc()/calloc() to the dummy implementations for the calling
// thread and keeps mode_lock held until
// coz_restore_real_alloc_shims_and_unlock() is called.
void coz_lock_and_set_dummy_alloc_shims() {
  // Make sure the real functions are resolved before switching modes.
  lazy_init();

  // This makes sure only one thread resolves real symbols at a time. That in
  // turn makes it possible to ensure dummy malloc is called only from the
  // thread that resolves a real symbol, while it resolves it.
  // Waiting for this lock should be an extremely rare case. Symbols are usually
  // resolved from one thread, and each resolve_* function is called only once
  // during the runtime of the program.
  mode_lock.lock();
  set_dummy_allocs_impl();
}

// Counterpart of coz_lock_and_set_dummy_alloc_shims(): switches malloc()/
// calloc() back to the real implementations, then releases mode_lock.
void coz_restore_real_alloc_shims_and_unlock() {
  restore_real_allocs_impl();
  mode_lock.unlock();
}

// Exported malloc replacement: forwards to whichever implementation is
// currently installed in in_use_malloc.
void* malloc(size_t size) {
  // Outside of symbol resolution the installed target is the real malloc, so
  // the shim costs one indirect call and nothing more.
  void* (*const target)(size_t) = in_use_malloc;
  return target(size);
}

void free(void *ptr) {
// Null ptrs are ignored anyway.
if (!ptr)
return;

// If it is allocated in our pool we should free it ( or not :) )
if (ptr >= memory_pool && ptr < memory_pool + memory_pool_size)
return;

// It is probably allocated with real malloc, so let real free deal with it.
real_free(ptr);
}

// Exported calloc replacement: forwards to whichever implementation is
// currently installed in in_use_calloc.
void* calloc(size_t nmemb, size_t size) {
  void* (*const target)(size_t, size_t) = in_use_calloc;
  return target(nmemb, size);
}

}
8 changes: 8 additions & 0 deletions libcoz/alloc_shims.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

// Declarations of the hooks that let the profiler runtime switch the
// allocation shims into "dummy" mode around symbol resolution.
#if !defined(CAUSAL_RUNTIME_ALLOC_SHIMS_H)
#define CAUSAL_RUNTIME_ALLOC_SHIMS_H

// Both functions are declared weak; callers test the function's address
// against null before calling it, so the hooks are optional at run time.

// Switches malloc/calloc to the dummy implementations and takes the mode lock.
extern "C" void coz_lock_and_set_dummy_alloc_shims() __attribute__((weak));
// Switches malloc/calloc back to the real implementations and releases the
// mode lock.
extern "C" void coz_restore_real_alloc_shims_and_unlock() __attribute__((weak));

#endif
6 changes: 6 additions & 0 deletions libcoz/real.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,20 @@
#include <stdint.h>
#include <string.h>

#include "alloc_shims.h"

static bool resolving = false; //< Set to true while symbol resolution is in progress
static bool in_dlopen = false; //< Set to true while dlopen is running
static void* pthread_handle = NULL; //< The `dlopen` handle to libpthread

#define GET_SYMBOL_HANDLE(name, handle) \
decltype(::name)* real_##name = nullptr; \
while(!__atomic_exchange_n(&resolving, true, __ATOMIC_ACQ_REL)) {} \
if (coz_lock_and_set_dummy_alloc_shims) \
coz_lock_and_set_dummy_alloc_shims(); \
uintptr_t addr = reinterpret_cast<uintptr_t>(dlsym(handle, #name)); \
if (coz_restore_real_alloc_shims_and_unlock) \
coz_restore_real_alloc_shims_and_unlock(); \
memcpy(&real_##name, &addr, sizeof(uintptr_t)); \
if(real_##name) { \
memcpy(&real::name, &addr, sizeof(uintptr_t)); \
Expand Down

0 comments on commit ab5747f

Please sign in to comment.