Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add alloc shims #191

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ By default, Coz works for C and C++ programs. It has been ported or
has wrappers for several other languages, listed below:

| Language | Link |
| ----------- | -----------
| ----------- | -----------
| Java | JCoz: https://github.com/Decave/JCoz|
| Go | Cozgo: https://github.com/urjitbhatia/cozgo|
| Swift | Swift Coz: https://github.com/funcmike/swift-coz |
Expand Down Expand Up @@ -89,6 +89,10 @@ To run your program with Coz, you will need to build it with debug information (

Once you have your program built with debug information, you can run it with Coz using the command `coz run {coz options} --- {program name and arguments}`. But, to produce a useful profile you need to decide which part(s) of the application you want to speed up by specifying one or more progress points.

If your program uses `jemalloc`, `tcmalloc`, or any other custom memory
allocator, and it crashes or deadlocks during profiling, try passing the
`--with-alloc-shims` option to `coz run`.

### Profiling Modes
Coz departs from conventional profiling by making it possible to view the effect of optimizations on both throughput and latency. To profile throughput, you must specify a progress point. To profile latency, you must specify a pair of progress points.

Expand Down
26 changes: 19 additions & 7 deletions coz
Original file line number Diff line number Diff line change
Expand Up @@ -47,21 +47,22 @@ def _coz_run(args):

# Find coz
coz_prefix = dirname(realpath(sys.argv[0]))
default_lib_name = 'libcoz.so'

# Candidate runtime library locations
library_locations = [
# Check for library adjacent to this script
os.path.join(coz_prefix, '..', 'lib64', 'libcoz.so'),
os.path.join(coz_prefix, '..', 'lib', 'libcoz.so'),
os.path.join(coz_prefix, '..', 'lib64', default_lib_name),
os.path.join(coz_prefix, '..', 'lib', default_lib_name),

# Check for library under the coz-profiler subdirectory
os.path.join(coz_prefix, '..', 'lib64', 'coz-profiler', 'libcoz.so'),
os.path.join(coz_prefix, '..', 'lib', 'coz-profiler', 'libcoz.so'),
os.path.join(coz_prefix, '..', 'lib64', 'coz-profiler', default_lib_name),
os.path.join(coz_prefix, '..', 'lib', 'coz-profiler', default_lib_name),

# Local library under development directory
os.path.join('libcoz', 'libcoz.so'), # Local library during development
os.path.join(coz_prefix, 'libcoz', 'libcoz.so'),
os.path.join(coz_prefix, 'build', 'libcoz', 'libcoz.so'),
os.path.join('libcoz', default_lib_name), # Local library during development
os.path.join(coz_prefix, 'libcoz', default_lib_name),
os.path.join(coz_prefix, 'build', 'libcoz', default_lib_name),
]

# Find the first library location that exists
Expand All @@ -83,6 +84,11 @@ def _coz_run(args):
else:
env['LD_PRELOAD'] = coz_runtime

if args.with_alloc_shims:
cozallocshims = coz_runtime.replace(default_lib_name, 'libcozallocshims.so')
# Make cozallocshims first library to get loaded.
env['LD_PRELOAD'] = cozallocshims + ':' + env['LD_PRELOAD']

if len(args.binary_scope) > 0:
env['COZ_BINARY_SCOPE'] = '\t'.join(args.binary_scope)
else:
Expand Down Expand Up @@ -185,6 +191,12 @@ _run_parser.add_argument('--fixed-speedup',
type=int, choices=list(range(0, 101)), default=None,
help='Evaluate optimizations of a specific amount')

_run_parser.add_argument('--with-alloc-shims',
action='store_true', default=False,
help='Use shims for memory allocation functions (malloc). '
'Useful as a proxy on top of libraries like jemalloc or '
'tcmalloc, to avoid problems caused by them.')

# Use defaults to recover handler function and parser object from parser output
_run_parser.set_defaults(func=_coz_run, parser=_run_parser)

Expand Down
13 changes: 12 additions & 1 deletion libcoz/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
set(sources
${PROJECT_SOURCE_DIR}/include/coz.h
alloc_shims.h
inspect.cpp
inspect.h
libcoz.cpp
Expand All @@ -24,6 +25,16 @@ target_include_directories(coz
$<INSTALL_INTERFACE:include>)
target_link_libraries(coz PRIVATE ${CMAKE_DL_LIBS} rt Threads::Threads libelfin::libelfin)

# Allocation shim library built from alloc_shims.cpp. It is a MODULE library:
# nothing links against it; the `coz` launcher prepends it to LD_PRELOAD when
# `--with-alloc-shims` is passed.
add_library(cozallocshims MODULE alloc_shims.cpp alloc_shims.h)
# Stamp the library with the package version when one is provided.
if(CONAN_PACKAGE_VERSION)
set_target_properties(cozallocshims PROPERTIES VERSION ${CONAN_PACKAGE_VERSION})
endif()
target_include_directories(cozallocshims
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>)
# The shims call dlsym(), so they need the platform's dl library.
target_link_libraries(cozallocshims PRIVATE ${CMAKE_DL_LIBS})

add_library(coz-instrumentation INTERFACE)
target_include_directories(coz-instrumentation
INTERFACE
Expand All @@ -32,7 +43,7 @@ target_include_directories(coz-instrumentation
target_link_libraries(coz-instrumentation INTERFACE -Wl,--push-state,--no-as-needed ${CMAKE_DL_LIBS} -Wl,--pop-state)

if(INSTALL_COZ)
install(TARGETS coz
install(TARGETS coz cozallocshims
EXPORT coz-profilerTargets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
Expand Down
191 changes: 191 additions & 0 deletions libcoz/alloc_shims.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@

#include <dlfcn.h>
#include <string.h>
#include <unistd.h>

#include <atomic>
#include <cstdlib>
#include <cstddef>
#include <cstdint>
#include <cstdio>

#include "perf.h"
#include "ccutil/spinlock.h"

// Becomes true once lazy_init() has resolved the real allocator functions.
static std::atomic_bool initialized{false};
// Taken by lazy_init() while the real allocator functions are being resolved.
static spinlock init_lock;
// Taken by coz_lock_and_set_dummy_alloc_shims() and released by
// coz_restore_real_alloc_shims_and_unlock().
static spinlock mode_lock;
// Id of the thread that is currently allowed to allocate from the static pool
// through dummy_malloc(); 0 when no thread is using the dummy allocator.
static std::atomic<pid_t> thread_using_shim{0};

// Statically allocated, max-aligned buffer that dummy_malloc() hands out
// memory from. Its size is a multiple of alignof(std::max_align_t).
static constexpr size_t memory_pool_size = 1000 * alignof(std::max_align_t);
alignas(std::max_align_t) static char memory_pool[memory_pool_size];

// Resolves the real allocator functions on first use (defined further below).
static void lazy_init();

// Initial target of in_use_malloc: makes sure the shim is initialized, then
// forwards the request to the (now re-pointed) malloc entry point.
static void* first_malloc(size_t size) {
  lazy_init();
  void* const block = malloc(size);
  return block;
}

// Initial target of in_use_calloc: makes sure the shim is initialized, then
// forwards the request to the (now re-pointed) calloc entry point.
static void* first_calloc(size_t nmemb, size_t size) {
  lazy_init();
  void* const block = calloc(nmemb, size);
  return block;
}

// Implementations the exported malloc()/calloc() currently dispatch to. They
// start at the first_* trampolines and are switched between the dummy and the
// real implementations by set_dummy_allocs_impl()/restore_real_allocs_impl().
static void* (*in_use_malloc)(size_t size) = first_malloc;
static void* (*in_use_calloc)(size_t nmemb, size_t size) = first_calloc;

// Next definitions of the allocator functions in library search order, as
// resolved with dlsym(RTLD_NEXT, ...) in find_real_functions(). They are
// nullptr until that has run.
static void* (*real_malloc)(size_t size) = nullptr;
static void (*real_free)(void* ptr) = nullptr;
static void* (*real_calloc)(size_t nmemb, size_t size) = nullptr;

// Bump allocator over the static memory pool.
//
// It serves only the thread recorded in thread_using_shim, while that thread
// is resolving real symbols. Any other thread that lands here is forwarded to
// the real malloc. Memory handed out from the pool is never reclaimed; free()
// ignores pointers that point into the pool.
//
// Returns a pointer aligned for std::max_align_t. Aborts the process when the
// pool cannot satisfy the request.
static void* dummy_malloc(size_t size) {
  // We use dummy malloc only in the thread requesting it, during resolution of
  // real symbols. Other threads should use the real malloc during that time.
  if (gettid() != thread_using_shim.load()) {
    // The only case in which real_malloc is still nullptr here is when another
    // thread is looking it up right now (initialization is in progress in that
    // thread). That is highly unlikely, but we had better be sure. Busy
    // waiting is the simplest thing to do, since it should almost never
    // happen.
    while (!initialized.load()) {
      /* busy wait */
    }
    return real_malloc(size);
  }

  // Dummy malloc is used only while coz resolves real symbols. For that
  // particular case we need neither a sophisticated allocation algorithm nor a
  // lot of memory. We still keep returned addresses aligned and buffers
  // non-overlapping.

  static char* first_unallocated{memory_pool};

  constexpr size_t alignment = alignof(std::max_align_t);
  const size_t remaining =
      static_cast<size_t>(&memory_pool[memory_pool_size] - first_unallocated);

  // Check against the remaining space BEFORE rounding or advancing the
  // pointer: this keeps the pointer arithmetic inside the pool and prevents
  // the rounding below from wrapping around for huge requests.
  if (size > remaining)
    abort();

  // Round the size up to a multiple of the alignment to keep addresses
  // aligned. `remaining` is itself a multiple of the alignment (the pool size
  // is, and every allocation advances by a rounded size), so the rounded size
  // still fits.
  const size_t rounded = (size + alignment - 1) & ~(alignment - 1);

  char* const result = first_unallocated;
  first_unallocated += rounded;
  return result;
}

// calloc counterpart of dummy_malloc(): allocates nmemb * size bytes through
// dummy_malloc() and zero-fills them.
//
// Returns nullptr when nmemb * size overflows size_t, or when the underlying
// allocation returns nullptr (possible when dummy_malloc() forwards to the
// real malloc for a thread that is not using the shim).
static void* dummy_calloc(size_t nmemb, size_t size) {
  // Reject requests whose byte count does not fit in size_t; a wrapped
  // product would silently produce a buffer that is too small.
  if (nmemb != 0 && size > SIZE_MAX / nmemb)
    return nullptr;

  const size_t total = nmemb * size;
  void* ptr = dummy_malloc(total);
  if (ptr)
    memset(ptr, 0, total);
  return ptr;
}

// Routes malloc/calloc to the dummy (static pool) implementations on behalf
// of the calling thread.
static void set_dummy_allocs_impl() {
  // Record the owner first: if another thread ends up in dummy_malloc, this
  // is how it finds out that it is in the wrong place.
  const pid_t self = gettid();
  thread_using_shim = self;

  in_use_calloc = dummy_calloc;
  in_use_malloc = dummy_malloc;
}

// Routes malloc/calloc back to the real implementations and clears the
// dummy-allocator owner.
static void restore_real_allocs_impl() {
  in_use_malloc = real_malloc;
  in_use_calloc = real_calloc;

  // The owner is cleared only after both pointers have been switched back.
  thread_using_shim = 0;
}

// Looks up the next definitions of malloc, free and calloc with
// dlsym(RTLD_NEXT, ...) and stores them in real_malloc, real_free and
// real_calloc. Prints a message to stderr and aborts if any lookup fails.
//
// The caller must have switched to the dummy allocators beforehand, so that
// allocations made by dlsym itself are safe.
static void find_real_functions() {
  using malloc_fn = void* (*)(size_t);
  using free_fn = void (*)(void*);
  using calloc_fn = void* (*)(size_t, size_t);

  real_malloc = reinterpret_cast<malloc_fn>(dlsym(RTLD_NEXT, "malloc"));
  if (real_malloc == nullptr) {
    fprintf(stderr, "Failed to find real malloc!\n");
    abort();
  }

  real_free = reinterpret_cast<free_fn>(dlsym(RTLD_NEXT, "free"));
  if (real_free == nullptr) {
    fprintf(stderr, "Failed to find real free!\n");
    abort();
  }

  real_calloc = reinterpret_cast<calloc_fn>(dlsym(RTLD_NEXT, "calloc"));
  if (real_calloc == nullptr) {
    fprintf(stderr, "Failed to find real calloc!\n");
    abort();
  }
}

// Resolves the real allocator functions exactly once.
//
// Safe to call from any thread and any number of times: after the first
// successful run it returns immediately. While resolution is in progress the
// calling thread allocates from the static pool through the dummy allocators.
static void lazy_init() {
  // First check to improve performance: avoid locking if unnecessary.
  if (initialized.load(std::memory_order_acquire))
    return;

  init_lock.lock();

  // Second check, this time under the lock, in case another thread finished
  // the initialization between the first check and our acquiring the lock.
  // The lock must be released on this path too; otherwise any further thread
  // already waiting on it would spin forever.
  if (!initialized.load(std::memory_order_relaxed)) {
    // Allocations could be made by libdl while we search for real functions.
    // Prepare dummy allocs for this (first_malloc is no longer needed, we are
    // already initializing).
    set_dummy_allocs_impl();

    // Now find real functions.
    find_real_functions();

    // Now that we have real functions, use them by default.
    restore_real_allocs_impl();

    initialized.store(true, std::memory_order_release);
  }

  init_lock.unlock();
}

extern "C" {
// Switches the calling thread to the dummy allocators and takes mode_lock.
// Must be paired with coz_restore_real_alloc_shims_and_unlock().
void coz_lock_and_set_dummy_alloc_shims() {
  // The real functions have to be known before we can later switch back.
  lazy_init();

  // mode_lock guarantees that only one thread resolves real symbols at a
  // time. That, in turn, makes it possible to ensure the dummy malloc is used
  // only by the thread that resolves a symbol, and only while it does so.
  // Waiting on this lock should be extremely rare: symbols are usually
  // resolved from one thread, and each resolve_* function is called only once
  // during the runtime of the program.
  mode_lock.lock();

  set_dummy_allocs_impl();
}

// Counterpart of coz_lock_and_set_dummy_alloc_shims(): switches malloc/calloc
// back to the real implementations, then releases mode_lock.
void coz_restore_real_alloc_shims_and_unlock() {
  // Switch back before unlocking, so the next lock holder starts from the
  // real allocators.
  restore_real_allocs_impl();

  mode_lock.unlock();
}

// Exported malloc replacement; dispatches through in_use_malloc.
void* malloc(size_t size) {
  // While the dummy implementation is not needed (most of the runtime of the
  // program), in_use_malloc points straight at the real implementation, which
  // keeps the overhead of the shim to a single indirect call.
  void* const block = in_use_malloc(size);
  return block;
}

void free(void *ptr) {
// Null ptrs are ignored anyway.
if (!ptr)
return;

// If it is allocated in our pool we should free it ( or not :) )
if (ptr >= memory_pool && ptr < memory_pool + memory_pool_size)
return;

// It is probably allocated with real malloc, so let real free deal with it.
real_free(ptr);
}

// Exported calloc replacement; dispatches through in_use_calloc.
void* calloc(size_t nmemb, size_t size) {
  void* const block = in_use_calloc(nmemb, size);
  return block;
}

}
8 changes: 8 additions & 0 deletions libcoz/alloc_shims.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

#if !defined(CAUSAL_RUNTIME_ALLOC_SHIMS_H)
#define CAUSAL_RUNTIME_ALLOC_SHIMS_H

// Entry points of the optional allocation shim library. Both are declared
// weak: when no definition is available at runtime their addresses compare
// equal to null, so callers must test them before calling.

// Takes the shim's mode lock and routes malloc/calloc of the calling thread to
// the shim's static-pool allocator.
extern "C" void coz_lock_and_set_dummy_alloc_shims() __attribute__((weak));
// Routes malloc/calloc back to the real implementations and releases the mode
// lock taken by coz_lock_and_set_dummy_alloc_shims().
extern "C" void coz_restore_real_alloc_shims_and_unlock() __attribute__((weak));

#endif
6 changes: 6 additions & 0 deletions libcoz/real.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,20 @@
#include <stdint.h>
#include <string.h>

#include "alloc_shims.h"

static bool resolving = false; //< Set to true while symbol resolution is in progress
static bool in_dlopen = false; //< Set to true while dlopen is running
static void* pthread_handle = NULL; //< The `dlopen` handle to libpthread

#define GET_SYMBOL_HANDLE(name, handle) \
decltype(::name)* real_##name = nullptr; \
while(!__atomic_exchange_n(&resolving, true, __ATOMIC_ACQ_REL)) {} \
if (coz_lock_and_set_dummy_alloc_shims) \
coz_lock_and_set_dummy_alloc_shims(); \
uintptr_t addr = reinterpret_cast<uintptr_t>(dlsym(handle, #name)); \
if (coz_restore_real_alloc_shims_and_unlock) \
coz_restore_real_alloc_shims_and_unlock(); \
memcpy(&real_##name, &addr, sizeof(uintptr_t)); \
if(real_##name) { \
memcpy(&real::name, &addr, sizeof(uintptr_t)); \
Expand Down