diff --git a/README.md b/README.md
index 7d39612..de9efbc 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ By default, Coz works for C and C++ programs. It has been ported or
 has wrappers for several other languages, listed below:
 
 | Language      | Link |
-| ----------- | -----------  
+| ----------- | -----------
 | Java   | JCoz: https://github.com/Decave/JCoz|
 | Go     | Cozgo: https://github.com/urjitbhatia/cozgo|
 | Swift  | Swift Coz: https://github.com/funcmike/swift-coz |
@@ -89,6 +89,10 @@ To run your program with Coz, you will need to build it with debug information (
 
 Once you have your program built with debug information, you can run it with Coz using the command `coz run {coz options} --- {program name and arguments}`. But, to produce a useful profile you need to decide which part(s) of the application you want to speed up by specifying one or more progress points.
 
+If your program uses `jemalloc`, `tcmalloc`, or another custom memory allocator
+and it crashes or deadlocks during profiling, try passing the
+`--with-alloc-shims` option to `coz run`.
+
 ### Profiling Modes
 Coz departs from conventional profiling by making it possible to view the effect of optimizations on both throughput and latency. To profile throughput, you must specify a progress point. To profile latency, you must specify a pair of progress points.
 
diff --git a/coz b/coz
index b195eca..d66affc 100755
--- a/coz
+++ b/coz
@@ -47,21 +47,22 @@ def _coz_run(args):
 
   # Find coz
   coz_prefix = dirname(realpath(sys.argv[0]))
+  default_lib_name = 'libcoz.so'
 
   # Candidate runtime library locations
   library_locations = [
     # Check for library adjacent to this script
-    os.path.join(coz_prefix, '..', 'lib64', 'libcoz.so'),
-    os.path.join(coz_prefix, '..', 'lib', 'libcoz.so'),
+    os.path.join(coz_prefix, '..', 'lib64', default_lib_name),
+    os.path.join(coz_prefix, '..', 'lib', default_lib_name),
 
     # Check for library under the coz-profiler subdirectory
-    os.path.join(coz_prefix, '..', 'lib64', 'coz-profiler', 'libcoz.so'),
-    os.path.join(coz_prefix, '..', 'lib', 'coz-profiler', 'libcoz.so'),
+    os.path.join(coz_prefix, '..', 'lib64', 'coz-profiler', default_lib_name),
+    os.path.join(coz_prefix, '..', 'lib', 'coz-profiler', default_lib_name),
 
     # Local library under development directory
-    os.path.join('libcoz', 'libcoz.so'),      # Local library during development
-    os.path.join(coz_prefix, 'libcoz', 'libcoz.so'),
-    os.path.join(coz_prefix, 'build', 'libcoz', 'libcoz.so'),
+    os.path.join('libcoz', default_lib_name), # Local library during development
+    os.path.join(coz_prefix, 'libcoz', default_lib_name),
+    os.path.join(coz_prefix, 'build', 'libcoz', default_lib_name),
   ]
 
   # Find the first library location that exists
@@ -83,6 +84,11 @@ def _coz_run(args):
   else:
     env['LD_PRELOAD'] = coz_runtime
 
+  if args.with_alloc_shims:
+    cozallocshims = coz_runtime.replace(default_lib_name, 'libcozallocshims.so')
+    # Make cozallocshims first library to get loaded.
+    env['LD_PRELOAD'] = cozallocshims + ':' + env['LD_PRELOAD']
+
   if len(args.binary_scope) > 0:
     env['COZ_BINARY_SCOPE'] = '\t'.join(args.binary_scope)
   else:
@@ -185,6 +191,12 @@ _run_parser.add_argument('--fixed-speedup',
                          type=int, choices=list(range(0, 101)), default=None,
                          help='Evaluate optimizations of a specific amount')
 
+_run_parser.add_argument('--with-alloc-shims',  # read by _coz_run, which preloads libcozallocshims.so
+                         action='store_true', default=False,
+                         help='Use shims for memory allocation functions (malloc). '
+                            'Useful as a proxy on top of libraries like jemalloc or '
+                            'tcmalloc, to avoid problems caused by them.')
+
 # Use defaults to recover handler function and parser object from parser output
 _run_parser.set_defaults(func=_coz_run, parser=_run_parser)
 
diff --git a/libcoz/CMakeLists.txt b/libcoz/CMakeLists.txt
index a3a9ea9..cbb067d 100644
--- a/libcoz/CMakeLists.txt
+++ b/libcoz/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(sources
     ${PROJECT_SOURCE_DIR}/include/coz.h
+    alloc_shims.h
     inspect.cpp
     inspect.h
     libcoz.cpp
@@ -24,6 +25,16 @@ target_include_directories(coz
         $<INSTALL_INTERFACE:include>)
 target_link_libraries(coz PRIVATE ${CMAKE_DL_LIBS} rt Threads::Threads libelfin::libelfin)
 
+add_library(cozallocshims MODULE alloc_shims.cpp alloc_shims.h)  # MODULE: loaded at runtime (the coz script puts it in LD_PRELOAD)
+if(CONAN_PACKAGE_VERSION)  # skip versioning when CONAN_PACKAGE_VERSION is unset or empty
+    set_target_properties(cozallocshims PROPERTIES VERSION ${CONAN_PACKAGE_VERSION})
+endif()
+target_include_directories(cozallocshims
+    PUBLIC
+        $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
+        $<INSTALL_INTERFACE:include>)
+target_link_libraries(cozallocshims PRIVATE ${CMAKE_DL_LIBS})  # alloc_shims.cpp calls dlsym()
+
 add_library(coz-instrumentation INTERFACE)
 target_include_directories(coz-instrumentation
     INTERFACE
@@ -32,7 +43,7 @@ target_include_directories(coz-instrumentation
 target_link_libraries(coz-instrumentation INTERFACE -Wl,--push-state,--no-as-needed ${CMAKE_DL_LIBS} -Wl,--pop-state)
 
 if(INSTALL_COZ)
-    install(TARGETS coz
+    install(TARGETS coz cozallocshims
         EXPORT coz-profilerTargets
         ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
         LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
diff --git a/libcoz/alloc_shims.cpp b/libcoz/alloc_shims.cpp
new file mode 100644
index 0000000..935d97c
--- /dev/null
+++ b/libcoz/alloc_shims.cpp
@@ -0,0 +1,191 @@
+
+#include <dlfcn.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <cstdlib>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+
+#include "perf.h"
+#include "ccutil/spinlock.h"
+
+static std::atomic_bool initialized{false};      // set by lazy_init() once the real_* pointers are resolved
+static spinlock init_lock;                       // taken by lazy_init() around the one-time initialization
+static spinlock mode_lock;                       // held between the two coz_* entry points below
+static std::atomic<pid_t> thread_using_shim{0};  // tid allowed to use the dummy pool; 0 when there is none
+// Static pool handed out sequentially by dummy_malloc; free() ignores pointers into it.
+static constexpr size_t memory_pool_size = 1000 * alignof(std::max_align_t);
+alignas(std::max_align_t) static char memory_pool[memory_pool_size];
+
+static void lazy_init();
+
+static void* first_malloc(size_t size) {  // initial value of in_use_malloc
+  lazy_init();          // resolves the real allocators and repoints in_use_malloc
+  return malloc(size);  // re-enters the malloc() shim, which dispatches through in_use_malloc
+}
+
+static void* first_calloc(size_t nmemb, size_t size) {  // initial value of in_use_calloc
+  lazy_init();                 // resolves the real allocators and repoints in_use_calloc
+  return calloc(nmemb, size);  // re-enters the calloc() shim, which dispatches through in_use_calloc
+}
+
+static void* (*in_use_malloc)(size_t size) = first_malloc;  // current target of the malloc() shim
+static void* (*in_use_calloc)(size_t nmemb, size_t size) = first_calloc;  // current target of the calloc() shim
+// Real allocator entry points; nullptr until find_real_functions() fills them in.
+static void* (*real_malloc)(size_t size) = nullptr;
+static void (*real_free)(void* ptr) = nullptr;
+static void* (*real_calloc)(size_t nmemb, size_t size) = nullptr;
+
+static void* dummy_malloc(size_t size) {
+  // The dummy pool serves only the thread that requested it, while that
+  // thread resolves real symbols. Every other thread must use real malloc.
+  if (gettid() != thread_using_shim.load()) {
+    // real_malloc can be nullptr only while another thread is still inside
+    // lazy_init(). That is highly unlikely, so a simple busy wait until
+    // initialization completes is sufficient.
+    while (!initialized.load()) {
+      /* busy wait */
+    }
+    return real_malloc(size);
+  }
+
+  // The pool is used only while coz resolves real symbols, so a trivial
+  // sequential allocator is enough. It still has to hand out correctly
+  // aligned, non-overlapping buffers.
+  static char* first_unallocated{memory_pool};
+
+  // Check the request against the remaining space BEFORE moving the pointer:
+  // advancing it past the end of the pool is undefined behaviour, and a huge
+  // size could wrap around and slip past an after-the-fact comparison.
+  const size_t remaining =
+      static_cast<size_t>(memory_pool + memory_pool_size - first_unallocated);
+  if (size > remaining)
+    abort();
+
+  // Round size up to a multiple of alignof(max_align_t) so that the next
+  // allocation stays aligned. `remaining` is itself such a multiple, so the
+  // rounded size still fits and the addition cannot overflow.
+  constexpr size_t alignment = alignof(std::max_align_t);
+  size = (size + alignment - 1) & ~(alignment - 1);
+
+  char* result = first_unallocated;
+  first_unallocated += size;
+  return result;
+}
+
+static void* dummy_calloc(size_t nmemb, size_t size) {
+  // Like calloc, fail rather than under-allocate if nmemb * size overflows.
+  if (size != 0 && nmemb > SIZE_MAX / size) return nullptr;
+  return memset(dummy_malloc(nmemb * size), 0, nmemb * size);
+}
+
+static void set_dummy_allocs_impl() {
+  // Record the owning thread first: any other thread that lands in
+  // dummy_malloc compares its tid against this and uses real_malloc instead.
+  thread_using_shim.store(gettid());
+  in_use_malloc = dummy_malloc;
+  in_use_calloc = dummy_calloc;
+}
+
+static void restore_real_allocs_impl() {
+  in_use_malloc = real_malloc;  // route the malloc() shim back to the real allocator
+  in_use_calloc = real_calloc;  // likewise for the calloc() shim
+  thread_using_shim.store(0);   // no thread owns the dummy pool any more
+}
+
+static void find_real_functions() {
+  // lazy_init() turns dummy allocs on before calling this, so dlsym may allocate.
+  real_malloc = reinterpret_cast<void* (*)(size_t)>(dlsym(RTLD_NEXT, "malloc"));
+  if (!real_malloc) {
+    fprintf(stderr, "Failed to find real malloc!\n");
+    abort();
+  }
+
+  real_free = reinterpret_cast<void (*)(void*)>(dlsym(RTLD_NEXT, "free"));
+  if (!real_free) {
+    fprintf(stderr, "Failed to find real free!\n");
+    abort();
+  }
+
+  real_calloc =
+      reinterpret_cast<void* (*)(size_t, size_t)>(dlsym(RTLD_NEXT, "calloc"));
+  if (!real_calloc) {
+    fprintf(stderr, "Failed to find real calloc!\n");
+    abort();
+  }
+}
+
+static void lazy_init() {
+  // Fast path: skip the lock entirely once initialization has finished.
+  if (initialized.load(std::memory_order_acquire))
+    return;
+
+  init_lock.lock();
+
+  // Re-check under the lock: another thread may have completed the
+  // initialization while we were waiting. The lock must be released on
+  // this path too, otherwise any further waiter would spin forever.
+  if (initialized.load(std::memory_order_relaxed)) {
+    init_lock.unlock();
+    return;
+  }
+
+  // libdl may allocate while we look up the real functions, so serve
+  // those requests from the dummy pool until the lookup is done.
+  set_dummy_allocs_impl();
+  find_real_functions();
+
+  // The real functions are known now; make them the default.
+  restore_real_allocs_impl();
+
+  initialized.store(true, std::memory_order_release);
+  init_lock.unlock();
+}
+
+extern "C" {
+void coz_lock_and_set_dummy_alloc_shims() {
+  lazy_init();
+
+  // Ensure that only one thread resolves real symbols at a time. That in
+  // turn guarantees dummy malloc is called only from the thread that is
+  // resolving a symbol, and only while it does so.
+  // Waiting for this lock should be extremely rare: symbols are usually
+  // resolved from one thread, and each resolve_* function is called only once
+  // during the runtime of the program.
+  mode_lock.lock();
+  set_dummy_allocs_impl();
+}
+
+void coz_restore_real_alloc_shims_and_unlock() {
+  restore_real_allocs_impl();  // switch back to the real allocators first
+  mode_lock.unlock();          // then release the lock taken in coz_lock_and_set_dummy_alloc_shims()
+}
+
+void* malloc(size_t size) {
+  // When the dummy implementation is not needed (most of the program's
+  // runtime), in_use_malloc points at the real implementation, so the shim
+  // adds only one indirect call of overhead.
+  return in_use_malloc(size);
+}
+
+void free(void *ptr) {
+  // Freeing a null pointer is a no-op.
+  if (!ptr)
+    return;
+
+  // Pool memory (handed out by dummy_malloc) is never reclaimed; ignore it.
+  if (ptr >= memory_pool && ptr < memory_pool + memory_pool_size)
+    return;
+
+  // Otherwise use the real free. NOTE(review): it is null before lazy_init().
+  real_free(ptr);
+}
+
+void* calloc(size_t nmemb, size_t size) {
+  return in_use_calloc(nmemb, size);  // same pointer-based dispatch as malloc()
+}
+
+}
\ No newline at end of file
diff --git a/libcoz/alloc_shims.h b/libcoz/alloc_shims.h
new file mode 100644
index 0000000..dd88d1c
--- /dev/null
+++ b/libcoz/alloc_shims.h
@@ -0,0 +1,8 @@
+
+#if !defined(CAUSAL_RUNTIME_ALLOC_SHIMS_H)
+#define CAUSAL_RUNTIME_ALLOC_SHIMS_H
+
+extern "C" void coz_lock_and_set_dummy_alloc_shims() __attribute__((weak));  // weak: callers test for null before calling
+extern "C" void coz_restore_real_alloc_shims_and_unlock() __attribute__((weak));  // weak: callers test for null before calling
+
+#endif
diff --git a/libcoz/real.cpp b/libcoz/real.cpp
index 8e7c063..8360484 100644
--- a/libcoz/real.cpp
+++ b/libcoz/real.cpp
@@ -13,6 +13,8 @@
 #include <stdint.h>
 #include <string.h>
 
+#include "alloc_shims.h"
+
 static bool resolving = false;        //< Set to true while symbol resolution is in progress
 static bool in_dlopen = false;        //< Set to true while dlopen is running
 static void* pthread_handle = NULL;   //< The `dlopen` handle to libpthread
@@ -20,7 +22,11 @@ static void* pthread_handle = NULL;   //< The `dlopen` handle to libpthread
 #define GET_SYMBOL_HANDLE(name, handle) \
   decltype(::name)* real_##name = nullptr; \
   while(!__atomic_exchange_n(&resolving, true, __ATOMIC_ACQ_REL)) {} \
+  if (coz_lock_and_set_dummy_alloc_shims) \
+    coz_lock_and_set_dummy_alloc_shims(); \
   uintptr_t addr = reinterpret_cast<uintptr_t>(dlsym(handle, #name)); \
+  if (coz_restore_real_alloc_shims_and_unlock) \
+    coz_restore_real_alloc_shims_and_unlock(); \
   memcpy(&real_##name, &addr, sizeof(uintptr_t)); \
   if(real_##name) { \
     memcpy(&real::name, &addr, sizeof(uintptr_t)); \