
Merge pull request #133 from sony/feature/20190306-memory-allocator
Improve memory allocator
KazukiYoshiyama-sony authored Mar 6, 2019
2 parents 1fae54f + d845726 commit 719e508
Showing 16 changed files with 208 additions and 222 deletions.
23 changes: 4 additions & 19 deletions include/nbla/cuda/array/cuda_array.hpp
@@ -18,7 +18,6 @@
#include <memory>

#include <nbla/array.hpp>
#include <nbla/cuda/cuda_memory.hpp>
#include <nbla/cuda/defs.hpp>

namespace nbla {
@@ -31,34 +30,24 @@ using std::shared_ptr;
class CudaArray : public Array {
protected:
int device_;
/* Holding CudaMemory until the instance is destroyed to prevent freeing.
*/
shared_ptr<CudaMemory> inuse_memory_;

public:
explicit CudaArray(const Size_t size, dtypes dtype, const Context &ctx);
explicit CudaArray(const Size_t size, dtypes dtype, const Context &ctx,
AllocatorMemory &&mem);
virtual ~CudaArray();
virtual void copy_from(const Array *src_array);
virtual void zero();
virtual void fill(float value);
static Context filter_context(const Context &ctx);

protected:
virtual void allocate();
virtual void deallocate();
};

NBLA_CUDA_API void synchronizer_cuda_array_cpu_array(Array *src, Array *dst);

NBLA_CUDA_API void synchronizer_cpu_array_cuda_array(Array *src, Array *dst);

/** Array allocated on CUDA device with Memory Pool
This is a necessary ingredient for the imperative programming interface of
neural networks (aka define-by-run or dynamic). CUDA memory allocation is
not asynchronous, so allocating a memory region between every function call
leads to thread synchronization that blocks execution of CUDA kernels, making
network execution slow and inefficient.
/** Array allocated on CUDA device with a CudaMemory obtained by
Cuda::caching_allocator().
*/
class CudaCachedArray : public CudaArray {
public:
@@ -71,10 +60,6 @@ class CudaCachedArray : public CudaArray {
explicit CudaCachedArray(const Size_t size, dtypes dtype, const Context &ctx);
virtual ~CudaCachedArray();
static Context filter_context(const Context &ctx);

protected:
virtual void allocate();
virtual void deallocate();
};
}
#endif
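
For illustration, a minimal usage sketch of the cached array after this change (not part of the commit; the backend string and device id below are assumed values):

  #include <nbla/cuda/array/cuda_array.hpp>

  using namespace nbla;

  void example() {
    // Illustrative context; device memory now comes from Cuda::caching_allocator()
    // inside the constructor, so there is no separate allocate()/deallocate() step.
    Context ctx({"cuda:float"}, "CudaCachedArray", "0");
    CudaCachedArray arr(256, dtypes::FLOAT, ctx); // 256 elements of float
    arr.zero();                                   // memset on the pooled device buffer
  }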
7 changes: 5 additions & 2 deletions include/nbla/cuda/common.hpp
@@ -47,13 +47,16 @@ using std::map;

/**
Check CUDA error for a synchronous call.
cudaGetLastError() is called to clear the error state left by a failure at
"condition".
*/
#define NBLA_CUDA_CHECK(condition) \
{ \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
NBLA_ERROR(error_code::target_specific, "(%s) failed with \"%s\".", \
#condition, cudaGetErrorString(error)); \
cudaGetLastError(); \
NBLA_ERROR(error_code::target_specific, "(%s) failed with \"%s\" (%s).", \
#condition, cudaGetErrorString(error), \
cudaGetErrorName(error)); \
} \
}

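A short usage sketch of the macro at an assumed call site (not from this commit):

  float *buf = nullptr;
  NBLA_CUDA_CHECK(cudaMalloc(&buf, 1024 * sizeof(float)));
  NBLA_CUDA_CHECK(cudaMemset(buf, 0, 1024 * sizeof(float)));
  NBLA_CUDA_CHECK(cudaFree(buf));
  // On failure the macro now clears the sticky error state with cudaGetLastError()
  // before throwing, and the message reports both cudaGetErrorString and
  // cudaGetErrorName.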
36 changes: 15 additions & 21 deletions include/nbla/cuda/cuda.hpp
@@ -18,11 +18,11 @@
#define __NBLA_CUDA_CUDA_HPP__

#include <nbla/cuda/common.hpp>
#include <nbla/cuda/cuda_memory.hpp>
#include <nbla/cuda/defs.hpp>
#include <nbla/cuda/init.hpp>
#include <nbla/cuda/memory/cuda_memory.hpp>
#include <nbla/exception.hpp>
#include <nbla/memory.hpp>
#include <nbla/memory/allocator.hpp>
#include <nbla/singleton_manager.hpp>

#include <mutex>
@@ -66,38 +66,32 @@ class NBLA_CUDA_API Cuda {
*/
void register_array_class(const string &name);

/** Get a CudaMemoryCache instance.
/** Get a caching allocator.
*/
MemoryCache<CudaMemory> &memcache();

/** Get workspace memory.
It returns nullptr if size_in_bytes is 0.
@param[in] size_in_bytes Size of CUDA device memory requested.
@param[in] device GPU ID.
@note It internally holds workspace memory with maximum size over
sizes previously requested. Every time the requested size exceeds
the maximum size, it will reallocate a new memory region, which
will cause memory allocation overhead and device synchronization.
shared_ptr<Allocator> caching_allocator();

/** Get a no-cache allocator.
*/
void *get_workspace(Size_t size_in_bytes, int device);
shared_ptr<Allocator> naive_allocator();

protected:
std::mutex mtx_cublas_;
std::mutex mtx_curand_;
std::mutex mtx_workspace_;
std::mutex mtx_event_;
unordered_map<int, cublasHandle_t>
cublas_handles_; ///< cuBLAS handles for each device.
unordered_map<int, curandGenerator_t> curand_generators_;
unordered_map<int, unordered_map<unsigned int, vector<cudaEvent_t>>>
cuda_unused_events_;
vector<string> array_classes_; ///< Available array classes
MemoryCache<CudaMemory> memcache_; ///< CUDA memory cache.
unordered_map<int, shared_ptr<CudaMemory>> workspace_; ///< Workspace memory.
vector<string> array_classes_; ///< Available array classes

/*
NOTE: Allocators must be retained as shared_ptr in order to be passed to a
CachedMemory instance to prevent destroying allocators before destroying
memory.
*/
shared_ptr<Allocator> naive_allocator_;
shared_ptr<Allocator> caching_allocator_;

private:
friend SingletonManager;
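
The two accessors replace the old MemoryCache/get_workspace interface. A hedged sketch of how device memory is obtained through them; that AllocatorMemory is movable and exposes a pointer() accessor is an assumption about the nbla allocator API, not something shown in this diff:

  auto cuda = SingletonManager::get<Cuda>();
  // Pooled allocation: cheap for repeated alloc/free in define-by-run execution.
  AllocatorMemory mem = cuda->caching_allocator()->alloc(1024, /*device_id=*/"0");
  void *device_ptr = mem.pointer(); // assumption: pointer() returns the device pointer
  // The block goes back to the pool when `mem` is destroyed.
  // A non-cached allocation that maps directly onto cudaMalloc/cudaFree:
  AllocatorMemory raw = cuda->naive_allocator()->alloc(1024, "0");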
2 changes: 1 addition & 1 deletion include/nbla/cuda/half.hpp
@@ -74,7 +74,7 @@ struct NBLA_ALIGN(2) HalfCuda {
return *this;
}
#if NBLA_CUDA_HALF
HALF_CUDA_PREFIX const unsigned short &as_bits() const {
HALF_CUDA_PREFIX unsigned short as_bits() const {
#if CUDA_VERSION >= 9000
return ((__half_raw)h).x;
#else
include/nbla/cuda/memory/cuda_memory.hpp
@@ -22,23 +22,32 @@
#include <nbla/array.hpp>
#include <nbla/common.hpp>
#include <nbla/cuda/defs.hpp>
#include <nbla/memory.hpp>
#include <nbla/memory/memory.hpp>

namespace nbla {

using std::vector;
using std::shared_ptr;
/** CUDA memory implementation.
/** CUDA Memory
*/
A CUDA device memory block allocated by the cudaMalloc function is managed by
this class.
The device passed to the constructor is a device id given as a string such as
"0" or "1".
\ingroup MemoryImplGrp
*/
class NBLA_CUDA_API CudaMemory : public Memory {
protected:
private:
CudaMemory(size_t bytes, const string &device, void *ptr);
int device_num_;

public:
CudaMemory(Size_t bytes, const string &device);
virtual bool allocate();
virtual ~CudaMemory();
CudaMemory(size_t bytes, const string &device);
~CudaMemory();
bool alloc_impl() override;
shared_ptr<Memory> divide_impl(size_t second_start) override;
void merge_next_impl(Memory *from) override;
void merge_prev_impl(Memory *from) override;
};
}
#endif
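
The class now implements the nbla::Memory hooks (alloc_impl, divide_impl, merge_next_impl/merge_prev_impl) instead of a public allocate(). An illustrative sketch of what alloc_impl could look like under this interface; the ptr_ and bytes_ members are assumptions about the nbla::Memory base class, and the real implementation (in src/nbla/cuda/memory/cuda_memory.cpp) is not shown above:

  bool CudaMemory::alloc_impl() {
    cuda_set_device(device_num_);
    try {
      // ptr_ and bytes_ are assumed protected members of nbla::Memory.
      NBLA_CUDA_CHECK(cudaMalloc(&ptr_, bytes_));
    } catch (...) {
      // Returning false lets a caching allocator free cached blocks and retry
      // instead of immediately propagating an out-of-memory error.
      return false;
    }
    return true;
  }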
7 changes: 6 additions & 1 deletion src/nbla/cuda/CMakeLists.txt
@@ -81,7 +81,12 @@ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${ARCH_FLAGS}")
# To prevent user confusion, the library filename changes depending on whether cuDNN is included or not.
set(NBLA_CUDA_LIBRARY_NAME nnabla_cuda)

file(GLOB CPP_SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ./*.cpp array/*.cpp cudnn/*.cpp utils/*.cpp)
file(GLOB CPP_SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
./*.cpp
memory/*.cpp
array/*.cpp
cudnn/*.cpp
utils/*.cpp)

if(MSVC)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler /W0")
52 changes: 13 additions & 39 deletions src/nbla/cuda/array/cuda_array.cpp
@@ -32,30 +32,17 @@ using std::make_shared;

// CudaArray
CudaArray::CudaArray(const Size_t size, dtypes dtype, const Context &ctx)
: Array(size, dtype, ctx), device_(std::stoi(ctx.device_id)),
inuse_memory_(nullptr) {}
: Array(size, dtype, ctx,
SingletonManager::get<Cuda>()->naive_allocator()->alloc(
Array::size_as_bytes(size, dtype), ctx.device_id)),
device_(std::stoi(ctx.device_id)) {}

CudaArray::~CudaArray() {
if (this->object_) {
this->deallocate();
}
}
CudaArray::CudaArray(const Size_t size, dtypes dtype, const Context &ctx,
AllocatorMemory &&mem)
: Array::Array(size, dtype, ctx, std::move(mem)),
device_(std::stoi(ctx.device_id)) {}

void CudaArray::allocate() {
#ifdef NBLA_VERBOSE_MEMORY_USAGE
printf("CudaArray is created with size of %d\n",
(int)(this->size_ * sizeof(this->dtype_)));
#endif
int msize = this->size_ * sizeof_dtype(this->dtype_);
inuse_memory_ = make_shared<CudaMemory>(msize, this->ctx_.device_id);
inuse_memory_->allocate();
this->object_ = inuse_memory_->ptr();
}

void CudaArray::deallocate() {
inuse_memory_ = nullptr;
this->object_ = nullptr;
}
CudaArray::~CudaArray() {}

void CudaArray::zero() {
cuda_set_device(device_);
@@ -106,25 +93,12 @@ void synchronizer_cpu_array_cuda_array(Array *src, Array *dst) {
/////////////////////////////////
CudaCachedArray::CudaCachedArray(const Size_t size, dtypes dtype,
const Context &ctx)
: CudaArray(size, dtype, ctx) {}
: CudaArray(size, dtype, ctx,
SingletonManager::get<Cuda>()->caching_allocator()->alloc(
Array::size_as_bytes(size, dtype), ctx.device_id)) {}

CudaCachedArray::~CudaCachedArray() { this->deallocate(); }
CudaCachedArray::~CudaCachedArray() {}

void CudaCachedArray::allocate() {
deallocate();
int bytes = this->size_ * sizeof_dtype(this->dtype_);
auto mem = SingletonManager::get<Cuda>()->memcache().pop_or_create(
bytes, this->ctx_.device_id);
this->object_ = mem->ptr();
this->inuse_memory_ = mem;
}

void CudaCachedArray::deallocate() {
if (this->inuse_memory_) {
SingletonManager::get<Cuda>()->memcache().cache(this->inuse_memory_);
this->inuse_memory_ = nullptr;
}
}
Context CudaCachedArray::filter_context(const Context &ctx) {
return Context({}, "CudaCachedArray", ctx.device_id);
}
31 changes: 11 additions & 20 deletions src/nbla/cuda/cuda.cpp
@@ -16,9 +16,9 @@
#include <nbla/cuda/utils/random.hpp>
#include <nbla/singleton_manager-internal.hpp>

#include <nbla/cuda/memory/cuda_memory.hpp>

#include <nbla/memory/caching_allocator_with_buckets.hpp>
#include <nbla/memory/naive_allocator.hpp>

namespace nbla {

Cuda::Cuda() {}
Cuda::Cuda()
: naive_allocator_(make_shared<NaiveAllocator<CudaMemory>>()),
caching_allocator_(
make_shared<CachingAllocatorWithBuckets<CudaMemory>>()) {}

Cuda::~Cuda() {
for (auto handle : this->cublas_handles_) {
@@ -144,25 +152,8 @@ void Cuda::register_array_class(const string &name) {
array_classes_.push_back(name);
}

MemoryCache<CudaMemory> &Cuda::memcache() { return memcache_; }

void *Cuda::get_workspace(Size_t size_in_bytes, int device) {
if (size_in_bytes == 0) {
return nullptr;
}
std::lock_guard<decltype(mtx_workspace_)> lock(mtx_workspace_);
auto it = workspace_.find(device);
if (it == workspace_.end()) {
workspace_[device] =
make_shared<CudaMemory>(size_in_bytes, std::to_string(device));
} else if (it->second->size() < size_in_bytes) {
workspace_.erase(it);
workspace_[device] =
make_shared<CudaMemory>(size_in_bytes, std::to_string(device));
}
it = workspace_.find(device);
return it->second->ptr();
}
shared_ptr<Allocator> Cuda::caching_allocator() { return caching_allocator_; }
shared_ptr<Allocator> Cuda::naive_allocator() { return naive_allocator_; }

NBLA_INSTANTIATE_SINGLETON(NBLA_CUDA_API, Cuda);
}
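
With Cuda::get_workspace() removed, temporary workspace memory can be requested per call from the pooled allocator. A hedged sketch (again assuming AllocatorMemory::pointer()):

  void run_with_workspace(size_t bytes, const std::string &device_id) {
    auto mem =
        SingletonManager::get<Cuda>()->caching_allocator()->alloc(bytes, device_id);
    void *workspace = mem.pointer();
    // ... launch kernels that use `workspace` ...
  } // the block returns to the pool here, so repeated calls stay cheap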