
Merge pull request #133 from sony/feature/20190306-memory-allocator
Improve memory allocator
KazukiYoshiyama-sony authored Mar 6, 2019
2 parents 1fae54f + d845726 commit 719e508
Showing 16 changed files with 208 additions and 222 deletions.
23 changes: 4 additions & 19 deletions include/nbla/cuda/array/cuda_array.hpp
@@ -18,7 +18,6 @@
#include <memory>

#include <nbla/array.hpp>
#include <nbla/cuda/cuda_memory.hpp>
#include <nbla/cuda/defs.hpp>

namespace nbla {
@@ -31,34 +30,24 @@ using std::shared_ptr;
class CudaArray : public Array {
protected:
int device_;
/* Holding CudaMemory until the instance is destroyed to prevent freeing.
*/
shared_ptr<CudaMemory> inuse_memory_;

public:
explicit CudaArray(const Size_t size, dtypes dtype, const Context &ctx);
explicit CudaArray(const Size_t size, dtypes dtype, const Context &ctx,
AllocatorMemory &&mem);
virtual ~CudaArray();
virtual void copy_from(const Array *src_array);
virtual void zero();
virtual void fill(float value);
static Context filter_context(const Context &ctx);

protected:
virtual void allocate();
virtual void deallocate();
};

NBLA_CUDA_API void synchronizer_cuda_array_cpu_array(Array *src, Array *dst);

NBLA_CUDA_API void synchronizer_cpu_array_cuda_array(Array *src, Array *dst);

/** Array allocated on CUDA device with Memory Pool
This is a necessary ingredient for the imperative programming interface of
neural networks (aka define-by-run or dynamic). CUDA memory allocation is
not asynchronous, so allocating a memory region between every function call
leads to thread synchronization that blocks execution of CUDA kernels, making
network execution slow and inefficient.
/** Array allocated on CUDA device with a CudaMemory obtained by
Cuda::caching_allocator().
*/
class CudaCachedArray : public CudaArray {
public:
@@ -71,10 +60,6 @@ class CudaCachedArray : public CudaArray {
explicit CudaCachedArray(const Size_t size, dtypes dtype, const Context &ctx);
virtual ~CudaCachedArray();
static Context filter_context(const Context &ctx);

protected:
virtual void allocate();
virtual void deallocate();
};
}
#endif
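
For illustration, a minimal usage sketch of the cached array after this change (not part of the commit; the backend string and device id below are assumed values):

  #include <nbla/cuda/array/cuda_array.hpp>

  using namespace nbla;

  void example() {
    // Illustrative context; device memory now comes from Cuda::caching_allocator()
    // inside the constructor, so there is no separate allocate()/deallocate() step.
    Context ctx({"cuda:float"}, "CudaCachedArray", "0");
    CudaCachedArray arr(256, dtypes::FLOAT, ctx); // 256 elements of float
    arr.zero();                                   // memset on the pooled device buffer
  }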
7 changes: 5 additions & 2 deletions include/nbla/cuda/common.hpp
@@ -47,13 +47,16 @@ using std::map;

/**
Check CUDA error for a synchronous call.
cudaGetLastError() is called to clear the error state left by a failure at
"condition".
*/
#define NBLA_CUDA_CHECK(condition) \
{ \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
NBLA_ERROR(error_code::target_specific, "(%s) failed with \"%s\".", \
#condition, cudaGetErrorString(error)); \
cudaGetLastError(); \
NBLA_ERROR(error_code::target_specific, "(%s) failed with \"%s\" (%s).", \
#condition, cudaGetErrorString(error), \
cudaGetErrorName(error)); \
} \
}

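A short usage sketch of the macro at an assumed call site (not from this commit):

  float *buf = nullptr;
  NBLA_CUDA_CHECK(cudaMalloc(&buf, 1024 * sizeof(float)));
  NBLA_CUDA_CHECK(cudaMemset(buf, 0, 1024 * sizeof(float)));
  NBLA_CUDA_CHECK(cudaFree(buf));
  // On failure the macro now clears the sticky error state with cudaGetLastError()
  // before throwing, and the message reports both cudaGetErrorString and
  // cudaGetErrorName.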
36 changes: 15 additions & 21 deletions include/nbla/cuda/cuda.hpp
@@ -18,11 +18,11 @@
#define __NBLA_CUDA_CUDA_HPP__

#include <nbla/cuda/common.hpp>
#include <nbla/cuda/cuda_memory.hpp>
#include <nbla/cuda/defs.hpp>
#include <nbla/cuda/init.hpp>
#include <nbla/cuda/memory/cuda_memory.hpp>
#include <nbla/exception.hpp>
#include <nbla/memory.hpp>
#include <nbla/memory/allocator.hpp>
#include <nbla/singleton_manager.hpp>

#include <mutex>
@@ -66,38 +66,32 @@ class NBLA_CUDA_API Cuda {
*/
void register_array_class(const string &name);

/** Get a CudaMemoryCache instance.
/** Get a caching allocator.
*/
MemoryCache<CudaMemory> &memcache();

/** Get workspace memory.
It returns nullptr if size_in_bytes is 0.
@param[in] size_in_bytes Size of CUDA device memory requested.
@param[in] device GPU ID.
@note It internally holds workspace memory with maximum size over
sizes previously requested. Every time the requested size exceeds
the maximum size, it will reallocate a new memory region, which
will cause memory allocation overhead and device synchronization.
shared_ptr<Allocator> caching_allocator();

/** Get a no-cache allocator.
*/
void *get_workspace(Size_t size_in_bytes, int device);
shared_ptr<Allocator> naive_allocator();

protected:
std::mutex mtx_cublas_;
std::mutex mtx_curand_;
std::mutex mtx_workspace_;
std::mutex mtx_event_;
unordered_map<int, cublasHandle_t>
cublas_handles_; ///< cuBLAS handles for each device.
unordered_map<int, curandGenerator_t> curand_generators_;
unordered_map<int, unordered_map<unsigned int, vector<cudaEvent_t>>>
cuda_unused_events_;
vector<string> array_classes_; ///< Available array classes
MemoryCache<CudaMemory> memcache_; ///< CUDA memory cache.
unordered_map<int, shared_ptr<CudaMemory>> workspace_; ///< Workspace memory.
vector<string> array_classes_; ///< Available array classes

/*
NOTE: Allocators must be retained as shared_ptr in order to be passed to a
CachedMemory instance to prevent destroying allocators before destroying
memory.
*/
shared_ptr<Allocator> naive_allocator_;
shared_ptr<Allocator> caching_allocator_;

private:
friend SingletonManager;
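
The two accessors replace the old MemoryCache/get_workspace interface. A hedged sketch of how device memory is obtained through them; that AllocatorMemory is movable and exposes a pointer() accessor is an assumption about the nbla allocator API, not something shown in this diff:

  auto cuda = SingletonManager::get<Cuda>();
  // Pooled allocation: cheap for repeated alloc/free in define-by-run execution.
  AllocatorMemory mem = cuda->caching_allocator()->alloc(1024, /*device_id=*/"0");
  void *device_ptr = mem.pointer(); // assumption: pointer() returns the device pointer
  // The block goes back to the pool when `mem` is destroyed.
  // A non-cached allocation that maps directly onto cudaMalloc/cudaFree:
  AllocatorMemory raw = cuda->naive_allocator()->alloc(1024, "0");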
2 changes: 1 addition & 1 deletion include/nbla/cuda/half.hpp
@@ -74,7 +74,7 @@ struct NBLA_ALIGN(2) HalfCuda {
return *this;
}
#if NBLA_CUDA_HALF
HALF_CUDA_PREFIX const unsigned short &as_bits() const {
HALF_CUDA_PREFIX unsigned short as_bits() const {
#if CUDA_VERSION >= 9000
return ((__half_raw)h).x;
#else
include/nbla/cuda/memory/cuda_memory.hpp
@@ -22,23 +22,32 @@
#include <nbla/array.hpp>
#include <nbla/common.hpp>
#include <nbla/cuda/defs.hpp>
#include <nbla/memory.hpp>
#include <nbla/memory/memory.hpp>

namespace nbla {

using std::vector;
using std::shared_ptr;
/** CUDA memory implementation.
/** CUDA Memory
*/
A CUDA device memory block allocated by the cudaMalloc function is managed by
this class.
The device passed to the constructor is a device id given as a string such as
"0" or "1".
\ingroup MemoryImplGrp
*/
class NBLA_CUDA_API CudaMemory : public Memory {
protected:
private:
CudaMemory(size_t bytes, const string &device, void *ptr);
int device_num_;

public:
CudaMemory(Size_t bytes, const string &device);
virtual bool allocate();
virtual ~CudaMemory();
CudaMemory(size_t bytes, const string &device);
~CudaMemory();
bool alloc_impl() override;
shared_ptr<Memory> divide_impl(size_t second_start) override;
void merge_next_impl(Memory *from) override;
void merge_prev_impl(Memory *from) override;
};
}
#endif
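
The class now implements the nbla::Memory hooks (alloc_impl, divide_impl, merge_next_impl/merge_prev_impl) instead of a public allocate(). An illustrative sketch of what alloc_impl could look like under this interface; the ptr_ and bytes_ members are assumptions about the nbla::Memory base class, and the real implementation (in src/nbla/cuda/memory/cuda_memory.cpp) is not shown above:

  bool CudaMemory::alloc_impl() {
    cuda_set_device(device_num_);
    try {
      // ptr_ and bytes_ are assumed protected members of nbla::Memory.
      NBLA_CUDA_CHECK(cudaMalloc(&ptr_, bytes_));
    } catch (...) {
      // Returning false lets a caching allocator free cached blocks and retry
      // instead of immediately propagating an out-of-memory error.
      return false;
    }
    return true;
  }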
7 changes: 6 additions & 1 deletion src/nbla/cuda/CMakeLists.txt
@@ -81,7 +81,12 @@ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${ARCH_FLAGS}")
# To prevent user confusion, the library filename changes depending on whether cuDNN is included or not.
set(NBLA_CUDA_LIBRARY_NAME nnabla_cuda)

file(GLOB CPP_SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ./*.cpp array/*.cpp cudnn/*.cpp utils/*.cpp)
file(GLOB CPP_SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
./*.cpp
memory/*.cpp
array/*.cpp
cudnn/*.cpp
utils/*.cpp)

if(MSVC)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler /W0")
52 changes: 13 additions & 39 deletions src/nbla/cuda/array/cuda_array.cpp
@@ -32,30 +32,17 @@ using std::make_shared;

// CudaArray
CudaArray::CudaArray(const Size_t size, dtypes dtype, const Context &ctx)
: Array(size, dtype, ctx), device_(std::stoi(ctx.device_id)),
inuse_memory_(nullptr) {}
: Array(size, dtype, ctx,
SingletonManager::get<Cuda>()->naive_allocator()->alloc(
Array::size_as_bytes(size, dtype), ctx.device_id)),
device_(std::stoi(ctx.device_id)) {}

CudaArray::~CudaArray() {
if (this->object_) {
this->deallocate();
}
}
CudaArray::CudaArray(const Size_t size, dtypes dtype, const Context &ctx,
AllocatorMemory &&mem)
: Array::Array(size, dtype, ctx, std::move(mem)),
device_(std::stoi(ctx.device_id)) {}

void CudaArray::allocate() {
#ifdef NBLA_VERBOSE_MEMORY_USAGE
printf("CudaArray is created with size of %d\n",
(int)(this->size_ * sizeof(this->dtype_)));
#endif
int msize = this->size_ * sizeof_dtype(this->dtype_);
inuse_memory_ = make_shared<CudaMemory>(msize, this->ctx_.device_id);
inuse_memory_->allocate();
this->object_ = inuse_memory_->ptr();
}

void CudaArray::deallocate() {
inuse_memory_ = nullptr;
this->object_ = nullptr;
}
CudaArray::~CudaArray() {}

void CudaArray::zero() {
cuda_set_device(device_);
@@ -106,25 +93,12 @@ void synchronizer_cpu_array_cuda_array(Array *src, Array *dst) {
/////////////////////////////////
CudaCachedArray::CudaCachedArray(const Size_t size, dtypes dtype,
const Context &ctx)
: CudaArray(size, dtype, ctx) {}
: CudaArray(size, dtype, ctx,
SingletonManager::get<Cuda>()->caching_allocator()->alloc(
Array::size_as_bytes(size, dtype), ctx.device_id)) {}

CudaCachedArray::~CudaCachedArray() { this->deallocate(); }
CudaCachedArray::~CudaCachedArray() {}

void CudaCachedArray::allocate() {
deallocate();
int bytes = this->size_ * sizeof_dtype(this->dtype_);
auto mem = SingletonManager::get<Cuda>()->memcache().pop_or_create(
bytes, this->ctx_.device_id);
this->object_ = mem->ptr();
this->inuse_memory_ = mem;
}

void CudaCachedArray::deallocate() {
if (this->inuse_memory_) {
SingletonManager::get<Cuda>()->memcache().cache(this->inuse_memory_);
this->inuse_memory_ = nullptr;
}
}
Context CudaCachedArray::filter_context(const Context &ctx) {
return Context({}, "CudaCachedArray", ctx.device_id);
}
31 changes: 11 additions & 20 deletions src/nbla/cuda/cuda.cpp
@@ -16,9 +16,9 @@
#include <nbla/cuda/utils/random.hpp>
#include <nbla/singleton_manager-internal.hpp>

#include <nbla/cuda/memory/cuda_memory.hpp>

#include <nbla/memory/caching_allocator_with_buckets.hpp>
#include <nbla/memory/naive_allocator.hpp>

namespace nbla {

Cuda::Cuda() {}
Cuda::Cuda()
: naive_allocator_(make_shared<NaiveAllocator<CudaMemory>>()),
caching_allocator_(
make_shared<CachingAllocatorWithBuckets<CudaMemory>>()) {}

Cuda::~Cuda() {
for (auto handle : this->cublas_handles_) {
@@ -144,25 +152,8 @@ void Cuda::register_array_class(const string &name) {
array_classes_.push_back(name);
}

MemoryCache<CudaMemory> &Cuda::memcache() { return memcache_; }

void *Cuda::get_workspace(Size_t size_in_bytes, int device) {
if (size_in_bytes == 0) {
return nullptr;
}
std::lock_guard<decltype(mtx_workspace_)> lock(mtx_workspace_);
auto it = workspace_.find(device);
if (it == workspace_.end()) {
workspace_[device] =
make_shared<CudaMemory>(size_in_bytes, std::to_string(device));
} else if (it->second->size() < size_in_bytes) {
workspace_.erase(it);
workspace_[device] =
make_shared<CudaMemory>(size_in_bytes, std::to_string(device));
}
it = workspace_.find(device);
return it->second->ptr();
}
shared_ptr<Allocator> Cuda::caching_allocator() { return caching_allocator_; }
shared_ptr<Allocator> Cuda::naive_allocator() { return naive_allocator_; }

NBLA_INSTANTIATE_SINGLETON(NBLA_CUDA_API, Cuda);
}
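
With Cuda::get_workspace() removed, temporary workspace memory can be requested per call from the pooled allocator. A hedged sketch (again assuming AllocatorMemory::pointer()):

  void run_with_workspace(size_t bytes, const std::string &device_id) {
    auto mem =
        SingletonManager::get<Cuda>()->caching_allocator()->alloc(bytes, device_id);
    void *workspace = mem.pointer();
    // ... launch kernels that use `workspace` ...
  } // the block returns to the pool here, so repeated calls stay cheap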