TNN's scheduling on Mali, CL & GL interop, and the sub-group magic number #57
arm mali: a special scheduling strategy for buffers
There is a question on Zhihu about why TNN performs so well on Mali; the answerer and another user discussed it in depth:
Below is an analysis of TNN's scheduling strategy on Arm Mali. The answer notes that:
Any blocking command implicitly performs a flush on the command queue it was enqueued on.
The above is a bit convoluted, so to summarize: if there are two command queues with an execution dependency between them, i.e., command queue B depends on the execution of command queue A, and an Event from queue A is to act as a wait condition for B, then clFlush (or a blocking command, which flushes implicitly) must be issued on queue A to guarantee that A's enqueued tasks actually reach the launched (start) state.
The Arm official documentation on flushing:
- Avoid application processor and GPU interactions in the middle of processing: enqueue all the kernels first, and call clFinish() at the end if possible.
- Avoid blocking calls in the submission thread: avoid clFinish(), clWaitForEvents(), or any other blocking calls in the submission thread.
- Batch kernel submission: from version r17p0 onwards, the OpenCL driver batches kernels that are flushed together for submission to the hardware. Batching kernels can significantly reduce the runtime overheads and cache maintenance costs. For example, this reduction is useful when the application is accessing multiple sub-buffers created from a buffer imported using clImportMemoryARM in separate kernels.
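To make the two-queue case concrete, here is a minimal sketch (queue_a, queue_b, kernel_a, kernel_b, global and local are hypothetical names, not TNN code) of queue B waiting on an event from queue A; without the flush on queue A, the wait may never be satisfied on some implementations:
// Sketch: cross-queue dependency via an event. queue_a and queue_b share one cl::Context.
cl::Event event_a;
queue_a.enqueueNDRangeKernel(kernel_a, cl::NullRange, global, local, nullptr, &event_a);
queue_a.flush();  // make sure the command on queue A is actually submitted (launched)
// queue B lists event_a in its wait list; this is only safe after queue A was flushed
std::vector<cl::Event> wait_list = {event_a};
queue_b.enqueueNDRangeKernel(kernel_b, cl::NullRange, global, local, &wait_list, nullptr);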
Reference: "Wondering when I should use clFlush or clFinish" — https://community.khronos.org/t/wondering-when-i-should-use-clflush-or-clfinish/3157
tnn/device/opencl/acc/opencl_layer_acc.cc
https://github.com/Tencent/TNN/blob/4b9ffbecc22f5ea4ba6bc4fdacff85475a59d08d/source/tnn/device/opencl/acc/opencl_layer_acc.cc#L160
Status OpenCLLayerAcc::Forward(const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs) {
    Status ret = TNN_OK;
    int unit_idx = 0;
    for (auto execute_unit : execute_units_) {
        ret = RunKernel(execute_unit.ocl_kernel, execute_unit.global_work_size, execute_unit.local_work_size,
                        ocl_context_->CommandQueue(), op_name_);
        CHECK_TNN_OK(ret)
        unit_idx++;
    }
    if (NeedFlush()) {
        ocl_context_->CommandQueue()->flush();
    }
    return TNN_OK;
}
bool OpenCLLayerAcc::NeedFlush() {
    // flush once every 10 enqueued kernels (magic number)
    if (0 == ocl_context_->AddAndGetFlushCount() % 10) {
        return true;
    }
    return false;
}

OpenCLContext
// https://github.com/Tencent/TNN/blob/a315d2acfb327014721b308359a6d534470289ba/source/tnn/device/opencl/opencl_context.cc
// opencl kernel flush strategy: some devices (especially Huawei devices) have serious latency.
unsigned int OpenCLContext::AddAndGetFlushCount() {
    flush_count_++;
    return flush_count_;
}
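Taken together, the effect is a periodic flush: roughly one clFlush per ten enqueued kernels, which lets the r17p0+ Mali driver batch kernels while bounding the latency seen on the problematic (e.g., Huawei) devices. The same pattern in isolation, as a sketch (queue, units and kFlushPeriod are placeholder names, not TNN code):
// Periodic-flush pattern: keep enqueueing kernels, but flush every N of them so
// the driver can start work early; block only once at the very end.
constexpr unsigned kFlushPeriod = 10;  // TNN's magic number
unsigned flush_count = 0;
for (const auto &unit : units) {
    queue.enqueueNDRangeKernel(unit.kernel, cl::NullRange, unit.global, unit.local);
    if (++flush_count % kFlushPeriod == 0) {
        queue.flush();  // submit the batch accumulated so far
    }
}
queue.finish();  // single blocking call at the end, as the Arm docs recommend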
// https://github.com/Tencent/TNN/blob/a315d2acfb327014721b308359a6d534470289ba/source/tnn/device/opencl/opencl_context.h#L88
class OpenCLContext : public Context {
public:
    OpenCLContext();
    ~OpenCLContext();

    // @brief get tnn command queue
    // @param command_queue device command queue for forward
    Status GetCommandQueue(void **command_queue) override;

    // @brief share tnn command queue to another context
    Status ShareCommandQueue(Context *context) override;

    /**
     * @brief get CommandQueue
     */
    cl::CommandQueue *CommandQueue();

    cl::CommandQueue *TuneCommandQueue();

    // load library
    virtual Status LoadLibrary(std::vector<std::string> path) override;

    /**
     * @brief before instance forward
     * @param instance instance
     */
    virtual Status OnInstanceForwardBegin() override;

    /**
     * @brief after instance forward
     * @param instance instance
     */
    virtual Status OnInstanceForwardEnd() override;

    // @brief before instance Reshape
    virtual Status OnInstanceReshapeBegin() override;

    // @brief after instance Reshape
    virtual Status OnInstanceReshapeEnd() override;

    // @brief wait for jobs in the current context to complete
    virtual Status Synchronize() override;

    // @brief increment flush_count_ and return the new value
    unsigned int AddAndGetFlushCount();

    std::map<std::string, std::vector<uint32_t>> &GetLocalSizeTuneMap();

    Status StoreLocalSizeTuneMap();

public:
    /**
     * @brief initialize opencl env
     */
    Status Init();

private:
    std::shared_ptr<cl::CommandQueue> command_queue_ = nullptr;
    std::shared_ptr<cl::CommandQueue> tune_command_queue_ = nullptr;
    std::shared_ptr<cl::CommandQueue> GetCommandQueue();
    OpenCLRuntime *opencl_runtime_ = nullptr;
    unsigned int flush_count_ = 0;
    cl_command_queue_properties properties_ = 0;
    bool ReadStatusCheck(std::ifstream &is);
    std::map<std::string, std::vector<uint32_t>> local_size_tune_map_;
    uint32_t tune_map_size_;
    static std::mutex s_mutex_;
};
magic number for workgroup
// https://github.com/Tencent/TNN/blob/aedc6c849e711a6386a8d2cd4ebb0bc94c7b9285/source/tnn/device/opencl/opencl_runtime.cc#L341
// magic number
static std::map<int, int> AdrenoSubGroup{
    {640, 128}, {630, 128}, {616, 128}, {612, 64}, {610, 64}, {540, 32}, {530, 32},
    {512, 32},  {510, 32},  {509, 32},  {506, 32}, {505, 32}, {405, 32}, {330, 16},
};

// With OpenCL 2.x (sub-groups extension) the sub-group size can be queried from
// the API; otherwise fall back to the per-model magic-number table above.
uint32_t OpenCLRuntime::GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range) {
    uint32_t sub_group_size = 0;
    if (ADRENO == gpu_info_.type) {
#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_TARGET_OPENCL_VERSION >= 210 && defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
        cl_int cl_ret;
        sub_group_size = kernel.getSubGroupInfo<CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE>(*device_, range, &cl_ret);
        if (cl_ret != CL_SUCCESS) {
            CHECK_CL_SUCCESS(cl_ret)
            sub_group_size = 0;
        }
#else
        if (AdrenoSubGroup.find(gpu_info_.model_num) != AdrenoSubGroup.end()) {
            sub_group_size = AdrenoSubGroup[gpu_info_.model_num];
        }
#endif
    }
    return sub_group_size;
}
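One common use of the queried (or table-derived) sub-group size is to align a local work-group dimension to it, so no sub-group is launched partially filled. A hypothetical helper in that spirit (AlignToSubGroup is not TNN code):
// Round a local work-group dimension up to a multiple of the sub-group size.
uint32_t AlignToSubGroup(uint32_t local_dim, uint32_t sub_group_size) {
    if (sub_group_size == 0) {
        return local_dim;  // sub-group size unknown: leave the dimension as-is
    }
    return ((local_dim + sub_group_size - 1) / sub_group_size) * sub_group_size;
}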
CL & GL interop
Not enabled in the default build; it has to be turned on via CMake (the SHARING_MEM_WITH_OPENGL define used below).
// https://github.com/Tencent/TNN/blob/aedc6c849e711a6386a8d2cd4ebb0bc94c7b9285/source/tnn/device/opencl/opencl_runtime.cc#L341
#ifdef SHARING_MEM_WITH_OPENGL
#include <EGL/egl.h>
#endif
//Init will get platforms info, get devices info, create opencl context.
Status OpenCLRuntime::Init() {
    // ....
#if defined(SHARING_MEM_WITH_OPENGL) && (CL_HPP_TARGET_OPENCL_VERSION >= 120)
    // create context from glcontext
    LOGI("Create special opencl context to share with OpenGL\n");
    LOGI("eglGetCurrentContext(): 0x%x\n", eglGetCurrentContext());
    cl_context_properties context_prop[] = {CL_GL_CONTEXT_KHR, (cl_context_properties)eglGetCurrentContext(),
                                            CL_EGL_DISPLAY_KHR, (cl_context_properties)eglGetCurrentDisplay(), 0};
    context_ = std::shared_ptr<cl::Context>(new cl::Context(*device_, context_prop, nullptr, nullptr, &err));
    if (err != CL_SUCCESS) {
        LOGE(
            "Create special opencl context failed, create common opencl "
            "context instead.\n");
        context_ = std::shared_ptr<cl::Context>(new cl::Context(*device_, nullptr, nullptr, nullptr, &err));
    }
#else
    LOGI("Create common opencl context\n");
    context_ = std::shared_ptr<cl::Context>(new cl::Context(*device_, nullptr, nullptr, nullptr, &err));
#endif
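With the shared context in place, an OpenGL texture can be wrapped as a CL image; CL must acquire the object before use and release it afterwards. A minimal sketch (gl_tex, queue and the elided kernel launch are placeholders, not TNN code; context_ is the shared context created above):
// Wrap an existing GL texture and hand ownership back and forth.
cl_int err;
cl::ImageGL cl_image(*context_, CL_MEM_READ_WRITE, GL_TEXTURE_2D, 0, gl_tex, &err);
std::vector<cl::Memory> shared = {cl_image};
glFinish();                              // GL must be done with the texture first
queue.enqueueAcquireGLObjects(&shared);  // ownership: GL -> CL
// ... enqueue kernels that read/write cl_image ...
queue.enqueueReleaseGLObjects(&shared);  // ownership: CL -> GL
queue.finish();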
tune
// https://github.com/Tencent/TNN/blob/4b9ffbecc22f5ea4ba6bc4fdacff85475a59d08d/source/tnn/device/opencl/acc/opencl_layer_acc.cc#L160
Status OpenCLLayerAcc::Forward(const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs) {
#if defined(LOCAL_SIZE_FINE_TUNE) && TNN_PROFILE
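    // Fine-tune mode: re-enqueue the same kernel with a list of candidate local
    // work-group sizes so per-unit profiling can pick the fastest configuration.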
    auto execute_unit_org = execute_units_[0];
    auto max_wgs          = execute_unit_org.workgroupsize_max;
    std::vector<std::vector<uint32_t>> local_size_list_3d = {
        {16, 4, 1}, {8, 8, 1},   {4, 16, 1}, {2, 32, 1}, {1, 64, 1}, {2, 64, 1}, {4, 64, 1},
        {8, 64, 1}, {16, 64, 1}, {8, 64, 2}, {4, 64, 4}, {2, 64, 8}, {2, 64, 4}, {},
    };
    std::vector<std::vector<uint32_t>> local_size_list_2d = {
        {2, max_wgs / 2},   {4, max_wgs / 4},   {8, max_wgs / 8},
        {16, max_wgs / 16}, {max_wgs / 2, 2},   {max_wgs / 4, 4},
        {max_wgs / 8, 8},   {max_wgs / 16, 16}, {},
    };
    std::vector<uint32_t> local_size_default;
    if (execute_unit_org.global_work_size.size() == 2) {
        local_size_default = LocalWS2DDefault(execute_unit_org);
    } else if (execute_unit_org.global_work_size.size() == 3) {
        local_size_default = LocalWS3DDefault(execute_unit_org);
    }
    OpenCLExecuteUnit exec_unit_default = execute_unit_org;
    exec_unit_default.local_work_size   = local_size_default;
    execute_units_.push_back(exec_unit_default);
    if (execute_unit_org.global_work_size.size() == 2) {
        for (auto local_size : local_size_list_2d) {
            OpenCLExecuteUnit exec_unit_temp = execute_unit_org;
            exec_unit_temp.local_work_size   = local_size;
            execute_units_.push_back(exec_unit_temp);
        }
    } else if (execute_unit_org.global_work_size.size() == 3) {
        for (auto local_size : local_size_list_3d) {
            OpenCLExecuteUnit exec_unit_temp = execute_unit_org;
            exec_unit_temp.local_work_size   = local_size;
            execute_units_.push_back(exec_unit_temp);
        }
    }
#endif
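Under TNN_PROFILE each candidate presumably gets timed; the standard OpenCL way to time a single launch is event profiling on a queue created with CL_QUEUE_PROFILING_ENABLE. A generic sketch, not TNN's profiler (queue, kernel, global and local are placeholders):
// Time one kernel launch via event profiling (timestamps are in nanoseconds).
cl::Event ev;
queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, &ev);
ev.wait();
cl_ulong t0 = ev.getProfilingInfo<CL_PROFILING_COMMAND_START>();
cl_ulong t1 = ev.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double ms = (t1 - t0) * 1e-6;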