Commit 094616b

[Intel GPU] Add 13 Intel GPU kernels (PaddlePaddle#479)

Zhiwei35 authored Apr 6, 2023
1 parent 2d3390d commit 094616b
Showing 37 changed files with 7,165 additions and 43 deletions.
24 changes: 9 additions & 15 deletions backends/intel_gpu/CMakeLists.txt
@@ -30,21 +30,16 @@ endif()
 find_package(PythonInterp 3.9 REQUIRED)
 find_package(PythonLibs 3.9 REQUIRED)

-if(NOT DEFINED ENV{DPL_ROOT})
-  message(FATAL_ERROR "Please set the env var DPL_ROOT!" $ENV{DPL_ROOT})
-endif()
-
 set(DPCPP_COMPIER_PATH "${ONEAPI_PATH}/${ONEAPI_COMPILER_DIR}/linux/bin/dpcpp")
 set(ONEAPI_SYCL_INCLUDE
     "${ONEAPI_PATH}/${ONEAPI_COMPILER_DIR}/linux/include/sycl/")
 set(ONEAPI_SYCL_LIBDIR "${ONEAPI_PATH}/${ONEAPI_COMPILER_DIR}/linux/lib/")
 set(CMAKE_CXX_COMPILER "${DPCPP_COMPIER_PATH}")

-# TODO(Zhiwei35): when using oneDNN, should add oneDNN include dir.
 if(WITH_ONEDNN)
   include_directories(${ONEAPI_SYCL_INCLUDE} ${ONEDNN_INC})
 else()
-  include_directories(${ONEAPI_SYCL_INCLUDE} $ENV{DPL_ROOT}/include)
+  include_directories(${ONEAPI_SYCL_INCLUDE} $ENV{DNNLROOT}/include)
 endif()

 set(PLUGIN_NAME "paddle-custom-intel-gpu")
@@ -65,19 +60,18 @@ endif()

 add_definitions(-std=c++14)

+# custom kernels
+file(
+  GLOB_RECURSE PLUGIN_SRCS
+  RELATIVE ${CMAKE_SOURCE_DIR}
+  kernels/*.cc)
+message(STATUS "PLUGIN_SRCS : ${PLUGIN_SRCS}")
+
 # custom runtime
-set(PLUGIN_SRCS runtime/runtime.cc)
+list(APPEND PLUGIN_SRCS runtime/runtime.cc)
 add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE)
 add_definitions(-DPADDLE_WITH_CUSTOM_KERNEL)

-# custom kernels
-if(WITH_KERNELS)
-  # TODO(Zhiwei35, to be more general when adding many kernels)
-  list(APPEND PLUGIN_SRCS kernels/mean_kernel.cc)
-  message(STATUS "CUSTOM_KERNEL_SRCS : ${CUSTOM_KERNEL_SRCS}")
-  message(STATUS "PLUGIN_SRCS : ${PLUGIN_SRCS}")
-endif()
-
 # build shared library
 add_library(${PLUGIN_NAME} SHARED ${PLUGIN_SRCS})
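With this change, `file(GLOB_RECURSE ...)` collects every source under `kernels/` into `PLUGIN_SRCS`, so the 13 new kernels (and any added later) are compiled into the plugin without further CMake edits.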
8 changes: 4 additions & 4 deletions backends/intel_gpu/README.md
@@ -33,7 +33,7 @@ pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu
 mkdir build && cd build

 cmake ..
-make -j8
+make -j $(nproc)

 # using pip to install the output
 pip install dist/paddle_custom_intel_gpu*.whl
@@ -42,10 +42,10 @@ pip install dist/paddle_custom_intel_gpu*.whl
 ## Verification

 ```bash
-# list available hardware backends
-python -c "import paddle; print(paddle.device.get_all_custom_device_type())"
+# check the plugin status
+python -c "import paddle; print('intel_gpu' in paddle.device.get_all_custom_device_type())"

 # expected output
-['intel_gpu']
+True

 ```
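As a quick smoke test of one of the newly added kernels (an illustrative sketch, not part of this commit; it assumes the wheel built above is installed), the argsort kernel can be exercised directly on the plugin device:

```bash
# hypothetical smoke test: run the new argsort kernel on the intel_gpu device
python -c "
import paddle
paddle.set_device('intel_gpu')
x = paddle.to_tensor([3.0, 1.0, 2.0])
print(paddle.argsort(x))  # ascending sort of [3, 1, 2] -> indices [1, 2, 0]
"
```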
255 changes: 255 additions & 0 deletions backends/intel_gpu/kernels/argsort_kernel.cc
@@ -0,0 +1,255 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "kernels/dnn_support.hpp"
#include "kernels/kernels.h"
#include "kernels/phi_funcs.h"
#include "paddle/phi/capi/all.h"

namespace custom_kernel {

namespace gpu {

template <typename T>
void Transpose(const phi::Context& ctx,
const phi::DenseTensor& x,
const std::vector<int64_t>& axis,
T* out_data,
const std::vector<int64_t>& out_dims,
int64_t out_numel) {
auto x_dims = x.dims();
auto x_data = x.data<T>();
show_kernel("TransposeKernel");
show_debug("x{dims}=" << x.dims() << " x{rank}=" << x_dims.size()
<< " out{dims}=" << out_dims);

if (out_numel == 0) {
return;
}
auto rank = x_dims.size();
  if (rank == 1) {
    // rank-1 transpose is the identity: copy and return early.
    memcpy(out_data, x_data, x.numel() * sizeof(T));
    return;
  }
  PD_CHECK(axis.size() == rank,
           "axis.size (%d) must be equal to the rank of input (%d).",
           axis.size(),
           rank);

std::vector<size_t> step(out_dims.size(), 1);
for (auto i = out_dims.size() - 1; i > 0; --i) {
step[i - 1] = step[i] * out_dims[i];
}

std::vector<size_t> index(rank, 0);
for (auto i = 0; i < x.numel(); ++i) {
std::vector<size_t> dst_index(rank, 0);
for (auto j = 0; j < rank; ++j) {
dst_index[j] = index[axis[j]];
}
out_data[phi::vec_product(dst_index, step)] = x_data[i];

index.back()++;
for (auto j = rank - 1; j > 0; --j) {
if (index[j] >= x_dims[j]) {
index[j - 1]++;
index[j] = 0;
} else {
break;
}
}
}
}
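
// Worked example for the Transpose above (illustration only, not part of the
// original source): a 2x3 row-major input transposed with axis = {1, 0} has
// out_dims = {3, 2}, hence step = {2, 1}. The element at multi-index (0, 1)
// maps to destination index (index[axis[0]], index[axis[1]]) = (1, 0), i.e.
// offset 1*2 + 0*1 = 2; applying this to every element turns
// {1, 2, 3, 4, 5, 6} (shape {2, 3}) into {1, 4, 2, 5, 3, 6} (shape {3, 2}).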

template <typename T>
void FullSort(int input_height,
int input_width,
int input_dim,
T* input,
T* t_out,
int64_t* t_indices,
bool descending) {
for (int i = 0; i < input_height; ++i) {
std::vector<std::pair<T, int>> col_vec;
col_vec.reserve(input_width);
if (input_dim == 1) {
for (int j = 0; j < input_width; ++j) {
col_vec.push_back(std::pair<T, int>(input[j], j));
}
} else {
for (int j = 0; j < input_width; ++j) {
col_vec.push_back(std::pair<T, int>(input[i * input_width + j], j));
}
}
std::sort(col_vec.begin(),
col_vec.end(),
[&](const std::pair<T, int>& l, const std::pair<T, int>& r) {
if (descending)
                  // TODO(Zhiwei35): comparison with NaN always evaluates to
                  // false in fast floating-point modes; needs to be enhanced.
return (std::isnan(static_cast<double>(l.first)) &&
!std::isnan(static_cast<double>(r.first))) ||
(l.first > r.first);
else
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
for (int j = 0; j < input_width; ++j) {
t_out[i * input_width + j] = col_vec[j].first;
t_indices[i * input_width + j] = col_vec[j].second;
}
}
}
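
// Behavior note for the comparator in FullSort (illustration only): with
// descending == true, sorting {2.0, NaN, 1.0, 3.0} yields
// {NaN, 3.0, 2.0, 1.0} -- NaNs deliberately order first; with
// descending == false they order last, after {1.0, 2.0, 3.0}.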

template <typename T>
void ArgsortKernel(const phi::Context& dev_ctx,
const phi::DenseTensor& input,
int axis,
bool descending,
phi::DenseTensor* output,
phi::DenseTensor* indices) {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto out_size = output->numel();
auto ids_size = indices->numel();
auto out_mem_size = out_size * sizeof(T);
auto ids_mem_size = ids_size * sizeof(int64_t);
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
T* out_data = dev_ctx.template Alloc<T>(output);
int64_t* ids_data = dev_ctx.template Alloc<int64_t>(indices);

show_kernel("argsort in_dims=" << in_dims << " axis=" << axis << " type="
<< dnn_support::type2String<T>::name()
<< " desc=" << descending);
// TODO(Zhiwei35): support argsort with dims >=3
PD_CHECK(in_dims.size() < 3, "PoC Lenet/Mnist use case only");
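  // On this plugin the device stream is a SYCL queue; recover it to issue
  // host<->device copies.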
auto* q = static_cast<sycl::queue*>(const_cast<void*>(dev_ctx.stream()));
size_t n = 1;
size_t m = in_dims[0];

if (in_dims.size() == 2) {
n = in_dims[0];
m = in_dims[1];
}
phi::DenseTensor cpu_input;
cpu_input.Resize(std::vector<int64_t>(in_dims));
cpu_input.set_dtype(input.dtype());
auto cpu_input_data = dev_ctx.template HostAlloc<T>(&cpu_input);

auto input_data = input.data<T>();
q->memcpy(cpu_input_data, input_data, input.memory_size());
q->wait();
  // CPU implementation: sort on the host, then copy the results back
phi::DenseTensor cpu_output;
cpu_output.Resize(std::vector<int64_t>(out_dims));
cpu_output.set_dtype(output->dtype());
auto cpu_output_dims = cpu_output.dims();
auto cpu_output_numel = cpu_output.numel();
auto cpu_output_data = dev_ctx.template HostAlloc<T>(&cpu_output);

phi::DenseTensor cpu_ids;
cpu_ids.Resize(std::vector<int64_t>(indices->dims()));
cpu_ids.set_dtype(indices->dtype());
auto cpu_ids_dims = cpu_ids.dims();
auto cpu_ids_numel = cpu_ids.numel();
auto cpu_ids_data = dev_ctx.template HostAlloc<int64_t>(&cpu_ids);
  // no transpose needed: already sorting along the last axis
if (axis == -1 || axis + 1 == in_dims.size()) {
const int input_height = n;
const int input_width = m;
FullSort<T>(input_height,
input_width,
in_dims.size(),
cpu_input_data,
cpu_output_data,
cpu_ids_data,
descending);
} else {
    // build the permutation that moves `axis` to the last dimension
std::vector<int64_t> trans;
for (int i = 0; i < axis; i++) {
trans.push_back(i);
}
trans.push_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.push_back(i);
}
trans.push_back(axis);
std::vector<int64_t> trans_dims(in_dims.cbegin(), in_dims.cend());
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
}

phi::DenseTensor trans_inp;
trans_inp.Resize(trans_dims);
auto trans_input_dims = trans_inp.dims();
auto trans_input_numel = trans_inp.numel();
auto trans_input_data = dev_ctx.template HostAlloc<T>(&trans_inp);
    // transpose the input on the host
Transpose<T>(dev_ctx,
cpu_input,
trans,
trans_input_data,
trans_input_dims,
trans_input_numel);

const int64_t input_height = trans_dims[0];
const int64_t input_width = trans_dims[trans_dims.size() - 1];

phi::DenseTensor cpu_tmp_output;
cpu_tmp_output.Resize(trans_dims);
cpu_tmp_output.set_dtype(output->dtype());
auto cpu_tmp_output_data = dev_ctx.template HostAlloc<T>(&cpu_tmp_output);

phi::DenseTensor cpu_tmp_ids;
cpu_tmp_ids.Resize(trans_dims);
cpu_tmp_ids.set_dtype(indices->dtype());
auto cpu_tmp_ids_data = dev_ctx.template HostAlloc<int64_t>(&cpu_tmp_ids);

FullSort<T>(input_height,
input_width,
trans_dims.size(),
trans_input_data,
cpu_tmp_output_data,
cpu_tmp_ids_data,
descending);

Transpose<int64_t>(
dev_ctx, cpu_tmp_ids, trans, cpu_ids_data, cpu_ids_dims, cpu_ids_numel);
// CPU transpose back
Transpose<T>(dev_ctx,
cpu_tmp_output,
trans,
cpu_output_data,
cpu_output_dims,
cpu_output_numel);
}
  // copy the CPU results back to the Intel GPU
q->memcpy(out_data, cpu_output_data, out_mem_size);
q->memcpy(ids_data, cpu_ids_data, ids_mem_size);
q->wait();
} // ArgsortKernel

} // namespace gpu

} // namespace custom_kernel

PD_BUILD_PHI_KERNEL(argsort,
intel_gpu,
ALL_LAYOUT,
custom_kernel::gpu::ArgsortKernel,
float,
double,
int,
int64_t) {}
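
PD_BUILD_PHI_KERNEL registers custom_kernel::gpu::ArgsortKernel under the argsort op for the intel_gpu backend, for all layouts and the four listed dtypes (float, double, int, int64_t). Note that the implementation is a host-staged fallback: it copies the input off the SYCL queue, sorts on the CPU, and copies the sorted values and indices back to the device, consistent with the PoC scope asserted by the in_dims.size() < 3 check.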