From 2cfde9de8da05ce8606d033b69cebd6b84e762e4 Mon Sep 17 00:00:00 2001
From: strint
Date: Fri, 18 Aug 2023 04:48:35 +0000
Subject: [PATCH 01/65] change for sd torch graph compile

---
 oneflow/api/python/framework/device.cpp | 7 ++++---
 python/oneflow/nn/graph/graph.py        | 1 +
 python/oneflow/nn/modules/module.py     | 6 +++---
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/oneflow/api/python/framework/device.cpp b/oneflow/api/python/framework/device.cpp
index 445b953aac4..bb662f50793 100644
--- a/oneflow/api/python/framework/device.cpp
+++ b/oneflow/api/python/framework/device.cpp
@@ -31,9 +31,10 @@ ONEFLOW_API_PYBIND11_MODULE("", m) {
       .def(py::init([](const std::string& type_or_type_with_device_id) {
         return Device::ParseAndNew(type_or_type_with_device_id).GetOrThrow();
       }))
-      .def(py::init([](const std::string& type, int64_t device_id) {
-        return Device::New(type, device_id).GetOrThrow();
-      }))
+      .def(py::init([](const std::string& type, int64_t index) {
+        return Device::New(type, index).GetOrThrow();
+      }),
+      py::arg("type"), py::arg("index"))
       .def(py::init([](const Symbol<Device>& other_device) { return other_device; }))
       .def_property_readonly("type", [](const Symbol<Device>& d) { return d->type(); })
       .def_property_readonly("index", [](const Symbol<Device>& d) { return d->device_id(); })
diff --git a/python/oneflow/nn/graph/graph.py b/python/oneflow/nn/graph/graph.py
index 2b6aca3627c..3b4c97f7a2a 100644
--- a/python/oneflow/nn/graph/graph.py
+++ b/python/oneflow/nn/graph/graph.py
@@ -274,6 +274,7 @@ def __call__(self, *args, **kwargs):
         Donot override this function.
         """
         # For cache cache graphs with dynamic input shape.
+        print(self.name, " called.")
         if self._run_with_cache == True:
             return self._dynamic_input_graph_cache(*args, **kwargs)
 
diff --git a/python/oneflow/nn/modules/module.py b/python/oneflow/nn/modules/module.py
index 3bdf4d63ca7..7196549e045 100644
--- a/python/oneflow/nn/modules/module.py
+++ b/python/oneflow/nn/modules/module.py
@@ -1842,9 +1842,9 @@ def __repr__(self):
 
     def _shallow_repr(self):
         extra_lines = []
-        extra_repr = self.extra_repr()
-        if extra_repr:
-            extra_lines = extra_repr.split("\n")
+        #extra_repr = self.extra_repr()
+        # if extra_repr:
+        # extra_lines = extra_repr.split("\n")
         lines = extra_lines
         main_str = self._get_name() + "("
         if lines:

From f04cc42dbb73d3186004ee424bdc68e745db639d Mon Sep 17 00:00:00 2001
From: oneflow-ci-bot
Date: Fri, 18 Aug 2023 04:51:54 +0000
Subject: [PATCH 02/65] auto format by CI

---
 oneflow/api/python/framework/device.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/oneflow/api/python/framework/device.cpp b/oneflow/api/python/framework/device.cpp
index bb662f50793..df7278a2dd1 100644
--- a/oneflow/api/python/framework/device.cpp
+++ b/oneflow/api/python/framework/device.cpp
@@ -32,9 +32,9 @@ ONEFLOW_API_PYBIND11_MODULE("", m) {
         return Device::ParseAndNew(type_or_type_with_device_id).GetOrThrow();
       }))
       .def(py::init([](const std::string& type, int64_t index) {
-        return Device::New(type, index).GetOrThrow();
-      }),
-      py::arg("type"), py::arg("index"))
+             return Device::New(type, index).GetOrThrow();
+           }),
+           py::arg("type"), py::arg("index"))
       .def(py::init([](const Symbol<Device>& other_device) { return other_device; }))
       .def_property_readonly("type", [](const Symbol<Device>& d) { return d->type(); })
       .def_property_readonly("index", [](const Symbol<Device>& d) { return d->device_id(); })
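[Note on patches 01-02] Renaming the constructor's second parameter from device_id to
index and binding the argument names with py::arg("type"), py::arg("index") makes the
oneflow.device constructor keyword-callable, matching torch.device's signature. A
minimal Python sketch of the resulting behavior (illustrative, not part of the patch):

    import oneflow as flow

    d0 = flow.device("cuda", 0)             # positional form, unchanged
    d1 = flow.device(type="cuda", index=0)  # keyword form enabled by the py::arg bindings
    assert d1.type == "cuda" and d1.index == 0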
From c30278e74a04a19a25f55e5b540b91c6d8491a71 Mon Sep 17 00:00:00 2001
From: Xiaoyu Xu
Date: Fri, 18 Aug 2023 15:42:53 +0800
Subject: [PATCH 03/65] Update python/oneflow/nn/graph/graph.py

Co-authored-by: Houjiang Chen
---
 python/oneflow/nn/graph/graph.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/oneflow/nn/graph/graph.py b/python/oneflow/nn/graph/graph.py
index 3b4c97f7a2a..2b6aca3627c 100644
--- a/python/oneflow/nn/graph/graph.py
+++ b/python/oneflow/nn/graph/graph.py
@@ -274,7 +274,6 @@ def __call__(self, *args, **kwargs):
         Donot override this function.
         """
         # For cache cache graphs with dynamic input shape.
-        print(self.name, " called.")
         if self._run_with_cache == True:
             return self._dynamic_input_graph_cache(*args, **kwargs)
 
From b0d72a3b82a28b56985a7ad100c27c29fb90c9bc Mon Sep 17 00:00:00 2001
From: oneflow-ci-bot
Date: Fri, 18 Aug 2023 07:44:42 +0000
Subject: [PATCH 04/65] auto format by CI

---
 python/oneflow/nn/modules/module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/oneflow/nn/modules/module.py b/python/oneflow/nn/modules/module.py
index 7196549e045..c64a6eb3321 100644
--- a/python/oneflow/nn/modules/module.py
+++ b/python/oneflow/nn/modules/module.py
@@ -1842,7 +1842,7 @@ def __repr__(self):
 
     def _shallow_repr(self):
         extra_lines = []
-        #extra_repr = self.extra_repr()
+        # extra_repr = self.extra_repr()
         # if extra_repr:
         # extra_lines = extra_repr.split("\n")
         lines = extra_lines

From ed27149a92b801ce9a3349fb1b1c532c74d6b282 Mon Sep 17 00:00:00 2001
From: strint
Date: Fri, 18 Aug 2023 12:18:18 +0000
Subject: [PATCH 05/65] restore md

---
 python/oneflow/nn/modules/module.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/oneflow/nn/modules/module.py b/python/oneflow/nn/modules/module.py
index c64a6eb3321..3bdf4d63ca7 100644
--- a/python/oneflow/nn/modules/module.py
+++ b/python/oneflow/nn/modules/module.py
@@ -1842,9 +1842,9 @@ def __repr__(self):
 
     def _shallow_repr(self):
         extra_lines = []
-        # extra_repr = self.extra_repr()
-        # if extra_repr:
-        # extra_lines = extra_repr.split("\n")
+        extra_repr = self.extra_repr()
+        if extra_repr:
+            extra_lines = extra_repr.split("\n")
         lines = extra_lines
         main_str = self._get_name() + "("
         if lines:
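[Note on patch 06, below] The int8 conv kernel added next consults the op's
"tuning_cache" attribute before falling back to the online CutlassConvTuner search. The
attribute holds a JSON object whose "cutlass" entry names one generated operation
instance. A minimal sketch of such a cache value (the operation name is one of the
instances registered by the generated files in this patch; how the attribute reaches the
op is outside this diff and assumed here):

    import json

    # Illustrative only: pin one int8 NHWC fprop instance by name instead of autotuning.
    tuning_cache = json.dumps(
        {"cutlass": "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_align16"}
    )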
From df4a31d10880174e9380c029bbe7e2faa9679f1d Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Mon, 21 Aug 2023 15:05:55 +0000
Subject: [PATCH 06/65] update cutlass and support int8 conv

---
 CMakeLists.txt                                |   4 +-
 cmake/third_party.cmake                       |   1 +
 cmake/third_party/cutlass.cmake               |  21 ++++
 oneflow/user/kernels/conv_cutlass_kernels.cu  | 116 ++++++++++++++++++
 ...32fprop_optimized_s8_128x128_128x4_nhwc.cu |  81 ++++++++++++
 ...832fprop_optimized_s8_128x32_128x4_nhwc.cu |  81 ++++++++++++
 ...6832fprop_optimized_s8_128x64_64x4_nhwc.cu |  81 ++++++++++++
 ...6832fprop_optimized_s8_128x64_64x6_nhwc.cu |  81 ++++++++++++
 ...832fprop_optimized_s8_32x128_128x4_nhwc.cu |  81 ++++++++++++
 ...816fprop_optimized_s8_128x128_32x2_nhwc.cu |  87 +++++++++++++
 ...816fprop_optimized_s8_128x128_64x2_nhwc.cu |  87 +++++++++++++
 ...816fprop_optimized_s8_128x256_32x2_nhwc.cu |  87 +++++++++++++
 ...816fprop_optimized_s8_128x256_64x2_nhwc.cu |  87 +++++++++++++
 ...8816fprop_optimized_s8_128x32_32x2_nhwc.cu |  86 +++++++++++++
 ...8816fprop_optimized_s8_128x32_64x2_nhwc.cu |  86 +++++++++++++
 ...8816fprop_optimized_s8_128x64_32x2_nhwc.cu |  86 +++++++++++++
 ...8816fprop_optimized_s8_128x64_64x2_nhwc.cu |  86 +++++++++++++
 ...8816fprop_optimized_s8_64x128_32x2_nhwc.cu |  86 +++++++++++++
 ...8816fprop_optimized_s8_64x128_64x2_nhwc.cu |  86 +++++++++++++
 ...i8816fprop_optimized_s8_64x32_32x2_nhwc.cu |  86 +++++++++++++
 ...i8816fprop_optimized_s8_64x32_64x2_nhwc.cu |  86 +++++++++++++
 ...i8816fprop_optimized_s8_64x64_32x2_nhwc.cu |  86 +++++++++++++
 ...i8816fprop_optimized_s8_64x64_64x2_nhwc.cu |  86 +++++++++++++
 oneflow/user/kernels/fused_glu_kernel.cu      |  27 ++--
 oneflow/user/ops/conv_op.cpp                  |   5 +
 25 files changed, 1777 insertions(+), 10 deletions(-)
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu
 create mode 100644 oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0fe46d0a5ef..0768ee9fed2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -212,9 +212,9 @@ if(BUILD_PYTHON)
 endif(BUILD_PYTHON)
 
 set(CUTLASS_URL
-    https://github.com/Oneflow-Inc/cutlass/archive/e6f548d80bfdf1167d66adbbbcfc2ee3394f4777.zip)
+    https://github.com/Oneflow-Inc/cutlass/archive/6429516476b11a8a2b6aace2e1402c39f914e1b2.zip)
 use_mirror(VARIABLE CUTLASS_URL URL ${CUTLASS_URL})
-set(CUTLASS_MD5 425f8cf064ff47c81124e55490135f5c)
+set(CUTLASS_MD5 8357dec4a45ccb762ff03e3cf99f0a55)
 
 include(cuda)
 add_subdirectory(external)
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index c7ac2893e6e..832b91b2a44 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -157,6 +157,7 @@ if(BUILD_CUDA)
 
   if(WITH_CUTLASS)
     list(APPEND oneflow_third_party_dependencies cutlass)
+    list(APPEND oneflow_third_party_dependencies cutlass_copy_extra_library_to_destination)
    list(APPEND oneflow_third_party_dependencies cutlass_copy_examples_to_destination)
     list(APPEND oneflow_third_party_libs ${CUTLASS_LIBRARIES})
     list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUTLASS_INCLUDE_DIR})
diff --git a/cmake/third_party/cutlass.cmake b/cmake/third_party/cutlass.cmake
index e11bfe7e16d..20114d13006 100644
--- a/cmake/third_party/cutlass.cmake
+++ b/cmake/third_party/cutlass.cmake
@@ -60,6 +60,26 @@ if(WITH_CUTLASS)
     -DCUTLASS_LIBRARY_DEBUG_POSTFIX:STRING=
     -DCUTLASS_NVCC_EMBED_PTX:BOOL=OFF)
 
+  add_custom_target(cutlass_copy_extra_library_to_destination DEPENDS cutlass)
+  set(CUTLASS_SOURCE_LIBRARY_DIR ${CUTLASS_SOURCE_DIR}/tools/library)
+  set(CUTLASS_INSTALL_EXTRA_LIBRARY_FILES
+      "src/conv2d_operation.h"
+      "src/conv3d_operation.h"
+      "src/gemm_operation_3x.hpp"
+      "src/gemm_operation.h"
+      "src/library_internal.h"
+      "src/rank_2k_operation.h"
+      "src/rank_k_operation.h"
+      "src/symm_operation.h"
+      "src/trmm_operation.h"
+  )
+  foreach(filename ${CUTLASS_INSTALL_EXTRA_LIBRARY_FILES})
+    add_custom_command(
+      TARGET cutlass_copy_extra_library_to_destination
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CUTLASS_SOURCE_LIBRARY_DIR}/${filename}
+              ${CUTLASS_INSTALL_DIR}/include/cutlass/library/${filename})
+  endforeach()
+
   add_custom_target(cutlass_copy_examples_to_destination DEPENDS cutlass)
 
   set(CUTLASS_SOURCE_EXAMPLES_DIR ${CUTLASS_SOURCE_DIR}/examples)
@@ -67,6 +87,7 @@ if(WITH_CUTLASS)
       "45_dual_gemm/test_run.h"
       "45_dual_gemm/kernel/dual_gemm.h"
       "45_dual_gemm/device/dual_gemm.h"
+      "45_dual_gemm/dual_gemm_common.h"
      "45_dual_gemm/dual_gemm_run.h"
       "45_dual_gemm/thread/left_silu_and_mul.h"
       "45_dual_gemm/threadblock/dual_mma_multistage.h"
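[Note] The cutlass_copy_extra_library_to_destination target above exists because the
kernels in this patch include CUTLASS library internals such as
"cutlass/library/src/conv2d_operation.h" and "cutlass/library/src/library_internal.h",
which a plain CUTLASS install does not ship. The copy step republishes those headers
under ${CUTLASS_INSTALL_DIR}/include/cutlass/library/ so the conv2d instance files below
can compile against the installed tree.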
diff --git a/oneflow/user/kernels/conv_cutlass_kernels.cu b/oneflow/user/kernels/conv_cutlass_kernels.cu
index 52c3acf9e38..5026673e8ee 100644
--- a/oneflow/user/kernels/conv_cutlass_kernels.cu
+++ b/oneflow/user/kernels/conv_cutlass_kernels.cu
@@ -180,6 +180,122 @@ REGISTER_USER_KERNEL("conv2d")
     })
     .SetPriority(user_op::kKernelPriorityOptimized);
 
+class Conv2dInt8CutlassKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport {
+ public:
+  Conv2dInt8CutlassKernel() = default;
+  ~Conv2dInt8CutlassKernel() override = default;
+
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*,
+               const user_op::OpKernelCache* cache) const override {
+    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
+    const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0);
+    const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0);
+    const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0);
+    CHECK(add_to_output == nullptr);
+    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
+    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
+
+    const auto& padding_before = ctx->Attr<std::vector<int32_t>>("padding_before");
+    const auto& dilation_rate = ctx->Attr<std::vector<int32_t>>("dilation_rate");
+    const auto& strides = ctx->Attr<std::vector<int32_t>>("strides");
+
+    const int n = in->shape_view().At(0);
+    const int h = in->shape_view().At(1);
+    const int w = in->shape_view().At(2);
+    const int c = in->shape_view().At(3);
+
+    const int k = weight->shape_view().At(0);
+    const int r = weight->shape_view().At(1);
+    const int s = weight->shape_view().At(2);
+    CHECK_EQ(weight->shape_view().At(3), c);
+
+    const int p = out->shape_view().At(1);
+    const int q = out->shape_view().At(2);
+
+    auto* stream = ctx->stream()->As<ep::CudaStream>();
+
+    cutlass::library::ConvFunctionalKey key(
+        cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop,
+        cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC,
+        cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC,
+        cutlass::library::NumericTypeID::kS32, cutlass::library::LayoutTypeID::kTensorNHWC,
+        cutlass::library::NumericTypeID::kS32, cutlass::library::NumericTypeID::kF32);
+
+    cutlass::conv::Conv2dProblemSize problem_size(
+        n, h, w, c, k, r, s, p, q, padding_before.at(0), padding_before.at(1), strides.at(0),
+        strides.at(1), dilation_rate.at(0), dilation_rate.at(1),
+        cutlass::conv::Mode::kCrossCorrelation);
+    cutlass::library::Conv2dConfiguration configuraion;
+    configuraion.split_k_mode = cutlass::conv::SplitKMode::kSerial;
+    configuraion.problem_size = problem_size;
+    configuraion.stride_a = {c, w * c, h * w * c};
+    configuraion.stride_b = {c, s * c, r * s * c};
+    configuraion.stride_c = {0, 0, 0};
+
+    cutlass::library::ConvArguments arguments;
+    arguments.A = in->dptr();
+    arguments.B = weight->dptr();
+    arguments.reordered_B = nullptr;
+    if (bias == nullptr) {
+      arguments.C = nullptr;
+    } else {
+      arguments.C = bias->dptr();
+    }
+    arguments.D = out->mut_dptr();
+
+    float alpha = 1;
+    float beta = 0;
+    arguments.alpha = &alpha;
+    arguments.beta = &beta;
+    arguments.pointer_mode = cutlass::library::ScalarPointerMode::kHost;
+    const cutlass::library::Operation* operation = nullptr;
+    operation = [&]() -> const cutlass::library::Operation* {
+      const std::string& tuning_cache = ctx->Attr<std::string>("tuning_cache");
+      if (tuning_cache.empty()) { return nullptr; }
+      auto tuning_cache_object = nlohmann::json::parse(tuning_cache);
+      if (!tuning_cache_object.is_object()) { return nullptr; }
+      auto it = tuning_cache_object.find("cutlass");
+      if (it == tuning_cache_object.end()) { return nullptr; }
+      if (!it->is_string()) { return nullptr; }
+      const std::string name = *it;
+      return CutlassConvTuner::Get().GetConv2dOperation(name, stream, key, configuraion, arguments,
+                                                        tmp_buffer->mut_dptr(),
+                                                        tmp_buffer->shape_view().elem_cnt());
+    }();
+    if (!operation) {
+      operation = CutlassConvTuner::Get().FindConv2dOperation(stream, key, configuraion, arguments,
+                                                              tmp_buffer->mut_dptr(),
+                                                              tmp_buffer->shape_view().elem_cnt());
+    }
+
+    CHECK(operation != nullptr);
+    const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion);
+    std::vector<uint8_t> host_workspace(host_workspace_size, 0);
+    auto init_status = operation->initialize(&configuraion, host_workspace.data(),
+                                             tmp_buffer->mut_dptr(), stream->cuda_stream());
+    CHECK(init_status == cutlass::Status::kSuccess);
+    auto run_status = operation->run(&arguments, host_workspace.data(), tmp_buffer->mut_dptr(),
+                                     stream->cuda_stream());
+    CHECK(run_status == cutlass::Status::kSuccess);
+  }
+};
+
+REGISTER_USER_KERNEL("conv2d")
+    .SetCreateFn<Conv2dInt8CutlassKernel>()
+    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
+                     && (user_op::HobAttr<std::string>("data_format") == "channels_last")
+                     && (user_op::HobAttr<int32_t>("groups") == 1)
+                     && (user_op::HobDataType("in", 0) == DataType::kInt8))
+    .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {
+      // use static workspace size
+      return 128 * 1024 * 1024;
+    })
+    .SetPriority(user_op::kKernelPriorityOptimized);
+
 }  // namespace
 
 }  // namespace oneflow
diff --git
a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu new file mode 100644 index 00000000000..9699ad00cce --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu @@ -0,0 +1,81 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance +// "cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc" +template +using cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 4, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc + : public cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc_base {}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc<4, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc<8, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + 
Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc<16, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu new file mode 100644 index 00000000000..bc3fdfe8070 --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu @@ -0,0 +1,81 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance +// "cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc" +template +using cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 32, 128>, cutlass::gemm::GemmShape<32, 32, 128>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 4, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc + : public cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc_base {}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + 
Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc<4, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc<8, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc<16, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu new file mode 100644 index 00000000000..63e624e1054 --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu @@ -0,0 +1,81 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance +// "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc" +template +using cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 4, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc + : public cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc_base {}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc<4, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc<8, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc<16, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu new file mode 100644 index 00000000000..d125f667776 --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu @@ -0,0 +1,81 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance +// "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc" +template +using cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 6, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc + : public cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc_base {}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc<4, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc<8, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc<16, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu 
new file mode 100644 index 00000000000..35ac414af11 --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu @@ -0,0 +1,81 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance +// "cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc" +template +using cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 128, 128>, cutlass::gemm::GemmShape<32, 32, 128>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 4, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc + : public cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc_base {}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc<4, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc<8, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc<16, 4>>( + "cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc_align16")); +} + 
+/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu new file mode 100644 index 00000000000..00877fabdaa --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu @@ -0,0 +1,87 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance +// "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_base {}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_align1")); + manifest.append(new 
cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc<8, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu new file mode 100644 index 00000000000..cfd8bf58875 --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu @@ -0,0 +1,87 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance +// "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_base {}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc<8, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu new file mode 100644 index 
00000000000..aaa4e2aa932 --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu @@ -0,0 +1,87 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance +// "cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 32>, cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_base {}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc<8, 
4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu new file mode 100644 index 00000000000..23686d350a7 --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu @@ -0,0 +1,87 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance +// "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_base {}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void 
initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc<8, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu new file mode 100644 index 00000000000..6f24f81b865 --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu @@ -0,0 +1,86 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 32, 32>, cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_base { +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc<8, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu new file mode 100644 index 00000000000..9bdeaaf6e3e --- /dev/null 
+++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu @@ -0,0 +1,86 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 32, 64>, cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_base { +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc<8, 4>>( + 
"cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu new file mode 100644 index 00000000000..b521864ecfa --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu @@ -0,0 +1,86 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 32>, cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_base { +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void 
initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc<8, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu new file mode 100644 index 00000000000..394793bb92c --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu @@ -0,0 +1,86 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. 
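+
+  Naming convention: "128x64" is the threadblock tile (M x N), "64x2" the
+  threadblock K extent and pipeline stage count, and "nhwc" the tensor layout;
+  these correspond to the GemmShape and stage arguments instantiated below.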
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_base { +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc<8, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu new file mode 100644 index 00000000000..39c0543700e --- /dev/null 
+++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu @@ -0,0 +1,86 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_base { +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc<8, 4>>( + 
"cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu new file mode 100644 index 00000000000..2664b912fc3 --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu @@ -0,0 +1,86 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_base { +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void 
initialize_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc<8, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu new file mode 100644 index 00000000000..48d2517d25d --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu @@ -0,0 +1,86 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. 
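+
+  "i8816" refers to the int8 8x8x16 Tensor Core MMA instruction shape
+  (cutlass::gemm::GemmShape<8, 8, 16> with OpMultiplyAddSaturate) targeted at
+  cutlass::arch::Sm75 (Turing).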
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_base { +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc<8, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu new file mode 100644 index 00000000000..07427fe7b3d --- /dev/null +++ 
b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu @@ -0,0 +1,86 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_base { +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc<8, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_align8")); + 
manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu new file mode 100644 index 00000000000..1ca85bc86e5 --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu @@ -0,0 +1,86 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_base { +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + 
Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc<8, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu new file mode 100644 index 00000000000..26d86b37bb8 --- /dev/null +++ b/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu @@ -0,0 +1,86 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + Generated by conv2d_operation.py - Do not edit. 
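+
+  This configuration fixes the threadblock swizzle to
+  GemmIdentityThreadblockSwizzle<4>; the split-K swizzle named in the adjacent
+  generated comment is left disabled in the generated code.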
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "cutlass/library/src/conv2d_operation.h" +#include "cutlass/library/src/library_internal.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc" +template +using cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, + cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< + 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; + +// Derived class +template +struct cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc + : public cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_base { +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc = + cutlass::conv::device::ImplicitGemmConvolution< + cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc>; + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc(Manifest& manifest) { + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc<1, 1>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_align1")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc<2, 2>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_align2")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc<4, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_align4")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc<8, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_align8")); + manifest.append(new cutlass::library::Conv2dOperation< + Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc<16, 4>>( + "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_align16")); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/fused_glu_kernel.cu b/oneflow/user/kernels/fused_glu_kernel.cu index 56a2e81a957..3d5d8858f56 100644 --- a/oneflow/user/kernels/fused_glu_kernel.cu +++ b/oneflow/user/kernels/fused_glu_kernel.cu @@ -173,10 +173,11 @@ void DualGemmGegluHalf(ep::CudaStream* stream, int32_t m, int32_t 
n, int32_t k, constexpr bool kStoreD1 = true; using DualGemm = cutlass::gemm::device::DualGemm< ElementOperandA, cutlass::layout::RowMajor, ElementOperandB, cutlass::layout::ColumnMajor, - ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, cutlass::arch::OpClassTensorOp, - Arch, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1, - EpilogueOutputOp2, cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, kStages, - kStoreD0, kStoreD1, kSplitKSerial>; + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, Arch, ThreadblockShape, WarpShape, InstructionShape, + EpilogueOutputOp0, EpilogueOutputOp1, EpilogueOutputOp2, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, kStages, kStoreD0, kStoreD1, + kSplitKSerial>; int split_k_slices = DualGemm::kSplitKSerial ? 2 : 1; @@ -198,10 +199,20 @@ void DualGemmGegluHalf(ep::CudaStream* stream, int32_t m, int32_t n, int32_t k, reinterpret_cast(y), n); cutlass::gemm::GemmCoord problem_size(m, n, k); - typename DualGemm::Arguments arguments{ - problem_size, tensor_a0, tensor_b0, tensor_bias0, tensor_d0, - tensor_b1, tensor_bias1, tensor_d1, tensor_out, {alpha0, beta0}, - {alpha1, beta1}, {}, split_k_slices}; + typename DualGemm::Arguments arguments{cutlass::gemm::DualGemmMode::kGemm, + problem_size, + tensor_a0, + tensor_b0, + tensor_bias0, + tensor_d0, + tensor_b1, + tensor_bias1, + tensor_d1, + tensor_out, + {alpha0, beta0}, + {alpha1, beta1}, + {}, + split_k_slices}; DualGemm dual_gemm_op; dual_gemm_op.initialize(arguments, stream->cublas_workspace(), stream->cuda_stream()); diff --git a/oneflow/user/ops/conv_op.cpp b/oneflow/user/ops/conv_op.cpp index bfc4e42616a..08d6e2e6714 100644 --- a/oneflow/user/ops/conv_op.cpp +++ b/oneflow/user/ops/conv_op.cpp @@ -52,6 +52,11 @@ Maybe InferTensorDesc4Conv(user_op::InferContext* ctx) { } out->set_is_dynamic(in.is_dynamic()); out->set_shape(Shape(out_shape)); + if (data_format == "channels_last") { + out->set_memory_format(MemoryFormat::kChannelsLast); + } else { + out->set_memory_format(MemoryFormat::kContiguous); + } } { From cc82f669ed8777923bf4feb99019492e8a515c65 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Tue, 22 Aug 2023 03:11:26 +0000 Subject: [PATCH 07/65] tuning int8 conv kernels --- CMakeLists.txt | 4 +- oneflow/user/kernels/conv_cutlass_kernels.cu | 6 +- ...32fprop_optimized_s8_128x128_128x4_nhwc.cu | 4 +- ...832fprop_optimized_s8_128x32_128x4_nhwc.cu | 4 +- ...6832fprop_optimized_s8_128x64_64x4_nhwc.cu | 4 +- ...6832fprop_optimized_s8_128x64_64x6_nhwc.cu | 4 +- ...832fprop_optimized_s8_32x128_128x4_nhwc.cu | 4 +- ...816fprop_optimized_s8_128x128_32x2_nhwc.cu | 4 +- ...816fprop_optimized_s8_128x128_64x2_nhwc.cu | 4 +- ...816fprop_optimized_s8_128x256_32x2_nhwc.cu | 4 +- ...816fprop_optimized_s8_128x256_64x2_nhwc.cu | 4 +- ...8816fprop_optimized_s8_128x32_32x2_nhwc.cu | 4 +- ...8816fprop_optimized_s8_128x32_64x2_nhwc.cu | 4 +- ...8816fprop_optimized_s8_128x64_32x2_nhwc.cu | 4 +- ...8816fprop_optimized_s8_128x64_64x2_nhwc.cu | 4 +- ...8816fprop_optimized_s8_64x128_32x2_nhwc.cu | 4 +- ...8816fprop_optimized_s8_64x128_64x2_nhwc.cu | 4 +- ...i8816fprop_optimized_s8_64x32_32x2_nhwc.cu | 4 +- ...i8816fprop_optimized_s8_64x32_64x2_nhwc.cu | 4 +- ...i8816fprop_optimized_s8_64x64_32x2_nhwc.cu | 4 +- ...i8816fprop_optimized_s8_64x64_64x2_nhwc.cu | 4 +- .../kernels/cutlass/external_singleton.cpp | 34 +++++++++++ .../user/kernels/cutlass/external_singleton.h | 61 
+++++++++++++++++++ oneflow/user/kernels/cutlass_conv_tuner.cpp | 21 +++++-- 24 files changed, 173 insertions(+), 29 deletions(-) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu (97%) rename oneflow/user/kernels/{cutlass_operations => cutlass}/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu (97%) create mode 100644 oneflow/user/kernels/cutlass/external_singleton.cpp create mode 100644 oneflow/user/kernels/cutlass/external_singleton.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 0768ee9fed2..eebc44dfc12 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,9 +212,9 @@ if(BUILD_PYTHON) endif(BUILD_PYTHON) set(CUTLASS_URL - https://github.com/Oneflow-Inc/cutlass/archive/6429516476b11a8a2b6aace2e1402c39f914e1b2.zip) + https://github.com/Oneflow-Inc/cutlass/archive/d47b8883b5e3661b41cc8a7a6f4c240c5524647f.zip) use_mirror(VARIABLE CUTLASS_URL URL ${CUTLASS_URL}) -set(CUTLASS_MD5 8357dec4a45ccb762ff03e3cf99f0a55) +set(CUTLASS_MD5 7b417720240a443276ce4bb9ef169db1) include(cuda) add_subdirectory(external) diff --git a/oneflow/user/kernels/conv_cutlass_kernels.cu b/oneflow/user/kernels/conv_cutlass_kernels.cu index 5026673e8ee..e30b25a2c09 100644 --- 
a/oneflow/user/kernels/conv_cutlass_kernels.cu +++ b/oneflow/user/kernels/conv_cutlass_kernels.cu @@ -223,7 +223,7 @@ class Conv2dInt8CutlassKernel final : public user_op::OpKernel, public user_op:: cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, cutlass::library::NumericTypeID::kS32, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kS32, cutlass::library::NumericTypeID::kF32); + cutlass::library::NumericTypeID::kS32, cutlass::library::NumericTypeID::kS32); cutlass::conv::Conv2dProblemSize problem_size( n, h, w, c, k, r, s, p, q, padding_before.at(0), padding_before.at(1), strides.at(0), @@ -247,8 +247,8 @@ class Conv2dInt8CutlassKernel final : public user_op::OpKernel, public user_op:: } arguments.D = out->mut_dptr(); - float alpha = 1; - float beta = 0; + int32_t alpha = 1; + int32_t beta = 0; arguments.alpha = &alpha; arguments.beta = &beta; arguments.pointer_mode = cutlass::library::ScalarPointerMode::kHost; diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu index 9699ad00cce..2158fd6aa6b 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. #include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance @@ -61,7 +63,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc<4, 4>>( "cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc_align4")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu index bc3fdfe8070..134ed531251 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License.
#include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance @@ -61,7 +63,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc<4, 4>>( "cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc_align4")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu index 63e624e1054..193a14eebf2 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. #include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance @@ -61,7 +63,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc<4, 4>>( "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc_align4")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu index d125f667776..a8eed9605d3 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. 
#include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance @@ -61,7 +63,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc<4, 4>>( "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc_align4")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu index 35ac414af11..e858ccabf7f 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. #include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance @@ -61,7 +63,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc<4, 4>>( "cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc_align4")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu index 00877fabdaa..5330d7642cf 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. 
#include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance @@ -61,7 +63,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu index cfd8bf58875..9ae47d1b84b 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. #include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance @@ -61,7 +63,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu index aaa4e2aa932..04928ede0e7 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. 
#include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance @@ -61,7 +63,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu index 23686d350a7..1b8ccb4fab8 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. #include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance @@ -61,7 +63,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu index 6f24f81b865..07ef45ffe39 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. 
#include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc" @@ -60,7 +62,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu index 9bdeaaf6e3e..d9969233748 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. #include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc" @@ -60,7 +62,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu index b521864ecfa..b69aeb78db8 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. 
#include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc" @@ -60,7 +62,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu index 394793bb92c..d545b826aae 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. #include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc" @@ -60,7 +62,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu index 39c0543700e..7d796cf86a6 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. 
#include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc" @@ -60,7 +62,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu index 2664b912fc3..b206262821c 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. #include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc" @@ -60,7 +62,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu index 48d2517d25d..82bfb19a303 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. 
#include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc" @@ -60,7 +62,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu index 07427fe7b3d..85c2aae0ba7 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. #include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc" @@ -60,7 +62,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu index 1ca85bc86e5..2622afa25db 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. 
#include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc" @@ -60,7 +62,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu similarity index 97% rename from oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu rename to oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu index 26d86b37bb8..cc9142b2c45 100644 --- a/oneflow/user/kernels/cutlass_operations/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu +++ b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu @@ -27,6 +27,8 @@ limitations under the License. #include "cutlass/library/src/conv2d_operation.h" #include "cutlass/library/src/library_internal.h" +#include "oneflow/user/kernels/cutlass/external_singleton.h" + /////////////////////////////////////////////////////////////////////////////////////////////////// // Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc" @@ -60,7 +62,7 @@ namespace cutlass { namespace library { // Initialize all instances -void initialize_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc(Manifest& manifest) { +ONEFLOW_CUTLASS_MANIFEST(manifest) { manifest.append(new cutlass::library::Conv2dOperation< Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc<1, 1>>( "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_align1")); diff --git a/oneflow/user/kernels/cutlass/external_singleton.cpp b/oneflow/user/kernels/cutlass/external_singleton.cpp new file mode 100644 index 00000000000..07df60938ab --- /dev/null +++ b/oneflow/user/kernels/cutlass/external_singleton.cpp @@ -0,0 +1,34 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifdef WITH_CUTLASS + +#include "oneflow/user/kernels/cutlass/external_singleton.h" + +namespace cutlass { +namespace library { + +ExternalSingleton::ExternalSingleton() {} + +ExternalSingleton& ExternalSingleton::get() { + static ExternalSingleton instance; + return instance; +} + +} // namespace library +} // namespace cutlass + +#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/cutlass/external_singleton.h b/oneflow/user/kernels/cutlass/external_singleton.h new file mode 100644 index 00000000000..2d8d52a20b7 --- /dev/null +++ b/oneflow/user/kernels/cutlass/external_singleton.h @@ -0,0 +1,61 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifdef WITH_CUTLASS + +#include + +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" +#include "cutlass/library/operation_table.h" + +#include "oneflow/core/common/util.h" + +namespace cutlass { +namespace library { + +/// Singleton instance stores a Manifest and Operation table +class ExternalSingleton { + public: + /// Manifest object + std::vector manifest; + + /// Operation table referencing the Manifest + OperationTable operation_table; + + public: + ExternalSingleton(); + + static ExternalSingleton& get(); +}; + +#define ONEFLOW_CUTLASS_MANIFEST(m) ONEFLOW_CUTLASS_MANIFEST_IMPL(m, __COUNTER__) + +#define ONEFLOW_CUTLASS_MANIFEST_IMPL(m, uuid) \ + static void OF_PP_CAT(_cutlass_manifest_, uuid)(Manifest & m); \ + static int OF_PP_CAT(_cutlass_manifest_dummy_, uuid) = []() { \ + auto& manifest = ExternalSingleton::get().manifest; \ + manifest.resize(manifest.size() + 1); \ + OF_PP_CAT(_cutlass_manifest_, uuid)(manifest.back()); \ + ExternalSingleton::get().operation_table.append(manifest.back()); \ + return 0; \ + }(); \ + void OF_PP_CAT(_cutlass_manifest_, uuid)(Manifest & m) + +} // namespace library +} // namespace cutlass + +#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/cutlass_conv_tuner.cpp b/oneflow/user/kernels/cutlass_conv_tuner.cpp index 553a0b73f56..7299a3874f3 100644 --- a/oneflow/user/kernels/cutlass_conv_tuner.cpp +++ b/oneflow/user/kernels/cutlass_conv_tuner.cpp @@ -25,6 +25,8 @@ limitations under the License. 
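The ONEFLOW_CUTLASS_MANIFEST macro above is the classic static-registration idiom: each translation unit pairs a forward-declared manifest function with a dummy static initializer, so every kernel appends itself to the process-wide ExternalSingleton before main() runs, and no central list of initialize_* calls has to be maintained. A minimal Python rendering of the same pattern, with made-up names that are not OneFlow APIs:

    _manifest = []  # plays the role of ExternalSingleton::get().manifest

    def register_op(build_fn):
        # Runs once at import time, like the C++ static initializer above.
        _manifest.append(build_fn())
        return build_fn

    @register_op
    def conv2d_s8_64x64():  # hypothetical kernel entry
        return "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_align1"

    print(_manifest)  # already populated, purely as an import side effect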
#include #include +#include "oneflow/user/kernels/cutlass/external_singleton.h" + namespace oneflow { namespace { @@ -232,11 +234,20 @@ const cutlass::library::Operation* CutlassConvTuner::Impl::FindConv2dOperation( OF_CUDA_CHECK(cudaEventCreate(&end)); const cutlass::library::Operation* fastest_operation = nullptr; float fastest_time = 0; - const auto& operations_map_it = - cutlass::library::Singleton::get().operation_table.conv2d_operations.find(functional_key); - CHECK(operations_map_it - != cutlass::library::Singleton::get().operation_table.conv2d_operations.cend()); - const cutlass::library::ConvOperationVectorMap& operations_map = operations_map_it->second; + const auto& operations_map = [&]() { + const auto& external_operations_map_it = + cutlass::library::ExternalSingleton::get().operation_table.conv2d_operations.find( + functional_key); + if (external_operations_map_it + != cutlass::library::ExternalSingleton::get().operation_table.conv2d_operations.cend()) { + return external_operations_map_it->second; + } + const auto& operations_map_it = + cutlass::library::Singleton::get().operation_table.conv2d_operations.find(functional_key); + CHECK(operations_map_it + != cutlass::library::Singleton::get().operation_table.conv2d_operations.cend()); + return operations_map_it->second; + }(); for (const auto& pair : operations_map) { std::map> operations; From 1e3c555ef7feafa0d4232a8133cd12319568bcd2 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Tue, 22 Aug 2023 05:31:42 +0000 Subject: [PATCH 08/65] fix --- oneflow/user/kernels/conv_cutlass_kernels.cu | 2 +- oneflow/user/kernels/cutlass_conv_tuner.cpp | 11 +++-------- oneflow/user/ops/conv_op.cpp | 6 +++++- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/oneflow/user/kernels/conv_cutlass_kernels.cu b/oneflow/user/kernels/conv_cutlass_kernels.cu index e30b25a2c09..f409ed4edb5 100644 --- a/oneflow/user/kernels/conv_cutlass_kernels.cu +++ b/oneflow/user/kernels/conv_cutlass_kernels.cu @@ -234,7 +234,7 @@ class Conv2dInt8CutlassKernel final : public user_op::OpKernel, public user_op:: configuraion.problem_size = problem_size; configuraion.stride_a = {c, w * c, h * w * c}; configuraion.stride_b = {c, s * c, r * s * c}; - configuraion.stride_c = {0, 0, 0}; + configuraion.stride_c = {k, q * k, p * q * k}; cutlass::library::ConvArguments arguments; arguments.A = in->dptr(); diff --git a/oneflow/user/kernels/cutlass_conv_tuner.cpp b/oneflow/user/kernels/cutlass_conv_tuner.cpp index 7299a3874f3..d8b3d7fa2c6 100644 --- a/oneflow/user/kernels/cutlass_conv_tuner.cpp +++ b/oneflow/user/kernels/cutlass_conv_tuner.cpp @@ -70,14 +70,9 @@ struct Conv2dOperationCacheKey { return IsStrideAligned(configuraion.stride_a, n) && IsStrideAligned(configuraion.stride_b, n) && IsStrideAligned(configuraion.stride_c, n); }; - if (IsAligned(8)) { - alignment = 8; - } else if (IsAligned(4)) { - alignment = 4; - } else if (IsAligned(2)) { - alignment = 2; - } else { - alignment = 1; + alignment = 128 / cutlass::library::sizeof_bits(functional_key.element_A); + for (; alignment > 1; alignment = alignment >> 1) { + if (IsAligned(alignment)) { break; } } } }; diff --git a/oneflow/user/ops/conv_op.cpp b/oneflow/user/ops/conv_op.cpp index 08d6e2e6714..7ba8cbc3adb 100644 --- a/oneflow/user/ops/conv_op.cpp +++ b/oneflow/user/ops/conv_op.cpp @@ -251,7 +251,11 @@ Maybe<void> CheckAttr_(const user_op::UserOpDefWrapper& def, } /* static */ Maybe<void> Conv2DOp::InferDataType(user_op::InferContext* ctx) { - ctx->SetOutputDType("out", 0, ctx->InputDType("in", 0)); + if
(ctx->InputDType("in", 0) == DataType::kInt8) { + ctx->SetOutputDType("out", 0, DataType::kInt32); + } else { + ctx->SetOutputDType("out", 0, ctx->InputDType("in", 0)); + } return Maybe<void>::Ok(); } From 19770b10236382f44dc66d97a346fbc8e8423f8e Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Tue, 22 Aug 2023 09:29:56 +0000 Subject: [PATCH 09/65] rm incompatible int8 conv implementation --- ...32fprop_optimized_s8_128x128_128x4_nhwc.cu | 83 ------------------- 1 file changed, 83 deletions(-) delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu deleted file mode 100644 index 2158fd6aa6b..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc.cu +++ /dev/null @@ -1,83 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit.
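At the Python level, the rule added to Conv2DOp::InferDataType above means an int8 convolution now infers an int32 output rather than int8. A rough sketch of the expected dtypes (shapes are invented here, and actually executing the int8 path still depends on the CUTLASS kernels being built and selected):

    import oneflow as flow

    x = flow.ones(1, 4, 8, 8).to(flow.int8)  # hypothetical NCHW input
    w = flow.ones(8, 4, 3, 3).to(flow.int8)  # hypothetical weight
    y = flow.nn.functional.conv2d(x, w)
    # with this patch the inferred output dtype is flow.int32, not flow.int8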
-namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc<4, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc<8, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc<16, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_128x128_128x4_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// From c8110e8dff97c30482f43e2a79f0bd9f0c3b0c9a Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Tue, 22 Aug 2023 13:16:25 +0000 Subject: [PATCH 10/65] half reduce_min and reduce_max --- oneflow/user/kernels/reduce_kernel.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/oneflow/user/kernels/reduce_kernel.cpp b/oneflow/user/kernels/reduce_kernel.cpp index 73910c6a258..944b75adccb 100644 --- a/oneflow/user/kernels/reduce_kernel.cpp +++ b/oneflow/user/kernels/reduce_kernel.cpp @@ -162,7 +162,8 @@ class ReduceKernel final : public user_op::OpKernel, public user_op::CudaGraphSu REGISTER_REDUCE_ARITHMETIC_KERNELS(device, int8_t) \ REGISTER_REDUCE_ARITHMETIC_KERNELS(device, uint8_t) \ REGISTER_REDUCE_ARITHMETIC_KERNELS(device, int32_t) \ - REGISTER_REDUCE_ARITHMETIC_KERNELS(device, int64_t) + REGISTER_REDUCE_ARITHMETIC_KERNELS(device, int64_t) \ + REGISTER_REDUCE_ARITHMETIC_KERNELS(device, float16) #define REGISTER_REDUCE_NANSUM_KERNELS_BY_DEVICE(device) \ REGISTER_REDUCE_NANSUM_KERNELS(device, float) \ From 30435f9c084f8598a8d52275ca63baaf5a0721cc Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 23 Aug 2023 04:47:18 +0000 Subject: [PATCH 11/65] fake dynamo --- python/oneflow/_dynamo/__init__.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 python/oneflow/_dynamo/__init__.py diff --git a/python/oneflow/_dynamo/__init__.py b/python/oneflow/_dynamo/__init__.py new file mode 100644 index 00000000000..1930fa95acd --- /dev/null +++ b/python/oneflow/_dynamo/__init__.py @@ -0,0 +1,18 @@ +import warnings + +# Reference: https://github.com/pytorch/pytorch/blob/v2.0.1/torch/_dynamo/__init__.py +__all__ = [ + "allow_in_graph", +] + +def allow_in_graph(fn): + """ + """ + if isinstance(fn, (list, tuple)): + return [allow_in_graph(x) for x in fn] + assert callable(fn), "allow_in_graph expects a callable" + warnings.warn( + "The oneflow._dynamo.allow_in_graph interface is just to align the torch._dynamo.allow_in_graph interface and has no practical significance." 
+ ) + return fn + From 69bbc0ee2bd9b8f6b33341f3eb5619b539a69879 Mon Sep 17 00:00:00 2001 From: oneflow-ci-bot Date: Wed, 23 Aug 2023 04:51:51 +0000 Subject: [PATCH 12/65] auto format by CI --- python/oneflow/_dynamo/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/oneflow/_dynamo/__init__.py b/python/oneflow/_dynamo/__init__.py index 1930fa95acd..dd1e5f072d6 100644 --- a/python/oneflow/_dynamo/__init__.py +++ b/python/oneflow/_dynamo/__init__.py @@ -1,3 +1,18 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" import warnings # Reference: https://github.com/pytorch/pytorch/blob/v2.0.1/torch/_dynamo/__init__.py From 6158db3ac6146e728c9b331a87757ef847b3e015 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 23 Aug 2023 05:28:54 +0000 Subject: [PATCH 13/65] warmup int8 conv algo --- .../cutlass_conv_tuning_warmup_pass.cpp | 94 ++++++++++++------- oneflow/user/kernels/cutlass_conv_tuner.cpp | 26 +++-- 2 files changed, 71 insertions(+), 49 deletions(-) diff --git a/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp b/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp index 7a5677ce4ad..9ff33a13f40 100644 --- a/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp +++ b/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp @@ -72,7 +72,9 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const if (conv2d_op.attr("groups") != 1) { return; } VLOG(3) << "Tuning " << op_conf.name(); const auto& in_desc = node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("in", 0))); - if (in_desc.data_type() != DataType::kFloat16) { return; } + if (in_desc.data_type() != DataType::kFloat16 && in_desc.data_type() != DataType::kInt8) { + return; + } const auto& weight_desc = node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("weight", 0))); const auto& out_desc = node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.output("out", 0))); @@ -94,21 +96,6 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const const int p = out_desc.shape().At(1); const int q = out_desc.shape().At(2); - cutlass::library::ConvFunctionalKey key( - cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop, - cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kF32, cutlass::library::NumericTypeID::kF32); - - const bool allow_half_accumulation = - ParseBooleanFromEnv("ONEFLOW_CONV_ALLOW_HALF_PRECISION_ACCUMULATION", false); - - if (allow_half_accumulation) { - key.element_accumulator = cutlass::library::NumericTypeID::kF16; - key.element_compute = cutlass::library::NumericTypeID::kF16; - } - const size_t x_size = GetCudaAlignedSize(in_desc.ByteSizeOfBlobBody()); const size_t w_size = 
GetCudaAlignedSize(weight_desc.ByteSizeOfBlobBody()); const size_t y_size = GetCudaAlignedSize(out_desc.ByteSizeOfBlobBody()); @@ -135,47 +122,86 @@ Maybe<void> CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const n, h, w, c, k, r, s, p, q, padding_before.at(0), padding_before.at(1), strides.at(0), strides.at(1), dilation_rate.at(0), dilation_rate.at(1), cutlass::conv::Mode::kCrossCorrelation); + cutlass::library::Conv2dConfiguration configuraion; configuraion.split_k_mode = cutlass::conv::SplitKMode::kSerial; configuraion.problem_size = problem_size; configuraion.stride_a = {c, w * c, h * w * c}; configuraion.stride_b = {c, s * c, r * s * c}; - configuraion.stride_c = {0, 0, 0}; + cutlass::library::ConvArguments arguments; arguments.A = x_ptr; arguments.B = w_ptr; arguments.reordered_B = nullptr; arguments.C = bias_ptr; arguments.D = y_ptr; + union SP { float f{}; half h; + int32_t i; }; - SP alpha; SP beta; + const cutlass::library::Operation* operation = nullptr; + + if (in_desc.data_type() == DataType::kFloat16) { + cutlass::library::ConvFunctionalKey key( + cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop, + cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kF32, cutlass::library::NumericTypeID::kF32); + + const bool allow_half_accumulation = + ParseBooleanFromEnv("ONEFLOW_CONV_ALLOW_HALF_PRECISION_ACCUMULATION", false); + if (allow_half_accumulation) { + key.element_accumulator = cutlass::library::NumericTypeID::kF16; + key.element_compute = cutlass::library::NumericTypeID::kF16; + } - configuraion.stride_c = {0, 0, 0}; + + if (allow_half_accumulation) { + alpha.h = static_cast<half>(1.0F); + if (bias_ptr == nullptr) { + beta.h = static_cast<half>(0.0F); + } else { + beta.h = static_cast<half>(1.0F); + } } else { + alpha.f = 1.0F; + if (bias_ptr == nullptr) { + beta.f = 0.0F; + } else { + beta.f = 1.0F; + } } + arguments.alpha = &alpha; + arguments.beta = &beta; + arguments.pointer_mode = cutlass::library::ScalarPointerMode::kHost; + operation = CutlassConvTuner::Get().FindConv2dOperation( + stream->As<ep::CudaStream>(), key, configuraion, arguments, workspace, kMaxWorkspaceSize); + } else if (in_desc.data_type() == DataType::kInt8) { + cutlass::library::ConvFunctionalKey key( + cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop, + cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kS32, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kS32, cutlass::library::NumericTypeID::kS32); + + configuraion.stride_c = {k, q * k, p * q * k}; + int32_t alpha = 1; + int32_t beta = 0; + arguments.alpha = &alpha; + arguments.beta = &beta; + arguments.pointer_mode = cutlass::library::ScalarPointerMode::kHost; + operation = CutlassConvTuner::Get().FindConv2dOperation( + stream->As<ep::CudaStream>(), key, configuraion, arguments, workspace, kMaxWorkspaceSize); } else { - alpha.f = 1.0F; - if (bias_ptr == nullptr) { - beta.f = 0.0F; - } else { - beta.f = 1.0F; - } + UNIMPLEMENTED() << "Only support float16 and int8 conv2d"; } - arguments.alpha = &alpha; -
arguments.beta = &beta; - arguments.pointer_mode = cutlass::library::ScalarPointerMode::kHost; - const cutlass::library::Operation* operation = CutlassConvTuner::Get().FindConv2dOperation( - stream->As<ep::CudaStream>(), key, configuraion, arguments, workspace, kMaxWorkspaceSize); if (operation != nullptr) { VLOG(3) << "Fastest operation: " << operation->description().name; nlohmann::json tuning_cache; diff --git a/oneflow/user/kernels/cutlass_conv_tuner.cpp b/oneflow/user/kernels/cutlass_conv_tuner.cpp index d8b3d7fa2c6..dfc9c17af04 100644 --- a/oneflow/user/kernels/cutlass_conv_tuner.cpp +++ b/oneflow/user/kernels/cutlass_conv_tuner.cpp @@ -33,21 +33,17 @@ namespace { bool IsWeakerAlginOperation(const cutlass::library::Operation* lhs, const cutlass::library::Operation* rhs) { - const char* lhs_name = lhs->description().name; - const char* rhs_name = rhs->description().name; - const size_t len = std::strlen(lhs_name); - const size_t suffix_len = std::strlen("align8"); - if (std::strlen(rhs_name) != len) { return false; } - if (len < suffix_len) { return false; } - const size_t prefix_len = len - suffix_len; - if (std::strncmp(lhs_name, rhs_name, prefix_len) != 0) { return false; } - const auto& HasLegalSuffix = [&](const char* str) { - if (std::strncmp(str + prefix_len, "align", std::strlen("align")) != 0) { return false; } - const char align = str[len - 1]; - return align == '8' || align == '4' || align == '2' || align == '1'; - }; - if ((!HasLegalSuffix(lhs_name)) || (!HasLegalSuffix(rhs_name))) { return false; } - return lhs_name[len - 1] < rhs_name[len - 1]; + const std::string lhs_name = lhs->description().name; + const std::string rhs_name = rhs->description().name; + size_t lhs_pos = lhs_name.rfind("align"); + if (lhs_pos == std::string::npos) { return false; } + size_t rhs_pos = rhs_name.rfind("align"); + if (rhs_pos == std::string::npos) { return false; } + if (lhs_name.substr(0, lhs_pos) != rhs_name.substr(0, rhs_pos)) { return false; } + size_t align_len = std::strlen("align"); + int lhs_alignment = std::atoi(lhs_name.substr(lhs_pos + align_len).c_str()); + int rhs_alignment = std::atoi(rhs_name.substr(rhs_pos + align_len).c_str()); + return lhs_alignment < rhs_alignment; } struct Conv2dOperationCacheKey { From 998c501f12c830953d30cf5928a80a510de55f24 Mon Sep 17 00:00:00 2001 From: strint Date: Wed, 23 Aug 2023 16:39:51 +0000 Subject: [PATCH 14/65] general argstree --- python/oneflow/framework/args_tree.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python/oneflow/framework/args_tree.py b/python/oneflow/framework/args_tree.py index afd38c9907b..50ccc17aa90 100644 --- a/python/oneflow/framework/args_tree.py +++ b/python/oneflow/framework/args_tree.py @@ -41,12 +41,13 @@ class NamedArg(object): named_input = NamedArg([NamedArg(1), NamedArg({key: NamedArg("value")})]) """ - def __init__(self, prefix="", name=None, global_index=0) -> None: + def __init__(self, prefix="", name=None, global_index=0, tensor_type=Tensor) -> None: self._name = name if name is not None else str(global_index) self._prefix = prefix self._global_index = global_index self._is_value_set = False self._value = None + self._tensor_type = tensor_type def prefix(self): return self._prefix @@ -86,13 +87,13 @@ def __repr__(self): repr_str += "LIST" elif _is_raw_type(self._value, dict) or _is_raw_type(self._value, OrderedDict): repr_str += "DICT" - elif isinstance(self._value, Tensor): + elif isinstance(self._value, self._tensor_type): repr_str += "TENSOR" elif self._value is None: repr_str += "NONE"
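With tensor_type threaded through NamedArg and ArgsTree, the tree walker is no longer hard-wired to oneflow's Tensor: any class can serve as the leaf "tensor" type. A small sketch using NumPy arrays as leaves (values here are illustrative only):

    import numpy as np
    from oneflow.framework.args_tree import ArgsTree

    # (args, kwargs) layout, the same shape nn.Graph passes its inputs in
    io_args = ((np.ones((2, 3)), {"bias": np.zeros(3)}), {})
    tree = ArgsTree(io_args, gen_name=True, root_prefix="_g_input", tensor_type=np.ndarray)

    for name, node in tree.iter_named_nodes():
        print(name, type(node.value()))  # ndarray leaves are now tagged TENSOR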
else: repr_str += "OPAQUE" - if isinstance(self._value, Tensor): + if isinstance(self._value, self._tensor_type): repr_str += ", value: " + self._value._meta_repr() elif ( _is_raw_type(self._value, dict) @@ -114,6 +115,7 @@ def __init__( gen_name: bool = False, root_prefix: str = "", root_name: str = None, + tensor_type = Tensor ) -> None: self._io_args = io_args @@ -122,6 +124,7 @@ def __init__( self._root_name = root_name self._named_io_args = None self._next_global_index = 0 + self._tensor_type = tensor_type if self._gen_name: self._named_io_args = self._construct_named_io_args( @@ -178,7 +181,7 @@ def iter_named_nodes(self): yield (named_node.prefix() + "_" + named_node.name(), named_node) def _construct_named_io_args(self, value, prefix: str, name: str) -> NamedArg: - arg = NamedArg(prefix, name, self._next_global_index) + arg = NamedArg(prefix, name, self._next_global_index, self._tensor_type) self._next_global_index += 1 if _is_raw_type(value, list) or _is_raw_type(value, tuple): @@ -219,7 +222,7 @@ def map_tuple_leaf(self, map_function: Callable): stack = [] # Cases handled: tuple(tensor, ...), such as input args. - if len(self._io_args) > 0 and isinstance(self._io_args[0], Tensor): + if len(self._io_args) > 0 and isinstance(self._io_args[0], self._tensor_type): for i in self._io_args: mapped_value = map_function(i) stack.append(mapped_value) @@ -233,7 +236,7 @@ def map_tuple_leaf(self, map_function: Callable): elif ( len(self._io_args) > 0 and isinstance(self._io_args[0], (tuple, list)) - and all(isinstance(arg, Tensor) for arg in self._io_args[0]) + and all(isinstance(arg, self._tensor_type) for arg in self._io_args[0]) ): for i in self._io_args[0]: mapped_value = map_function(i) From 44a37106f3b30e5e89b9cb9216ded4123728b3f6 Mon Sep 17 00:00:00 2001 From: oneflow-ci-bot Date: Wed, 23 Aug 2023 16:42:44 +0000 Subject: [PATCH 15/65] auto format by CI --- python/oneflow/_dynamo/__init__.py | 2 +- python/oneflow/framework/args_tree.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/oneflow/_dynamo/__init__.py b/python/oneflow/_dynamo/__init__.py index dd1e5f072d6..abc1eea891a 100644 --- a/python/oneflow/_dynamo/__init__.py +++ b/python/oneflow/_dynamo/__init__.py @@ -20,6 +20,7 @@ "allow_in_graph", ] + def allow_in_graph(fn): """ """ @@ -30,4 +31,3 @@ def allow_in_graph(fn): "The oneflow._dynamo.allow_in_graph interface is just to align the torch._dynamo.allow_in_graph interface and has no practical significance." 
) return fn - diff --git a/python/oneflow/framework/args_tree.py b/python/oneflow/framework/args_tree.py index 50ccc17aa90..a6e5cec23c9 100644 --- a/python/oneflow/framework/args_tree.py +++ b/python/oneflow/framework/args_tree.py @@ -41,7 +41,9 @@ class NamedArg(object): named_input = NamedArg([NamedArg(1), NamedArg({key: NamedArg("value")})]) """ - def __init__(self, prefix="", name=None, global_index=0, tensor_type=Tensor) -> None: + def __init__( + self, prefix="", name=None, global_index=0, tensor_type=Tensor + ) -> None: self._name = name if name is not None else str(global_index) self._prefix = prefix self._global_index = global_index @@ -115,7 +117,7 @@ def __init__( gen_name: bool = False, root_prefix: str = "", root_name: str = None, - tensor_type = Tensor + tensor_type=Tensor, ) -> None: self._io_args = io_args From 361940637d39520f6fa27bb822c5f8d47e541186 Mon Sep 17 00:00:00 2001 From: strint Date: Thu, 24 Aug 2023 04:42:30 +0000 Subject: [PATCH 16/65] refine repr of argstree --- python/oneflow/framework/args_tree.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/oneflow/framework/args_tree.py b/python/oneflow/framework/args_tree.py index 50ccc17aa90..d1645149831 100644 --- a/python/oneflow/framework/args_tree.py +++ b/python/oneflow/framework/args_tree.py @@ -93,15 +93,16 @@ def __repr__(self): repr_str += "NONE" else: repr_str += "OPAQUE" + if isinstance(self._value, self._tensor_type): - repr_str += ", value: " + self._value._meta_repr() + repr_str += ", value: tensor(" + str(self._value.shape) + ", " + str(self._value.dtype) + ")" elif ( _is_raw_type(self._value, dict) or _is_raw_type(self._value, OrderedDict) or _is_raw_type(self._value, list) or _is_raw_type(self._value, tuple) ): - pass + repr_str += ", value: " + repr(self._value) else: repr_str += ", value: " + repr(self._value) repr_str += ")" @@ -286,3 +287,9 @@ def _execute_mapping(self, value, map_function): mapped_value = map_function(value) return mapped_value + + def __repr__(self): + if self._named_io_args: + return self._named_io_args.__repr__() + else: + return str(self.__class__) From ccac35e2b7bb53c5f0e92793b91312f371438dc9 Mon Sep 17 00:00:00 2001 From: oneflow-ci-bot Date: Thu, 24 Aug 2023 04:44:44 +0000 Subject: [PATCH 17/65] auto format by CI --- python/oneflow/framework/args_tree.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/oneflow/framework/args_tree.py b/python/oneflow/framework/args_tree.py index ddd52038aef..50f4e6a7fcb 100644 --- a/python/oneflow/framework/args_tree.py +++ b/python/oneflow/framework/args_tree.py @@ -97,7 +97,13 @@ def __repr__(self): repr_str += "OPAQUE" if isinstance(self._value, self._tensor_type): - repr_str += ", value: tensor(" + str(self._value.shape) + ", " + str(self._value.dtype) + ")" + repr_str += ( + ", value: tensor(" + + str(self._value.shape) + + ", " + + str(self._value.dtype) + + ")" + ) elif ( _is_raw_type(self._value, dict) or _is_raw_type(self._value, OrderedDict) From 025d4c5027a153dbcaa851942367085009052bf3 Mon Sep 17 00:00:00 2001 From: strint Date: Thu, 24 Aug 2023 10:07:05 +0000 Subject: [PATCH 18/65] graph allow any input --- python/oneflow/nn/graph/graph.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/python/oneflow/nn/graph/graph.py b/python/oneflow/nn/graph/graph.py index 2b6aca3627c..b6b7bf2050f 100644 --- a/python/oneflow/nn/graph/graph.py +++ b/python/oneflow/nn/graph/graph.py @@ -1748,14 +1748,13 @@ def __build_io(self, io_type, 
build_func, *args, **kwargs): args_repr = [] tensor2op_name = {} - def build_tensor_or_none(tensor, name, repr_str): - assert tensor is None or (isinstance(tensor, Tensor)) + def build_tensor_or_any(tensor, name, repr_str): if isinstance(tensor, Tensor): build_arg = build_func(name, tensor) op_names.append(name) tensor2op_name[build_arg] = name else: - build_arg = None + build_arg = tensor args_repr.append(repr_str) self.__print(0, 1, repr_str) @@ -1771,18 +1770,13 @@ def leaf_arg_fn(arg): arg_repr = self.__io_item_check_and_gen_repr( arg.value(), Tensor, io_type, name ) - build_arg = build_tensor_or_none(arg.value(), name, arg_repr) + build_arg = build_tensor_or_any(arg.value(), name, arg_repr) return build_arg - elif arg.value() is None: - arg_repr = self.__io_item_check_and_gen_repr( - arg.value(), None, io_type, name - ) - build_arg = build_tensor_or_none(arg.value(), name, arg_repr) else: # Opaque - # Error arg_repr = self.__io_item_check_and_gen_repr( arg.value(), None, io_type, name ) + build_arg = build_tensor_or_any(arg.value(), name, arg_repr) + return build_arg out = args_tree.map_leaf(leaf_arg_fn) build_args = out[0] @@ -1792,7 +1786,7 @@ def __io_item_check_and_gen_repr(self, item, expect_type, io_type, name): assert io_type in ("input", "output") - if expect_type is None and item is None: + if expect_type is None: repr_str = ( "[WARNING](" + io_type.upper() + ":" + name + ":" + str(type(item)) + ")" ) + self.__print(1, 0, repr_str) return repr_str elif expect_type is not None and isinstance(item, expect_type): if isinstance(item, Tensor): From 354cf21972f1fe343ba12bf96586a7a56d08b380 Mon Sep 17 00:00:00 2001 From: strint Date: Thu, 24 Aug 2023 11:50:11 +0000 Subject: [PATCH 19/65] save support any input --- python/oneflow/nn/graph/graph.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/python/oneflow/nn/graph/graph.py b/python/oneflow/nn/graph/graph.py index b6b7bf2050f..7093c228b72 100644 --- a/python/oneflow/nn/graph/graph.py +++ b/python/oneflow/nn/graph/graph.py @@ -1826,27 +1826,21 @@ def __io_item_check_and_gen_repr(self, item, expect_type, io_type, name): def __map_io(self, io_type, func, *args, **kwargs): assert io_type in ("input", "output") - def mapping_tensor_or_none(tensor): - assert tensor is None or (isinstance(tensor, Tensor)) + def mapping_tensor_or_any(tensor): if isinstance(tensor, Tensor): mapped_arg = func(tensor) else: - mapped_arg = None + mapped_arg = tensor return mapped_arg def leaf_arg_fn(arg): arg_value = arg.value() - if isinstance(arg_value, Tensor) or arg_value is None: - return mapping_tensor_or_none(arg_value) - else: - self.__io_item_check( - arg_value, None, io_type, arg.prefix() + "_" + arg.name(), - ) + return mapping_tensor_or_any(arg_value) # NOTE(lixiang): Reduce the overhead of traversal and parsing of io args.
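The build_tensor_or_any change above, together with mapping_tensor_or_any in this commit, lets nn.Graph accept inputs that are neither Tensor nor None; such "OPAQUE" values are passed through untouched instead of being rejected. A minimal sketch of what this enables (the non-Tensor argument is captured at trace time rather than fed as a runtime graph input):

    import oneflow as flow

    class ScaleGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()

        def build(self, x, scale):
            # `scale` may now be a plain Python float; it is baked into the
            # traced graph instead of being treated as a tensor input.
            return x * scale

    g = ScaleGraph()
    y = g(flow.ones(2, 3), 0.5)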
if self._is_simple_tuple_output or self._is_simple_tuple_input: args_tree = ArgsTree(args, False) - out = args_tree.map_tuple_leaf(mapping_tensor_or_none) + out = args_tree.map_tuple_leaf(mapping_tensor_or_any) return out, kwargs args_tree = ArgsTree( From 99b3d602935edcabc8b5e6cd7e3e060c8b41fe89 Mon Sep 17 00:00:00 2001 From: jackalcooper Date: Fri, 25 Aug 2023 17:27:59 +0800 Subject: [PATCH 20/65] fix --- oneflow/core/functional/impl/nn_functor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 995cc937bdb..3cf3b4aa9f9 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -5223,6 +5223,7 @@ class GroupedMatmulFunctor { Maybe operator()(const TensorTuple& xs, const TensorTuple& weights) const { const int64_t input_size = xs.size(); const int64_t weight_size = weights.size(); + CHECK_LT_OR_RETURN(input_size, kMaxInputCount); CHECK_GE_OR_RETURN(input_size, 1) << Error::RuntimeError() << "The number of xs should be greater equal than 1."; CHECK_EQ_OR_RETURN(weight_size, input_size) From 8347ed8972f777c55d4cc919645438252bf1cffa Mon Sep 17 00:00:00 2001 From: jackalcooper Date: Fri, 25 Aug 2023 18:05:59 +0800 Subject: [PATCH 21/65] fix gsize>64 --- oneflow/user/kernels/grouped_matmul_bias.cu | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/oneflow/user/kernels/grouped_matmul_bias.cu b/oneflow/user/kernels/grouped_matmul_bias.cu index c23d9c925b8..2022fbec012 100644 --- a/oneflow/user/kernels/grouped_matmul_bias.cu +++ b/oneflow/user/kernels/grouped_matmul_bias.cu @@ -190,7 +190,13 @@ class GroupedMatmulBiasKernel final : public user_op::OpKernel, public user_op:: } void* workspace = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr(); for (const auto& group : groups) { - ApplyGroup(group.first, group.second, has_biases, workspace, ctx->stream()); + for (size_t i = 0; i < group.second.size(); i += kMaxProblemBatch) { + std::vector> ptrs( + {group.second.begin() + i, + group.second.begin() + i + + std::min(group.second.size() - i, kMaxProblemBatch)}); + ApplyGroup(group.first, ptrs, has_biases, workspace, ctx->stream()); + } } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } From dde053c84718531a0202756f4c58c14f04b7010c Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Sat, 26 Aug 2023 09:01:04 +0000 Subject: [PATCH 22/65] compile with cutlass extension --- cmake/third_party.cmake | 7 +- cmake/third_party/cutlass-extension.cmake | 60 +++++++++++++ cmake/third_party/cutlass.cmake | 20 ----- ...832fprop_optimized_s8_128x32_128x4_nhwc.cu | 83 ----------------- ...6832fprop_optimized_s8_128x64_64x4_nhwc.cu | 83 ----------------- ...6832fprop_optimized_s8_128x64_64x6_nhwc.cu | 83 ----------------- ...832fprop_optimized_s8_32x128_128x4_nhwc.cu | 83 ----------------- ...816fprop_optimized_s8_128x128_32x2_nhwc.cu | 89 ------------------- ...816fprop_optimized_s8_128x128_64x2_nhwc.cu | 89 ------------------- ...816fprop_optimized_s8_128x256_32x2_nhwc.cu | 89 ------------------- ...816fprop_optimized_s8_128x256_64x2_nhwc.cu | 89 ------------------- ...8816fprop_optimized_s8_128x32_32x2_nhwc.cu | 88 ------------------ ...8816fprop_optimized_s8_128x32_64x2_nhwc.cu | 88 ------------------ ...8816fprop_optimized_s8_128x64_32x2_nhwc.cu | 88 ------------------ ...8816fprop_optimized_s8_128x64_64x2_nhwc.cu | 88 ------------------ ...8816fprop_optimized_s8_64x128_32x2_nhwc.cu | 88 ------------------ 
...8816fprop_optimized_s8_64x128_64x2_nhwc.cu | 88 ------------------ ...i8816fprop_optimized_s8_64x32_32x2_nhwc.cu | 88 ------------------ ...i8816fprop_optimized_s8_64x32_64x2_nhwc.cu | 88 ------------------ ...i8816fprop_optimized_s8_64x64_32x2_nhwc.cu | 88 ------------------ ...i8816fprop_optimized_s8_64x64_64x2_nhwc.cu | 88 ------------------ .../kernels/cutlass/external_singleton.cpp | 34 ------- .../user/kernels/cutlass/external_singleton.h | 61 ------------- oneflow/user/kernels/cutlass_conv_tuner.cpp | 17 ++-- 24 files changed, 77 insertions(+), 1690 deletions(-) create mode 100644 cmake/third_party/cutlass-extension.cmake delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu delete mode 100644 oneflow/user/kernels/cutlass/external_singleton.cpp delete mode 100644 oneflow/user/kernels/cutlass/external_singleton.h diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 832b91b2a44..e05a6b46734 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -144,6 +144,7 @@ if(BUILD_CUDA) endif() include(nccl) include(cutlass) + include(cutlass-extension) include(trt_flash_attention) list(APPEND oneflow_third_party_libs ${NCCL_LIBRARIES}) @@ -157,11 +158,15 @@ if(BUILD_CUDA) if(WITH_CUTLASS) list(APPEND oneflow_third_party_dependencies cutlass) - list(APPEND oneflow_third_party_dependencies cutlass_copy_extra_library_to_destination) list(APPEND oneflow_third_party_dependencies cutlass_copy_examples_to_destination) list(APPEND oneflow_third_party_libs 
${CUTLASS_LIBRARIES}) list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUTLASS_INCLUDE_DIR}) endif() + if(WITH_CUTLASS_EXTENSION) + list(APPEND oneflow_third_party_dependencies cutlass-extension) + list(APPEND oneflow_third_party_libs ${CUTLASS_EXTENSION_LIBRARIES}) + list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUTLASS_EXTENSION_INCLUDE_DIR}) + endif() list(APPEND oneflow_third_party_dependencies trt_flash_attention) list(APPEND oneflow_third_party_libs ${TRT_FLASH_ATTENTION_LIBRARIES}) list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${TRT_FLASH_ATTENTION_INCLUDE_DIR}) diff --git a/cmake/third_party/cutlass-extension.cmake b/cmake/third_party/cutlass-extension.cmake new file mode 100644 index 00000000000..fbe3b0ff749 --- /dev/null +++ b/cmake/third_party/cutlass-extension.cmake @@ -0,0 +1,60 @@ +include(ExternalProject) + +set(WITH_CUTLASS_EXTENSION OFF CACHE BOOL "") + +if(WITH_CUTLASS_EXTENSION) + + add_definitions(-DWITH_CUTLASS_EXTENSION) + + find_package(Threads) + + set(CUTLASS_EXTENSION_PROJECT cutlass-extension) + + set(CUTLASS_EXTENSION_INSTALL_DIR ${THIRD_PARTY_DIR}/cutlass-extension) + + set(CUTLASS_EXTENSION_INCLUDE_DIR ${CUTLASS_EXTENSION_INSTALL_DIR}/include CACHE PATH "" FORCE) + set(CUTLASS_EXTENSION_LIBRARY_DIR ${CUTLASS_EXTENSION_INSTALL_DIR}/lib CACHE PATH "" FORCE) + set(CUTLASS_EXTENSION_LIBRARIES ${CUTLASS_EXTENSION_LIBRARY_DIR}/libcutlass_extension.so) + set(CUTLASS_EXTENSION_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cutlass-extension/src/cutlass-extension/) + set(CUTLASS_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cutlass/src/cutlass) + + foreach(arch ${CUDA_REAL_ARCHS_LIST}) + if(arch GREATER_EQUAL 70) + list(APPEND CUTLASS_REAL_ARCHS ${arch}) + endif() + endforeach() + + if(THIRD_PARTY) + ExternalProject_Add( + ${CUTLASS_EXTENSION_PROJECT} + PREFIX cutlass-extension + GIT_REPOSITORY https://github.com/Oneflow-Inc/oneflow-cutlass-extension.git + GIT_TAG master + UPDATE_COMMAND "" + BUILD_BYPRODUCTS ${CUTLASS_EXTENSION_LIBRARIES} + CMAKE_ARGS -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_DEBUG:STRING=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_CXX_FLAGS_RELEASE:STRING=${CMAKE_CXX_FLAGS_RELEASE} + CMAKE_CACHE_ARGS + -DCMAKE_CUDA_COMPILER:STRING=${CUDAToolkit_NVCC_EXECUTABLE} + -DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER} + -DCMAKE_INSTALL_PREFIX:PATH=${CUTLASS_EXTENSION_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${CUTLASS_EXTENSION_LIBRARY_DIR} + -DCMAKE_INSTALL_MESSAGE:STRING=${CMAKE_INSTALL_MESSAGE} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DCUTLASS_ENABLE_EXAMPLES:BOOL=OFF + -DCUTLASS_ENABLE_PROFILER:BOOL=OFF + -DCUTLASS_ENABLE_LIBRARY:BOOL=ON + -DCUTLASS_NVCC_ARCHS:STRING=${CUTLASS_REAL_ARCHS} + -DCUTLASS_ENABLE_TESTS:BOOL=OFF + -DCUTLASS_UNITY_BUILD_ENABLED:BOOL=ON + -DCUTLASS_LIBRARY_DEBUG_POSTFIX:STRING= + -DCUTLASS_NVCC_EMBED_PTX:BOOL=OFF + -DCUTLASS_DIR:STRING=${CUTLASS_SOURCE_DIR} + DEPENDS cutlass + ) + + endif(THIRD_PARTY) +endif(WITH_CUTLASS_EXTENSION) diff --git a/cmake/third_party/cutlass.cmake b/cmake/third_party/cutlass.cmake index 20114d13006..46f1a9132ec 100644 --- a/cmake/third_party/cutlass.cmake +++ b/cmake/third_party/cutlass.cmake @@ -60,26 +60,6 @@ if(WITH_CUTLASS) -DCUTLASS_LIBRARY_DEBUG_POSTFIX:STRING= -DCUTLASS_NVCC_EMBED_PTX:BOOL=OFF) - add_custom_target(cutlass_copy_extra_library_to_destination DEPENDS cutlass) - set(CUTLASS_SOURCE_LIBRARY_DIR ${CUTLASS_SOURCE_DIR}/tools/library) - 
set(CUTLASS_INSTALL_EXTRA_LIBRARY_FILES - "src/conv2d_operation.h" - "src/conv3d_operation.h" - "src/gemm_operation_3x.hpp" - "src/gemm_operation.h" - "src/library_internal.h" - "src/rank_2k_operation.h" - "src/rank_k_operation.h" - "src/symm_operation.h" - "src/trmm_operation.h" - ) - foreach(filename ${CUTLASS_INSTALL_EXTRA_LIBRARY_FILES}) - add_custom_command( - TARGET cutlass_copy_extra_library_to_destination - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CUTLASS_SOURCE_LIBRARY_DIR}/${filename} - ${CUTLASS_INSTALL_DIR}/include/cutlass/library/${filename}) - endforeach() - add_custom_target(cutlass_copy_examples_to_destination DEPENDS cutlass) set(CUTLASS_SOURCE_EXAMPLES_DIR ${CUTLASS_SOURCE_DIR}/examples) diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu deleted file mode 100644 index 134ed531251..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc.cu +++ /dev/null @@ -1,83 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. 
-*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance -// "cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc" -template -using cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<128, 32, 128>, cutlass::gemm::GemmShape<32, 32, 128>, - cutlass::gemm::GemmShape<16, 8, 32>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 4, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc - : public cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc_base {}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc<4, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc<8, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc<16, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_128x32_128x4_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu deleted file mode 100644 index 193a14eebf2..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc.cu +++ /dev/null @@ -1,83 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance -// "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc" -template -using cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<128, 64, 64>, cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<16, 8, 32>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 4, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc - : public cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc_base {}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc<4, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc<8, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc<16, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x4_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu deleted file mode 
100644 index a8eed9605d3..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc.cu +++ /dev/null @@ -1,83 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance -// "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc" -template -using cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<128, 64, 64>, cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<16, 8, 32>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 6, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc - : public cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc_base {}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc<4, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc<8, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc<16, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_128x64_64x6_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace 
library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu deleted file mode 100644 index e858ccabf7f..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc.cu +++ /dev/null @@ -1,83 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance -// "cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc" -template -using cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<32, 128, 128>, cutlass::gemm::GemmShape<32, 32, 128>, - cutlass::gemm::GemmShape<16, 8, 32>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 4, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc - : public cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc_base {}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc<4, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc<8, 4>>( - 
"cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc<16, 4>>( - "cutlass_tensorop_i16832fprop_optimized_s8_32x128_128x4_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu deleted file mode 100644 index 5330d7642cf..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc.cu +++ /dev/null @@ -1,89 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance -// "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 32>, cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_base {}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances 
-ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc<8, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x128_32x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu deleted file mode 100644 index 9ae47d1b84b..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc.cu +++ /dev/null @@ -1,89 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. 
-*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance -// "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_base {}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc<8, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x128_64x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu deleted file mode 100644 index 04928ede0e7..00000000000 --- 
a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc.cu +++ /dev/null @@ -1,89 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance -// "cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 32>, cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_base {}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc<8, 4>>( - 
"cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x256_32x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu deleted file mode 100644 index 1b8ccb4fab8..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc.cu +++ /dev/null @@ -1,89 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance -// "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_base {}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances 
-ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc<8, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x256_64x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu deleted file mode 100644 index 07ef45ffe39..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. 
-*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 32>, cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_base { -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc<8, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x32_32x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu deleted file mode 100644 index d9969233748..00000000000 --- 
a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_base { -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc<8, 4>>( - 
"cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x32_64x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu deleted file mode 100644 index b69aeb78db8..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_base { -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) 
{ - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc<8, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x64_32x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu deleted file mode 100644 index d545b826aae..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. 
-*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_base { -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc<8, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_128x64_64x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu deleted file mode 100644 index 7d796cf86a6..00000000000 --- 
a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_base { -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc<8, 4>>( - 
"cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x128_32x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu deleted file mode 100644 index b206262821c..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_base { -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) 
{ - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc<8, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x128_64x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu deleted file mode 100644 index 82bfb19a303..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. 
-*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_base { -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc<8, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x32_32x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu deleted file mode 100644 index 85c2aae0ba7..00000000000 --- 
a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_base { -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc<8, 4>>( - 
"cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x32_64x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu deleted file mode 100644 index 2622afa25db..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_base { -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - 
manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc<8, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x64_32x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu b/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu deleted file mode 100644 index cc9142b2c45..00000000000 --- a/oneflow/user/kernels/cutlass/conv2d/cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* - Generated by conv2d_operation.py - Do not edit. 
-*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/cutlass.h" -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" - -#include "cutlass/library/src/conv2d_operation.h" -#include "cutlass/library/src/library_internal.h" - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2dFprop Optimized kernel instance "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc" -template -using cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - int8_t, cutlass::layout::TensorNHWC, int8_t, cutlass::layout::TensorNHWC, int32_t, - cutlass::layout::TensorNHWC, int32_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombination, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle< - 4>, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, - 2, cutlass::arch::OpMultiplyAddSaturate, cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, Alignment, Alignment>::Kernel; - -// Derived class -template -struct cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc - : public cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_base { -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -using Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc = - cutlass::conv::device::ImplicitGemmConvolution< - cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc>; - -namespace cutlass { -namespace library { - -// Initialize all instances -ONEFLOW_CUTLASS_MANIFEST(manifest) { - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc<1, 1>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_align1")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc<2, 2>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_align2")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc<4, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_align4")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc<8, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_align8")); - manifest.append(new cutlass::library::Conv2dOperation< - Operation_cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc<16, 4>>( - "cutlass_tensorop_i8816fprop_optimized_s8_64x64_64x2_nhwc_align16")); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace library -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/oneflow/user/kernels/cutlass/external_singleton.cpp b/oneflow/user/kernels/cutlass/external_singleton.cpp deleted file mode 100644 index 07df60938ab..00000000000 --- a/oneflow/user/kernels/cutlass/external_singleton.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. 
All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#ifdef WITH_CUTLASS - -#include "oneflow/user/kernels/cutlass/external_singleton.h" - -namespace cutlass { -namespace library { - -ExternalSingleton::ExternalSingleton() {} - -ExternalSingleton& ExternalSingleton::get() { - static ExternalSingleton instance; - return instance; -} - -} // namespace library -} // namespace cutlass - -#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/cutlass/external_singleton.h b/oneflow/user/kernels/cutlass/external_singleton.h deleted file mode 100644 index 2d8d52a20b7..00000000000 --- a/oneflow/user/kernels/cutlass/external_singleton.h +++ /dev/null @@ -1,61 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#ifdef WITH_CUTLASS - -#include - -#include "cutlass/library/library.h" -#include "cutlass/library/manifest.h" -#include "cutlass/library/operation_table.h" - -#include "oneflow/core/common/util.h" - -namespace cutlass { -namespace library { - -/// Singleton instance stores a Manifest and Operation table -class ExternalSingleton { - public: - /// Manifest object - std::vector manifest; - - /// Operation table referencing the Manifest - OperationTable operation_table; - - public: - ExternalSingleton(); - - static ExternalSingleton& get(); -}; - -#define ONEFLOW_CUTLASS_MANIFEST(m) ONEFLOW_CUTLASS_MANIFEST_IMPL(m, __COUNTER__) - -#define ONEFLOW_CUTLASS_MANIFEST_IMPL(m, uuid) \ - static void OF_PP_CAT(_cutlass_manifest_, uuid)(Manifest & m); \ - static int OF_PP_CAT(_cutlass_manifest_dummy_, uuid) = []() { \ - auto& manifest = ExternalSingleton::get().manifest; \ - manifest.resize(manifest.size() + 1); \ - OF_PP_CAT(_cutlass_manifest_, uuid)(manifest.back()); \ - ExternalSingleton::get().operation_table.append(manifest.back()); \ - return 0; \ - }(); \ - void OF_PP_CAT(_cutlass_manifest_, uuid)(Manifest & m) - -} // namespace library -} // namespace cutlass - -#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/cutlass_conv_tuner.cpp b/oneflow/user/kernels/cutlass_conv_tuner.cpp index dfc9c17af04..ae0dd574fba 100644 --- a/oneflow/user/kernels/cutlass_conv_tuner.cpp +++ b/oneflow/user/kernels/cutlass_conv_tuner.cpp @@ -25,7 +25,9 @@ limitations under the License. 
 #include 
 #include 
-#include "oneflow/user/kernels/cutlass/external_singleton.h"
+#ifdef WITH_CUTLASS_EXTENSION
+#include 
+#endif  // WITH_CUTLASS_EXTENSION
 
 namespace oneflow {
 
@@ -226,13 +228,16 @@ const cutlass::library::Operation* CutlassConvTuner::Impl::FindConv2dOperation(
   const cutlass::library::Operation* fastest_operation = nullptr;
   float fastest_time = 0;
   const auto& operations_map = [&]() {
-    const auto& external_operations_map_it =
-        cutlass::library::ExternalSingleton::get().operation_table.conv2d_operations.find(
+#ifdef WITH_CUTLASS_EXTENSION
+    const auto& extension_operations_map_it =
+        cutlass::library::CutlassExtensionSingleton::get().operation_table.conv2d_operations.find(
             functional_key);
-    if (external_operations_map_it
-        != cutlass::library::ExternalSingleton::get().operation_table.conv2d_operations.cend()) {
-      return external_operations_map_it->second;
+    if (extension_operations_map_it
+        != cutlass::library::CutlassExtensionSingleton::get()
+               .operation_table.conv2d_operations.cend()) {
+      return extension_operations_map_it->second;
     }
+#endif  // WITH_CUTLASS_EXTENSION
     const auto& operations_map_it =
         cutlass::library::Singleton::get().operation_table.conv2d_operations.find(functional_key);
     CHECK(operations_map_it

From cb84faf1fe1cd33b2aed2006b37c4f6fcde808e9 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Sat, 26 Aug 2023 10:02:28 +0000
Subject: [PATCH 23/65] fix

---
 oneflow/user/kernels/cutlass_conv_tuner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/oneflow/user/kernels/cutlass_conv_tuner.cpp b/oneflow/user/kernels/cutlass_conv_tuner.cpp
index ae0dd574fba..df746bba5f2 100644
--- a/oneflow/user/kernels/cutlass_conv_tuner.cpp
+++ b/oneflow/user/kernels/cutlass_conv_tuner.cpp
@@ -26,7 +26,7 @@ limitations under the License.
 #include 
 
 #ifdef WITH_CUTLASS_EXTENSION
-#include 
+#include 
 #endif  // WITH_CUTLASS_EXTENSION
 
 namespace oneflow {

From 5558ed3dc7c868729cce2a30edcf0a6066ce88be Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Sun, 27 Aug 2023 07:49:38 +0000
Subject: [PATCH 24/65] add conv2d quant op

---
 oneflow/core/functional/functional_api.yaml  |   7 +
 oneflow/core/functional/impl/nn_functor.cpp  |  65 +++++
 oneflow/ir/include/OneFlow/OneFlowUserOps.td |  37 +++
 .../ir/lib/OneFlow/Transform/AutoNHWCOps.cpp |  34 +++
 oneflow/user/ops/conv_op.cpp                 |   6 +-
 oneflow/user/ops/conv_quant_op.cpp           | 239 ++++++++++++++++++
 6 files changed, 383 insertions(+), 5 deletions(-)
 create mode 100644 oneflow/user/ops/conv_quant_op.cpp

diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml
index 3323db0e93c..59f5d7789a9 100644
--- a/oneflow/core/functional/functional_api.yaml
+++ b/oneflow/core/functional/functional_api.yaml
@@ -1051,6 +1051,13 @@
       String channel_pos="channels_first") => Conv2d'
   bind_python: True
 
+- name: "conv2d_quant"
+  signature:
+    'Tensor (Tensor input, Tensor weight, Tensor input_zero_point, Tensor scale=None, Tensor bias=None, Int32List[2] stride=1,
+     Int32List[2] padding=0, Int32List[2] dilation=1, Int32 groups=1,
+     String channel_pos="channels_first", DataType output_dtype=None) => Conv2dQuant'
+  bind_python: True
+
 - name: "conv3d"
   signature:
     'Tensor (Tensor input, Tensor weight, Tensor bias=None, Int32List[3] stride=1,

diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp
index 995cc937bdb..43b04cf7d29 100644
--- a/oneflow/core/functional/impl/nn_functor.cpp
+++ b/oneflow/core/functional/impl/nn_functor.cpp
@@ -163,6 +163,70 @@ class Conv3dFunctor : public ConvBaseFunctor {
   }
 };
 
+class ConvQuantBaseFunctor {
+ public:
+  explicit ConvQuantBaseFunctor(const int& num_spatial_dims)
+      : num_spatial_dims_(num_spatial_dims) {}
+  virtual ~ConvQuantBaseFunctor() = default;
+
+  Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& input,
+                           const std::shared_ptr<one::Tensor>& weight,
+                           const std::shared_ptr<one::Tensor>& input_zero_point,
+                           const Optional<one::Tensor>& scale, const Optional<one::Tensor>& bias,
+                           const std::vector<int32_t>& stride, const std::vector<int32_t>& padding,
+                           const std::vector<int32_t>& dilation, const int32_t& groups,
+                           const std::string& channel_pos,
+                           const Optional<Symbol<DType>>& output_dtype) const {
+    if (scale || bias) {
+      CHECK_OR_RETURN(scale && bias) << "scale and bias must both be given or not.";
+    }
+    std::vector<int32_t> kernel_size_vec(num_spatial_dims_);
+    int32_t kernel_idx_offset = 2;
+    if (channel_pos == "channels_last") { kernel_idx_offset = 1; }
+
+    for (int i = 0; i < num_spatial_dims_; i++) {
+      kernel_size_vec.at(i) = ((weight->shape())->At(i + kernel_idx_offset));
+    }
+    auto& conv_attrs =
+        THREAD_CACHED_MUTABLE_ATTR_MAP("filters", "kernel_size", "padding_before", "strides",
+                                       "dilation_rate", "groups", "data_format", "out_dtype");
+    conv_attrs.SetAllAttrs(static_cast<int32_t>(weight->shape()->At(0)), kernel_size_vec, padding,
+                           stride, dilation, groups, channel_pos,
+                           output_dtype.value_or(DType::Float())->data_type());
+    if (scale) {
+      return OpInterpUtil::Dispatch<Tensor>(
+          *conv_scale_bias_op_, {input, weight, input_zero_point, JUST(scale), JUST(bias)},
+          conv_attrs);
+    }
+    return OpInterpUtil::Dispatch<Tensor>(*conv_op_, {input, weight, input_zero_point}, conv_attrs);
+  }
+
+ protected:
+  std::shared_ptr<OpExpr> conv_op_;
+  std::shared_ptr<OpExpr> conv_scale_bias_op_;
+  int32_t num_spatial_dims_;
+};
+
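+// Usage sketch: ConvQuantBaseFunctor dispatches to the five-input fused op
+// expression when a (scale, bias) pair is supplied, and to the plain
+// three-input op otherwise. An illustrative Python-side call through the
+// generated binding (tensor names here are hypothetical; the argument order
+// follows the functional_api.yaml entry above) might look like:
+//
+//   out = flow._C.conv2d_quant(x_int8, w_int8, x_zero_point, scale, bias,
+//                              stride=[1, 1], padding=[1, 1], dilation=[1, 1],
+//                              groups=1, channel_pos="channels_last",
+//                              output_dtype=flow.float16)
+//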
+class Conv2dQuantFunctor : public ConvQuantBaseFunctor {
+ public:
+  Conv2dQuantFunctor() : ConvQuantBaseFunctor(/*num_spatial_dims_=*/2) {
+    conv_op_ = CHECK_JUST(one::OpBuilder("conv2d_quant")
+                              .Input("in")
+                              .Input("weight")
+                              .Input("in_zero_point")
+                              .Output("out")
+                              .Build());
+    conv_scale_bias_op_ = CHECK_JUST(one::OpBuilder("conv2d_quant")
+                                         .Input("in")
+                                         .Input("weight")
+                                         .Input("in_zero_point")
+                                         .Input("scale")
+                                         .Input("bias")
+                                         .Output("out")
+                                         .Build());
+  }
+};
+
 class DeConvBaseFunctor {
  public:
   explicit DeConvBaseFunctor(const int& num_spatial_dims) : num_spatial_dims_(num_spatial_dims) {
@@ -5428,6 +5492,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
   m.add_functor<impl::DeConv1dFunctor>("Deconv1d");
   m.add_functor<impl::DeConv2dFunctor>("Deconv2d");
   m.add_functor<impl::DeConv3dFunctor>("Deconv3d");
+  m.add_functor<impl::Conv2dQuantFunctor>("Conv2dQuant");
   m.add_functor<impl::EmbeddingReNormFunctor>("EmbeddingReNorm");
   m.add_functor<impl::EmbeddingFunctor>("Embedding");
   m.add_functor<impl::MatMulFunctor>("MatMul");

diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td
index 05f5ea56bc3..d6de6a8f341 100644
--- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td
+++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td
@@ -8299,6 +8299,43 @@ def OneFlow_FusedLinearWithGroupwiseQuantizedWeightOp : OneFlow_BaseOp<"fused_li
   let has_data_type_infer_fn = 1;
 }
 
+def OneFlow_Conv2DQuantOp : OneFlow_BaseOp<"conv2d_quant", [NoMemoryEffect, AttrSizedOperandSegments, DeclareOpInterfaceMethods<UserOpCompatibleInterface>, DeclareOpInterfaceMethods<NCHWCompatibleInterface>]> {
+  let summary = "OneFlow fused convolution quant operation";
+  let description = [{
+    "The convolution operator consumes an input tensor and a filter, and"
+    "computes the output."
+  }];
+  let input = (ins
+    OneFlow_Tensor:$in,
+    OneFlow_Tensor:$weight,
+    OneFlow_Tensor:$in_zero_point,
+    Optional<OneFlow_Tensor>:$scale,
+    Optional<OneFlow_Tensor>:$bias,
+    Optional<OneFlow_Tensor>:$_add_to_output
+  );
+  let output = (outs OneFlow_Tensor:$out);
+  let attrs = (ins
+    DefaultValuedAttr<SI32Attr, "0">:$filters,
+    SI32ArrayAttr:$padding_before,
+    StrAttr:$data_format,
+    SI32ArrayAttr:$kernel_size,
+    SI32ArrayAttr:$strides,
+    SI32ArrayAttr:$dilation_rate,
+    DefaultValuedAttr<SI32Attr, "1">:$groups,
+    OneFlow_DataType:$out_dtype,
+    DefaultValuedAttr<StrAttr, "\"\"">:$tuning_cache
+  );
+  let trait_attrs = (ins
+    DenseI32ArrayAttr:$operand_segment_sizes
+  );
+  let has_check_fn = 1;
+  let has_logical_tensor_desc_infer_fn = 1;
+  let has_physical_tensor_desc_infer_fn = 1;
+  let has_get_sbp_fn = 1;
+  let has_data_type_infer_fn = 1;
+  let has_compute_complexity_fn = 1;
+}
+
 #endif // GET_ONEFLOW_QUANTIZATION_OP_DEFINITIONS
 
diff --git a/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp b/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp
index 1c763027b37..28455f78618 100644
--- a/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp
+++ b/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp
@@ -51,6 +51,40 @@ llvm::SmallVector<Value, 4> Conv2DOp::NchwToNhwc(llvm::SmallVector<Value, 4> val
   return results;
 }
 
+bool Conv2DQuantOp::IsNCHW() { return this->getDataFormat().str() == "channels_first"; }
+
+llvm::DenseSet<Value> Conv2DQuantOp::OperandsToTranspose() {
+  if (this->get_addToOutput()) {
+    return {this->getIn(), this->getWeight(), this->get_addToOutput()};
+  } else {
+    return {this->getIn(), this->getWeight()};
+  }
+}
+
+llvm::DenseSet<Value> Conv2DQuantOp::ResultsToTranspose() { return {this->getOut()}; }
+
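+// NchwToNhwc below rebuilds the op with the transposed in/weight (and, when
+// present, _add_to_output) values, while the quantization-side operands
+// (in_zero_point, scale, bias) are carried over untouched: they are
+// per-tensor/per-channel parameters with no spatial layout. The operand list
+// it assembles is, in order:
+//   {in_nhwc, weight_nhwc, in_zero_point[, scale, bias][, _add_to_output_nhwc]}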
+llvm::SmallVector<Value, 4> Conv2DQuantOp::NchwToNhwc(llvm::SmallVector<Value, 4> value,
+                                                      PatternRewriter& rewriter) {
+  auto conv_quant_op = *this;
+  SmallVector<Value, 4> operands;
+  operands.push_back(value[0]);
+  operands.push_back(value[1]);
+  operands.push_back(conv_quant_op.getInZeroPoint());
+  if (conv_quant_op.getScale()) operands.push_back(conv_quant_op.getScale());
+  if (conv_quant_op.getBias()) operands.push_back(conv_quant_op.getBias());
+  if (this->get_addToOutput()) { operands.push_back(value[2]); }
+  NamedAttrList attributes = conv_quant_op->getAttrs();
+  attributes.set(conv_quant_op.getDataFormatAttrName(), rewriter.getStringAttr("channels_last"));
+  auto res =
+      rewriter
+          .create<oneflow::Conv2DQuantOp>(conv_quant_op.getLoc(),
+                                          getNHWCResultTypes(conv_quant_op), operands, attributes)
+          ->getResults();
+  llvm::SmallVector<Value, 4> results;
+  results.push_back(res[0]);
+  return results;
+}
+
 bool BiasAddOp::IsNCHW() { return this->getAxisAttr().getValue().getSExtValue() == 1; }
 
 llvm::DenseSet<Value> BiasAddOp::OperandsToTranspose() { return {this->getA()}; }

diff --git a/oneflow/user/ops/conv_op.cpp b/oneflow/user/ops/conv_op.cpp
index 7ba8cbc3adb..08d6e2e6714 100644
--- a/oneflow/user/ops/conv_op.cpp
+++ b/oneflow/user/ops/conv_op.cpp
@@ -251,11 +251,7 @@ Maybe<void> CheckAttr_(const user_op::UserOpDefWrapper& def,
 }
 
 /* static */ Maybe<void> Conv2DOp::InferDataType(user_op::InferContext* ctx) {
-  if (ctx->InputDType("in", 0) == DataType::kInt8) {
-    ctx->SetOutputDType("out", 0, DataType::kInt32);
-  } else {
-    ctx->SetOutputDType("out", 0, ctx->InputDType("in", 0));
-  }
+  ctx->SetOutputDType("out", 0, ctx->InputDType("in", 0));
   return Maybe<void>::Ok();
 }
 
diff --git a/oneflow/user/ops/conv_quant_op.cpp b/oneflow/user/ops/conv_quant_op.cpp
new file mode 100644
index 00000000000..c6c59706f46
--- /dev/null
+++ b/oneflow/user/ops/conv_quant_op.cpp
@@ -0,0 +1,239 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/user/ops/nn_util.h"
+#include "oneflow/core/framework/op_generated.h"
+
+namespace oneflow {
+
+namespace {
+
+template<size_t NDims>
+Maybe<void> InferTensorDesc4Conv(user_op::InferContext* ctx) {
+  const user_op::TensorDesc& in = ctx->InputTensorDesc("in", 0);
+  CHECK_EQ_OR_RETURN(NDims + 2, in.shape().NumAxes())
+      << "Conv" << NDims << "D op's input shape ndim should equal to " << NDims + 2
+      << " ,but got: " << in.shape().NumAxes();
+
+  auto data_format = ctx->Attr<std::string>("data_format");
+  auto kernel_size = ctx->Attr<std::vector<int32_t>>("kernel_size");
+  CHECK_EQ_OR_RETURN(NDims, kernel_size.size());
+  int32_t filters = ctx->Attr<int32_t>("filters");
+  size_t idx_offset = IdxOffset(data_format);
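+  // The output extent inferred below for each spatial dim follows the usual
+  // cross-correlation arithmetic; a sketch, assuming CalcConvOut (nn_util.h)
+  // implements the standard formula with symmetric padding:
+  //   out = (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1
+  // e.g. in = 56, k = 3, pad = 1, dilation = 1, stride = 1 -> out = 56.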
+  {
+    const auto& padding_before = ctx->Attr<std::vector<int32_t>>("padding_before");
+    auto dilation_rate = ctx->Attr<std::vector<int32_t>>("dilation_rate");
+    auto strides = ctx->Attr<std::vector<int32_t>>("strides");
+    CHECK_EQ_OR_RETURN(NDims, dilation_rate.size());
+    CHECK_EQ_OR_RETURN(NDims, strides.size());
+    CHECK_EQ_OR_RETURN(NDims, padding_before.size());
+
+    user_op::TensorDesc* out = ctx->MutOutputTensorDesc("out", 0);
+    DimVector out_shape(NDims + 2);
+    out_shape.at(0) = in.shape().At(0);
+    const size_t c_dim = data_format == "channels_first" ? 1 : NDims + 1;
+    out_shape.at(c_dim) = filters;
+    for (int32_t i = 0; i < NDims; ++i) {
+      JUST(CalcConvOut(in.shape().At(idx_offset + i), kernel_size.at(i), dilation_rate.at(i),
+                       strides.at(i), padding_before.at(i), &out_shape.at(idx_offset + i)));
+    }
+    out->set_is_dynamic(in.is_dynamic());
+    out->set_shape(Shape(out_shape));
+    if (data_format == "channels_last") {
+      out->set_memory_format(MemoryFormat::kChannelsLast);
+    } else {
+      out->set_memory_format(MemoryFormat::kContiguous);
+    }
+  }
+
+  {
+    int32_t groups = ctx->Attr<int32_t>("groups");
+    CHECK_GT_OR_RETURN(groups, 0);
+    CHECK_LE_OR_RETURN(groups, filters);
+    CHECK_EQ_OR_RETURN(filters % groups, 0);
+
+    DimVector weight_shape(in.shape().dim_vec());
+    weight_shape.at(0) = filters;
+    if (data_format == "channels_first") {
+      CHECK_LE_OR_RETURN(groups, weight_shape.at(1));
+      CHECK_EQ_OR_RETURN(weight_shape.at(1) % groups, 0);
+      weight_shape.at(1) = weight_shape.at(1) / groups;
+    } else if (data_format == "channels_last") {
+      CHECK_LE_OR_RETURN(groups, weight_shape.at(NDims + 1));
+      CHECK_EQ_OR_RETURN(weight_shape.at(NDims + 1) % groups, 0);
+      weight_shape.at(NDims + 1) = weight_shape.at(NDims + 1) / groups;
+    } else {
+      UNIMPLEMENTED_THEN_RETURN();
+    }
+    for (size_t i = 0; i < NDims; ++i) { weight_shape.at(idx_offset + i) = kernel_size.at(i); }
+
+    const user_op::TensorDesc& weight = ctx->InputTensorDesc("weight", 0);
+    CHECK_EQ_OR_RETURN(weight.shape(), Shape(weight_shape));
+  }
+
+  bool has_scale = ctx->has_input("scale", 0);
+  if (has_scale) {
+    const user_op::TensorDesc& scale = ctx->InputTensorDesc("scale", 0);
+    CHECK_EQ_OR_RETURN(scale.shape(), Shape({filters}));
+  }
+  bool has_bias = ctx->has_input("bias", 0);
+  if (has_bias) {
+    const user_op::TensorDesc& bias = ctx->InputTensorDesc("bias", 0);
+    CHECK_EQ_OR_RETURN(bias.shape(), Shape({filters}));
+  }
+  if (has_scale || has_bias) { CHECK_OR_RETURN(has_scale && has_bias); }
+  return Maybe<void>::Ok();
+}
+
+Maybe<void> GetSbpSignatures4Conv(user_op::SbpContext* ctx) {
+  bool has_bias = false;
+  for (const auto& pair : ctx->inputs()) {
+    if (pair.first == "bias") {
+      CHECK_EQ_OR_RETURN(0, pair.second);
+      has_bias = true;
+      break;
+    }
+  }
+
+  if (has_bias) {
+    ctx->NewBuilder()
+        .Split(ctx->inputs(), 0)
+        .Split(user_op::OpArg("in", 0), 0)
+        .Broadcast(user_op::OpArg("weight", 0))
+        .Broadcast(user_op::OpArg("in_zero_point", 0))
+        .Broadcast(user_op::OpArg("scale", 0))
+        .Broadcast(user_op::OpArg("bias", 0))
+        .Split(user_op::OpArg("out", 0), 0)
+        .Build();
+  } else {
+    ctx->NewBuilder()
+        .Split(ctx->inputs(), 0)
+        .Split(user_op::OpArg("in", 0), 0)
+        .Broadcast(user_op::OpArg("weight", 0))
+        .Broadcast(user_op::OpArg("in_zero_point", 0))
+        .Split(user_op::OpArg("out", 0), 0)
+        .Build();
+  }
+  return Maybe<void>::Ok();
+}
+
+/*
+Example for conv2d:
+
+ComputationCost
+= ((k*k + k*k-1)*c + c-1 + bias?1:0) * out_channel * out_width * out_height * batch_size
+= (2*k*k*c - 1 + bias?1:0) * out_channel * out_width * out_height * batch_size
+≈ 2*k*k*c * out_channel * out_width * out_height * batch_size
+*/
+Maybe<double> ConvComputationCost(user_op::ComputeComplexityFnContext* ctx) {
+  const std::vector<int32_t> kernel_size = ctx->Attr<std::vector<int32_t>>("kernel_size");
+  const std::string data_format = ctx->Attr<std::string>("data_format");
+  const user_op::TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("in", 0);
+  const size_t c_dim = data_format == "channels_first" ? 1 : in->shape().NumAxes() - 1;
+  const int32_t c = in->shape().At(c_dim);
+  const user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("out", 0);
+  double cost =
+      std::accumulate(kernel_size.begin(), kernel_size.end(), 1.0, std::multiplies<double>());
+  cost = cost * 2 * c;
+  cost *= std::accumulate(out->shape().dim_vec().begin(), out->shape().dim_vec().end(), 1.0,
+                          std::multiplies<double>());
+
+  const auto& parallel_hierarchy = ctx->parallel_desc().hierarchy();
+  const auto& nd_sbp_out = ctx->NdSbp4ArgNameAndIndex("out", 0);
+  for (int32_t dim_sbp = 0; dim_sbp < nd_sbp_out.sbp_parallel_size(); dim_sbp++) {
+    if (nd_sbp_out.sbp_parallel(dim_sbp).has_split_parallel()) {
+      cost /= parallel_hierarchy->At(dim_sbp);
+    }
+  }
+  return cost;
+}
+
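+// Plugging illustrative numbers into the estimate above: k = 3, c = 64 and an
+// output of batch_size = 8, 56 x 56 spatial, out_channel = 128 give
+// 2 * 3 * 3 * 64 * (8 * 56 * 56 * 128) ≈ 3.7e9 multiply-adds, before the sbp
+// loop divides the cost across split dimensions.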
+template<size_t NDims>
+Maybe<void> CheckAttr_(const user_op::UserOpDefWrapper& def,
+                       const user_op::UserOpConfWrapper& conf) {
+  bool is_checked = true;
+  std::stringstream err;
+  err << "Illegal value for " << conf.op_type_name() << " op " << conf.op_name() << ": ";
+
+  const auto& data_format = conf.attr<std::string>("data_format");
+  if (!(data_format == "channels_first" || data_format == "channels_last")) {
+    err << " data_format:" << data_format;
+    is_checked = false;
+  }
+
+  if (NDims != 0) {
+    const auto& padding_before = conf.attr<std::vector<int32_t>>("padding_before");
+    if (padding_before.size() != NDims) {
+      err << " padding_before: number of element is " << padding_before.size();
+      is_checked = false;
+    }
+
+    const auto& kernel_size = conf.attr<std::vector<int32_t>>("kernel_size");
+    if (kernel_size.size() != NDims) {
+      err << " kernel_size: number of element is " << kernel_size.size();
+      is_checked = false;
+    }
+
+    const auto& strides = conf.attr<std::vector<int32_t>>("strides");
+    if (strides.size() != NDims) {
+      err << " strides: number of element is " << strides.size();
+      is_checked = false;
+    }
+
+    const auto& dilation_rate = conf.attr<std::vector<int32_t>>("dilation_rate");
+    if (dilation_rate.size() != NDims) {
+      err << " dilation_rate: number of element is " << dilation_rate.size();
+      is_checked = false;
+    }
+  }
+
+  if (is_checked) {
+    return Maybe<void>::Ok();
+  } else {
+    return oneflow::Error::CheckFailedError() << err.str();
+  }
+}
+
+}  // namespace
+
+/* static */ Maybe<void> Conv2DQuantOp::InferLogicalTensorDesc(user_op::InferContext* ctx) {
+  return InferTensorDesc4Conv<2>(ctx);
+}
+
+/*static*/ Maybe<void> Conv2DQuantOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) {
+  return InferLogicalTensorDesc(ctx);
+}
+
+/* static */ Maybe<void> Conv2DQuantOp::GetSbp(user_op::SbpContext* ctx) {
+  return GetSbpSignatures4Conv(ctx);
+}
+
+/* static */ Maybe<double> Conv2DQuantOp::GetComputeComplexity(
+    user_op::ComputeComplexityFnContext* ctx) {
+  return ConvComputationCost(ctx);
+}
+
+/* static */ Maybe<void> Conv2DQuantOp::CheckAttr(const user_op::UserOpDefWrapper& def,
+                                                  const user_op::UserOpConfWrapper& conf) {
+  return CheckAttr_<2>(def, conf);
+}
+
+/* static */ Maybe<void> Conv2DQuantOp::InferDataType(user_op::InferContext* ctx) {
+  ctx->SetOutputDType("out", 0, ctx->Attr<DataType>("out_dtype"));
+  return Maybe<void>::Ok();
+}
+
+}  // namespace oneflow

From a947f44087be3289e2ecc1f3d741352e5c668a6c Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Sun, 27 Aug 2023 15:29:15 +0000
Subject: [PATCH 25/65] add conv2d quant kernel

---
 .../cutlass_conv_tuning_warmup_pass.cpp       |  94 ++--
 oneflow/user/kernels/conv_cutlass_kernels.cu  | 128 +-----
 oneflow/user/kernels/conv_quant_kernels.cu    | 178 ++++++++
 .../cutlass_conv2d_operation_cache_key.h      | 156 +++++++
 oneflow/user/kernels/cutlass_conv_tuner.cpp   | 393 ----------------
 oneflow/user/kernels/cutlass_conv_tuner.h     |  42 +-
 .../user/kernels/cutlass_conv_tuner_impl.cpp  |
432 ++++++++++++++++++ .../user/kernels/cutlass_conv_tuner_impl.h | 54 +++ python/oneflow/nn/functional/__init__.py | 1 + 9 files changed, 880 insertions(+), 598 deletions(-) create mode 100644 oneflow/user/kernels/conv_quant_kernels.cu create mode 100644 oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h delete mode 100644 oneflow/user/kernels/cutlass_conv_tuner.cpp create mode 100644 oneflow/user/kernels/cutlass_conv_tuner_impl.cpp create mode 100644 oneflow/user/kernels/cutlass_conv_tuner_impl.h diff --git a/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp b/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp index 9ff33a13f40..5edc79bb4c5 100644 --- a/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp +++ b/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp @@ -72,9 +72,7 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const if (conv2d_op.attr("groups") != 1) { return; } VLOG(3) << "Tuning " << op_conf.name(); const auto& in_desc = node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("in", 0))); - if (in_desc.data_type() != DataType::kFloat16 && in_desc.data_type() != DataType::kInt8) { - return; - } + if (in_desc.data_type() != DataType::kFloat16) { return; } const auto& weight_desc = node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("weight", 0))); const auto& out_desc = node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.output("out", 0))); @@ -96,6 +94,21 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const const int p = out_desc.shape().At(1); const int q = out_desc.shape().At(2); + cutlass::library::ConvFunctionalKey key( + cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop, + cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kF32, cutlass::library::NumericTypeID::kF32); + + const bool allow_half_accumulation = + ParseBooleanFromEnv("ONEFLOW_CONV_ALLOW_HALF_PRECISION_ACCUMULATION", false); + + if (allow_half_accumulation) { + key.element_accumulator = cutlass::library::NumericTypeID::kF16; + key.element_compute = cutlass::library::NumericTypeID::kF16; + } + const size_t x_size = GetCudaAlignedSize(in_desc.ByteSizeOfBlobBody()); const size_t w_size = GetCudaAlignedSize(weight_desc.ByteSizeOfBlobBody()); const size_t y_size = GetCudaAlignedSize(out_desc.ByteSizeOfBlobBody()); @@ -122,86 +135,47 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const n, h, w, c, k, r, s, p, q, padding_before.at(0), padding_before.at(1), strides.at(0), strides.at(1), dilation_rate.at(0), dilation_rate.at(1), cutlass::conv::Mode::kCrossCorrelation); - cutlass::library::Conv2dConfiguration configuraion; configuraion.split_k_mode = cutlass::conv::SplitKMode::kSerial; configuraion.problem_size = problem_size; configuraion.stride_a = {c, w * c, h * w * c}; configuraion.stride_b = {c, s * c, r * s * c}; - + configuraion.stride_c = {0, 0, 0}; cutlass::library::ConvArguments arguments; arguments.A = x_ptr; arguments.B = w_ptr; arguments.reordered_B = nullptr; arguments.C = bias_ptr; arguments.D = y_ptr; - union SP { float f{}; half h; - int32_t i; }; + SP alpha; SP beta; - const cutlass::library::Operation* operation = nullptr; - - if (in_desc.data_type() == DataType::kFloat16) { - cutlass::library::ConvFunctionalKey 
key( - cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop, - cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kF32, cutlass::library::NumericTypeID::kF32); - - const bool allow_half_accumulation = - ParseBooleanFromEnv("ONEFLOW_CONV_ALLOW_HALF_PRECISION_ACCUMULATION", false); - if (allow_half_accumulation) { - key.element_accumulator = cutlass::library::NumericTypeID::kF16; - key.element_compute = cutlass::library::NumericTypeID::kF16; - } - configuraion.stride_c = {0, 0, 0}; - - if (allow_half_accumulation) { - alpha.h = static_cast(1.0F); - if (bias_ptr == nullptr) { - beta.h = static_cast(0.0F); - } else { - beta.h = static_cast(1.0F); - } + if (allow_half_accumulation) { + alpha.h = static_cast(1.0F); + if (bias_ptr == nullptr) { + beta.h = static_cast(0.0F); } else { - alpha.f = 1.0F; - if (bias_ptr == nullptr) { - beta.f = 0.0F; - } else { - beta.f = 1.0F; - } + beta.h = static_cast(1.0F); } - arguments.alpha = α - arguments.beta = β - arguments.pointer_mode = cutlass::library::ScalarPointerMode::kHost; - operation = CutlassConvTuner::Get().FindConv2dOperation( - stream->As(), key, configuraion, arguments, workspace, kMaxWorkspaceSize); - } else if (in_desc.data_type() == DataType::kInt8) { - cutlass::library::ConvFunctionalKey key( - cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop, - cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kS32, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kS32, cutlass::library::NumericTypeID::kS32); - - configuraion.stride_c = {k, q * k, p * q * k}; - int32_t alpha = 1; - int32_t beta = 0; - arguments.alpha = α - arguments.beta = β - arguments.pointer_mode = cutlass::library::ScalarPointerMode::kHost; - operation = CutlassConvTuner::Get().FindConv2dOperation( - stream->As(), key, configuraion, arguments, workspace, kMaxWorkspaceSize); } else { - UNIMPLEMENTED() << "Only support float16 and int8 conv2d"; + alpha.f = 1.0F; + if (bias_ptr == nullptr) { + beta.f = 0.0F; + } else { + beta.f = 1.0F; + } } + arguments.alpha = α + arguments.beta = β + arguments.pointer_mode = cutlass::library::ScalarPointerMode::kHost; + const cutlass::library::Operation* operation = CutlassConvTuner().FindConv2dOperation( + stream->As(), key, configuraion, arguments, workspace, kMaxWorkspaceSize); if (operation != nullptr) { VLOG(3) << "Fastest operation: " << operation->description().name; nlohmann::json tuning_cache; diff --git a/oneflow/user/kernels/conv_cutlass_kernels.cu b/oneflow/user/kernels/conv_cutlass_kernels.cu index f409ed4edb5..5591f204ed9 100644 --- a/oneflow/user/kernels/conv_cutlass_kernels.cu +++ b/oneflow/user/kernels/conv_cutlass_kernels.cu @@ -142,14 +142,14 @@ class Conv2dCutlassKernel final : public user_op::OpKernel, public user_op::Cuda if (it == tuning_cache_object.end()) { return nullptr; } if (!it->is_string()) { return nullptr; } const std::string name = *it; - return CutlassConvTuner::Get().GetConv2dOperation(name, stream, key, configuraion, arguments, - tmp_buffer->mut_dptr(), - tmp_buffer->shape_view().elem_cnt()); + return CutlassConvTuner().GetConv2dOperation(name, stream, key, 
configuraion, arguments, + tmp_buffer->mut_dptr(), + tmp_buffer->shape_view().elem_cnt()); }(); if (!operation) { - operation = CutlassConvTuner::Get().FindConv2dOperation(stream, key, configuraion, arguments, - tmp_buffer->mut_dptr(), - tmp_buffer->shape_view().elem_cnt()); + operation = CutlassConvTuner().FindConv2dOperation(stream, key, configuraion, arguments, + tmp_buffer->mut_dptr(), + tmp_buffer->shape_view().elem_cnt()); } CHECK(operation != nullptr); @@ -180,122 +180,6 @@ REGISTER_USER_KERNEL("conv2d") }) .SetPriority(user_op::kKernelPriorityOptimized); -class Conv2dInt8CutlassKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - Conv2dInt8CutlassKernel() = default; - ~Conv2dInt8CutlassKernel() override = default; - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); - const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - CHECK(add_to_output == nullptr); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const auto& padding_before = ctx->Attr>("padding_before"); - const auto& dilation_rate = ctx->Attr>("dilation_rate"); - const auto& strides = ctx->Attr>("strides"); - - const int n = in->shape_view().At(0); - const int h = in->shape_view().At(1); - const int w = in->shape_view().At(2); - const int c = in->shape_view().At(3); - - const int k = weight->shape_view().At(0); - const int r = weight->shape_view().At(1); - const int s = weight->shape_view().At(2); - CHECK_EQ(weight->shape_view().At(3), c); - - const int p = out->shape_view().At(1); - const int q = out->shape_view().At(2); - - auto* stream = ctx->stream()->As(); - - cutlass::library::ConvFunctionalKey key( - cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop, - cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kS32, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kS32, cutlass::library::NumericTypeID::kS32); - - cutlass::conv::Conv2dProblemSize problem_size( - n, h, w, c, k, r, s, p, q, padding_before.at(0), padding_before.at(1), strides.at(0), - strides.at(1), dilation_rate.at(0), dilation_rate.at(1), - cutlass::conv::Mode::kCrossCorrelation); - cutlass::library::Conv2dConfiguration configuraion; - configuraion.split_k_mode = cutlass::conv::SplitKMode::kSerial; - configuraion.problem_size = problem_size; - configuraion.stride_a = {c, w * c, h * w * c}; - configuraion.stride_b = {c, s * c, r * s * c}; - configuraion.stride_c = {k, q * k, p * q * k}; - - cutlass::library::ConvArguments arguments; - arguments.A = in->dptr(); - arguments.B = weight->dptr(); - arguments.reordered_B = nullptr; - if (bias == nullptr) { - arguments.C = nullptr; - } else { - arguments.C = bias->dptr(); - } - arguments.D = out->mut_dptr(); - - int32_t alpha = 1; - int32_t beta = 0; - arguments.alpha = α - arguments.beta = β - arguments.pointer_mode = 
cutlass::library::ScalarPointerMode::kHost; - const cutlass::library::Operation* operation = nullptr; - operation = [&]() -> const cutlass::library::Operation* { - const std::string& tuning_cache = ctx->Attr("tuning_cache"); - if (tuning_cache.empty()) { return nullptr; } - auto tuning_cache_object = nlohmann::json::parse(tuning_cache); - if (!tuning_cache_object.is_object()) { return nullptr; } - auto it = tuning_cache_object.find("cutlass"); - if (it == tuning_cache_object.end()) { return nullptr; } - if (!it->is_string()) { return nullptr; } - const std::string name = *it; - return CutlassConvTuner::Get().GetConv2dOperation(name, stream, key, configuraion, arguments, - tmp_buffer->mut_dptr(), - tmp_buffer->shape_view().elem_cnt()); - }(); - if (!operation) { - operation = CutlassConvTuner::Get().FindConv2dOperation(stream, key, configuraion, arguments, - tmp_buffer->mut_dptr(), - tmp_buffer->shape_view().elem_cnt()); - } - - CHECK(operation != nullptr); - const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion); - std::vector host_workspace(host_workspace_size, 0); - auto init_status = operation->initialize(&configuraion, host_workspace.data(), - tmp_buffer->mut_dptr(), stream->cuda_stream()); - CHECK(init_status == cutlass::Status::kSuccess); - auto run_status = operation->run(&arguments, host_workspace.data(), tmp_buffer->mut_dptr(), - stream->cuda_stream()); - CHECK(run_status == cutlass::Status::kSuccess); - } -}; - -REGISTER_USER_KERNEL("conv2d") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) - && (user_op::HobAttr("data_format") == "channels_last") - && (user_op::HobAttr("groups") == 1) - && (user_op::HobDataType("in", 0) == DataType::kInt8)) - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { - // use static workspace size - return 128 * 1024 * 1024; - }) - .SetPriority(user_op::kKernelPriorityOptimized); - } // namespace } // namespace oneflow diff --git a/oneflow/user/kernels/conv_quant_kernels.cu b/oneflow/user/kernels/conv_quant_kernels.cu new file mode 100644 index 00000000000..8402aec059b --- /dev/null +++ b/oneflow/user/kernels/conv_quant_kernels.cu @@ -0,0 +1,178 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_CUTLASS_EXTENSION + +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/cuda/cuda_stream.h" +#include "oneflow/core/job/lazy_mode.h" +#include "oneflow/user/kernels/cutlass_conv_tuner.h" + +#include +#include +#include + +namespace oneflow { + +namespace { + +template +void LaunchConvQuantOpImpl(user_op::KernelComputeContext* ctx, + const cutlass::library::ConvFunctionalKey& key, + const Configuration& configuraion, const Arguments& arguments) { + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + auto* stream = ctx->stream()->As(); + + const cutlass::library::Operation* operation = nullptr; + operation = [&]() -> const cutlass::library::Operation* { + const std::string& tuning_cache = ctx->Attr("tuning_cache"); + if (tuning_cache.empty()) { return nullptr; } + auto tuning_cache_object = nlohmann::json::parse(tuning_cache); + if (!tuning_cache_object.is_object()) { return nullptr; } + auto it = tuning_cache_object.find("cutlass"); + if (it == tuning_cache_object.end()) { return nullptr; } + if (!it->is_string()) { return nullptr; } + const std::string name = *it; + return CutlassConvTuner().GetConv2dOperation(name, stream, key, configuraion, arguments, + tmp_buffer->mut_dptr(), + tmp_buffer->shape_view().elem_cnt()); + }(); + if (!operation) { + operation = CutlassConvTuner().FindConv2dOperation(stream, key, configuraion, arguments, + tmp_buffer->mut_dptr(), + tmp_buffer->shape_view().elem_cnt()); + } + + CHECK(operation != nullptr); + const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion); + std::vector host_workspace(host_workspace_size, 0); + auto init_status = operation->initialize(&configuraion, host_workspace.data(), + tmp_buffer->mut_dptr(), stream->cuda_stream()); + CHECK(init_status == cutlass::Status::kSuccess); + auto run_status = operation->run(&arguments, host_workspace.data(), tmp_buffer->mut_dptr(), + stream->cuda_stream()); + CHECK(run_status == cutlass::Status::kSuccess); +} + +void LaunchConv2dQuantScaleBiasFusionOp(user_op::KernelComputeContext* ctx, + const cutlass::library::ConvFunctionalKey& key, + const cutlass::conv::Conv2dProblemSize& problem_size, + const user_op::Tensor* in, const user_op::Tensor* weight, + const user_op::Tensor* in_zero_point, + const user_op::Tensor* scale, const user_op::Tensor* bias, + user_op::Tensor* out) { + cutlass::library::Conv2dScaleBiasFusionConfiguration configuraion; + configuraion.split_k_mode = cutlass::conv::SplitKMode::kSerial; + configuraion.problem_size = problem_size; + configuraion.stride_a = {problem_size.C, problem_size.W * problem_size.C, + problem_size.H * problem_size.W * problem_size.C}; + configuraion.stride_b = {problem_size.C, problem_size.S * problem_size.C, + problem_size.R * problem_size.S * problem_size.C}; + + cutlass::library::ConvScaleBiasFusionArguments arguments; + arguments.A = in->dptr(); + arguments.B = weight->dptr(); + arguments.reordered_B = nullptr; + arguments.P = in_zero_point->dptr(); + arguments.Scale = scale->dptr(); + arguments.Bias = bias->dptr(); + arguments.D = out->mut_dptr(); + + LaunchConvQuantOpImpl(ctx, key, configuraion, arguments); +} + +class Conv2dQuantKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + Conv2dQuantKernel() = default; + ~Conv2dQuantKernel() override = default; + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } 
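+  // Compute() below follows the same pattern as LaunchConvQuantOpImpl above:
+  // it builds an int8 NHWC CUTLASS functional key, adjusts element_C /
+  // element_compute to match the output dtype, and dispatches to the
+  // scale-bias fusion launcher whenever a quantization scale input is given.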
+ + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + const user_op::Tensor* in_zero_point = ctx->Tensor4ArgNameAndIndex("in_zero_point", 0); + const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + CHECK(add_to_output == nullptr); + + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + + const auto& padding_before = ctx->Attr>("padding_before"); + const auto& dilation_rate = ctx->Attr>("dilation_rate"); + const auto& strides = ctx->Attr>("strides"); + + const int n = in->shape_view().At(0); + const int h = in->shape_view().At(1); + const int w = in->shape_view().At(2); + const int c = in->shape_view().At(3); + + const int k = weight->shape_view().At(0); + const int r = weight->shape_view().At(1); + const int s = weight->shape_view().At(2); + CHECK_EQ(weight->shape_view().At(3), c); + + const int p = out->shape_view().At(1); + const int q = out->shape_view().At(2); + + cutlass::library::ConvFunctionalKey key( + cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop, + cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kS32, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kS32, cutlass::library::NumericTypeID::kS32); + if (out->data_type() == DataType::kFloat) { + key.element_C = cutlass::library::NumericTypeID::kF32; + key.element_compute = cutlass::library::NumericTypeID::kF32; + } else if (out->data_type() == DataType::kFloat16) { + key.element_C = cutlass::library::NumericTypeID::kF16; + key.element_compute = cutlass::library::NumericTypeID::kF32; + } + cutlass::conv::Conv2dProblemSize problem_size( + n, h, w, c, k, r, s, p, q, padding_before.at(0), padding_before.at(1), strides.at(0), + strides.at(1), dilation_rate.at(0), dilation_rate.at(1), + cutlass::conv::Mode::kCrossCorrelation); + if (scale) { + LaunchConv2dQuantScaleBiasFusionOp(ctx, key, problem_size, in, weight, in_zero_point, scale, + bias, out); + } else { + UNIMPLEMENTED(); + } + } +}; + +REGISTER_USER_KERNEL("conv2d_quant") + .SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) + && (user_op::HobAttr("data_format") == "channels_last") + && (user_op::HobAttr("groups") == 1) + && (user_op::HobDataType("in", 0) == DataType::kInt8)) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { + // use static workspace size + return 128 * 1024 * 1024; + }) + .SetPriority(user_op::kKernelPriorityOptimized); + +} // namespace + +} // namespace oneflow + +#endif // WITH_CUTLASS_EXTENSION diff --git a/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h b/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h new file mode 100644 index 00000000000..91e55cc3e51 --- /dev/null +++ b/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h @@ -0,0 +1,156 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifdef WITH_CUTLASS + +#ifndef ONEFLOW_USER_KERNELS_CUTLASS_CONV2D_OPERATION_CACHE_KEY_H_ +#define ONEFLOW_USER_KERNELS_CUTLASS_CONV2D_OPERATION_CACHE_KEY_H_ + +#include "oneflow/core/framework/framework.h" + +#include +#include + +#ifdef WITH_CUTLASS_EXTENSION +#include +#endif // WITH_CUTLASS_EXTENSION + +namespace oneflow { + +struct Conv2dOperationCacheKey { + cutlass::library::ConvFunctionalKey functional_key; + cutlass::library::Conv2dConfiguration configuraion; + size_t alignment; + Conv2dOperationCacheKey(cutlass::library::ConvFunctionalKey functional_key, + cutlass::library::Conv2dConfiguration configuraion, + cutlass::library::ConvArguments arguments) + : functional_key(functional_key), configuraion(configuraion) { + const auto IsStrideAligned = [&](const std::vector& stride, size_t n) { + return std::all_of(stride.cbegin(), stride.cend(), + [&](const int64_t& s) { return s % n == 0; }); + }; + CHECK_EQ(reinterpret_cast(arguments.A) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.B) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.C) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.D) % kCudaAlignSize, 0); + const auto IsAligned = [&](size_t n) { + return IsStrideAligned(configuraion.stride_a, n) && IsStrideAligned(configuraion.stride_b, n) + && IsStrideAligned(configuraion.stride_c, n); + }; + alignment = 128 / cutlass::library::sizeof_bits(functional_key.element_A); + for (; alignment > 1; alignment = alignment >> 1) { + if (IsAligned(alignment)) { break; } + } + } + +#ifdef WITH_CUTLASS_EXTENSION + Conv2dOperationCacheKey(cutlass::library::ConvFunctionalKey functional_key, + const cutlass::library::Conv2dScaleBiasFusionConfiguration& config, + const cutlass::library::ConvScaleBiasFusionArguments& arguments) + : functional_key(functional_key) { + configuraion.problem_size = config.problem_size; + configuraion.stride_a = config.stride_a; + configuraion.stride_b = config.stride_b; + configuraion.stride_c = {0, 0, 0}; + const auto IsStrideAligned = [&](const std::vector& stride, size_t n) { + return std::all_of(stride.cbegin(), stride.cend(), + [&](const int64_t& s) { return s % n == 0; }); + }; + CHECK_EQ(reinterpret_cast(arguments.A) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.B) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.P) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.Scale) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.Bias) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.D) % kCudaAlignSize, 0); + const auto IsAligned = [&](size_t n) { + return IsStrideAligned(configuraion.stride_a, n) && IsStrideAligned(configuraion.stride_b, n) + && IsStrideAligned(configuraion.stride_c, n); + }; + alignment = 128 / cutlass::library::sizeof_bits(functional_key.element_A); + for (; alignment > 1; alignment = alignment >> 1) { + if (IsAligned(alignment)) { break; } + } + } +#endif // WITH_CUTLASS_EXTENSION +}; + +struct Conv2dProblemSizeHasher { + size_t 
operator()(const cutlass::conv::Conv2dProblemSize& problem_size) const { + size_t hash = 0; + hash = HashCombine(hash, std::hash()(problem_size.N)); + hash = HashCombine(hash, std::hash()(problem_size.H)); + hash = HashCombine(hash, std::hash()(problem_size.W)); + hash = HashCombine(hash, std::hash()(problem_size.C)); + hash = HashCombine(hash, std::hash()(problem_size.P)); + hash = HashCombine(hash, std::hash()(problem_size.Q)); + hash = HashCombine(hash, std::hash()(problem_size.K)); + hash = HashCombine(hash, std::hash()(problem_size.R)); + hash = HashCombine(hash, std::hash()(problem_size.S)); + hash = HashCombine(hash, std::hash()(problem_size.pad_h)); + hash = HashCombine(hash, std::hash()(problem_size.pad_w)); + hash = HashCombine(hash, std::hash()(problem_size.stride_h)); + hash = HashCombine(hash, std::hash()(problem_size.stride_w)); + hash = HashCombine(hash, std::hash()(problem_size.dilation_h)); + hash = HashCombine(hash, std::hash()(problem_size.dilation_w)); + hash = HashCombine(hash, std::hash()(static_cast(problem_size.mode))); + hash = HashCombine(hash, std::hash()(problem_size.split_k_slices)); + hash = HashCombine(hash, std::hash()(problem_size.groups)); + return hash; + } +}; + +struct Conv2dConfigurationHasher { + size_t operator()(const cutlass::library::Conv2dConfiguration& configuraion) const { + size_t hash = std::hash()(static_cast(configuraion.split_k_mode)); + hash = HashCombine(hash, Conv2dProblemSizeHasher()(configuraion.problem_size)); + for (const int64_t v : configuraion.stride_a) { + hash = HashCombine(hash, std::hash()(v)); + } + for (const int64_t v : configuraion.stride_b) { + hash = HashCombine(hash, std::hash()(v)); + } + for (const int64_t v : configuraion.stride_c) { + hash = HashCombine(hash, std::hash()(v)); + } + return hash; + } +}; + +struct Conv2dOperationCacheKeyHasher { + size_t operator()(const Conv2dOperationCacheKey& key) const { + size_t hash = cutlass::library::ConvFunctionalKeyHasher()(key.functional_key); + hash = HashCombine(hash, Conv2dConfigurationHasher()(key.configuraion)); + hash = HashCombine(hash, std::hash()(key.alignment)); + return hash; + } +}; + +inline bool operator==(const cutlass::library::Conv2dConfiguration& lhs, + const cutlass::library::Conv2dConfiguration& rhs) { + return lhs.split_k_mode == rhs.split_k_mode && lhs.problem_size == rhs.problem_size + && lhs.stride_a == rhs.stride_a && lhs.stride_b == rhs.stride_b + && lhs.stride_c == rhs.stride_c; +} + +inline bool operator==(const Conv2dOperationCacheKey& lhs, const Conv2dOperationCacheKey& rhs) { + return lhs.functional_key == rhs.functional_key && lhs.configuraion == rhs.configuraion + && lhs.alignment == rhs.alignment; +} + +} // namespace oneflow + +#endif // ONEFLOW_USER_KERNELS_CUTLASS_CONV2D_OPERATION_CACHE_KEY_H_ + +#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/cutlass_conv_tuner.cpp b/oneflow/user/kernels/cutlass_conv_tuner.cpp deleted file mode 100644 index df746bba5f2..00000000000 --- a/oneflow/user/kernels/cutlass_conv_tuner.cpp +++ /dev/null @@ -1,393 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#ifdef WITH_CUTLASS - -#include "oneflow/user/kernels/cutlass_conv_tuner.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/ep/cuda/cuda_stream.h" -#include "oneflow/core/job/lazy_mode.h" -#include -#include -#include - -#ifdef WITH_CUTLASS_EXTENSION -#include -#endif // WITH_CUTLASS_EXTENSION - -namespace oneflow { - -namespace { - -bool IsWeakerAlginOperation(const cutlass::library::Operation* lhs, - const cutlass::library::Operation* rhs) { - const std::string lhs_name = lhs->description().name; - const std::string rhs_name = rhs->description().name; - size_t lhs_pos = lhs_name.rfind("align"); - if (lhs_pos == std::string::npos) { return false; } - size_t rhs_pos = rhs_name.rfind("align"); - if (rhs_pos == std::string::npos) { return false; } - if (lhs_name.substr(0, lhs_pos) != rhs_name.substr(0, rhs_pos)) { return false; } - size_t align_len = std::strlen("align"); - int lhs_alignment = std::atoi(lhs_name.substr(lhs_pos + align_len).c_str()); - int rhs_alignment = std::atoi(rhs_name.substr(rhs_pos + align_len).c_str()); - return lhs_alignment < rhs_alignment; -} - -struct Conv2dOperationCacheKey { - cutlass::library::ConvFunctionalKey functional_key; - cutlass::library::Conv2dConfiguration configuraion; - size_t alignment; - Conv2dOperationCacheKey(cutlass::library::ConvFunctionalKey functional_key, - cutlass::library::Conv2dConfiguration configuraion, - cutlass::library::ConvArguments arguments) - : functional_key(functional_key), configuraion(configuraion) { - const auto IsStrideAligned = [&](const std::vector& stride, size_t n) { - return std::all_of(stride.cbegin(), stride.cend(), - [&](const int64_t& s) { return s % n == 0; }); - }; - CHECK_EQ(reinterpret_cast(arguments.A) % kCudaAlignSize, 0); - CHECK_EQ(reinterpret_cast(arguments.B) % kCudaAlignSize, 0); - CHECK_EQ(reinterpret_cast(arguments.C) % kCudaAlignSize, 0); - CHECK_EQ(reinterpret_cast(arguments.D) % kCudaAlignSize, 0); - const auto IsAligned = [&](size_t n) { - return IsStrideAligned(configuraion.stride_a, n) && IsStrideAligned(configuraion.stride_b, n) - && IsStrideAligned(configuraion.stride_c, n); - }; - alignment = 128 / cutlass::library::sizeof_bits(functional_key.element_A); - for (; alignment > 1; alignment = alignment >> 1) { - if (IsAligned(alignment)) { break; } - } - } -}; - -struct Conv2dProblemSizeHasher { - size_t operator()(const cutlass::conv::Conv2dProblemSize& problem_size) const { - size_t hash = 0; - hash = HashCombine(hash, std::hash()(problem_size.N)); - hash = HashCombine(hash, std::hash()(problem_size.H)); - hash = HashCombine(hash, std::hash()(problem_size.W)); - hash = HashCombine(hash, std::hash()(problem_size.C)); - hash = HashCombine(hash, std::hash()(problem_size.P)); - hash = HashCombine(hash, std::hash()(problem_size.Q)); - hash = HashCombine(hash, std::hash()(problem_size.K)); - hash = HashCombine(hash, std::hash()(problem_size.R)); - hash = HashCombine(hash, std::hash()(problem_size.S)); - hash = HashCombine(hash, std::hash()(problem_size.pad_h)); - hash = HashCombine(hash, std::hash()(problem_size.pad_w)); - hash 
= HashCombine(hash, std::hash()(problem_size.stride_h)); - hash = HashCombine(hash, std::hash()(problem_size.stride_w)); - hash = HashCombine(hash, std::hash()(problem_size.dilation_h)); - hash = HashCombine(hash, std::hash()(problem_size.dilation_w)); - hash = HashCombine(hash, std::hash()(static_cast(problem_size.mode))); - hash = HashCombine(hash, std::hash()(problem_size.split_k_slices)); - hash = HashCombine(hash, std::hash()(problem_size.groups)); - return hash; - } -}; - -struct Conv2dConfigurationHasher { - size_t operator()(const cutlass::library::Conv2dConfiguration& configuraion) const { - size_t hash = std::hash()(static_cast(configuraion.split_k_mode)); - hash = HashCombine(hash, Conv2dProblemSizeHasher()(configuraion.problem_size)); - for (const int64_t v : configuraion.stride_a) { - hash = HashCombine(hash, std::hash()(v)); - } - for (const int64_t v : configuraion.stride_b) { - hash = HashCombine(hash, std::hash()(v)); - } - for (const int64_t v : configuraion.stride_c) { - hash = HashCombine(hash, std::hash()(v)); - } - return hash; - } -}; - -struct Conv2dOperationCacheKeyHasher { - size_t operator()(const Conv2dOperationCacheKey& key) const { - size_t hash = cutlass::library::ConvFunctionalKeyHasher()(key.functional_key); - hash = HashCombine(hash, Conv2dConfigurationHasher()(key.configuraion)); - hash = HashCombine(hash, std::hash()(key.alignment)); - return hash; - } -}; - -inline bool operator==(const cutlass::library::Conv2dConfiguration& lhs, - const cutlass::library::Conv2dConfiguration& rhs) { - return lhs.split_k_mode == rhs.split_k_mode && lhs.problem_size == rhs.problem_size - && lhs.stride_a == rhs.stride_a && lhs.stride_b == rhs.stride_b - && lhs.stride_c == rhs.stride_c; -} - -inline bool operator==(const Conv2dOperationCacheKey& lhs, const Conv2dOperationCacheKey& rhs) { - return lhs.functional_key == rhs.functional_key && lhs.configuraion == rhs.configuraion - && lhs.alignment == rhs.alignment; -} - -size_t GetTensorSize(cutlass::library::NumericTypeID element, cutlass::library::LayoutTypeID layout, - const cutlass::Tensor4DCoord& extent, const std::vector& stride) { - const size_t element_size = cutlass::library::sizeof_bits(element) / 8; - size_t capacity = 0; - if (layout == cutlass::library::LayoutTypeID::kTensorNHWC) { - CHECK_EQ(stride.size(), 3); - capacity = - cutlass::layout::TensorNHWC(stride.at(0), stride.at(1), stride.at(2)).capacity(extent); - } else { - UNIMPLEMENTED(); - } - return capacity * element_size; -} - -}; // namespace - -using CacheMap = std::unordered_map; -struct CutlassConvTuner::Impl { - std::mutex mutex; - std::unordered_map cache; - - const cutlass::library::Operation* FindConv2dOperation( - ep::CudaStream* stream, cutlass::library::ConvFunctionalKey functional_key, - const cutlass::library::Conv2dConfiguration& configuraion, - const cutlass::library::ConvArguments& arguments, void* workspace, size_t workspace_size); - - const cutlass::library::Operation* GetConv2dOperation( - const std::string& name, ep::CudaStream* stream, - cutlass::library::ConvFunctionalKey functional_key, - const cutlass::library::Conv2dConfiguration& configuraion, - const cutlass::library::ConvArguments& arguments, void* workspace, size_t workspace_size); -}; - -const cutlass::library::Operation* CutlassConvTuner::Impl::FindConv2dOperation( - ep::CudaStream* stream, cutlass::library::ConvFunctionalKey functional_key, - const cutlass::library::Conv2dConfiguration& configuraion, - const cutlass::library::ConvArguments& arguments, void* workspace, size_t 
workspace_size) { - int dev = 0; - OF_CUDA_CHECK(cudaGetDevice(&dev)); - Conv2dOperationCacheKey cache_key(functional_key, configuraion, arguments); - { - std::lock_guard lock(mutex); - const auto& device_cache = cache[dev]; - const auto& it = device_cache.find(cache_key); - if (it != device_cache.end()) { return it->second; } - } - - cutlass::library::ConvArguments benchmark_arguments = arguments; - void* benchmark_workspace = workspace; - cudaStream_t benchmark_stream = stream->cuda_stream(); -#ifdef WITH_CUDA_GRAPHS - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - if (stream->IsGraphCapturing()) { - OF_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - OF_CUDA_CHECK(cudaStreamCreate(&benchmark_stream)); - OF_CUDA_CHECK(cudaMalloc(&benchmark_workspace, workspace_size)); - const size_t a_size = - GetTensorSize(functional_key.element_A, functional_key.layout_A, - configuraion.problem_size.activation_extent(), configuraion.stride_a); - OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.A, a_size)); - const size_t b_size = - GetTensorSize(functional_key.element_B, functional_key.layout_B, - configuraion.problem_size.filter_extent(), configuraion.stride_b); - OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.B, b_size)); - if (benchmark_arguments.C != nullptr) { - const size_t c_size = - GetTensorSize(functional_key.element_C, functional_key.layout_C, - configuraion.problem_size.output_extent(), configuraion.stride_c); - OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.C, c_size)); - } - - const size_t d_size = GetTensorSize( - functional_key.element_C, functional_key.layout_C, - configuraion.problem_size.output_extent(), - {configuraion.problem_size.K, configuraion.problem_size.K * configuraion.problem_size.Q, - configuraion.problem_size.K * configuraion.problem_size.Q * configuraion.problem_size.P}); - OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.D, d_size)); - } -#endif // WITH_CUDA_GRAPHS - - constexpr int turing_warmup_iters = 2; - constexpr int turing_iters = 5; - cudaEvent_t start{}; - cudaEvent_t end{}; - OF_CUDA_CHECK(cudaEventCreate(&start)); - OF_CUDA_CHECK(cudaEventCreate(&end)); - const cutlass::library::Operation* fastest_operation = nullptr; - float fastest_time = 0; - const auto& operations_map = [&]() { -#ifdef WITH_CUTLASS_EXTENSION - const auto& extension_operations_map_it = - cutlass::library::CutlassExtensionSingleton::get().operation_table.conv2d_operations.find( - functional_key); - if (extension_operations_map_it - != cutlass::library::CutlassExtensionSingleton::get() - .operation_table.conv2d_operations.cend()) { - return extension_operations_map_it->second; - } -#endif // WITH_CUTLASS_EXTENSION - const auto& operations_map_it = - cutlass::library::Singleton::get().operation_table.conv2d_operations.find(functional_key); - CHECK(operations_map_it - != cutlass::library::Singleton::get().operation_table.conv2d_operations.cend()); - return operations_map_it->second; - }(); - - for (const auto& pair : operations_map) { - std::map> operations; - for (auto operation : pair.second) { - operations.emplace(operation->description().name, operation); - } - const cutlass::library::Operation* prev_operation = nullptr; - for (const auto& name_operation : operations) { - const cutlass::library::Operation* operation = name_operation.second; - if (prev_operation != nullptr && IsWeakerAlginOperation(operation, prev_operation)) { - continue; - } - if (operation->description().tile_description.minimum_compute_capability * 10 - > stream->cuda_arch() - || 
operation->description().tile_description.maximum_compute_capability * 10 - < stream->cuda_arch()) { - continue; - } - auto status = operation->can_implement(&configuraion, &benchmark_arguments); - if (status != cutlass::Status::kSuccess) { continue; } - const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion); - const size_t device_workspace_size = operation->get_device_workspace_size(&configuraion); - if (device_workspace_size > workspace_size) { continue; } - std::vector host_workspace(host_workspace_size, 0); - if (operation->initialize(&configuraion, host_workspace.data(), benchmark_workspace, - benchmark_stream) - != cutlass::Status::kSuccess) { - continue; - } - - const auto Run = [&]() { - auto init_status = operation->initialize(&configuraion, host_workspace.data(), - benchmark_workspace, benchmark_stream); - CHECK(init_status == cutlass::Status::kSuccess); - auto run_status = operation->run(&benchmark_arguments, host_workspace.data(), - benchmark_workspace, benchmark_stream); - CHECK(run_status == cutlass::Status::kSuccess); - }; - OF_CUDA_CHECK(cudaStreamSynchronize(benchmark_stream)); - for (int i = 0; i < turing_warmup_iters; ++i) { Run(); } - OF_CUDA_CHECK(cudaEventRecord(start, benchmark_stream)); - for (int i = 0; i < turing_iters; ++i) { Run(); } - OF_CUDA_CHECK(cudaEventRecord(end, benchmark_stream)); - OF_CUDA_CHECK(cudaEventSynchronize(end)); - float time = 0; - OF_CUDA_CHECK(cudaEventElapsedTime(&time, start, end)); - VLOG(3) << operation->description().name << " " << time; - prev_operation = operation; - if (fastest_operation == nullptr || time < fastest_time) { - fastest_operation = operation; - fastest_time = time; - } - } - } - OF_CUDA_CHECK(cudaEventDestroy(start)); - OF_CUDA_CHECK(cudaEventDestroy(end)); -#ifdef WITH_CUDA_GRAPHS - if (stream->IsGraphCapturing()) { - OF_CUDA_CHECK(cudaStreamSynchronize(benchmark_stream)); - OF_CUDA_CHECK(cudaStreamDestroy(benchmark_stream)); - OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.A))); - OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.B))); - OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.C))); - OF_CUDA_CHECK(cudaFree(benchmark_arguments.D)); - OF_CUDA_CHECK(cudaFree(benchmark_workspace)); - OF_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - } -#endif // WITH_CUDA_GRAPHS - if (fastest_operation != nullptr) { - VLOG(3) << "Fastest: " << fastest_operation->description().name << " " << fastest_time; - { - std::lock_guard lock(mutex); - cache[dev][cache_key] = fastest_operation; - } - } - return fastest_operation; -} - -const cutlass::library::Operation* CutlassConvTuner::Impl::GetConv2dOperation( - const std::string& name, ep::CudaStream* stream, - cutlass::library::ConvFunctionalKey functional_key, - const cutlass::library::Conv2dConfiguration& configuraion, - const cutlass::library::ConvArguments& arguments, void* workspace, size_t workspace_size) { - int dev = 0; - OF_CUDA_CHECK(cudaGetDevice(&dev)); - const auto& operations_map_it = - cutlass::library::Singleton::get().operation_table.conv2d_operations.find(functional_key); - if (operations_map_it - == cutlass::library::Singleton::get().operation_table.conv2d_operations.cend()) { - return nullptr; - } - const cutlass::library::ConvOperationVectorMap& operations_map = operations_map_it->second; - for (const auto& pair : operations_map) { - for (auto operation : pair.second) { - if (name != operation->description().name) { continue; } - if (operation->description().tile_description.minimum_compute_capability * 10 
- > stream->cuda_arch() - || operation->description().tile_description.maximum_compute_capability * 10 - < stream->cuda_arch()) { - continue; - } - auto status = operation->can_implement(&configuraion, &arguments); - if (status != cutlass::Status::kSuccess) { continue; } - const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion); - const size_t device_workspace_size = operation->get_device_workspace_size(&configuraion); - if (device_workspace_size > workspace_size) { continue; } - std::vector host_workspace(host_workspace_size, 0); - if (operation->initialize(&configuraion, host_workspace.data(), workspace, - stream->cuda_stream()) - != cutlass::Status::kSuccess) { - continue; - } - return operation; - } - } - return nullptr; -} - -CutlassConvTuner::CutlassConvTuner() { impl_.reset(new Impl()); } - -const CutlassConvTuner& CutlassConvTuner::Get() { - static CutlassConvTuner instance; - return instance; -} - -const cutlass::library::Operation* CutlassConvTuner::FindConv2dOperation( - ep::CudaStream* stream, cutlass::library::ConvFunctionalKey functional_key, - const cutlass::library::Conv2dConfiguration& configuraion, - const cutlass::library::ConvArguments& arguments, void* workspace, - size_t workspace_size) const { - return impl_->FindConv2dOperation(stream, functional_key, configuraion, arguments, workspace, - workspace_size); -} - -const cutlass::library::Operation* CutlassConvTuner::GetConv2dOperation( - const std::string& name, ep::CudaStream* stream, - cutlass::library::ConvFunctionalKey functional_key, - const cutlass::library::Conv2dConfiguration& configuraion, - const cutlass::library::ConvArguments& arguments, void* workspace, - size_t workspace_size) const { - return impl_->GetConv2dOperation(name, stream, functional_key, configuraion, arguments, workspace, - workspace_size); -} - -} // namespace oneflow - -#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/cutlass_conv_tuner.h b/oneflow/user/kernels/cutlass_conv_tuner.h index 7ee665ad23e..f8d3c810d58 100644 --- a/oneflow/user/kernels/cutlass_conv_tuner.h +++ b/oneflow/user/kernels/cutlass_conv_tuner.h @@ -13,49 +13,45 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #ifndef ONEFLOW_USER_KERNELS_CUTLASS_CONV_TUNER_H_ #define ONEFLOW_USER_KERNELS_CUTLASS_CONV_TUNER_H_ #ifdef WITH_CUTLASS #include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" #include "oneflow/core/ep/cuda/cuda_stream.h" -#include "oneflow/core/job/lazy_mode.h" -#include +#include "oneflow/user/kernels/cutlass_conv_tuner_impl.h" + #include -#include +#include namespace oneflow { class CutlassConvTuner { public: - OF_DISALLOW_COPY_AND_MOVE(CutlassConvTuner); - ~CutlassConvTuner() = default; + CutlassConvTuner() = default; + template const cutlass::library::Operation* FindConv2dOperation( - ep::CudaStream* stream, cutlass::library::ConvFunctionalKey functional_key, - const cutlass::library::Conv2dConfiguration& configuraion, - const cutlass::library::ConvArguments& arguments, void* workspace, - size_t workspace_size) const; - + ep::CudaStream* stream, const cutlass::library::ConvFunctionalKey& functional_key, + const Configuration& configuraion, const Arguments& arguments, void* workspace, + size_t workspace_size) { + return GetCutlassConvTunerImpl()->Find( + stream, functional_key, configuraion, arguments, workspace, workspace_size); + } + + template const cutlass::library::Operation* GetConv2dOperation( const std::string& name, ep::CudaStream* stream, - cutlass::library::ConvFunctionalKey functional_key, - const cutlass::library::Conv2dConfiguration& configuraion, - const cutlass::library::ConvArguments& arguments, void* workspace, - size_t workspace_size) const; - - static const CutlassConvTuner& Get(); - - private: - CutlassConvTuner(); - struct Impl; - std::unique_ptr impl_; + const cutlass::library::ConvFunctionalKey& functional_key, const Configuration& configuraion, + const Arguments& arguments, void* workspace, size_t workspace_size) { + return GetCutlassConvTunerImpl()->Get( + name, stream, functional_key, configuraion, arguments, workspace, workspace_size); + } }; } // namespace oneflow #endif // WITH_CUTLASS + #endif // ONEFLOW_USER_KERNELS_CUTLASS_CONV_TUNER_H_ diff --git a/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp b/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp new file mode 100644 index 00000000000..ced2060ecea --- /dev/null +++ b/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp @@ -0,0 +1,432 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifdef WITH_CUTLASS + +#include "oneflow/user/kernels/cutlass_conv_tuner_impl.h" + +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/cuda/cuda_stream.h" +#include +#include + +#include "oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h" +#ifdef WITH_CUTLASS_EXTENSION +#include +#include +#endif // WITH_CUTLASS_EXTENSION + +namespace oneflow { + +namespace { + +bool IsWeakerAlginOperation(const cutlass::library::Operation* lhs, + const cutlass::library::Operation* rhs) { + const std::string lhs_name = lhs->description().name; + const std::string rhs_name = rhs->description().name; + size_t lhs_pos = lhs_name.rfind("align"); + if (lhs_pos == std::string::npos) { return false; } + size_t rhs_pos = rhs_name.rfind("align"); + if (rhs_pos == std::string::npos) { return false; } + if (lhs_name.substr(0, lhs_pos) != rhs_name.substr(0, rhs_pos)) { return false; } + size_t align_len = std::strlen("align"); + int lhs_alignment = std::atoi(lhs_name.substr(lhs_pos + align_len).c_str()); + int rhs_alignment = std::atoi(rhs_name.substr(rhs_pos + align_len).c_str()); + return lhs_alignment < rhs_alignment; +} + +size_t GetTensorSize(cutlass::library::NumericTypeID element, cutlass::library::LayoutTypeID layout, + const cutlass::Tensor4DCoord& extent, const std::vector& stride) { + const size_t element_size = cutlass::library::sizeof_bits(element) / 8; + size_t capacity = 0; + if (layout == cutlass::library::LayoutTypeID::kTensorNHWC) { + CHECK_EQ(stride.size(), 3); + capacity = + cutlass::layout::TensorNHWC(stride.at(0), stride.at(1), stride.at(2)).capacity(extent); + } else { + UNIMPLEMENTED(); + } + return capacity * element_size; +} + +template +const cutlass::library::Operation* FindFastestOperation( + const Singleton* singleton, const cutlass::library::ConvFunctionalKey& functional_key, + const Configuration& configuraion, const Arguments& arguments, void* workspace, + size_t workspace_size, cudaStream_t stream, int cuda_arch) { + constexpr int turing_warmup_iters = 2; + constexpr int turing_iters = 5; + cudaEvent_t start{}; + cudaEvent_t end{}; + OF_CUDA_CHECK(cudaEventCreate(&start)); + OF_CUDA_CHECK(cudaEventCreate(&end)); + const cutlass::library::Operation* fastest_operation = nullptr; + float fastest_time = 0; + const auto& operations_map = [&]() { + const auto& it = singleton->operation_table.conv2d_operations.find(functional_key); + CHECK(it != singleton->operation_table.conv2d_operations.cend()); + return it->second; + }(); + + for (const auto& pair : operations_map) { + std::map> operations; + for (auto operation : pair.second) { + operations.emplace(operation->description().name, operation); + } + const cutlass::library::Operation* prev_operation = nullptr; + for (const auto& name_operation : operations) { + const cutlass::library::Operation* operation = name_operation.second; + if (prev_operation != nullptr && IsWeakerAlginOperation(operation, prev_operation)) { + continue; + } + if (operation->description().tile_description.minimum_compute_capability * 10 > cuda_arch + || operation->description().tile_description.maximum_compute_capability * 10 + < cuda_arch) { + continue; + } + auto status = operation->can_implement(&configuraion, &arguments); + if (status != cutlass::Status::kSuccess) { continue; } + const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion); + const size_t device_workspace_size = operation->get_device_workspace_size(&configuraion); + if (device_workspace_size > workspace_size) { continue; } + std::vector 
host_workspace(host_workspace_size, 0); + if (operation->initialize(&configuraion, host_workspace.data(), workspace, stream) + != cutlass::Status::kSuccess) { + continue; + } + + const auto Run = [&]() { + auto init_status = + operation->initialize(&configuraion, host_workspace.data(), workspace, stream); + CHECK(init_status == cutlass::Status::kSuccess); + auto run_status = operation->run(&arguments, host_workspace.data(), workspace, stream); + CHECK(run_status == cutlass::Status::kSuccess); + }; + OF_CUDA_CHECK(cudaStreamSynchronize(stream)); + for (int i = 0; i < turing_warmup_iters; ++i) { Run(); } + OF_CUDA_CHECK(cudaEventRecord(start, stream)); + for (int i = 0; i < turing_iters; ++i) { Run(); } + OF_CUDA_CHECK(cudaEventRecord(end, stream)); + OF_CUDA_CHECK(cudaEventSynchronize(end)); + float time = 0; + OF_CUDA_CHECK(cudaEventElapsedTime(&time, start, end)); + VLOG(3) << operation->description().name << " " << time; + prev_operation = operation; + if (fastest_operation == nullptr || time < fastest_time) { + fastest_operation = operation; + fastest_time = time; + } + } + } + OF_CUDA_CHECK(cudaEventDestroy(start)); + OF_CUDA_CHECK(cudaEventDestroy(end)); + VLOG(3) << "Fastest: " << fastest_operation->description().name << " " << fastest_time; + return fastest_operation; +} + +template +const cutlass::library::Operation* GetOperation( + const Singleton* singleton, const std::string& name, + const cutlass::library::ConvFunctionalKey& functional_key, const Configuration& configuraion, + const Arguments& arguments, void* workspace, size_t workspace_size, cudaStream_t stream, + int cuda_arch) { + const auto& it = singleton->operation_table.conv2d_operations.find(functional_key); + if (it == singleton->operation_table.conv2d_operations.cend()) { return nullptr; } + const cutlass::library::ConvOperationVectorMap& operations_map = it->second; + for (const auto& pair : operations_map) { + for (auto operation : pair.second) { + if (name != operation->description().name) { continue; } + if (operation->description().tile_description.minimum_compute_capability * 10 > cuda_arch + || operation->description().tile_description.maximum_compute_capability * 10 + < cuda_arch) { + continue; + } + auto status = operation->can_implement(&configuraion, &arguments); + if (status != cutlass::Status::kSuccess) { continue; } + const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion); + const size_t device_workspace_size = operation->get_device_workspace_size(&configuraion); + if (device_workspace_size > workspace_size) { continue; } + std::vector host_workspace(host_workspace_size, 0); + if (operation->initialize(&configuraion, host_workspace.data(), workspace, stream) + != cutlass::Status::kSuccess) { + continue; + } + return operation; + } + } + return nullptr; +} + +} // namespace + +template<> +class CutlassConvTunerImpl { + public: + using CacheMap = std::unordered_map; + + CutlassConvTunerImpl() { singleton = &cutlass::library::Singleton::get(); } + + const cutlass::library::Operation* Find(ep::CudaStream* stream, + cutlass::library::ConvFunctionalKey functional_key, + const cutlass::library::Conv2dConfiguration& configuraion, + const cutlass::library::ConvArguments& arguments, + void* workspace, size_t workspace_size); + + const cutlass::library::Operation* Get(const std::string& name, ep::CudaStream* stream, + cutlass::library::ConvFunctionalKey functional_key, + const cutlass::library::Conv2dConfiguration& configuraion, + const cutlass::library::ConvArguments& arguments, + 
void* workspace, size_t workspace_size); + + private: + std::mutex mutex; + std::unordered_map cache; + const cutlass::library::Singleton* singleton; +}; + +const cutlass::library::Operation* +CutlassConvTunerImpl::Find( + ep::CudaStream* stream, cutlass::library::ConvFunctionalKey functional_key, + const cutlass::library::Conv2dConfiguration& configuraion, + const cutlass::library::ConvArguments& arguments, void* workspace, size_t workspace_size) { + int dev = 0; + OF_CUDA_CHECK(cudaGetDevice(&dev)); + Conv2dOperationCacheKey cache_key(functional_key, configuraion, arguments); + { + std::lock_guard lock(mutex); + const auto& device_cache = cache[dev]; + const auto& it = device_cache.find(cache_key); + if (it != device_cache.end()) { return it->second; } + } + cutlass::library::ConvArguments benchmark_arguments = arguments; + void* benchmark_workspace = workspace; + cudaStream_t benchmark_stream = stream->cuda_stream(); +#ifdef WITH_CUDA_GRAPHS + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + if (stream->IsGraphCapturing()) { + OF_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + OF_CUDA_CHECK(cudaStreamCreate(&benchmark_stream)); + OF_CUDA_CHECK(cudaMalloc(&benchmark_workspace, workspace_size)); + const size_t a_size = + GetTensorSize(functional_key.element_A, functional_key.layout_A, + configuraion.problem_size.activation_extent(), configuraion.stride_a); + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.A, a_size)); + const size_t b_size = + GetTensorSize(functional_key.element_B, functional_key.layout_B, + configuraion.problem_size.filter_extent(), configuraion.stride_b); + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.B, b_size)); + if (benchmark_arguments.C != nullptr) { + const size_t c_size = + GetTensorSize(functional_key.element_C, functional_key.layout_C, + configuraion.problem_size.output_extent(), configuraion.stride_c); + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.C, c_size)); + } + + const size_t d_size = GetTensorSize( + functional_key.element_C, functional_key.layout_C, + configuraion.problem_size.output_extent(), + {configuraion.problem_size.K, configuraion.problem_size.K * configuraion.problem_size.Q, + configuraion.problem_size.K * configuraion.problem_size.Q * configuraion.problem_size.P}); + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.D, d_size)); + } +#endif // WITH_CUDA_GRAPHS + + const cutlass::library::Operation* fastest_operation = FindFastestOperation( + singleton, functional_key, configuraion, benchmark_arguments, benchmark_workspace, + workspace_size, benchmark_stream, stream->cuda_arch()); + +#ifdef WITH_CUDA_GRAPHS + if (stream->IsGraphCapturing()) { + OF_CUDA_CHECK(cudaStreamSynchronize(benchmark_stream)); + OF_CUDA_CHECK(cudaStreamDestroy(benchmark_stream)); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.A))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.B))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.C))); + OF_CUDA_CHECK(cudaFree(benchmark_arguments.D)); + OF_CUDA_CHECK(cudaFree(benchmark_workspace)); + OF_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + } +#endif // WITH_CUDA_GRAPHS + if (fastest_operation != nullptr) { + std::lock_guard lock(mutex); + cache[dev][cache_key] = fastest_operation; + } + return fastest_operation; +} + +const cutlass::library::Operation* +CutlassConvTunerImpl::Get( + const std::string& name, ep::CudaStream* stream, + cutlass::library::ConvFunctionalKey functional_key, + const cutlass::library::Conv2dConfiguration& configuraion, + const 
cutlass::library::ConvArguments& arguments, void* workspace, size_t workspace_size) { + int dev = 0; + OF_CUDA_CHECK(cudaGetDevice(&dev)); + return GetOperation(singleton, name, functional_key, configuraion, arguments, workspace, + workspace_size, stream->cuda_stream(), stream->cuda_arch()); +} + +#ifdef WITH_CUTLASS_EXTENSION +template<> +class CutlassConvTunerImpl { + public: + using CacheMap = std::unordered_map; + + CutlassConvTunerImpl() { + singleton = &cutlass::library::CutlassExtensionSingleton::get( + cutlass::library::SingletonKind::kConv2dScaleBiasFusionWithZeroPoint); + } + + const cutlass::library::Operation* Find( + ep::CudaStream* stream, cutlass::library::ConvFunctionalKey functional_key, + const cutlass::library::Conv2dScaleBiasFusionConfiguration& configuraion, + const cutlass::library::ConvScaleBiasFusionArguments& arguments, void* workspace, + size_t workspace_size); + + const cutlass::library::Operation* Get( + const std::string& name, ep::CudaStream* stream, + cutlass::library::ConvFunctionalKey functional_key, + const cutlass::library::Conv2dScaleBiasFusionConfiguration& configuraion, + const cutlass::library::ConvScaleBiasFusionArguments& arguments, void* workspace, + size_t workspace_size); + + private: + std::mutex mutex; + std::unordered_map cache; + const cutlass::library::CutlassExtensionSingleton* singleton; +}; + +const cutlass::library::Operation* +CutlassConvTunerImpl:: + Find(ep::CudaStream* stream, cutlass::library::ConvFunctionalKey functional_key, + const cutlass::library::Conv2dScaleBiasFusionConfiguration& configuraion, + const cutlass::library::ConvScaleBiasFusionArguments& arguments, void* workspace, + size_t workspace_size) { + int dev = 0; + OF_CUDA_CHECK(cudaGetDevice(&dev)); + Conv2dOperationCacheKey cache_key(functional_key, configuraion, arguments); + { + std::lock_guard lock(mutex); + const auto& device_cache = cache[dev]; + const auto& it = device_cache.find(cache_key); + if (it != device_cache.end()) { return it->second; } + } + cutlass::library::ConvScaleBiasFusionArguments benchmark_arguments = arguments; + void* benchmark_workspace = workspace; + cudaStream_t benchmark_stream = stream->cuda_stream(); +#ifdef WITH_CUDA_GRAPHS + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + if (stream->IsGraphCapturing()) { + OF_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + OF_CUDA_CHECK(cudaStreamCreate(&benchmark_stream)); + OF_CUDA_CHECK(cudaMalloc(&benchmark_workspace, workspace_size)); + const size_t a_size = + GetTensorSize(functional_key.element_A, functional_key.layout_A, + configuraion.problem_size.activation_extent(), configuraion.stride_a); + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.A, a_size)); + const size_t b_size = + GetTensorSize(functional_key.element_B, functional_key.layout_B, + configuraion.problem_size.filter_extent(), configuraion.stride_b); + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.B, b_size)); + + if (benchmark_arguments.P != nullptr) { + const size_t p_size = cutlass::library::sizeof_bits(functional_key.element_A) / 8; + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.P, p_size)); + } + if (benchmark_arguments.Scale != nullptr) { + const size_t scale_size = + configuraion.problem_size.K * cutlass::library::sizeof_bits(functional_key.element_C) / 8; + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.Scale, scale_size)); + } + if (benchmark_arguments.Bias != nullptr) { + const size_t bias_size = + configuraion.problem_size.K * cutlass::library::sizeof_bits(functional_key.element_C) / 8; + 
OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.Bias, bias_size)); + } + + const size_t d_size = GetTensorSize( + functional_key.element_C, functional_key.layout_C, + configuraion.problem_size.output_extent(), + {configuraion.problem_size.K, configuraion.problem_size.K * configuraion.problem_size.Q, + configuraion.problem_size.K * configuraion.problem_size.Q * configuraion.problem_size.P}); + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.D, d_size)); + } +#endif // WITH_CUDA_GRAPHS + + const cutlass::library::Operation* fastest_operation = FindFastestOperation( + singleton, functional_key, configuraion, benchmark_arguments, benchmark_workspace, + workspace_size, benchmark_stream, stream->cuda_arch()); + +#ifdef WITH_CUDA_GRAPHS + if (stream->IsGraphCapturing()) { + OF_CUDA_CHECK(cudaStreamSynchronize(benchmark_stream)); + OF_CUDA_CHECK(cudaStreamDestroy(benchmark_stream)); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.A))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.B))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.P))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.Scale))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.Bias))); + OF_CUDA_CHECK(cudaFree(benchmark_arguments.D)); + OF_CUDA_CHECK(cudaFree(benchmark_workspace)); + OF_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + } +#endif // WITH_CUDA_GRAPHS + if (fastest_operation != nullptr) { + std::lock_guard lock(mutex); + cache[dev][cache_key] = fastest_operation; + } + return fastest_operation; +} + +const cutlass::library::Operation* +CutlassConvTunerImpl:: + Get(const std::string& name, ep::CudaStream* stream, + cutlass::library::ConvFunctionalKey functional_key, + const cutlass::library::Conv2dScaleBiasFusionConfiguration& configuraion, + const cutlass::library::ConvScaleBiasFusionArguments& arguments, void* workspace, + size_t workspace_size) { + int dev = 0; + OF_CUDA_CHECK(cudaGetDevice(&dev)); + return GetOperation(singleton, name, functional_key, configuraion, arguments, workspace, + workspace_size, stream->cuda_stream(), stream->cuda_arch()); +} +#endif // WITH_CUTLASS_EXTENSION + +template +CutlassConvTunerImpl* GetCutlassConvTunerImpl() { + static CutlassConvTunerImpl impl; + return &impl; +} + +template CutlassConvTunerImpl* +GetCutlassConvTunerImpl(); + +#ifdef WITH_CUTLASS_EXTENSION +template CutlassConvTunerImpl* +GetCutlassConvTunerImpl(); +#endif // WITH_CUTLASS_EXTENSION + +} // namespace oneflow + +#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/cutlass_conv_tuner_impl.h b/oneflow/user/kernels/cutlass_conv_tuner_impl.h new file mode 100644 index 00000000000..902c21b7b41 --- /dev/null +++ b/oneflow/user/kernels/cutlass_conv_tuner_impl.h @@ -0,0 +1,54 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifdef WITH_CUTLASS + +#ifndef ONEFLOW_USER_KERNELS_CUTLASS_CONV_TUNER_IMPL_H_ +#define ONEFLOW_USER_KERNELS_CUTLASS_CONV_TUNER_IMPL_H_ + +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/cuda/cuda_stream.h" + +#include +#include +#include + +namespace oneflow { + +template +class CutlassConvTunerImpl { + public: + const cutlass::library::Operation* Find(ep::CudaStream* stream, + cutlass::library::ConvFunctionalKey functional_key, + const Configuration& configuraion, + const Arguments& arguments, void* workspace, + size_t workspace_size); + + const cutlass::library::Operation* Get(const std::string& name, ep::CudaStream* stream, + cutlass::library::ConvFunctionalKey functional_key, + const Configuration& configuraion, + const Arguments& arguments, void* workspace, + size_t workspace_size); +}; + +template +CutlassConvTunerImpl* GetCutlassConvTunerImpl(); + +} // namespace oneflow + +#endif // ONEFLOW_USER_KERNELS_CUTLASS_CONV_TUNER_IMPL_H_ + +#endif // WITH_CUTLASS diff --git a/python/oneflow/nn/functional/__init__.py b/python/oneflow/nn/functional/__init__.py index 8019945c366..677536e035d 100644 --- a/python/oneflow/nn/functional/__init__.py +++ b/python/oneflow/nn/functional/__init__.py @@ -20,6 +20,7 @@ from oneflow._C import conv1d from oneflow._C import conv2d from oneflow._C import conv3d +from oneflow._C import conv2d_quant from oneflow._C import deconv1d as conv_transpose1d from oneflow._C import deconv2d as conv_transpose2d from oneflow._C import deconv3d as conv_transpose3d From 7a38a9fdf8b77aeb4cee5c1a87fa6268d0b0b8cf Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Mon, 28 Aug 2023 06:00:54 +0000 Subject: [PATCH 26/65] refactor quantization kernel and fix tuning warmup pass --- .../cutlass_conv_tuning_warmup_pass.cpp | 176 ++++++++++++------ oneflow/ir/include/OneFlow/OneFlowUserOps.td | 2 +- .../ir/lib/OneFlow/Transform/AutoNHWCOps.cpp | 88 +++++---- oneflow/user/kernels/quantization_kernel.cu | 161 ++++++++++++++-- oneflow/user/ops/quantization_op.cpp | 15 +- 5 files changed, 332 insertions(+), 110 deletions(-) diff --git a/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp b/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp index 5edc79bb4c5..650620a0d26 100644 --- a/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp +++ b/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp @@ -24,6 +24,11 @@ limitations under the License. 
#include "oneflow/core/framework/user_op_conf.h" #include +#include +#ifdef WITH_CUTLASS_EXTENSION +#include +#endif // WITH_CUTLASS_EXTENSION + namespace oneflow { namespace { @@ -63,7 +68,10 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const op_graph.ForEachNode([&](const OpNode* node) { const OperatorConf& op_conf = node->op().op_conf(); if (!op_conf.has_user_conf()) { return; } - if (op_conf.user_conf().op_type_name() != "conv2d") { return; } + if (op_conf.user_conf().op_type_name() != "conv2d" + && op_conf.user_conf().op_type_name() != "conv2d_quant") { + return; + } if (node->parallel_desc().device_type() != DeviceType::kCUDA) { return; } if (node->parallel_desc().parallel_num() != 1) { return; } if (!node->parallel_desc().containing_current_rank()) { return; } @@ -72,7 +80,9 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const if (conv2d_op.attr("groups") != 1) { return; } VLOG(3) << "Tuning " << op_conf.name(); const auto& in_desc = node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("in", 0))); - if (in_desc.data_type() != DataType::kFloat16) { return; } + if (op_conf.user_conf().op_type_name() == "conv2d") { + if (in_desc.data_type() != DataType::kFloat16) { return; } + } const auto& weight_desc = node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("weight", 0))); const auto& out_desc = node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.output("out", 0))); @@ -94,21 +104,6 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const const int p = out_desc.shape().At(1); const int q = out_desc.shape().At(2); - cutlass::library::ConvFunctionalKey key( - cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop, - cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC, - cutlass::library::NumericTypeID::kF32, cutlass::library::NumericTypeID::kF32); - - const bool allow_half_accumulation = - ParseBooleanFromEnv("ONEFLOW_CONV_ALLOW_HALF_PRECISION_ACCUMULATION", false); - - if (allow_half_accumulation) { - key.element_accumulator = cutlass::library::NumericTypeID::kF16; - key.element_compute = cutlass::library::NumericTypeID::kF16; - } - const size_t x_size = GetCudaAlignedSize(in_desc.ByteSizeOfBlobBody()); const size_t w_size = GetCudaAlignedSize(weight_desc.ByteSizeOfBlobBody()); const size_t y_size = GetCudaAlignedSize(out_desc.ByteSizeOfBlobBody()); @@ -118,7 +113,20 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const GetCudaAlignedSize(node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("bias", 0))) .ByteSizeOfBlobBody()); } - const size_t total_buf_size = x_size + w_size + y_size + bias_size; + size_t zero_point_size = 0; + size_t scale_size = 0; + if (conv2d_op.has_input("in_zero_point", 0)) { + zero_point_size = GetCudaAlignedSize( + node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("in_zero_point", 0))) + .ByteSizeOfBlobBody()); + } + if (conv2d_op.has_input("scale", 0)) { + scale_size = GetCudaAlignedSize( + node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("scale", 0))) + .ByteSizeOfBlobBody()); + } + const size_t total_buf_size = + x_size + w_size + y_size + bias_size + zero_point_size + scale_size; if (total_buf_size > buffer_size) { size_t malloc_size = RoundUp(total_buf_size, kBufferMallocAlign); OF_CUDA_CHECK(cudaFree(buffer)); 
@@ -128,54 +136,114 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const
     void* x_ptr = buffer;
     void* w_ptr = buffer + x_size;
     void* y_ptr = buffer + x_size + w_size;
+    size_t offset = x_size + w_size + y_size;
     void* bias_ptr = nullptr;
-    if (bias_size != 0) { bias_ptr = buffer + x_size + w_size + y_size; }
+    if (bias_size != 0) { bias_ptr = buffer + offset; }
+    void* zero_point_ptr = nullptr;
+    if (zero_point_size) { zero_point_ptr = buffer + offset + bias_size; }
+    void* scale_ptr = nullptr;
+    if (scale_size) { scale_ptr = buffer + offset + bias_size + zero_point_size; }
 
     cutlass::conv::Conv2dProblemSize problem_size(
         n, h, w, c, k, r, s, p, q, padding_before.at(0), padding_before.at(1), strides.at(0),
         strides.at(1), dilation_rate.at(0), dilation_rate.at(1),
         cutlass::conv::Mode::kCrossCorrelation);
-    cutlass::library::Conv2dConfiguration configuraion;
-    configuraion.split_k_mode = cutlass::conv::SplitKMode::kSerial;
-    configuraion.problem_size = problem_size;
-    configuraion.stride_a = {c, w * c, h * w * c};
-    configuraion.stride_b = {c, s * c, r * s * c};
-    configuraion.stride_c = {0, 0, 0};
-    cutlass::library::ConvArguments arguments;
-    arguments.A = x_ptr;
-    arguments.B = w_ptr;
-    arguments.reordered_B = nullptr;
-    arguments.C = bias_ptr;
-    arguments.D = y_ptr;
-    union SP {
-      float f{};
-      half h;
-    };
-
-    SP alpha;
-    SP beta;
-
-    if (allow_half_accumulation) {
-      alpha.h = static_cast<half>(1.0F);
-      if (bias_ptr == nullptr) {
-        beta.h = static_cast<half>(0.0F);
+
+    const cutlass::library::Operation* operation = nullptr;
+
+    if (op_conf.user_conf().op_type_name() == "conv2d") {
+      cutlass::library::ConvFunctionalKey key(
+          cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop,
+          cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC,
+          cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC,
+          cutlass::library::NumericTypeID::kF16, cutlass::library::LayoutTypeID::kTensorNHWC,
+          cutlass::library::NumericTypeID::kF32, cutlass::library::NumericTypeID::kF32);
+
+      const bool allow_half_accumulation =
+          ParseBooleanFromEnv("ONEFLOW_CONV_ALLOW_HALF_PRECISION_ACCUMULATION", false);
+
+      if (allow_half_accumulation) {
+        key.element_accumulator = cutlass::library::NumericTypeID::kF16;
+        key.element_compute = cutlass::library::NumericTypeID::kF16;
+      }
+
+      cutlass::library::Conv2dConfiguration configuraion;
+      configuraion.split_k_mode = cutlass::conv::SplitKMode::kSerial;
+      configuraion.problem_size = problem_size;
+      configuraion.stride_a = {c, w * c, h * w * c};
+      configuraion.stride_b = {c, s * c, r * s * c};
+      configuraion.stride_c = {0, 0, 0};
+      cutlass::library::ConvArguments arguments;
+      arguments.A = x_ptr;
+      arguments.B = w_ptr;
+      arguments.reordered_B = nullptr;
+      arguments.C = bias_ptr;
+      arguments.D = y_ptr;
+      union SP {
+        float f{};
+        half h;
+      };
+
+      SP alpha;
+      SP beta;
+
+      if (allow_half_accumulation) {
+        alpha.h = static_cast<half>(1.0F);
+        if (bias_ptr == nullptr) {
+          beta.h = static_cast<half>(0.0F);
       } else {
-        beta.h = static_cast<half>(1.0F);
+          beta.h = static_cast<half>(1.0F);
       }
     } else {
-      alpha.f = 1.0F;
-      if (bias_ptr == nullptr) {
-        beta.f = 0.0F;
+        alpha.f = 1.0F;
+        if (bias_ptr == nullptr) {
+          beta.f = 0.0F;
       } else {
-        beta.f = 1.0F;
+          beta.f = 1.0F;
       }
-    }
-    arguments.alpha = &alpha;
-    arguments.beta = &beta;
-    arguments.pointer_mode = cutlass::library::ScalarPointerMode::kHost;
+      }
+      arguments.alpha = &alpha;
+      arguments.beta = &beta;
+      arguments.pointer_mode = cutlass::library::ScalarPointerMode::kHost;
+
+      operation = CutlassConvTuner().FindConv2dOperation(
+          stream->As<ep::CudaStream>(), key, configuraion, arguments, workspace, kMaxWorkspaceSize);
    } else {
+#ifdef 
WITH_CUTLASS_EXTENSION + cutlass::library::ConvFunctionalKey key( + cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop, + cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kS32, cutlass::library::LayoutTypeID::kTensorNHWC, + cutlass::library::NumericTypeID::kS32, cutlass::library::NumericTypeID::kS32); + if (out_desc.data_type() == DataType::kFloat) { + key.element_C = cutlass::library::NumericTypeID::kF32; + key.element_compute = cutlass::library::NumericTypeID::kF32; + } else if (out_desc.data_type() == DataType::kFloat16) { + key.element_C = cutlass::library::NumericTypeID::kF16; + key.element_compute = cutlass::library::NumericTypeID::kF32; } else { - beta.f = 1.0F; + return; } - } - arguments.alpha = α - arguments.beta = β - arguments.pointer_mode = cutlass::library::ScalarPointerMode::kHost; + cutlass::library::Conv2dScaleBiasFusionConfiguration configuraion; + configuraion.split_k_mode = cutlass::conv::SplitKMode::kSerial; + configuraion.problem_size = problem_size; + configuraion.stride_a = {c, w * c, h * w * c}; + configuraion.stride_b = {c, s * c, r * s * c}; - const cutlass::library::Operation* operation = CutlassConvTuner().FindConv2dOperation( - stream->As(), key, configuraion, arguments, workspace, kMaxWorkspaceSize); + cutlass::library::ConvScaleBiasFusionArguments arguments; + arguments.A = x_ptr; + arguments.B = w_ptr; + arguments.reordered_B = nullptr; + arguments.P = zero_point_ptr; + arguments.Scale = scale_ptr; + arguments.Bias = bias_ptr; + arguments.D = y_ptr; + + operation = CutlassConvTuner().FindConv2dOperation( + stream->As(), key, configuraion, arguments, workspace, kMaxWorkspaceSize); +#endif // WITH_CUTLASS_EXTENSION + } if (operation != nullptr) { VLOG(3) << "Fastest operation: " << operation->description().name; nlohmann::json tuning_cache; diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index d6de6a8f341..844b06eb303 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -8233,7 +8233,7 @@ def OneFlow_MovingAverageMinMaxObserverOp : OneFlow_BaseOp<"moving_average_min_m let has_input_arg_modify_fn = 1; } -def OneFlow_QuantizationOp : OneFlow_BaseOp<"quantization", [NoMemoryEffect, DeclareOpInterfaceMethods]> { +def OneFlow_QuantizationOp : OneFlow_BaseOp<"quantization", [NoMemoryEffect, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$in, OneFlow_Tensor:$scale, diff --git a/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp b/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp index 28455f78618..21238a4fbec 100644 --- a/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp +++ b/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp @@ -51,40 +51,6 @@ llvm::SmallVector Conv2DOp::NchwToNhwc(llvm::SmallVector val return results; } -bool Conv2DQuantOp::IsNCHW() { return this->getDataFormat().str() == "channels_first"; } - -llvm::DenseSet Conv2DQuantOp::OperandsToTranspose() { - if (this->get_addToOutput()) { - return {this->getIn(), this->getWeight(), this->get_addToOutput()}; - } else { - return {this->getIn(), this->getWeight()}; - } -} - -llvm::DenseSet Conv2DQuantOp::ResultsToTranspose() { return {this->getOut()}; } - -llvm::SmallVector Conv2DQuantOp::NchwToNhwc(llvm::SmallVector value, - PatternRewriter& rewriter) { - auto conv_quant_op = *this; - 
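// Editor's note (annotation, not part of the patch): NchwToNhwc hooks such as
// the one being moved here let the AutoNHWC pass rebuild an op with
// data_format "channels_last" and transposed operands/results. For reference,
// the element mapping those inserted transposes realize; an illustrative
// helper, not part of the pass API:
#include <cstddef>
inline std::size_t NhwcOffset(std::size_t n, std::size_t c, std::size_t h, std::size_t w,
                              std::size_t C, std::size_t H, std::size_t W) {
  // Element (n, c, h, w) of a logically NCHW tensor lands at this linear
  // offset once the data is stored channels-last (NHWC).
  return ((n * H + h) * W + w) * C + c;
}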
SmallVector operands; - operands.push_back(value[0]); - operands.push_back(value[1]); - operands.push_back(conv_quant_op.getInZeroPoint()); - if (conv_quant_op.getScale()) operands.push_back(conv_quant_op.getScale()); - if (conv_quant_op.getBias()) operands.push_back(conv_quant_op.getBias()); - if (this->get_addToOutput()) { operands.push_back(value[2]); } - NamedAttrList attributes = conv_quant_op->getAttrs(); - attributes.set(conv_quant_op.getDataFormatAttrName(), rewriter.getStringAttr("channels_last")); - auto res = - rewriter - .create(conv_quant_op.getLoc(), getNHWCResultTypes(conv_quant_op), - operands, attributes) - ->getResults(); - llvm::SmallVector results; - results.push_back(res[0]); - return results; -} - bool BiasAddOp::IsNCHW() { return this->getAxisAttr().getValue().getSExtValue() == 1; } llvm::DenseSet BiasAddOp::OperandsToTranspose() { return {this->getA()}; } @@ -354,6 +320,60 @@ llvm::SmallVector GroupNormOp::NchwToNhwc(llvm::SmallVector return results; } +bool Conv2DQuantOp::IsNCHW() { return this->getDataFormat().str() == "channels_first"; } + +llvm::DenseSet Conv2DQuantOp::OperandsToTranspose() { + if (this->get_addToOutput()) { + return {this->getIn(), this->getWeight(), this->get_addToOutput()}; + } else { + return {this->getIn(), this->getWeight()}; + } +} + +llvm::DenseSet Conv2DQuantOp::ResultsToTranspose() { return {this->getOut()}; } + +llvm::SmallVector Conv2DQuantOp::NchwToNhwc(llvm::SmallVector value, + PatternRewriter& rewriter) { + auto conv_quant_op = *this; + SmallVector operands; + operands.push_back(value[0]); + operands.push_back(value[1]); + operands.push_back(conv_quant_op.getInZeroPoint()); + if (conv_quant_op.getScale()) operands.push_back(conv_quant_op.getScale()); + if (conv_quant_op.getBias()) operands.push_back(conv_quant_op.getBias()); + if (this->get_addToOutput()) { operands.push_back(value[2]); } + NamedAttrList attributes = conv_quant_op->getAttrs(); + attributes.set(conv_quant_op.getDataFormatAttrName(), rewriter.getStringAttr("channels_last")); + auto res = + rewriter + .create(conv_quant_op.getLoc(), getNHWCResultTypes(conv_quant_op), + operands, attributes) + ->getResults(); + llvm::SmallVector results; + results.push_back(res[0]); + return results; +} + +bool QuantizationOp::IsNCHW() { return false; } + +llvm::DenseSet QuantizationOp::OperandsToTranspose() { return {this->getIn()}; } + +llvm::DenseSet QuantizationOp::ResultsToTranspose() { return {this->getOut()}; } + +llvm::SmallVector QuantizationOp::NchwToNhwc(llvm::SmallVector value, + PatternRewriter& rewriter) { + auto quantization_op = *this; + SmallVector operands{value[0]}; + operands.push_back(quantization_op.getScale()); + operands.push_back(quantization_op.getZeroPoint()); + auto res = rewriter + .create(quantization_op.getLoc(), + getNHWCResultTypes(quantization_op), operands, + quantization_op->getAttrs()) + ->getResults(); + return {res[0]}; +} + } // namespace oneflow } // namespace mlir diff --git a/oneflow/user/kernels/quantization_kernel.cu b/oneflow/user/kernels/quantization_kernel.cu index 2b0cfa1826b..d940b2345a9 100644 --- a/oneflow/user/kernels/quantization_kernel.cu +++ b/oneflow/user/kernels/quantization_kernel.cu @@ -24,23 +24,24 @@ namespace { template __global__ void QuantizationSymmetric(const T* in_ptr, const T* scale_ptr, const int64_t scale_size, const int64_t elements, const int64_t panel_size, - const double quantization_bit, T* out_ptr) { + const int32_t quantization_bit, T* out_ptr) { int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; 
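// Editor's note (annotation, not part of the patch): the kernels in this file
// all compute clamp(round(x / scale) [+ zero_point], lower, upper), with the
// bounds derived from quantization_bit. A host-side C++ reference of the
// symmetric per-channel case, with illustrative names (not OneFlow API):
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>
std::vector<float> QuantizeSymmetricRef(const std::vector<float>& in,
                                        const std::vector<float>& scale,
                                        std::size_t panel_size, int quantization_bit) {
  const float upper = std::pow(2.0f, quantization_bit - 1) - 1.0f;  // 127 for 8 bits
  const float lower = -upper - 1.0f;                                // -128 for 8 bits
  std::vector<float> out(in.size());
  for (std::size_t i = 0; i < in.size(); ++i) {
    const std::size_t ch = std::min(scale.size() - 1, i / panel_size);  // per-channel scale index
    const float q = std::nearbyint(in[i] / scale[ch]);                  // round to nearest
    out[i] = std::min(std::max(q, lower), upper);                       // clamp to the int range
  }
  return out;
}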
int64_t step = gridDim.x * blockDim.x; - T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; - T lower_bound = -upper_bound - 1; + float upper_bound = pow(2.0, quantization_bit - 1) - 1; + float lower_bound = -upper_bound - 1; while (gid < elements) { int64_t channel_index = gid / panel_size; int64_t scale_idx = min(scale_size - 1, channel_index); - T scale = scale_ptr[scale_idx]; + float scale = scale_ptr[scale_idx]; + float in = in_ptr[gid]; - T out = nearbyint(in_ptr[gid] / scale); + float out = nearbyint(in / scale); out = out > upper_bound ? upper_bound : out; out = out < lower_bound ? lower_bound : out; - out_ptr[gid] = out; + out_ptr[gid] = static_cast(out); gid += step; } @@ -49,25 +50,26 @@ __global__ void QuantizationSymmetric(const T* in_ptr, const T* scale_ptr, const template __global__ void QuantizationAffine(const T* in_ptr, const T* scale_ptr, const T* zero_point_ptr, const int64_t scale_size, const int64_t elements, - const int64_t panel_size, const double quantization_bit, + const int64_t panel_size, const int32_t quantization_bit, T* out_ptr) { int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; int64_t step = gridDim.x * blockDim.x; - T upper_bound = static_cast(pow(2.0, quantization_bit)) - 1; - T lower_bound = 0; + float upper_bound = pow(2.0, quantization_bit) - 1; + float lower_bound = 0; while (gid < elements) { int64_t channel_index = gid / panel_size; int64_t scale_idx = min(scale_size - 1, channel_index); - T scale = scale_ptr[scale_idx]; - T zero_point = zero_point_ptr[scale_idx]; + float scale = scale_ptr[scale_idx]; + float zero_point = zero_point_ptr[scale_idx]; + float in = in_ptr[gid]; - T out = nearbyint(in_ptr[gid] / scale + zero_point); + float out = nearbyint(in / scale + zero_point); out = out > upper_bound ? upper_bound : out; out = out < lower_bound ? lower_bound : out; - out_ptr[gid] = out; + out_ptr[gid] = static_cast(out); gid += step; } @@ -80,20 +82,131 @@ __global__ void QuantizationCambricon(const T* in_ptr, const T* shift, const int int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; int64_t step = gridDim.x * blockDim.x; - T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; - T lower_bound = -upper_bound - 1; + float upper_bound = pow(2.0, quantization_bit - 1) - 1; + float lower_bound = -upper_bound - 1; - T scale = static_cast(pow(2.0, static_cast(shift[0]))); + float scale = pow(2.0, static_cast(shift[0])); while (gid < elements) { - T out = nearbyint(in_ptr[gid] / scale); + float in = in_ptr[gid]; + float out = nearbyint(in / scale); out = out > upper_bound ? upper_bound : out; out = out < lower_bound ? lower_bound : out; - out_ptr[gid] = out; + out_ptr[gid] = static_cast(out); gid += step; } } +template +__global__ void OFPerTensorQuantizationSymmetric(const int64_t elements, const T* in_ptr, + const T* scale_ptr, const OutT upper_bound, + const OutT lower_bound, OutT* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + float scale = *scale_ptr; + + while (gid < elements) { + float in = in_ptr[gid]; + float out = nearbyint(in / scale); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? 
lower_bound : out; + out_ptr[gid] = static_cast(out); + + gid += step; + } +} + +template +__global__ void OFPerTensorQuantizationAffine(const int64_t elements, const T* in_ptr, + const T* scale_ptr, const OutT* zero_point_ptr, + const OutT upper_bound, const OutT lower_bound, + OutT* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + float scale = *scale_ptr; + float zero_point = *zero_point_ptr; + + while (gid < elements) { + float in = in_ptr[gid]; + float out = nearbyint(in / scale + zero_point); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? lower_bound : out; + out_ptr[gid] = static_cast(out); + + gid += step; + } +} + +struct __align__(8) Half4 { + half x; + half y; + half z; + half w; +}; + +struct __align__(4) Byte4 { + int8_t x; + int8_t y; + int8_t z; + int8_t w; +}; + +template<> +__global__ void OFPerTensorQuantizationAffine( + const int64_t elements, const half* in_ptr, const half* scale_ptr, const int8_t* zero_point_ptr, + const int8_t upper_bound, const int8_t lower_bound, int8_t* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + float scale = *scale_ptr; + float zero_point = *zero_point_ptr; + + int64_t loops = elements >> 2; + for (; gid < loops; gid += step) { + Half4 in = reinterpret_cast(in_ptr)[gid]; + Byte4 out; + int x = __float2int_rn(static_cast(in.x) / scale + zero_point); + int y = __float2int_rn(static_cast(in.y) / scale + zero_point); + int z = __float2int_rn(static_cast(in.z) / scale + zero_point); + int w = __float2int_rn(static_cast(in.w) / scale + zero_point); + out.x = max(min(x, upper_bound), lower_bound); + out.y = max(min(y, upper_bound), lower_bound); + out.z = max(min(z, upper_bound), lower_bound); + out.w = max(min(w, upper_bound), lower_bound); + reinterpret_cast(out_ptr)[gid] = out; + } + int64_t offset = loops << 2; + if (offset < elements && gid == loops) { + for (; offset < elements; offset += 1) { + float in = in_ptr[offset]; + int out = __float2int_rn(in / scale + zero_point); + out_ptr[offset] = max(min(out, upper_bound), lower_bound); + } + } +} + +template +void ApplyOFPerTensorQuantization(user_op::KernelComputeContext* ctx, + const std::string& quantization_scheme, + const int32_t quantization_bit, const user_op::Tensor* in, + const user_op::Tensor* scale, const user_op::Tensor* zero_point, + user_op::Tensor* out) { + const int64_t elements = in->shape_view().elem_cnt(); + OutT upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; + OutT lower_bound = -upper_bound - 1; + if (quantization_scheme == "symmetric") { + RUN_CUDA_KERNEL((OFPerTensorQuantizationSymmetric), ctx->stream(), elements, elements, + in->dptr(), scale->dptr(), upper_bound, lower_bound, + out->mut_dptr()); + } else { + RUN_CUDA_KERNEL((OFPerTensorQuantizationAffine), ctx->stream(), elements, elements, + in->dptr(), scale->dptr(), zero_point->dptr(), upper_bound, + lower_bound, out->mut_dptr()); + } +} + } // namespace template @@ -122,7 +235,16 @@ class GpuQuantizationKernel final : public user_op::OpKernel { auto origin_round_mode = std::fegetround(); std::fesetround(FE_TONEAREST); - if (quantization_formula == "google") { + if (quantization_formula == "oneflow") { + CHECK_EQ(scale_size, 1) + << "only support per-tensor quantization for oneflow quantization formula"; + if (quantization_bit == 8) { + ApplyOFPerTensorQuantization(ctx, quantization_scheme, quantization_bit, in, + scale, zero_point, out); + } else 
{ + UNIMPLEMENTED(); + } + } else if (quantization_formula == "google") { if (quantization_scheme == "symmetric") { RUN_CUDA_KERNEL((QuantizationSymmetric), ctx->stream(), elements, in->dptr(), scale->dptr(), scale_size, elements, panel_size, quantization_bit, @@ -154,5 +276,6 @@ class GpuQuantizationKernel final : public user_op::OpKernel { REGISTER_QUANTIZATION_KERNEL(float); REGISTER_QUANTIZATION_KERNEL(double); +REGISTER_QUANTIZATION_KERNEL(half); } // namespace oneflow diff --git a/oneflow/user/ops/quantization_op.cpp b/oneflow/user/ops/quantization_op.cpp index e0299fec1b3..49e882832e3 100644 --- a/oneflow/user/ops/quantization_op.cpp +++ b/oneflow/user/ops/quantization_op.cpp @@ -75,7 +75,17 @@ namespace oneflow { return InferLogicalTensorDesc(ctx); } /*static*/ Maybe QuantizationOp::InferDataType(user_op::InferContext* ctx) { - ctx->SetOutputDType("out", 0, ctx->InputDType("in", 0)); + const std::string& quantization_formula = ctx->Attr("quantization_formula"); + if (quantization_formula == "oneflow") { + const int32_t quantization_bit = ctx->Attr("quantization_bit"); + if (quantization_bit == 8) { + ctx->SetOutputDType("out", 0, DataType::kInt8); + } else { + OF_UNIMPLEMENTED(); + } + } else { + ctx->SetOutputDType("out", 0, ctx->InputDType("in", 0)); + } return Maybe::Ok(); } /*static*/ Maybe QuantizationOp::ModifyInputArg( @@ -99,7 +109,8 @@ namespace oneflow { CHECK_OR_RETURN(quantization_scheme == "symmetric" || quantization_scheme == "affine"); std::string quantization_formula = op_conf.attr("quantization_formula"); - CHECK_OR_RETURN(quantization_formula == "google" || quantization_formula == "cambricon"); + CHECK_OR_RETURN(quantization_formula == "google" || quantization_formula == "cambricon" + || quantization_formula == "oneflow"); return Maybe::Ok(); } From e48f73812152c08f3444663bf5748046c155ebdd Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Mon, 28 Aug 2023 07:50:49 +0000 Subject: [PATCH 27/65] fix --- oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h | 1 + 1 file changed, 1 insertion(+) diff --git a/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h b/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h index 91e55cc3e51..68b0c5321b2 100644 --- a/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h +++ b/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h @@ -61,6 +61,7 @@ struct Conv2dOperationCacheKey { const cutlass::library::ConvScaleBiasFusionArguments& arguments) : functional_key(functional_key) { configuraion.problem_size = config.problem_size; + configuraion.split_k_mode = config.split_k_mode; configuraion.stride_a = config.stride_a; configuraion.stride_b = config.stride_b; configuraion.stride_c = {0, 0, 0}; From 5500840bcddcc92d0bdafec3bb106bfd2a6742b1 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 30 Aug 2023 05:57:30 +0000 Subject: [PATCH 28/65] fix compilation error for cuda12.2 --- oneflow/core/ndarray/ndarray_reduce_impl.cu | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/oneflow/core/ndarray/ndarray_reduce_impl.cu b/oneflow/core/ndarray/ndarray_reduce_impl.cu index e436f0c32f2..e7d466e7559 100644 --- a/oneflow/core/ndarray/ndarray_reduce_impl.cu +++ b/oneflow/core/ndarray/ndarray_reduce_impl.cu @@ -51,14 +51,12 @@ struct NanSum { } }; -template<> -OF_DEVICE_FUNC cuComplex cub::Sum::operator()(const cuComplex& a, const cuComplex& b) const { +__host__ __device__ __forceinline__ cuComplex operator+(const cuComplex& a, const cuComplex& b) { return cuComplex{a.x + b.x, a.y + b.y}; } -template<> 
-OF_DEVICE_FUNC cuDoubleComplex cub::Sum::operator()(const cuDoubleComplex& a, - const cuDoubleComplex& b) const { +__host__ __device__ __forceinline__ cuDoubleComplex operator+(const cuDoubleComplex& a, + const cuDoubleComplex& b) { return cuDoubleComplex{a.x + b.x, a.y + b.y}; } } // namespace cub From cc7c49c8f247096ec07f68d5f1a0c0757a026b2c Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 30 Aug 2023 07:46:44 +0000 Subject: [PATCH 29/65] eliminate scalar math op --- oneflow/ir/lib/OneFlow/OneFlowRewrites.cpp | 27 +++++++++++++++++++ .../lib/OneFlow/PDLL/ForwardOpPatterns.pdll | 7 +++++ .../ir/lib/OneFlow/PDLL/OneFlowPDLLUtils.pdll | 2 ++ 3 files changed, 36 insertions(+) diff --git a/oneflow/ir/lib/OneFlow/OneFlowRewrites.cpp b/oneflow/ir/lib/OneFlow/OneFlowRewrites.cpp index 82635ed2cb5..35066ddadb0 100644 --- a/oneflow/ir/lib/OneFlow/OneFlowRewrites.cpp +++ b/oneflow/ir/lib/OneFlow/OneFlowRewrites.cpp @@ -311,6 +311,32 @@ static Attribute GetReciprocal(PatternRewriter& rewriter, Attribute a) { return rewriter.getF64FloatAttr(1 / a.cast().getValueAsDouble()); } +static LogicalResult IsScalarMathOpAndCouldBeEliminate(PatternRewriter& rewriter, + Operation* scalar_math_op) { + if (auto op = dyn_cast(scalar_math_op)) { + if (op.getHasIntOperand()) { + return success(op.getIntOperand() == 0); + } else { + return success(op.getFloatOperand().convertToDouble() == 0); + } + } else if (auto op = dyn_cast(scalar_math_op)) { + if (op.getHasIntOperand()) { + return success(op.getIntOperand() == 1); + } else { + return success(op.getFloatOperand().convertToDouble() == 1); + } + } else if (auto op = dyn_cast(scalar_math_op)) { + if (op.getHasIntOperand()) { + return success(op.getIntOperand() == 1); + } else { + return success(op.getFloatOperand().convertToDouble() == 1); + } + } else { + return failure(); + } + return failure(); +} + } // namespace namespace rewrites { @@ -346,6 +372,7 @@ void populateConstraints(RewritePatternSet& patterns) { PDLL_REGISTER(IsScalarTensor); PDLL_REGISTER(IsScalarEqualSqrtDim); PDLL_REGISTER(IsScalarEqualSqrtDimReciprocal); + PDLL_REGISTER(IsScalarMathOpAndCouldBeEliminate); #undef PDLL_REGISTER } diff --git a/oneflow/ir/lib/OneFlow/PDLL/ForwardOpPatterns.pdll b/oneflow/ir/lib/OneFlow/PDLL/ForwardOpPatterns.pdll index ebd041ef005..252aa704ce1 100644 --- a/oneflow/ir/lib/OneFlow/PDLL/ForwardOpPatterns.pdll +++ b/oneflow/ir/lib/OneFlow/PDLL/ForwardOpPatterns.pdll @@ -204,3 +204,10 @@ Pattern { replace broadcast_mul with CopyUserOpAttrs(broadcast_mul, scalar_mul); }; } + +Pattern { + let root = op<>(input: Value); + IsScalarMathOpAndCouldBeEliminate(root); + + replace root with input; +} diff --git a/oneflow/ir/lib/OneFlow/PDLL/OneFlowPDLLUtils.pdll b/oneflow/ir/lib/OneFlow/PDLL/OneFlowPDLLUtils.pdll index f69c44fea16..76d85255d56 100644 --- a/oneflow/ir/lib/OneFlow/PDLL/OneFlowPDLLUtils.pdll +++ b/oneflow/ir/lib/OneFlow/PDLL/OneFlowPDLLUtils.pdll @@ -22,3 +22,5 @@ Constraint IsScalarTensor(value: Value); Constraint IsScalarEqualSqrtDim(query_reshape: Value, scalar_div_operand: Attr); Constraint IsScalarEqualSqrtDimReciprocal(query_reshape: Value, scalar_div_operand: Attr); Rewrite GetReciprocal(a: Attr) -> Attr; + +Constraint IsScalarMathOpAndCouldBeEliminate(op: Op); From f1a439700b95d90e7d3e22679d6842b596e62f87 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 30 Aug 2023 12:18:06 +0000 Subject: [PATCH 30/65] refactor --- oneflow/user/kernels/quantization_kernel.cu | 166 +++++++++++--------- 1 file changed, 95 insertions(+), 71 deletions(-) diff --git 
a/oneflow/user/kernels/quantization_kernel.cu b/oneflow/user/kernels/quantization_kernel.cu index d940b2345a9..463aff202fa 100644 --- a/oneflow/user/kernels/quantization_kernel.cu +++ b/oneflow/user/kernels/quantization_kernel.cu @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "oneflow/core/cuda/elementwise.cuh" #include "oneflow/core/device/cuda_util.h" #include "oneflow/core/framework/framework.h" #include "oneflow/core/kernel/kernel_util.cuh" @@ -97,92 +98,109 @@ __global__ void QuantizationCambricon(const T* in_ptr, const T* shift, const int } } -template +template +__host__ __device__ int ModDiv(int64_t N) { + return N - (N / M * M); +} + +template<> +__host__ __device__ int ModDiv<2>(int64_t N) { + return N & 0x1; +} + +template<> +__host__ __device__ int ModDiv<4>(int64_t N) { + return N & 0x3; +} + +template<> +__host__ __device__ int ModDiv<8>(int64_t N) { + return N & 0x7; +} + +template<> +__host__ __device__ int ModDiv<16>(int64_t N) { + return N & 0xF; +} + +template __global__ void OFPerTensorQuantizationSymmetric(const int64_t elements, const T* in_ptr, const T* scale_ptr, const OutT upper_bound, const OutT lower_bound, OutT* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using StoreType = cuda::elementwise::PackType; + using StorePack = cuda::elementwise::Pack; + + int64_t tid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x * pack_size; float scale = *scale_ptr; - while (gid < elements) { - float in = in_ptr[gid]; - float out = nearbyint(in / scale); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? lower_bound : out; - out_ptr[gid] = static_cast(out); + for (int64_t idx = tid * pack_size; idx < elements; idx += step) { + StorePack out; + LoadPack in; + in.storage = reinterpret_cast(in_ptr + idx)[0]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + out.elem[i] = max(min(__float2int_rn(static_cast(in.elem[i]) / scale), upper_bound), + lower_bound); + } + reinterpret_cast(out_ptr + idx)[0] = out.storage; + } - gid += step; + int rest = ModDiv(elements); + + if (rest > 0 && tid == (gridDim.x * blockDim.x - 1)) { + in_ptr += elements - rest; + out_ptr += elements - rest; +#pragma unroll + for (int i = 0; i < rest; ++i) { + out_ptr[i] = + max(min(__float2int_rn(static_cast(in_ptr[i]) / scale), upper_bound), lower_bound); + } } } -template +template __global__ void OFPerTensorQuantizationAffine(const int64_t elements, const T* in_ptr, const T* scale_ptr, const OutT* zero_point_ptr, const OutT upper_bound, const OutT lower_bound, OutT* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using StoreType = cuda::elementwise::PackType; + using StorePack = cuda::elementwise::Pack; + + int64_t tid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x * pack_size; float scale = *scale_ptr; float zero_point = *zero_point_ptr; - while (gid < elements) { - float in = in_ptr[gid]; - float out = nearbyint(in / scale + zero_point); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? 
lower_bound : out; - out_ptr[gid] = static_cast(out); - - gid += step; + for (int64_t idx = tid * pack_size; idx < elements; idx += step) { + StorePack out; + LoadPack in; + in.storage = reinterpret_cast(in_ptr + idx)[0]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + out.elem[i] = + max(min(__float2int_rn(static_cast(in.elem[i]) / scale + zero_point), upper_bound), + lower_bound); + } + reinterpret_cast(out_ptr + idx)[0] = out.storage; } -} -struct __align__(8) Half4 { - half x; - half y; - half z; - half w; -}; - -struct __align__(4) Byte4 { - int8_t x; - int8_t y; - int8_t z; - int8_t w; -}; - -template<> -__global__ void OFPerTensorQuantizationAffine( - const int64_t elements, const half* in_ptr, const half* scale_ptr, const int8_t* zero_point_ptr, - const int8_t upper_bound, const int8_t lower_bound, int8_t* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; + int rest = ModDiv(elements); - float scale = *scale_ptr; - float zero_point = *zero_point_ptr; - - int64_t loops = elements >> 2; - for (; gid < loops; gid += step) { - Half4 in = reinterpret_cast(in_ptr)[gid]; - Byte4 out; - int x = __float2int_rn(static_cast(in.x) / scale + zero_point); - int y = __float2int_rn(static_cast(in.y) / scale + zero_point); - int z = __float2int_rn(static_cast(in.z) / scale + zero_point); - int w = __float2int_rn(static_cast(in.w) / scale + zero_point); - out.x = max(min(x, upper_bound), lower_bound); - out.y = max(min(y, upper_bound), lower_bound); - out.z = max(min(z, upper_bound), lower_bound); - out.w = max(min(w, upper_bound), lower_bound); - reinterpret_cast(out_ptr)[gid] = out; - } - int64_t offset = loops << 2; - if (offset < elements && gid == loops) { - for (; offset < elements; offset += 1) { - float in = in_ptr[offset]; - int out = __float2int_rn(in / scale + zero_point); - out_ptr[offset] = max(min(out, upper_bound), lower_bound); + if (rest > 0 && tid == (gridDim.x * blockDim.x - 1)) { + in_ptr += elements - rest; + out_ptr += elements - rest; +#pragma unroll + for (int i = 0; i < rest; ++i) { + out_ptr[i] = + max(min(__float2int_rn(static_cast(in_ptr[i]) / scale + zero_point), upper_bound), + lower_bound); } } } @@ -193,17 +211,23 @@ void ApplyOFPerTensorQuantization(user_op::KernelComputeContext* ctx, const int32_t quantization_bit, const user_op::Tensor* in, const user_op::Tensor* scale, const user_op::Tensor* zero_point, user_op::Tensor* out) { + constexpr int pack_size = cuda::elementwise::PackSize(); + const int64_t elements = in->shape_view().elem_cnt(); + int64_t pack_num = (elements + pack_size - 1) / pack_size; + int grid_size; + cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + OutT upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; OutT lower_bound = -upper_bound - 1; + auto stream = ctx->stream()->As()->cuda_stream(); if (quantization_scheme == "symmetric") { - RUN_CUDA_KERNEL((OFPerTensorQuantizationSymmetric), ctx->stream(), elements, elements, - in->dptr(), scale->dptr(), upper_bound, lower_bound, - out->mut_dptr()); + OFPerTensorQuantizationSymmetric<<>>( + elements, in->dptr(), scale->dptr(), upper_bound, lower_bound, out->mut_dptr()); } else { - RUN_CUDA_KERNEL((OFPerTensorQuantizationAffine), ctx->stream(), elements, elements, - in->dptr(), scale->dptr(), zero_point->dptr(), upper_bound, - lower_bound, out->mut_dptr()); + OFPerTensorQuantizationAffine<<>>( + elements, in->dptr(), scale->dptr(), zero_point->dptr(), upper_bound, + lower_bound, out->mut_dptr()); } } From 
a889fc59b7e3ea122ec6fa3ada6cc8a3cef468be Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Thu, 31 Aug 2023 07:32:47 +0000
Subject: [PATCH 31/65] fuse add to output completely

---
 .../job_rewriter/fuse_add_to_output_pass.cpp | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp b/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp
index 43682983a43..f3e7ef2e525 100644
--- a/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp
+++ b/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp
@@ -30,17 +30,20 @@ class FuseAddToOutputPass final : public JobPass {
   bool IsEnabled(const JobPassCtx& ctx) const {
     return ctx.job_desc().job_conf().enable_fuse_add_to_output();
   }
-  Maybe<void> Apply(const OpGraph& op_graph, JobBuilder* job_builder) const;
+  Maybe<bool> Apply(const OpGraph& op_graph, JobBuilder* job_builder) const;
 
   Maybe<void> Apply(Job* job, JobPassCtx* ctx) const override {
     if (!IsEnabled(*ctx)) { return Maybe<void>::Ok(); }
-    const OpGraph op_graph(*job);
-    JobBuilder job_builder(job);
-    return Apply(op_graph, &job_builder);
+    while (true) {
+      const OpGraph op_graph(*job);
+      JobBuilder job_builder(job);
+      if (!JUST(Apply(op_graph, &job_builder))) { break; }
+    }
+    return Maybe<void>::Ok();
   }
 };
 
-Maybe<void> FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_builder) const {
+Maybe<bool> FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_builder) const {
   const HashMap<std::string, user_op::OpArg> supported_op_type_name2output_arg(
       {{"normalization", user_op::OpArg("y", 0)},
        {"dropout", user_op::OpArg("out", 0)},
@@ -83,7 +86,6 @@ Maybe<void> FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_
       ctrl_in_op_names.insert(ctrl_in_op_name);
     }
   });
-
   auto IsReachable = op_graph.MakePredicatorIsOpNameDataOrCtrlReachable();
   std::vector<OperatorConf> delete_ops;
   HashSet<std::string> be_fused_op_names;
@@ -160,9 +162,12 @@ Maybe<void> FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_
     delete_ops.emplace_back(op_conf);
     return Maybe<void>::Ok();
   }));
+  if (delete_ops.empty()) {
+    return false;
+  }
   JUST(job_builder->MutOpTransactionCommit());
   job_builder->DelOps(delete_ops);
-  return Maybe<void>::Ok();
+  return true;
 }
 
 } // namespace

From a80b11d6d444bad3bd072e30ecddd4ceba015a19 Mon Sep 17 00:00:00 2001
From: clackhan
Date: Thu, 31 Aug 2023 09:54:57 +0000
Subject: [PATCH 32/65] add_quant_matmul

---
 cmake/third_party/cutlass-extension.cmake    |   2 +-
 oneflow/core/functional/functional_api.yaml  |   5 +
 oneflow/core/functional/impl/nn_functor.cpp  |  27 +++
 oneflow/ir/include/OneFlow/OneFlowUserOps.td |  22 ++
 oneflow/ir/lib/OneFlow/OneFlowSupport.cpp    |   7 +
 oneflow/user/kernels/matmul_quant_kernels.cu | 165 ++++++++++++++
 oneflow/user/ops/matmul_quant_op.cpp         | 221 +++++++++++++++++++
 python/oneflow/nn/functional/__init__.py     |   1 +
 8 files changed, 449 insertions(+), 1 deletion(-)
 create mode 100644 oneflow/user/kernels/matmul_quant_kernels.cu
 create mode 100644 oneflow/user/ops/matmul_quant_op.cpp

diff --git a/cmake/third_party/cutlass-extension.cmake b/cmake/third_party/cutlass-extension.cmake
index fbe3b0ff749..f3059d7584f 100644
--- a/cmake/third_party/cutlass-extension.cmake
+++ b/cmake/third_party/cutlass-extension.cmake
@@ -29,7 +29,7 @@ if(WITH_CUTLASS_EXTENSION)
     ${CUTLASS_EXTENSION_PROJECT}
     PREFIX cutlass-extension
     GIT_REPOSITORY https://github.com/Oneflow-Inc/oneflow-cutlass-extension.git
-    GIT_TAG master
+    GIT_TAG add_gemm_scale_bias_fusion
     UPDATE_COMMAND ""
     BUILD_BYPRODUCTS ${CUTLASS_EXTENSION_LIBRARIES}
     CMAKE_ARGS -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
diff --git
a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml
index 59f5d7789a9..13cd1c646cd 100644
--- a/oneflow/core/functional/functional_api.yaml
+++ b/oneflow/core/functional/functional_api.yaml
@@ -1058,6 +1058,11 @@
       String channel_pos="channels_first", DataType output_dtype=None) => Conv2dQuant'
   bind_python: True
 
+- name: "matmul_quant"
+  signature:
+    'Tensor (Tensor a, Tensor b, Tensor scale=None, Tensor bias=None, Bool transpose_b=True, DataType output_dtype=None) => MatmulQuant'
+  bind_python: True
+
 - name: "conv3d"
   signature:
     'Tensor (Tensor input, Tensor weight, Tensor bias=None, Int32List[3] stride=1,
diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp
index cd0d1540287..cf1a92d5c4f 100644
--- a/oneflow/core/functional/impl/nn_functor.cpp
+++ b/oneflow/core/functional/impl/nn_functor.cpp
@@ -378,6 +378,32 @@ class MatMulNoBroadCastFunctor {
   }
 };
 
+class MatMulQuantFunctor {
+ public:
+  MatMulQuantFunctor() {
+    matmul_op_ =
+        CHECK_JUST(one::OpBuilder("matmul_quant").Input("a").Input("b").Output("out").Build());
+    matmul_scale_bias__op_ =
+        CHECK_JUST(one::OpBuilder("matmul_quant").Input("a").Input("b").Input("scale").Input("bias").Output("out").Build());
+  }
+  Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& a,
+                           const std::shared_ptr<one::Tensor>& b,
+                           const Optional<one::Tensor>& scale, const Optional<one::Tensor>& bias,
+                           const bool& transpose_b,
+                           const Optional<Symbol<DType>>& output_dtype) const {
+    if (scale || bias) {
+      CHECK_OR_RETURN(scale && bias) << "scale and bias must both be given or not.";
+    }
+    auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("transpose_b", "out_dtype");
+    attrs.SetAllAttrs(transpose_b, output_dtype.value_or(DType::Float())->data_type());
+    if (scale) {
+      return OpInterpUtil::Dispatch<Tensor>(
+          *matmul_scale_bias__op_, {a, b, JUST(scale), JUST(bias)}, attrs);
+    }
+    return OpInterpUtil::Dispatch<Tensor>(*matmul_op_, {a, b}, attrs);
+  }
+
+ private:
+  std::shared_ptr<OpExpr> matmul_op_;
+  std::shared_ptr<OpExpr> matmul_scale_bias__op_;
+};
+
 class MatMulFunctor {
  public:
   MatMulFunctor() {
@@ -5497,6 +5523,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
   m.add_functor<impl::EmbeddingReNormFunctor>("EmbeddingReNorm");
   m.add_functor<impl::EmbeddingFunctor>("Embedding");
   m.add_functor<impl::MatMulFunctor>("MatMul");
+  m.add_functor<impl::MatMulQuantFunctor>("MatmulQuant");
   m.add_functor<impl::MatMulNoBroadCastFunctor>("MatMulNoBroadCast");
   m.add_functor<impl::BatchMatMulFunctor>("BatchMatMul");
   m.add_functor<impl::MatrixVectorProductFunctor>("MatrixVectorProduct");
diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td
index 844b06eb303..d3af206949d 100644
--- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td
+++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td
@@ -5375,6 +5375,28 @@ def OneFlow_ErfcGradOp : OneFlow_BaseOp<"erfc_grad", [NoMemoryEffect, DeclareOpI
   let has_data_type_infer_fn = 1;
 }
 
+def OneFlow_MatmulQuantOp : OneFlow_BaseOp<"matmul_quant", [NoMemoryEffect, AttrSizedOperandSegments, DeclareOpInterfaceMethods]> {
+  let input = (ins
+    OneFlow_Tensor:$a,
+    OneFlow_Tensor:$b,
+    Optional<OneFlow_Tensor>:$scale,
+    Optional<OneFlow_Tensor>:$bias,
+    Optional<OneFlow_Tensor>:$_add_to_output
+  );
+  let output = (outs
+    OneFlow_Tensor:$out
+  );
+  let attrs = (ins
+    DefaultValuedAttr<BoolAttr, "true">:$transpose_b,
+    OneFlow_DataType:$out_dtype
+  );
+  let has_logical_tensor_desc_infer_fn = 1;
+  let has_physical_tensor_desc_infer_fn = 1;
+  let has_get_sbp_fn = 1;
+  let has_data_type_infer_fn = 1;
+  let has_compute_complexity_fn = 1;
+}
+
 def OneFlow_MatmulOp : OneFlow_BaseOp<"matmul", [NoMemoryEffect, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> {
   let input = (ins
     OneFlow_Tensor:$a,
diff --git a/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp b/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp
index 6150ea83259..ff40c509210 100644 --- a/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp +++ b/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp @@ -129,6 +129,7 @@ void __DenseElementsAttrToTensor(const mlir::DenseElementsAttr dense_attr, std::vector data; std::vector<::oneflow::float16> fp16_data; + std::vector int8_data; void* dptr = nullptr; const size_t tensor_size = tensor->shape()->elem_cnt() * ::oneflow::GetSizeOfDataType(tensor->dtype()->data_type()); @@ -144,6 +145,12 @@ void __DenseElementsAttrToTensor(const mlir::DenseElementsAttr dense_attr, for (const T elem : dense_attr.getValues()) { data.push_back(elem); } CHECK_EQ(data.size() * sizeof(T), tensor_size); dptr = data.data(); + } else if (tensor->dtype()->data_type() == ::oneflow::DataType::kInt8){ + for (const T elem : dense_attr.getValues()) { + int8_data.push_back(static_cast(elem)); + } + CHECK_EQ(int8_data.size() * sizeof(int8_t), tensor_size); + dptr = int8_data.data(); } else { UNIMPLEMENTED(); } diff --git a/oneflow/user/kernels/matmul_quant_kernels.cu b/oneflow/user/kernels/matmul_quant_kernels.cu new file mode 100644 index 00000000000..6e2cc3fee00 --- /dev/null +++ b/oneflow/user/kernels/matmul_quant_kernels.cu @@ -0,0 +1,165 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#ifdef WITH_CUTLASS_EXTENSION
+
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/kernel/new_kernel_util.h"
+#include "oneflow/core/framework/config_def.h"
+#include "oneflow/core/kernel/cuda_graph_support.h"
+
+#include "cutlass/gemm/device/gemm.h"
+
+#include "cutlass/gemm/device/gemm_scale_bias_fusion.h"
+#include "cutlass/epilogue/thread/linear_combination_scale_bias.h"
+
+namespace oneflow {
+
+namespace {
+
+using RowMajor = cutlass::layout::RowMajor;     // row-major layout
+using ColMajor = cutlass::layout::ColumnMajor;  // column-major layout
+
+void cutlass_gemm_scale_bias_s8s8fp16(cudaStream_t stream, void* workspace, int m, int k, int n,
+                                      const int8_t* a_ptr, const int8_t* b_ptr,
+                                      const cutlass::half_t* scale, const cutlass::half_t* bias,
+                                      cutlass::half_t* output) {
+  using ElementA = int8_t;
+  using ElementB = int8_t;
+  using ElementC = cutlass::half_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using CutlassRowAColBRowCGemm = typename cutlass::gemm::device::GemmScaleBiasFusion<
+      ElementA,  // element type of matrix A
+      RowMajor,  // layout of matrix A
+      ElementB,  // element type of matrix B
+      ColMajor,  // layout of matrix B
+      ElementC,  // element type of matrix C
+      RowMajor, ElementAccumulator, cutlass::arch::OpClassTensorOp,
+      cutlass::arch::Sm75, cutlass::gemm::GemmShape<128, 128, 64>,
+      cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>,
+      cutlass::epilogue::thread::LinearCombinationScaleBias>;
+
+  CutlassRowAColBRowCGemm gemm_operator;
+  CutlassRowAColBRowCGemm::Arguments args(
+      {m, n, k},
+      {a_ptr, k},
+      {b_ptr, k},
+      {scale, 0},
+      {bias, 0},
+      {output, n}
+  );
+
+  cutlass::Status init_status = gemm_operator.initialize(args, workspace, stream);
+  CHECK(init_status == cutlass::Status::kSuccess);
+  auto run_status = gemm_operator(stream);  // run the GEMM
+  CHECK(run_status == cutlass::Status::kSuccess);
+  return;
+}
+
+void cutlass_gemm_scale_bias_s8s8fp32(cudaStream_t stream, void* workspace, int m, int k, int n,
+                                      const int8_t* a_ptr, const int8_t* b_ptr, const float* scale,
+                                      const float* bias, float* output) {
+  using ElementA = int8_t;
+  using ElementB = int8_t;
+  using ElementC = float;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using CutlassRowAColBRowCGemm = typename cutlass::gemm::device::GemmScaleBiasFusion<
+      ElementA,  // element type of matrix A
+      RowMajor,  // layout of matrix A
+      ElementB,  // element type of matrix B
+      ColMajor,  // layout of matrix B
+      ElementC,  // element type of matrix C
+      RowMajor, ElementAccumulator, cutlass::arch::OpClassTensorOp,
+      cutlass::arch::Sm75, cutlass::gemm::GemmShape<128, 128, 64>,
+      cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>,
+      cutlass::epilogue::thread::LinearCombinationScaleBias>;
+
+  CutlassRowAColBRowCGemm gemm_operator;
+  CutlassRowAColBRowCGemm::Arguments args(
+      {m, n, k},
+      {a_ptr, k},
+      {b_ptr, k},
+      {scale, 0},
+      {bias, 0},
+      {output, n}
+  );
+
+  cutlass::Status init_status = gemm_operator.initialize(args, workspace, stream);
+  CHECK(init_status == cutlass::Status::kSuccess);
+  auto run_status = gemm_operator(stream);  // run the GEMM
+  CHECK(run_status == cutlass::Status::kSuccess);
+  return;
+}
+
+}  // namespace
+
+class MatmulQuantKernel final : public user_op::OpKernel {
+ public:
+  MatmulQuantKernel() = default;
+  ~MatmulQuantKernel() = default;
+
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+
+ private:
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* a = ctx->Tensor4ArgNameAndIndex("a", 0);
+    const user_op::Tensor* b = ctx->Tensor4ArgNameAndIndex("b", 0);
+    const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0);
+    const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0);
+    const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0);
+    CHECK(add_to_output == nullptr);
+    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
+
+    CHECK(ctx->Attr<bool>("transpose_b"));
+
+    int64_t dim_a = a->shape_view().NumAxes();
+    const int m = a->shape_view().Count(0, dim_a - 1);
+    const int k = a->shape_view().At(dim_a - 1);
+    const int n = b->shape_view().At(0);
+
+    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
+
+    if (out->data_type() == DataType::kFloat) {
+      cutlass_gemm_scale_bias_s8s8fp32(
+          ctx->stream()->As<ep::CudaStream>()->cuda_stream(), tmp_buffer->mut_dptr(), m, k, n,
+          a->dptr<int8_t>(), b->dptr<int8_t>(), scale->dptr<float>(), bias->dptr<float>(),
+          out->mut_dptr<float>());
+    } else if (out->data_type() == DataType::kFloat16) {
+      cutlass_gemm_scale_bias_s8s8fp16(
+          ctx->stream()->As<ep::CudaStream>()->cuda_stream(), tmp_buffer->mut_dptr(), m, k, n,
+          a->dptr<int8_t>(), b->dptr<int8_t>(),
+          reinterpret_cast<const cutlass::half_t*>(scale->dptr()),
+          reinterpret_cast<const cutlass::half_t*>(bias->dptr()),
+          reinterpret_cast<cutlass::half_t*>(out->mut_dptr()));
+    }
+  }
+};
+
+REGISTER_USER_KERNEL("matmul_quant")
+    .SetCreateFn<MatmulQuantKernel>()
+    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
+                     && (user_op::HobDataType("a", 0) == DataType::kInt8)
+                     && (user_op::HobDataType("b", 0) == DataType::kInt8))
+    .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {
+      // use static workspace size
+      return 128 * 1024 * 1024;
+    })
+    .SetPriority(user_op::kKernelPriorityOptimized);
+
+}  // namespace oneflow
+
+#endif  // WITH_CUTLASS_EXTENSION
diff --git a/oneflow/user/ops/matmul_quant_op.cpp b/oneflow/user/ops/matmul_quant_op.cpp
new file mode 100644
index 00000000000..0f45a8cd842
--- /dev/null
+++ b/oneflow/user/ops/matmul_quant_op.cpp
@@ -0,0 +1,221 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/framework/op_generated.h" + +namespace oneflow { + +static const int kAlignment = 16; + +namespace { + +Maybe GetComputationCost(user_op::ComputeComplexityFnContext* ctx) { + bool transpose_b = ctx->Attr("transpose_b"); + const Shape& shape_b = ctx->Shape4ArgNameAndIndex("b", 0); + int64_t n = 0; + if (!transpose_b) { + n = shape_b.At(shape_b.NumAxes() - 1); + } else { + n = shape_b.At(shape_b.NumAxes() - 2); + } + + double logical_computation_cost = 2 * ctx->Shape4ArgNameAndIndex("a", 0).elem_cnt() * n; + const auto& nd_sbp_a = ctx->NdSbp4ArgNameAndIndex("a", 0); + const auto& nd_sbp_b = ctx->NdSbp4ArgNameAndIndex("b", 0); + const auto& parallel_hierarchy = ctx->parallel_desc().hierarchy(); + for (int32_t sbp_dim = 0; sbp_dim < nd_sbp_a.sbp_parallel_size(); sbp_dim++) { + if (nd_sbp_a.sbp_parallel(sbp_dim).has_split_parallel() + || nd_sbp_b.sbp_parallel(sbp_dim).has_split_parallel()) { + logical_computation_cost /= parallel_hierarchy->At(sbp_dim); + } + } + return logical_computation_cost; +} + +} + +// BroadcastMatmul + +/* static */ Maybe MatmulQuantOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { + bool transpose_b = ctx->Attr("transpose_b"); + CHECK_OR_RETURN(transpose_b); + + const user_op::TensorDesc& a = ctx->InputTensorDesc("a", 0); + const user_op::TensorDesc& b = ctx->InputTensorDesc("b", 0); + user_op::TensorDesc* out = ctx->MutOutputTensorDesc("out", 0); + + const int64_t num_a_dims = a.shape().NumAxes(); + const int64_t num_b_dims = b.shape().NumAxes(); + CHECK_OR_RETURN(num_a_dims == 2 || num_a_dims == 3); + CHECK_EQ_OR_RETURN(num_b_dims, 2); + int64_t m = 0; + int64_t n = 0; + int64_t k = 0; // tensor a (no trans): batch_dims*m*k, tensor b (no trans): batch_dims*k*n + m = a.shape().At(num_a_dims - 2); + k = a.shape().At(num_a_dims - 1); + if (!transpose_b) { + CHECK_EQ_OR_RETURN(k, b.shape().At(0)) + << "K dim should be equal to b.shape().At(0). "; + n = b.shape().At(1); + } else { + CHECK_EQ_OR_RETURN(k, b.shape().At(1)) + << "K dim should be equal to b.shape().At(1). 
"; + n = b.shape().At(0); + } + + CHECK_EQ_OR_RETURN(k % kAlignment, 0); + CHECK_EQ_OR_RETURN(n % kAlignment, 0); + + Shape output = ctx->InputShape("a", 0); + output.Set(num_a_dims - 2, m); + output.Set(num_a_dims - 1, n); + out->set_shape(Shape(output)); + + if (ctx->has_input("_add_to_output", 0)) { + const user_op::TensorDesc& add_to_output = ctx->InputTensorDesc("_add_to_output", 0); + CHECK_EQ_OR_RETURN(add_to_output.shape(), out->shape()); + } + return Maybe::Ok(); +} + +/*static*/ Maybe MatmulQuantOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { + return InferLogicalTensorDesc(ctx); +} + +/* static */ Maybe MatmulQuantOp::GetSbp(user_op::SbpContext* ctx) { + // (b, m, k) * (k, n) when transpose_b is false + // (b, m, k) * (n, k) when transpose_b is true + bool transpose_b = ctx->Attr("transpose_b"); + + const auto& a_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("a", 0).shape(); + const auto& b_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("b", 0).shape(); + + const int64_t a_num_axes = a_shape.NumAxes(); + const int64_t b_num_axes = b_shape.NumAxes(); + + int32_t m_a_axis = a_num_axes - 2; + int32_t k_a_axis = a_num_axes - 1; + int32_t k_b_axis = -1; + int32_t n_axis = -1; + + if (transpose_b) { + k_b_axis = b_num_axes - 1; + n_axis = b_num_axes - 2; + } else { + k_b_axis = b_num_axes - 2; + n_axis = b_num_axes - 1; + } + + bool has_bias = false; + for (const auto& pair : ctx->inputs()) { + if (pair.first == "bias") { + CHECK_EQ_OR_RETURN(0, pair.second); + has_bias = true; + break; + } + } + std::vector out_and_add_to_output_args; + out_and_add_to_output_args.emplace_back("out", 0); + + if (ctx->user_op_conf().has_input("_add_to_output", 0)) { + out_and_add_to_output_args.emplace_back("_add_to_output", 0); + } + + const int64_t max_num_axes = std::max(a_num_axes, b_num_axes); + + if (has_bias) { + // S(m axis) x B -> S(m axis) + ctx->NewBuilder() + .Split(user_op::OpArg("a", 0), m_a_axis) + .Broadcast(user_op::OpArg("b", 0)) + .Broadcast(user_op::OpArg("scale", 0)) + .Broadcast(user_op::OpArg("bias", 0)) + .Split(out_and_add_to_output_args, max_num_axes - 2) + .Build(); + // B x S(n_axis) -> S(n_axis) + ctx->NewBuilder() + .Broadcast(user_op::OpArg("a", 0)) + .Split(user_op::OpArg("b", 0), n_axis) + .Split(user_op::OpArg("scale", 0), 0) + .Split(user_op::OpArg("bias", 0), 0) + .Split(out_and_add_to_output_args, max_num_axes - 1) + .Build(); + // S(a_k_axis) x S(b_k_axis) -> P + ctx->NewBuilder() + .Split(user_op::OpArg("a", 0), k_a_axis) + .Split(user_op::OpArg("b", 0), k_b_axis) + .Broadcast(user_op::OpArg("scale", 0)) + .PartialSum(user_op::OpArg("bias", 0)) + .PartialSum(out_and_add_to_output_args) + .Build(); + // P x B -> P + ctx->NewBuilder() + .PartialSum(user_op::OpArg("a", 0)) + .Broadcast(user_op::OpArg("b", 0)) + .Broadcast(user_op::OpArg("scale", 0)) + .PartialSum(user_op::OpArg("bias", 0)) + .PartialSum(out_and_add_to_output_args) + .Build(); + } else { + // S(m axis) x B -> S(m axis) + ctx->NewBuilder() + .Split(user_op::OpArg("a", 0), m_a_axis) + .Broadcast(user_op::OpArg("b", 0)) + .Split(out_and_add_to_output_args, max_num_axes - 2) + .Build(); + + // B x S(n_axis) -> S(n_axis) + ctx->NewBuilder() + .Broadcast(user_op::OpArg("a", 0)) + .Split(user_op::OpArg("b", 0), n_axis) + .Split(out_and_add_to_output_args, max_num_axes - 1) + .Build(); + + // S(a_k_axis) x S(b_k_axis) -> P + ctx->NewBuilder() + .Split(user_op::OpArg("a", 0), k_a_axis) + .Split(user_op::OpArg("b", 0), k_b_axis) + .PartialSum(out_and_add_to_output_args) + .Build(); + + // P 
x B -> P + ctx->NewBuilder() + .PartialSum(user_op::OpArg("a", 0)) + .Broadcast(user_op::OpArg("b", 0)) + .PartialSum(out_and_add_to_output_args) + .Build(); + + // B x P -> P + ctx->NewBuilder() + .Broadcast(user_op::OpArg("a", 0)) + .PartialSum(user_op::OpArg("b", 0)) + .PartialSum(out_and_add_to_output_args) + .Build(); + } + return Maybe::Ok(); +} + +/* static */ Maybe MatmulQuantOp::InferDataType(user_op::InferContext* ctx) { + ctx->SetOutputDType("out", 0, ctx->Attr("out_dtype")); + return Maybe::Ok(); +} + +/*static*/ Maybe MatmulQuantOp::GetComputeComplexity( + user_op::ComputeComplexityFnContext* ctx) { + return GetComputationCost(ctx); +} + +} // namespace oneflow diff --git a/python/oneflow/nn/functional/__init__.py b/python/oneflow/nn/functional/__init__.py index 677536e035d..43851a59f72 100644 --- a/python/oneflow/nn/functional/__init__.py +++ b/python/oneflow/nn/functional/__init__.py @@ -21,6 +21,7 @@ from oneflow._C import conv2d from oneflow._C import conv3d from oneflow._C import conv2d_quant +from oneflow._C import matmul_quant from oneflow._C import deconv1d as conv_transpose1d from oneflow._C import deconv2d as conv_transpose2d from oneflow._C import deconv3d as conv_transpose3d From cd745cc5d2fb1f2ed91d3123924b7303e8f6ca32 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Fri, 1 Sep 2023 12:50:15 +0800 Subject: [PATCH 33/65] fix unused-but-set-variable error --- .../job_rewriter/cutlass_conv_tuning_warmup_pass.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp b/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp index 650620a0d26..79cd3d0fded 100644 --- a/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp +++ b/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp @@ -139,10 +139,6 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const size_t offset = x_size + w_size + y_size; void* bias_ptr = nullptr; if (bias_size != 0) { bias_ptr = buffer + offset; } - void* zero_point_ptr = nullptr; - if (zero_point_size) { zero_point_ptr = buffer + offset + bias_size; } - void* scale_ptr = nullptr; - if (scale_size) { scale_ptr = buffer + offset + bias_size + zero_point_size; } cutlass::conv::Conv2dProblemSize problem_size( n, h, w, c, k, r, s, p, q, padding_before.at(0), padding_before.at(1), strides.at(0), @@ -210,6 +206,11 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const stream->As(), key, configuraion, arguments, workspace, kMaxWorkspaceSize); } else { #ifdef WITH_CUTLASS_EXTENSION + void* zero_point_ptr = nullptr; + if (zero_point_size) { zero_point_ptr = buffer + offset + bias_size; } + void* scale_ptr = nullptr; + if (scale_size) { scale_ptr = buffer + offset + bias_size + zero_point_size; } + cutlass::library::ConvFunctionalKey key( cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop, cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, From 4e499e17c21cf1117d2a6a6d7e5e71d605521648 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Fri, 1 Sep 2023 11:03:22 +0000 Subject: [PATCH 34/65] conv2d_quant support add_to_output --- .../cutlass_conv_tuning_warmup_pass.cpp | 29 +++++++++++++++---- .../job_rewriter/fuse_add_to_output_pass.cpp | 5 ++-- oneflow/user/kernels/conv_quant_kernels.cu | 11 +++++-- .../user/kernels/cutlass_conv_tuner_impl.cpp | 15 +++++++--- oneflow/user/kernels/quantization_kernel.cu | 13 +++++---- 5 files changed, 54 insertions(+), 19 deletions(-) 
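Editor's note: PATCH 34 threads an optional residual input ("_add_to_output") through the quantized conv path, so the elementwise add is fused into the CUTLASS epilogue rather than run as a separate kernel afterwards. A scalar sketch of the fused epilogue math for one output element follows; it ignores the input zero-point correction for brevity, and the names are illustrative, not the CUTLASS Arguments API:

#include <cstddef>
#include <cstdint>
// One output element: int32 convolution accumulator mapped to a scaled,
// biased float output, with the optional residual folded in.
inline float FusedScaleBiasResidual(int32_t acc, float scale, float bias,
                                    const float* residual, std::size_t idx) {
  float y = static_cast<float>(acc) * scale + bias;  // dequantize and add bias
  if (residual != nullptr) { y += residual[idx]; }   // fused "_add_to_output"
  return y;
}

When the residual pointer is null the operation degenerates to the plain scale-bias epilogue, which is why the tuner below keeps a second operation table (residual_singleton) for the residual variants.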
diff --git a/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp b/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp index 650620a0d26..aee754d811b 100644 --- a/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp +++ b/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp @@ -115,6 +115,7 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const } size_t zero_point_size = 0; size_t scale_size = 0; + size_t add_to_output_size = 0; if (conv2d_op.has_input("in_zero_point", 0)) { zero_point_size = GetCudaAlignedSize( node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("in_zero_point", 0))) @@ -125,8 +126,14 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("scale", 0))) .ByteSizeOfBlobBody()); } + if (conv2d_op.has_input("_add_to_output", 0)) { + add_to_output_size = GetCudaAlignedSize( + node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("_add_to_output", 0))) + .ByteSizeOfBlobBody()); + } + const size_t total_buf_size = - x_size + w_size + y_size + bias_size + zero_point_size + scale_size; + x_size + w_size + y_size + bias_size + zero_point_size + scale_size + add_to_output_size; if (total_buf_size > buffer_size) { size_t malloc_size = RoundUp(total_buf_size, kBufferMallocAlign); OF_CUDA_CHECK(cudaFree(buffer)); @@ -139,10 +146,6 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const size_t offset = x_size + w_size + y_size; void* bias_ptr = nullptr; if (bias_size != 0) { bias_ptr = buffer + offset; } - void* zero_point_ptr = nullptr; - if (zero_point_size) { zero_point_ptr = buffer + offset + bias_size; } - void* scale_ptr = nullptr; - if (scale_size) { scale_ptr = buffer + offset + bias_size + zero_point_size; } cutlass::conv::Conv2dProblemSize problem_size( n, h, w, c, k, r, s, p, q, padding_before.at(0), padding_before.at(1), strides.at(0), @@ -210,6 +213,20 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const stream->As(), key, configuraion, arguments, workspace, kMaxWorkspaceSize); } else { #ifdef WITH_CUTLASS_EXTENSION + offset += bias_size; + void* zero_point_ptr = nullptr; + if (zero_point_size) { + zero_point_ptr = buffer + offset; + offset += zero_point_size; + } + void* scale_ptr = nullptr; + if (scale_size) { + scale_ptr = buffer + offset; + offset += scale_size; + } + void* add_to_output_ptr = nullptr; + if (add_to_output_size) { add_to_output_ptr = buffer + offset; } + cutlass::library::ConvFunctionalKey key( cutlass::library::Provider::kCUTLASS, cutlass::library::ConvKind::kFprop, cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC, @@ -230,6 +247,7 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const configuraion.problem_size = problem_size; configuraion.stride_a = {c, w * c, h * w * c}; configuraion.stride_b = {c, s * c, r * s * c}; + configuraion.stride_residual = {k, q * k, p * q * k}; cutlass::library::ConvScaleBiasFusionArguments arguments; arguments.A = x_ptr; @@ -238,6 +256,7 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const arguments.P = zero_point_ptr; arguments.Scale = scale_ptr; arguments.Bias = bias_ptr; + arguments.Residual = add_to_output_ptr; arguments.D = y_ptr; operation = CutlassConvTuner().FindConv2dOperation( diff --git a/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp b/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp index f3e7ef2e525..2b715c83df1 100644 --- 
a/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp +++ b/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp @@ -48,6 +48,7 @@ Maybe FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_ {{"normalization", user_op::OpArg("y", 0)}, {"dropout", user_op::OpArg("out", 0)}, {"matmul", user_op::OpArg("out", 0)}, + {"conv2d_quant", user_op::OpArg("out", 0)}, {"layer_norm_grad", user_op::OpArg("dx", 0)}, {"batch_matmul", user_op::OpArg("out", 0)}, {"fused_bias_add_mask_scale", user_op::OpArg("out", 0)}, @@ -162,9 +163,7 @@ Maybe FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_ delete_ops.emplace_back(op_conf); return Maybe::Ok(); })); - if (delete_ops.empty()) { - return false; - } + if (delete_ops.empty()) { return false; } JUST(job_builder->MutOpTransactionCommit()); job_builder->DelOps(delete_ops); return true; diff --git a/oneflow/user/kernels/conv_quant_kernels.cu b/oneflow/user/kernels/conv_quant_kernels.cu index 8402aec059b..402f8396ad3 100644 --- a/oneflow/user/kernels/conv_quant_kernels.cu +++ b/oneflow/user/kernels/conv_quant_kernels.cu @@ -74,6 +74,7 @@ void LaunchConv2dQuantScaleBiasFusionOp(user_op::KernelComputeContext* ctx, const user_op::Tensor* in, const user_op::Tensor* weight, const user_op::Tensor* in_zero_point, const user_op::Tensor* scale, const user_op::Tensor* bias, + const user_op::Tensor* add_to_output, user_op::Tensor* out) { cutlass::library::Conv2dScaleBiasFusionConfiguration configuraion; configuraion.split_k_mode = cutlass::conv::SplitKMode::kSerial; @@ -82,6 +83,8 @@ void LaunchConv2dQuantScaleBiasFusionOp(user_op::KernelComputeContext* ctx, problem_size.H * problem_size.W * problem_size.C}; configuraion.stride_b = {problem_size.C, problem_size.S * problem_size.C, problem_size.R * problem_size.S * problem_size.C}; + configuraion.stride_residual = {problem_size.K, problem_size.Q * problem_size.K, + problem_size.P * problem_size.Q * problem_size.K}; cutlass::library::ConvScaleBiasFusionArguments arguments; arguments.A = in->dptr(); @@ -90,6 +93,11 @@ void LaunchConv2dQuantScaleBiasFusionOp(user_op::KernelComputeContext* ctx, arguments.P = in_zero_point->dptr(); arguments.Scale = scale->dptr(); arguments.Bias = bias->dptr(); + if (add_to_output) { + arguments.Residual = add_to_output->dptr(); + } else { + arguments.Residual = nullptr; + } arguments.D = out->mut_dptr(); LaunchConvQuantOpImpl(ctx, key, configuraion, arguments); @@ -112,7 +120,6 @@ class Conv2dQuantKernel final : public user_op::OpKernel, public user_op::CudaGr const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - CHECK(add_to_output == nullptr); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); @@ -152,7 +159,7 @@ class Conv2dQuantKernel final : public user_op::OpKernel, public user_op::CudaGr cutlass::conv::Mode::kCrossCorrelation); if (scale) { LaunchConv2dQuantScaleBiasFusionOp(ctx, key, problem_size, in, weight, in_zero_point, scale, - bias, out); + bias, add_to_output, out); } else { UNIMPLEMENTED(); } diff --git a/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp b/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp index ced2060ecea..39824480c78 100644 --- a/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp +++ b/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp @@ -291,6 +291,8 @@ class CutlassConvTunerImpl cache; const cutlass::library::CutlassExtensionSingleton* 
singleton; + const cutlass::library::CutlassExtensionSingleton* residual_singleton; }; const cutlass::library::Operation* @@ -360,19 +363,22 @@ CutlassConvTunerImplcuda_arch()); + const cutlass::library::Operation* fastest_operation = + FindFastestOperation((benchmark_arguments.Residual ? residual_singleton : singleton), + functional_key, configuraion, benchmark_arguments, benchmark_workspace, + workspace_size, benchmark_stream, stream->cuda_arch()); #ifdef WITH_CUDA_GRAPHS if (stream->IsGraphCapturing()) { @@ -383,6 +389,7 @@ CutlassConvTunerImpl(benchmark_arguments.P))); OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.Scale))); OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.Bias))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.Residual))); OF_CUDA_CHECK(cudaFree(benchmark_arguments.D)); OF_CUDA_CHECK(cudaFree(benchmark_workspace)); OF_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); diff --git a/oneflow/user/kernels/quantization_kernel.cu b/oneflow/user/kernels/quantization_kernel.cu index 463aff202fa..1a06ab92681 100644 --- a/oneflow/user/kernels/quantization_kernel.cu +++ b/oneflow/user/kernels/quantization_kernel.cu @@ -222,12 +222,15 @@ void ApplyOFPerTensorQuantization(user_op::KernelComputeContext* ctx, OutT lower_bound = -upper_bound - 1; auto stream = ctx->stream()->As()->cuda_stream(); if (quantization_scheme == "symmetric") { - OFPerTensorQuantizationSymmetric<<>>( - elements, in->dptr(), scale->dptr(), upper_bound, lower_bound, out->mut_dptr()); + OFPerTensorQuantizationSymmetric + <<>>( + elements, in->dptr(), scale->dptr(), upper_bound, lower_bound, + out->mut_dptr()); } else { - OFPerTensorQuantizationAffine<<>>( - elements, in->dptr(), scale->dptr(), zero_point->dptr(), upper_bound, - lower_bound, out->mut_dptr()); + OFPerTensorQuantizationAffine + <<>>( + elements, in->dptr(), scale->dptr(), zero_point->dptr(), upper_bound, + lower_bound, out->mut_dptr()); } } From 63b449cf8a6c5351fc334a4fe71a99d8c2bc1e6f Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Fri, 1 Sep 2023 12:58:06 +0000 Subject: [PATCH 35/65] fix and reformat --- oneflow/core/functional/impl/nn_functor.cpp | 23 +++-- oneflow/ir/lib/OneFlow/OneFlowSupport.cpp | 2 +- .../user/kernels/cutlass_conv_tuner_impl.cpp | 5 +- oneflow/user/kernels/matmul_quant_kernels.cu | 92 +++++++++---------- oneflow/user/ops/matmul_quant_op.cpp | 10 +- 5 files changed, 66 insertions(+), 66 deletions(-) diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index cf1a92d5c4f..0feb2e7e5eb 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -381,24 +381,33 @@ class MatMulNoBroadCastFunctor { class MatMulQuantFunctor { public: MatMulQuantFunctor() { - matmul_op_ = CHECK_JUST(one::OpBuilder("matmul_quant").Input("a").Input("b").Output("out").Build()); - matmul_scale_bias__op_ = - CHECK_JUST(one::OpBuilder("matmul_quant").Input("a").Input("b").Input("scale").Input("bias").Output("out").Build()); + matmul_op_ = + CHECK_JUST(one::OpBuilder("matmul_quant").Input("a").Input("b").Output("out").Build()); + matmul_scale_bias__op_ = CHECK_JUST(one::OpBuilder("matmul_quant") + .Input("a") + .Input("b") + .Input("scale") + .Input("bias") + .Output("out") + .Build()); } - Maybe operator()(const std::shared_ptr& a, const std::shared_ptr& b, + Maybe operator()(const std::shared_ptr& a, + const std::shared_ptr& b, const Optional& scale, const Optional& bias, - const bool& transpose_b, const Optional>& output_dtype) const 
{ + const bool& transpose_b, + const Optional>& output_dtype) const { if (scale || bias) { CHECK_OR_RETURN(scale && bias) << "scale and bias must both be given or not."; } auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("transpose_b", "out_dtype"); attrs.SetAllAttrs(transpose_b, output_dtype.value_or(DType::Float())->data_type()); if (scale) { - return OpInterpUtil::Dispatch( - *matmul_scale_bias__op_, {a, b, JUST(scale), JUST(bias)}, attrs); + return OpInterpUtil::Dispatch(*matmul_scale_bias__op_, + {a, b, JUST(scale), JUST(bias)}, attrs); } return OpInterpUtil::Dispatch(*matmul_op_, {a, b}, attrs); } + private: std::shared_ptr matmul_op_; std::shared_ptr matmul_scale_bias__op_; diff --git a/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp b/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp index ff40c509210..60236ff4e0e 100644 --- a/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp +++ b/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp @@ -145,7 +145,7 @@ void __DenseElementsAttrToTensor(const mlir::DenseElementsAttr dense_attr, for (const T elem : dense_attr.getValues()) { data.push_back(elem); } CHECK_EQ(data.size() * sizeof(T), tensor_size); dptr = data.data(); - } else if (tensor->dtype()->data_type() == ::oneflow::DataType::kInt8){ + } else if (tensor->dtype()->data_type() == ::oneflow::DataType::kInt8) { for (const T elem : dense_attr.getValues()) { int8_data.push_back(static_cast(elem)); } diff --git a/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp b/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp index 39824480c78..1d3a65b5cae 100644 --- a/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp +++ b/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp @@ -412,8 +412,9 @@ CutlassConvTunerImplcuda_stream(), stream->cuda_arch()); + return GetOperation((arguments.Residual ? residual_singleton : singleton), name, functional_key, + configuraion, arguments, workspace, workspace_size, stream->cuda_stream(), + stream->cuda_arch()); } #endif // WITH_CUTLASS_EXTENSION diff --git a/oneflow/user/kernels/matmul_quant_kernels.cu b/oneflow/user/kernels/matmul_quant_kernels.cu index 6e2cc3fee00..4af18df7d5d 100644 --- a/oneflow/user/kernels/matmul_quant_kernels.cu +++ b/oneflow/user/kernels/matmul_quant_kernels.cu @@ -29,12 +29,13 @@ namespace oneflow { namespace { -using RowMajor = cutlass::layout::RowMajor; // 行主序存储方式 -using ColMajor = cutlass::layout::ColumnMajor; // 列主序存储方式 +using RowMajor = cutlass::layout::RowMajor; // 行主序存储方式 +using ColMajor = cutlass::layout::ColumnMajor; // 列主序存储方式 -void cutlass_gemm_scale_bias_s8s8fp16(cudaStream_t stream, void *workspace, int m, int k, int n, - const int8_t* a_ptr, const int8_t* b_ptr, const cutlass::half_t* scale, const cutlass::half_t* bias, cutlass::half_t* output) { - +void cutlass_gemm_scale_bias_s8s8fp16(cudaStream_t stream, void* workspace, int m, int k, int n, + const int8_t* a_ptr, const int8_t* b_ptr, + const cutlass::half_t* scale, const cutlass::half_t* bias, + cutlass::half_t* output) { using ElementA = int8_t; using ElementB = int8_t; using ElementC = cutlass::half_t; @@ -43,36 +44,30 @@ void cutlass_gemm_scale_bias_s8s8fp16(cudaStream_t stream, void *workspace, int using CutlassRowAColBRowCGemm = typename cutlass::gemm::device::GemmScaleBiasFusion< ElementA, // A矩阵数据类型 - RowMajor, // A矩阵存储方式 + RowMajor, // A矩阵存储方式 ElementB, // B矩阵数据类型 - ColMajor, // B矩阵存储方式 - ElementC, // C矩阵数据类型 - RowMajor, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 
16>, + ColMajor, // layout of matrix B + ElementC, // data type of matrix C + RowMajor, ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::LinearCombinationScaleBias>; + ElementCompute>>; CutlassRowAColBRowCGemm gemm_operator; - CutlassRowAColBRowCGemm::Arguments args( - {m, n, k}, - {a_ptr, k}, - {b_ptr, k}, - {scale, 0}, - {bias, 0}, - {output, n} - ); + CutlassRowAColBRowCGemm::Arguments args({m, n, k}, {a_ptr, k}, {b_ptr, k}, {scale, 0}, {bias, 0}, + {output, n}); cutlass::Status init_status = gemm_operator.initialize(args, workspace, stream); CHECK(init_status == cutlass::Status::kSuccess); - auto run_status = gemm_operator(stream); // run the GEMM + auto run_status = gemm_operator(stream); // run the GEMM CHECK(run_status == cutlass::Status::kSuccess); return; } -void cutlass_gemm_scale_bias_s8s8fp32(cudaStream_t stream, void *workspace, int m, int k, int n, - const int8_t* a_ptr, const int8_t* b_ptr, const float* scale, const float* bias, float* output) { - +void cutlass_gemm_scale_bias_s8s8fp32(cudaStream_t stream, void* workspace, int m, int k, int n, + const int8_t* a_ptr, const int8_t* b_ptr, const float* scale, + const float* bias, float* output) { using ElementA = int8_t; using ElementB = int8_t; using ElementC = float; @@ -81,29 +76,23 @@ void cutlass_gemm_scale_bias_s8s8fp32(cudaStream_t stream, void *workspace, int using CutlassRowAColBRowCGemm = typename cutlass::gemm::device::GemmScaleBiasFusion< ElementA, // data type of matrix A - RowMajor, // layout of matrix A + RowMajor, // layout of matrix A ElementB, // data type of matrix B - ColMajor, // layout of matrix B - ElementC, // data type of matrix C - RowMajor, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, + ColMajor, // layout of matrix B + ElementC, // data type of matrix C + RowMajor, ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::LinearCombinationScaleBias>; + ElementCompute>>; CutlassRowAColBRowCGemm gemm_operator; - CutlassRowAColBRowCGemm::Arguments args( - {m, n, k}, - {a_ptr, k}, - {b_ptr, k}, - {scale, 0}, - {bias, 0}, - {output, n} - ); + CutlassRowAColBRowCGemm::Arguments args({m, n, k}, {a_ptr, k}, {b_ptr, k}, {scale, 0}, {bias, 0}, + {output, n}); cutlass::Status init_status = gemm_operator.initialize(args, workspace, stream); CHECK(init_status == cutlass::Status::kSuccess); - auto run_status = gemm_operator(stream); // run the GEMM + auto run_status = gemm_operator(stream); // run the GEMM CHECK(run_status == cutlass::Status::kSuccess); return; } @@ -137,21 +126,24 @@ class MatmulQuantKernel final : public user_op::OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); if (out->data_type() == DataType::kFloat) { - cutlass_gemm_scale_bias_s8s8fp32( - ctx->stream()->As()->cuda_stream(), tmp_buffer->mut_dptr(), m, k, n, - a->dptr(), b->dptr(), scale->dptr(), bias->dptr(), - out->mut_dptr()); + cutlass_gemm_scale_bias_s8s8fp32(ctx->stream()->As()->cuda_stream(), + tmp_buffer->mut_dptr(), m, k, n, a->dptr(), + b->dptr(), scale->dptr(), bias->dptr(), + out->mut_dptr()); } else if (out->data_type() == DataType::kFloat16) { - cutlass_gemm_scale_bias_s8s8fp16( - ctx->stream()->As()->cuda_stream(), tmp_buffer->mut_dptr(), m, k, n, - a->dptr(), b->dptr(),
reinterpret_cast(scale->dptr()), - reinterpret_cast(bias->dptr()), reinterpret_cast(out->mut_dptr())); + cutlass_gemm_scale_bias_s8s8fp16(ctx->stream()->As()->cuda_stream(), + tmp_buffer->mut_dptr(), m, k, n, a->dptr(), + b->dptr(), + reinterpret_cast(scale->dptr()), + reinterpret_cast(bias->dptr()), + reinterpret_cast(out->mut_dptr())); } } }; REGISTER_USER_KERNEL("matmul_quant") - .SetCreateFn().SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) + .SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) && (user_op::HobDataType("a", 0) == DataType::kInt8) && (user_op::HobDataType("b", 0) == DataType::kInt8)) .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { @@ -162,4 +154,4 @@ REGISTER_USER_KERNEL("matmul_quant") } // namespace oneflow -#endif // WITH_CUTLASS_EXTENSION +#endif // WITH_CUTLASS_EXTENSION diff --git a/oneflow/user/ops/matmul_quant_op.cpp b/oneflow/user/ops/matmul_quant_op.cpp index 0f45a8cd842..4f8d9b77ef3 100644 --- a/oneflow/user/ops/matmul_quant_op.cpp +++ b/oneflow/user/ops/matmul_quant_op.cpp @@ -45,7 +45,7 @@ Maybe GetComputationCost(user_op::ComputeComplexityFnContext* ctx) { return logical_computation_cost; } -} +} // namespace // BroadcastMatmul @@ -67,12 +67,10 @@ Maybe GetComputationCost(user_op::ComputeComplexityFnContext* ctx) { m = a.shape().At(num_a_dims - 2); k = a.shape().At(num_a_dims - 1); if (!transpose_b) { - CHECK_EQ_OR_RETURN(k, b.shape().At(0)) - << "K dim should be equal to b.shape().At(0). "; + CHECK_EQ_OR_RETURN(k, b.shape().At(0)) << "K dim should be equal to b.shape().At(0). "; n = b.shape().At(1); } else { - CHECK_EQ_OR_RETURN(k, b.shape().At(1)) - << "K dim should be equal to b.shape().At(1). "; + CHECK_EQ_OR_RETURN(k, b.shape().At(1)) << "K dim should be equal to b.shape().At(1). 
"; n = b.shape().At(0); } @@ -129,7 +127,7 @@ Maybe GetComputationCost(user_op::ComputeComplexityFnContext* ctx) { } std::vector out_and_add_to_output_args; out_and_add_to_output_args.emplace_back("out", 0); - + if (ctx->user_op_conf().has_input("_add_to_output", 0)) { out_and_add_to_output_args.emplace_back("_add_to_output", 0); } From eef55aa13825f35cf31dc9cfc2f0326bf9ab9946 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Fri, 1 Sep 2023 15:05:50 +0000 Subject: [PATCH 36/65] update cutlass extension branch --- cmake/third_party/cutlass-extension.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/third_party/cutlass-extension.cmake b/cmake/third_party/cutlass-extension.cmake index f3059d7584f..fbe3b0ff749 100644 --- a/cmake/third_party/cutlass-extension.cmake +++ b/cmake/third_party/cutlass-extension.cmake @@ -29,7 +29,7 @@ if(WITH_CUTLASS_EXTENSION) ${CUTLASS_EXTENSION_PROJECT} PREFIX cutlass-extension GIT_REPOSITORY https://github.com/Oneflow-Inc/oneflow-cutlass-extension.git - GIT_TAG add_gemm_scale_bias_fusion + GIT_TAG master UPDATE_COMMAND "" BUILD_BYPRODUCTS ${CUTLASS_EXTENSION_LIBRARIES} CMAKE_ARGS -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} From d62f6e4d587376683473bba00a58e6636454ebe6 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Sat, 2 Sep 2023 09:16:58 +0000 Subject: [PATCH 37/65] matmul_quant supports add_to_output --- oneflow/core/functional/functional_api.yaml | 4 +- oneflow/core/functional/impl/nn_functor.cpp | 32 +++-- .../job_rewriter/fuse_add_to_output_pass.cpp | 1 + oneflow/ir/include/OneFlow/OneFlowUserOps.td | 4 +- oneflow/user/kernels/matmul_quant_kernels.cu | 133 ++++++++---------- oneflow/user/ops/conv_quant_op.cpp | 12 +- oneflow/user/ops/matmul_quant_op.cpp | 123 +++++++--------- 7 files changed, 142 insertions(+), 167 deletions(-) diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 13cd1c646cd..66b642cb3bf 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -1060,7 +1060,9 @@ - name: "matmul_quant" signature: - 'Tensor (Tensor a, Tensor b, Tensor scale=None, Tensor bias=None, Bool transpose_b=True, DataType output_dtype=None) => MatmulQuant' + 'Tensor (Tensor a, Tensor b, Tensor scale=None, Tensor bias=None, + Bool transpose_a=False, Bool transpose_b=False, + Double alpha=1.0, DataType output_dtype=None) => MatmulQuant' bind_python: True - name: "conv3d" diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 0feb2e7e5eb..0f571a18d2c 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -383,34 +383,40 @@ class MatMulQuantFunctor { MatMulQuantFunctor() { matmul_op_ = CHECK_JUST(one::OpBuilder("matmul_quant").Input("a").Input("b").Output("out").Build()); - matmul_scale_bias__op_ = CHECK_JUST(one::OpBuilder("matmul_quant") - .Input("a") - .Input("b") - .Input("scale") - .Input("bias") - .Output("out") - .Build()); + matmul_scale_bias_op_ = CHECK_JUST(one::OpBuilder("matmul_quant") + .Input("a") + .Input("b") + .Input("scale") + .Input("bias") + .Output("out") + .Build()); } Maybe operator()(const std::shared_ptr& a, const std::shared_ptr& b, const Optional& scale, const Optional& bias, - const bool& transpose_b, + const bool& transpose_a, const bool& transpose_b, const double& alpha, const Optional>& output_dtype) const { + CHECK_OR_RETURN(!transpose_a) + << "the first input should not be transposed for quantized 
matmul."; + CHECK_OR_RETURN(transpose_b) << "the second input should be transposed for quantized matmul."; + CHECK_EQ_OR_RETURN(alpha, 1) << "alpha should be 1 for quantized matmul."; if (scale || bias) { CHECK_OR_RETURN(scale && bias) << "scale and bias must both be given or not."; } - auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("transpose_b", "out_dtype"); - attrs.SetAllAttrs(transpose_b, output_dtype.value_or(DType::Float())->data_type()); + auto& attrs = + THREAD_CACHED_MUTABLE_ATTR_MAP("transpose_a", "transpose_b", "alpha", "out_dtype"); + attrs.SetAllAttrs(transpose_a, transpose_b, alpha, + output_dtype.value_or(DType::Float())->data_type()); if (scale) { - return OpInterpUtil::Dispatch(*matmul_scale_bias__op_, - {a, b, JUST(scale), JUST(bias)}, attrs); + return OpInterpUtil::Dispatch(*matmul_scale_bias_op_, {a, b, JUST(scale), JUST(bias)}, + attrs); } return OpInterpUtil::Dispatch(*matmul_op_, {a, b}, attrs); } private: std::shared_ptr matmul_op_; - std::shared_ptr matmul_scale_bias__op_; + std::shared_ptr matmul_scale_bias_op_; }; class MatMulFunctor { diff --git a/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp b/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp index 2b715c83df1..98b5f029927 100644 --- a/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp +++ b/oneflow/core/job_rewriter/fuse_add_to_output_pass.cpp @@ -48,6 +48,7 @@ Maybe FuseAddToOutputPass::Apply(const OpGraph& op_graph, JobBuilder* job_ {{"normalization", user_op::OpArg("y", 0)}, {"dropout", user_op::OpArg("out", 0)}, {"matmul", user_op::OpArg("out", 0)}, + {"matmul_quant", user_op::OpArg("out", 0)}, {"conv2d_quant", user_op::OpArg("out", 0)}, {"layer_norm_grad", user_op::OpArg("dx", 0)}, {"batch_matmul", user_op::OpArg("out", 0)}, diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index d3af206949d..70102086352 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -5387,7 +5387,9 @@ def OneFlow_MatmulQuantOp : OneFlow_BaseOp<"matmul_quant", [NoMemoryEffect, Attr OneFlow_Tensor:$out ); let attrs = (ins - DefaultValuedAttr:$transpose_b, + DefaultValuedAttr:$transpose_a, + DefaultValuedAttr:$transpose_b, + DefaultValuedAttr:$alpha, OneFlow_DataType:$out_dtype ); let has_logical_tensor_desc_infer_fn = 1; diff --git a/oneflow/user/kernels/matmul_quant_kernels.cu b/oneflow/user/kernels/matmul_quant_kernels.cu index 4af18df7d5d..1c2f25344c8 100644 --- a/oneflow/user/kernels/matmul_quant_kernels.cu +++ b/oneflow/user/kernels/matmul_quant_kernels.cu @@ -25,76 +25,63 @@ limitations under the License. 
#include "cutlass/gemm/device/gemm_scale_bias_fusion.h" #include "cutlass/epilogue/thread/linear_combination_scale_bias.h" +#include "cutlass/gemm/device/gemm_scale_bias_residual_fusion.h" +#include "cutlass/epilogue/thread/linear_combination_scale_bias_residual.h" + namespace oneflow { namespace { -using RowMajor = cutlass::layout::RowMajor; // 行主序存储方式 -using ColMajor = cutlass::layout::ColumnMajor; // 列主序存储方式 - -void cutlass_gemm_scale_bias_s8s8fp16(cudaStream_t stream, void* workspace, int m, int k, int n, - const int8_t* a_ptr, const int8_t* b_ptr, - const cutlass::half_t* scale, const cutlass::half_t* bias, - cutlass::half_t* output) { - using ElementA = int8_t; - using ElementB = int8_t; - using ElementC = cutlass::half_t; - using ElementAccumulator = int32_t; - using ElementCompute = float; - - using CutlassRowAColBRowCGemm = typename cutlass::gemm::device::GemmScaleBiasFusion< - ElementA, // A矩阵数据类型 - RowMajor, // A矩阵存储方式 - ElementB, // B矩阵数据类型 - ColMajor, // B矩阵存储方式 - ElementC, // C矩阵数据类型 - RowMajor, ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombinationScaleBias>; - - CutlassRowAColBRowCGemm gemm_operator; - CutlassRowAColBRowCGemm::Arguments args({m, n, k}, {a_ptr, k}, {b_ptr, k}, {scale, 0}, {bias, 0}, - {output, n}); - - cutlass::Status init_status = gemm_operator.initialize(args, workspace, stream); - CHECK(init_status == cutlass::Status::kSuccess); - auto run_status = gemm_operator(stream); //运行Gemm - CHECK(run_status == cutlass::Status::kSuccess); - return; -} +using RowMajor = cutlass::layout::RowMajor; +using ColMajor = cutlass::layout::ColumnMajor; -void cutlass_gemm_scale_bias_s8s8fp32(cudaStream_t stream, void* workspace, int m, int k, int n, - const int8_t* a_ptr, const int8_t* b_ptr, const float* scale, - const float* bias, float* output) { - using ElementA = int8_t; - using ElementB = int8_t; - using ElementC = float; +template +void cutlass_gemm_scale_bias_s8(cudaStream_t stream, void* workspace, int m, int n, int k, + const void* a, const void* b, const void* scale, const void* bias, + const void* residual, void* output) { + using ElementA = T; + using ElementB = T; + using ElementC = OutT; using ElementAccumulator = int32_t; using ElementCompute = float; - using CutlassRowAColBRowCGemm = typename cutlass::gemm::device::GemmScaleBiasFusion< - ElementA, // A矩阵数据类型 - RowMajor, // A矩阵存储方式 - ElementB, // B矩阵数据类型 - ColMajor, // B矩阵存储方式 - ElementC, // C矩阵数据类型 - RowMajor, ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombinationScaleBias>; - - CutlassRowAColBRowCGemm gemm_operator; - CutlassRowAColBRowCGemm::Arguments args({m, n, k}, {a_ptr, k}, {b_ptr, k}, {scale, 0}, {bias, 0}, - {output, n}); - - cutlass::Status init_status = gemm_operator.initialize(args, workspace, stream); - CHECK(init_status == cutlass::Status::kSuccess); - auto run_status = gemm_operator(stream); //运行Gemm - CHECK(run_status == cutlass::Status::kSuccess); - return; + if (!residual) { + using CutlassRowAColBRowCGemm = typename cutlass::gemm::device::GemmScaleBiasFusion< + ElementA, RowMajor, ElementB, ColMajor, ElementC, RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, cutlass::gemm::GemmShape<128, 128, 
64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationScaleBias>; + + CutlassRowAColBRowCGemm gemm_operator; + typename CutlassRowAColBRowCGemm::Arguments args( + {m, n, k}, {reinterpret_cast(a), k}, {reinterpret_cast(b), k}, + {reinterpret_cast(scale), 0}, {reinterpret_cast(bias), 0}, + {reinterpret_cast(output), n}); + + cutlass::Status init_status = gemm_operator.initialize(args, workspace, stream); + CHECK(init_status == cutlass::Status::kSuccess); + auto run_status = gemm_operator(stream); + CHECK(run_status == cutlass::Status::kSuccess); + } else { + using CutlassRowAColBRowCGemm = typename cutlass::gemm::device::GemmScaleBiasResidualFusion< + ElementA, RowMajor, ElementB, ColMajor, ElementC, RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationScaleBiasResidual< + ElementC, 8, ElementAccumulator, ElementCompute, cutlass::plus>>; + + CutlassRowAColBRowCGemm gemm_operator; + typename CutlassRowAColBRowCGemm::Arguments args( + {m, n, k}, {reinterpret_cast(a), k}, {reinterpret_cast(b), k}, + {reinterpret_cast(scale), 0}, {reinterpret_cast(bias), 0}, + {reinterpret_cast(residual), n}, {reinterpret_cast(output), n}); + + cutlass::Status init_status = gemm_operator.initialize(args, workspace, stream); + CHECK(init_status == cutlass::Status::kSuccess); + auto run_status = gemm_operator(stream); + CHECK(run_status == cutlass::Status::kSuccess); + } } } // namespace @@ -107,15 +94,17 @@ class MatmulQuantKernel final : public user_op::OpKernel { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* a = ctx->Tensor4ArgNameAndIndex("a", 0); const user_op::Tensor* b = ctx->Tensor4ArgNameAndIndex("b", 0); const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - CHECK(add_to_output == nullptr); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + CHECK(!ctx->Attr("transpose_a")); CHECK(ctx->Attr("transpose_b")); int64_t dim_a = a->shape_view().NumAxes(); @@ -126,17 +115,15 @@ class MatmulQuantKernel final : public user_op::OpKernel { user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); if (out->data_type() == DataType::kFloat) { - cutlass_gemm_scale_bias_s8s8fp32(ctx->stream()->As()->cuda_stream(), - tmp_buffer->mut_dptr(), m, k, n, a->dptr(), - b->dptr(), scale->dptr(), bias->dptr(), - out->mut_dptr()); + cutlass_gemm_scale_bias_s8( + ctx->stream()->As()->cuda_stream(), tmp_buffer->mut_dptr(), m, n, k, + a->dptr(), b->dptr(), scale->dptr(), bias->dptr(), + (add_to_output ? add_to_output->dptr() : nullptr), out->mut_dptr()); } else if (out->data_type() == DataType::kFloat16) { - cutlass_gemm_scale_bias_s8s8fp16(ctx->stream()->As()->cuda_stream(), - tmp_buffer->mut_dptr(), m, k, n, a->dptr(), - b->dptr(), - reinterpret_cast(scale->dptr()), - reinterpret_cast(bias->dptr()), - reinterpret_cast(out->mut_dptr())); + cutlass_gemm_scale_bias_s8( + ctx->stream()->As()->cuda_stream(), tmp_buffer->mut_dptr(), m, n, k, + a->dptr(), b->dptr(), scale->dptr(), bias->dptr(), + (add_to_output ? 
add_to_output->dptr() : nullptr), out->mut_dptr()); } } }; diff --git a/oneflow/user/ops/conv_quant_op.cpp b/oneflow/user/ops/conv_quant_op.cpp index c6c59706f46..0fa8d0d637c 100644 --- a/oneflow/user/ops/conv_quant_op.cpp +++ b/oneflow/user/ops/conv_quant_op.cpp @@ -99,16 +99,8 @@ Maybe InferTensorDesc4Conv(user_op::InferContext* ctx) { } Maybe GetSbpSignatures4Conv(user_op::SbpContext* ctx) { - bool has_bias = false; - for (const auto& pair : ctx->inputs()) { - if (pair.first == "bias") { - CHECK_EQ_OR_RETURN(0, pair.second); - has_bias = true; - break; - } - } - - if (has_bias) { + if (ctx->user_op_conf().has_input("scale", 0)) { + CHECK_OR_RETURN(ctx->user_op_conf().has_input("bias", 0)); ctx->NewBuilder() .Split(ctx->inputs(), 0) .Split(user_op::OpArg("in", 0), 0) diff --git a/oneflow/user/ops/matmul_quant_op.cpp b/oneflow/user/ops/matmul_quant_op.cpp index 4f8d9b77ef3..2d9d3656bc5 100644 --- a/oneflow/user/ops/matmul_quant_op.cpp +++ b/oneflow/user/ops/matmul_quant_op.cpp @@ -18,8 +18,6 @@ limitations under the License. namespace oneflow { -static const int kAlignment = 16; - namespace { Maybe GetComputationCost(user_op::ComputeComplexityFnContext* ctx) { @@ -50,40 +48,43 @@ Maybe GetComputationCost(user_op::ComputeComplexityFnContext* ctx) { // BroadcastMatmul /* static */ Maybe MatmulQuantOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { + bool transpose_a = ctx->Attr("transpose_a"); bool transpose_b = ctx->Attr("transpose_b"); CHECK_OR_RETURN(transpose_b); const user_op::TensorDesc& a = ctx->InputTensorDesc("a", 0); const user_op::TensorDesc& b = ctx->InputTensorDesc("b", 0); + // CHECK_EQ_OR_RETURN(a.shape().NumAxes(), b.shape().NumAxes()); + CHECK_GE_OR_RETURN(a.shape().NumAxes(), 2); + CHECK_EQ_OR_RETURN(b.shape().NumAxes(), 2); + size_t a_num_axes = a.shape().NumAxes(); + size_t b_num_axes = b.shape().NumAxes(); + user_op::TensorDesc* out = ctx->MutOutputTensorDesc("out", 0); - const int64_t num_a_dims = a.shape().NumAxes(); - const int64_t num_b_dims = b.shape().NumAxes(); - CHECK_OR_RETURN(num_a_dims == 2 || num_a_dims == 3); - CHECK_EQ_OR_RETURN(num_b_dims, 2); - int64_t m = 0; - int64_t n = 0; - int64_t k = 0; // tensor a (no trans): batch_dims*m*k, tensor b (no trans): batch_dims*k*n - m = a.shape().At(num_a_dims - 2); - k = a.shape().At(num_a_dims - 1); + Shape output = ctx->InputShape("a", 0); + ctx->SetOutputIsDynamic("out", 0, ctx->InputIsDynamic("a", 0)); + + int64_t m, n, k; // tensor a (no trans): m*k, tensor b (no trans): k*n + if (!transpose_a) { + m = a.shape().At(a_num_axes - 2); + k = a.shape().At(a_num_axes - 1); + } else { + m = a.shape().At(a_num_axes - 1); + k = a.shape().At(a_num_axes - 2); + } if (!transpose_b) { - CHECK_EQ_OR_RETURN(k, b.shape().At(0)) << "K dim should be equal to b.shape().At(0). "; - n = b.shape().At(1); + CHECK_EQ_OR_RETURN(k, b.shape().At(b_num_axes - 2)); + n = b.shape().At(b_num_axes - 1); } else { - CHECK_EQ_OR_RETURN(k, b.shape().At(1)) << "K dim should be equal to b.shape().At(1). 
"; - n = b.shape().At(0); + CHECK_EQ_OR_RETURN(k, b.shape().At(b_num_axes - 1)); + n = b.shape().At(b_num_axes - 2); } - - CHECK_EQ_OR_RETURN(k % kAlignment, 0); - CHECK_EQ_OR_RETURN(n % kAlignment, 0); - - Shape output = ctx->InputShape("a", 0); - output.Set(num_a_dims - 2, m); - output.Set(num_a_dims - 1, n); - out->set_shape(Shape(output)); - + output.Set(a_num_axes - 2, m); + output.Set(a_num_axes - 1, n); + out->set_shape(output); if (ctx->has_input("_add_to_output", 0)) { - const user_op::TensorDesc& add_to_output = ctx->InputTensorDesc("_add_to_output", 0); + const auto& add_to_output = ctx->InputTensorDesc("_add_to_output", 0); CHECK_EQ_OR_RETURN(add_to_output.shape(), out->shape()); } return Maybe::Ok(); @@ -94,109 +95,93 @@ Maybe GetComputationCost(user_op::ComputeComplexityFnContext* ctx) { } /* static */ Maybe MatmulQuantOp::GetSbp(user_op::SbpContext* ctx) { - // (b, m, k) * (k, n) when transpose_b is false - // (b, m, k) * (n, k) when transpose_b is true - bool transpose_b = ctx->Attr("transpose_b"); - + // (m, k_a) * (k_b, n) where k_a == k_b const auto& a_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("a", 0).shape(); const auto& b_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("b", 0).shape(); - const int64_t a_num_axes = a_shape.NumAxes(); const int64_t b_num_axes = b_shape.NumAxes(); - int32_t m_a_axis = a_num_axes - 2; - int32_t k_a_axis = a_num_axes - 1; + int32_t m_axis = -1; + int32_t k_a_axis = -1; int32_t k_b_axis = -1; int32_t n_axis = -1; - - if (transpose_b) { + if (ctx->Attr("transpose_a")) { + m_axis = a_num_axes - 1; + k_a_axis = a_num_axes - 2; + } else { + m_axis = a_num_axes - 2; + k_a_axis = a_num_axes - 1; + } + if (ctx->Attr("transpose_b")) { k_b_axis = b_num_axes - 1; n_axis = b_num_axes - 2; } else { k_b_axis = b_num_axes - 2; n_axis = b_num_axes - 1; } - - bool has_bias = false; - for (const auto& pair : ctx->inputs()) { - if (pair.first == "bias") { - CHECK_EQ_OR_RETURN(0, pair.second); - has_bias = true; - break; - } - } std::vector out_and_add_to_output_args; out_and_add_to_output_args.emplace_back("out", 0); - if (ctx->user_op_conf().has_input("_add_to_output", 0)) { out_and_add_to_output_args.emplace_back("_add_to_output", 0); } - - const int64_t max_num_axes = std::max(a_num_axes, b_num_axes); - - if (has_bias) { - // S(m axis) x B -> S(m axis) + if (ctx->user_op_conf().has_input("scale", 0)) { + CHECK_OR_RETURN(ctx->user_op_conf().has_input("bias", 0)); ctx->NewBuilder() - .Split(user_op::OpArg("a", 0), m_a_axis) + .Split(user_op::OpArg("a", 0), m_axis) .Broadcast(user_op::OpArg("b", 0)) .Broadcast(user_op::OpArg("scale", 0)) .Broadcast(user_op::OpArg("bias", 0)) - .Split(out_and_add_to_output_args, max_num_axes - 2) + .Split(out_and_add_to_output_args, 0) .Build(); - // B x S(n_axis) -> S(n_axis) ctx->NewBuilder() .Broadcast(user_op::OpArg("a", 0)) .Split(user_op::OpArg("b", 0), n_axis) .Split(user_op::OpArg("scale", 0), 0) .Split(user_op::OpArg("bias", 0), 0) - .Split(out_and_add_to_output_args, max_num_axes - 1) + .Split(out_and_add_to_output_args, 1) .Build(); - // S(a_k_axis) x S(b_k_axis) -> P ctx->NewBuilder() .Split(user_op::OpArg("a", 0), k_a_axis) .Split(user_op::OpArg("b", 0), k_b_axis) .Broadcast(user_op::OpArg("scale", 0)) - .PartialSum(user_op::OpArg("bias", 0)) + .Broadcast(user_op::OpArg("bias", 0)) .PartialSum(out_and_add_to_output_args) .Build(); - // P x B -> P ctx->NewBuilder() .PartialSum(user_op::OpArg("a", 0)) .Broadcast(user_op::OpArg("b", 0)) .Broadcast(user_op::OpArg("scale", 0)) - 
.PartialSum(user_op::OpArg("bias", 0)) + .Broadcast(user_op::OpArg("bias", 0)) + .PartialSum(out_and_add_to_output_args) + .Build(); + ctx->NewBuilder() + .Broadcast(user_op::OpArg("a", 0)) + .PartialSum(user_op::OpArg("b", 0)) + .Broadcast(user_op::OpArg("scale", 0)) + .Broadcast(user_op::OpArg("bias", 0)) .PartialSum(out_and_add_to_output_args) .Build(); } else { - // S(m axis) x B -> S(m axis) ctx->NewBuilder() - .Split(user_op::OpArg("a", 0), m_a_axis) + .Split(user_op::OpArg("a", 0), m_axis) .Broadcast(user_op::OpArg("b", 0)) - .Split(out_and_add_to_output_args, max_num_axes - 2) + .Split(out_and_add_to_output_args, 0) .Build(); - - // B x S(n_axis) -> S(n_axis) ctx->NewBuilder() .Broadcast(user_op::OpArg("a", 0)) .Split(user_op::OpArg("b", 0), n_axis) - .Split(out_and_add_to_output_args, max_num_axes - 1) + .Split(out_and_add_to_output_args, 1) .Build(); - - // S(a_k_axis) x S(b_k_axis) -> P ctx->NewBuilder() .Split(user_op::OpArg("a", 0), k_a_axis) .Split(user_op::OpArg("b", 0), k_b_axis) .PartialSum(out_and_add_to_output_args) .Build(); - - // P x B -> P ctx->NewBuilder() .PartialSum(user_op::OpArg("a", 0)) .Broadcast(user_op::OpArg("b", 0)) .PartialSum(out_and_add_to_output_args) .Build(); - - // B x P -> P ctx->NewBuilder() .Broadcast(user_op::OpArg("a", 0)) .PartialSum(user_op::OpArg("b", 0)) From 418598eb5a4b74e27875c8b749577df969628a73 Mon Sep 17 00:00:00 2001 From: oneflow-ci-bot Date: Sat, 2 Sep 2023 09:35:03 +0000 Subject: [PATCH 38/65] auto format by CI --- cmake/third_party/cutlass-extension.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/third_party/cutlass-extension.cmake b/cmake/third_party/cutlass-extension.cmake index fbe3b0ff749..68f062fe950 100644 --- a/cmake/third_party/cutlass-extension.cmake +++ b/cmake/third_party/cutlass-extension.cmake @@ -15,7 +15,8 @@ if(WITH_CUTLASS_EXTENSION) set(CUTLASS_EXTENSION_INCLUDE_DIR ${CUTLASS_EXTENSION_INSTALL_DIR}/include CACHE PATH "" FORCE) set(CUTLASS_EXTENSION_LIBRARY_DIR ${CUTLASS_EXTENSION_INSTALL_DIR}/lib CACHE PATH "" FORCE) set(CUTLASS_EXTENSION_LIBRARIES ${CUTLASS_EXTENSION_LIBRARY_DIR}/libcutlass_extension.so) - set(CUTLASS_EXTENSION_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cutlass-extension/src/cutlass-extension/) + set(CUTLASS_EXTENSION_SOURCE_DIR + ${CMAKE_CURRENT_BINARY_DIR}/cutlass-extension/src/cutlass-extension/) set(CUTLASS_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cutlass/src/cutlass) foreach(arch ${CUDA_REAL_ARCHS_LIST}) @@ -53,8 +54,7 @@ if(WITH_CUTLASS_EXTENSION) -DCUTLASS_LIBRARY_DEBUG_POSTFIX:STRING= -DCUTLASS_NVCC_EMBED_PTX:BOOL=OFF -DCUTLASS_DIR:STRING=${CUTLASS_SOURCE_DIR} - DEPENDS cutlass - ) + DEPENDS cutlass) endif(THIRD_PARTY) endif(WITH_CUTLASS_EXTENSION) From 9031d5ce0cb6b57cc2f43505a3a0fbd1ca8ca046 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Sun, 3 Sep 2023 09:23:14 +0000 Subject: [PATCH 39/65] support to find the fastest kernel for matmul_quant --- oneflow/ir/include/OneFlow/OneFlowUserOps.td | 3 +- .../cutlass_conv2d_operation_cache_key.h | 17 +- .../cutlass_gemm_operation_cache_key.h | 139 ++++++++ oneflow/user/kernels/cutlass_gemm_tuner.h | 57 +++ .../user/kernels/cutlass_gemm_tuner_impl.cpp | 324 ++++++++++++++++++ .../user/kernels/cutlass_gemm_tuner_impl.h | 54 +++ oneflow/user/kernels/matmul_quant_kernels.cu | 156 +++++---- 7 files changed, 683 insertions(+), 67 deletions(-) create mode 100644 oneflow/user/kernels/cutlass_gemm_operation_cache_key.h create mode 100644 oneflow/user/kernels/cutlass_gemm_tuner.h create mode 100644 
oneflow/user/kernels/cutlass_gemm_tuner_impl.cpp create mode 100644 oneflow/user/kernels/cutlass_gemm_tuner_impl.h diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index 70102086352..c31b6d8f680 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -5390,7 +5390,8 @@ def OneFlow_MatmulQuantOp : OneFlow_BaseOp<"matmul_quant", [NoMemoryEffect, Attr DefaultValuedAttr:$transpose_a, DefaultValuedAttr:$transpose_b, DefaultValuedAttr:$alpha, - OneFlow_DataType:$out_dtype + OneFlow_DataType:$out_dtype, + DefaultValuedAttr:$tuning_cache ); let has_logical_tensor_desc_infer_fn = 1; let has_physical_tensor_desc_infer_fn = 1; diff --git a/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h b/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h index 68b0c5321b2..a013e949166 100644 --- a/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h +++ b/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h @@ -33,10 +33,15 @@ struct Conv2dOperationCacheKey { cutlass::library::ConvFunctionalKey functional_key; cutlass::library::Conv2dConfiguration configuraion; size_t alignment; + bool fuse_scale_bias; + bool fuse_residual; Conv2dOperationCacheKey(cutlass::library::ConvFunctionalKey functional_key, cutlass::library::Conv2dConfiguration configuraion, cutlass::library::ConvArguments arguments) - : functional_key(functional_key), configuraion(configuraion) { + : functional_key(functional_key), + configuraion(configuraion), + fuse_scale_bias(false), + fuse_residual(false) { const auto IsStrideAligned = [&](const std::vector& stride, size_t n) { return std::all_of(stride.cbegin(), stride.cend(), [&](const int64_t& s) { return s % n == 0; }); @@ -59,7 +64,9 @@ struct Conv2dOperationCacheKey { Conv2dOperationCacheKey(cutlass::library::ConvFunctionalKey functional_key, const cutlass::library::Conv2dScaleBiasFusionConfiguration& config, const cutlass::library::ConvScaleBiasFusionArguments& arguments) - : functional_key(functional_key) { + : functional_key(functional_key), + fuse_scale_bias(true), + fuse_residual(arguments.Residual != nullptr) { configuraion.problem_size = config.problem_size; configuraion.split_k_mode = config.split_k_mode; configuraion.stride_a = config.stride_a; @@ -74,6 +81,7 @@ struct Conv2dOperationCacheKey { CHECK_EQ(reinterpret_cast(arguments.P) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.Scale) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.Bias) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.Residual) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.D) % kCudaAlignSize, 0); const auto IsAligned = [&](size_t n) { return IsStrideAligned(configuraion.stride_a, n) && IsStrideAligned(configuraion.stride_b, n) @@ -134,6 +142,8 @@ struct Conv2dOperationCacheKeyHasher { size_t hash = cutlass::library::ConvFunctionalKeyHasher()(key.functional_key); hash = HashCombine(hash, Conv2dConfigurationHasher()(key.configuraion)); hash = HashCombine(hash, std::hash()(key.alignment)); + hash = HashCombine(hash, std::hash()(key.fuse_scale_bias)); + hash = HashCombine(hash, std::hash()(key.fuse_residual)); return hash; } }; @@ -147,7 +157,8 @@ inline bool operator==(const cutlass::library::Conv2dConfiguration& lhs, inline bool operator==(const Conv2dOperationCacheKey& lhs, const Conv2dOperationCacheKey& rhs) { return lhs.functional_key == rhs.functional_key && lhs.configuraion == rhs.configuraion - && lhs.alignment == rhs.alignment; + && 
lhs.alignment == rhs.alignment && lhs.fuse_scale_bias == rhs.fuse_scale_bias + && lhs.fuse_residual == rhs.fuse_residual; } } // namespace oneflow diff --git a/oneflow/user/kernels/cutlass_gemm_operation_cache_key.h b/oneflow/user/kernels/cutlass_gemm_operation_cache_key.h new file mode 100644 index 00000000000..eb29aec0658 --- /dev/null +++ b/oneflow/user/kernels/cutlass_gemm_operation_cache_key.h @@ -0,0 +1,139 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifdef WITH_CUTLASS + +#ifndef ONEFLOW_USER_KERNELS_CUTLASS_GEMM_OPERATION_CACHE_KEY_H_ +#define ONEFLOW_USER_KERNELS_CUTLASS_GEMM_OPERATION_CACHE_KEY_H_ + +#include "oneflow/core/framework/framework.h" + +#include +#include + +#ifdef WITH_CUTLASS_EXTENSION +#include +#endif // WITH_CUTLASS_EXTENSION + +namespace oneflow { + +struct GemmOperationCacheKey { + cutlass::library::GemmFunctionalKey functional_key; + cutlass::library::GemmConfiguration configuraion; + size_t alignment; + bool fuse_scale_bias; + bool fuse_residual; + GemmOperationCacheKey(const cutlass::library::GemmFunctionalKey& functional_key, + const cutlass::library::GemmConfiguration& configuraion, + const cutlass::library::GemmArguments& arguments) + : functional_key(functional_key), + configuraion(configuraion), + fuse_scale_bias(false), + fuse_residual(false) { + CHECK_EQ(reinterpret_cast(arguments.A) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.B) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.C) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.D) % kCudaAlignSize, 0); + const auto IsAligned = [&](size_t n) { + return configuraion.lda % n == 0 && configuraion.ldb % n == 0 && configuraion.ldc % n == 0 + && configuraion.ldd % n == 0; + }; + alignment = 128 / cutlass::library::sizeof_bits(functional_key.element_A); + for (; alignment > 1; alignment = alignment >> 1) { + if (IsAligned(alignment)) { break; } + } + } + +#ifdef WITH_CUTLASS_EXTENSION + GemmOperationCacheKey(cutlass::library::GemmFunctionalKey functional_key, + const cutlass::library::GemmScaleBiasFusionConfiguration& config, + const cutlass::library::GemmScaleBiasFusionArguments& arguments) + : functional_key(functional_key), + fuse_scale_bias(true), + fuse_residual(arguments.Residual != nullptr) { + configuraion.problem_size = config.problem_size; + configuraion.split_k_slices = config.split_k_slices; + configuraion.lda = config.lda; + configuraion.ldb = config.ldb; + configuraion.ldc = 0; + configuraion.ldd = config.ldd; + CHECK_EQ(reinterpret_cast(arguments.A) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.B) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.Scale) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.Bias) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.Residual) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.D) % kCudaAlignSize, 0); + const auto IsAligned = [&](size_t n) { + return configuraion.lda % n == 0 && configuraion.ldb % n == 0 && 
config.ldr % n == 0 + && configuraion.ldd % n == 0; + }; + alignment = 128 / cutlass::library::sizeof_bits(functional_key.element_A); + for (; alignment > 1; alignment = alignment >> 1) { + if (IsAligned(alignment)) { break; } + } + } +#endif // WITH_CUTLASS_EXTENSION +}; + +struct GemmProblemSizeHasher { + size_t operator()(const cutlass::gemm::GemmCoord& problem_size) const { + size_t hash = 0; + hash = HashCombine(hash, std::hash()(problem_size.m())); + hash = HashCombine(hash, std::hash()(problem_size.n())); + hash = HashCombine(hash, std::hash()(problem_size.k())); + return hash; + } +}; + +struct GemmConfigurationHasher { + size_t operator()(const cutlass::library::GemmConfiguration& configuraion) const { + size_t hash = std::hash()(configuraion.split_k_slices); + hash = HashCombine(hash, GemmProblemSizeHasher()(configuraion.problem_size)); + hash = HashCombine(hash, configuraion.lda); + hash = HashCombine(hash, configuraion.ldb); + hash = HashCombine(hash, configuraion.ldc); + hash = HashCombine(hash, configuraion.ldd); + return hash; + } +}; + +struct GemmOperationCacheKeyHasher { + size_t operator()(const GemmOperationCacheKey& key) const { + size_t hash = cutlass::library::GemmFunctionalKeyHasher()(key.functional_key); + hash = HashCombine(hash, GemmConfigurationHasher()(key.configuraion)); + hash = HashCombine(hash, std::hash()(key.alignment)); + hash = HashCombine(hash, std::hash()(key.fuse_scale_bias)); + hash = HashCombine(hash, std::hash()(key.fuse_residual)); + return hash; + } +}; + +inline bool operator==(const cutlass::library::GemmConfiguration& lhs, + const cutlass::library::GemmConfiguration& rhs) { + return lhs.split_k_slices == rhs.split_k_slices && lhs.problem_size == rhs.problem_size + && lhs.lda == rhs.lda && lhs.ldb == rhs.ldb && lhs.ldc == rhs.ldc && lhs.ldd == rhs.ldd; +} + +inline bool operator==(const GemmOperationCacheKey& lhs, const GemmOperationCacheKey& rhs) { + return lhs.functional_key == rhs.functional_key && lhs.configuraion == rhs.configuraion + && lhs.alignment == rhs.alignment && lhs.fuse_scale_bias == rhs.fuse_scale_bias + && lhs.fuse_residual == rhs.fuse_residual; +} + +} // namespace oneflow + +#endif // ONEFLOW_USER_KERNELS_CUTLASS_GEMM_OPERATION_CACHE_KEY_H_ + +#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/cutlass_gemm_tuner.h b/oneflow/user/kernels/cutlass_gemm_tuner.h new file mode 100644 index 00000000000..1596df8d836 --- /dev/null +++ b/oneflow/user/kernels/cutlass_gemm_tuner.h @@ -0,0 +1,57 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_USER_KERNELS_CUTLASS_GEMM_TUNER_H_ +#define ONEFLOW_USER_KERNELS_CUTLASS_GEMM_TUNER_H_ + +#ifdef WITH_CUTLASS + +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/cuda/cuda_stream.h" +#include "oneflow/user/kernels/cutlass_gemm_tuner_impl.h" + +#include +#include + +namespace oneflow { + +class CutlassGemmTuner { + public: + CutlassGemmTuner() = default; + + template + const cutlass::library::Operation* FindOperation( + ep::CudaStream* stream, const cutlass::library::GemmFunctionalKey& functional_key, + const Configuration& configuraion, const Arguments& arguments, void* workspace, + size_t workspace_size) { + return GetCutlassGemmTunerImpl()->Find( + stream, functional_key, configuraion, arguments, workspace, workspace_size); + } + + template + const cutlass::library::Operation* GetOperation( + const std::string& name, ep::CudaStream* stream, + const cutlass::library::GemmFunctionalKey& functional_key, const Configuration& configuraion, + const Arguments& arguments, void* workspace, size_t workspace_size) { + return GetCutlassGemmTunerImpl()->Get( + name, stream, functional_key, configuraion, arguments, workspace, workspace_size); + } +}; + +} // namespace oneflow + +#endif // WITH_CUTLASS + +#endif // ONEFLOW_USER_KERNELS_CUTLASS_GEMM_TUNER_H_ diff --git a/oneflow/user/kernels/cutlass_gemm_tuner_impl.cpp b/oneflow/user/kernels/cutlass_gemm_tuner_impl.cpp new file mode 100644 index 00000000000..abb69aed600 --- /dev/null +++ b/oneflow/user/kernels/cutlass_gemm_tuner_impl.cpp @@ -0,0 +1,324 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifdef WITH_CUTLASS + +#include "oneflow/user/kernels/cutlass_gemm_tuner_impl.h" + +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/cuda/cuda_stream.h" +#include +#include + +#include "oneflow/user/kernels/cutlass_gemm_operation_cache_key.h" +#ifdef WITH_CUTLASS_EXTENSION +#include +#include +#endif // WITH_CUTLASS_EXTENSION + +namespace oneflow { + +namespace { + +bool IsWeakerAlginOperation(const cutlass::library::Operation* lhs, + const cutlass::library::Operation* rhs) { + const std::string lhs_name = lhs->description().name; + const std::string rhs_name = rhs->description().name; + size_t lhs_pos = lhs_name.rfind("align"); + if (lhs_pos == std::string::npos) { return false; } + size_t rhs_pos = rhs_name.rfind("align"); + if (rhs_pos == std::string::npos) { return false; } + if (lhs_name.substr(0, lhs_pos) != rhs_name.substr(0, rhs_pos)) { return false; } + size_t align_len = std::strlen("align"); + int lhs_alignment = std::atoi(lhs_name.substr(lhs_pos + align_len).c_str()); + int rhs_alignment = std::atoi(rhs_name.substr(rhs_pos + align_len).c_str()); + return lhs_alignment < rhs_alignment; +} + +size_t GetTensorSize(cutlass::library::NumericTypeID element, cutlass::library::LayoutTypeID layout, + const int row, const int col, const int ldc) { + const size_t element_size = cutlass::library::sizeof_bits(element) / 8; + size_t capacity = 0; + if (layout == cutlass::library::LayoutTypeID::kRowMajor) { + capacity = row * ldc; + } else if (layout == cutlass::library::LayoutTypeID::kColumnMajor) { + capacity = ldc * col; + } else { + UNIMPLEMENTED(); + } + return capacity * element_size; +} + +template +const cutlass::library::Operation* FindFastestOperation( + const Singleton* singleton, const cutlass::library::GemmFunctionalKey& functional_key, + const Configuration& configuraion, const Arguments& arguments, void* workspace, + size_t workspace_size, cudaStream_t stream, int cuda_arch) { + constexpr int turing_warmup_iters = 2; + constexpr int turing_iters = 5; + cudaEvent_t start{}; + cudaEvent_t end{}; + OF_CUDA_CHECK(cudaEventCreate(&start)); + OF_CUDA_CHECK(cudaEventCreate(&end)); + const cutlass::library::Operation* fastest_operation = nullptr; + float fastest_time = 0; + const auto& operations_map = [&]() { + const auto& it = singleton->operation_table.gemm_operations.find(functional_key); + CHECK(it != singleton->operation_table.gemm_operations.cend()); + return it->second; + }(); + + for (const auto& pair : operations_map) { + std::map> operations; + for (auto operation : pair.second) { + operations.emplace(operation->description().name, operation); + } + const cutlass::library::Operation* prev_operation = nullptr; + for (const auto& name_operation : operations) { + const cutlass::library::Operation* operation = name_operation.second; + if (prev_operation != nullptr && IsWeakerAlginOperation(operation, prev_operation)) { + continue; + } + if (operation->description().tile_description.minimum_compute_capability * 10 > cuda_arch + || operation->description().tile_description.maximum_compute_capability * 10 + < cuda_arch) { + continue; + } + auto status = operation->can_implement(&configuraion, &arguments); + if (status != cutlass::Status::kSuccess) { continue; } + const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion); + const size_t device_workspace_size = operation->get_device_workspace_size(&configuraion); + if (device_workspace_size > workspace_size) { continue; } + std::vector host_workspace(host_workspace_size, 0); + 
if (operation->initialize(&configuraion, host_workspace.data(), workspace, stream) + != cutlass::Status::kSuccess) { + continue; + } + + const auto Run = [&]() { + auto init_status = + operation->initialize(&configuraion, host_workspace.data(), workspace, stream); + CHECK(init_status == cutlass::Status::kSuccess); + auto run_status = operation->run(&arguments, host_workspace.data(), workspace, stream); + CHECK(run_status == cutlass::Status::kSuccess); + }; + OF_CUDA_CHECK(cudaStreamSynchronize(stream)); + for (int i = 0; i < turing_warmup_iters; ++i) { Run(); } + OF_CUDA_CHECK(cudaEventRecord(start, stream)); + for (int i = 0; i < turing_iters; ++i) { Run(); } + OF_CUDA_CHECK(cudaEventRecord(end, stream)); + OF_CUDA_CHECK(cudaEventSynchronize(end)); + float time = 0; + OF_CUDA_CHECK(cudaEventElapsedTime(&time, start, end)); + VLOG(3) << operation->description().name << " " << time; + prev_operation = operation; + if (fastest_operation == nullptr || time < fastest_time) { + fastest_operation = operation; + fastest_time = time; + } + } + } + OF_CUDA_CHECK(cudaEventDestroy(start)); + OF_CUDA_CHECK(cudaEventDestroy(end)); + VLOG(3) << "Fastest: " << fastest_operation->description().name << " " << fastest_time; + return fastest_operation; +} + +template +const cutlass::library::Operation* GetOperation( + const Singleton* singleton, const std::string& name, + const cutlass::library::GemmFunctionalKey& functional_key, const Configuration& configuraion, + const Arguments& arguments, void* workspace, size_t workspace_size, cudaStream_t stream, + int cuda_arch) { + const auto& it = singleton->operation_table.gemm_operations.find(functional_key); + if (it == singleton->operation_table.gemm_operations.cend()) { return nullptr; } + const cutlass::library::GemmOperationVectorMap& operations_map = it->second; + for (const auto& pair : operations_map) { + for (auto operation : pair.second) { + if (name != operation->description().name) { continue; } + if (operation->description().tile_description.minimum_compute_capability * 10 > cuda_arch + || operation->description().tile_description.maximum_compute_capability * 10 + < cuda_arch) { + continue; + } + auto status = operation->can_implement(&configuraion, &arguments); + if (status != cutlass::Status::kSuccess) { continue; } + const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion); + const size_t device_workspace_size = operation->get_device_workspace_size(&configuraion); + if (device_workspace_size > workspace_size) { continue; } + std::vector host_workspace(host_workspace_size, 0); + if (operation->initialize(&configuraion, host_workspace.data(), workspace, stream) + != cutlass::Status::kSuccess) { + continue; + } + return operation; + } + } + return nullptr; +} + +} // namespace + +#ifdef WITH_CUTLASS_EXTENSION +template<> +class CutlassGemmTunerImpl { + public: + using CacheMap = std::unordered_map; + + CutlassGemmTunerImpl() { + singleton = &cutlass::library::CutlassExtensionSingleton::get( + cutlass::library::SingletonKind::kGemmScaleBiasFusion); + residual_singleton = &cutlass::library::CutlassExtensionSingleton::get( + cutlass::library::SingletonKind::kGemmScaleBiasResidualFusion); + } + + const cutlass::library::Operation* Find( + ep::CudaStream* stream, cutlass::library::GemmFunctionalKey functional_key, + const cutlass::library::GemmScaleBiasFusionConfiguration& configuraion, + const cutlass::library::GemmScaleBiasFusionArguments& arguments, void* workspace, + size_t workspace_size); + + const 
cutlass::library::Operation* Get( + const std::string& name, ep::CudaStream* stream, + cutlass::library::GemmFunctionalKey functional_key, + const cutlass::library::GemmScaleBiasFusionConfiguration& configuraion, + const cutlass::library::GemmScaleBiasFusionArguments& arguments, void* workspace, + size_t workspace_size); + + private: + std::mutex mutex; + std::unordered_map cache; + const cutlass::library::CutlassExtensionSingleton* singleton; + const cutlass::library::CutlassExtensionSingleton* residual_singleton; +}; + +const cutlass::library::Operation* +CutlassGemmTunerImpl:: + Find(ep::CudaStream* stream, cutlass::library::GemmFunctionalKey functional_key, + const cutlass::library::GemmScaleBiasFusionConfiguration& configuraion, + const cutlass::library::GemmScaleBiasFusionArguments& arguments, void* workspace, + size_t workspace_size) { + int dev = 0; + OF_CUDA_CHECK(cudaGetDevice(&dev)); + GemmOperationCacheKey cache_key(functional_key, configuraion, arguments); + { + std::lock_guard lock(mutex); + const auto& device_cache = cache[dev]; + const auto& it = device_cache.find(cache_key); + if (it != device_cache.end()) { return it->second; } + } + cutlass::library::GemmScaleBiasFusionArguments benchmark_arguments = arguments; + void* benchmark_workspace = workspace; + cudaStream_t benchmark_stream = stream->cuda_stream(); +#ifdef WITH_CUDA_GRAPHS + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + if (stream->IsGraphCapturing()) { + OF_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + OF_CUDA_CHECK(cudaStreamCreate(&benchmark_stream)); + OF_CUDA_CHECK(cudaMalloc(&benchmark_workspace, workspace_size)); + const size_t a_size = GetTensorSize(functional_key.element_A, functional_key.layout_A, + configuraion.problem_size.m(), + configuraion.problem_size.k(), configuraion.lda); + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.A, a_size)); + const size_t b_size = GetTensorSize(functional_key.element_B, functional_key.layout_B, + configuraion.problem_size.k(), + configuraion.problem_size.n(), configuraion.ldb); + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.B, b_size)); + + if (benchmark_arguments.Scale != nullptr) { + const size_t scale_size = configuraion.problem_size.n() + * cutlass::library::sizeof_bits(functional_key.element_D) / 8; + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.Scale, scale_size)); + } + if (benchmark_arguments.Bias != nullptr) { + const size_t bias_size = configuraion.problem_size.n() + * cutlass::library::sizeof_bits(functional_key.element_D) / 8; + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.Bias, bias_size)); + } + if (benchmark_arguments.Residual != nullptr) { + const size_t residual_size = GetTensorSize(functional_key.element_D, functional_key.layout_D, + configuraion.problem_size.m(), + configuraion.problem_size.n(), configuraion.ldr); + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.Residual, residual_size)); + } + const size_t d_size = GetTensorSize(functional_key.element_D, functional_key.layout_D, + configuraion.problem_size.m(), + configuraion.problem_size.n(), configuraion.ldd); + OF_CUDA_CHECK(cudaMalloc(&benchmark_arguments.D, d_size)); + } +#endif // WITH_CUDA_GRAPHS + + const cutlass::library::Operation* fastest_operation = + FindFastestOperation((benchmark_arguments.Residual ?
residual_singleton : singleton), + functional_key, configuraion, benchmark_arguments, benchmark_workspace, + workspace_size, benchmark_stream, stream->cuda_arch()); + +#ifdef WITH_CUDA_GRAPHS + if (stream->IsGraphCapturing()) { + OF_CUDA_CHECK(cudaStreamSynchronize(benchmark_stream)); + OF_CUDA_CHECK(cudaStreamDestroy(benchmark_stream)); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.A))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.B))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.Scale))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.Bias))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.Residual))); + OF_CUDA_CHECK(cudaFree(benchmark_arguments.D)); + OF_CUDA_CHECK(cudaFree(benchmark_workspace)); + OF_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + } +#endif // WITH_CUDA_GRAPHS + if (fastest_operation != nullptr) { + std::lock_guard lock(mutex); + cache[dev][cache_key] = fastest_operation; + } + return fastest_operation; +} + +const cutlass::library::Operation* +CutlassGemmTunerImpl:: + Get(const std::string& name, ep::CudaStream* stream, + cutlass::library::GemmFunctionalKey functional_key, + const cutlass::library::GemmScaleBiasFusionConfiguration& configuraion, + const cutlass::library::GemmScaleBiasFusionArguments& arguments, void* workspace, + size_t workspace_size) { + int dev = 0; + OF_CUDA_CHECK(cudaGetDevice(&dev)); + return GetOperation((arguments.Residual ? residual_singleton : singleton), name, functional_key, + configuraion, arguments, workspace, workspace_size, stream->cuda_stream(), + stream->cuda_arch()); +} +#endif // WITH_CUTLASS_EXTENSION + +template +CutlassGemmTunerImpl* GetCutlassGemmTunerImpl() { + static CutlassGemmTunerImpl impl; + return &impl; +} + +#ifdef WITH_CUTLASS_EXTENSION +template CutlassGemmTunerImpl* +GetCutlassGemmTunerImpl(); +#endif // WITH_CUTLASS_EXTENSION + +} // namespace oneflow + +#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/cutlass_gemm_tuner_impl.h b/oneflow/user/kernels/cutlass_gemm_tuner_impl.h new file mode 100644 index 00000000000..f5bf0213d8b --- /dev/null +++ b/oneflow/user/kernels/cutlass_gemm_tuner_impl.h @@ -0,0 +1,54 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifdef WITH_CUTLASS + +#ifndef ONEFLOW_USER_KERNELS_CUTLASS_GEMM_TUNER_IMPL_H_ +#define ONEFLOW_USER_KERNELS_CUTLASS_GEMM_TUNER_IMPL_H_ + +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/cuda/cuda_stream.h" + +#include +#include +#include + +namespace oneflow { + +template +class CutlassGemmTunerImpl { + public: + const cutlass::library::Operation* Find(ep::CudaStream* stream, + cutlass::library::GemmFunctionalKey functional_key, + const Configuration& configuraion, + const Arguments& arguments, void* workspace, + size_t workspace_size); + + const cutlass::library::Operation* Get(const std::string& name, ep::CudaStream* stream, + cutlass::library::GemmFunctionalKey functional_key, + const Configuration& configuraion, + const Arguments& arguments, void* workspace, + size_t workspace_size); +}; + +template +CutlassGemmTunerImpl* GetCutlassGemmTunerImpl(); + +} // namespace oneflow + +#endif // ONEFLOW_USER_KERNELS_CUTLASS_GEMM_TUNER_IMPL_H_ + +#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/matmul_quant_kernels.cu b/oneflow/user/kernels/matmul_quant_kernels.cu index 1c2f25344c8..eeb8f23ad43 100644 --- a/oneflow/user/kernels/matmul_quant_kernels.cu +++ b/oneflow/user/kernels/matmul_quant_kernels.cu @@ -20,68 +20,77 @@ limitations under the License. #include "oneflow/core/framework/config_def.h" #include "oneflow/core/kernel/cuda_graph_support.h" -#include "cutlass/gemm/device/gemm.h" +#include "oneflow/user/kernels/cutlass_gemm_tuner.h" -#include "cutlass/gemm/device/gemm_scale_bias_fusion.h" -#include "cutlass/epilogue/thread/linear_combination_scale_bias.h" - -#include "cutlass/gemm/device/gemm_scale_bias_residual_fusion.h" -#include "cutlass/epilogue/thread/linear_combination_scale_bias_residual.h" +#include +#include +#include +#include namespace oneflow { namespace { -using RowMajor = cutlass::layout::RowMajor; -using ColMajor = cutlass::layout::ColumnMajor; - -template -void cutlass_gemm_scale_bias_s8(cudaStream_t stream, void* workspace, int m, int n, int k, - const void* a, const void* b, const void* scale, const void* bias, - const void* residual, void* output) { - using ElementA = T; - using ElementB = T; - using ElementC = OutT; - using ElementAccumulator = int32_t; - using ElementCompute = float; - - if (!residual) { - using CutlassRowAColBRowCGemm = typename cutlass::gemm::device::GemmScaleBiasFusion< - ElementA, RowMajor, ElementB, ColMajor, ElementC, RowMajor, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::LinearCombinationScaleBias>; - - CutlassRowAColBRowCGemm gemm_operator; - typename CutlassRowAColBRowCGemm::Arguments args( - {m, n, k}, {reinterpret_cast(a), k}, {reinterpret_cast(b), k}, - {reinterpret_cast(scale), 0}, {reinterpret_cast(bias), 0}, - {reinterpret_cast(output), n}); - - cutlass::Status init_status = gemm_operator.initialize(args, workspace, stream); - CHECK(init_status == cutlass::Status::kSuccess); - auto run_status = gemm_operator(stream); - CHECK(run_status == cutlass::Status::kSuccess); +void LaunchMatmulQuantScaleBiasFusionOp(user_op::KernelComputeContext* ctx, + const cutlass::library::GemmFunctionalKey& key, + const cutlass::gemm::GemmCoord& problem_size, + const user_op::Tensor* a, const user_op::Tensor* b, + const user_op::Tensor* scale, const user_op::Tensor* bias, + const user_op::Tensor* add_to_output, + user_op::Tensor* out) { + 
cutlass::library::GemmScaleBiasFusionConfiguration configuration;
+  configuration.problem_size = problem_size;
+  configuration.lda = problem_size.k();
+  configuration.ldb = problem_size.k();
+  configuration.ld_scale = 0;
+  configuration.ld_bias = 0;
+  configuration.ldr = problem_size.n();
+  configuration.ldd = problem_size.n();
+  configuration.split_k_slices = 1;
+  // if (problem_size.m() <= 2 && problem_size.k() >= 4096) { configuration.split_k_slices = 16; }
+
+  cutlass::library::GemmScaleBiasFusionArguments arguments;
+  arguments.A = a->dptr();
+  arguments.B = b->dptr();
+  arguments.Scale = scale->dptr();
+  arguments.Bias = bias->dptr();
+  if (add_to_output) {
+    arguments.Residual = add_to_output->dptr();
   } else {
-    using CutlassRowAColBRowCGemm = typename cutlass::gemm::device::GemmScaleBiasResidualFusion<
-        ElementA, RowMajor, ElementB, ColMajor, ElementC, RowMajor, ElementAccumulator,
-        cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, cutlass::gemm::GemmShape<128, 128, 64>,
-        cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>,
-        cutlass::epilogue::thread::LinearCombinationScaleBiasResidual<
-            ElementC, 8, ElementAccumulator, ElementCompute, cutlass::plus<ElementAccumulator>>>;
-
-    CutlassRowAColBRowCGemm gemm_operator;
-    typename CutlassRowAColBRowCGemm::Arguments args(
-        {m, n, k}, {reinterpret_cast<const ElementA*>(a), k}, {reinterpret_cast<const ElementB*>(b), k},
-        {reinterpret_cast<const ElementC*>(scale), 0}, {reinterpret_cast<const ElementC*>(bias), 0},
-        {reinterpret_cast<const ElementC*>(residual), n}, {reinterpret_cast<ElementC*>(output), n});
-
-    cutlass::Status init_status = gemm_operator.initialize(args, workspace, stream);
-    CHECK(init_status == cutlass::Status::kSuccess);
-    auto run_status = gemm_operator(stream);
-    CHECK(run_status == cutlass::Status::kSuccess);
+    arguments.Residual = nullptr;
+  }
+  arguments.D = out->mut_dptr();
+
+  user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
+  auto* stream = ctx->stream()->As<ep::CudaStream>();
+  const cutlass::library::Operation* operation = nullptr;
+  operation = [&]() -> const cutlass::library::Operation* {
+    const std::string& tuning_cache = ctx->Attr<std::string>("tuning_cache");
+    if (tuning_cache.empty()) { return nullptr; }
+    auto tuning_cache_object = nlohmann::json::parse(tuning_cache);
+    if (!tuning_cache_object.is_object()) { return nullptr; }
+    auto it = tuning_cache_object.find("cutlass");
+    if (it == tuning_cache_object.end()) { return nullptr; }
+    if (!it->is_string()) { return nullptr; }
+    const std::string name = *it;
+    return CutlassGemmTuner().GetOperation(name, stream, key, configuration, arguments,
+                                           tmp_buffer->mut_dptr(),
+                                           tmp_buffer->shape_view().elem_cnt());
+  }();
+  if (!operation) {
+    operation = CutlassGemmTuner().FindOperation(stream, key, configuration, arguments,
+                                                 tmp_buffer->mut_dptr(),
+                                                 tmp_buffer->shape_view().elem_cnt());
+  }
+  CHECK(operation != nullptr);
+  const size_t host_workspace_size = operation->get_host_workspace_size(&configuration);
+  std::vector<uint8_t> host_workspace(host_workspace_size, 0);
+  auto init_status = operation->initialize(&configuration, host_workspace.data(),
+                                           tmp_buffer->mut_dptr(), stream->cuda_stream());
+  CHECK(init_status == cutlass::Status::kSuccess);
+  auto run_status = operation->run(&arguments, host_workspace.data(), tmp_buffer->mut_dptr(),
+                                   stream->cuda_stream());
+  CHECK(run_status == cutlass::Status::kSuccess);
+}
 
 }  // namespace
 
@@ -112,18 +121,39 @@ class MatmulQuantKernel final : public user_op::OpKernel {
     const int k = a->shape_view().At(dim_a - 1);
     const int n = b->shape_view().At(0);
-    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
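+    // The tuner consulted in LaunchMatmulQuantScaleBiasFusionOp above first checks the op's
+    // "tuning_cache" attr before falling back to online profiling. A minimal sketch of how such
+    // an attr could be produced with nlohmann::json; the kernel name is purely illustrative,
+    // not taken from this patch:
+    //
+    //   nlohmann::json cache;
+    //   cache["cutlass"] = "cutlass_tensorop_s8_i8816gemm_s8_256x128_64x2_tn_align16";
+    //   const std::string tuning_cache = cache.dump();  // {"cutlass":"cutlass_tensorop_..."}
+    //
+    // An empty string or a non-object payload simply disables the cached path, and
+    // FindOperation benchmarks every viable kernel for this functional key instead.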
+ cutlass::library::GemmFunctionalKey key( + cutlass::library::Provider::kCUTLASS, cutlass::library::GemmKind::kGemm, + cutlass::library::NumericTypeID::kS32, // element_compute + cutlass::library::NumericTypeID::kS32, // element_scalar + cutlass::library::NumericTypeID::kS8, // element_A + cutlass::library::LayoutTypeID::kRowMajor, // layout_A + cutlass::library::ComplexTransform::kNone, // transform_A + cutlass::library::NumericTypeID::kS8, // element_B + cutlass::library::LayoutTypeID::kColumnMajor, // layout_B + cutlass::library::ComplexTransform::kNone, // transform_B + cutlass::library::NumericTypeID::kS32, // element_C + cutlass::library::LayoutTypeID::kRowMajor, // layout_C + cutlass::library::NumericTypeID::kS32, // element_D + cutlass::library::LayoutTypeID::kRowMajor // layout_D + ); if (out->data_type() == DataType::kFloat) { - cutlass_gemm_scale_bias_s8( - ctx->stream()->As()->cuda_stream(), tmp_buffer->mut_dptr(), m, n, k, - a->dptr(), b->dptr(), scale->dptr(), bias->dptr(), - (add_to_output ? add_to_output->dptr() : nullptr), out->mut_dptr()); + key.element_scalar = cutlass::library::NumericTypeID::kF32; + key.element_C = cutlass::library::NumericTypeID::kF32; + key.element_D = cutlass::library::NumericTypeID::kF32; } else if (out->data_type() == DataType::kFloat16) { - cutlass_gemm_scale_bias_s8( - ctx->stream()->As()->cuda_stream(), tmp_buffer->mut_dptr(), m, n, k, - a->dptr(), b->dptr(), scale->dptr(), bias->dptr(), - (add_to_output ? add_to_output->dptr() : nullptr), out->mut_dptr()); + key.element_scalar = cutlass::library::NumericTypeID::kF32; + key.element_C = cutlass::library::NumericTypeID::kF16; + key.element_D = cutlass::library::NumericTypeID::kF16; + } + + cutlass::gemm::GemmCoord problem_size(m, n, k); + + if (scale) { + LaunchMatmulQuantScaleBiasFusionOp(ctx, key, problem_size, a, b, scale, bias, add_to_output, + out); + } else { + UNIMPLEMENTED(); } } }; From a857441eaab944edbabe47dcf02083cfc8348636 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Mon, 4 Sep 2023 03:14:34 +0000 Subject: [PATCH 40/65] fuse gelu quant --- oneflow/ir/include/OneFlow/OneFlowUserOps.td | 30 ++ .../PDLL/FuseOpsWithBackwardImplPattern.pdll | 20 + .../user/kernels/fused_glu_quant_kernel.cu | 346 ++++++++++++++++++ oneflow/user/ops/fused_glu_quant_op.cpp | 198 ++++++++++ 4 files changed, 594 insertions(+) create mode 100644 oneflow/user/kernels/fused_glu_quant_kernel.cu create mode 100644 oneflow/user/ops/fused_glu_quant_op.cpp diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index c31b6d8f680..768a9d3ccf1 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -5668,6 +5668,36 @@ def OneFlow_FusedGluOp : OneFlow_BaseOp<"fused_glu", [NoMemoryEffect, AttrSizedO let has_data_type_infer_fn = 1; } +def OneFlow_FusedGluQuantOp : OneFlow_BaseOp<"fused_glu_quant", [NoMemoryEffect, AttrSizedOperandSegments, DeclareOpInterfaceMethods]> { + let input = (ins + OneFlow_Tensor:$x, + OneFlow_Tensor:$w, + OneFlow_Tensor:$scale, + OneFlow_Tensor:$bias, + Optional:$v, + Optional:$v_scale, + Optional:$v_bias + ); + let attrs = (ins + DefaultValuedAttr:$activation, + DefaultValuedAttr:$is_split, + OneFlow_DataType:$out_dtype, + DefaultValuedAttr:$tuning_cache + ); + let trait_attrs = (ins + DenseI32ArrayAttr:$operand_segment_sizes + ); + let output = (outs + OneFlow_Tensor:$y, + OneFlow_Tensor:$matmul_wx, + Optional:$matmul_vx + ); + let has_logical_tensor_desc_infer_fn = 1; + let 
has_physical_tensor_desc_infer_fn = 1; + let has_get_sbp_fn = 1; + let has_data_type_infer_fn = 1; +} + def OneFlow_FusedGluWithoutLinearGradOp : OneFlow_BaseOp<"fused_glu_without_linear_grad", [NoMemoryEffect, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$dy, diff --git a/oneflow/ir/lib/OneFlow/PDLL/FuseOpsWithBackwardImplPattern.pdll b/oneflow/ir/lib/OneFlow/PDLL/FuseOpsWithBackwardImplPattern.pdll index daf2d3073bd..028ecd28f6b 100644 --- a/oneflow/ir/lib/OneFlow/PDLL/FuseOpsWithBackwardImplPattern.pdll +++ b/oneflow/ir/lib/OneFlow/PDLL/FuseOpsWithBackwardImplPattern.pdll @@ -38,3 +38,23 @@ Pattern { replace matmul_wx_add with fused_gelu_out.1; }; } + +Pattern { + let device_name: Attr; + let device_tag: Attr; + let out_dtype: Attr; + + let matmul_wx_add = op(x: Value, w: Value, scale: Value, bias: Value){device_name = device_name, device_tag = device_tag, alpha = attr<"1.000000e+00 : f64">, out_dtype = out_dtype} -> (matmul_wx_out: Type); + + let hidden_states = op(matmul_wx_add.0){device_name = device_name, device_tag = device_tag}; + let gate = op(matmul_wx_add.0){device_name = device_name, device_tag = device_tag}; + let gate_activate = op(gate.0){device_name = device_name, device_tag = device_tag}; + let gelu_out = op(hidden_states.0,gate_activate.0){device_name = device_name, device_tag = device_tag}-> (out: Type); + + rewrite gelu_out with{ + let fused_gelu_out = op(x, w, scale, bias){activation = attr<"\"gelu\"">, operand_segment_sizes = attr<"array">, device_name = device_name, device_tag = device_tag, out_dtype = out_dtype}-> (out, matmul_wx_out); + CopyUserOpAttrs(gelu_out, fused_gelu_out); + replace gelu_out with fused_gelu_out.0; + replace matmul_wx_add with fused_gelu_out.1; + }; +} diff --git a/oneflow/user/kernels/fused_glu_quant_kernel.cu b/oneflow/user/kernels/fused_glu_quant_kernel.cu new file mode 100644 index 00000000000..0134790cdfe --- /dev/null +++ b/oneflow/user/kernels/fused_glu_quant_kernel.cu @@ -0,0 +1,346 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_CUTLASS_EXTENSION + +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/cuda/elementwise.cuh" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/cuda/cuda_stream.h" +#include "oneflow/core/ep/include/primitive/matmul.h" +#include "oneflow/core/ep/include/primitive/unary_op.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/common/device_type.h" +#include "oneflow/core/ep/common/primitive/unary_functor.h" +#include "oneflow/core/ep/cuda/primitive/unary_functor.cuh" +#include "oneflow/core/kernel/util/cuda_half_util.h" +#include "oneflow/core/kernel/cuda_graph_support.h" + +#include "oneflow/user/kernels/cutlass_gemm_tuner.h" + +#include +#include +#include +#include + +#if CUDA_VERSION >= 11020 + +namespace oneflow { + +namespace { + +void LaunchMatmulQuantScaleBiasFusionOp(user_op::KernelComputeContext* ctx, + const cutlass::library::GemmFunctionalKey& key, + const cutlass::gemm::GemmCoord& problem_size, + const user_op::Tensor* a, const user_op::Tensor* b, + const user_op::Tensor* scale, const user_op::Tensor* bias, + const user_op::Tensor* add_to_output, + user_op::Tensor* out) { + cutlass::library::GemmScaleBiasFusionConfiguration configuraion; + configuraion.problem_size = problem_size; + configuraion.lda = problem_size.k(); + configuraion.ldb = problem_size.k(); + configuraion.ld_scale = 0; + configuraion.ld_bias = 0; + configuraion.ldr = problem_size.n(); + configuraion.ldd = problem_size.n(); + configuraion.split_k_slices = 1; + // if (problem_size.m() <= 2 && problem_size.k() >= 4096) { configuraion.split_k_slices = 16; } + + cutlass::library::GemmScaleBiasFusionArguments arguments; + arguments.A = a->dptr(); + arguments.B = b->dptr(); + arguments.Scale = scale->dptr(); + arguments.Bias = bias->dptr(); + if (add_to_output) { + arguments.Residual = add_to_output->dptr(); + } else { + arguments.Residual = nullptr; + } + arguments.D = out->mut_dptr(); + + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + auto* stream = ctx->stream()->As(); + const cutlass::library::Operation* operation = nullptr; + operation = [&]() -> const cutlass::library::Operation* { + const std::string& tuning_cache = ctx->Attr("tuning_cache"); + if (tuning_cache.empty()) { return nullptr; } + auto tuning_cache_object = nlohmann::json::parse(tuning_cache); + if (!tuning_cache_object.is_object()) { return nullptr; } + auto it = tuning_cache_object.find("cutlass"); + if (it == tuning_cache_object.end()) { return nullptr; } + if (!it->is_string()) { return nullptr; } + const std::string name = *it; + return CutlassGemmTuner().GetOperation(name, stream, key, configuraion, arguments, + tmp_buffer->mut_dptr(), + tmp_buffer->shape_view().elem_cnt()); + }(); + if (!operation) { + operation = CutlassGemmTuner().FindOperation(stream, key, configuraion, arguments, + tmp_buffer->mut_dptr(), + tmp_buffer->shape_view().elem_cnt()); + } + CHECK(operation != nullptr); + const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion); + std::vector host_workspace(host_workspace_size, 0); + auto init_status = operation->initialize(&configuraion, host_workspace.data(), + tmp_buffer->mut_dptr(), stream->cuda_stream()); + CHECK(init_status == cutlass::Status::kSuccess); + auto run_status = operation->run(&arguments, host_workspace.data(), tmp_buffer->mut_dptr(), + stream->cuda_stream()); + CHECK(run_status == cutlass::Status::kSuccess); +} + +template +__global__ void FusedGluForwardGpu( + const IndexType m, 
const IndexType packed_n, const IndexType packed_num, + const IndexType packed_stride, + ep::primitive::UnaryFunctor act, T* matmul_wx, T* matmul_vx, + T* y) { + // obtain global thread index + IndexType global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + // define type of Pack + using LoadPack = cuda::elementwise::Packed; + + // workload of current thread + for (IndexType packed_index = global_thread_id, step = gridDim.x * blockDim.x; + packed_index < packed_num; packed_index += step) { + // obtain the row and col index in output tensor "y" + const IndexType y_packed_row = packed_index / packed_n; + const IndexType y_packed_col = packed_index - y_packed_row * packed_n; + + // cast type to load type + const LoadPack* matmul_wx_load = + reinterpret_cast(matmul_wx) + (y_packed_row * packed_stride + y_packed_col); + const LoadPack* matmul_vx_load = + reinterpret_cast(matmul_vx) + (y_packed_row * packed_stride + y_packed_col); + + // init vectors + LoadPack matmul_wx_vec = *matmul_wx_load; + LoadPack matmul_vx_vec = *matmul_vx_load; + LoadPack y_vec; + +#pragma unroll + for (int i = 0; i < pack_size; i++) { + // obtain the hidden_state and gate + T hidden_state = matmul_wx_vec.elem[i]; + T gate = matmul_vx_vec.elem[i]; + + // calculate activation + T act_gate = act(gate); + + // calculate element-wise product + y_vec.elem[i] = hidden_state * act_gate; + } + *(reinterpret_cast(y + packed_index * pack_size)) = y_vec; + } +} + +template +void LaunchFusedGluForwardGpu(ep::Stream* stream, const IndexType m, const IndexType packed_n, + const IndexType pack_num, const IndexType packed_stride, T* matmul_wx, + T* matmul_vx, T* y) { + constexpr int32_t block_size = 128; + unsigned int grid_size = (pack_num + block_size - 1) / block_size; + ep::primitive::UnaryFunctor act(0, 0); + FusedGluForwardGpu + <<As()->cuda_stream()>>>( + m, packed_n, pack_num, packed_stride, act, matmul_wx, matmul_vx, y); +} + +template +void DispatchIndexType(ep::Stream* stream, const int64_t m, const int64_t packed_n, + const int64_t pack_num, const int64_t packed_stride, T* matmul_wx, + T* matmul_vx, T* y) { + // dispatch index type + if (pack_num < (1 << 30)) { + LaunchFusedGluForwardGpu( + stream, m, packed_n, pack_num, packed_stride, matmul_wx, matmul_vx, y); + } else { + LaunchFusedGluForwardGpu( + stream, m, packed_n, pack_num, packed_stride, matmul_wx, matmul_vx, y); + } +} + +template::type = 0> +void DispatchPackSize(ep::Stream* stream, const int64_t m, const int64_t n, const int64_t stride, + T* matmul_wx, T* matmul_vx, T* y) { + DispatchIndexType(stream, m, n, m * n, stride, matmul_wx, matmul_vx, y); +} + +template::type = 0> +void DispatchPackSize(ep::Stream* stream, const int64_t m, const int64_t n, const int64_t stride, + T* matmul_wx, T* matmul_vx, T* y) { + const int64_t pack_size = alignment / sizeof(T); + const int64_t packed_n = n / pack_size; + const int64_t pack_num = m * packed_n; + const int64_t packed_stride = stride / pack_size; + DispatchIndexType(stream, m, packed_n, pack_num, + packed_stride, matmul_wx, matmul_vx, y); +} + +template +void DispatchAlignment(ep::Stream* stream, const int64_t m, const int64_t n, const int64_t stride, + T* matmul_wx, T* matmul_vx, T* y) { + const auto IsAligned = [&](const size_t alignment) { + const uintptr_t matmul_wx_ptr = reinterpret_cast(matmul_wx); + const uintptr_t matmul_vx_ptr = reinterpret_cast(matmul_vx); + const uintptr_t y_ptr = reinterpret_cast(y); + + return (/* memory address alignment */ + matmul_wx_ptr % alignment == 0 && matmul_vx_ptr % 
alignment == 0 + && y_ptr % alignment == 0 + /* #element per row alignment */ + && n % (alignment / sizeof(T)) == 0); + }; + + if (IsAligned(16)) { + DispatchPackSize(stream, m, n, stride, matmul_wx, matmul_vx, y); + } else if (IsAligned(8)) { + DispatchPackSize(stream, m, n, stride, matmul_wx, matmul_vx, y); + } else if (IsAligned(4)) { + DispatchPackSize(stream, m, n, stride, matmul_wx, matmul_vx, y); + } else if (IsAligned(2)) { + DispatchPackSize(stream, m, n, stride, matmul_wx, matmul_vx, y); + } else { + DispatchPackSize(stream, m, n, stride, matmul_wx, matmul_vx, y); + } +} + +template +void DispatchActivationType(ep::Stream* stream, const int64_t m, const int64_t n, + const int64_t stride, T* matmul_wx, T* matmul_vx, T* y, + const std::string& activation) { + if (activation == "none") { + DispatchAlignment(stream, m, n, stride, matmul_wx, + matmul_vx, y); + } else if (activation == "sigmoid") { + DispatchAlignment(stream, m, n, stride, matmul_wx, + matmul_vx, y); + } else if (activation == "relu") { + DispatchAlignment(stream, m, n, stride, matmul_wx, matmul_vx, + y); + } else if (activation == "gelu") { + DispatchAlignment(stream, m, n, stride, matmul_wx, matmul_vx, + y); + } else if (activation == "fast_gelu") { + DispatchAlignment(stream, m, n, stride, matmul_wx, + matmul_vx, y); + } else if (activation == "silu") { + DispatchAlignment(stream, m, n, stride, matmul_wx, matmul_vx, + y); + } else { + UNIMPLEMENTED(); + } +} + +template +class GpuFusedGluQuantKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + GpuFusedGluQuantKernel() = default; + ~GpuFusedGluQuantKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + const user_op::Tensor* input_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* input_w = ctx->Tensor4ArgNameAndIndex("w", 0); + const user_op::Tensor* input_scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + const user_op::Tensor* input_bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + + user_op::Tensor* out_y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* out_matmul_wx = ctx->Tensor4ArgNameAndIndex("matmul_wx", 0); + user_op::Tensor* out_matmul_vx = nullptr; + + CHECK(!ctx->has_input("v", 0)) << "fused_glu_quant does not support split mode"; + bool is_split_mode = false; + + const ShapeView& x_shape = input_x->shape_view(); + const ShapeView& w_shape = input_w->shape_view(); + const ShapeView& y_shape = out_y->shape_view(); + + const DataType data_type = out_y->data_type(); + size_t x_num_axes = x_shape.NumAxes(); + // infer m, n, k + const int64_t m = x_shape.Count(0, x_num_axes - 1); + const int64_t n = y_shape.At(x_num_axes - 1); + const int64_t k = x_shape.At(x_num_axes - 1); + + cutlass::library::GemmFunctionalKey key( + cutlass::library::Provider::kCUTLASS, cutlass::library::GemmKind::kGemm, + cutlass::library::NumericTypeID::kS32, // element_compute + cutlass::library::NumericTypeID::kS32, // element_scalar + cutlass::library::NumericTypeID::kS8, // element_A + cutlass::library::LayoutTypeID::kRowMajor, // layout_A + cutlass::library::ComplexTransform::kNone, // transform_A + cutlass::library::NumericTypeID::kS8, // element_B + cutlass::library::LayoutTypeID::kColumnMajor, // layout_B + cutlass::library::ComplexTransform::kNone, // transform_B + cutlass::library::NumericTypeID::kS32, // element_C + cutlass::library::LayoutTypeID::kRowMajor, // 
layout_C + cutlass::library::NumericTypeID::kS32, // element_D + cutlass::library::LayoutTypeID::kRowMajor // layout_D + ); + if (data_type == DataType::kFloat) { + key.element_scalar = cutlass::library::NumericTypeID::kF32; + key.element_C = cutlass::library::NumericTypeID::kF32; + key.element_D = cutlass::library::NumericTypeID::kF32; + } else if (data_type == DataType::kFloat16) { + key.element_scalar = cutlass::library::NumericTypeID::kF32; + key.element_C = cutlass::library::NumericTypeID::kF16; + key.element_D = cutlass::library::NumericTypeID::kF16; + } + cutlass::gemm::GemmCoord problem_size(m, 2 * n, k); + + LaunchMatmulQuantScaleBiasFusionOp(ctx, key, problem_size, input_x, input_w, input_scale, + input_bias, nullptr, out_matmul_wx); + + // dispatch according to activation type + DispatchActivationType( + ctx->stream(), + /*m, n=*/m, n, + /*stride=*/is_split_mode ? n : 2 * n, + /*matmul_wx=*/out_matmul_wx->mut_dptr(), + /*matmul_vx=*/ + is_split_mode ? out_matmul_vx->mut_dptr() : out_matmul_wx->mut_dptr() + n, + /*y=*/out_y->mut_dptr(), + /*activation=*/ctx->Attr("activation")); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +} // namespace + +#define REGISTER_GPU_FUSED_GLU_QUANT_KERNEL(T, OutT) \ + REGISTER_USER_KERNEL("fused_glu_quant") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 128 * 1024 * 1024; }) + +REGISTER_GPU_FUSED_GLU_QUANT_KERNEL(int8_t, float); +REGISTER_GPU_FUSED_GLU_QUANT_KERNEL(int8_t, half); + +} // namespace oneflow + +#endif // CUDA_VERSION >= 11020 + +#endif // WITH_CUTLASS_EXTENSION diff --git a/oneflow/user/ops/fused_glu_quant_op.cpp b/oneflow/user/ops/fused_glu_quant_op.cpp new file mode 100644 index 00000000000..569e4d66de4 --- /dev/null +++ b/oneflow/user/ops/fused_glu_quant_op.cpp @@ -0,0 +1,198 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/framework/op_generated.h"
+
+namespace oneflow {
+
+/* static */ auto FusedGluQuantOp::GetSbp(user_op::SbpContext* ctx) -> Maybe<void> {
+  // check whether the user provides weight tensor v
+  bool is_split_mode = false;
+  if (ctx->user_op_conf().has_input("v", 0)) { is_split_mode = true; }
+  if (is_split_mode) {
+    CHECK_OR_RETURN(ctx->user_op_conf().has_input("v_scale", 0))
+        << "expected v_scale for split mode";
+    CHECK_OR_RETURN(ctx->user_op_conf().has_input("v_bias", 0)) << "expected v_bias for split mode";
+  }
+
+  // data parallelism
+  for (int64_t i = 0; i < ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0).shape().NumAxes() - 1;
+       ++i) {
+    if (is_split_mode) {
+      ctx->NewBuilder()
+          .Split(user_op::OpArg("x", 0), i)
+          .Broadcast(user_op::OpArg("w", 0))
+          .Broadcast(user_op::OpArg("scale", 0))
+          .Broadcast(user_op::OpArg("bias", 0))
+          .Broadcast(user_op::OpArg("v", 0))
+          .Broadcast(user_op::OpArg("v_scale", 0))
+          .Broadcast(user_op::OpArg("v_bias", 0))
+          .Split(ctx->outputs(), i)
+          .Build();
+    } else {
+      ctx->NewBuilder()
+          .Split(user_op::OpArg("x", 0), i)
+          .Broadcast(user_op::OpArg("w", 0))
+          .Broadcast(user_op::OpArg("scale", 0))
+          .Broadcast(user_op::OpArg("bias", 0))
+          .Split(ctx->outputs(), i)
+          .Build();
+    }
+  }
+
+  // model parallelism
+  if (is_split_mode) {
+    ctx->NewBuilder()
+        .Broadcast(user_op::OpArg("x", 0))
+        .Split(user_op::OpArg("w", 0), 0)
+        .Split(user_op::OpArg("scale", 0), 0)
+        .Split(user_op::OpArg("bias", 0), 0)
+        .Split(user_op::OpArg("v", 0), 0)
+        .Split(user_op::OpArg("v_scale", 0), 0)
+        .Split(user_op::OpArg("v_bias", 0), 0)
+        .Split(ctx->outputs(),
+               ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0).shape().NumAxes() - 1)
+        .Build();
+  }
+
+  return Maybe<void>::Ok();
+}
+
+/* static */ auto FusedGluQuantOp::InferLogicalTensorDesc(user_op::InferContext* ctx)
+    -> Maybe<void> {
+  // obtain input shape
+  const Shape& x_shape = ctx->InputShape("x", 0);
+  const Shape& w_shape = ctx->InputShape("w", 0);
+
+  // check whether the user provides weight tensor v
+  bool is_split_mode = false;
+  if (ctx->has_input("v", 0)) { is_split_mode = true; }
+  if (is_split_mode) {
+    CHECK_OR_RETURN(ctx->has_input("v_scale", 0)) << "expected v_scale for split mode";
+    CHECK_OR_RETURN(ctx->has_input("v_bias", 0)) << "expected v_bias for split mode";
+  }
+
+  // check dimensions of x, w and b
+  CHECK_GT_OR_RETURN(x_shape.NumAxes(), 1)
+      << "number of axes of \'x\' should be greater than 1, yet get " << x_shape.NumAxes();
+  CHECK_EQ_OR_RETURN(w_shape.NumAxes(), 2)
+      << "number of axes of \'w\' should be equal to 2, yet get " << w_shape.NumAxes();
+
+  // check input shapes of w and b
+  size_t x_num_axes = x_shape.NumAxes();
+  CHECK_EQ_OR_RETURN(w_shape.At(1), x_shape.At(x_num_axes - 1))
+      << "dimension 1 of \'w\'(" << w_shape.At(1)
+      << ") is not consistent with the last dimension of \'x\'(" << x_shape.At(x_num_axes - 1)
+      << ")";
+
+  const Shape& scale_shape = ctx->InputShape("scale", 0);
+  CHECK_EQ_OR_RETURN(scale_shape.Count(0), w_shape.At(0))
+      << "the element count of \'scale\'(" << scale_shape.Count(0)
+      << ") is not consistent with dimension 0 of \'w\'(" << w_shape.At(0) << ")";
+
+  const Shape& bias_shape = ctx->InputShape("bias", 0);
+  CHECK_EQ_OR_RETURN(bias_shape.Count(0), w_shape.At(0))
+      << "the element count of \'bias\'(" << bias_shape.Count(0)
+      << ") is not consistent with dimension 0 of \'w\'(" << w_shape.At(0) << ")";
+
+  if (!is_split_mode) {
+    CHECK_EQ_OR_RETURN(w_shape.At(0) % 2, 0) << "dimension 0 of \'w\' is not divisible by 2";
+  }
+
+  // check both dimensions and input shapes of v and v_scale, v_bias (optional)
+  if (is_split_mode) {
+    const Shape& v_shape = ctx->InputShape("v", 0);
+
+    CHECK_EQ_OR_RETURN(v_shape.NumAxes(), 2)
+        << "number of axes of \'v\' should be equal to 2, yet get " << v_shape.NumAxes();
+    CHECK_OR_RETURN(v_shape == w_shape) << "the shape of \'v\' is not consistent with \'w\'";
+
+    const Shape& v_scale_shape = ctx->InputShape("v_scale", 0);
+    CHECK_EQ_OR_RETURN(v_scale_shape.Count(0), v_shape.At(0))
+        << "the element count of \'v_scale\'(" << v_scale_shape.Count(0)
+        << ") is not consistent with dimension 0 of \'v\'(" << v_shape.At(0) << ")";
+
+    const Shape& v_bias_shape = ctx->InputShape("v_bias", 0);
+    CHECK_EQ_OR_RETURN(v_bias_shape.Count(0), v_shape.At(0))
+        << "the element count of \'v_bias\'(" << v_bias_shape.Count(0)
+        << ") is not consistent with dimension 0 of \'v\'(" << v_shape.At(0) << ")";
+  }
+
+  // set shape of the output tensor y
+  Shape y_shape = x_shape;  // borrow from input shape
+  size_t y_num_axes = x_num_axes;
+  if (is_split_mode) {
+    y_shape.Set(y_num_axes - 1, w_shape.At(0));
+  } else {
+    y_shape.Set(y_num_axes - 1, w_shape.At(0) / 2);
+  }
+  user_op::TensorDesc* y_tensor = ctx->MutOutputTensorDesc("y", 0);
+  y_tensor->set_shape(y_shape);
+
+  // set shape of the output tensors of both matmul_wx and matmul_vx
+  Shape matmul_wx_shape = x_shape;  // borrow from input shape
+  matmul_wx_shape.Set(x_num_axes - 1, w_shape.At(0));
+  user_op::TensorDesc* matmul_wx_tensor = ctx->MutOutputTensorDesc("matmul_wx", 0);
+  matmul_wx_tensor->set_shape(matmul_wx_shape);
+  if (is_split_mode) {
+    user_op::TensorDesc* matmul_vx_tensor = ctx->MutOutputTensorDesc("matmul_vx", 0);
+    matmul_vx_tensor->set_shape(y_shape);
+  }
+
+  return Maybe<void>::Ok();
+}
+
+/* static */ auto FusedGluQuantOp::InferPhysicalTensorDesc(user_op::InferContext* ctx)
+    -> Maybe<void> {
+  return InferLogicalTensorDesc(ctx);
+}
+
+/* static */ auto FusedGluQuantOp::InferDataType(user_op::InferContext* ctx) -> Maybe<void> {
+  DataType out_dtype = ctx->Attr<DataType>("out_dtype");
+  // obtain input data types
+  DataType x_dtype = ctx->InputDType("x", 0);
+
+  // check whether the user provides weight tensor v
+  bool is_split_mode = false;
+  if (ctx->has_input("v", 0)) { is_split_mode = true; }
+
+  // check types of x, w and b
+  CHECK_EQ_OR_RETURN(ctx->InputDType("w", 0), x_dtype)
+      << "data type of \'w\' is not consistent with \'x\'";
+  CHECK_EQ_OR_RETURN(ctx->InputDType("scale", 0), out_dtype)
+      << "data type of \'scale\' is not consistent with out dtype " << out_dtype;
+  CHECK_EQ_OR_RETURN(ctx->InputDType("bias", 0), out_dtype)
+      << "data type of \'bias\' is not consistent with out dtype " << out_dtype;
+
+  // check types of v, v_scale and v_bias (optional)
+  if (is_split_mode) {
+    CHECK_EQ_OR_RETURN(ctx->InputDType("v", 0), x_dtype)
+        << "data type of \'v\' is not consistent with \'x\'";
+    CHECK_EQ_OR_RETURN(ctx->InputDType("v_scale", 0), out_dtype)
+        << "data type of \'v_scale\' is not consistent with out dtype " << out_dtype;
+    CHECK_EQ_OR_RETURN(ctx->InputDType("v_bias", 0), out_dtype)
+        << "data type of \'v_bias\' is not consistent with out dtype " << out_dtype;
+  }
+
+  // set output data type
+  ctx->SetOutputDType("y", 0, out_dtype);
+  ctx->SetOutputDType("matmul_wx", 0, out_dtype);
+  if (is_split_mode) { ctx->SetOutputDType("matmul_vx", 0, out_dtype); }
+
+  return Maybe<void>::Ok();
+}
+
+} // namespace oneflow

From 9ac9a1c7e2c742fadc9bfd7da72eb24053d7de5b Mon Sep 17 00:00:00 2001
From: clackhan
Date: Mon, 4 Sep 2023 08:21:11
+0000 Subject: [PATCH 41/65] add_prune_redundant_quantization_op_pass --- oneflow/core/job/job_build_and_infer_ctx.cpp | 1 + .../prune_redundant_quantization_op_pass.cpp | 108 ++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp diff --git a/oneflow/core/job/job_build_and_infer_ctx.cpp b/oneflow/core/job/job_build_and_infer_ctx.cpp index 3c77539cbcf..d17352f0898 100644 --- a/oneflow/core/job/job_build_and_infer_ctx.cpp +++ b/oneflow/core/job/job_build_and_infer_ctx.cpp @@ -1027,6 +1027,7 @@ Maybe LazyJobBuildAndInferCtx::Complete() { JUST(DoPass("DoParallelCastBeforeWideningTypeCast")); JUST(DoPass("FuseCastScalePass")); JUST(DoPass("PruneParallelCastOpsPass")); + JUST(DoPass("PruneRedundantQuantizationOpsPass")); JUST(DoPass("FuseUpdateOpsPass")); JUST(DoPass("FuseModelUpdateCastOpsPass")); JUST(DoPass("MultiTensorModelUpdatePass")); diff --git a/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp b/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp new file mode 100644 index 00000000000..ca786d84dd4 --- /dev/null +++ b/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp @@ -0,0 +1,108 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/job_rewriter/job_pass.h"
+
+namespace oneflow {
+
+namespace {
+
+bool IsQunatizationOp(const OperatorConf& op_conf) {
+  return op_conf.has_user_conf()
+         && (op_conf.user_conf().op_type_name() == "quantization");
+}
+
+bool NeedDoPass(const Job& job) {
+  return std::any_of(job.net().op().cbegin(), job.net().op().cend(), IsQunatizationOp);
+}
+
+class PruneReduntantQuantizationOpsPass final : public JobPass {
+ public:
+  PruneReduntantQuantizationOpsPass() = default;
+  ~PruneReduntantQuantizationOpsPass() override = default;
+
+  bool IsEnabled(const JobPassCtx& ctx) const { return true; }
+  Maybe<void> Apply(const OpGraph& op_graph, JobBuilder* job_builder) const;
+
+  Maybe<void> Apply(Job* job, JobPassCtx* ctx) const override {
+    if (!IsEnabled(*ctx)) { return Maybe<void>::Ok(); }
+    if (!NeedDoPass(*job)) { return Maybe<void>::Ok(); }
+    const OpGraph op_graph(*job);
+    JobBuilder job_builder(job);
+    return Apply(op_graph, &job_builder);
+  }
+};
+
+Maybe<void> PruneReduntantQuantizationOpsPass::Apply(const OpGraph& op_graph,
+    JobBuilder* job_builder) const {
+  HashMap<std::string, OperatorConf> op_name2op_conf;
+  HashSet<std::string> ctrl_in_op_names;
+  op_graph.ForEachNode([&](const OpNode* op_node) {
+    for (const std::string& ctrl_in_op_name : op_node->op().op_conf().ctrl_in_op_name()) {
+      ctrl_in_op_names.insert(ctrl_in_op_name);
+    }
+  });
+  std::vector<std::string> del_op_names;
+  op_graph.ForEachNode([&](const OpNode* op_node) {
+    if (op_node->out_edges().size() == 1) { return; }
+    bool has_found_quant_op = false;
+    LogicalBlobId first_quantization_lbi;
+    for (const auto* out_edge : op_node->out_edges()) {
+      OpNode* consumer = out_edge->dst_node();
+      const OperatorConf& op_conf = consumer->op().op_conf();
+
+      if (ctrl_in_op_names.find(op_conf.name()) != ctrl_in_op_names.end()) { return; }
+      if (!op_conf.has_user_conf()) { continue; }
+      if (op_conf.user_conf().op_type_name() != "quantization") { continue; }
+      std::vector<std::string> first_quantization_ctrl_in_op_names;
+      user_op::UserOpConfWrapper conf_wrapper(op_conf);
+      if (has_found_quant_op) {
+        const LogicalBlobId& quantization_lbi = GenLogicalBlobId(conf_wrapper.output("out", 0));
+        for (const OpEdge* consumer_out_edge : consumer->out_edges()) {
+          const OpNode* consumer = consumer_out_edge->dst_node();
+          const std::string& consumer_op_name = consumer->op().op_name();
+          if (op_name2op_conf.find(consumer_op_name) == op_name2op_conf.end()) {
+            op_name2op_conf[consumer_op_name] = consumer->op().op_conf();
+          }
+          OperatorConf& consumer_op_conf = op_name2op_conf.at(consumer_op_name);
+          for (const std::string& ibn : consumer->op().input_bns()) {
+            if (consumer->op().BnInOp2Lbi(ibn) == quantization_lbi) {
+              const auto& new_val = GenLogicalBlobName(first_quantization_lbi);
+              const auto& old_val = ReplaceInputLbnInOpCustomizedConf(&consumer_op_conf, ibn, new_val);
+              CHECK_EQ(GenLogicalBlobName(quantization_lbi), old_val);
+              for (const auto& ctrl_in_op_name : op_conf.ctrl_in_op_name()) {
+                consumer_op_conf.add_ctrl_in_op_name(ctrl_in_op_name);
+              }
+            }
+          }
+          del_op_names.emplace_back(op_conf.name());
+        }
+      } else {
+        first_quantization_lbi = GenLogicalBlobId(conf_wrapper.output("out", 0));
+        for (const auto& ctrl_in_op_name : op_conf.ctrl_in_op_name()) {
+          first_quantization_ctrl_in_op_names.emplace_back(ctrl_in_op_name);
+        }
+        has_found_quant_op = true;
+      }
+    }
+  });
+  for (const auto& pair : op_name2op_conf) { job_builder->MutOpsOnlyOnce({pair.second}); }
+  job_builder->DelOps(del_op_names);
+  return Maybe<void>::Ok();
+}
+
+} // namespace
+
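+// A sketch of the rewrite this pass performs, with purely illustrative op names: when one
+// producer output feeds several sibling "quantization" ops, every downstream consumer is
+// rewired onto the first such op, and the now-dead duplicates are deleted.
+//
+//   before:  producer -> quantization_0 -> matmul_a      after:  producer -> quantization_0 -> matmul_a
+//            producer -> quantization_1 -> matmul_b              producer -> quantization_0 -> matmul_b
+//
+// Consumers touched by the rewiring are committed once through MutOpsOnlyOnce, and the
+// redundant quantization ops collected in del_op_names are removed via DelOps.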
+REGISTER_JOB_PASS("PruneRedundantQuantizationOpsPass", PruneReduntantQuantizationOpsPass); + +} // namespace oneflow From 2d45cc64c210acb4394fdc3e27041bc5bcb9b784 Mon Sep 17 00:00:00 2001 From: oneflow-ci-bot Date: Mon, 4 Sep 2023 08:24:22 +0000 Subject: [PATCH 42/65] auto format by CI --- .../prune_redundant_quantization_op_pass.cpp | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp b/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp index ca786d84dd4..191c0693dd9 100644 --- a/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp +++ b/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp @@ -1,5 +1,20 @@ /* Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -18,8 +33,7 @@ namespace oneflow { namespace { bool IsQunatizationOp(const OperatorConf& op_conf) { - return op_conf.has_user_conf() - && (op_conf.user_conf().op_type_name() == "quantization"); + return op_conf.has_user_conf() && (op_conf.user_conf().op_type_name() == "quantization"); } bool NeedDoPass(const Job& job) { @@ -44,7 +58,7 @@ class PruneReduntantQuantizationOpsPass final : public JobPass { }; Maybe PruneReduntantQuantizationOpsPass::Apply(const OpGraph& op_graph, - JobBuilder* job_builder) const { + JobBuilder* job_builder) const { HashMap op_name2op_conf; HashSet ctrl_in_op_names; op_graph.ForEachNode([&](const OpNode* op_node) { @@ -78,7 +92,8 @@ Maybe PruneReduntantQuantizationOpsPass::Apply(const OpGraph& op_graph, for (const std::string& ibn : consumer->op().input_bns()) { if (consumer->op().BnInOp2Lbi(ibn) == quantization_lbi) { const auto& new_val = GenLogicalBlobName(first_quantization_lbi); - const auto& old_val = ReplaceInputLbnInOpCustomizedConf(&consumer_op_conf, ibn, new_val); + const auto& old_val = + ReplaceInputLbnInOpCustomizedConf(&consumer_op_conf, ibn, new_val); CHECK_EQ(GenLogicalBlobName(quantization_lbi), old_val); for (const auto& ctrl_in_op_name : op_conf.ctrl_in_op_name()) { consumer_op_conf.add_ctrl_in_op_name(ctrl_in_op_name); From 9af9b8dba8fde34e2314688f4f7bda430e4bf344 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Mon, 4 Sep 2023 14:33:50 +0000 Subject: [PATCH 43/65] optimize activation dynamic quantization --- oneflow/core/functional/functional_api.yaml | 6 + oneflow/core/functional/impl/quantization.cpp | 46 ++++ oneflow/ir/include/OneFlow/OneFlowUserOps.td | 26 ++ ...used_activation_min_max_observer_kernel.cu | 258 ++++++++++++++++++ .../user/ops/fused_min_max_observer_op.cpp | 87 ++++++ 5 files changed, 423 insertions(+) create mode 100644 oneflow/user/kernels/fused_activation_min_max_observer_kernel.cu create mode 100644 oneflow/user/ops/fused_min_max_observer_op.cpp diff --git 
a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 66b642cb3bf..7662384b143 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -1106,6 +1106,12 @@ signature: 'Tensor (Tensor x, Tensor w, Tensor w_scale, *, Tensor w_zero=None, Tensor b=None, Int32 num_bits=8, Bool symmetric=True, Int64 group_dim=-1, Int64 group_size=-1) => FusedLinearWithGroupwiseQuantizedWeight' bind_python: True +- name: "fused_activation_min_max_observer" + signature: + "TensorTuple (Tensor in, Tensor weight_scale, Tensor weight_acc, Tensor bias=None, String quantization_formula, Int32 quantization_bit, + String quantization_scheme, Bool per_layer_quantization=True) => FusedActivationMinMaxObserver" + bind_python: True + - name: "conv_data_grad" signature: 'Tensor (Tensor dy, Tensor weight, Tensor x, Int32 num_spatial_dims, diff --git a/oneflow/core/functional/impl/quantization.cpp b/oneflow/core/functional/impl/quantization.cpp index bb28693282b..c6e6b65ce58 100644 --- a/oneflow/core/functional/impl/quantization.cpp +++ b/oneflow/core/functional/impl/quantization.cpp @@ -90,6 +90,51 @@ class MovingAverageMinMaxObserverFunctor { std::shared_ptr op_; }; +class FusedActivationMinMaxObserverFunctor { + public: + FusedActivationMinMaxObserverFunctor() { + op_ = CHECK_JUST(one::OpBuilder("fused_activation_min_max_observer") + .Input("in") + .Input("weight_scale") + .Input("weight_acc") + .Output("in_scale") + .Output("in_zero_point") + .Output("out_scale") + .Output("out_bias") + .Build()); + op_with_bias_ = CHECK_JUST(one::OpBuilder("fused_activation_min_max_observer") + .Input("in") + .Input("weight_scale") + .Input("weight_acc") + .Input("bias") + .Output("in_scale") + .Output("in_zero_point") + .Output("out_scale") + .Output("out_bias") + .Build()); + } + Maybe operator()( + const std::shared_ptr& in, const std::shared_ptr& weight_scale, + const std::shared_ptr& weight_acc, const Optional& bias, + const std::string& quantization_formula, const int32_t& quantization_bit, + const std::string& quantization_scheme, const bool& per_layer_quantization) const { + auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("quantization_formula", "quantization_bit", + "quantization_scheme", "per_layer_quantization"); + attrs.SetAllAttrs(quantization_formula, quantization_bit, quantization_scheme, + per_layer_quantization); + if (bias) { + return OpInterpUtil::Dispatch(*op_with_bias_, + {in, weight_scale, weight_acc, JUST(bias)}, attrs); + } else { + return OpInterpUtil::Dispatch(*op_, {in, weight_scale, weight_acc}, attrs); + } + } + + private: + std::shared_ptr op_; + std::shared_ptr op_with_bias_; +}; + class FakeQuantizationFunctor { public: FakeQuantizationFunctor() { @@ -390,6 +435,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("GroupwiseDequantize"); m.add_functor( "FusedLinearWithGroupwiseQuantizedWeight"); + m.add_functor("FusedActivationMinMaxObserver"); }; } // namespace functional diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index 768a9d3ccf1..5012eb185bc 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -8354,6 +8354,32 @@ def OneFlow_FusedLinearWithGroupwiseQuantizedWeightOp : OneFlow_BaseOp<"fused_li let has_data_type_infer_fn = 1; } +def OneFlow_FusedActivationMinMaxObserverOp : OneFlow_BaseOp<"fused_activation_min_max_observer", [NoMemoryEffect, NoGrad, DeclareOpInterfaceMethods]> { + let input = (ins + 
OneFlow_Tensor:$in, + OneFlow_Tensor:$weight_scale, + OneFlow_Tensor:$weight_acc, + Optional:$bias + ); + let output = (outs + OneFlow_Tensor:$in_scale, + OneFlow_Tensor:$in_zero_point, + OneFlow_Tensor:$out_scale, + OneFlow_Tensor:$out_bias + ); + let attrs = (ins + DefaultValuedAttr:$quantization_formula, + DefaultValuedAttr:$quantization_bit, + DefaultValuedAttr:$quantization_scheme, + DefaultValuedAttr:$per_layer_quantization + ); + let has_check_fn = 1; + let has_logical_tensor_desc_infer_fn = 1; + let has_physical_tensor_desc_infer_fn = 1; + let has_get_sbp_fn = 1; + let has_data_type_infer_fn = 1; +} + def OneFlow_Conv2DQuantOp : OneFlow_BaseOp<"conv2d_quant", [NoMemoryEffect, AttrSizedOperandSegments, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let summary = "OneFlow fused convolution quant operation"; let description = [{ diff --git a/oneflow/user/kernels/fused_activation_min_max_observer_kernel.cu b/oneflow/user/kernels/fused_activation_min_max_observer_kernel.cu new file mode 100644 index 00000000000..c6215ff23ef --- /dev/null +++ b/oneflow/user/kernels/fused_activation_min_max_observer_kernel.cu @@ -0,0 +1,258 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/cuda/elementwise.cuh" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.cuh" +#include "oneflow/core/ndarray/binary_func.h" +#include "oneflow/core/kernel/util/numeric_limits.cuh" + +namespace oneflow { + +namespace { + +template +__host__ __device__ int ModDiv(int64_t N) { + return N - (N / M * M); +} + +template<> +__host__ __device__ int ModDiv<2>(int64_t N) { + return N & 0x1; +} + +template<> +__host__ __device__ int ModDiv<4>(int64_t N) { + return N & 0x3; +} + +template<> +__host__ __device__ int ModDiv<8>(int64_t N) { + return N & 0x7; +} + +template<> +__host__ __device__ int ModDiv<16>(int64_t N) { + return N & 0xF; +} + +template +struct MinMaxVal { + T min; + T max; +}; + +template +__global__ void ReduceMinMaxPerLayer(const int64_t elements, const T* in_ptr, T* min_max_ptr) { + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + + extern __shared__ uint8_t buffer[]; + + T min_value = detail::numeric_limits::max(); + T max_value = detail::numeric_limits::lowest(); + + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x * pack_size; + + for (int64_t idx = gid * pack_size; idx < elements; idx += step) { + LoadPack in; + in.storage = reinterpret_cast(in_ptr + idx)[0]; + for (int i = 0; i < pack_size; ++i) { + min_value = BinaryFuncMin::Invoke(min_value, in.elem[i]); + max_value = BinaryFuncMax::Invoke(max_value, in.elem[i]); + } + } + int rest = ModDiv(elements); + if (rest > 0 && gid == (gridDim.x * blockDim.x - 1)) { + in_ptr += elements - rest; + for (int i = 0; i < rest; ++i) { + T val = in_ptr[i]; + min_value = BinaryFuncMin::Invoke(min_value, val); + max_value = 
BinaryFuncMax::Invoke(max_value, val); + } + } + + int64_t tid = threadIdx.x; + + MinMaxVal* shared_min_max = reinterpret_cast*>(buffer); + shared_min_max[tid].min = min_value; + shared_min_max[tid].max = max_value; + __syncthreads(); + + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + shared_min_max[tid].min = + BinaryFuncMin::Invoke(shared_min_max[tid].min, shared_min_max[tid + s].min); + shared_min_max[tid].max = + BinaryFuncMax::Invoke(shared_min_max[tid].max, shared_min_max[tid + s].max); + } + __syncthreads(); + } + + if (tid == 0) { + MinMaxVal* min_max = reinterpret_cast*>(min_max_ptr); + min_max[blockIdx.x].min = shared_min_max[0].min; + min_max[blockIdx.x].max = shared_min_max[0].max; + } +} + +template +__global__ void ComputeOFScaleAndZeroPoint(const T* min_max_ptr, const int min_max_size, + const int quantization_bit, const float* weight_scale, + const float* weight_acc, const T* bias, T* in_scale, + Q* in_zero_point, T* out_scale, T* out_bias, + const int out_elements) { + extern __shared__ uint8_t buffer[]; + MinMaxVal* shared_min_max = reinterpret_cast*>(buffer); + int64_t tid = threadIdx.x; + + { + T min_value = detail::numeric_limits::max(); + T max_value = detail::numeric_limits::lowest(); + + const MinMaxVal* min_max = reinterpret_cast*>(min_max_ptr); + + for (int64_t idx = threadIdx.x; idx < min_max_size; idx += blockDim.x) { + min_value = BinaryFuncMin::Invoke(min_value, min_max[idx].min); + max_value = BinaryFuncMax::Invoke(max_value, min_max[idx].max); + } + shared_min_max[tid].min = min_value; + shared_min_max[tid].max = max_value; + __syncthreads(); + + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + shared_min_max[tid].min = + BinaryFuncMin::Invoke(shared_min_max[tid].min, shared_min_max[tid + s].min); + shared_min_max[tid].max = + BinaryFuncMax::Invoke(shared_min_max[tid].max, shared_min_max[tid + s].max); + } + __syncthreads(); + } + } + + float min_value = static_cast(shared_min_max[0].min); + float max_value = static_cast(shared_min_max[0].max); + float input_scale = (max_value - min_value) / ((1 << quantization_bit) - 1); + int32_t input_zero_point = + -(__float2int_rn(min_value / input_scale) + (1 << (quantization_bit - 1))); + float scale_zero_point = -input_scale * input_zero_point; + + if (tid == 0) { + in_scale[0] = static_cast(input_scale); + in_zero_point[0] = static_cast(input_zero_point); + } + if (bias) { + for (int64_t idx = threadIdx.x; idx < out_elements; idx += blockDim.x) { + out_scale[idx] = static_cast(weight_scale[idx] * input_scale); + out_bias[idx] = static_cast(weight_acc[idx] * scale_zero_point) + bias[idx]; + } + } else { + for (int64_t idx = threadIdx.x; idx < out_elements; idx += blockDim.x) { + out_scale[idx] = static_cast(weight_scale[idx] * input_scale); + out_bias[idx] = static_cast(weight_acc[idx] * scale_zero_point); + } + } +} + +} // namespace + +template +class GpuFusedActivationMinMaxObserverKernel final : public user_op::OpKernel { + public: + GpuFusedActivationMinMaxObserverKernel() = default; + ~GpuFusedActivationMinMaxObserverKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + const user_op::Tensor* weight_scale = ctx->Tensor4ArgNameAndIndex("weight_scale", 0); + const user_op::Tensor* weight_acc = ctx->Tensor4ArgNameAndIndex("weight_acc", 0); + const user_op::Tensor* bias = nullptr; + if (ctx->has_input("bias", 
0)) { bias = ctx->Tensor4ArgNameAndIndex("bias", 0); } + + user_op::Tensor* in_scale = ctx->Tensor4ArgNameAndIndex("in_scale", 0); + user_op::Tensor* in_zero_point = ctx->Tensor4ArgNameAndIndex("in_zero_point", 0); + user_op::Tensor* out_scale = ctx->Tensor4ArgNameAndIndex("out_scale", 0); + user_op::Tensor* out_bias = ctx->Tensor4ArgNameAndIndex("out_bias", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const std::string quantization_scheme = ctx->Attr("quantization_scheme"); + const int32_t quantization_bit = ctx->Attr("quantization_bit"); + const bool per_layer_quantization = ctx->Attr("per_layer_quantization"); + const std::string quantization_formula = ctx->Attr("quantization_formula"); + + CHECK(quantization_scheme == "affine"); + CHECK(quantization_bit == 8); + + const int64_t elements = in->shape_view().elem_cnt(); + + constexpr int pack_size = cuda::elementwise::PackSize(); + int grid_size = 0; + int64_t pack_num = (elements + pack_size - 1) / pack_size; + cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + // grid_size = grid_size > 1024 ? 1024 : grid_size; + + size_t element_bytes = GetSizeOfDataType(GetDataType::value); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), grid_size * element_bytes * 2); + + T* min_max = reinterpret_cast(tmp_buffer->mut_dptr()); + auto stream = ctx->stream()->As()->cuda_stream(); + if (per_layer_quantization) { + ReduceMinMaxPerLayer + <<>>(elements, in->dptr(), + min_max); + } else { + UNIMPLEMENTED() + << "fused_activation_min_max_observer does not support per-channel quantization"; + } + + if (quantization_formula == "oneflow") { + if (quantization_bit == 8) { + ComputeOFScaleAndZeroPoint + <<<1, cuda::elementwise::kBlockSize, cuda::elementwise::kBlockSize * element_bytes * 2, + stream>>>(min_max, grid_size, quantization_bit, weight_scale->dptr(), + weight_acc->dptr(), bias ? bias->dptr() : nullptr, + in_scale->mut_dptr(), in_zero_point->mut_dptr(), + out_scale->mut_dptr(), out_bias->mut_dptr(), + out_scale->shape_view().elem_cnt()); + } else { + UNIMPLEMENTED(); + } + } else { + UNIMPLEMENTED() + << "fused_activation_min_max_observer only support oneflow quantization formula"; + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_ACTIVATION_MIN_MAX_OBSERVER_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_activation_min_max_observer") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 128 * 1024 * 1024; }) + +REGISTER_FUSED_ACTIVATION_MIN_MAX_OBSERVER_KERNEL(float); +REGISTER_FUSED_ACTIVATION_MIN_MAX_OBSERVER_KERNEL(double); +REGISTER_FUSED_ACTIVATION_MIN_MAX_OBSERVER_KERNEL(half); + +} // namespace oneflow diff --git a/oneflow/user/ops/fused_min_max_observer_op.cpp b/oneflow/user/ops/fused_min_max_observer_op.cpp new file mode 100644 index 00000000000..6ec8d4c856b --- /dev/null +++ b/oneflow/user/ops/fused_min_max_observer_op.cpp @@ -0,0 +1,87 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/framework/op_generated.h"
+
+namespace oneflow {
+
+/* static */ Maybe<void> FusedActivationMinMaxObserverOp::InferLogicalTensorDesc(
+    user_op::InferContext* ctx) {
+  CHECK_OR_RETURN(ctx->Attr<bool>("per_layer_quantization"))
+      << "activation min_max_observer only supports per-layer quantization";
+  const Shape& weight_scale_shape = ctx->InputShape("weight_scale", 0);
+
+  ctx->SetOutputShape("in_scale", 0, Shape({1}));
+  ctx->SetOutputShape("in_zero_point", 0, Shape({1}));
+  ctx->SetOutputShape("out_scale", 0, {weight_scale_shape.Count(0)});
+  ctx->SetOutputShape("out_bias", 0, {weight_scale_shape.Count(0)});
+  return Maybe<void>::Ok();
+}
+
+/*static*/ Maybe<void> FusedActivationMinMaxObserverOp::InferPhysicalTensorDesc(
+    user_op::InferContext* ctx) {
+  return InferLogicalTensorDesc(ctx);
+}
+
+/* static */ Maybe<void> FusedActivationMinMaxObserverOp::GetSbp(user_op::SbpContext* ctx) {
+  // NOTE(Liang Depeng): input needs to be broadcast in order to accurately calculate the
+  // global scale and zero_point
+  return Maybe<void>::Ok();
+}
+
+/* static */ Maybe<void> FusedActivationMinMaxObserverOp::CheckAttr(
+    const user_op::UserOpDefWrapper& def, const user_op::UserOpConfWrapper& op_conf) {
+  int32_t quantization_bit = op_conf.attr<int32_t>("quantization_bit");
+  CHECK_GT_OR_RETURN(quantization_bit, 1);
+  CHECK_LE_OR_RETURN(quantization_bit, 8);
+
+  std::string quantization_scheme = op_conf.attr<std::string>("quantization_scheme");
+  CHECK_OR_RETURN(quantization_scheme == "symmetric" || quantization_scheme == "affine");
+
+  std::string quantization_formula = op_conf.attr<std::string>("quantization_formula");
+  CHECK_OR_RETURN(quantization_formula == "google" || quantization_formula == "cambricon"
+                  || quantization_formula == "oneflow");
+  return Maybe<void>::Ok();
+}
+
+/* static */ Maybe<void> FusedActivationMinMaxObserverOp::InferDataType(
+    user_op::InferContext* ctx) {
+  CHECK_EQ_OR_RETURN(ctx->InputDType("weight_scale", 0), DataType::kFloat)
+      << "weight_scale dtype should be float";
+  CHECK_EQ_OR_RETURN(ctx->InputDType("weight_acc", 0), DataType::kFloat)
+      << "weight_acc dtype should be float";
+
+  DataType data_type = ctx->InputDType("in", 0);
+  if (ctx->has_input("bias", 0)) { CHECK_EQ_OR_RETURN(data_type, ctx->InputDType("bias", 0)); }
+
+  int32_t quantization_bit = ctx->Attr<int32_t>("quantization_bit");
+  const std::string& quantization_formula = ctx->Attr<std::string>("quantization_formula");
+  if (quantization_formula == "oneflow") {
+    if (quantization_bit == 8) {
+      ctx->SetOutputDType("in_zero_point", 0, DataType::kInt8);
+    } else {
+      OF_UNIMPLEMENTED();
+    }
+  } else {
+    ctx->SetOutputDType("in_zero_point", 0, data_type);
+  }
+  ctx->SetOutputDType("in_scale", 0, data_type);
+  ctx->SetOutputDType("out_scale", 0, data_type);
+  ctx->SetOutputDType("out_bias", 0, data_type);
+  return Maybe<void>::Ok();
+}
+
+}  // namespace oneflow

From 4a5a5c6e063591a3ba3d3357f865189e17095e62 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Tue, 5 Sep 2023 05:23:52 +0000
Subject: [PATCH 44/65] optimize

---
 .../prune_redundant_quantization_op_pass.cpp  |  12 --
 ...used_activation_min_max_observer_kernel.cu | 157 ++++++++++++------
 2 files changed,
109 insertions(+), 60 deletions(-) diff --git a/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp b/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp index 191c0693dd9..6f7eaf85feb 100644 --- a/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp +++ b/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp @@ -13,18 +13,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ #include "oneflow/core/framework/framework.h" #include "oneflow/core/job_rewriter/job_pass.h" diff --git a/oneflow/user/kernels/fused_activation_min_max_observer_kernel.cu b/oneflow/user/kernels/fused_activation_min_max_observer_kernel.cu index c6215ff23ef..1441cd6e8b3 100644 --- a/oneflow/user/kernels/fused_activation_min_max_observer_kernel.cu +++ b/oneflow/user/kernels/fused_activation_min_max_observer_kernel.cu @@ -49,21 +49,17 @@ __host__ __device__ int ModDiv<16>(int64_t N) { return N & 0xF; } -template -struct MinMaxVal { - T min; - T max; -}; - template __global__ void ReduceMinMaxPerLayer(const int64_t elements, const T* in_ptr, T* min_max_ptr) { using LoadType = cuda::elementwise::PackType; using LoadPack = cuda::elementwise::Pack; + using MinMaxPack = cuda::elementwise::Pack; extern __shared__ uint8_t buffer[]; - T min_value = detail::numeric_limits::max(); - T max_value = detail::numeric_limits::lowest(); + MinMaxPack min_max; + min_max.elem[0] = detail::numeric_limits::max(); + min_max.elem[1] = detail::numeric_limits::lowest(); int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; int64_t step = gridDim.x * blockDim.x * pack_size; @@ -71,42 +67,44 @@ __global__ void ReduceMinMaxPerLayer(const int64_t elements, const T* in_ptr, T* for (int64_t idx = gid * pack_size; idx < elements; idx += step) { LoadPack in; in.storage = reinterpret_cast(in_ptr + idx)[0]; +#pragma unroll for (int i = 0; i < pack_size; ++i) { - min_value = BinaryFuncMin::Invoke(min_value, in.elem[i]); - max_value = BinaryFuncMax::Invoke(max_value, in.elem[i]); + min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[i]); + min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[i]); } } int rest = ModDiv(elements); if (rest > 0 && gid == (gridDim.x * blockDim.x - 1)) { in_ptr += elements - rest; + LoadPack in; + in.storage = reinterpret_cast(in_ptr)[0]; +#pragma unroll for (int i = 0; i < rest; ++i) { - T val = in_ptr[i]; - min_value = BinaryFuncMin::Invoke(min_value, val); - max_value = BinaryFuncMax::Invoke(max_value, val); + min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[i]); + min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[i]); } } int64_t tid = threadIdx.x; - MinMaxVal* shared_min_max = reinterpret_cast*>(buffer); - shared_min_max[tid].min = min_value; - shared_min_max[tid].max = max_value; + MinMaxPack* 
shared_min_max = reinterpret_cast(buffer); + shared_min_max[tid].storage = min_max.storage; __syncthreads(); for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { - shared_min_max[tid].min = - BinaryFuncMin::Invoke(shared_min_max[tid].min, shared_min_max[tid + s].min); - shared_min_max[tid].max = - BinaryFuncMax::Invoke(shared_min_max[tid].max, shared_min_max[tid + s].max); + MinMaxPack min_max0, min_max1; + min_max0.storage = shared_min_max[tid].storage; + min_max1.storage = shared_min_max[tid + s].storage; + min_max0.elem[0] = BinaryFuncMin::Invoke(min_max0.elem[0], min_max1.elem[0]); + min_max0.elem[1] = BinaryFuncMax::Invoke(min_max0.elem[1], min_max1.elem[1]); + shared_min_max[tid].storage = min_max0.storage; } __syncthreads(); } if (tid == 0) { - MinMaxVal* min_max = reinterpret_cast*>(min_max_ptr); - min_max[blockIdx.x].min = shared_min_max[0].min; - min_max[blockIdx.x].max = shared_min_max[0].max; + reinterpret_cast(min_max_ptr)[blockIdx.x].storage = shared_min_max[0].storage; } } @@ -116,55 +114,118 @@ __global__ void ComputeOFScaleAndZeroPoint(const T* min_max_ptr, const int min_m const float* weight_acc, const T* bias, T* in_scale, Q* in_zero_point, T* out_scale, T* out_bias, const int out_elements) { + using MinMaxPack = cuda::elementwise::Pack; + extern __shared__ uint8_t buffer[]; - MinMaxVal* shared_min_max = reinterpret_cast*>(buffer); + MinMaxPack* shared_min_max = reinterpret_cast(buffer); int64_t tid = threadIdx.x; - { - T min_value = detail::numeric_limits::max(); - T max_value = detail::numeric_limits::lowest(); - - const MinMaxVal* min_max = reinterpret_cast*>(min_max_ptr); - + MinMaxPack min_max; + min_max.elem[0] = detail::numeric_limits::max(); + min_max.elem[1] = detail::numeric_limits::lowest(); +#pragma unroll for (int64_t idx = threadIdx.x; idx < min_max_size; idx += blockDim.x) { - min_value = BinaryFuncMin::Invoke(min_value, min_max[idx].min); - max_value = BinaryFuncMax::Invoke(max_value, min_max[idx].max); + MinMaxPack in = reinterpret_cast(min_max_ptr)[idx]; + min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[0]); + min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[1]); } - shared_min_max[tid].min = min_value; - shared_min_max[tid].max = max_value; + shared_min_max[tid].storage = min_max.storage; __syncthreads(); for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { - shared_min_max[tid].min = - BinaryFuncMin::Invoke(shared_min_max[tid].min, shared_min_max[tid + s].min); - shared_min_max[tid].max = - BinaryFuncMax::Invoke(shared_min_max[tid].max, shared_min_max[tid + s].max); + MinMaxPack min_max0, min_max1; + min_max0.storage = shared_min_max[tid].storage; + min_max1.storage = shared_min_max[tid + s].storage; + min_max0.elem[0] = BinaryFuncMin::Invoke(min_max0.elem[0], min_max1.elem[0]); + min_max0.elem[1] = BinaryFuncMax::Invoke(min_max0.elem[1], min_max1.elem[1]); + shared_min_max[tid].storage = min_max0.storage; } __syncthreads(); } } - float min_value = static_cast(shared_min_max[0].min); - float max_value = static_cast(shared_min_max[0].max); + MinMaxPack min_max = shared_min_max[0]; + float min_value = static_cast(min_max.elem[0]); + float max_value = static_cast(min_max.elem[1]); float input_scale = (max_value - min_value) / ((1 << quantization_bit) - 1); int32_t input_zero_point = -(__float2int_rn(min_value / input_scale) + (1 << (quantization_bit - 1))); float scale_zero_point = -input_scale * input_zero_point; - if (tid == 0) { + int64_t thread_num = gridDim.x * blockDim.x; + 
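The affine scale/zero_point arithmetic above is easiest to sanity-check on the host. Below is a
minimal standalone C++ sketch of the same formulas; the activation range values are assumed for
illustration and do not come from this patch:

// Affine 8-bit quantization parameters, mirroring the device code in this hunk.
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const int quantization_bit = 8;
  const float min_value = -1.0f, max_value = 3.0f;  // assumed activation range
  const float scale = (max_value - min_value) / ((1 << quantization_bit) - 1);  // 4/255
  const int32_t zero_point =
      -(std::lround(min_value / scale) + (1 << (quantization_bit - 1)));  // -(-64 + 128) = -64
  // q(x) = round(x / scale + zero_point); the range extremes map to the int8 limits:
  std::printf("scale=%f zero_point=%d q(min)=%ld q(max)=%ld\n", scale, zero_point,
              std::lround(min_value / scale + zero_point),   // -128
              std::lround(max_value / scale + zero_point));  // 127
  return 0;
}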
int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + if (gid == 0) { in_scale[0] = static_cast(input_scale); in_zero_point[0] = static_cast(input_zero_point); } + + using LoadWPack = cuda::elementwise::Pack; + using LoadBPack = cuda::elementwise::Pack; + using StorePack = cuda::elementwise::Pack; + if (bias) { - for (int64_t idx = threadIdx.x; idx < out_elements; idx += blockDim.x) { - out_scale[idx] = static_cast(weight_scale[idx] * input_scale); - out_bias[idx] = static_cast(weight_acc[idx] * scale_zero_point) + bias[idx]; + for (int64_t idx = gid << 2; idx < out_elements; idx += thread_num << 2) { + LoadWPack w_scale = reinterpret_cast(weight_scale + idx)[0]; + LoadWPack w_acc = reinterpret_cast(weight_acc + idx)[0]; + LoadBPack b = reinterpret_cast(bias + idx)[0]; + StorePack store_scale, store_bias; + + store_scale.elem[0] = static_cast(w_scale.elem[0] * input_scale); + store_scale.elem[1] = static_cast(w_scale.elem[1] * input_scale); + store_scale.elem[2] = static_cast(w_scale.elem[2] * input_scale); + store_scale.elem[3] = static_cast(w_scale.elem[3] * input_scale); + + store_bias.elem[0] = static_cast(__fmaf_rn(w_acc.elem[0], scale_zero_point, b.elem[0])); + store_bias.elem[1] = static_cast(__fmaf_rn(w_acc.elem[1], scale_zero_point, b.elem[1])); + store_bias.elem[2] = static_cast(__fmaf_rn(w_acc.elem[2], scale_zero_point, b.elem[2])); + store_bias.elem[3] = static_cast(__fmaf_rn(w_acc.elem[3], scale_zero_point, b.elem[3])); + + reinterpret_cast(out_scale + idx)[0] = store_scale; + reinterpret_cast(out_bias + idx)[0] = store_bias; + } + int rest = ModDiv<4>(out_elements); + if (rest > 0 && gid == (thread_num - 1)) { + int offset = out_elements - rest; + LoadWPack w_scale = reinterpret_cast(weight_scale + offset)[0]; + LoadWPack w_acc = reinterpret_cast(weight_acc + offset)[0]; + LoadBPack b = reinterpret_cast(bias + offset)[0]; +#pragma unroll + for (int i = 0; i < rest; ++i) { + out_scale[offset + i] = static_cast(w_scale.elem[i] * input_scale); + out_bias[offset + i] = + static_cast(__fmaf_rn(w_acc.elem[i], scale_zero_point, b.elem[i])); + } } } else { - for (int64_t idx = threadIdx.x; idx < out_elements; idx += blockDim.x) { - out_scale[idx] = static_cast(weight_scale[idx] * input_scale); - out_bias[idx] = static_cast(weight_acc[idx] * scale_zero_point); + for (int64_t idx = gid << 2; idx < out_elements; idx += thread_num << 2) { + LoadWPack w_scale = reinterpret_cast(weight_scale + idx)[0]; + LoadWPack w_acc = reinterpret_cast(weight_acc + idx)[0]; + StorePack store_scale, store_bias; + + store_scale.elem[0] = static_cast(w_scale.elem[0] * input_scale); + store_scale.elem[1] = static_cast(w_scale.elem[1] * input_scale); + store_scale.elem[2] = static_cast(w_scale.elem[2] * input_scale); + store_scale.elem[3] = static_cast(w_scale.elem[3] * input_scale); + + store_bias.elem[0] = static_cast(w_acc.elem[0] * scale_zero_point); + store_bias.elem[1] = static_cast(w_acc.elem[1] * scale_zero_point); + store_bias.elem[2] = static_cast(w_acc.elem[2] * scale_zero_point); + store_bias.elem[3] = static_cast(w_acc.elem[3] * scale_zero_point); + + reinterpret_cast(out_scale + idx)[0] = store_scale; + reinterpret_cast(out_bias + idx)[0] = store_bias; + } + int rest = ModDiv<4>(out_elements); + if (rest > 0 && gid == (thread_num - 1)) { + int offset = out_elements - rest; + LoadWPack w_scale = reinterpret_cast(weight_scale + offset)[0]; + LoadWPack w_acc = reinterpret_cast(weight_acc + offset)[0]; +#pragma unroll + for (int i = 0; i < rest; ++i) { + out_scale[offset + i] = 
static_cast(w_scale.elem[i] * input_scale); + out_bias[offset + i] = static_cast(w_acc.elem[i] * scale_zero_point); + } } } } @@ -206,7 +267,7 @@ class GpuFusedActivationMinMaxObserverKernel final : public user_op::OpKernel { int grid_size = 0; int64_t pack_num = (elements + pack_size - 1) / pack_size; cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - // grid_size = grid_size > 1024 ? 1024 : grid_size; + grid_size = grid_size > 2048 ? 2048 : grid_size; size_t element_bytes = GetSizeOfDataType(GetDataType::value); CHECK_GE(tmp_buffer->shape_view().elem_cnt(), grid_size * element_bytes * 2); From 42c1ee952173fb625e7767e9bcf7be1af7c1b7a8 Mon Sep 17 00:00:00 2001 From: clackhan Date: Tue, 5 Sep 2023 08:28:23 +0000 Subject: [PATCH 45/65] impl mlir pass --- oneflow/core/job/job_build_and_infer_ctx.cpp | 1 - .../prune_redundant_quantization_op_pass.cpp | 108 ------------------ oneflow/ir/lib/OneFlow/Passes.cpp | 26 +++++ 3 files changed, 26 insertions(+), 109 deletions(-) delete mode 100644 oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp diff --git a/oneflow/core/job/job_build_and_infer_ctx.cpp b/oneflow/core/job/job_build_and_infer_ctx.cpp index d17352f0898..3c77539cbcf 100644 --- a/oneflow/core/job/job_build_and_infer_ctx.cpp +++ b/oneflow/core/job/job_build_and_infer_ctx.cpp @@ -1027,7 +1027,6 @@ Maybe LazyJobBuildAndInferCtx::Complete() { JUST(DoPass("DoParallelCastBeforeWideningTypeCast")); JUST(DoPass("FuseCastScalePass")); JUST(DoPass("PruneParallelCastOpsPass")); - JUST(DoPass("PruneRedundantQuantizationOpsPass")); JUST(DoPass("FuseUpdateOpsPass")); JUST(DoPass("FuseModelUpdateCastOpsPass")); JUST(DoPass("MultiTensorModelUpdatePass")); diff --git a/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp b/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp deleted file mode 100644 index ca786d84dd4..00000000000 --- a/oneflow/core/job_rewriter/prune_redundant_quantization_op_pass.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/job_rewriter/job_pass.h" - -namespace oneflow { - -namespace { - -bool IsQunatizationOp(const OperatorConf& op_conf) { - return op_conf.has_user_conf() - && (op_conf.user_conf().op_type_name() == "quantization"); -} - -bool NeedDoPass(const Job& job) { - return std::any_of(job.net().op().cbegin(), job.net().op().cend(), IsQunatizationOp); -} - -class PruneReduntantQuantizationOpsPass final : public JobPass { - public: - PruneReduntantQuantizationOpsPass() = default; - ~PruneReduntantQuantizationOpsPass() override = default; - - bool IsEnabled(const JobPassCtx& ctx) const { return true; } - Maybe Apply(const OpGraph& op_graph, JobBuilder* job_builder) const; - - Maybe Apply(Job* job, JobPassCtx* ctx) const override { - if (!IsEnabled(*ctx)) { return Maybe::Ok(); } - if (!NeedDoPass(*job)) { return Maybe::Ok(); } - const OpGraph op_graph(*job); - JobBuilder job_builder(job); - return Apply(op_graph, &job_builder); - } -}; - -Maybe PruneReduntantQuantizationOpsPass::Apply(const OpGraph& op_graph, - JobBuilder* job_builder) const { - HashMap op_name2op_conf; - HashSet ctrl_in_op_names; - op_graph.ForEachNode([&](const OpNode* op_node) { - for (const std::string& ctrl_in_op_name : op_node->op().op_conf().ctrl_in_op_name()) { - ctrl_in_op_names.insert(ctrl_in_op_name); - } - }); - std::vector del_op_names; - op_graph.ForEachNode([&](const OpNode* op_node) { - if (op_node->out_edges().size() == 1) { return; } - bool has_found_quant_op = false; - LogicalBlobId first_quantization_lbi; - for (const auto* out_edge : op_node->out_edges()) { - OpNode* consumer = out_edge->dst_node(); - const OperatorConf& op_conf = consumer->op().op_conf(); - - if (ctrl_in_op_names.find(op_conf.name()) != ctrl_in_op_names.end()) { return; } - if (!op_conf.has_user_conf()) { continue; } - if (op_conf.user_conf().op_type_name() != "quantization") { continue; } - std::vector first_quantization_ctrl_in_op_names; - user_op::UserOpConfWrapper conf_wrapper(op_conf); - if (has_found_quant_op) { - const LogicalBlobId& quantization_lbi = GenLogicalBlobId(conf_wrapper.output("out", 0)); - for (const OpEdge* consumer_out_edge : consumer->out_edges()) { - const OpNode* consumer = consumer_out_edge->dst_node(); - const std::string& consumer_op_name = consumer->op().op_name(); - if (op_name2op_conf.find(consumer_op_name) == op_name2op_conf.end()) { - op_name2op_conf[consumer_op_name] = consumer->op().op_conf(); - } - OperatorConf& consumer_op_conf = op_name2op_conf.at(consumer_op_name); - for (const std::string& ibn : consumer->op().input_bns()) { - if (consumer->op().BnInOp2Lbi(ibn) == quantization_lbi) { - const auto& new_val = GenLogicalBlobName(first_quantization_lbi); - const auto& old_val = ReplaceInputLbnInOpCustomizedConf(&consumer_op_conf, ibn, new_val); - CHECK_EQ(GenLogicalBlobName(quantization_lbi), old_val); - for (const auto& ctrl_in_op_name : op_conf.ctrl_in_op_name()) { - consumer_op_conf.add_ctrl_in_op_name(ctrl_in_op_name); - } - } - } - del_op_names.emplace_back(op_conf.name()); - } - } else { - first_quantization_lbi = GenLogicalBlobId(conf_wrapper.output("out", 0)); - for (const auto& ctrl_in_op_name : op_conf.ctrl_in_op_name()) { - first_quantization_ctrl_in_op_names.emplace_back(ctrl_in_op_name); - } - has_found_quant_op = true; - } - } - }); - for (const auto& pair : op_name2op_conf) { job_builder->MutOpsOnlyOnce({pair.second}); } - job_builder->DelOps(del_op_names); - return Maybe::Ok(); -} - -} // namespace - 
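The job-rewriter pass deleted here is re-implemented as an MLIR rewrite pattern in the next hunk.
Stripped of framework plumbing, both perform the same use-deduplication: keep the first
quantization node per input value and retarget consumers of the redundant copies. A toy,
framework-free sketch of that idea (the graph representation is invented for illustration and is
not OneFlow's API):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Toy graph: each node consumes one value and produces "out:<node id>".
struct Node { std::string op, in; };

int main() {
  std::vector<Node> g{{"quantization", "x"}, {"quantization", "x"}, {"matmul", "out:1"}};
  std::map<std::string, int> first;            // input value -> id of first quantization node
  std::map<std::string, std::string> replace;  // redundant output -> surviving output
  for (int i = 0; i < (int)g.size(); ++i) {
    if (g[i].op != "quantization") continue;
    auto [it, inserted] = first.emplace(g[i].in, i);
    if (!inserted) replace["out:" + std::to_string(i)] = "out:" + std::to_string(it->second);
  }
  for (auto& n : g) {  // retarget consumers of pruned nodes
    if (auto it = replace.find(n.in); it != replace.end()) n.in = it->second;
  }
  std::printf("pruned %zu redundant quantization node(s)\n", replace.size());
}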
-REGISTER_JOB_PASS("PruneRedundantQuantizationOpsPass", PruneReduntantQuantizationOpsPass); - -} // namespace oneflow diff --git a/oneflow/ir/lib/OneFlow/Passes.cpp b/oneflow/ir/lib/OneFlow/Passes.cpp index 844a3331a9e..6aabf92f2ef 100644 --- a/oneflow/ir/lib/OneFlow/Passes.cpp +++ b/oneflow/ir/lib/OneFlow/Passes.cpp @@ -504,6 +504,31 @@ LogicalResult FusedConsecutiveAddPattern::matchAndRewrite(Add2Op op, return TryFusedConsecutiveAdd(op, {op.getIn0(), op.getIn1()}, rewriter); } +struct PruneReduntantQuantizationOpsPattern : public OpInterfaceRewritePattern { + explicit PruneReduntantQuantizationOpsPattern(mlir::MLIRContext* context) + : OpInterfaceRewritePattern(context, /*benefit=*/1) {} + + public: + LogicalResult matchAndRewrite(UserOpCompatible op, PatternRewriter& rewriter) const override { + SmallVector quantOps; + for (auto u: op->getUsers()) { + if (auto q = llvm::dyn_cast(u)) { + quantOps.push_back(q); + } + } + if (quantOps.size() <= 1) { + return failure(); + } + auto q0 = *quantOps.begin(); + for (oneflow::QuantizationOp q: quantOps) { + if (q != q0) { + q->replaceAllUsesWith(q0->getResults()); + } + } + return success(); + } +}; + struct AutoNhwcPattern : public OpInterfaceRewritePattern { explicit AutoNhwcPattern(mlir::MLIRContext* context) : OpInterfaceRewritePattern(context, /*benefit=*/1) {} @@ -1131,6 +1156,7 @@ void populateFuserForExistingOp(::mlir::RewritePatternSet& patterns) { populateNormalizationOpPatterns(patterns); patterns.add>(patterns.getContext()); patterns.add>(patterns.getContext()); + patterns.add(patterns.getContext()); } void populateAutoNhwcPatterns(::mlir::RewritePatternSet& patterns) { From 7a5ba06f154a2963782f9cded54b71e8afda5df4 Mon Sep 17 00:00:00 2001 From: oneflow-ci-bot Date: Tue, 5 Sep 2023 08:33:47 +0000 Subject: [PATCH 46/65] auto format by CI --- oneflow/ir/lib/OneFlow/Passes.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/oneflow/ir/lib/OneFlow/Passes.cpp b/oneflow/ir/lib/OneFlow/Passes.cpp index 6aabf92f2ef..095e42aac3c 100644 --- a/oneflow/ir/lib/OneFlow/Passes.cpp +++ b/oneflow/ir/lib/OneFlow/Passes.cpp @@ -511,19 +511,13 @@ struct PruneReduntantQuantizationOpsPattern : public OpInterfaceRewritePattern quantOps; - for (auto u: op->getUsers()) { - if (auto q = llvm::dyn_cast(u)) { - quantOps.push_back(q); - } - } - if (quantOps.size() <= 1) { - return failure(); + for (auto u : op->getUsers()) { + if (auto q = llvm::dyn_cast(u)) { quantOps.push_back(q); } } + if (quantOps.size() <= 1) { return failure(); } auto q0 = *quantOps.begin(); - for (oneflow::QuantizationOp q: quantOps) { - if (q != q0) { - q->replaceAllUsesWith(q0->getResults()); - } + for (oneflow::QuantizationOp q : quantOps) { + if (q != q0) { q->replaceAllUsesWith(q0->getResults()); } } return success(); } From d294ef2b3fa4f9c92b53b8b1c448f4fa1d9da25e Mon Sep 17 00:00:00 2001 From: clackhan Date: Tue, 5 Sep 2023 08:42:26 +0000 Subject: [PATCH 47/65] refine --- oneflow/ir/lib/OneFlow/Passes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oneflow/ir/lib/OneFlow/Passes.cpp b/oneflow/ir/lib/OneFlow/Passes.cpp index 6aabf92f2ef..0133c8ef235 100644 --- a/oneflow/ir/lib/OneFlow/Passes.cpp +++ b/oneflow/ir/lib/OneFlow/Passes.cpp @@ -510,7 +510,7 @@ struct PruneReduntantQuantizationOpsPattern : public OpInterfaceRewritePattern quantOps; + SmallVector quantOps; for (auto u: op->getUsers()) { if (auto q = llvm::dyn_cast(u)) { quantOps.push_back(q); From d414febc5cf6b6086ba4c761e3ebe24c09599724 Mon Sep 17 00:00:00 2001 From: 
hjchen2 Date: Tue, 5 Sep 2023 15:14:40 +0000 Subject: [PATCH 48/65] update --- oneflow/core/functional/functional_api.yaml | 5 +- oneflow/core/functional/impl/quantization.cpp | 45 +-- oneflow/ir/include/OneFlow/OneFlowUserOps.td | 14 +- oneflow/ir/lib/OneFlow/Passes.cpp | 6 +- .../ir/lib/OneFlow/Transform/AutoNHWCOps.cpp | 22 ++ .../kernels/dynamic_quantization_kernel.cu | 363 ++++++++++++++++++ ...used_activation_min_max_observer_kernel.cu | 319 --------------- ...ver_op.cpp => dynamic_quantization_op.cpp} | 41 +- 8 files changed, 423 insertions(+), 392 deletions(-) create mode 100644 oneflow/user/kernels/dynamic_quantization_kernel.cu delete mode 100644 oneflow/user/kernels/fused_activation_min_max_observer_kernel.cu rename oneflow/user/ops/{fused_min_max_observer_op.cpp => dynamic_quantization_op.cpp} (56%) diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 7662384b143..ba02d1fba26 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -1106,10 +1106,9 @@ signature: 'Tensor (Tensor x, Tensor w, Tensor w_scale, *, Tensor w_zero=None, Tensor b=None, Int32 num_bits=8, Bool symmetric=True, Int64 group_dim=-1, Int64 group_size=-1) => FusedLinearWithGroupwiseQuantizedWeight' bind_python: True -- name: "fused_activation_min_max_observer" +- name: "dynamic_quantization" signature: - "TensorTuple (Tensor in, Tensor weight_scale, Tensor weight_acc, Tensor bias=None, String quantization_formula, Int32 quantization_bit, - String quantization_scheme, Bool per_layer_quantization=True) => FusedActivationMinMaxObserver" + "TensorTuple (Tensor in, String quantization_formula, Int32 quantization_bit, String quantization_scheme, Bool per_layer_quantization=True) => DynamicQuantization" bind_python: True - name: "conv_data_grad" diff --git a/oneflow/core/functional/impl/quantization.cpp b/oneflow/core/functional/impl/quantization.cpp index c6e6b65ce58..dfc8a9e409b 100644 --- a/oneflow/core/functional/impl/quantization.cpp +++ b/oneflow/core/functional/impl/quantization.cpp @@ -90,49 +90,30 @@ class MovingAverageMinMaxObserverFunctor { std::shared_ptr op_; }; -class FusedActivationMinMaxObserverFunctor { +class DynamicQuantizationFunctor { public: - FusedActivationMinMaxObserverFunctor() { - op_ = CHECK_JUST(one::OpBuilder("fused_activation_min_max_observer") + DynamicQuantizationFunctor() { + op_ = CHECK_JUST(one::OpBuilder("dynamic_quantization") .Input("in") - .Input("weight_scale") - .Input("weight_acc") - .Output("in_scale") - .Output("in_zero_point") - .Output("out_scale") - .Output("out_bias") + .Output("out") + .Output("scale") + .Output("zero_point") .Build()); - op_with_bias_ = CHECK_JUST(one::OpBuilder("fused_activation_min_max_observer") - .Input("in") - .Input("weight_scale") - .Input("weight_acc") - .Input("bias") - .Output("in_scale") - .Output("in_zero_point") - .Output("out_scale") - .Output("out_bias") - .Build()); } - Maybe operator()( - const std::shared_ptr& in, const std::shared_ptr& weight_scale, - const std::shared_ptr& weight_acc, const Optional& bias, - const std::string& quantization_formula, const int32_t& quantization_bit, - const std::string& quantization_scheme, const bool& per_layer_quantization) const { + Maybe operator()(const std::shared_ptr& in, + const std::string& quantization_formula, + const int32_t& quantization_bit, + const std::string& quantization_scheme, + const bool& per_layer_quantization) const { auto& attrs = 
        THREAD_CACHED_MUTABLE_ATTR_MAP("quantization_formula", "quantization_bit",
                                       "quantization_scheme", "per_layer_quantization");
     attrs.SetAllAttrs(quantization_formula, quantization_bit, quantization_scheme,
                       per_layer_quantization);
-    if (bias) {
-      return OpInterpUtil::Dispatch<TensorTuple>(*op_with_bias_,
-                                                 {in, weight_scale, weight_acc, JUST(bias)}, attrs);
-    } else {
-      return OpInterpUtil::Dispatch<TensorTuple>(*op_, {in, weight_scale, weight_acc}, attrs);
-    }
+    return OpInterpUtil::Dispatch<TensorTuple>(*op_, {in}, attrs);
   }
 
  private:
   std::shared_ptr<OpExpr> op_;
-  std::shared_ptr<OpExpr> op_with_bias_;
 };
 
 class FakeQuantizationFunctor {
@@ -435,7 +416,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
   m.add_functor<impl::GroupwiseDequantizeFunctor>("GroupwiseDequantize");
   m.add_functor<impl::FusedLinearWithGroupwiseQuantizedWeightFunctor>(
       "FusedLinearWithGroupwiseQuantizedWeight");
-  m.add_functor<impl::FusedActivationMinMaxObserverFunctor>("FusedActivationMinMaxObserver");
+  m.add_functor<impl::DynamicQuantizationFunctor>("DynamicQuantization");
 };
 
 }  // namespace functional
diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td
index 5012eb185bc..ca0a3bfc71d 100644
--- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td
+++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td
@@ -8354,18 +8354,14 @@ def OneFlow_FusedLinearWithGroupwiseQuantizedWeightOp : OneFlow_BaseOp<"fused_li
   let has_data_type_infer_fn = 1;
 }
 
-def OneFlow_FusedActivationMinMaxObserverOp : OneFlow_BaseOp<"fused_activation_min_max_observer", [NoMemoryEffect, NoGrad, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
+def OneFlow_DynamicQuantizationOp : OneFlow_BaseOp<"dynamic_quantization", [NoMemoryEffect, NoGrad, DeclareOpInterfaceMethods<UserOpCompatibleInterface>, DeclareOpInterfaceMethods<NCHWCompatibleInterface>]> {
   let input = (ins
-    OneFlow_Tensor:$in,
-    OneFlow_Tensor:$weight_scale,
-    OneFlow_Tensor:$weight_acc,
-    Optional<OneFlow_Tensor>:$bias
+    OneFlow_Tensor:$in
   );
   let output = (outs
-    OneFlow_Tensor:$in_scale,
-    OneFlow_Tensor:$in_zero_point,
-    OneFlow_Tensor:$out_scale,
-    OneFlow_Tensor:$out_bias
+    OneFlow_Tensor:$out,
+    OneFlow_Tensor:$scale,
+    OneFlow_Tensor:$zero_point
   );
   let attrs = (ins
     DefaultValuedAttr<StrAttr, "\"google\"">:$quantization_formula,
diff --git a/oneflow/ir/lib/OneFlow/Passes.cpp b/oneflow/ir/lib/OneFlow/Passes.cpp
index 844a3331a9e..ce1c4d7e5c5 100644
--- a/oneflow/ir/lib/OneFlow/Passes.cpp
+++ b/oneflow/ir/lib/OneFlow/Passes.cpp
@@ -561,12 +561,16 @@ struct AutoNhwcPattern : public OpInterfaceRewritePattern<NCHWCompatible> {
               getResultTransposeOp(op, created_results[num_transposed_result],
                                    transpose_attributes, num_transposed_result, rewriter)) {
         result.replaceAllUsesWith(result_transpose_op);
-        num_transposed_result += 1;
       } else {
+        op->emitError("Fail to transpose op result");
         return failure();
       }
+    } else {
+      result.replaceAllUsesWith(created_results[num_transposed_result]);
     }
+    num_transposed_result += 1;
   }
+  op->erase();
 }
 return success();
 }
diff --git a/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp b/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp
index 21238a4fbec..4f1867f342a 100644
--- a/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp
+++ b/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp
@@ -374,6 +374,28 @@ llvm::SmallVector<Value, 4> QuantizationOp::NchwToNhwc(llvm::SmallVector<Value, 4> value,
   results.push_back(res[0]);
   return results;
 }
 
+llvm::DenseSet<Value> DynamicQuantizationOp::OperandsToTranspose() { return {this->getIn()}; }
+
+llvm::DenseSet<Value> DynamicQuantizationOp::ResultsToTranspose() { return {this->getOut()}; }
+
+llvm::SmallVector<Value, 4> DynamicQuantizationOp::NchwToNhwc(llvm::SmallVector<Value, 4> value,
+                                                              PatternRewriter& rewriter) {
+  auto dynamic_quant_op = *this;
+  SmallVector<Value, 4> operands{value[0]};
+  auto res = rewriter
+                 .create<oneflow::DynamicQuantizationOp>(dynamic_quant_op.getLoc(),
+                                                         getNHWCResultTypes(dynamic_quant_op),
+                                                         operands, dynamic_quant_op->getAttrs())
+                 ->getResults();
+  llvm::SmallVector<Value, 4> results;
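+  // Note: ResultsToTranspose() above lists only the quantized output. With per-layer
+  // quantization, "scale" and "zero_point" are shape-{1} per-tensor values (see
+  // DynamicQuantizationOp::InferLogicalTensorDesc), so NCHW/NHWC layout does not apply
+  // to them and they are forwarded unchanged.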
+ results.push_back(res[0]); + results.push_back(res[1]); + results.push_back(res[2]); + return results; +} + } // namespace oneflow } // namespace mlir diff --git a/oneflow/user/kernels/dynamic_quantization_kernel.cu b/oneflow/user/kernels/dynamic_quantization_kernel.cu new file mode 100644 index 00000000000..e12dc2f048b --- /dev/null +++ b/oneflow/user/kernels/dynamic_quantization_kernel.cu @@ -0,0 +1,363 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/cuda/elementwise.cuh" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.cuh" +#include "oneflow/core/ndarray/binary_func.h" +#include "oneflow/core/kernel/util/numeric_limits.cuh" + +namespace oneflow { + +namespace { + +template +__host__ __device__ int ModDiv(int64_t N) { + return N - (N / M * M); +} + +template<> +__host__ __device__ int ModDiv<2>(int64_t N) { + return N & 0x1; +} + +template<> +__host__ __device__ int ModDiv<4>(int64_t N) { + return N & 0x3; +} + +template<> +__host__ __device__ int ModDiv<8>(int64_t N) { + return N & 0x7; +} + +template<> +__host__ __device__ int ModDiv<16>(int64_t N) { + return N & 0xF; +} + +template +__global__ void ReduceMinMaxPerTensor(const int64_t elements, const T* in_ptr, T* min_max_ptr) { + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using MinMaxPack = cuda::elementwise::Pack; + + extern __shared__ uint8_t buffer[]; + + MinMaxPack min_max; + min_max.elem[0] = detail::numeric_limits::max(); + min_max.elem[1] = detail::numeric_limits::lowest(); + + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x * pack_size; + + for (int64_t idx = gid * pack_size; idx < elements; idx += step) { + LoadPack in; + in.storage = reinterpret_cast(in_ptr + idx)[0]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[i]); + min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[i]); + } + } + int rest = ModDiv(elements); + if (rest > 0 && gid == (gridDim.x * blockDim.x - 1)) { + in_ptr += elements - rest; + LoadPack in; + in.storage = reinterpret_cast(in_ptr)[0]; +#pragma unroll + for (int i = 0; i < rest; ++i) { + min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[i]); + min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[i]); + } + } + + int64_t tid = threadIdx.x; + + MinMaxPack* shared_min_max = reinterpret_cast(buffer); + shared_min_max[tid].storage = min_max.storage; + __syncthreads(); + + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + MinMaxPack min_max0, min_max1; + min_max0.storage = shared_min_max[tid].storage; + min_max1.storage = shared_min_max[tid + s].storage; + min_max0.elem[0] = BinaryFuncMin::Invoke(min_max0.elem[0], min_max1.elem[0]); + min_max0.elem[1] = BinaryFuncMax::Invoke(min_max0.elem[1], min_max1.elem[1]); + 
shared_min_max[tid].storage = min_max0.storage; + } + __syncthreads(); + } + + if (tid == 0) { + reinterpret_cast(min_max_ptr)[blockIdx.x].storage = shared_min_max[0].storage; + } +} + +template +__global__ void ComputeScaleAndZeroPointBlock(const int min_max_size, const T* min_max_ptr, + const Q upper_bound, const Q lower_bound, + float* scale_ptr, Q* zero_point_ptr) { + using MinMaxPack = cuda::elementwise::Pack; + + extern __shared__ uint8_t buffer[]; + MinMaxPack* shared_min_max = reinterpret_cast(buffer); + int64_t tid = threadIdx.x; + { + MinMaxPack min_max; + min_max.elem[0] = detail::numeric_limits::max(); + min_max.elem[1] = detail::numeric_limits::lowest(); +#pragma unroll + for (int64_t idx = tid; idx < min_max_size; idx += blockDim.x) { + MinMaxPack in = reinterpret_cast(min_max_ptr)[idx]; + min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[0]); + min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[1]); + } + shared_min_max[tid].storage = min_max.storage; + __syncthreads(); + + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + MinMaxPack min_max0, min_max1; + min_max0.storage = shared_min_max[tid].storage; + min_max1.storage = shared_min_max[tid + s].storage; + min_max0.elem[0] = BinaryFuncMin::Invoke(min_max0.elem[0], min_max1.elem[0]); + min_max0.elem[1] = BinaryFuncMax::Invoke(min_max0.elem[1], min_max1.elem[1]); + shared_min_max[tid].storage = min_max0.storage; + } + __syncthreads(); + } + } + + if (threadIdx.x == 0) { + MinMaxPack min_max = shared_min_max[0]; + float min_value = static_cast(min_max.elem[0]); + float max_value = static_cast(min_max.elem[1]); + float scale = (max_value - min_value) / (upper_bound - lower_bound); + int32_t zero_point = lower_bound - __float2int_rn(min_value / scale); + scale_ptr[0] = scale; + zero_point_ptr[0] = static_cast(zero_point); + } +} + +template<> +__global__ void ComputeScaleAndZeroPointBlock( + const int min_max_size, const half* min_max_ptr, const int8_t upper_bound, + const int8_t lower_bound, float* scale_ptr, int8_t* zero_point_ptr) { + using T = half; + using Q = int8_t; + using MinMaxPack4 = cuda::elementwise::Pack; + using MinMaxPack = cuda::elementwise::Pack; + + extern __shared__ uint8_t buffer[]; + MinMaxPack* shared_min_max = reinterpret_cast(buffer); + int64_t tid = threadIdx.x; + + MinMaxPack min_max; + min_max.elem[0] = detail::numeric_limits::max(); + min_max.elem[1] = detail::numeric_limits::lowest(); + +#pragma unroll + for (int idx = tid; idx < (min_max_size >> 2); idx += blockDim.x) { + MinMaxPack4 in = reinterpret_cast(min_max_ptr + (idx << 3))[0]; + min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[0]); + min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[1]); + min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[2]); + min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[3]); + min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[4]); + min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[5]); + min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[6]); + min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[7]); + } + + int rest = ModDiv<4>(min_max_size); + + if (rest > 0 && tid == blockDim.x - 1) { + int offset = (min_max_size - rest) << 1; + MinMaxPack4 in = reinterpret_cast(min_max_ptr + offset)[0]; +#pragma unroll + for (int i = 0; i < rest; ++i) { + min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[i << 1]); + min_max.elem[1] = 
BinaryFuncMax::Invoke(min_max.elem[1], in.elem[(i << 1) + 1]); + } + } + + shared_min_max[tid].storage = min_max.storage; + __syncthreads(); + + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + MinMaxPack min_max0, min_max1; + min_max0.storage = shared_min_max[tid].storage; + min_max1.storage = shared_min_max[tid + s].storage; + min_max0.elem[0] = BinaryFuncMin::Invoke(min_max0.elem[0], min_max1.elem[0]); + min_max0.elem[1] = BinaryFuncMax::Invoke(min_max0.elem[1], min_max1.elem[1]); + shared_min_max[tid].storage = min_max0.storage; + } + __syncthreads(); + } + + if (threadIdx.x == 0) { + MinMaxPack min_max = shared_min_max[0]; + float min_value = static_cast(min_max.elem[0]); + float max_value = static_cast(min_max.elem[1]); + float scale = (max_value - min_value) / (upper_bound - lower_bound); + int32_t zero_point = lower_bound - __float2int_rn(min_value / scale); + scale_ptr[0] = scale; + zero_point_ptr[0] = static_cast(zero_point); + } +} + +template +__global__ void ApplyQuantization(const int64_t elements, const T* in_ptr, const float* scale_ptr, + const Q* zero_point_ptr, const Q upper_bound, const Q lower_bound, + Q* out_ptr) { + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using StoreType = cuda::elementwise::PackType; + using StorePack = cuda::elementwise::Pack; + + int64_t tid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x * pack_size; + + float scale = *scale_ptr; + float zero_point = *zero_point_ptr; + + for (int64_t idx = tid * pack_size; idx < elements; idx += step) { + StorePack out; + LoadPack in; + in.storage = reinterpret_cast(in_ptr + idx)[0]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + out.elem[i] = + max(min(__float2int_rn(static_cast(in.elem[i]) / scale + zero_point), upper_bound), + lower_bound); + } + reinterpret_cast(out_ptr + idx)[0] = out.storage; + } + + int rest = ModDiv(elements); + + if (rest > 0 && tid == (gridDim.x * blockDim.x - 1)) { + in_ptr += elements - rest; + out_ptr += elements - rest; + LoadPack in; + in.storage = reinterpret_cast(in_ptr)[0]; +#pragma unroll + for (int i = 0; i < rest; ++i) { + out_ptr[i] = + max(min(__float2int_rn(static_cast(in.elem[i]) / scale + zero_point), upper_bound), + lower_bound); + } + } +} + +template +void ApplyDynamicQuantization(cudaStream_t stream, const int min_max_size, const T* min_max_ptr, + const int64_t elements, const T* in_ptr, const int quantization_bit, + Q* out_ptr, float* scale_ptr, Q* zero_point_ptr) { + Q upper_bound = (1 << (quantization_bit - 1)) - 1; + Q lower_bound = -upper_bound - 1; + size_t element_bytes = GetSizeOfDataType(GetDataType::value); + + ComputeScaleAndZeroPointBlock + <<<1, cuda::elementwise::kBlockSize, cuda::elementwise::kBlockSize * element_bytes * 2, + stream>>>(min_max_size, min_max_ptr, upper_bound, lower_bound, scale_ptr, zero_point_ptr); + + constexpr int pack_size = cuda::elementwise::PackSize(); + int64_t pack_num = (elements + pack_size - 1) / pack_size; + int grid_size = 0; + cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + ApplyQuantization<<>>( + elements, in_ptr, scale_ptr, zero_point_ptr, upper_bound, lower_bound, out_ptr); +} + +} // namespace + +template +class GpuDynamicQuantizationKernel final : public user_op::OpKernel { + public: + GpuDynamicQuantizationKernel() = default; + ~GpuDynamicQuantizationKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { 
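+    // A sketch of the control flow below: (1) ReduceMinMaxPerTensor performs a packed,
+    // grid-wide min/max tree reduction into tmp_buffer (one MinMaxPack per block);
+    // (2) ApplyDynamicQuantization folds those per-block results into scale/zero_point
+    // in a single block, then launches ApplyQuantization, which computes
+    // q(x) = round(x / scale + zero_point) clamped to [lower_bound, upper_bound].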
+    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
+
+    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
+    user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0);
+    user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0);
+    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
+
+    const std::string quantization_scheme = ctx->Attr<std::string>("quantization_scheme");
+    const int32_t quantization_bit = ctx->Attr<int32_t>("quantization_bit");
+    const bool per_layer_quantization = ctx->Attr<bool>("per_layer_quantization");
+    const std::string quantization_formula = ctx->Attr<std::string>("quantization_formula");
+
+    CHECK(quantization_scheme == "affine");
+
+    const int64_t elements = in->shape_view().elem_cnt();
+
+    constexpr int pack_size = cuda::elementwise::PackSize<T>();
+    int64_t pack_num = (elements + pack_size - 1) / pack_size;
+    int grid_size = 0;
+    cuda::elementwise::GetNumBlocks(pack_num, &grid_size);
+    grid_size = grid_size > 2048 ? 2048 : grid_size;
+
+    size_t element_bytes = GetSizeOfDataType(GetDataType<T>::value);
+    CHECK_GE(tmp_buffer->shape_view().elem_cnt(), grid_size * element_bytes * 2);
+
+    T* min_max = reinterpret_cast<T*>(tmp_buffer->mut_dptr());
+    auto stream = ctx->stream()->As<ep::CudaStream>()->cuda_stream();
+    if (per_layer_quantization) {
+      ReduceMinMaxPerTensor<T, pack_size>
+          <<<grid_size, cuda::elementwise::kBlockSize,
+             cuda::elementwise::kBlockSize * element_bytes * 2, stream>>>(elements, in->dptr<T>(),
+                                                                          min_max);
+    } else {
+      UNIMPLEMENTED() << "dynamic_quantization does not support per-channel quantization";
+    }
+
+    if (quantization_formula == "oneflow") {
+      if (quantization_bit == 8) {
+        ApplyDynamicQuantization<T, int8_t>(
+            stream, grid_size, min_max, elements, in->dptr<T>(), quantization_bit,
+            out->mut_dptr<int8_t>(), scale->mut_dptr<float>(), zero_point->mut_dptr<int8_t>());
+      } else {
+        UNIMPLEMENTED();
+      }
+    } else {
+      UNIMPLEMENTED() << "dynamic_quantization only supports the oneflow quantization formula";
+    }
+  }
+
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+#define REGISTER_DYNAMIC_QUANTIZATION_KERNEL(dtype)                                            \
+  REGISTER_USER_KERNEL("dynamic_quantization")                                                 \
+      .SetCreateFn<GpuDynamicQuantizationKernel<dtype>>()                                      \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                         \
+                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))        \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 128 * 1024 * 1024; })
+
+REGISTER_DYNAMIC_QUANTIZATION_KERNEL(float);
+REGISTER_DYNAMIC_QUANTIZATION_KERNEL(double);
+REGISTER_DYNAMIC_QUANTIZATION_KERNEL(half);
+
+}  // namespace oneflow
diff --git a/oneflow/user/kernels/fused_activation_min_max_observer_kernel.cu b/oneflow/user/kernels/fused_activation_min_max_observer_kernel.cu
deleted file mode 100644
index 1441cd6e8b3..00000000000
--- a/oneflow/user/kernels/fused_activation_min_max_observer_kernel.cu
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#include "oneflow/core/cuda/elementwise.cuh" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/kernel_util.cuh" -#include "oneflow/core/ndarray/binary_func.h" -#include "oneflow/core/kernel/util/numeric_limits.cuh" - -namespace oneflow { - -namespace { - -template -__host__ __device__ int ModDiv(int64_t N) { - return N - (N / M * M); -} - -template<> -__host__ __device__ int ModDiv<2>(int64_t N) { - return N & 0x1; -} - -template<> -__host__ __device__ int ModDiv<4>(int64_t N) { - return N & 0x3; -} - -template<> -__host__ __device__ int ModDiv<8>(int64_t N) { - return N & 0x7; -} - -template<> -__host__ __device__ int ModDiv<16>(int64_t N) { - return N & 0xF; -} - -template -__global__ void ReduceMinMaxPerLayer(const int64_t elements, const T* in_ptr, T* min_max_ptr) { - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using MinMaxPack = cuda::elementwise::Pack; - - extern __shared__ uint8_t buffer[]; - - MinMaxPack min_max; - min_max.elem[0] = detail::numeric_limits::max(); - min_max.elem[1] = detail::numeric_limits::lowest(); - - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x * pack_size; - - for (int64_t idx = gid * pack_size; idx < elements; idx += step) { - LoadPack in; - in.storage = reinterpret_cast(in_ptr + idx)[0]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[i]); - min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[i]); - } - } - int rest = ModDiv(elements); - if (rest > 0 && gid == (gridDim.x * blockDim.x - 1)) { - in_ptr += elements - rest; - LoadPack in; - in.storage = reinterpret_cast(in_ptr)[0]; -#pragma unroll - for (int i = 0; i < rest; ++i) { - min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[i]); - min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[i]); - } - } - - int64_t tid = threadIdx.x; - - MinMaxPack* shared_min_max = reinterpret_cast(buffer); - shared_min_max[tid].storage = min_max.storage; - __syncthreads(); - - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { - MinMaxPack min_max0, min_max1; - min_max0.storage = shared_min_max[tid].storage; - min_max1.storage = shared_min_max[tid + s].storage; - min_max0.elem[0] = BinaryFuncMin::Invoke(min_max0.elem[0], min_max1.elem[0]); - min_max0.elem[1] = BinaryFuncMax::Invoke(min_max0.elem[1], min_max1.elem[1]); - shared_min_max[tid].storage = min_max0.storage; - } - __syncthreads(); - } - - if (tid == 0) { - reinterpret_cast(min_max_ptr)[blockIdx.x].storage = shared_min_max[0].storage; - } -} - -template -__global__ void ComputeOFScaleAndZeroPoint(const T* min_max_ptr, const int min_max_size, - const int quantization_bit, const float* weight_scale, - const float* weight_acc, const T* bias, T* in_scale, - Q* in_zero_point, T* out_scale, T* out_bias, - const int out_elements) { - using MinMaxPack = cuda::elementwise::Pack; - - extern __shared__ uint8_t buffer[]; - MinMaxPack* shared_min_max = reinterpret_cast(buffer); - int64_t tid = threadIdx.x; - { - MinMaxPack min_max; - min_max.elem[0] = detail::numeric_limits::max(); - min_max.elem[1] = detail::numeric_limits::lowest(); -#pragma unroll - for (int64_t idx = threadIdx.x; idx < min_max_size; idx += blockDim.x) { - MinMaxPack in = reinterpret_cast(min_max_ptr)[idx]; - min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[0]); - min_max.elem[1] = 
BinaryFuncMax::Invoke(min_max.elem[1], in.elem[1]); - } - shared_min_max[tid].storage = min_max.storage; - __syncthreads(); - - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { - MinMaxPack min_max0, min_max1; - min_max0.storage = shared_min_max[tid].storage; - min_max1.storage = shared_min_max[tid + s].storage; - min_max0.elem[0] = BinaryFuncMin::Invoke(min_max0.elem[0], min_max1.elem[0]); - min_max0.elem[1] = BinaryFuncMax::Invoke(min_max0.elem[1], min_max1.elem[1]); - shared_min_max[tid].storage = min_max0.storage; - } - __syncthreads(); - } - } - - MinMaxPack min_max = shared_min_max[0]; - float min_value = static_cast(min_max.elem[0]); - float max_value = static_cast(min_max.elem[1]); - float input_scale = (max_value - min_value) / ((1 << quantization_bit) - 1); - int32_t input_zero_point = - -(__float2int_rn(min_value / input_scale) + (1 << (quantization_bit - 1))); - float scale_zero_point = -input_scale * input_zero_point; - - int64_t thread_num = gridDim.x * blockDim.x; - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - if (gid == 0) { - in_scale[0] = static_cast(input_scale); - in_zero_point[0] = static_cast(input_zero_point); - } - - using LoadWPack = cuda::elementwise::Pack; - using LoadBPack = cuda::elementwise::Pack; - using StorePack = cuda::elementwise::Pack; - - if (bias) { - for (int64_t idx = gid << 2; idx < out_elements; idx += thread_num << 2) { - LoadWPack w_scale = reinterpret_cast(weight_scale + idx)[0]; - LoadWPack w_acc = reinterpret_cast(weight_acc + idx)[0]; - LoadBPack b = reinterpret_cast(bias + idx)[0]; - StorePack store_scale, store_bias; - - store_scale.elem[0] = static_cast(w_scale.elem[0] * input_scale); - store_scale.elem[1] = static_cast(w_scale.elem[1] * input_scale); - store_scale.elem[2] = static_cast(w_scale.elem[2] * input_scale); - store_scale.elem[3] = static_cast(w_scale.elem[3] * input_scale); - - store_bias.elem[0] = static_cast(__fmaf_rn(w_acc.elem[0], scale_zero_point, b.elem[0])); - store_bias.elem[1] = static_cast(__fmaf_rn(w_acc.elem[1], scale_zero_point, b.elem[1])); - store_bias.elem[2] = static_cast(__fmaf_rn(w_acc.elem[2], scale_zero_point, b.elem[2])); - store_bias.elem[3] = static_cast(__fmaf_rn(w_acc.elem[3], scale_zero_point, b.elem[3])); - - reinterpret_cast(out_scale + idx)[0] = store_scale; - reinterpret_cast(out_bias + idx)[0] = store_bias; - } - int rest = ModDiv<4>(out_elements); - if (rest > 0 && gid == (thread_num - 1)) { - int offset = out_elements - rest; - LoadWPack w_scale = reinterpret_cast(weight_scale + offset)[0]; - LoadWPack w_acc = reinterpret_cast(weight_acc + offset)[0]; - LoadBPack b = reinterpret_cast(bias + offset)[0]; -#pragma unroll - for (int i = 0; i < rest; ++i) { - out_scale[offset + i] = static_cast(w_scale.elem[i] * input_scale); - out_bias[offset + i] = - static_cast(__fmaf_rn(w_acc.elem[i], scale_zero_point, b.elem[i])); - } - } - } else { - for (int64_t idx = gid << 2; idx < out_elements; idx += thread_num << 2) { - LoadWPack w_scale = reinterpret_cast(weight_scale + idx)[0]; - LoadWPack w_acc = reinterpret_cast(weight_acc + idx)[0]; - StorePack store_scale, store_bias; - - store_scale.elem[0] = static_cast(w_scale.elem[0] * input_scale); - store_scale.elem[1] = static_cast(w_scale.elem[1] * input_scale); - store_scale.elem[2] = static_cast(w_scale.elem[2] * input_scale); - store_scale.elem[3] = static_cast(w_scale.elem[3] * input_scale); - - store_bias.elem[0] = static_cast(w_acc.elem[0] * scale_zero_point); - store_bias.elem[1] = static_cast(w_acc.elem[1] * 
scale_zero_point); - store_bias.elem[2] = static_cast(w_acc.elem[2] * scale_zero_point); - store_bias.elem[3] = static_cast(w_acc.elem[3] * scale_zero_point); - - reinterpret_cast(out_scale + idx)[0] = store_scale; - reinterpret_cast(out_bias + idx)[0] = store_bias; - } - int rest = ModDiv<4>(out_elements); - if (rest > 0 && gid == (thread_num - 1)) { - int offset = out_elements - rest; - LoadWPack w_scale = reinterpret_cast(weight_scale + offset)[0]; - LoadWPack w_acc = reinterpret_cast(weight_acc + offset)[0]; -#pragma unroll - for (int i = 0; i < rest; ++i) { - out_scale[offset + i] = static_cast(w_scale.elem[i] * input_scale); - out_bias[offset + i] = static_cast(w_acc.elem[i] * scale_zero_point); - } - } - } -} - -} // namespace - -template -class GpuFusedActivationMinMaxObserverKernel final : public user_op::OpKernel { - public: - GpuFusedActivationMinMaxObserverKernel() = default; - ~GpuFusedActivationMinMaxObserverKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - const user_op::Tensor* weight_scale = ctx->Tensor4ArgNameAndIndex("weight_scale", 0); - const user_op::Tensor* weight_acc = ctx->Tensor4ArgNameAndIndex("weight_acc", 0); - const user_op::Tensor* bias = nullptr; - if (ctx->has_input("bias", 0)) { bias = ctx->Tensor4ArgNameAndIndex("bias", 0); } - - user_op::Tensor* in_scale = ctx->Tensor4ArgNameAndIndex("in_scale", 0); - user_op::Tensor* in_zero_point = ctx->Tensor4ArgNameAndIndex("in_zero_point", 0); - user_op::Tensor* out_scale = ctx->Tensor4ArgNameAndIndex("out_scale", 0); - user_op::Tensor* out_bias = ctx->Tensor4ArgNameAndIndex("out_bias", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const std::string quantization_scheme = ctx->Attr("quantization_scheme"); - const int32_t quantization_bit = ctx->Attr("quantization_bit"); - const bool per_layer_quantization = ctx->Attr("per_layer_quantization"); - const std::string quantization_formula = ctx->Attr("quantization_formula"); - - CHECK(quantization_scheme == "affine"); - CHECK(quantization_bit == 8); - - const int64_t elements = in->shape_view().elem_cnt(); - - constexpr int pack_size = cuda::elementwise::PackSize(); - int grid_size = 0; - int64_t pack_num = (elements + pack_size - 1) / pack_size; - cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - grid_size = grid_size > 2048 ? 2048 : grid_size; - - size_t element_bytes = GetSizeOfDataType(GetDataType::value); - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), grid_size * element_bytes * 2); - - T* min_max = reinterpret_cast(tmp_buffer->mut_dptr()); - auto stream = ctx->stream()->As()->cuda_stream(); - if (per_layer_quantization) { - ReduceMinMaxPerLayer - <<>>(elements, in->dptr(), - min_max); - } else { - UNIMPLEMENTED() - << "fused_activation_min_max_observer does not support per-channel quantization"; - } - - if (quantization_formula == "oneflow") { - if (quantization_bit == 8) { - ComputeOFScaleAndZeroPoint - <<<1, cuda::elementwise::kBlockSize, cuda::elementwise::kBlockSize * element_bytes * 2, - stream>>>(min_max, grid_size, quantization_bit, weight_scale->dptr(), - weight_acc->dptr(), bias ? 
bias->dptr() : nullptr, - in_scale->mut_dptr(), in_zero_point->mut_dptr(), - out_scale->mut_dptr(), out_bias->mut_dptr(), - out_scale->shape_view().elem_cnt()); - } else { - UNIMPLEMENTED(); - } - } else { - UNIMPLEMENTED() - << "fused_activation_min_max_observer only support oneflow quantization formula"; - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_ACTIVATION_MIN_MAX_OBSERVER_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_activation_min_max_observer") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 128 * 1024 * 1024; }) - -REGISTER_FUSED_ACTIVATION_MIN_MAX_OBSERVER_KERNEL(float); -REGISTER_FUSED_ACTIVATION_MIN_MAX_OBSERVER_KERNEL(double); -REGISTER_FUSED_ACTIVATION_MIN_MAX_OBSERVER_KERNEL(half); - -} // namespace oneflow diff --git a/oneflow/user/ops/fused_min_max_observer_op.cpp b/oneflow/user/ops/dynamic_quantization_op.cpp similarity index 56% rename from oneflow/user/ops/fused_min_max_observer_op.cpp rename to oneflow/user/ops/dynamic_quantization_op.cpp index 6ec8d4c856b..70a9c9f7f2a 100644 --- a/oneflow/user/ops/fused_min_max_observer_op.cpp +++ b/oneflow/user/ops/dynamic_quantization_op.cpp @@ -18,31 +18,26 @@ limitations under the License. namespace oneflow { -/* static */ Maybe FusedActivationMinMaxObserverOp::InferLogicalTensorDesc( - user_op::InferContext* ctx) { +/* static */ Maybe DynamicQuantizationOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { CHECK_OR_RETURN(ctx->Attr("per_layer_quantization")) - << "activation min_max_observer only support per-layer quantization"; - const Shape& weight_scale_shape = ctx->InputShape("weight_scale", 0); - - ctx->SetOutputShape("in_scale", 0, Shape({1})); - ctx->SetOutputShape("in_zero_point", 0, Shape({1})); - ctx->SetOutputShape("out_scale", 0, {weight_scale_shape.Count(0)}); - ctx->SetOutputShape("out_bias", 0, {weight_scale_shape.Count(0)}); + << "dynamic quantization only supports per-layer quantization"; + ctx->SetOutputShape("out", 0, ctx->InputShape("in", 0)); + ctx->SetOutputShape("scale", 0, Shape({1})); + ctx->SetOutputShape("zero_point", 0, Shape({1})); return Maybe::Ok(); } -/*static*/ Maybe FusedActivationMinMaxObserverOp::InferPhysicalTensorDesc( - user_op::InferContext* ctx) { +/*static*/ Maybe DynamicQuantizationOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { return InferLogicalTensorDesc(ctx); } -/* static */ Maybe FusedActivationMinMaxObserverOp::GetSbp(user_op::SbpContext* ctx) { +/* static */ Maybe DynamicQuantizationOp::GetSbp(user_op::SbpContext* ctx) { // NOTE(Liang Depeng): input needs to be broadcast in order to accurately calculate the // global scale and zero_point return Maybe::Ok(); } -/* static */ Maybe FusedActivationMinMaxObserverOp::CheckAttr( +/* static */ Maybe DynamicQuantizationOp::CheckAttr( const user_op::UserOpDefWrapper& def, const user_op::UserOpConfWrapper& op_conf) { int32_t quantization_bit = op_conf.attr("quantization_bit"); CHECK_GT_OR_RETURN(quantization_bit, 1); @@ -57,30 +52,20 @@ namespace oneflow { return Maybe::Ok(); } -/* static */ Maybe FusedActivationMinMaxObserverOp::InferDataType( - user_op::InferContext* ctx) { - CHECK_EQ_OR_RETURN(ctx->InputDType("weight_scale", 0), DataType::kFloat) - << "weight_scale dtype should be float"; - CHECK_EQ_OR_RETURN(ctx->InputDType("weight_acc", 0), DataType::kFloat) - << "weight_acc dtype 
should be float"; - - DataType data_type = ctx->InputDType("in", 0); - if (ctx->has_input("bias", 0)) { CHECK_EQ_OR_RETURN(data_type, ctx->InputDType("bias", 0)); } - +/* static */ Maybe DynamicQuantizationOp::InferDataType(user_op::InferContext* ctx) { int32_t quantization_bit = ctx->Attr("quantization_bit"); const std::string& quantization_formula = ctx->Attr("quantization_formula"); if (quantization_formula == "oneflow") { if (quantization_bit == 8) { - ctx->SetOutputDType("in_zero_point", 0, DataType::kInt8); + ctx->SetOutputDType("out", 0, DataType::kInt8); + ctx->SetOutputDType("zero_point", 0, DataType::kInt8); } else { OF_UNIMPLEMENTED(); } } else { - ctx->SetOutputDType("in_zero_point", 0, data_type); + OF_UNIMPLEMENTED(); } - ctx->SetOutputDType("in_scale", 0, data_type); - ctx->SetOutputDType("out_scale", 0, data_type); - ctx->SetOutputDType("out_bias", 0, data_type); + ctx->SetOutputDType("scale", 0, DataType::kFloat); return Maybe::Ok(); } From 13c06699c0f2eec1bc3a37fdbf3ffdc3bdf6109c Mon Sep 17 00:00:00 2001 From: jackalcooper Date: Wed, 6 Sep 2023 10:29:34 +0800 Subject: [PATCH 49/65] fmt --- oneflow/ir/include/OneFlow/OneFlowUserOps.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index ca0a3bfc71d..c4b23d1a462 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -12222,7 +12222,7 @@ class OneFlow_JITLikeOp : OneFlow_BaseOpgetAttrOfType("callee"); } - + void setCalleeFromCallable(CallInterfaceCallable callee) { (*this)->setAttr("callee", callee.get()); } From 14c4bba3120cb312eb90602da6eb41c08c8a1217 Mon Sep 17 00:00:00 2001 From: clackhan Date: Wed, 6 Sep 2023 08:23:00 +0000 Subject: [PATCH 50/65] prune_reduntant_quant_from_input_op --- oneflow/ir/lib/OneFlow/Passes.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/oneflow/ir/lib/OneFlow/Passes.cpp b/oneflow/ir/lib/OneFlow/Passes.cpp index e8305903e4b..6fbe5244800 100644 --- a/oneflow/ir/lib/OneFlow/Passes.cpp +++ b/oneflow/ir/lib/OneFlow/Passes.cpp @@ -523,6 +523,26 @@ struct PruneReduntantQuantizationOpsPattern : public OpInterfaceRewritePattern { + explicit PruneReduntantQuantizationFromInputOpPattern(mlir::MLIRContext* context) + : OpRewritePattern(context, /*benefit=*/1) {} + + public: + LogicalResult matchAndRewrite(InputOp op, PatternRewriter& rewriter) const override { + SmallVector quantOps; + for (auto u : op->getUsers()) { + if (auto q = llvm::dyn_cast(u)) { quantOps.push_back(q); } + } + if (quantOps.size() <= 1) { return failure(); } + auto q0 = *quantOps.begin(); + for (oneflow::QuantizationOp q : quantOps) { + if (q != q0) { + q->replaceAllUsesWith(q0->getResults()); } + } + return success(); + } +}; + struct AutoNhwcPattern : public OpInterfaceRewritePattern { explicit AutoNhwcPattern(mlir::MLIRContext* context) : OpInterfaceRewritePattern(context, /*benefit=*/1) {} @@ -1151,6 +1171,7 @@ void populateFuserForExistingOp(::mlir::RewritePatternSet& patterns) { patterns.add>(patterns.getContext()); patterns.add>(patterns.getContext()); patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); } void populateAutoNhwcPatterns(::mlir::RewritePatternSet& patterns) { From d30b33afdf6948c3e468948cf017572e0be3eb1a Mon Sep 17 00:00:00 2001 From: clackhan Date: Wed, 6 Sep 2023 08:50:53 +0000 Subject: [PATCH 51/65] refine --- oneflow/ir/lib/OneFlow/Passes.cpp | 102 +++++++++++------------------- 1 file 
changed, 38 insertions(+), 64 deletions(-) diff --git a/oneflow/ir/lib/OneFlow/Passes.cpp b/oneflow/ir/lib/OneFlow/Passes.cpp index 0b9239211d9..9e4e1b906a5 100644 --- a/oneflow/ir/lib/OneFlow/Passes.cpp +++ b/oneflow/ir/lib/OneFlow/Passes.cpp @@ -504,44 +504,49 @@ LogicalResult FusedConsecutiveAddPattern::matchAndRewrite(Add2Op op, return TryFusedConsecutiveAdd(op, {op.getIn0(), op.getIn1()}, rewriter); } -struct PruneReduntantQuantizationOpsPattern : public OpInterfaceRewritePattern { - explicit PruneReduntantQuantizationOpsPattern(mlir::MLIRContext* context) - : OpInterfaceRewritePattern(context, /*benefit=*/1) {} - - public: - LogicalResult matchAndRewrite(UserOpCompatible op, PatternRewriter& rewriter) const override { - DenseMap> quantOps; - DenseMap> dynamic_quantOps; - for (auto result : op->getResults()) { - for (auto u : result.getUsers()) { - if (auto q = llvm::dyn_cast(u)) { quantOps[result].push_back(q); } - if (auto q = llvm::dyn_cast(u)) { - dynamic_quantOps[result].push_back(q); - } +template +LogicalResult PruneReduntantQuantization(OpType op, PatternRewriter& rewriter) { + DenseMap> quantOps; + DenseMap> dynamic_quantOps; + for (auto result : op->getResults()) { + for (auto u : result.getUsers()) { + if (auto q = llvm::dyn_cast(u)) { quantOps[result].push_back(q); } + if (auto q = llvm::dyn_cast(u)) { + dynamic_quantOps[result].push_back(q); } } - bool pruned = false; - for (const auto& it : quantOps) { - auto q0 = it.second[0]; - for (auto q : it.second) { - if (q != q0) { - q->replaceAllUsesWith(q0->getResults()); - q->erase(); - pruned = true; - } + } + bool pruned = false; + for (const auto& it : quantOps) { + auto q0 = it.second[0]; + for (auto q : it.second) { + if (q != q0) { + q->replaceAllUsesWith(q0->getResults()); + q->erase(); + pruned = true; } } - for (const auto& it : dynamic_quantOps) { - auto q0 = it.second[0]; - for (auto q : it.second) { - if (q != q0) { - q->replaceAllUsesWith(q0->getResults()); - q->erase(); - pruned = true; - } + } + for (const auto& it : dynamic_quantOps) { + auto q0 = it.second[0]; + for (auto q : it.second) { + if (q != q0) { + q->replaceAllUsesWith(q0->getResults()); + q->erase(); + pruned = true; } } - return success(pruned); + } + return success(pruned); +} + +struct PruneReduntantQuantizationOpsPattern : public OpInterfaceRewritePattern { + explicit PruneReduntantQuantizationOpsPattern(mlir::MLIRContext* context) + : OpInterfaceRewritePattern(context, /*benefit=*/1) {} + + public: + LogicalResult matchAndRewrite(UserOpCompatible op, PatternRewriter& rewriter) const override { + return PruneReduntantQuantization(op, rewriter); } }; @@ -551,38 +556,7 @@ struct PruneReduntantQuantizationFromInputOpPattern : public OpRewritePattern> quantOps; - DenseMap> dynamic_quantOps; - for (auto result : op->getResults()) { - for (auto u : result.getUsers()) { - if (auto q = llvm::dyn_cast(u)) { quantOps[result].push_back(q); } - if (auto q = llvm::dyn_cast(u)) { - dynamic_quantOps[result].push_back(q); - } - } - } - bool pruned = false; - for (const auto& it : quantOps) { - auto q0 = it.second[0]; - for (auto q : it.second) { - if (q != q0) { - q->replaceAllUsesWith(q0->getResults()); - q->erase(); - pruned = true; - } - } - } - for (const auto& it : dynamic_quantOps) { - auto q0 = it.second[0]; - for (auto q : it.second) { - if (q != q0) { - q->replaceAllUsesWith(q0->getResults()); - q->erase(); - pruned = true; - } - } - } - return success(pruned); + return PruneReduntantQuantization(op, rewriter); } }; From 
78b51cda11bfde278189011810fa017e2b39ab4a Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 6 Sep 2023 09:40:33 +0000 Subject: [PATCH 52/65] fuse dynamic quant conv --- oneflow/core/functional/functional_api.yaml | 12 ++- oneflow/core/functional/impl/nn_functor.cpp | 73 +++++++++++++++++++ .../cutlass_conv_tuning_warmup_pass.cpp | 43 ++++++++++- oneflow/ir/include/OneFlow/OneFlowUserOps.td | 3 + .../ir/lib/OneFlow/Transform/AutoNHWCOps.cpp | 3 + oneflow/user/kernels/conv_quant_kernels.cu | 67 +++++++++-------- .../cutlass_conv2d_operation_cache_key.h | 38 ++++++---- .../user/kernels/cutlass_conv_tuner_impl.cpp | 31 ++++---- oneflow/user/ops/conv_quant_op.cpp | 60 +++++++++------ 9 files changed, 241 insertions(+), 89 deletions(-) diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index ba02d1fba26..5abf4134dbb 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -1053,9 +1053,15 @@ - name: "conv2d_quant" signature: - 'Tensor (Tensor input, Tensor weight, Tensor input_zero_point, Tensor scale=None, Tensor bias=None, Int32List[2] stride=1, - Int32List[2] padding=0, Int32List[2] dilation=1, Int32 groups=1, - String channel_pos="channels_first", DataType output_dtype=None) => Conv2dQuant' + [ + 'Tensor (Tensor input, Tensor weight, Tensor input_zero_point, Tensor scale=None, Tensor bias=None, + Int32List[2] stride=1, Int32List[2] padding=0, Int32List[2] dilation=1, Int32 groups=1, + String channel_pos="channels_first", DataType output_dtype=None) => Conv2dQuant', + 'Tensor (Tensor input, Tensor weight, Tensor input_zero_point, Tensor input_scale, Tensor weight_scale, + Tensor weight_acc, Tensor bias=None, + Int32List[2] stride=1, Int32List[2] padding=0, Int32List[2] dilation=1, Int32 groups=1, + String channel_pos="channels_first", DataType output_dtype=None) => Conv2dQuant', + ] bind_python: True - name: "matmul_quant" diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 5cd2613d665..d03eaf654ed 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -227,6 +227,78 @@ class Conv2dQuantFunctor : public ConvQuantBaseFunctor { } }; +class ConvQuantWithInputScaleBaseFunctor { + public: + explicit ConvQuantWithInputScaleBaseFunctor(const int& num_spatial_dims) + : num_spatial_dims_(num_spatial_dims) {} + virtual ~ConvQuantWithInputScaleBaseFunctor() = default; + + Maybe operator()(const std::shared_ptr& input, + const std::shared_ptr& weight, + const std::shared_ptr& input_zero_point, + const std::shared_ptr& input_scale, + const std::shared_ptr& weight_scale, + const std::shared_ptr& weight_acc, + const Optional& bias, const std::vector& stride, + const std::vector& padding, + const std::vector& dilation, const int32_t& groups, + const std::string& channel_pos, + const Optional>& output_dtype) const { + std::vector kernel_size_vec(num_spatial_dims_); + int32_t kernel_idx_offset = 2; + if (channel_pos == "channels_last") { kernel_idx_offset = 1; } + + for (int i = 0; i < num_spatial_dims_; i++) { + kernel_size_vec.at(i) = ((weight->shape())->At(i + kernel_idx_offset)); + } + auto& conv_attrs = + THREAD_CACHED_MUTABLE_ATTR_MAP("filters", "kernel_size", "padding_before", "strides", + "dilation_rate", "groups", "data_format", "out_dtype"); + conv_attrs.SetAllAttrs(static_cast(weight->shape()->At(0)), kernel_size_vec, padding, + stride, dilation, groups, channel_pos, + 
output_dtype.value_or(DType::Float())->data_type()); + if (bias) { + return OpInterpUtil::Dispatch( + *conv_bias_op_, + {input, weight, input_zero_point, input_scale, weight_scale, weight_acc, JUST(bias)}, + conv_attrs); + } + return OpInterpUtil::Dispatch( + *conv_op_, {input, weight, input_zero_point, input_scale, weight_scale, weight_acc}, + conv_attrs); + } + + protected: + std::shared_ptr conv_op_; + std::shared_ptr conv_bias_op_; + int32_t num_spatial_dims_; +}; + +class Conv2dQuantWithInputScaleFunctor : public ConvQuantWithInputScaleBaseFunctor { + public: + Conv2dQuantWithInputScaleFunctor() : ConvQuantWithInputScaleBaseFunctor(/*num_spatial_dims_=*/2) { + conv_op_ = CHECK_JUST(one::OpBuilder("conv2d_quant") + .Input("in") + .Input("weight") + .Input("in_zero_point") + .Input("in_scale") + .Input("weight_scale") + .Input("weight_acc") + .Output("out") + .Build()); + conv_bias_op_ = CHECK_JUST(one::OpBuilder("conv2d_quant") + .Input("in") + .Input("weight") + .Input("in_zero_point") + .Input("in_scale") + .Input("weight_scale") + .Input("weight_acc") + .Input("bias") + .Output("out") + .Build()); + } +}; + class DeConvBaseFunctor { public: explicit DeConvBaseFunctor(const int& num_spatial_dims) : num_spatial_dims_(num_spatial_dims) { @@ -5536,6 +5608,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("Deconv2d"); m.add_functor("Deconv3d"); m.add_functor("Conv2dQuant"); + m.add_functor("Conv2dQuant"); m.add_functor("EmbeddingReNorm"); m.add_functor("Embedding"); m.add_functor("MatMul"); diff --git a/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp b/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp index aee754d811b..2c8d1253a2b 100644 --- a/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp +++ b/oneflow/core/job_rewriter/cutlass_conv_tuning_warmup_pass.cpp @@ -114,6 +114,9 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const .ByteSizeOfBlobBody()); } size_t zero_point_size = 0; + size_t in_scale_size = 0; + size_t filter_scale_size = 0; + size_t filter_acc_size = 0; size_t scale_size = 0; size_t add_to_output_size = 0; if (conv2d_op.has_input("in_zero_point", 0)) { @@ -121,6 +124,21 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("in_zero_point", 0))) .ByteSizeOfBlobBody()); } + if (conv2d_op.has_input("in_scale", 0)) { + in_scale_size = GetCudaAlignedSize( + node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("in_scale", 0))) + .ByteSizeOfBlobBody()); + } + if (conv2d_op.has_input("filter_scale", 0)) { + filter_scale_size = GetCudaAlignedSize( + node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("filter_scale", 0))) + .ByteSizeOfBlobBody()); + } + if (conv2d_op.has_input("filter_acc", 0)) { + filter_acc_size = GetCudaAlignedSize( + node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("filter_acc", 0))) + .ByteSizeOfBlobBody()); + } if (conv2d_op.has_input("scale", 0)) { scale_size = GetCudaAlignedSize( node->LogicalBlobDesc4Lbi(GenLogicalBlobId(conv2d_op.input("scale", 0))) @@ -132,8 +150,9 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const .ByteSizeOfBlobBody()); } - const size_t total_buf_size = - x_size + w_size + y_size + bias_size + zero_point_size + scale_size + add_to_output_size; + const size_t total_buf_size = x_size + w_size + y_size + bias_size + zero_point_size + + in_scale_size + filter_scale_size + filter_acc_size + scale_size + + add_to_output_size; if (total_buf_size > 
buffer_size) { size_t malloc_size = RoundUp(total_buf_size, kBufferMallocAlign); OF_CUDA_CHECK(cudaFree(buffer)); @@ -179,7 +198,6 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const cutlass::library::ConvArguments arguments; arguments.A = x_ptr; arguments.B = w_ptr; - arguments.reordered_B = nullptr; arguments.C = bias_ptr; arguments.D = y_ptr; union SP { @@ -219,6 +237,21 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const zero_point_ptr = buffer + offset; offset += zero_point_size; } + void* in_scale_ptr = nullptr; + if (in_scale_size) { + in_scale_ptr = buffer + offset; + offset += in_scale_size; + } + void* filter_scale_ptr = nullptr; + if (filter_scale_size) { + filter_scale_ptr = buffer + offset; + offset += filter_scale_size; + } + void* filter_acc_ptr = nullptr; + if (filter_acc_size) { + filter_acc_ptr = buffer + offset; + offset += filter_acc_size; + } void* scale_ptr = nullptr; if (scale_size) { scale_ptr = buffer + offset; @@ -252,8 +285,10 @@ Maybe CutlassConvTuningWarmupPass::Apply(Job* job, JobPassCtx* ctx) const cutlass::library::ConvScaleBiasFusionArguments arguments; arguments.A = x_ptr; arguments.B = w_ptr; - arguments.reordered_B = nullptr; arguments.P = zero_point_ptr; + arguments.InScale = in_scale_ptr; + arguments.FilterScale = filter_scale_ptr; + arguments.FilterAcc = filter_acc_ptr; arguments.Scale = scale_ptr; arguments.Bias = bias_ptr; arguments.Residual = add_to_output_ptr; diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index ca0a3bfc71d..8655356f65e 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -8386,6 +8386,9 @@ def OneFlow_Conv2DQuantOp : OneFlow_BaseOp<"conv2d_quant", [NoMemoryEffect, Attr OneFlow_Tensor:$in, OneFlow_Tensor:$weight, OneFlow_Tensor:$in_zero_point, + Optional:$in_scale, + Optional:$weight_scale, + Optional:$weight_acc, Optional:$scale, Optional:$bias, Optional:$_add_to_output diff --git a/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp b/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp index 4f1867f342a..c0a6497b62c 100644 --- a/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp +++ b/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp @@ -339,6 +339,9 @@ llvm::SmallVector Conv2DQuantOp::NchwToNhwc(llvm::SmallVectorget_addToOutput()) { operands.push_back(value[2]); } diff --git a/oneflow/user/kernels/conv_quant_kernels.cu b/oneflow/user/kernels/conv_quant_kernels.cu index 402f8396ad3..ef0ad093cc0 100644 --- a/oneflow/user/kernels/conv_quant_kernels.cu +++ b/oneflow/user/kernels/conv_quant_kernels.cu @@ -68,14 +68,14 @@ void LaunchConvQuantOpImpl(user_op::KernelComputeContext* ctx, CHECK(run_status == cutlass::Status::kSuccess); } -void LaunchConv2dQuantScaleBiasFusionOp(user_op::KernelComputeContext* ctx, - const cutlass::library::ConvFunctionalKey& key, - const cutlass::conv::Conv2dProblemSize& problem_size, - const user_op::Tensor* in, const user_op::Tensor* weight, - const user_op::Tensor* in_zero_point, - const user_op::Tensor* scale, const user_op::Tensor* bias, - const user_op::Tensor* add_to_output, - user_op::Tensor* out) { +void LaunchConv2dQuantOp(user_op::KernelComputeContext* ctx, + const cutlass::library::ConvFunctionalKey& key, + const cutlass::conv::Conv2dProblemSize& problem_size, + const user_op::Tensor* in, const user_op::Tensor* weight, + const user_op::Tensor* in_zero_point, const user_op::Tensor* in_scale, + const user_op::Tensor* weight_scale, const 
user_op::Tensor* weight_acc, + const user_op::Tensor* scale, const user_op::Tensor* bias, + const user_op::Tensor* add_to_output, user_op::Tensor* out) { cutlass::library::Conv2dScaleBiasFusionConfiguration configuraion; configuraion.split_k_mode = cutlass::conv::SplitKMode::kSerial; configuraion.problem_size = problem_size; @@ -83,23 +83,30 @@ void LaunchConv2dQuantScaleBiasFusionOp(user_op::KernelComputeContext* ctx, problem_size.H * problem_size.W * problem_size.C}; configuraion.stride_b = {problem_size.C, problem_size.S * problem_size.C, problem_size.R * problem_size.S * problem_size.C}; - configuraion.stride_residual = {problem_size.K, problem_size.Q * problem_size.K, - problem_size.P * problem_size.Q * problem_size.K}; - + if (add_to_output) { + configuraion.stride_residual = {problem_size.K, problem_size.Q * problem_size.K, + problem_size.P * problem_size.Q * problem_size.K}; + } cutlass::library::ConvScaleBiasFusionArguments arguments; arguments.A = in->dptr(); arguments.B = weight->dptr(); - arguments.reordered_B = nullptr; arguments.P = in_zero_point->dptr(); - arguments.Scale = scale->dptr(); - arguments.Bias = bias->dptr(); - if (add_to_output) { - arguments.Residual = add_to_output->dptr(); - } else { - arguments.Residual = nullptr; - } arguments.D = out->mut_dptr(); + arguments.InScale = nullptr; + arguments.FilterScale = nullptr; + arguments.FilterAcc = nullptr; + arguments.Scale = nullptr; + arguments.Bias = nullptr; + arguments.Residual = nullptr; + + if (in_scale) { arguments.InScale = in_scale->dptr(); } + if (weight_scale) { arguments.FilterScale = weight_scale->dptr(); } + if (weight_acc) { arguments.FilterAcc = weight_acc->dptr(); } + if (scale) { arguments.Scale = scale->dptr(); } + if (bias) { arguments.Bias = bias->dptr(); } + if (add_to_output) { arguments.Residual = add_to_output->dptr(); } + LaunchConvQuantOpImpl(ctx, key, configuraion, arguments); } @@ -116,11 +123,6 @@ class Conv2dQuantKernel final : public user_op::OpKernel, public user_op::CudaGr const user_op::OpKernelCache* cache) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const user_op::Tensor* in_zero_point = ctx->Tensor4ArgNameAndIndex("in_zero_point", 0); - const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); - const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); - const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); const auto& padding_before = ctx->Attr>("padding_before"); @@ -157,12 +159,17 @@ class Conv2dQuantKernel final : public user_op::OpKernel, public user_op::CudaGr n, h, w, c, k, r, s, p, q, padding_before.at(0), padding_before.at(1), strides.at(0), strides.at(1), dilation_rate.at(0), dilation_rate.at(1), cutlass::conv::Mode::kCrossCorrelation); - if (scale) { - LaunchConv2dQuantScaleBiasFusionOp(ctx, key, problem_size, in, weight, in_zero_point, scale, - bias, add_to_output, out); - } else { - UNIMPLEMENTED(); - } + + const user_op::Tensor* in_zero_point = ctx->Tensor4ArgNameAndIndex("in_zero_point", 0); + const user_op::Tensor* in_scale = ctx->Tensor4ArgNameAndIndex("in_scale", 0); + const user_op::Tensor* weight_scale = ctx->Tensor4ArgNameAndIndex("weight_scale", 0); + const user_op::Tensor* weight_acc = ctx->Tensor4ArgNameAndIndex("weight_acc", 0); + const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + const 
user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + + LaunchConv2dQuantOp(ctx, key, problem_size, in, weight, in_zero_point, in_scale, weight_scale, + weight_acc, scale, bias, add_to_output, out); } }; diff --git a/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h b/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h index a013e949166..c20aef145c6 100644 --- a/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h +++ b/oneflow/user/kernels/cutlass_conv2d_operation_cache_key.h @@ -24,6 +24,7 @@ limitations under the License. #include #ifdef WITH_CUTLASS_EXTENSION +#include #include #endif // WITH_CUTLASS_EXTENSION @@ -33,15 +34,12 @@ struct Conv2dOperationCacheKey { cutlass::library::ConvFunctionalKey functional_key; cutlass::library::Conv2dConfiguration configuraion; size_t alignment; - bool fuse_scale_bias; - bool fuse_residual; + size_t kind; + Conv2dOperationCacheKey(cutlass::library::ConvFunctionalKey functional_key, cutlass::library::Conv2dConfiguration configuraion, cutlass::library::ConvArguments arguments) - : functional_key(functional_key), - configuraion(configuraion), - fuse_scale_bias(false), - fuse_residual(false) { + : functional_key(functional_key), configuraion(configuraion), kind(-1) { const auto IsStrideAligned = [&](const std::vector& stride, size_t n) { return std::all_of(stride.cbegin(), stride.cend(), [&](const int64_t& s) { return s % n == 0; }); @@ -64,9 +62,22 @@ struct Conv2dOperationCacheKey { Conv2dOperationCacheKey(cutlass::library::ConvFunctionalKey functional_key, const cutlass::library::Conv2dScaleBiasFusionConfiguration& config, const cutlass::library::ConvScaleBiasFusionArguments& arguments) - : functional_key(functional_key), - fuse_scale_bias(true), - fuse_residual(arguments.Residual != nullptr) { + : functional_key(functional_key) { + if (arguments.Scale) { + kind = arguments.Residual + ? cutlass::library::SingletonKind::kConv2dScaleBiasResidualFusionWithZeroPoint + : cutlass::library::SingletonKind::kConv2dScaleBiasFusionWithZeroPoint; + } else if (arguments.InScale) { + if (arguments.Bias) { + kind = arguments.Residual + ? 
cutlass::library::SingletonKind::kConv2dFilterScaleBiasResidualFusion + : cutlass::library::SingletonKind::kConv2dFilterScaleBiasFusion; + } else { + UNIMPLEMENTED(); + } + } else { + UNIMPLEMENTED(); + } configuraion.problem_size = config.problem_size; configuraion.split_k_mode = config.split_k_mode; configuraion.stride_a = config.stride_a; @@ -78,7 +89,8 @@ struct Conv2dOperationCacheKey { }; CHECK_EQ(reinterpret_cast(arguments.A) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.B) % kCudaAlignSize, 0); - CHECK_EQ(reinterpret_cast(arguments.P) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.FilterScale) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.FilterAcc) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.Scale) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.Bias) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.Residual) % kCudaAlignSize, 0); @@ -142,8 +154,7 @@ struct Conv2dOperationCacheKeyHasher { size_t hash = cutlass::library::ConvFunctionalKeyHasher()(key.functional_key); hash = HashCombine(hash, Conv2dConfigurationHasher()(key.configuraion)); hash = HashCombine(hash, std::hash()(key.alignment)); - hash = HashCombine(hash, std::hash()(key.fuse_scale_bias)); - hash = HashCombine(hash, std::hash()(key.fuse_residual)); + hash = HashCombine(hash, std::hash()(key.kind)); return hash; } }; @@ -157,8 +168,7 @@ inline bool operator==(const cutlass::library::Conv2dConfiguration& lhs, inline bool operator==(const Conv2dOperationCacheKey& lhs, const Conv2dOperationCacheKey& rhs) { return lhs.functional_key == rhs.functional_key && lhs.configuraion == rhs.configuraion - && lhs.alignment == rhs.alignment && lhs.fuse_scale_bias == rhs.fuse_scale_bias - && lhs.fuse_residual == rhs.fuse_residual; + && lhs.alignment == rhs.alignment && lhs.kind == rhs.kind; } } // namespace oneflow diff --git a/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp b/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp index 1d3a65b5cae..d2fa02bb59d 100644 --- a/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp +++ b/oneflow/user/kernels/cutlass_conv_tuner_impl.cpp @@ -288,12 +288,7 @@ class CutlassConvTunerImpl; - CutlassConvTunerImpl() { - singleton = &cutlass::library::CutlassExtensionSingleton::get( - cutlass::library::SingletonKind::kConv2dScaleBiasFusionWithZeroPoint); - residual_singleton = &cutlass::library::CutlassExtensionSingleton::get( - cutlass::library::SingletonKind::kConv2dScaleBiasResidualFusionWithZeroPoint); - } + CutlassConvTunerImpl() {} const cutlass::library::Operation* Find( ep::CudaStream* stream, cutlass::library::ConvFunctionalKey functional_key, @@ -311,8 +306,6 @@ class CutlassConvTunerImpl cache; - const cutlass::library::CutlassExtensionSingleton* singleton; - const cutlass::library::CutlassExtensionSingleton* residual_singleton; }; const cutlass::library::Operation* @@ -375,10 +368,13 @@ CutlassConvTunerImplcuda_arch()); + const cutlass::library::CutlassExtensionSingleton* singleton = + &cutlass::library::CutlassExtensionSingleton::get( + static_cast(cache_key.kind)); + + const cutlass::library::Operation* fastest_operation = FindFastestOperation( + singleton, functional_key, configuraion, benchmark_arguments, benchmark_workspace, + workspace_size, benchmark_stream, stream->cuda_arch()); #ifdef WITH_CUDA_GRAPHS if (stream->IsGraphCapturing()) { @@ -412,9 +408,14 @@ CutlassConvTunerImplcuda_stream(), - stream->cuda_arch()); + + Conv2dOperationCacheKey cache_key(functional_key, configuraion, arguments); + const 
cutlass::library::CutlassExtensionSingleton* singleton = + &cutlass::library::CutlassExtensionSingleton::get( + static_cast(cache_key.kind)); + + return GetOperation(singleton, name, functional_key, configuraion, arguments, workspace, + workspace_size, stream->cuda_stream(), stream->cuda_arch()); } #endif // WITH_CUTLASS_EXTENSION diff --git a/oneflow/user/ops/conv_quant_op.cpp b/oneflow/user/ops/conv_quant_op.cpp index 0fa8d0d637c..6beb0fd55bf 100644 --- a/oneflow/user/ops/conv_quant_op.cpp +++ b/oneflow/user/ops/conv_quant_op.cpp @@ -84,41 +84,55 @@ Maybe InferTensorDesc4Conv(user_op::InferContext* ctx) { CHECK_EQ_OR_RETURN(weight.shape(), Shape(weight_shape)); } - bool has_scale = ctx->has_input("scale", 0); - if (has_scale) { + const user_op::TensorDesc& in_zero_point = ctx->InputTensorDesc("in_zero_point", 0); + CHECK_EQ_OR_RETURN(in_zero_point.shape().Count(0), 1); + + if (ctx->has_input("scale", 0)) { + CHECK_OR_RETURN(ctx->has_input("bias", 0)); const user_op::TensorDesc& scale = ctx->InputTensorDesc("scale", 0); CHECK_EQ_OR_RETURN(scale.shape(), Shape({filters})); - } - bool has_bias = ctx->has_input("bias", 0); - if (has_bias) { const user_op::TensorDesc& bias = ctx->InputTensorDesc("bias", 0); CHECK_EQ_OR_RETURN(bias.shape(), Shape({filters})); } - if (has_scale || has_bias) { CHECK_OR_RETURN(has_scale && has_bias); } + if (ctx->has_input("in_scale", 0)) { + CHECK_OR_RETURN(ctx->has_input("weight_scale", 0)); + CHECK_OR_RETURN(ctx->has_input("weight_acc", 0)); + const user_op::TensorDesc& in_scale = ctx->InputTensorDesc("in_scale", 0); + CHECK_EQ_OR_RETURN(in_scale.shape().Count(0), 1); + const user_op::TensorDesc& weight_scale = ctx->InputTensorDesc("weight_scale", 0); + CHECK_EQ_OR_RETURN(weight_scale.shape(), Shape({filters})); + const user_op::TensorDesc& weight_acc = ctx->InputTensorDesc("weight_acc", 0); + CHECK_EQ_OR_RETURN(weight_acc.shape(), Shape({filters})); + if (ctx->has_input("bias", 0)) { + const user_op::TensorDesc& bias = ctx->InputTensorDesc("bias", 0); + CHECK_EQ_OR_RETURN(bias.shape(), Shape({filters})); + } + } return Maybe::Ok(); } Maybe GetSbpSignatures4Conv(user_op::SbpContext* ctx) { + std::vector split_args; + std::vector broadcast_args; + split_args.emplace_back("in", 0); + split_args.emplace_back("out", 0); + if (ctx->user_op_conf().has_input("_add_to_output", 0)) { + split_args.emplace_back("_add_to_output", 0); + } + broadcast_args.emplace_back("weight", 0); + broadcast_args.emplace_back("in_zero_point", 0); + if (ctx->user_op_conf().has_input("bias", 0)) { broadcast_args.emplace_back("bias", 0); } if (ctx->user_op_conf().has_input("scale", 0)) { CHECK_OR_RETURN(ctx->user_op_conf().has_input("bias", 0)); - ctx->NewBuilder() - .Split(ctx->inputs(), 0) - .Split(user_op::OpArg("in", 0), 0) - .Broadcast(user_op::OpArg("weight", 0)) - .Broadcast(user_op::OpArg("in_zero_point", 0)) - .Broadcast(user_op::OpArg("scale", 0)) - .Broadcast(user_op::OpArg("bias", 0)) - .Split(user_op::OpArg("out", 0), 0) - .Build(); - } else { - ctx->NewBuilder() - .Split(ctx->inputs(), 0) - .Split(user_op::OpArg("in", 0), 0) - .Broadcast(user_op::OpArg("weight", 0)) - .Broadcast(user_op::OpArg("in_zero_point", 0)) - .Split(user_op::OpArg("out", 0), 0) - .Build(); + broadcast_args.emplace_back("scale", 0); + } else if (ctx->user_op_conf().has_input("in_scale", 0)) { + CHECK_OR_RETURN(ctx->user_op_conf().has_input("weight_scale", 0)); + CHECK_OR_RETURN(ctx->user_op_conf().has_input("weight_acc", 0)); + broadcast_args.emplace_back("in_scale", 0); + 
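+    // NOTE: "in_scale" is a single per-tensor scalar while "weight_scale" and
+    // "weight_acc" hold one value per output channel, so all of them must be
+    // broadcast rather than split across ranks.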
broadcast_args.emplace_back("weight_scale", 0);
+    broadcast_args.emplace_back("weight_acc", 0);
   }
+  ctx->NewBuilder().Split(split_args, 0).Broadcast(broadcast_args).Build();
   return Maybe::Ok();
 }

From a102cc02e871d18256cfb466029cca25252eaada Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Wed, 6 Sep 2023 16:01:09 +0000
Subject: [PATCH 53/65] matmul quant with filter scale

---
 oneflow/core/functional/functional_api.yaml  |  12 +-
 oneflow/core/functional/impl/nn_functor.cpp  | 146 ++++++++++++------
 oneflow/ir/include/OneFlow/OneFlowUserOps.td |  14 +-
 .../PDLL/FuseOpsWithBackwardImplPattern.pdll |  24 ++-
 .../cutlass_gemm_operation_cache_key.h       |  37 +++--
 .../user/kernels/cutlass_gemm_tuner_impl.cpp |  53 +++++--
 .../user/kernels/fused_glu_quant_kernel.cu   |  53 ++++---
 oneflow/user/kernels/matmul_quant_kernels.cu |  58 ++++---
 oneflow/user/ops/fused_glu_quant_op.cpp      | 117 ++++++++------
 oneflow/user/ops/matmul_quant_op.cpp         | 136 ++++++++--------
 10 files changed, 421 insertions(+), 229 deletions(-)

diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml
index 5abf4134dbb..adba9ce38e4 100644
--- a/oneflow/core/functional/functional_api.yaml
+++ b/oneflow/core/functional/functional_api.yaml
@@ -1066,9 +1066,15 @@

 - name: "matmul_quant"
   signature:
-    'Tensor (Tensor a, Tensor b, Tensor scale=None, Tensor bias=None,
-             Bool transpose_a=False, Bool transpose_b=False,
-             Double alpha=1.0, DataType output_dtype=None) => MatmulQuant'
+    [
+      'Tensor (Tensor a, Tensor b, Tensor scale=None, Tensor bias=None,
+               Bool transpose_a=False, Bool transpose_b=False,
+               Double alpha=1.0, DataType output_dtype=None) => MatmulQuant',
+      'Tensor (Tensor a, Tensor b, Tensor input_zero_point, Tensor input_scale, Tensor weight_scale,
+               Tensor weight_acc, Tensor bias=None,
+               Bool transpose_a=False, Bool transpose_b=False,
+               Double alpha=1.0, DataType output_dtype=None) => MatmulQuant'
+    ]
   bind_python: True

 - name: "conv3d"
diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp
index d03eaf654ed..47231aba953 100644
--- a/oneflow/core/functional/impl/nn_functor.cpp
+++ b/oneflow/core/functional/impl/nn_functor.cpp
@@ -274,9 +274,10 @@ class ConvQuantWithInputScaleBaseFunctor {
   int32_t num_spatial_dims_;
 };

-class Conv2dQuantWithInputScaleFunctor : public ConvQuantWithInputScaleBaseFunctor {
+class Conv2dQuantWithFilterScaleFunctor : public ConvQuantWithInputScaleBaseFunctor {
  public:
-  Conv2dQuantWithInputScaleFunctor() : ConvQuantWithInputScaleBaseFunctor(/*num_spatial_dims_=*/2) {
+  Conv2dQuantWithFilterScaleFunctor()
+      : ConvQuantWithInputScaleBaseFunctor(/*num_spatial_dims_=*/2) {
     conv_op_ = CHECK_JUST(one::OpBuilder("conv2d_quant")
                               .Input("in")
                               .Input("weight")
@@ -450,47 +451,6 @@ class MatMulNoBroadCastFunctor {
   }
 };

-class MatMulQuantFunctor {
- public:
-  MatMulQuantFunctor() {
-    matmul_op_ =
-        CHECK_JUST(one::OpBuilder("matmul_quant").Input("a").Input("b").Output("out").Build());
-    matmul_scale_bias_op_ = CHECK_JUST(one::OpBuilder("matmul_quant")
-                                           .Input("a")
-                                           .Input("b")
-                                           .Input("scale")
-                                           .Input("bias")
-                                           .Output("out")
-                                           .Build());
-  }
-  Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& a,
-                           const std::shared_ptr<one::Tensor>& b,
-                           const Optional<one::Tensor>& scale, const Optional<one::Tensor>& bias,
-                           const bool& transpose_a, const bool& transpose_b, const double& alpha,
-                           const Optional<Symbol<DType>>& output_dtype) const {
-    CHECK_OR_RETURN(!transpose_a)
-        << "the first input should not be transposed for quantized matmul.";
-    CHECK_OR_RETURN(transpose_b) << "the second input should be
transposed for quantized matmul."; - CHECK_EQ_OR_RETURN(alpha, 1) << "alpha should be 1 for quantized matmul."; - if (scale || bias) { - CHECK_OR_RETURN(scale && bias) << "scale and bias must both be given or not."; - } - auto& attrs = - THREAD_CACHED_MUTABLE_ATTR_MAP("transpose_a", "transpose_b", "alpha", "out_dtype"); - attrs.SetAllAttrs(transpose_a, transpose_b, alpha, - output_dtype.value_or(DType::Float())->data_type()); - if (scale) { - return OpInterpUtil::Dispatch(*matmul_scale_bias_op_, {a, b, JUST(scale), JUST(bias)}, - attrs); - } - return OpInterpUtil::Dispatch(*matmul_op_, {a, b}, attrs); - } - - private: - std::shared_ptr matmul_op_; - std::shared_ptr matmul_scale_bias_op_; -}; - class MatMulFunctor { public: MatMulFunctor() { @@ -616,6 +576,101 @@ class BatchMatMulFunctor { std::shared_ptr batch_matmul_op_; }; +class MatMulQuantFunctor { + public: + MatMulQuantFunctor() { + matmul_op_ = + CHECK_JUST(one::OpBuilder("matmul_quant").Input("a").Input("b").Output("out").Build()); + matmul_scale_bias_op_ = CHECK_JUST(one::OpBuilder("matmul_quant") + .Input("a") + .Input("b") + .Input("scale") + .Input("bias") + .Output("out") + .Build()); + } + Maybe operator()(const std::shared_ptr& a, + const std::shared_ptr& b, + const Optional& scale, const Optional& bias, + const bool& transpose_a, const bool& transpose_b, const double& alpha, + const Optional>& output_dtype) const { + CHECK_OR_RETURN(!transpose_a) + << "the first input should not be transposed for quantized matmul."; + CHECK_OR_RETURN(transpose_b) << "the second input should be transposed for quantized matmul."; + CHECK_EQ_OR_RETURN(alpha, 1) << "alpha should be 1 for quantized matmul."; + if (scale || bias) { + CHECK_OR_RETURN(scale && bias) << "scale and bias must both be given or not."; + } + auto& attrs = + THREAD_CACHED_MUTABLE_ATTR_MAP("transpose_a", "transpose_b", "alpha", "out_dtype"); + attrs.SetAllAttrs(transpose_a, transpose_b, alpha, + output_dtype.value_or(DType::Float())->data_type()); + if (scale) { + return OpInterpUtil::Dispatch(*matmul_scale_bias_op_, {a, b, JUST(scale), JUST(bias)}, + attrs); + } + return OpInterpUtil::Dispatch(*matmul_op_, {a, b}, attrs); + } + + private: + std::shared_ptr matmul_op_; + std::shared_ptr matmul_scale_bias_op_; +}; + +class MatMulQuantWithFilterScaleFunctor { + public: + MatMulQuantWithFilterScaleFunctor() { + matmul_scale_op_ = CHECK_JUST(one::OpBuilder("matmul_quant") + .Input("a") + .Input("b") + .Input("in_zero_point") + .Input("in_scale") + .Input("weight_scale") + .Input("weight_acc") + .Output("out") + .Build()); + matmul_scale_bias_op_ = CHECK_JUST(one::OpBuilder("matmul_quant") + .Input("a") + .Input("b") + .Input("in_zero_point") + .Input("in_scale") + .Input("weight_scale") + .Input("weight_acc") + .Input("bias") + .Output("out") + .Build()); + } + Maybe operator()(const std::shared_ptr& a, + const std::shared_ptr& b, + const std::shared_ptr& input_zero_point, + const std::shared_ptr& input_scale, + const std::shared_ptr& weight_scale, + const std::shared_ptr& weight_acc, + const Optional& bias, const bool& transpose_a, + const bool& transpose_b, const double& alpha, + const Optional>& output_dtype) const { + CHECK_OR_RETURN(!transpose_a) + << "the first input should not be transposed for quantized matmul."; + CHECK_OR_RETURN(transpose_b) << "the second input should be transposed for quantized matmul."; + CHECK_EQ_OR_RETURN(alpha, 1) << "alpha should be 1 for quantized matmul."; + auto& attrs = + THREAD_CACHED_MUTABLE_ATTR_MAP("transpose_a", "transpose_b", 
"alpha", "out_dtype"); + attrs.SetAllAttrs(transpose_a, transpose_b, alpha, + output_dtype.value_or(DType::Float())->data_type()); + if (bias) { + return OpInterpUtil::Dispatch( + *matmul_scale_bias_op_, + {a, b, input_zero_point, input_scale, weight_scale, weight_acc, JUST(bias)}, attrs); + } + return OpInterpUtil::Dispatch( + *matmul_scale_op_, {a, b, input_zero_point, input_scale, weight_scale, weight_acc}, attrs); + } + + private: + std::shared_ptr matmul_scale_op_; + std::shared_ptr matmul_scale_bias_op_; +}; + class VectorMatrixProductFunctor { public: VectorMatrixProductFunctor() { @@ -5607,12 +5662,11 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("Deconv1d"); m.add_functor("Deconv2d"); m.add_functor("Deconv3d"); - m.add_functor("Conv2dQuant"); - m.add_functor("Conv2dQuant"); + m.add_functor("Conv2dQuant"); m.add_functor("EmbeddingReNorm"); m.add_functor("Embedding"); m.add_functor("MatMul"); - m.add_functor("MatmulQuant"); + m.add_functor("MatmulQuant"); m.add_functor("MatMulNoBroadCast"); m.add_functor("BatchMatMul"); m.add_functor("MatrixVectorProduct"); diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index e96bec03d3e..af4d22d8c75 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -5379,6 +5379,10 @@ def OneFlow_MatmulQuantOp : OneFlow_BaseOp<"matmul_quant", [NoMemoryEffect, Attr let input = (ins OneFlow_Tensor:$a, OneFlow_Tensor:$b, + Optional:$in_zero_point, + Optional:$in_scale, + Optional:$weight_scale, + Optional:$weight_acc, Optional:$scale, Optional:$bias, Optional:$_add_to_output @@ -5672,9 +5676,15 @@ def OneFlow_FusedGluQuantOp : OneFlow_BaseOp<"fused_glu_quant", [NoMemoryEffect, let input = (ins OneFlow_Tensor:$x, OneFlow_Tensor:$w, - OneFlow_Tensor:$scale, - OneFlow_Tensor:$bias, + Optional:$in_zero_point, + Optional:$in_scale, + Optional:$weight_scale, + Optional:$weight_acc, + Optional:$scale, + Optional:$bias, Optional:$v, + Optional:$v_weight_scale, + Optional:$v_weight_acc, Optional:$v_scale, Optional:$v_bias ); diff --git a/oneflow/ir/lib/OneFlow/PDLL/FuseOpsWithBackwardImplPattern.pdll b/oneflow/ir/lib/OneFlow/PDLL/FuseOpsWithBackwardImplPattern.pdll index 028ecd28f6b..2d527f049c0 100644 --- a/oneflow/ir/lib/OneFlow/PDLL/FuseOpsWithBackwardImplPattern.pdll +++ b/oneflow/ir/lib/OneFlow/PDLL/FuseOpsWithBackwardImplPattern.pdll @@ -52,7 +52,29 @@ Pattern { let gelu_out = op(hidden_states.0,gate_activate.0){device_name = device_name, device_tag = device_tag}-> (out: Type); rewrite gelu_out with{ - let fused_gelu_out = op(x, w, scale, bias){activation = attr<"\"gelu\"">, operand_segment_sizes = attr<"array">, device_name = device_name, device_tag = device_tag, out_dtype = out_dtype}-> (out, matmul_wx_out); + let fused_gelu_out = op(x, w, scale, bias){activation = attr<"\"gelu\"">, operand_segment_sizes = attr<"array">, device_name = device_name, device_tag = device_tag, out_dtype = out_dtype}-> (out, matmul_wx_out); + CopyUserOpAttrs(gelu_out, fused_gelu_out); + replace gelu_out with fused_gelu_out.0; + replace matmul_wx_add with fused_gelu_out.1; + }; +} + +Pattern { + let device_name: Attr; + let device_tag: Attr; + let out_dtype: Attr; + + let matmul_wx_add = op( + x: Value, w: Value, in_zero_point: Value, in_scale: Value, weight_scale: Value, weight_acc: Value, bias: Value) + {device_name = device_name, device_tag = device_tag, alpha = attr<"1.000000e+00 : f64">, out_dtype = out_dtype} -> (matmul_wx_out: Type); + + let hidden_states = 
op(matmul_wx_add.0){device_name = device_name, device_tag = device_tag}; + let gate = op(matmul_wx_add.0){device_name = device_name, device_tag = device_tag}; + let gate_activate = op(gate.0){device_name = device_name, device_tag = device_tag}; + let gelu_out = op(hidden_states.0,gate_activate.0){device_name = device_name, device_tag = device_tag}-> (out: Type); + + rewrite gelu_out with{ + let fused_gelu_out = op(x, w, in_zero_point, in_scale, weight_scale, weight_acc, bias){activation = attr<"\"gelu\"">, operand_segment_sizes = attr<"array">, device_name = device_name, device_tag = device_tag, out_dtype = out_dtype}-> (out, matmul_wx_out); CopyUserOpAttrs(gelu_out, fused_gelu_out); replace gelu_out with fused_gelu_out.0; replace matmul_wx_add with fused_gelu_out.1; diff --git a/oneflow/user/kernels/cutlass_gemm_operation_cache_key.h b/oneflow/user/kernels/cutlass_gemm_operation_cache_key.h index eb29aec0658..4b025d16b73 100644 --- a/oneflow/user/kernels/cutlass_gemm_operation_cache_key.h +++ b/oneflow/user/kernels/cutlass_gemm_operation_cache_key.h @@ -24,6 +24,7 @@ limitations under the License. #include #ifdef WITH_CUTLASS_EXTENSION +#include #include #endif // WITH_CUTLASS_EXTENSION @@ -33,15 +34,12 @@ struct GemmOperationCacheKey { cutlass::library::GemmFunctionalKey functional_key; cutlass::library::GemmConfiguration configuraion; size_t alignment; - bool fuse_scale_bias; - bool fuse_residual; + size_t kind; + GemmOperationCacheKey(const cutlass::library::GemmFunctionalKey& functional_key, const cutlass::library::GemmConfiguration& configuraion, const cutlass::library::GemmArguments& arguments) - : functional_key(functional_key), - configuraion(configuraion), - fuse_scale_bias(false), - fuse_residual(false) { + : functional_key(functional_key), configuraion(configuraion), kind(-1) { CHECK_EQ(reinterpret_cast(arguments.A) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.B) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.C) % kCudaAlignSize, 0); @@ -60,9 +58,22 @@ struct GemmOperationCacheKey { GemmOperationCacheKey(cutlass::library::GemmFunctionalKey functional_key, const cutlass::library::GemmScaleBiasFusionConfiguration& config, const cutlass::library::GemmScaleBiasFusionArguments& arguments) - : functional_key(functional_key), - fuse_scale_bias(true), - fuse_residual(arguments.Residual != nullptr) { + : functional_key(functional_key) { + if (arguments.Scale) { + kind = arguments.Residual ? cutlass::library::SingletonKind::kGemmScaleBiasResidualFusion + : cutlass::library::SingletonKind::kGemmScaleBiasFusion; + } else if (arguments.FilterScale) { + if (arguments.Bias) { + kind = arguments.Residual + ? cutlass::library::SingletonKind::kGemmFilterScaleBiasResidualFusion + : cutlass::library::SingletonKind::kGemmFilterScaleBiasFusion; + } else { + kind = arguments.Residual ? 
cutlass::library::SingletonKind::kGemmFilterScaleResidualFusion + : cutlass::library::SingletonKind::kGemmFilterScaleFusion; + } + } else { + UNIMPLEMENTED(); + } configuraion.problem_size = config.problem_size; configuraion.split_k_slices = config.split_k_slices; configuraion.lda = config.lda; @@ -71,6 +82,8 @@ struct GemmOperationCacheKey { configuraion.ldd = config.ldd; CHECK_EQ(reinterpret_cast(arguments.A) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.B) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.FilterScale) % kCudaAlignSize, 0); + CHECK_EQ(reinterpret_cast(arguments.FilterAcc) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.Scale) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.Bias) % kCudaAlignSize, 0); CHECK_EQ(reinterpret_cast(arguments.Residual) % kCudaAlignSize, 0); @@ -114,8 +127,7 @@ struct GemmOperationCacheKeyHasher { size_t hash = cutlass::library::GemmFunctionalKeyHasher()(key.functional_key); hash = HashCombine(hash, GemmConfigurationHasher()(key.configuraion)); hash = HashCombine(hash, std::hash()(key.alignment)); - hash = HashCombine(hash, std::hash()(key.fuse_scale_bias)); - hash = HashCombine(hash, std::hash()(key.fuse_residual)); + hash = HashCombine(hash, std::hash()(key.kind)); return hash; } }; @@ -128,8 +140,7 @@ inline bool operator==(const cutlass::library::GemmConfiguration& lhs, inline bool operator==(const GemmOperationCacheKey& lhs, const GemmOperationCacheKey& rhs) { return lhs.functional_key == rhs.functional_key && lhs.configuraion == rhs.configuraion - && lhs.alignment == rhs.alignment && lhs.fuse_scale_bias == rhs.fuse_scale_bias - && lhs.fuse_residual == rhs.fuse_residual; + && lhs.alignment == rhs.alignment && lhs.kind == rhs.kind; } } // namespace oneflow diff --git a/oneflow/user/kernels/cutlass_gemm_tuner_impl.cpp b/oneflow/user/kernels/cutlass_gemm_tuner_impl.cpp index abb69aed600..e51163cf7fe 100644 --- a/oneflow/user/kernels/cutlass_gemm_tuner_impl.cpp +++ b/oneflow/user/kernels/cutlass_gemm_tuner_impl.cpp @@ -180,12 +180,7 @@ class CutlassGemmTunerImpl; - CutlassGemmTunerImpl() { - singleton = &cutlass::library::CutlassExtensionSingleton::get( - cutlass::library::SingletonKind::kGemmScaleBiasFusion); - residual_singleton = &cutlass::library::CutlassExtensionSingleton::get( - cutlass::library::SingletonKind::kGemmScaleBiasResidualFusion); - } + CutlassGemmTunerImpl() {} const cutlass::library::Operation* Find( ep::CudaStream* stream, cutlass::library::GemmFunctionalKey functional_key, @@ -203,8 +198,6 @@ class CutlassGemmTunerImpl cache; - const cutlass::library::CutlassExtensionSingleton* singleton; - const cutlass::library::CutlassExtensionSingleton* residual_singleton; }; const cutlass::library::Operation* @@ -241,6 +234,24 @@ CutlassGemmTunerImplcuda_arch()); + const cutlass::library::CutlassExtensionSingleton* singleton = + &cutlass::library::CutlassExtensionSingleton::get( + static_cast(cache_key.kind)); + + const cutlass::library::Operation* fastest_operation = FindFastestOperation( + singleton, functional_key, configuraion, benchmark_arguments, benchmark_workspace, + workspace_size, benchmark_stream, stream->cuda_arch()); #ifdef WITH_CUDA_GRAPHS if (stream->IsGraphCapturing()) { @@ -275,6 +289,10 @@ CutlassGemmTunerImpl(benchmark_arguments.A))); OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.B))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.P))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.InScale))); + 
OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.FilterScale))); + OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.FilterAcc))); OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.Scale))); OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.Bias))); OF_CUDA_CHECK(cudaFree(const_cast(benchmark_arguments.Residual))); @@ -300,9 +318,14 @@ CutlassGemmTunerImplcuda_stream(), - stream->cuda_arch()); + + GemmOperationCacheKey cache_key(functional_key, configuraion, arguments); + const cutlass::library::CutlassExtensionSingleton* singleton = + &cutlass::library::CutlassExtensionSingleton::get( + static_cast(cache_key.kind)); + + return GetOperation(singleton, name, functional_key, configuraion, arguments, workspace, + workspace_size, stream->cuda_stream(), stream->cuda_arch()); } #endif // WITH_CUTLASS_EXTENSION diff --git a/oneflow/user/kernels/fused_glu_quant_kernel.cu b/oneflow/user/kernels/fused_glu_quant_kernel.cu index 0134790cdfe..802e8373130 100644 --- a/oneflow/user/kernels/fused_glu_quant_kernel.cu +++ b/oneflow/user/kernels/fused_glu_quant_kernel.cu @@ -41,17 +41,20 @@ namespace oneflow { namespace { -void LaunchMatmulQuantScaleBiasFusionOp(user_op::KernelComputeContext* ctx, - const cutlass::library::GemmFunctionalKey& key, - const cutlass::gemm::GemmCoord& problem_size, - const user_op::Tensor* a, const user_op::Tensor* b, - const user_op::Tensor* scale, const user_op::Tensor* bias, - const user_op::Tensor* add_to_output, - user_op::Tensor* out) { +void LaunchMatmulQuantOp(user_op::KernelComputeContext* ctx, + const cutlass::library::GemmFunctionalKey& key, + const cutlass::gemm::GemmCoord& problem_size, const user_op::Tensor* a, + const user_op::Tensor* b, const user_op::Tensor* in_zero_point, + const user_op::Tensor* in_scale, const user_op::Tensor* weight_scale, + const user_op::Tensor* weight_acc, const user_op::Tensor* scale, + const user_op::Tensor* bias, const user_op::Tensor* add_to_output, + user_op::Tensor* out) { cutlass::library::GemmScaleBiasFusionConfiguration configuraion; configuraion.problem_size = problem_size; configuraion.lda = problem_size.k(); configuraion.ldb = problem_size.k(); + configuraion.ld_filter_scale = 0; + configuraion.ld_filter_acc = 0; configuraion.ld_scale = 0; configuraion.ld_bias = 0; configuraion.ldr = problem_size.n(); @@ -62,14 +65,22 @@ void LaunchMatmulQuantScaleBiasFusionOp(user_op::KernelComputeContext* ctx, cutlass::library::GemmScaleBiasFusionArguments arguments; arguments.A = a->dptr(); arguments.B = b->dptr(); - arguments.Scale = scale->dptr(); - arguments.Bias = bias->dptr(); - if (add_to_output) { - arguments.Residual = add_to_output->dptr(); - } else { - arguments.Residual = nullptr; - } arguments.D = out->mut_dptr(); + arguments.P = nullptr; + arguments.InScale = nullptr; + arguments.FilterScale = nullptr; + arguments.FilterAcc = nullptr; + arguments.Scale = nullptr; + arguments.Bias = nullptr; + arguments.Residual = nullptr; + + if (in_zero_point) { arguments.P = in_zero_point->dptr(); } + if (in_scale) { arguments.InScale = in_scale->dptr(); } + if (weight_scale) { arguments.FilterScale = weight_scale->dptr(); } + if (weight_acc) { arguments.FilterAcc = weight_acc->dptr(); } + if (scale) { arguments.Scale = scale->dptr(); } + if (bias) { arguments.Bias = bias->dptr(); } + if (add_to_output) { arguments.Residual = add_to_output->dptr(); } user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); auto* stream = ctx->stream()->As(); @@ -261,9 +272,6 @@ class GpuFusedGluQuantKernel final : public 
user_op::OpKernel, public user_op::C const user_op::OpKernelCache* cache) const override { const user_op::Tensor* input_x = ctx->Tensor4ArgNameAndIndex("x", 0); const user_op::Tensor* input_w = ctx->Tensor4ArgNameAndIndex("w", 0); - const user_op::Tensor* input_scale = ctx->Tensor4ArgNameAndIndex("scale", 0); - const user_op::Tensor* input_bias = ctx->Tensor4ArgNameAndIndex("bias", 0); - user_op::Tensor* out_y = ctx->Tensor4ArgNameAndIndex("y", 0); user_op::Tensor* out_matmul_wx = ctx->Tensor4ArgNameAndIndex("matmul_wx", 0); user_op::Tensor* out_matmul_vx = nullptr; @@ -308,8 +316,15 @@ class GpuFusedGluQuantKernel final : public user_op::OpKernel, public user_op::C } cutlass::gemm::GemmCoord problem_size(m, 2 * n, k); - LaunchMatmulQuantScaleBiasFusionOp(ctx, key, problem_size, input_x, input_w, input_scale, - input_bias, nullptr, out_matmul_wx); + const user_op::Tensor* in_zero_point = ctx->Tensor4ArgNameAndIndex("in_zero_point", 0); + const user_op::Tensor* in_scale = ctx->Tensor4ArgNameAndIndex("in_scale", 0); + const user_op::Tensor* weight_scale = ctx->Tensor4ArgNameAndIndex("weight_scale", 0); + const user_op::Tensor* weight_acc = ctx->Tensor4ArgNameAndIndex("weight_acc", 0); + const user_op::Tensor* input_scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + const user_op::Tensor* input_bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + + LaunchMatmulQuantOp(ctx, key, problem_size, input_x, input_w, in_zero_point, in_scale, + weight_scale, weight_acc, input_scale, input_bias, nullptr, out_matmul_wx); // dispatch according to activation type DispatchActivationType( diff --git a/oneflow/user/kernels/matmul_quant_kernels.cu b/oneflow/user/kernels/matmul_quant_kernels.cu index eeb8f23ad43..d4ac0591aa1 100644 --- a/oneflow/user/kernels/matmul_quant_kernels.cu +++ b/oneflow/user/kernels/matmul_quant_kernels.cu @@ -31,17 +31,20 @@ namespace oneflow { namespace { -void LaunchMatmulQuantScaleBiasFusionOp(user_op::KernelComputeContext* ctx, - const cutlass::library::GemmFunctionalKey& key, - const cutlass::gemm::GemmCoord& problem_size, - const user_op::Tensor* a, const user_op::Tensor* b, - const user_op::Tensor* scale, const user_op::Tensor* bias, - const user_op::Tensor* add_to_output, - user_op::Tensor* out) { +void LaunchMatmulQuantOp(user_op::KernelComputeContext* ctx, + const cutlass::library::GemmFunctionalKey& key, + const cutlass::gemm::GemmCoord& problem_size, const user_op::Tensor* a, + const user_op::Tensor* b, const user_op::Tensor* in_zero_point, + const user_op::Tensor* in_scale, const user_op::Tensor* weight_scale, + const user_op::Tensor* weight_acc, const user_op::Tensor* scale, + const user_op::Tensor* bias, const user_op::Tensor* add_to_output, + user_op::Tensor* out) { cutlass::library::GemmScaleBiasFusionConfiguration configuraion; configuraion.problem_size = problem_size; configuraion.lda = problem_size.k(); configuraion.ldb = problem_size.k(); + configuraion.ld_filter_scale = 0; + configuraion.ld_filter_acc = 0; configuraion.ld_scale = 0; configuraion.ld_bias = 0; configuraion.ldr = problem_size.n(); @@ -52,14 +55,22 @@ void LaunchMatmulQuantScaleBiasFusionOp(user_op::KernelComputeContext* ctx, cutlass::library::GemmScaleBiasFusionArguments arguments; arguments.A = a->dptr(); arguments.B = b->dptr(); - arguments.Scale = scale->dptr(); - arguments.Bias = bias->dptr(); - if (add_to_output) { - arguments.Residual = add_to_output->dptr(); - } else { - arguments.Residual = nullptr; - } arguments.D = out->mut_dptr(); + arguments.P = nullptr; + arguments.InScale = nullptr; + 
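+  // Default every optional fusion operand to null first; only the tensors that
+  // the op actually provides are wired up below.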
arguments.FilterScale = nullptr; + arguments.FilterAcc = nullptr; + arguments.Scale = nullptr; + arguments.Bias = nullptr; + arguments.Residual = nullptr; + + if (in_zero_point) { arguments.P = in_zero_point->dptr(); } + if (in_scale) { arguments.InScale = in_scale->dptr(); } + if (weight_scale) { arguments.FilterScale = weight_scale->dptr(); } + if (weight_acc) { arguments.FilterAcc = weight_acc->dptr(); } + if (scale) { arguments.Scale = scale->dptr(); } + if (bias) { arguments.Bias = bias->dptr(); } + if (add_to_output) { arguments.Residual = add_to_output->dptr(); } user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); auto* stream = ctx->stream()->As(); @@ -108,9 +119,6 @@ class MatmulQuantKernel final : public user_op::OpKernel { void Compute(user_op::KernelComputeContext* ctx) const override { const user_op::Tensor* a = ctx->Tensor4ArgNameAndIndex("a", 0); const user_op::Tensor* b = ctx->Tensor4ArgNameAndIndex("b", 0); - const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); - const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); - const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); CHECK(!ctx->Attr("transpose_a")); @@ -149,12 +157,16 @@ class MatmulQuantKernel final : public user_op::OpKernel { cutlass::gemm::GemmCoord problem_size(m, n, k); - if (scale) { - LaunchMatmulQuantScaleBiasFusionOp(ctx, key, problem_size, a, b, scale, bias, add_to_output, - out); - } else { - UNIMPLEMENTED(); - } + const user_op::Tensor* in_zero_point = ctx->Tensor4ArgNameAndIndex("in_zero_point", 0); + const user_op::Tensor* in_scale = ctx->Tensor4ArgNameAndIndex("in_scale", 0); + const user_op::Tensor* weight_scale = ctx->Tensor4ArgNameAndIndex("weight_scale", 0); + const user_op::Tensor* weight_acc = ctx->Tensor4ArgNameAndIndex("weight_acc", 0); + const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + + LaunchMatmulQuantOp(ctx, key, problem_size, a, b, in_zero_point, in_scale, weight_scale, + weight_acc, scale, bias, add_to_output, out); } }; diff --git a/oneflow/user/ops/fused_glu_quant_op.cpp b/oneflow/user/ops/fused_glu_quant_op.cpp index 569e4d66de4..acf153b65b8 100644 --- a/oneflow/user/ops/fused_glu_quant_op.cpp +++ b/oneflow/user/ops/fused_glu_quant_op.cpp @@ -28,6 +28,31 @@ namespace oneflow { CHECK_OR_RETURN(ctx->user_op_conf().has_input("v_bias", 0)) << "expected v_bias for split mode"; } + std::vector scalar_args; + if (ctx->user_op_conf().has_input("in_zero_point", 0)) { + scalar_args.emplace_back("in_zero_point", 0); + } + if (ctx->user_op_conf().has_input("in_scale", 0)) { scalar_args.emplace_back("in_scale", 0); } + + std::vector vector_args; + if (ctx->user_op_conf().has_input("weight_scale", 0)) { + vector_args.emplace_back("weight_scale", 0); + } + if (ctx->user_op_conf().has_input("weight_acc", 0)) { vector_args.emplace_back("weight_acc", 0); } + if (ctx->user_op_conf().has_input("scale", 0)) { vector_args.emplace_back("scale", 0); } + if (ctx->user_op_conf().has_input("bias", 0)) { vector_args.emplace_back("bias", 0); } + + if (is_split_mode) { + if (ctx->user_op_conf().has_input("v_weight_scale", 0)) { + vector_args.emplace_back("v_weight_scale", 0); + } + if (ctx->user_op_conf().has_input("v_weight_acc", 0)) { + 
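+      // In split mode the gate branch (v) carries its own per-channel
+      // quantization parameters, which follow the same SBP as the w-branch ones.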
vector_args.emplace_back("v_weight_acc", 0); + } + if (ctx->user_op_conf().has_input("v_scale", 0)) { vector_args.emplace_back("v_scale", 0); } + if (ctx->user_op_conf().has_input("v_bias", 0)) { vector_args.emplace_back("v_bias", 0); } + } + // data parallelism for (int64_t i = 0; i < ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0).shape().NumAxes() - 1; ++i) { @@ -35,19 +60,17 @@ namespace oneflow { ctx->NewBuilder() .Split(user_op::OpArg("x", 0), i) .Broadcast(user_op::OpArg("w", 0)) - .Broadcast(user_op::OpArg("scale", 0)) - .Broadcast(user_op::OpArg("bias", 0)) .Broadcast(user_op::OpArg("v", 0)) - .Broadcast(user_op::OpArg("v_scale", 0)) - .Broadcast(user_op::OpArg("v_bias", 0)) + .Broadcast(scalar_args) + .Broadcast(vector_args) .Split(ctx->outputs(), i) .Build(); } else { ctx->NewBuilder() .Split(user_op::OpArg("x", 0), i) .Broadcast(user_op::OpArg("w", 0)) - .Broadcast(user_op::OpArg("scale", 0)) - .Broadcast(user_op::OpArg("bias", 0)) + .Broadcast(scalar_args) + .Broadcast(vector_args) .Split(ctx->outputs(), i) .Build(); } @@ -58,11 +81,9 @@ namespace oneflow { ctx->NewBuilder() .Broadcast(user_op::OpArg("x", 0)) .Split(user_op::OpArg("w", 0), 0) - .Split(user_op::OpArg("scale", 0), 0) - .Split(user_op::OpArg("bias", 0), 0) .Split(user_op::OpArg("v", 0), 0) - .Split(user_op::OpArg("v_scale", 0), 0) - .Split(user_op::OpArg("v_bias", 0), 0) + .Broadcast(scalar_args) + .Split(vector_args, 0) .Split(ctx->outputs(), ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0).shape().NumAxes() - 1) .Build(); @@ -80,10 +101,6 @@ namespace oneflow { // check whether the user provide weight tensor v bool is_split_mode = false; if (ctx->has_input("v", 0)) { is_split_mode = true; } - if (is_split_mode) { - CHECK_OR_RETURN(ctx->has_input("v_scale", 0)) << "expected v_scale for split mode"; - CHECK_OR_RETURN(ctx->has_input("v_bias", 0)) << "expected v_bias for split mode"; - } // check dimensions of x, w and b CHECK_GT_OR_RETURN(x_shape.NumAxes(), 1) @@ -98,15 +115,30 @@ namespace oneflow { << ") is not consistant with the last dimension of \'x\'(" << x_shape.At(x_num_axes - 1) << ")"; - const Shape& scale_shape = ctx->InputShape("scale", 0); - CHECK_EQ_OR_RETURN(scale_shape.Count(0), w_shape.At(0)) - << "the element count of \'scale\'(" << scale_shape.Count(0) - << ") is not consistant with dimension 0 of \'w\'(" << w_shape.At(0) << ")"; - - const Shape& bias_shape = ctx->InputShape("bias", 0); - CHECK_EQ_OR_RETURN(bias_shape.Count(0), w_shape.At(0)) - << "the element count of \'bias\'(" << bias_shape.Count(0) - << ") is not consistant with dimension 0 of \'w\'(" << w_shape.At(0) << ")"; + if (ctx->has_input("scale", 0)) { + CHECK_OR_RETURN(ctx->has_input("bias", 0)); + const user_op::TensorDesc& scale = ctx->InputTensorDesc("scale", 0); + CHECK_EQ_OR_RETURN(scale.shape(), Shape({w_shape.At(0)})); + const user_op::TensorDesc& bias = ctx->InputTensorDesc("bias", 0); + CHECK_EQ_OR_RETURN(bias.shape(), Shape({w_shape.At(0)})); + } + if (ctx->has_input("in_scale", 0)) { + CHECK_OR_RETURN(ctx->has_input("in_zero_point", 0)); + CHECK_OR_RETURN(ctx->has_input("weight_scale", 0)); + CHECK_OR_RETURN(ctx->has_input("weight_acc", 0)); + const user_op::TensorDesc& in_zero_point = ctx->InputTensorDesc("in_zero_point", 0); + CHECK_EQ_OR_RETURN(in_zero_point.shape().Count(0), 1); + const user_op::TensorDesc& in_scale = ctx->InputTensorDesc("in_scale", 0); + CHECK_EQ_OR_RETURN(in_scale.shape().Count(0), 1); + const user_op::TensorDesc& weight_scale = ctx->InputTensorDesc("weight_scale", 0); + 
CHECK_EQ_OR_RETURN(weight_scale.shape(), Shape({w_shape.At(0)})); + const user_op::TensorDesc& weight_acc = ctx->InputTensorDesc("weight_acc", 0); + CHECK_EQ_OR_RETURN(weight_acc.shape(), Shape({w_shape.At(0)})); + if (ctx->has_input("bias", 0)) { + const user_op::TensorDesc& bias = ctx->InputTensorDesc("bias", 0); + CHECK_EQ_OR_RETURN(bias.shape(), Shape({w_shape.At(0)})); + } + } if (!is_split_mode) { CHECK_EQ_OR_RETURN(w_shape.At(1) % 2, 0) << "dimension 1 of \'w\' is not divisible by 2"; @@ -120,15 +152,24 @@ namespace oneflow { << "number of axes of \'v\' should have be equal to 2, yet get " << v_shape.NumAxes(); CHECK_OR_RETURN(v_shape == w_shape) << "the shape of \'v\' is not consistant with \'w\'"; - const Shape& v_scale_shape = ctx->InputShape("v_scale", 0); - CHECK_EQ_OR_RETURN(v_scale_shape.Count(0), v_shape.At(0)) - << "the element count of \'v_scale\'(" << v_scale_shape.Count(0) - << ") is not consistant with dimension 0 of \'v\'(" << v_shape.At(0) << ")"; - - const Shape& v_bias_shape = ctx->InputShape("v_bias", 0); - CHECK_EQ_OR_RETURN(v_bias_shape.Count(0), v_shape.At(0)) - << "the element count of \'v_bias\'(" << v_bias_shape.Count(0) - << ") is not consistant with dimension 0 of \'v\'(" << v_shape.At(0) << ")"; + if (ctx->has_input("v_scale", 0)) { + CHECK_OR_RETURN(ctx->has_input("v_bias", 0)); + const user_op::TensorDesc& v_scale = ctx->InputTensorDesc("v_scale", 0); + CHECK_EQ_OR_RETURN(v_scale.shape(), Shape({v_shape.At(0)})); + const user_op::TensorDesc& v_bias = ctx->InputTensorDesc("v_bias", 0); + CHECK_EQ_OR_RETURN(v_bias.shape(), Shape({v_shape.At(0)})); + } + if (ctx->has_input("v_weight_scale", 0)) { + CHECK_OR_RETURN(ctx->has_input("v_weight_acc", 0)); + const user_op::TensorDesc& v_weight_scale = ctx->InputTensorDesc("v_weight_scale", 0); + CHECK_EQ_OR_RETURN(v_weight_scale.shape(), Shape({v_shape.At(0)})); + const user_op::TensorDesc& v_weight_acc = ctx->InputTensorDesc("v_weight_acc", 0); + CHECK_EQ_OR_RETURN(v_weight_acc.shape(), Shape({v_shape.At(0)})); + if (ctx->has_input("v_bias", 0)) { + const user_op::TensorDesc& v_bias = ctx->InputTensorDesc("v_bias", 0); + CHECK_EQ_OR_RETURN(v_bias.shape(), Shape({v_shape.At(0)})); + } + } } // set shape of the output tensor y @@ -162,29 +203,17 @@ namespace oneflow { /* static */ auto FusedGluQuantOp::InferDataType(user_op::InferContext* ctx) -> Maybe { DataType out_dtype = ctx->Attr("out_dtype"); - // obtain input data types DataType x_dtype = ctx->InputDType("x", 0); - // check whether the user provide weight tensor v bool is_split_mode = false; if (ctx->has_input("v", 0)) { is_split_mode = true; } - // check types of x, w and b CHECK_EQ_OR_RETURN(ctx->InputDType("w", 0), x_dtype) << "data type of \'w\' is not consitant with \'x\'"; - CHECK_EQ_OR_RETURN(ctx->InputDType("scale", 0), out_dtype) - << "data type of \'scale\' is not consitant with out dtype " << out_dtype; - CHECK_EQ_OR_RETURN(ctx->InputDType("bias", 0), out_dtype) - << "data type of \'bias\' is not consitant with out dtype " << out_dtype; - // check types of v and c (optional) if (is_split_mode) { CHECK_EQ_OR_RETURN(ctx->InputDType("v", 0), x_dtype) << "data type of \'v\' is not consitant with \'x\'"; - CHECK_EQ_OR_RETURN(ctx->InputDType("v_scale", 0), out_dtype) - << "data type of \'v_scale\' is not consitant with out dtype " << out_dtype; - CHECK_EQ_OR_RETURN(ctx->InputDType("v_bias", 0), out_dtype) - << "data type of \'v_bias\' is not consitant with out dtype " << out_dtype; } // set output data type diff --git 
a/oneflow/user/ops/matmul_quant_op.cpp b/oneflow/user/ops/matmul_quant_op.cpp index 2d9d3656bc5..9e252c41538 100644 --- a/oneflow/user/ops/matmul_quant_op.cpp +++ b/oneflow/user/ops/matmul_quant_op.cpp @@ -87,6 +87,30 @@ Maybe GetComputationCost(user_op::ComputeComplexityFnContext* ctx) { const auto& add_to_output = ctx->InputTensorDesc("_add_to_output", 0); CHECK_EQ_OR_RETURN(add_to_output.shape(), out->shape()); } + if (ctx->has_input("scale", 0)) { + CHECK_OR_RETURN(ctx->has_input("bias", 0)); + const user_op::TensorDesc& scale = ctx->InputTensorDesc("scale", 0); + CHECK_EQ_OR_RETURN(scale.shape(), Shape({n})); + const user_op::TensorDesc& bias = ctx->InputTensorDesc("bias", 0); + CHECK_EQ_OR_RETURN(bias.shape(), Shape({n})); + } + if (ctx->has_input("in_scale", 0)) { + CHECK_OR_RETURN(ctx->has_input("in_zero_point", 0)); + CHECK_OR_RETURN(ctx->has_input("weight_scale", 0)); + CHECK_OR_RETURN(ctx->has_input("weight_acc", 0)); + const user_op::TensorDesc& in_zero_point = ctx->InputTensorDesc("in_zero_point", 0); + CHECK_EQ_OR_RETURN(in_zero_point.shape().Count(0), 1); + const user_op::TensorDesc& in_scale = ctx->InputTensorDesc("in_scale", 0); + CHECK_EQ_OR_RETURN(in_scale.shape().Count(0), 1); + const user_op::TensorDesc& weight_scale = ctx->InputTensorDesc("weight_scale", 0); + CHECK_EQ_OR_RETURN(weight_scale.shape(), Shape({n})); + const user_op::TensorDesc& weight_acc = ctx->InputTensorDesc("weight_acc", 0); + CHECK_EQ_OR_RETURN(weight_acc.shape(), Shape({n})); + if (ctx->has_input("bias", 0)) { + const user_op::TensorDesc& bias = ctx->InputTensorDesc("bias", 0); + CHECK_EQ_OR_RETURN(bias.shape(), Shape({n})); + } + } return Maybe::Ok(); } @@ -124,70 +148,56 @@ Maybe GetComputationCost(user_op::ComputeComplexityFnContext* ctx) { if (ctx->user_op_conf().has_input("_add_to_output", 0)) { out_and_add_to_output_args.emplace_back("_add_to_output", 0); } - if (ctx->user_op_conf().has_input("scale", 0)) { - CHECK_OR_RETURN(ctx->user_op_conf().has_input("bias", 0)); - ctx->NewBuilder() - .Split(user_op::OpArg("a", 0), m_axis) - .Broadcast(user_op::OpArg("b", 0)) - .Broadcast(user_op::OpArg("scale", 0)) - .Broadcast(user_op::OpArg("bias", 0)) - .Split(out_and_add_to_output_args, 0) - .Build(); - ctx->NewBuilder() - .Broadcast(user_op::OpArg("a", 0)) - .Split(user_op::OpArg("b", 0), n_axis) - .Split(user_op::OpArg("scale", 0), 0) - .Split(user_op::OpArg("bias", 0), 0) - .Split(out_and_add_to_output_args, 1) - .Build(); - ctx->NewBuilder() - .Split(user_op::OpArg("a", 0), k_a_axis) - .Split(user_op::OpArg("b", 0), k_b_axis) - .Broadcast(user_op::OpArg("scale", 0)) - .Broadcast(user_op::OpArg("bias", 0)) - .PartialSum(out_and_add_to_output_args) - .Build(); - ctx->NewBuilder() - .PartialSum(user_op::OpArg("a", 0)) - .Broadcast(user_op::OpArg("b", 0)) - .Broadcast(user_op::OpArg("scale", 0)) - .Broadcast(user_op::OpArg("bias", 0)) - .PartialSum(out_and_add_to_output_args) - .Build(); - ctx->NewBuilder() - .Broadcast(user_op::OpArg("a", 0)) - .PartialSum(user_op::OpArg("b", 0)) - .Broadcast(user_op::OpArg("scale", 0)) - .Broadcast(user_op::OpArg("bias", 0)) - .PartialSum(out_and_add_to_output_args) - .Build(); - } else { - ctx->NewBuilder() - .Split(user_op::OpArg("a", 0), m_axis) - .Broadcast(user_op::OpArg("b", 0)) - .Split(out_and_add_to_output_args, 0) - .Build(); - ctx->NewBuilder() - .Broadcast(user_op::OpArg("a", 0)) - .Split(user_op::OpArg("b", 0), n_axis) - .Split(out_and_add_to_output_args, 1) - .Build(); - ctx->NewBuilder() - .Split(user_op::OpArg("a", 0), k_a_axis) - 
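For orientation before the rewritten builders: the consolidation replaces the duplicated scale/bias branches with two reusable groups. Scalar quantization args (`in_zero_point`, `in_scale`) are broadcast in every signature, while the length-n per-channel vectors (`weight_scale`, `weight_acc`, `scale`, `bias`) are split along dim 0 exactly when the n dim of `b` (and hence the last dim of the output) is split, so each rank keeps the per-channel params matching its weight slice. A toy sketch of the model-parallel case, with semantics assumed from the builders below:

```cpp
#include <cstdio>

// Hypothetical 2-rank model-parallel sharding for an (m, k) x (n, k)^T matmul:
// Broadcast(a), Split(b, 0 /* n */), Split(vector_args, 0), Split(out, 1).
int main() {
  const int m = 4, n = 8, k = 16, world = 2;
  const int n_shard = n / world;
  for (int rank = 0; rank < world; ++rank) {
    // each rank also keeps a full replica of in_scale / in_zero_point
    std::printf("rank %d: a=(%d,%d) b=(%d,%d) weight_scale=(%d,) out=(%d,%d)\n",
                rank, m, k, n_shard, k, n_shard, m, n_shard);
  }
  return 0;
}
```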
.Split(user_op::OpArg("b", 0), k_b_axis) - .PartialSum(out_and_add_to_output_args) - .Build(); - ctx->NewBuilder() - .PartialSum(user_op::OpArg("a", 0)) - .Broadcast(user_op::OpArg("b", 0)) - .PartialSum(out_and_add_to_output_args) - .Build(); - ctx->NewBuilder() - .Broadcast(user_op::OpArg("a", 0)) - .PartialSum(user_op::OpArg("b", 0)) - .PartialSum(out_and_add_to_output_args) - .Build(); + + std::vector scalar_args; + if (ctx->user_op_conf().has_input("in_zero_point", 0)) { + scalar_args.emplace_back("in_zero_point", 0); + } + if (ctx->user_op_conf().has_input("in_scale", 0)) { scalar_args.emplace_back("in_scale", 0); } + + std::vector vector_args; + if (ctx->user_op_conf().has_input("weight_scale", 0)) { + vector_args.emplace_back("weight_scale", 0); } + if (ctx->user_op_conf().has_input("weight_acc", 0)) { vector_args.emplace_back("weight_acc", 0); } + if (ctx->user_op_conf().has_input("scale", 0)) { vector_args.emplace_back("scale", 0); } + if (ctx->user_op_conf().has_input("bias", 0)) { vector_args.emplace_back("bias", 0); } + + ctx->NewBuilder() + .Split(user_op::OpArg("a", 0), m_axis) + .Broadcast(user_op::OpArg("b", 0)) + .Broadcast(scalar_args) + .Broadcast(vector_args) + .Split(out_and_add_to_output_args, 0) + .Build(); + ctx->NewBuilder() + .Broadcast(user_op::OpArg("a", 0)) + .Split(user_op::OpArg("b", 0), n_axis) + .Broadcast(scalar_args) + .Split(vector_args, 0) + .Split(out_and_add_to_output_args, 1) + .Build(); + ctx->NewBuilder() + .Split(user_op::OpArg("a", 0), k_a_axis) + .Split(user_op::OpArg("b", 0), k_b_axis) + .Broadcast(scalar_args) + .Broadcast(vector_args) + .PartialSum(out_and_add_to_output_args) + .Build(); + ctx->NewBuilder() + .PartialSum(user_op::OpArg("a", 0)) + .Broadcast(user_op::OpArg("b", 0)) + .Broadcast(scalar_args) + .Broadcast(vector_args) + .PartialSum(out_and_add_to_output_args) + .Build(); + ctx->NewBuilder() + .Broadcast(user_op::OpArg("a", 0)) + .PartialSum(user_op::OpArg("b", 0)) + .Broadcast(scalar_args) + .Broadcast(vector_args) + .PartialSum(out_and_add_to_output_args) + .Build(); return Maybe::Ok(); } From ef397d68f152ddb6d67a5520aad118142c427cf1 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Thu, 7 Sep 2023 07:46:17 +0000 Subject: [PATCH 54/65] refine --- oneflow/ir/lib/OneFlow/Passes.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/oneflow/ir/lib/OneFlow/Passes.cpp b/oneflow/ir/lib/OneFlow/Passes.cpp index 9e4e1b906a5..9373d7921ff 100644 --- a/oneflow/ir/lib/OneFlow/Passes.cpp +++ b/oneflow/ir/lib/OneFlow/Passes.cpp @@ -522,7 +522,6 @@ LogicalResult PruneReduntantQuantization(OpType op, PatternRewriter& rewriter) { for (auto q : it.second) { if (q != q0) { q->replaceAllUsesWith(q0->getResults()); - q->erase(); pruned = true; } } @@ -532,7 +531,6 @@ LogicalResult PruneReduntantQuantization(OpType op, PatternRewriter& rewriter) { for (auto q : it.second) { if (q != q0) { q->replaceAllUsesWith(q0->getResults()); - q->erase(); pruned = true; } } @@ -626,7 +624,6 @@ struct AutoNhwcPattern : public OpInterfaceRewritePattern { } num_transposed_result += 1; } - op->erase(); } return success(); } From f2620136c9af6c6e13dee1563038e7517d9be017 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Thu, 7 Sep 2023 14:35:26 +0000 Subject: [PATCH 55/65] fuse layer norm and dynamic quant --- .../core/cuda/layer_norm_min_max_observer.cuh | 796 ++++++++++++++++++ oneflow/ir/include/OneFlow/OneFlowUserOps.td | 32 + .../lib/OneFlow/PDLL/ForwardOpPatterns.pdll | 38 + .../dynamic_quantization_gpu_kernel.cu | 96 +++ ..._layer_norm_min_max_observer_gpu_kernel.cu | 
182 ++++ oneflow/user/kernels/quantization_kernel.cu | 8 +- ...ation_kernel.cu => quantization_utils.cuh} | 102 +-- .../fused_layer_norm_min_max_observer_op.cpp | 119 +++ 8 files changed, 1284 insertions(+), 89 deletions(-) create mode 100644 oneflow/core/cuda/layer_norm_min_max_observer.cuh create mode 100644 oneflow/user/kernels/dynamic_quantization_gpu_kernel.cu create mode 100644 oneflow/user/kernels/fused_layer_norm_min_max_observer_gpu_kernel.cu rename oneflow/user/kernels/{dynamic_quantization_kernel.cu => quantization_utils.cuh} (72%) create mode 100644 oneflow/user/ops/fused_layer_norm_min_max_observer_op.cpp diff --git a/oneflow/core/cuda/layer_norm_min_max_observer.cuh b/oneflow/core/cuda/layer_norm_min_max_observer.cuh new file mode 100644 index 00000000000..5a83bfbffdd --- /dev/null +++ b/oneflow/core/cuda/layer_norm_min_max_observer.cuh @@ -0,0 +1,796 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef ONEFLOW_CORE_CUDA_LAYER_NORM_MIN_MAX_OBSERVER_H_ +#define ONEFLOW_CORE_CUDA_LAYER_NORM_MIN_MAX_OBSERVER_H_ + +#include +#include +#include + +#include "oneflow/core/cuda/layer_norm.cuh" +#include "oneflow/core/ndarray/binary_func.h" +#include "oneflow/core/kernel/util/numeric_limits.cuh" + +namespace oneflow { + +namespace cuda { + +namespace layer_norm { + +template +inline __device__ void WelfordMinMaxCombine(T val, T* mean, T* m2, T* min, T* max, T* count) { + // Use Welford Online algorithem to compute mean and variance + // For more details you can refer to: + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm + *count += 1; + T delta1 = val - *mean; + *mean += Div(delta1, *count); + T delta2 = val - *mean; + *m2 += delta1 * delta2; + *min = BinaryFuncMin::Invoke(val, *min); + *max = BinaryFuncMax::Invoke(val, *max); +} + +template +inline __device__ void WelfordMinMaxCombine(T b_mean, T b_m2, T b_min, T b_max, T b_count, T* mean, + T* m2, T* min, T* max, T* count) { + if (b_count == 0) { return; } + T new_count = *count + b_count; + T nb_over_n = Div(b_count, new_count); + T delta = b_mean - *mean; + *mean += delta * nb_over_n; + *m2 += b_m2 + delta * delta * (*count) * nb_over_n; + *count = new_count; + *min = BinaryFuncMin::Invoke(b_min, *min); + *max = BinaryFuncMax::Invoke(b_max, *max); +} + +template +__inline__ __device__ void WelfordMinMaxWarpReduce(T thread_mean, T thread_m2, T thread_min, + T thread_max, T thread_count, T* mean, T* m2, + T* min, T* max, T* count) { + *mean = thread_mean; + *m2 = thread_m2; + *count = thread_count; + *min = thread_min; + *max = thread_max; + for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { + T b_mean = __shfl_down_sync(0xffffffff, *mean, mask, thread_group_width); + T b_m2 = __shfl_down_sync(0xffffffff, *m2, mask, thread_group_width); + T b_min = __shfl_down_sync(0xffffffff, *min, mask, thread_group_width); + T b_max = __shfl_down_sync(0xffffffff, *max, mask, thread_group_width); + T b_count = __shfl_down_sync(0xffffffff, 
*count, mask, thread_group_width); + WelfordMinMaxCombine(b_mean, b_m2, b_min, b_max, b_count, mean, m2, min, max, count); + } +} + +template +__inline__ __device__ void WelfordMinMaxWarpAllReduce(T thread_mean, T thread_m2, T thread_min, + T thread_max, T thread_count, T* mean, T* m2, + T* min, T* max, T* count) { + WelfordMinMaxWarpReduce(thread_mean, thread_m2, thread_min, thread_max, + thread_count, mean, m2, min, max, count); + *mean = __shfl_sync(0xffffffff, *mean, 0, thread_group_width); + *m2 = __shfl_sync(0xffffffff, *m2, 0, thread_group_width); + *min = __shfl_sync(0xffffffff, *min, 0, thread_group_width); + *max = __shfl_sync(0xffffffff, *max, 0, thread_group_width); + *count = __shfl_sync(0xffffffff, *count, 0, thread_group_width); +} + +template +__inline__ __device__ void WelfordMinMaxBlockAllReduce(T thread_mean, T thread_m2, T thread_min, + T thread_max, T thread_count, T* result_mean, + T* result_m2, T* result_min, T* result_max, + T* result_count) { + __shared__ T mean_shared[kWarpSize]; + __shared__ T m2_shared[kWarpSize]; + __shared__ T min_shared[kWarpSize]; + __shared__ T max_shared[kWarpSize]; + __shared__ T count_shared[kWarpSize]; + __shared__ T mean_result_broadcast; + __shared__ T m2_result_broadcast; + __shared__ T min_result_broadcast; + __shared__ T max_result_broadcast; + __shared__ T count_result_broadcast; + const int lid = threadIdx.x % kWarpSize; + const int wid = threadIdx.x / kWarpSize; + T warp_mean = 0; + T warp_m2 = 0; + T warp_min = detail::numeric_limits::max(); + T warp_max = detail::numeric_limits::lowest(); + T warp_count = 0; + WelfordMinMaxWarpReduce(thread_mean, thread_m2, thread_min, thread_max, thread_count, &warp_mean, + &warp_m2, &warp_min, &warp_max, &warp_count); + __syncthreads(); + if (lid == 0) { + mean_shared[wid] = warp_mean; + m2_shared[wid] = warp_m2; + min_shared[wid] = warp_min; + max_shared[wid] = warp_max; + count_shared[wid] = warp_count; + } + __syncthreads(); + if (wid == 0) { + if (threadIdx.x < blockDim.x / kWarpSize) { + warp_mean = mean_shared[lid]; + warp_m2 = m2_shared[lid]; + warp_min = min_shared[lid]; + warp_max = max_shared[lid]; + warp_count = count_shared[lid]; + } else { + warp_mean = static_cast(0); + warp_m2 = static_cast(0); + warp_min = detail::numeric_limits::max(); + warp_max = detail::numeric_limits::lowest(); + warp_count = static_cast(0); + } + __syncwarp(); + T block_mean = 0; + T block_m2 = 0; + T block_min = detail::numeric_limits::max(); + T block_max = detail::numeric_limits::lowest(); + T block_count = 0; + WelfordMinMaxWarpReduce(warp_mean, warp_m2, warp_min, warp_max, warp_count, &block_mean, + &block_m2, &block_min, &block_max, &block_count); + if (lid == 0) { + mean_result_broadcast = block_mean; + m2_result_broadcast = block_m2; + min_result_broadcast = block_min; + max_result_broadcast = block_max; + count_result_broadcast = block_count; + } + } + __syncthreads(); + *result_mean = mean_result_broadcast; + *result_m2 = m2_result_broadcast; + *result_min = min_result_broadcast; + *result_max = max_result_broadcast; + *result_count = count_result_broadcast; +} + +template +__global__ void LayerNormMinMaxObserverWarpImpl(LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, + T* min_max) { + using LoadType = typename LOAD::LoadType; + static_assert(max_cols_per_thread % pack_size == 0, ""); + static_assert(min_cols_per_thread % pack_size == 0, ""); + static_assert(thread_group_width <= kWarpSize, ""); + static_assert(kWarpSize % thread_group_width == 0, ""); + 
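The combine rules above are the whole algorithm; the shuffle and shared-memory code is plumbing that merges per-thread partials. Below is a self-contained CPU version, useful for sanity-checking the reductions: it mirrors the two `WelfordMinMaxCombine` overloads, and the merged result must agree with a serial pass. One observation worth keeping in mind for the kernels that follow: since y = (x - mean) * inv_var with inv_var > 0 is strictly increasing, the min/max tracked on the raw inputs map directly to the min/max of the normalized outputs, which is why the kernels can store `(min - mean) * inv_var` and `(max - mean) * inv_var` without a second pass.

```cpp
#include <algorithm>
#include <cstdio>

struct WelfordMinMax {
  double mean = 0.0, m2 = 0.0, count = 0.0;
  double mn = 1e300, mx = -1e300;
  void Add(double v) {  // mirrors the scalar WelfordMinMaxCombine
    count += 1.0;
    double d1 = v - mean;
    mean += d1 / count;
    m2 += d1 * (v - mean);
    mn = std::min(mn, v);
    mx = std::max(mx, v);
  }
  void Merge(const WelfordMinMax& b) {  // mirrors the pairwise overload
    if (b.count == 0.0) { return; }
    double new_count = count + b.count;
    double delta = b.mean - mean;
    mean += delta * (b.count / new_count);
    m2 += b.m2 + delta * delta * count * (b.count / new_count);
    count = new_count;
    mn = std::min(mn, b.mn);
    mx = std::max(mx, b.mx);
  }
};

int main() {
  double xs[6] = {3.0, -1.0, 4.0, 1.0, -5.0, 9.0};
  WelfordMinMax serial, lo, hi;
  for (double v : xs) { serial.Add(v); }
  for (int i = 0; i < 3; ++i) { lo.Add(xs[i]); }
  for (int i = 3; i < 6; ++i) { hi.Add(xs[i]); }
  lo.Merge(hi);  // must agree with the serial pass
  std::printf("serial: mean=%g var=%g min=%g max=%g\n", serial.mean,
              serial.m2 / serial.count, serial.mn, serial.mx);
  std::printf("merged: mean=%g var=%g min=%g max=%g\n", lo.mean, lo.m2 / lo.count, lo.mn, lo.mx);
  return 0;
}
```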
constexpr int max_num_packs = max_cols_per_thread / pack_size; + constexpr int min_num_packs = min_cols_per_thread / pack_size; + assert(cols <= max_cols_per_thread * thread_group_width); + ComputeType buf[rows_per_access][max_cols_per_thread]; + const int64_t global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; + const int64_t num_global_thread_group = gridDim.x * blockDim.y; + const int64_t lane_id = threadIdx.x; + const int64_t step = num_global_thread_group * rows_per_access; + for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { + ComputeType thread_mean[rows_per_access]; + ComputeType thread_m2[rows_per_access]; + ComputeType thread_min[rows_per_access]; + ComputeType thread_max[rows_per_access]; + ComputeType thread_count[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + thread_mean[row_id] = 0; + thread_m2[row_id] = 0; + thread_min[row_id] = detail::numeric_limits::max(); + thread_max[row_id] = detail::numeric_limits::lowest(); + thread_count[row_id] = 0; + ComputeType* row_buf = buf[row_id]; +#pragma unroll + for (int pack_id = 0; pack_id < min_num_packs; ++pack_id) { + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + const int pack_offset = pack_id * pack_size; + LoadType pack[pack_size]; + load.template load(pack, row + row_id, col); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + row_buf[pack_offset + i] = static_cast(pack[i]); + WelfordMinMaxCombine(row_buf[pack_offset + i], thread_mean + row_id, thread_m2 + row_id, + thread_min + row_id, thread_max + row_id, thread_count + row_id); + } + } + for (int pack_id = min_num_packs; pack_id < max_num_packs; ++pack_id) { + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + const int pack_offset = pack_id * pack_size; + if (!padding || col < cols) { + LoadType pack[pack_size]; + load.template load(pack, row + row_id, col); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + row_buf[pack_offset + i] = static_cast(pack[i]); + WelfordMinMaxCombine(row_buf[pack_offset + i], thread_mean + row_id, thread_m2 + row_id, + thread_min + row_id, thread_max + row_id, thread_count + row_id); + } + } else { +#pragma unroll + for (int i = 0; i < pack_size; ++i) { row_buf[pack_offset + i] = 0; } + } + } + } + ComputeType warp_mean[rows_per_access]; + ComputeType warp_m2[rows_per_access]; + ComputeType warp_min[rows_per_access]; + ComputeType warp_max[rows_per_access]; + ComputeType warp_count[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + int global_row_id = row + row_id; + ComputeType* row_buf = buf[row_id]; + WelfordMinMaxWarpAllReduce( + thread_mean[row_id], thread_m2[row_id], thread_min[row_id], thread_max[row_id], + thread_count[row_id], warp_mean + row_id, warp_m2 + row_id, warp_min + row_id, + warp_max + row_id, warp_count + row_id); + ComputeType row_mean = warp_mean[row_id]; + ComputeType row_variance = + max(Div(warp_m2[row_id], warp_count[row_id]), static_cast(0.0)); + ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); + if (lane_id == 0) { + min_max[global_row_id << 1] = (warp_min[row_id] - row_mean) * row_inv_var; + min_max[(global_row_id << 1) + 1] = (warp_max[row_id] - row_mean) * row_inv_var; + } +#pragma unroll + for (int i = 0; i < max_cols_per_thread; ++i) { + row_buf[i] = (row_buf[i] - row_mean) * row_inv_var; + } +#pragma unroll + for (int i = 0; i < min_num_packs; ++i) { + const int col = (i * thread_group_width 
+ lane_id) * pack_size; + store.template store(row_buf + i * pack_size, global_row_id, col); + } +#pragma unroll + for (int i = min_num_packs; i < max_num_packs; ++i) { + const int col = (i * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + store.template store(row_buf + i * pack_size, global_row_id, col); + } + } + } + } +} + +template +inline cudaError_t LaunchLayerNormMinMaxObserverWarpImpl(cudaStream_t stream, LOAD load, + STORE store, const int64_t rows, + const int64_t cols, const double epsilon, + T* min_max) { + constexpr int block_size = 128; + constexpr int waves = 32; + static_assert(block_size % thread_group_width == 0, ""); + constexpr int thread_groups_per_block = block_size / thread_group_width; + dim3 block_dim(thread_group_width, thread_groups_per_block); + const int64_t num_blocks = + (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; + int grid_dim_x; + { + cudaError_t err = + GetNumBlocks(LayerNormMinMaxObserverWarpImpl, + block_size, 0, num_blocks, waves, &grid_dim_x); + if (err != cudaSuccess) { return err; } + } + LayerNormMinMaxObserverWarpImpl + <<>>(load, store, rows, cols, epsilon, min_max); + return cudaPeekAtLastError(); +} + +template +inline cudaError_t DispatchLayerNormMinMaxObserverWarpImplPadding(cudaStream_t stream, LOAD load, + STORE store, const int64_t rows, + const int64_t cols, + const double epsilon, + T* min_max) { + if (cols == max_cols_per_thread * thread_group_width) { + // when not padding, min_cols_per_thread must equals to max_cols_per_thread, pass + // max_cols_per_thread as min_cols_per_thread and max_cols_per_thread param. + return LaunchLayerNormMinMaxObserverWarpImpl( + stream, load, store, rows, cols, epsilon, min_max); + } else { + return LaunchLayerNormMinMaxObserverWarpImpl( + stream, load, store, rows, cols, epsilon, min_max); + } +} + +template +typename std::enable_if::type +DispatchLayerNormMinMaxObserverWarpImplCols(cudaStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, T* min_max) { + if (cols <= 0) { return cudaErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchLayerNormMinMaxObserverWarpImplPadding< \ + LOAD, STORE, T, ComputeType, pack_size, pack_size, 0, thread_group_width, 2>( \ + stream, load, store, rows, cols, epsilon, min_max); \ + } else { \ + return DispatchLayerNormMinMaxObserverWarpImplPadding< \ + LOAD, STORE, T, ComputeType, pack_size, pack_size, 0, thread_group_width, 1>( \ + stream, load, store, rows, cols, epsilon, min_max); \ + } \ + } + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(max_col, min_col) \ + else if (cols <= (max_col)*kWarpSize) { \ + return DispatchLayerNormMinMaxObserverWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, min_max); \ + } + DEFINE_ONE_ELIF(2, 1) + DEFINE_ONE_ELIF(4, 2) + DEFINE_ONE_ELIF(8, 4) + DEFINE_ONE_ELIF(12, 8) + DEFINE_ONE_ELIF(16, 12) + DEFINE_ONE_ELIF(20, 16) + DEFINE_ONE_ELIF(24, 20) + DEFINE_ONE_ELIF(28, 24) + DEFINE_ONE_ELIF(32, 28) +#undef DEFINE_ONE_ELIF + else { + return cudaErrorInvalidValue; + } +} + +template +typename std::enable_if::type +DispatchLayerNormMinMaxObserverWarpImplCols(cudaStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, T* min_max) { + if (cols <= 0) { return 
cudaErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchLayerNormMinMaxObserverWarpImplPadding< \ + LOAD, STORE, T, ComputeType, pack_size, pack_size, 0, thread_group_width, 2>( \ + stream, load, store, rows, cols, epsilon, min_max); \ + } else { \ + return DispatchLayerNormMinMaxObserverWarpImplPadding< \ + LOAD, STORE, T, ComputeType, pack_size, pack_size, 0, thread_group_width, 1>( \ + stream, load, store, rows, cols, epsilon, min_max); \ + } \ + } + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(max_col, min_col) \ + else if ((cols <= (max_col)*kWarpSize) && (cols > (min_col)*kWarpSize)) { \ + return DispatchLayerNormMinMaxObserverWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, min_max); \ + } + DEFINE_ONE_ELIF(4, 2) + DEFINE_ONE_ELIF(8, 4) + DEFINE_ONE_ELIF(12, 8) + DEFINE_ONE_ELIF(16, 12) + DEFINE_ONE_ELIF(20, 16) + DEFINE_ONE_ELIF(24, 20) + DEFINE_ONE_ELIF(28, 24) + DEFINE_ONE_ELIF(32, 28) +#undef DEFINE_ONE_ELIF + else { + return cudaErrorInvalidValue; + } +} + +template +struct DispatchLayerNormMinMaxObserverWarpImplPackSize { + cudaError_t operator()(cudaStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, T* min_max) { + if (cols % 2 == 0 && CanPackAs(load, 2) && CanPackAs(store, 2)) { + return DispatchLayerNormMinMaxObserverWarpImplCols( + stream, load, store, rows, cols, epsilon, min_max); + } else { + return DispatchLayerNormMinMaxObserverWarpImplCols( + stream, load, store, rows, cols, epsilon, min_max); + } + } +}; + +template +inline cudaError_t DispatchLayerNormMinMaxObserverWarpImpl(cudaStream_t stream, LOAD load, + STORE store, const int64_t rows, + const int64_t cols, const double epsilon, + T* min_max) { + return DispatchLayerNormMinMaxObserverWarpImplPackSize()( + stream, load, store, rows, cols, epsilon, min_max); +} + +template +__global__ void LayerNormMinMaxObserverBlockSMemImpl(LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, + T* min_max) { + using LoadType = typename LOAD::LoadType; + extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; + auto* buf = reinterpret_cast(shared_buf); + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_mean = 0; + ComputeType thread_m2 = 0; + ComputeType thread_min = detail::numeric_limits::max(); + ComputeType thread_max = detail::numeric_limits::lowest(); + ComputeType thread_count = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + LoadType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + buf[i * num_packs + pack_id] = pack[i]; + WelfordMinMaxCombine(static_cast(pack[i]), &thread_mean, &thread_m2, + &thread_min, &thread_max, &thread_count); + } + } + ComputeType row_mean = 0; + ComputeType row_m2 = 0; + ComputeType row_min = detail::numeric_limits::max(); + ComputeType row_max = detail::numeric_limits::lowest(); + ComputeType row_count = 0; + WelfordMinMaxBlockAllReduce(thread_mean, thread_m2, thread_min, thread_max, + thread_count, &row_mean, &row_m2, &row_min, &row_max, + &row_count); + ComputeType row_variance = max(Div(row_m2, row_count), 
static_cast(0.0)); + ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); + if (threadIdx.x == 0) { + min_max[row << 1] = (row_min - row_mean) * row_inv_var; + min_max[(row << 1) + 1] = (row_max - row_mean) * row_inv_var; + } + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + pack[i] = (static_cast(buf[i * num_packs + pack_id]) - row_mean) * row_inv_var; + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +inline cudaError_t LaunchLayerNormMinMaxObserverBlockSMemImpl(cudaStream_t stream, LOAD load, + STORE store, int smem, + const int64_t rows, + const int64_t cols, + const double epsilon, T* min_max) { + constexpr int waves = 32; + int grid_dim_x; + { + cudaError_t err = GetNumBlocks( + LayerNormMinMaxObserverBlockSMemImpl, + block_size, smem, rows, waves, &grid_dim_x); + if (err != cudaSuccess) { return err; } + } + LayerNormMinMaxObserverBlockSMemImpl + <<>>(load, store, rows, cols, epsilon, min_max); + return cudaPeekAtLastError(); +} + +template +inline cudaError_t TryDispatchLayerNormMinMaxObserverBlockSMemImplBlockSize( + cudaStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, + const double epsilon, T* min_max, bool* success) { + constexpr int block_size_conf_1 = 128; + constexpr int block_size_conf_2 = 256; + constexpr int block_size_conf_3 = 512; + constexpr int block_size_conf_4 = 1024; + + int dev = 0; + { + cudaError_t err = cudaGetDevice(&dev); + if (err != cudaSuccess) { return err; } + } + + int sm_count = 0; + { + cudaError_t err = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev); + if (err != cudaSuccess) { return err; } + } + + static const bool max_smem_configed = [=]() { + int max_smem_size = 0; + cudaError_t err = + cudaDeviceGetAttribute(&max_smem_size, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + if (err != cudaSuccess) { return false; } + + err = MaximizeDynamicSharedMemorySize( + LayerNormMinMaxObserverBlockSMemImpl, + max_smem_size); + if (err != cudaSuccess) { return false; } + err = MaximizeDynamicSharedMemorySize( + LayerNormMinMaxObserverBlockSMemImpl, + max_smem_size); + if (err != cudaSuccess) { return false; } + err = MaximizeDynamicSharedMemorySize( + LayerNormMinMaxObserverBlockSMemImpl, + max_smem_size); + if (err != cudaSuccess) { return false; } + err = MaximizeDynamicSharedMemorySize( + LayerNormMinMaxObserverBlockSMemImpl, + max_smem_size); + if (err != cudaSuccess) { return false; } + + return true; + }(); + + const size_t smem = cols * sizeof(typename LOAD::LoadType); + + int max_active_blocks_conf_1; + { + cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_1, + LayerNormMinMaxObserverBlockSMemImpl, + block_size_conf_1, smem); + if (err != cudaSuccess) { return err; } + } + if (max_active_blocks_conf_1 <= 0) { + *success = false; + return cudaSuccess; + } + + int max_active_blocks_conf_4; + { + cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_4, + LayerNormMinMaxObserverBlockSMemImpl, + block_size_conf_4, smem); + if (err != cudaSuccess) { return err; } + } + + if (max_active_blocks_conf_4 == max_active_blocks_conf_1 + || (max_active_blocks_conf_4 > 0 && rows <= sm_count)) { + *success = true; + return LaunchLayerNormMinMaxObserverBlockSMemImpl( + stream, load, store, smem, rows, cols, epsilon, min_max); + } + + int max_active_blocks_conf_3; + { + 
cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_3, + LayerNormMinMaxObserverBlockSMemImpl, + block_size_conf_3, smem); + if (err != cudaSuccess) { return err; } + } + if (max_active_blocks_conf_3 == max_active_blocks_conf_1 + || (max_active_blocks_conf_3 > 0 && rows <= sm_count)) { + *success = true; + return LaunchLayerNormMinMaxObserverBlockSMemImpl( + stream, load, store, smem, rows, cols, epsilon, min_max); + } + + int max_active_blocks_conf_2; + { + cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_2, + LayerNormMinMaxObserverBlockSMemImpl, + block_size_conf_2, smem); + if (err != cudaSuccess) { return err; } + } + if (max_active_blocks_conf_2 == max_active_blocks_conf_1 + || (max_active_blocks_conf_2 > 0 && rows <= sm_count)) { + *success = true; + return LaunchLayerNormMinMaxObserverBlockSMemImpl( + stream, load, store, smem, rows, cols, epsilon, min_max); + } + + *success = true; + return LaunchLayerNormMinMaxObserverBlockSMemImpl( + stream, load, store, smem, rows, cols, epsilon, min_max); +} + +template +struct TryDispatchLayerNormMinMaxObserverBlockSMemImplPackSize { + cudaError_t operator()(cudaStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, T* min_max, bool* success) { + if (cols % 4 == 0 && CanPackAs(load, 4) && CanPackAs(store, 4)) { + return TryDispatchLayerNormMinMaxObserverBlockSMemImplBlockSize( + stream, load, store, rows, cols, epsilon, min_max, success); + } else if (cols % 2 == 0 && CanPackAs(load, 2) && CanPackAs(store, 2)) { + return TryDispatchLayerNormMinMaxObserverBlockSMemImplBlockSize( + stream, load, store, rows, cols, epsilon, min_max, success); + } else { + return TryDispatchLayerNormMinMaxObserverBlockSMemImplBlockSize( + stream, load, store, rows, cols, epsilon, min_max, success); + } + } +}; + +template +inline cudaError_t TryDispatchLayerNormMinMaxObserverBlockSMemImpl(cudaStream_t stream, LOAD load, + STORE store, const int64_t rows, + const int64_t cols, + const double epsilon, T* min_max, + bool* success) { + return TryDispatchLayerNormMinMaxObserverBlockSMemImplPackSize()( + stream, load, store, rows, cols, epsilon, min_max, success); +} + +template +__global__ void __launch_bounds__(1024) + LayerNormMinMaxObserverBlockUncachedImpl(LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, T* min_max) { + using LoadType = typename LOAD::LoadType; + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_mean = 0; + ComputeType thread_m2 = 0; + ComputeType thread_min = detail::numeric_limits::max(); + ComputeType thread_max = detail::numeric_limits::lowest(); + ComputeType thread_count = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + LoadType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + WelfordMinMaxCombine(static_cast(pack[i]), &thread_mean, &thread_m2, + &thread_min, &thread_max, &thread_count); + } + } + ComputeType row_mean = 0; + ComputeType row_m2 = 0; + ComputeType row_min = detail::numeric_limits::max(); + ComputeType row_max = detail::numeric_limits::lowest(); + ComputeType row_count = 0; + WelfordMinMaxBlockAllReduce(thread_mean, thread_m2, thread_min, thread_max, + thread_count, &row_mean, &row_m2, &row_min, 
&row_max, + &row_count); + ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); + ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); + if (threadIdx.x == 0) { + min_max[row << 1] = (row_min - row_mean) * row_inv_var; + min_max[(row << 1) + 1] = (row_max - row_mean) * row_inv_var; + } + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + LoadType pack[pack_size]; + ComputeType dst_pack[pack_size]; + const int pack_offset = pack_id * pack_size; + load.template load(pack, row, pack_offset); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + dst_pack[i] = (static_cast(pack[i]) - row_mean) * row_inv_var; + } + store.template store(dst_pack, row, pack_offset); + } + } +} + +template +inline cudaError_t LaunchLayerNormMinMaxObserverBlockUncachedImpl(cudaStream_t stream, LOAD load, + STORE store, const int64_t rows, + const int64_t cols, + const double epsilon, + T* min_max) { + constexpr int block_size = 1024; + constexpr int waves = 32; + int grid_dim_x; + { + cudaError_t err = + GetNumBlocks(LayerNormMinMaxObserverBlockUncachedImpl, + block_size, 0, rows, waves, &grid_dim_x); + if (err != cudaSuccess) { return err; } + } + LayerNormMinMaxObserverBlockUncachedImpl + <<>>(load, store, rows, cols, epsilon, min_max); + return cudaPeekAtLastError(); +} + +template +struct DispatchLayerNormMinMaxObserverBlockUncachedImplPackSize { + cudaError_t operator()(cudaStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, T* min_max) { + if (cols % 4 == 0 && CanPackAs(load, 4) && CanPackAs(store, 4)) { + return LaunchLayerNormMinMaxObserverBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, min_max); + } else if (cols % 2 == 0 && CanPackAs(load, 2) && CanPackAs(store, 2)) { + return LaunchLayerNormMinMaxObserverBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, min_max); + } else { + return LaunchLayerNormMinMaxObserverBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, min_max); + } + } +}; + +template +inline cudaError_t DispatchLayerNormMinMaxObserverBlockUncachedImpl(cudaStream_t stream, LOAD load, + STORE store, const int64_t rows, + const int64_t cols, + const double epsilon, + T* min_max) { + return DispatchLayerNormMinMaxObserverBlockUncachedImplPackSize()( + stream, load, store, rows, cols, epsilon, min_max); +} + +template +inline typename std::enable_if::value, cudaError_t>::type +DispatchLayerNormMinMaxObserver(cudaStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, T* min_max) { + if (cols <= 1024) { + return DispatchLayerNormMinMaxObserverWarpImpl( + stream, load, store, rows, cols, epsilon, min_max); + } else { + bool dispatch_smem_impl_success; + { + cudaError_t err = + TryDispatchLayerNormMinMaxObserverBlockSMemImpl( + stream, load, store, rows, cols, epsilon, min_max, &dispatch_smem_impl_success); + if (err != cudaSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchLayerNormMinMaxObserverBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, min_max); + } + return cudaSuccess; + } +} + +template +inline typename std::enable_if::value, cudaError_t>::type +DispatchLayerNormMinMaxObserver(cudaStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, T* min_max) { + return DispatchLayerNormMinMaxObserverBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, min_max); +} + +} // namespace layer_norm + +} 
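To summarize the dispatch logic above for readers skimming the header: rows with cols <= 1024 take the warp implementation (values buffered in registers), larger rows try the shared-memory variant that caches the packed inputs for the second pass (smem = cols * sizeof(LoadType) per block), and if no block size reaches positive occupancy the uncached variant re-reads x from global memory. The block-size ladder keeps the largest block size whose occupancy matches the 128-thread baseline. A stripped-down sketch of that rule; `Kernel` is a placeholder, and the real code additionally accepts a larger block when rows <= sm_count:

```cpp
#include <cuda_runtime.h>

__global__ void Kernel() {}  // stand-in for the smem implementation

// Return the largest block size whose resident-blocks-per-SM count matches
// the 128-thread baseline: same occupancy, fewer blocks to schedule.
int PickBlockSize(size_t smem) {
  int baseline = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&baseline, Kernel, 128, smem);
  if (baseline <= 0) { return 0; }  // smem too large even for 128 threads
  for (int bs : {1024, 512, 256}) {
    int active = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&active, Kernel, bs, smem);
    if (active == baseline) { return bs; }
  }
  return 128;
}
```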
// namespace cuda + +} // namespace oneflow + +#endif // ONEFLOW_CORE_CUDA_LAYER_NORM_MIN_MAX_OBSERVER_H_ diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index af4d22d8c75..62fe644a05e 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -7030,6 +7030,38 @@ def OneFlow_LayerNormOp : OneFlow_BaseOp<"layer_norm", [NoMemoryEffect, AttrSize let has_data_type_infer_fn = 1; } +def OneFlow_FusedLayerNormMinMaxObserverOp : OneFlow_BaseOp<"fused_layer_norm_min_max_observer", [NoMemoryEffect, NoGrad, AttrSizedOperandSegments, DeclareOpInterfaceMethods]> { + let input = (ins + OneFlow_Tensor:$x, + Optional:$beta, + Optional:$gamma + ); + let output = (outs + OneFlow_Tensor:$y, + OneFlow_Tensor:$y_scale, + OneFlow_Tensor:$y_zero_point + ); + let attrs = (ins + DefaultValuedAttr:$center, + DefaultValuedAttr:$scale, + DefaultValuedAttr:$begin_norm_axis, + DefaultValuedAttr:$begin_params_axis, + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$quantization_formula, + DefaultValuedAttr:$quantization_bit, + DefaultValuedAttr:$quantization_scheme, + DefaultValuedAttr:$per_layer_quantization + ); + let trait_attrs = (ins + DenseI32ArrayAttr:$operand_segment_sizes + ); + let has_check_fn = 1; + let has_logical_tensor_desc_infer_fn = 1; + let has_physical_tensor_desc_infer_fn = 1; + let has_get_sbp_fn = 1; + let has_data_type_infer_fn = 1; +} + def OneFlow_SkipLayerNormOp : OneFlow_BaseOp<"skip_layer_norm", [NoMemoryEffect, AttrSizedOperandSegments, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$x, diff --git a/oneflow/ir/lib/OneFlow/PDLL/ForwardOpPatterns.pdll b/oneflow/ir/lib/OneFlow/PDLL/ForwardOpPatterns.pdll index 252aa704ce1..68efa49a0a5 100644 --- a/oneflow/ir/lib/OneFlow/PDLL/ForwardOpPatterns.pdll +++ b/oneflow/ir/lib/OneFlow/PDLL/ForwardOpPatterns.pdll @@ -211,3 +211,41 @@ Pattern { replace root with input; } + +Pattern { + let center: Attr; + let scale: Attr; + let begin_norm_axis: Attr; + let begin_params_axis: Attr; + let epsilon: Attr; + let quantization_formula: Attr; + let quantization_bit: Attr; + let quantization_scheme: Attr; + let per_layer_quantization: Attr; + + let layer_norm = op(x: Value, beta: Value, gamma: Value) + {center = center, scale = scale, begin_norm_axis = begin_norm_axis, begin_params_axis = begin_params_axis, epsilon = epsilon} + -> (y: Type, mean: Type, inv_variance: Type); + let dynamic_quantization = op(layer_norm.0) + {quantization_formula = quantization_formula, quantization_bit = quantization_bit, quantization_scheme = quantization_scheme, + per_layer_quantization = per_layer_quantization} -> (out: Type, in_scale: Type, in_zero_point: Type); + + rewrite dynamic_quantization with { + let fused_layer_norm_min_max_observer = op(x, beta, gamma) + {center = center, scale = scale, begin_norm_axis = begin_norm_axis, begin_params_axis = begin_params_axis, epsilon = epsilon, + quantization_formula = quantization_formula, quantization_bit = quantization_bit, quantization_scheme = quantization_scheme, + per_layer_quantization = per_layer_quantization, + operand_segment_sizes = attr<"array">} -> (y, in_scale, in_zero_point); + + CopyUserOpAttrs(layer_norm, fused_layer_norm_min_max_observer); + + let quantization = op(fused_layer_norm_min_max_observer.0, + fused_layer_norm_min_max_observer.1, + fused_layer_norm_min_max_observer.2) { + quantization_formula = quantization_formula, quantization_bit = quantization_bit, quantization_scheme = 
quantization_scheme} -> (out); + + CopyUserOpAttrs(dynamic_quantization, quantization); + + replace dynamic_quantization with (quantization.0, fused_layer_norm_min_max_observer.1, fused_layer_norm_min_max_observer.2); + }; +} diff --git a/oneflow/user/kernels/dynamic_quantization_gpu_kernel.cu b/oneflow/user/kernels/dynamic_quantization_gpu_kernel.cu new file mode 100644 index 00000000000..4f1c48b4d3a --- /dev/null +++ b/oneflow/user/kernels/dynamic_quantization_gpu_kernel.cu @@ -0,0 +1,96 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/cuda/elementwise.cuh" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.cuh" +#include "oneflow/user/kernels/quantization_utils.cuh" + +namespace oneflow { + +template +class GpuDynamicQuantizationKernel final : public user_op::OpKernel { + public: + GpuDynamicQuantizationKernel() = default; + ~GpuDynamicQuantizationKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const std::string quantization_scheme = ctx->Attr("quantization_scheme"); + const int32_t quantization_bit = ctx->Attr("quantization_bit"); + const bool per_layer_quantization = ctx->Attr("per_layer_quantization"); + const std::string quantization_formula = ctx->Attr("quantization_formula"); + + CHECK(quantization_scheme == "affine"); + + const int64_t elements = in->shape_view().elem_cnt(); + + constexpr int pack_size = cuda::elementwise::PackSize(); + int64_t pack_num = (elements + pack_size - 1) / pack_size; + int grid_size = 0; + cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + grid_size = grid_size > 2048 ? 
2048 : grid_size; + + size_t element_bytes = GetSizeOfDataType(GetDataType::value); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), grid_size * element_bytes * 2); + + T* min_max = reinterpret_cast(tmp_buffer->mut_dptr()); + auto stream = ctx->stream()->As()->cuda_stream(); + if (per_layer_quantization) { + quantization::ReduceMinMaxPerTensor + <<>>(elements, in->dptr(), + min_max); + } else { + UNIMPLEMENTED() << "dynamic_quantization does not support per-channel quantization"; + } + + if (quantization_formula == "oneflow") { + if (quantization_bit == 8) { + quantization::ApplyDynamicQuantization( + stream, grid_size, min_max, elements, in->dptr(), quantization_bit, + out->mut_dptr(), scale->mut_dptr(), zero_point->mut_dptr()); + } else { + UNIMPLEMENTED(); + } + } else { + UNIMPLEMENTED() << "dynamic_quantization only support oneflow quantization formula"; + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_DYNAMIC_QUANTIZATION_KERNEL(dtype) \ + REGISTER_USER_KERNEL("dynamic_quantization") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 128 * 1024 * 1024; }) + +REGISTER_DYNAMIC_QUANTIZATION_KERNEL(float); +REGISTER_DYNAMIC_QUANTIZATION_KERNEL(double); +REGISTER_DYNAMIC_QUANTIZATION_KERNEL(half); + +} // namespace oneflow diff --git a/oneflow/user/kernels/fused_layer_norm_min_max_observer_gpu_kernel.cu b/oneflow/user/kernels/fused_layer_norm_min_max_observer_gpu_kernel.cu new file mode 100644 index 00000000000..ffecc5bdeaa --- /dev/null +++ b/oneflow/user/kernels/fused_layer_norm_min_max_observer_gpu_kernel.cu @@ -0,0 +1,182 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
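The dynamic path above reduces a per-tensor min/max into `min_max` and then derives one affine (scale, zero_point) pair from it. Here is the arithmetic in a minimal host-side form; the kernel's exact rounding and clamping may differ, and widening the range to include 0 is an assumption carried over from common affine-quantization practice:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Affine int8 parameters from an observed [mn, mx] range. The range is
// widened to include 0.0 so that real zero maps onto an exact int8 value.
void AffineScaleZeroPoint(float mn, float mx, float* scale, int8_t* zero_point) {
  const float qmin = -128.0f, qmax = 127.0f;
  mn = std::min(mn, 0.0f);
  mx = std::max(mx, 0.0f);
  *scale = (mx - mn) / (qmax - qmin);
  if (*scale == 0.0f) { *scale = 1.0f; }  // degenerate all-zero tensor
  const float zp = qmin - mn / *scale;
  *zero_point = static_cast<int8_t>(std::lround(std::max(qmin, std::min(qmax, zp))));
}

int8_t Quantize(float x, float scale, int8_t zp) {
  const long q = std::lround(x / scale) + zp;
  return static_cast<int8_t>(std::max(-128L, std::min(127L, q)));
}
```

With `quantization_bit == 8` this corresponds to the `upper_bound = 127`, `lower_bound = -128` pair that `ApplyDynamicQuantization` derives from the bit width.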
+*/ +#include "oneflow/core/cuda/elementwise.cuh" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.cuh" +#include "oneflow/core/cuda/layer_norm.cuh" +#include "oneflow/core/cuda/layer_norm_min_max_observer.cuh" +#include "oneflow/core/ndarray/binary_func.h" +#include "oneflow/core/kernel/util/numeric_limits.cuh" +#include "oneflow/user/kernels/quantization_utils.cuh" + +namespace oneflow { + +namespace { + +template +struct AffineStore { + AffineStore(DST* y, int64_t row_size, const DST* gamma, const DST* beta) + : y(y), row_size(row_size), gamma(gamma), beta(beta) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::layer_norm::Pack y_pack; + cuda::layer_norm::Pack gamma_pack; + cuda::layer_norm::Pack beta_pack; + const int64_t offset = (row * row_size + col) / N; + const int64_t gamma_offset = col / N; + if (do_scale) { + gamma_pack.storage = + *(reinterpret_cast*>(gamma) + gamma_offset); + } else { +#pragma unroll + for (int i = 0; i < N; ++i) { gamma_pack.elem[i] = static_cast(1.f); } + } + if (do_center) { + beta_pack.storage = + *(reinterpret_cast*>(beta) + gamma_offset); + } else { +#pragma unroll + for (int i = 0; i < N; ++i) { beta_pack.elem[i] = static_cast(0.f); } + } +#pragma unroll + for (int i = 0; i < N; ++i) { + DST normalized_i = static_cast(src[i]); + if (do_scale || do_center) { + y_pack.elem[i] = normalized_i * gamma_pack.elem[i] + beta_pack.elem[i]; + } else { + y_pack.elem[i] = normalized_i; + } + } + *(reinterpret_cast*>(y) + offset) = y_pack.storage; + } + DST* y; + int64_t row_size; + const DST* gamma; + const DST* beta; +}; + +template +void LayerNormMinMaxObserverGpu(ep::Stream* stream, const int64_t num_instances, + const int64_t norm_size, const double epsilon, const T* x_ptr, + const T* gamma_ptr, const T* beta_ptr, T* y_ptr, T* min_max_ptr) { + using ComputeType = typename cuda::layer_norm::DefaultComputeType::type; + cuda::layer_norm::DirectLoad load(x_ptr, norm_size); + AffineStore store(y_ptr, norm_size, gamma_ptr, beta_ptr); + cuda::layer_norm::DispatchLayerNormMinMaxObserver( + stream->As()->cuda_stream(), load, store, num_instances, norm_size, epsilon, + min_max_ptr); +} + +template +void DispatchFusedLayerNormMinMaxObserverGpu(ep::Stream* stream, const int64_t num_instances, + const int64_t norm_size, const double epsilon, + const T* x_ptr, const T* gamma_ptr, const T* beta_ptr, + T* y_ptr, T* min_max_ptr) { + if (gamma_ptr != nullptr && beta_ptr != nullptr) { + LayerNormMinMaxObserverGpu(stream, num_instances, norm_size, epsilon, x_ptr, + gamma_ptr, beta_ptr, y_ptr, min_max_ptr); + } else if (gamma_ptr != nullptr && beta_ptr == nullptr) { + LayerNormMinMaxObserverGpu(stream, num_instances, norm_size, epsilon, x_ptr, + gamma_ptr, beta_ptr, y_ptr, min_max_ptr); + } else if (gamma_ptr == nullptr && beta_ptr != nullptr) { + LayerNormMinMaxObserverGpu(stream, num_instances, norm_size, epsilon, x_ptr, + gamma_ptr, beta_ptr, y_ptr, min_max_ptr); + } else { + LayerNormMinMaxObserverGpu(stream, num_instances, norm_size, epsilon, x_ptr, + gamma_ptr, beta_ptr, y_ptr, min_max_ptr); + } +} + +template +class GpuFusedLayerNormMinMaxObserverKernel final : public user_op::OpKernel { + public: + GpuFusedLayerNormMinMaxObserverKernel() = default; + ~GpuFusedLayerNormMinMaxObserverKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = 
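One layout detail that is easy to miss across the two files: the tmp buffer is interleaved per row as [min_0, max_0, min_1, max_1, ...] (the `row << 1` indexing in the header), and `ComputeScaleAndZeroPointBlock` then collapses it to a single per-tensor range on device. A host-side equivalent of that final reduction, assuming the interleaved layout:

```cpp
#include <algorithm>

// rows here corresponds to num_instances in the fused kernel below.
void ReduceMinMax(const float* min_max, int rows, float* mn, float* mx) {
  *mn = min_max[0];
  *mx = min_max[1];
  for (int r = 1; r < rows; ++r) {
    *mn = std::min(*mn, min_max[2 * r]);
    *mx = std::max(*mx, min_max[2 * r + 1]);
  }
}
```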
ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const double epsilon = ctx->Attr("epsilon"); + CHECK_GE(epsilon, CUDNN_BN_MIN_EPSILON); + + int64_t begin_norm_axis = ctx->Attr("begin_norm_axis"); + if (begin_norm_axis < 0) { begin_norm_axis += x->shape_view().NumAxes(); } + const int64_t num_instances = x->shape_view().Count(0, begin_norm_axis); + const int64_t norm_size = x->shape_view().elem_cnt() / num_instances; + const T* gamma_ptr = nullptr; + const T* beta_ptr = nullptr; + if (ctx->has_input("gamma", 0)) { + const user_op::Tensor* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); + gamma_ptr = gamma->dptr(); + CHECK_EQ(gamma->shape_view().elem_cnt(), norm_size); + } + if (ctx->has_input("beta", 0)) { beta_ptr = ctx->Tensor4ArgNameAndIndex("beta", 0)->dptr(); } + + size_t element_bytes = GetSizeOfDataType(GetDataType::value); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), num_instances * 2 * element_bytes); + T* min_max = reinterpret_cast(tmp_buffer->mut_dptr()); + + DispatchFusedLayerNormMinMaxObserverGpu(ctx->stream(), num_instances, norm_size, epsilon, + x->dptr(), gamma_ptr, beta_ptr, y->mut_dptr(), + min_max); + + const std::string quantization_scheme = ctx->Attr("quantization_scheme"); + const int32_t quantization_bit = ctx->Attr("quantization_bit"); + const std::string quantization_formula = ctx->Attr("quantization_formula"); + CHECK(quantization_scheme == "affine"); + + user_op::Tensor* y_scale = ctx->Tensor4ArgNameAndIndex("y_scale", 0); + user_op::Tensor* y_zero_point = ctx->Tensor4ArgNameAndIndex("y_zero_point", 0); + + auto stream = ctx->stream()->As()->cuda_stream(); + if (quantization_formula == "oneflow") { + if (quantization_bit == 8) { + int8_t upper_bound = (1 << (quantization_bit - 1)) - 1; + int8_t lower_bound = -upper_bound - 1; + quantization::ComputeScaleAndZeroPointBlock + <<<1, cuda::elementwise::kBlockSize, cuda::elementwise::kBlockSize * element_bytes * 2, + stream>>>(num_instances, min_max, upper_bound, lower_bound, + y_scale->mut_dptr(), y_zero_point->mut_dptr()); + } else { + UNIMPLEMENTED(); + } + } else { + UNIMPLEMENTED() << "only support oneflow quantization formula"; + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_LAYER_NORM_MIN_MAX_OBSERVER_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_layer_norm_min_max_observer") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 128 * 1024 * 1024; }) + +REGISTER_FUSED_LAYER_NORM_MIN_MAX_OBSERVER_KERNEL(double); +REGISTER_FUSED_LAYER_NORM_MIN_MAX_OBSERVER_KERNEL(float); +REGISTER_FUSED_LAYER_NORM_MIN_MAX_OBSERVER_KERNEL(half); + +} // namespace + +} // namespace oneflow diff --git a/oneflow/user/kernels/quantization_kernel.cu b/oneflow/user/kernels/quantization_kernel.cu index 1a06ab92681..cf675bbf609 100644 --- a/oneflow/user/kernels/quantization_kernel.cu +++ b/oneflow/user/kernels/quantization_kernel.cu @@ -125,7 +125,7 @@ __host__ __device__ int ModDiv<16>(int64_t N) { template __global__ void OFPerTensorQuantizationSymmetric(const int64_t elements, const T* in_ptr, - const T* scale_ptr, const OutT upper_bound, + const float* scale_ptr, const OutT upper_bound, const OutT lower_bound, OutT* out_ptr) { using LoadType = cuda::elementwise::PackType; 
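The quantization_kernel.cu hunks here change the scale operand from `const T*` to `const float*`: the dynamic observers always materialize scales as float32, regardless of the activation dtype, so the static per-tensor kernels must read float scales even for half or double inputs. A hypothetical standalone kernel showing the resulting mixed-dtype contract for the symmetric scheme (not the library's packed implementation):

```cpp
#include <cuda_fp16.h>
#include <cstdint>

// half activations, float32 scale, int8 output.
__global__ void QuantizeSymmetricHalf(const half* in, const float* scale, int8_t* out,
                                      int64_t n, int8_t upper, int8_t lower) {
  int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  if (i < n) {
    int q = __float2int_rn(__half2float(in[i]) / scale[0]);
    q = q > upper ? upper : q;
    q = q < lower ? lower : q;
    out[i] = static_cast<int8_t>(q);
  }
}
```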
using LoadPack = cuda::elementwise::Pack; @@ -164,7 +164,7 @@ __global__ void OFPerTensorQuantizationSymmetric(const int64_t elements, const T template __global__ void OFPerTensorQuantizationAffine(const int64_t elements, const T* in_ptr, - const T* scale_ptr, const OutT* zero_point_ptr, + const float* scale_ptr, const OutT* zero_point_ptr, const OutT upper_bound, const OutT lower_bound, OutT* out_ptr) { using LoadType = cuda::elementwise::PackType; @@ -224,12 +224,12 @@ void ApplyOFPerTensorQuantization(user_op::KernelComputeContext* ctx, if (quantization_scheme == "symmetric") { OFPerTensorQuantizationSymmetric <<>>( - elements, in->dptr(), scale->dptr(), upper_bound, lower_bound, + elements, in->dptr(), scale->dptr(), upper_bound, lower_bound, out->mut_dptr()); } else { OFPerTensorQuantizationAffine <<>>( - elements, in->dptr(), scale->dptr(), zero_point->dptr(), upper_bound, + elements, in->dptr(), scale->dptr(), zero_point->dptr(), upper_bound, lower_bound, out->mut_dptr()); } } diff --git a/oneflow/user/kernels/dynamic_quantization_kernel.cu b/oneflow/user/kernels/quantization_utils.cuh similarity index 72% rename from oneflow/user/kernels/dynamic_quantization_kernel.cu rename to oneflow/user/kernels/quantization_utils.cuh index e12dc2f048b..4db60e236c3 100644 --- a/oneflow/user/kernels/dynamic_quantization_kernel.cu +++ b/oneflow/user/kernels/quantization_utils.cuh @@ -13,6 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifndef ONEFLOW_USER_KERNELS_QUANTIZATION_UTILS_H_ +#define ONEFLOW_USER_KERNELS_QUANTIZATION_UTILS_H_ + #include "oneflow/core/cuda/elementwise.cuh" #include "oneflow/core/device/cuda_util.h" #include "oneflow/core/framework/framework.h" @@ -21,31 +24,30 @@ limitations under the License. 
#include "oneflow/core/kernel/util/numeric_limits.cuh" namespace oneflow { - -namespace { +namespace quantization { template -__host__ __device__ int ModDiv(int64_t N) { +__host__ __device__ __forceinline__ int ModDiv(int64_t N) { return N - (N / M * M); } template<> -__host__ __device__ int ModDiv<2>(int64_t N) { +__host__ __device__ __forceinline__ int ModDiv<2>(int64_t N) { return N & 0x1; } template<> -__host__ __device__ int ModDiv<4>(int64_t N) { +__host__ __device__ __forceinline__ int ModDiv<4>(int64_t N) { return N & 0x3; } template<> -__host__ __device__ int ModDiv<8>(int64_t N) { +__host__ __device__ __forceinline__ int ModDiv<8>(int64_t N) { return N & 0x7; } template<> -__host__ __device__ int ModDiv<16>(int64_t N) { +__host__ __device__ __forceinline__ int ModDiv<16>(int64_t N) { return N & 0xF; } @@ -155,7 +157,7 @@ __global__ void ComputeScaleAndZeroPointBlock(const int min_max_size, const T* m } template<> -__global__ void ComputeScaleAndZeroPointBlock( +inline __global__ void ComputeScaleAndZeroPointBlock( const int min_max_size, const half* min_max_ptr, const int8_t upper_bound, const int8_t lower_bound, float* scale_ptr, int8_t* zero_point_ptr) { using T = half; @@ -267,9 +269,10 @@ __global__ void ApplyQuantization(const int64_t elements, const T* in_ptr, const } template -void ApplyDynamicQuantization(cudaStream_t stream, const int min_max_size, const T* min_max_ptr, - const int64_t elements, const T* in_ptr, const int quantization_bit, - Q* out_ptr, float* scale_ptr, Q* zero_point_ptr) { +inline void ApplyDynamicQuantization(cudaStream_t stream, const int min_max_size, + const T* min_max_ptr, const int64_t elements, const T* in_ptr, + const int quantization_bit, Q* out_ptr, float* scale_ptr, + Q* zero_point_ptr) { Q upper_bound = (1 << (quantization_bit - 1)) - 1; Q lower_bound = -upper_bound - 1; size_t element_bytes = GetSizeOfDataType(GetDataType::value); @@ -286,78 +289,7 @@ void ApplyDynamicQuantization(cudaStream_t stream, const int min_max_size, const elements, in_ptr, scale_ptr, zero_point_ptr, upper_bound, lower_bound, out_ptr); } -} // namespace - -template -class GpuDynamicQuantizationKernel final : public user_op::OpKernel { - public: - GpuDynamicQuantizationKernel() = default; - ~GpuDynamicQuantizationKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); - user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const std::string quantization_scheme = ctx->Attr("quantization_scheme"); - const int32_t quantization_bit = ctx->Attr("quantization_bit"); - const bool per_layer_quantization = ctx->Attr("per_layer_quantization"); - const std::string quantization_formula = ctx->Attr("quantization_formula"); - - CHECK(quantization_scheme == "affine"); - - const int64_t elements = in->shape_view().elem_cnt(); - - constexpr int pack_size = cuda::elementwise::PackSize(); - int64_t pack_num = (elements + pack_size - 1) / pack_size; - int grid_size = 0; - cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - grid_size = grid_size > 2048 ? 
2048 : grid_size; - - size_t element_bytes = GetSizeOfDataType(GetDataType::value); - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), grid_size * element_bytes * 2); - - T* min_max = reinterpret_cast(tmp_buffer->mut_dptr()); - auto stream = ctx->stream()->As()->cuda_stream(); - if (per_layer_quantization) { - ReduceMinMaxPerTensor - <<>>(elements, in->dptr(), - min_max); - } else { - UNIMPLEMENTED() << "dynamic_quantization does not support per-channel quantization"; - } - - if (quantization_formula == "oneflow") { - if (quantization_bit == 8) { - ApplyDynamicQuantization( - stream, grid_size, min_max, elements, in->dptr(), quantization_bit, - out->mut_dptr(), scale->mut_dptr(), zero_point->mut_dptr()); - } else { - UNIMPLEMENTED(); - } - } else { - UNIMPLEMENTED() << "dynamic_quantization only support oneflow quantization formula"; - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_DYNAMIC_QUANTIZATION_KERNEL(dtype) \ - REGISTER_USER_KERNEL("dynamic_quantization") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 128 * 1024 * 1024; }) - -REGISTER_DYNAMIC_QUANTIZATION_KERNEL(float); -REGISTER_DYNAMIC_QUANTIZATION_KERNEL(double); -REGISTER_DYNAMIC_QUANTIZATION_KERNEL(half); - +} // namespace quantization } // namespace oneflow + +#endif // ONEFLOW_USER_KERNELS_QUANTIZATION_UTILS_H_ diff --git a/oneflow/user/ops/fused_layer_norm_min_max_observer_op.cpp b/oneflow/user/ops/fused_layer_norm_min_max_observer_op.cpp new file mode 100644 index 00000000000..8a4d504aff6 --- /dev/null +++ b/oneflow/user/ops/fused_layer_norm_min_max_observer_op.cpp @@ -0,0 +1,119 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/framework/op_generated.h" + +namespace oneflow { + +namespace { + +int64_t ShiftNegativeAxisIfNeed(const Shape& shape, int64_t axis) { + const int64_t shifted = axis < 0 ? 
axis + shape.NumAxes() : axis; + CHECK_GE(shifted, 0); + CHECK_LT(shifted, shape.NumAxes()); + return shifted; +} + +} // namespace + +/* static */ Maybe FusedLayerNormMinMaxObserverOp::InferLogicalTensorDesc( + user_op::InferContext* ctx) { + const user_op::TensorDesc& x = ctx->InputTensorDesc("x", 0); + const bool center = ctx->Attr("center"); + const bool scale = ctx->Attr("scale"); + const int64_t begin_params_axis = + ShiftNegativeAxisIfNeed(x.shape(), ctx->Attr("begin_params_axis")); + DimVector param_shape_dim_vec; + param_shape_dim_vec.insert(param_shape_dim_vec.end(), + x.shape().dim_vec().cbegin() + begin_params_axis, + x.shape().dim_vec().cend()); + const Shape param_shape(param_shape_dim_vec); + if (center) { + const user_op::TensorDesc& beta = ctx->InputTensorDesc("beta", 0); + CHECK_EQ_OR_RETURN(beta.shape(), param_shape); + } + if (scale) { + const user_op::TensorDesc& gamma = ctx->InputTensorDesc("gamma", 0); + CHECK_EQ_OR_RETURN(gamma.shape(), param_shape); + } + CHECK_OR_RETURN(ctx->Attr("per_layer_quantization")) + << "dynamic quantization only supports per-layer quantization"; + ctx->SetOutputShape("y", 0, x.shape()); + ctx->SetOutputShape("y_scale", 0, Shape({1})); + ctx->SetOutputShape("y_zero_point", 0, Shape({1})); + + return Maybe::Ok(); +} + +/*static*/ Maybe FusedLayerNormMinMaxObserverOp::InferPhysicalTensorDesc( + user_op::InferContext* ctx) { + return InferLogicalTensorDesc(ctx); +} + +/* static */ Maybe FusedLayerNormMinMaxObserverOp::GetSbp(user_op::SbpContext* ctx) { + // dynamic quantization only supports broadcast + return Maybe::Ok(); +} + +/* static */ Maybe FusedLayerNormMinMaxObserverOp::CheckAttr( + const user_op::UserOpDefWrapper& def, const user_op::UserOpConfWrapper& op_conf) { + int32_t quantization_bit = op_conf.attr("quantization_bit"); + CHECK_GT_OR_RETURN(quantization_bit, 1); + CHECK_LE_OR_RETURN(quantization_bit, 8); + + std::string quantization_scheme = op_conf.attr("quantization_scheme"); + CHECK_OR_RETURN(quantization_scheme == "symmetric" || quantization_scheme == "affine"); + + std::string quantization_formula = op_conf.attr("quantization_formula"); + CHECK_OR_RETURN(quantization_formula == "google" || quantization_formula == "cambricon" + || quantization_formula == "oneflow"); + return Maybe::Ok(); +} + +/* static */ Maybe FusedLayerNormMinMaxObserverOp::InferDataType(user_op::InferContext* ctx) { + const bool center = ctx->Attr("center"); + const user_op::TensorDesc& x = ctx->InputTensorDesc("x", 0); + if (center) { + const user_op::TensorDesc& beta = ctx->InputTensorDesc("beta", 0); + CHECK_EQ_OR_RETURN(beta.data_type(), x.data_type()) + << "InferDataType Failed. Expected " << DataType_Name(x.data_type()) << ", but got " + << DataType_Name(beta.data_type()); + } + const bool scale = ctx->Attr("scale"); + if (scale) { + const user_op::TensorDesc& gamma = ctx->InputTensorDesc("gamma", 0); + CHECK_EQ_OR_RETURN(gamma.data_type(), x.data_type()) + << "InferDataType Failed. 
Expected " << DataType_Name(x.data_type()) << ", but got " + << DataType_Name(gamma.data_type()); + } + ctx->SetOutputDType("y", 0, x.data_type()); + ctx->SetOutputDType("y_scale", 0, DataType::kFloat); + + int32_t quantization_bit = ctx->Attr("quantization_bit"); + const std::string& quantization_formula = ctx->Attr("quantization_formula"); + if (quantization_formula == "oneflow") { + if (quantization_bit == 8) { + ctx->SetOutputDType("y_zero_point", 0, DataType::kInt8); + } else { + OF_UNIMPLEMENTED(); + } + } else { + OF_UNIMPLEMENTED(); + } + return Maybe::Ok(); +} + +} // namespace oneflow From 54b3dd53c6a198e58b1bf26eba2105cdececd538 Mon Sep 17 00:00:00 2001 From: clackhan Date: Fri, 8 Sep 2023 10:12:40 +0000 Subject: [PATCH 56/65] add_grouped_matmul_quant --- oneflow/core/functional/functional_api.yaml | 16 ++ oneflow/core/functional/impl/nn_functor.cpp | 139 +++++++++++ oneflow/ir/include/OneFlow/OneFlowPasses.td | 9 + oneflow/ir/include/OneFlow/OneFlowUserOps.td | 29 +++ .../OneFlow/Transform/OutlineAndFuse.h | 1 + .../lib/OneFlow/Transform/OutlineAndFuse.cpp | 122 +++++++++ .../lib/OneFlow/MLIROneFlowTranslation.cpp | 4 + .../kernels/grouped_matmul_quant_kernel.cu | 176 +++++++++++++ oneflow/user/ops/grouped_matmul_quant_op.cpp | 233 ++++++++++++++++++ 9 files changed, 729 insertions(+) create mode 100644 oneflow/user/kernels/grouped_matmul_quant_kernel.cu create mode 100644 oneflow/user/ops/grouped_matmul_quant_op.cpp diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index adba9ce38e4..b2ab81c9b3f 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -1077,6 +1077,22 @@ ] bind_python: True +- name: "grouped_matmul_quant" + signature: + [ + 'Tensor (TensorTuple as, TensorTuple bs, TensorTuple scales, TensorTuple biases, + Bool transpose_a=False, Bool transpose_b=False, + Double alpha=1.0, DataType output_dtype=None) => GroupedMatmulQuant', + 'Tensor (TensorTuple as, TensorTuple bs, TensorTuple in_zero_points, TensorTuple in_scales, TensorTuple weight_scales, + TensorTuple weight_accs, Bool transpose_a=False, Bool transpose_b=False, + Double alpha=1.0, DataType output_dtype=None) => GroupedMatmulQuant', + 'Tensor (TensorTuple as, TensorTuple bs, TensorTuple in_zero_points, TensorTuple in_scales, TensorTuple weight_scales, + TensorTuple weight_accs, TensorTuple biases, + Bool transpose_a=False, Bool transpose_b=False, + Double alpha=1.0, DataType output_dtype=None) => GroupedMatmulQuant' + ] + bind_python: True + - name: "conv3d" signature: 'Tensor (Tensor input, Tensor weight, Tensor bias=None, Int32List[3] stride=1, diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 47231aba953..57cf97bed59 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -671,6 +671,143 @@ class MatMulQuantWithFilterScaleFunctor { std::shared_ptr matmul_scale_bias_op_; }; +class GroupedMatMulQuantFunctor { + public: + GroupedMatMulQuantFunctor() { + grouped_matmul_quant_scale_bias_op_.resize(kMaxInputCount /*the maximum number of inputs*/); + for (int n = 1; n < kMaxInputCount; ++n) { + grouped_matmul_quant_scale_bias_op_[n] = CHECK_JUST(one::OpBuilder("grouped_matmul_quant") + .Input("as", n) + .Input("bs") + .Input("scales", n) + .Input("biases", n) + .Output("out", n) + .Build()); + } + } + Maybe operator()(const TensorTuple& as, const TensorTuple& bs, + const TensorTuple& scales, 
const TensorTuple& biases, + const bool& transpose_a, const bool& transpose_b, + const double& alpha, + const Optional>& output_dtype) const { + CHECK_OR_RETURN(!transpose_a) + << "the first input should not be transposed for quantized matmul."; + CHECK_OR_RETURN(transpose_b) << "the second input should be transposed for quantized matmul."; + CHECK_EQ_OR_RETURN(alpha, 1) << "alpha should be 1 for quantized matmul."; + auto& attrs = + THREAD_CACHED_MUTABLE_ATTR_MAP("transpose_a", "transpose_b", "alpha", "out_dtype"); + attrs.SetAllAttrs(transpose_a, transpose_b, alpha, + output_dtype.value_or(DType::Float())->data_type()); + int input_size = as.size(); + TensorTuple input(4 * input_size); + std::copy(as.begin(), as.end(), input.begin() + 0 * input_size); + std::copy(bs.begin(), bs.end(), input.begin() + 1 * input_size); + std::copy(scales.begin(), scales.end(), input.begin() + 2 * input_size); + std::copy(biases.begin(), biases.end(), input.begin() + 3 * input_size); + return OpInterpUtil::Dispatch(*grouped_matmul_quant_scale_bias_op_[input_size], + input, attrs); + } + + private: + std::vector> grouped_matmul_quant_scale_bias_op_; +}; + +class GroupedMatMulQuantWithFilterScaleFunctor { + public: + GroupedMatMulQuantWithFilterScaleFunctor() { + grouped_matmul_quant_with_filter_bias_op_.resize(kMaxInputCount); + for (int n = 1; n < kMaxInputCount; ++n) { + grouped_matmul_quant_with_filter_bias_op_[n] = + CHECK_JUST(one::OpBuilder("grouped_matmul_quant") + .Input("as", n) + .Input("bs", n) + .Input("in_zero_points", n) + .Input("in_scales", n) + .Input("weight_scales", n) + .Input("weight_accs", n) + .Output("out", n) + .Build()); + } + } + Maybe operator()(const TensorTuple& as, const TensorTuple& bs, + const TensorTuple& in_zero_points, const TensorTuple& in_scales, + const TensorTuple& weight_scales, const TensorTuple& weight_accs, + const bool& transpose_a, const bool& transpose_b, + const double& alpha, + const Optional>& output_dtype) const { + CHECK_OR_RETURN(!transpose_a) + << "the first input should not be transposed for quantized matmul."; + CHECK_OR_RETURN(transpose_b) << "the second input should be transposed for quantized matmul."; + CHECK_EQ_OR_RETURN(alpha, 1) << "alpha should be 1 for quantized matmul."; + auto& attrs = + THREAD_CACHED_MUTABLE_ATTR_MAP("transpose_a", "transpose_b", "alpha", "out_dtype"); + attrs.SetAllAttrs(transpose_a, transpose_b, alpha, + output_dtype.value_or(DType::Float())->data_type()); + int input_size = as.size(); + TensorTuple input(6 * input_size); + std::copy(as.begin(), as.end(), input.begin() + 0 * input_size); + std::copy(bs.begin(), bs.end(), input.begin() + 1 * input_size); + std::copy(in_zero_points.begin(), in_zero_points.end(), input.begin() + 2 * input_size); + std::copy(in_scales.begin(), in_scales.end(), input.begin() + 3 * input_size); + std::copy(weight_scales.begin(), weight_scales.end(), input.begin() + 4 * input_size); + std::copy(weight_accs.begin(), weight_accs.end(), input.begin() + 5 * input_size); + return OpInterpUtil::Dispatch( + *grouped_matmul_quant_with_filter_bias_op_[input_size], input, attrs); + } + + private: + std::vector> grouped_matmul_quant_with_filter_bias_op_; +}; + +class GroupedMatMulBiasQuantWithFilterScaleFunctor { + public: + GroupedMatMulBiasQuantWithFilterScaleFunctor() { + grouped_matmul_bias_quant_with_filter_bias_op_.resize(kMaxInputCount); + for (int n = 1; n < kMaxInputCount; ++n) { + grouped_matmul_bias_quant_with_filter_bias_op_[n] = + CHECK_JUST(one::OpBuilder("grouped_matmul_quant") + .Input("as", 
n) + .Input("bs", n) + .Input("in_zero_points", n) + .Input("in_scales", n) + .Input("weight_scales", n) + .Input("weight_accs", n) + .Input("biases", n) + .Output("out", n) + .Build()); + } + } + Maybe operator()(const TensorTuple& as, const TensorTuple& bs, + const TensorTuple& in_zero_points, const TensorTuple& in_scales, + const TensorTuple& weight_scales, const TensorTuple& weight_accs, + const TensorTuple& biases, const bool& transpose_a, + const bool& transpose_b, const double& alpha, + const Optional>& output_dtype) const { + CHECK_OR_RETURN(!transpose_a) + << "the first input should not be transposed for quantized matmul."; + CHECK_OR_RETURN(transpose_b) << "the second input should be transposed for quantized matmul."; + CHECK_EQ_OR_RETURN(alpha, 1) << "alpha should be 1 for quantized matmul."; + auto& attrs = + THREAD_CACHED_MUTABLE_ATTR_MAP("transpose_a", "transpose_b", "alpha", "out_dtype"); + attrs.SetAllAttrs(transpose_a, transpose_b, alpha, + output_dtype.value_or(DType::Float())->data_type()); + int input_size = as.size(); + TensorTuple input(7 * input_size); + std::copy(as.begin(), as.end(), input.begin() + 0 * input_size); + std::copy(bs.begin(), bs.end(), input.begin() + 1 * input_size); + std::copy(in_zero_points.begin(), in_zero_points.end(), input.begin() + 2 * input_size); + std::copy(in_scales.begin(), in_scales.end(), input.begin() + 3 * input_size); + std::copy(weight_scales.begin(), weight_scales.end(), input.begin() + 4 * input_size); + std::copy(weight_accs.begin(), weight_accs.end(), input.begin() + 5 * input_size); + std::copy(biases.begin(), biases.end(), input.begin() + 6 * input_size); + return OpInterpUtil::Dispatch( + *grouped_matmul_bias_quant_with_filter_bias_op_[input_size], input, attrs); + } + + private: + std::vector> grouped_matmul_bias_quant_with_filter_bias_op_; +}; + class VectorMatrixProductFunctor { public: VectorMatrixProductFunctor() { @@ -5667,6 +5804,8 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("Embedding"); m.add_functor("MatMul"); m.add_functor("MatmulQuant"); + m.add_functor("GroupedMatmulQuant"); m.add_functor("MatMulNoBroadCast"); m.add_functor("BatchMatMul"); m.add_functor("MatrixVectorProduct"); diff --git a/oneflow/ir/include/OneFlow/OneFlowPasses.td b/oneflow/ir/include/OneFlow/OneFlowPasses.td index f0affaecf55..31295c3a97a 100644 --- a/oneflow/ir/include/OneFlow/OneFlowPasses.td +++ b/oneflow/ir/include/OneFlow/OneFlowPasses.td @@ -180,6 +180,15 @@ def GroupMatMul : Pass<"group-matmul", "ModuleOp"> { let dependentDialects = []; } +def GroupMatMulQuant : Pass<"group-matmul-quant", "ModuleOp"> { + let summary = "group matmul quant together"; + let description = [{ + group matmul ops together and use cutlass batched matmul + }]; + let constructor = "mlir::oneflow::createGroupMatMulQuant()"; + let dependentDialects = []; +} + def FuseForwardOps : Pass<"fuse-forward-only-ops", "ModuleOp"> { let summary = "fuse forward ops"; let description = [{ diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index af4d22d8c75..83393e61c78 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -5404,6 +5404,35 @@ def OneFlow_MatmulQuantOp : OneFlow_BaseOp<"matmul_quant", [NoMemoryEffect, Attr let has_compute_complexity_fn = 1; } +def OneFlow_GroupedMatmulQuantOp : OneFlow_BaseOp<"grouped_matmul_quant", [NoMemoryEffect, AttrSizedOperandSegments, DeclareOpInterfaceMethods]> { + let input = (ins + Variadic:$as, + Variadic:$bs, + 
Variadic:$in_zero_points, + Variadic:$in_scales, + Variadic:$weight_scales, + Variadic:$weight_accs, + Variadic:$scales, + Variadic:$biases, + Variadic:$_add_to_outputs + ); + let output = (outs + Variadic:$outputs + ); + let attrs = (ins + DefaultValuedAttr:$transpose_a, + DefaultValuedAttr:$transpose_b, + DefaultValuedAttr:$alpha, + OneFlow_DataType:$out_dtype, + DefaultValuedAttr:$tuning_cache + ); + let has_logical_tensor_desc_infer_fn = 1; + let has_physical_tensor_desc_infer_fn = 1; + let has_get_sbp_fn = 1; + let has_data_type_infer_fn = 1; + let has_compute_complexity_fn = 1; +} + def OneFlow_MatmulOp : OneFlow_BaseOp<"matmul", [NoMemoryEffect, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$a, diff --git a/oneflow/ir/include/OneFlow/Transform/OutlineAndFuse.h b/oneflow/ir/include/OneFlow/Transform/OutlineAndFuse.h index b0185b48e44..151a687df52 100644 --- a/oneflow/ir/include/OneFlow/Transform/OutlineAndFuse.h +++ b/oneflow/ir/include/OneFlow/Transform/OutlineAndFuse.h @@ -35,6 +35,7 @@ std::unique_ptr createWrapOpsToKernelLaunchPass(); std::unique_ptr createOutlineJitFunctionPass(); std::unique_ptr createFuseIntoExistingOpPass(); std::unique_ptr createGroupMatMul(); +std::unique_ptr createGroupMatMulQuant(); std::unique_ptr createFuseForwardOps(); std::unique_ptr createFuseOpsWithBackwardImpl(); std::unique_ptr createFuseNormalizationOps(); diff --git a/oneflow/ir/lib/OneFlow/Transform/OutlineAndFuse.cpp b/oneflow/ir/lib/OneFlow/Transform/OutlineAndFuse.cpp index ebcaf3e8572..31b7a2e2860 100644 --- a/oneflow/ir/lib/OneFlow/Transform/OutlineAndFuse.cpp +++ b/oneflow/ir/lib/OneFlow/Transform/OutlineAndFuse.cpp @@ -160,6 +160,126 @@ class GroupMatMulPass : public GroupMatMulBase { } }; +namespace { + +bool MatmulQuantOpHasInputScale(MatmulQuantOp op) { + return op.getODSOperands(3).empty() ? false : true; +} + +bool MatmulQuantOpHasScale(MatmulQuantOp op) { return op.getODSOperands(6).empty() ? false : true; } + +bool MatmulQuantOpHasBias(MatmulQuantOp op) { return op.getODSOperands(7).empty() ? 
false : true; }
+
+bool MatmulQuantOpHasAddToOutput(MatmulQuantOp op) { return !(op.getODSOperands(8).empty()); }
+
+}  // namespace
+
+struct GroupMatMulQuantPattern : public mlir::OpRewritePattern<MatmulQuantOp> {
+  explicit GroupMatMulQuantPattern(mlir::MLIRContext* context)
+      : OpRewritePattern<MatmulQuantOp>(context, /*benefit=*/1) {}
+  mlir::LogicalResult matchAndRewrite(MatmulQuantOp op,
+                                      mlir::PatternRewriter& rewriter) const override {
+    llvm::SmallVector<MatmulQuantOp, 4> all_matmuls{};
+    all_matmuls.push_back(op);
+    bool has_in_scale = MatmulQuantOpHasInputScale(op);
+    bool has_scale = MatmulQuantOpHasScale(op);
+    bool has_bias = MatmulQuantOpHasBias(op);
+    bool has_add_to_output = MatmulQuantOpHasAddToOutput(op);
+    for (auto xUser : op.getA().getUsers()) {
+      if (auto matmul_quant = dyn_cast<MatmulQuantOp>(xUser)) {
+        // skip `op` itself: it is also a user of its own input `a` and has
+        // already been collected above
+        if (matmul_quant.getOperation() == op.getOperation()) { continue; }
+        if (has_in_scale != MatmulQuantOpHasInputScale(matmul_quant)
+            || has_scale != MatmulQuantOpHasScale(matmul_quant)
+            || has_bias != MatmulQuantOpHasBias(matmul_quant)
+            || has_add_to_output != MatmulQuantOpHasAddToOutput(matmul_quant)) {
+          continue;
+        }
+        all_matmuls.push_back(matmul_quant);
+      }
+    }
+    // all_matmuls has only self, means no other matmul can be grouped
+    if (all_matmuls.size() == 1) { return failure(); }
+    int a_size = 0;
+    int b_size = 0;
+    int in_zero_size = 0;
+    int in_scale_size = 0;
+    int weight_scale_size = 0;
+    int weight_acc_size = 0;
+    int scale_size = 0;
+    int bias_size = 0;
+    int add_to_output_size = 0;
+
+    llvm::SmallVector<Value, 32> operands{};
+    for (auto matmul : all_matmuls) { operands.push_back(matmul.getA()); }
+    a_size = all_matmuls.size();
+    for (auto matmul : all_matmuls) { operands.push_back(matmul.getB()); }
+    b_size = all_matmuls.size();
+    if (has_in_scale) {
+      for (auto matmul : all_matmuls) { operands.push_back(matmul.getInZeroPoint()); }
+      for (auto matmul : all_matmuls) { operands.push_back(matmul.getInScale()); }
+      for (auto matmul : all_matmuls) { operands.push_back(matmul.getWeightScale()); }
+      for (auto matmul : all_matmuls) { operands.push_back(matmul.getWeightAcc()); }
+      in_zero_size = all_matmuls.size();
+      in_scale_size = all_matmuls.size();
+      weight_scale_size = all_matmuls.size();
+      weight_acc_size = all_matmuls.size();
+    }
+    if (has_scale) {
+      for (auto matmul : all_matmuls) { operands.push_back(matmul.getScale()); }
+      scale_size = all_matmuls.size();
+    }
+    if (has_bias) {
+      for (auto matmul : all_matmuls) { operands.push_back(matmul.getBias()); }
+      bias_size = all_matmuls.size();
+    }
+    if (has_add_to_output) {
+      for (auto matmul : all_matmuls) { operands.push_back(matmul.get_addToOutput()); }
+      add_to_output_size = all_matmuls.size();
+    }
+    llvm::SmallVector<Type, 4> results{};
+    for (auto matmul : all_matmuls) { results.push_back(matmul.getOut().getType()); }
+    NamedAttrList attributes{};
+    attributes.set(OpTrait::IsOpConfCompatible<void>::getDeviceTagAttr(),
+                   OpTrait::IsOpConfCompatible<void>::getDeviceTag(op));
+    attributes.set(OpTrait::IsOpConfCompatible<void>::getDeviceNameAttr(),
+                   OpTrait::IsOpConfCompatible<void>::getDeviceName(op));
+    attributes.set("transpose_a", op.getTransposeAAttr());
+    attributes.set("transpose_b", op.getTransposeBAttr());
+    attributes.set("alpha", op.getAlphaAttr());
+    attributes.set("out_dtype", op.getOutDtypeAttr());
+    attributes.set("tuning_cache", op.getTuningCacheAttr());
+    if (auto hierarchy = OpTrait::IsOpConfCompatible<void>::getHierarchy(op)) {
+      attributes.set(OpTrait::IsOpConfCompatible<void>::getHierarchyAttr(), hierarchy);
+    }
+    if (auto scope_symbol_id = OpTrait::IsOpConfCompatible<void>::getScopeSymbolID(op)) {
+      attributes.set(OpTrait::IsOpConfCompatible<void>::getScopeSymbolIDAttr(), scope_symbol_id);
+    }
+    attributes.set(OpTrait::AttrSizedOperandSegments<void>::getOperandSegmentSizeAttr(),
+                   rewriter.getDenseI32ArrayAttr({a_size, b_size, in_zero_size, in_scale_size,
+                                                  weight_scale_size, weight_acc_size, scale_size,
+                                                  bias_size, add_to_output_size}));
+    attributes.set(
+        OpTrait::IsOpConfCompatible<void>::getOpNameAttr(),
+        rewriter.getStringAttr("grouped_matmul_quant_"
+                               + OpTrait::IsOpConfCompatible<void>::getOpName(op).str()));
+    auto grouped_matmul_quant_op =
+        rewriter.create<GroupedMatmulQuantOp>(op->getLoc(), results, operands, attributes);
+    for (const auto& matmul : llvm::enumerate(all_matmuls)) {
+      matmul.value().getOut().replaceAllUsesWith(
+          grouped_matmul_quant_op.getOutputs()[matmul.index()]);
+    }
+    return success();
+  }
+};
+
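+// Editor's illustration (abridged; attributes omitted, op syntax simplified):
+// two matmul_quant ops that share %x and agree on their optional operands,
+//   %y0 = "oneflow.matmul_quant"(%x, %w0, %s0, %b0)
+//   %y1 = "oneflow.matmul_quant"(%x, %w1, %s1, %b1)
+// are rewritten into one variadic op whose operands are grouped slot by slot:
+//   %y0, %y1 = "oneflow.grouped_matmul_quant"(%x, %x, %w0, %w1, %s0, %s1, %b0, %b1)
+// The original ops become trivially dead and are removed by the greedy driver.
+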
+class GroupMatMulQuantPass : public GroupMatMulQuantBase<GroupMatMulQuantPass> {
+  void runOnOperation() override {
+    Operation* op = getOperation();
+    RewritePatternSet patterns(op->getContext());
+    patterns.add<GroupMatMulQuantPattern>(op->getContext());
+    (void)applyPatternsAndFoldGreedily(op, std::move(patterns));
+  }
+};
+
 struct GroupNormActivationPattern : public OpRewritePattern<GroupNormOp> {
   explicit GroupNormActivationPattern(MLIRContext* context)
       : OpRewritePattern<GroupNormOp>(context, /*benefit=*/1) {}
@@ -229,6 +349,8 @@ std::unique_ptr<Pass> createFuseIntoExistingOpPass() {
 
 std::unique_ptr<Pass> createGroupMatMul() { return std::make_unique<GroupMatMulPass>(); }
 
+std::unique_ptr<Pass> createGroupMatMulQuant() { return std::make_unique<GroupMatMulQuantPass>(); }
+
 std::unique_ptr<Pass> createFuseForwardOps() { return std::make_unique<FuseForwardOpsPass>(); }
 std::unique_ptr<Pass> createFuseOpsWithBackwardImpl() {
   return std::make_unique<FuseOpsWithBackwardImplPass>(
diff --git a/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp b/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp
index 233c33d6f02..225542e3e09 100644
--- a/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp
+++ b/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp
@@ -842,6 +842,10 @@ LogicalResult ApplyRoundTripPatterns(RoundTripOneFlowJobWrapperInterface& job_wr
     pm.addPass(oneflow::createConvertInferenceOpPass());
     pm.addPass(oneflow::createPostConvertInferenceOpPass());
   }
+  if (job_wrapper.IsLastIRPass()
+      && ::oneflow::ParseBooleanFromEnv("ONEFLOW_MLIR_GROUP_MATMUL_QUANT", false)) {
+    pm.addPass(oneflow::createGroupMatMulQuant());
+  }
   if (!job_wrapper.IsLastIRPass()
       && ::oneflow::ParseBooleanFromEnv("ONEFLOW_MLIR_FUSE_NORMALIZATION_OPS", false)) {
     pm.addPass(oneflow::createFuseNormalizationOps());
diff --git a/oneflow/user/kernels/grouped_matmul_quant_kernel.cu b/oneflow/user/kernels/grouped_matmul_quant_kernel.cu
new file mode 100644
index 00000000000..59711b90583
--- /dev/null
+++ b/oneflow/user/kernels/grouped_matmul_quant_kernel.cu
@@ -0,0 +1,176 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/kernel/cuda_graph_support.h"
+#include "oneflow/core/cuda/elementwise.cuh"
+#include "oneflow/core/cuda/atomic.cuh"
+#include "oneflow/core/device/cuda_util.h"
+#include "oneflow/core/common/scalar.h"
+
+namespace oneflow {
+
+struct GemmProblem {
+  GemmProblem(int64_t m, int64_t n, int64_t k) : m(m), n(n), k(k) {}
+  int64_t m;
+  int64_t n;
+  int64_t k;
+};
+
+inline bool operator==(const GemmProblem& lhs, const GemmProblem& rhs) {
+  return lhs.m == rhs.m && lhs.n == rhs.n && lhs.k == rhs.k;
+}
+
+}  // namespace oneflow
+
+namespace std {
+
+template<>
+struct hash<oneflow::GemmProblem> {
+  std::size_t operator()(const oneflow::GemmProblem& p) const {
+    return oneflow::Hash(p.m, p.n, p.k);
+  }
+};
+
+}  // namespace std
+
+namespace oneflow {
+
+namespace {
+
+constexpr int64_t kMaxProblemBatch = 64;
+
+template<typename T>
+struct Buffer {
+  const int8_t* a;
+  const int8_t* b;
+  const int8_t* in_zero_point;
+  const float* in_scale;
+  const T* weight_scale;
+  const T* weight_acc;
+  const T* scale;
+  const T* bias;
+  const T* _add_to_output;
+  T* output;
+};
+
+template<typename T>
+struct Param {
+  Param(const GemmProblem& problem, std::vector<Buffer<T>> buffers)
+      : problem(problem), batch_size(buffers.size()) {
+    std::copy(buffers.cbegin(), buffers.cend(), buffer);
+    elem_cnt = batch_size * problem.m * problem.n;
+  }
+  GemmProblem problem;
+  Buffer<T> buffer[kMaxProblemBatch];
+  int batch_size;
+  int elem_cnt;
+};
+
+template<typename T>
+void ApplyGroup(const GemmProblem& problem, std::vector<Buffer<T>> ptrs,
+                user_op::Tensor* tmp_buffer, ep::Stream* stream) {
+  Param<T> params(problem, ptrs);
+}
+
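+// Editor's sketch (assumption, not taken from the patch): the Buffer fields
+// above carry everything a fused int8 dequantization epilogue would need:
+//   acc[m][n] = sum_k int8(a[m][k]) * int8(b[n][k])              // int32 GEMM
+//   out[m][n] = (acc[m][n] - in_zero_point * weight_acc[n])
+//               * in_scale * weight_scale[n] + bias[n]
+// with weight_acc[n] = sum_k b[n][k] precomputed, which would explain why the
+// op takes it as an input. ApplyGroup is still a stub at this point.
+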
+template<typename OutType>
+class GroupedMatmulQuantKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport {
+ public:
+  GroupedMatmulQuantKernel() = default;
+  ~GroupedMatmulQuantKernel() override = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
+               const user_op::OpKernelCache* cache) const override {
+    HashMap<GemmProblem, std::vector<Buffer<OutType>>> groups;
+    const int32_t input_size = ctx->input_size("as");
+    CHECK_EQ(ctx->input_size("bs"), input_size);
+    const bool has_in_zero_points = ctx->has_input("in_zero_points", 0);
+    const bool has_scales = ctx->has_input("scales", 0);
+    const bool has_biases = ctx->has_input("biases", 0);
+    const bool has_add_to_outputs = ctx->has_input("_add_to_outputs", 0);
+
+    for (int32_t i = 0; i < input_size; ++i) {
+      const user_op::Tensor* a = ctx->Tensor4ArgNameAndIndex("as", i);
+      const user_op::Tensor* b = ctx->Tensor4ArgNameAndIndex("bs", i);
+      const user_op::Tensor* in_zero_point = ctx->Tensor4ArgNameAndIndex("in_zero_points", i);
+      const user_op::Tensor* in_scale = ctx->Tensor4ArgNameAndIndex("in_scales", i);
+      const user_op::Tensor* weight_scale = ctx->Tensor4ArgNameAndIndex("weight_scales", i);
+      const user_op::Tensor* weight_acc = ctx->Tensor4ArgNameAndIndex("weight_accs", i);
+      const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scales", i);
+      const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("biases", i);
+      const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_outputs", i);
+      user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("outputs", i);
+
+      CHECK_GE(a->shape_view().NumAxes(), 2);
+      const int64_t k = a->shape_view().At(a->shape_view().NumAxes() - 1);
+      const int64_t m = a->shape_view().elem_cnt() / k;
+      const int64_t n = b->shape_view().At(0);
+
+      CHECK_EQ(output->shape_view().NumAxes(), a->shape_view().NumAxes());
+      CHECK_EQ(output->shape_view().At(output->shape_view().NumAxes() - 1), n);
+      for (int32_t j = 0; j < output->shape_view().NumAxes() - 1; ++j) {
+        CHECK_EQ(output->shape_view().At(j), a->shape_view().At(j));
+      }
+      const int8_t* a_ptr = a->dptr<int8_t>();
+      const int8_t* b_ptr = b->dptr<int8_t>();
+      const int8_t* in_zero_point_ptr =
+          has_in_zero_points ? in_zero_point->dptr<int8_t>() : nullptr;
+      const float* in_scale_ptr = has_in_zero_points ? in_scale->dptr<float>() : nullptr;
+      const OutType* weight_scale_ptr =
+          has_in_zero_points ? weight_scale->dptr<OutType>() : nullptr;
+      const OutType* weight_acc_ptr = has_in_zero_points ? weight_acc->dptr<OutType>() : nullptr;
+      const OutType* scale_ptr = has_scales ? scale->dptr<OutType>() : nullptr;
+      const OutType* bias_ptr = has_biases ? bias->dptr<OutType>() : nullptr;
+      const OutType* add_to_output_ptr =
+          has_add_to_outputs ? add_to_output->dptr<OutType>() : nullptr;
+      OutType* output_ptr = output->mut_dptr<OutType>();
+
+      groups[GemmProblem(m, n, k)].push_back(
+          Buffer<OutType>{a_ptr, b_ptr, in_zero_point_ptr, in_scale_ptr, weight_scale_ptr,
+                          weight_acc_ptr, scale_ptr, bias_ptr, add_to_output_ptr, output_ptr});
+    }
+    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
+    for (const auto& group : groups) {
+      for (size_t i = 0; i < group.second.size(); i += kMaxProblemBatch) {
+        std::vector<Buffer<OutType>> ptrs(
+            {group.second.begin() + i,
+             group.second.begin() + i
+                 + std::min<size_t>(group.second.size() - i, kMaxProblemBatch)});
+        ApplyGroup(group.first, ptrs, tmp_buffer, ctx->stream());
+      }
+    }
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+#define REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(out_cpp_type, out_data_type)     \
+  REGISTER_USER_KERNEL("grouped_matmul_quant")                                   \
+      .SetCreateFn<GroupedMatmulQuantKernel<out_cpp_type>>()                     \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)           \
+                       && (user_op::HobDataType("as", 0) == DataType::kInt8)     \
+                       && (user_op::HobDataType("bs", 0) == DataType::kInt8)     \
+                       && (user_op::HobDataType("outputs", 0) == out_data_type)) \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {              \
+        return kMaxProblemBatch * 7 * sizeof(void*) + 3 * 1024 * 1024;           \
+      });
+
+REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(half, DataType::kFloat16)
+REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(float, DataType::kFloat)
+
+}  // namespace
+
+}  // namespace oneflow
diff --git a/oneflow/user/ops/grouped_matmul_quant_op.cpp b/oneflow/user/ops/grouped_matmul_quant_op.cpp
new file mode 100644
index 00000000000..06a8ef0ec3c
--- /dev/null
+++ b/oneflow/user/ops/grouped_matmul_quant_op.cpp
@@ -0,0 +1,233 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/core/common/data_type.pb.h"
+#include "oneflow/core/common/just.h"
+#include "oneflow/core/common/maybe.h"
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/framework/infer_util.h"
+#include "oneflow/core/framework/op_generated.h"
+
+namespace oneflow {
+
+namespace {
+
+Maybe<double> GetComputationCost(user_op::ComputeComplexityFnContext* ctx) {
+  bool transpose_b = ctx->Attr<bool>("transpose_b");
+  const Shape& shape_b = ctx->Shape4ArgNameAndIndex("bs", 0);
+  int64_t n = 0;
+  if (!transpose_b) {
+    n = shape_b.At(shape_b.NumAxes() - 1);
+  } else {
+    n = shape_b.At(shape_b.NumAxes() - 2);
+  }
+
+  double logical_computation_cost = 2 * ctx->Shape4ArgNameAndIndex("as", 0).elem_cnt() * n;
+  const auto& nd_sbp_a = ctx->NdSbp4ArgNameAndIndex("as", 0);
+  const auto& nd_sbp_b = ctx->NdSbp4ArgNameAndIndex("bs", 0);
+  const auto& parallel_hierarchy = ctx->parallel_desc().hierarchy();
+  for (int32_t sbp_dim = 0; sbp_dim < nd_sbp_a.sbp_parallel_size(); sbp_dim++) {
+    if (nd_sbp_a.sbp_parallel(sbp_dim).has_split_parallel()
+        || nd_sbp_b.sbp_parallel(sbp_dim).has_split_parallel()) {
+      logical_computation_cost /= parallel_hierarchy->At(sbp_dim);
+    }
+  }
+  return logical_computation_cost;
+}
+
+}  // namespace
+
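+// Worked example (editor's note): with as[0] of shape (M, K) = (1024, 768),
+// bs[0] of shape (N, K) = (3072, 768) and transpose_b = true, the estimate is
+// 2 * M * K * N = 2 * 1024 * 768 * 3072 FLOPs, then halved once for every
+// hierarchy dimension on which either operand is split.
+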
+/* static */ Maybe<void> GroupedMatmulQuantOp::InferLogicalTensorDesc(user_op::InferContext* ctx) {
+  bool transpose_a = ctx->Attr<bool>("transpose_a");
+  bool transpose_b = ctx->Attr<bool>("transpose_b");
+  CHECK_EQ_OR_RETURN(transpose_a, false);
+  CHECK_EQ_OR_RETURN(transpose_b, true);
+
+  const int64_t input_size = ctx->input_size("as");
+  CHECK_EQ_OR_RETURN(ctx->input_size("bs"), input_size);
+  const bool has_scales = ctx->has_input("scales", 0);
+  const bool has_biases = ctx->has_input("biases", 0);
+  const bool has_in_zero_points = ctx->has_input("in_zero_points", 0);
+  const bool has_add_to_outputs = ctx->has_input("_add_to_outputs", 0);
+  if (has_in_zero_points) {
+    CHECK_EQ_OR_RETURN(has_scales, false);
+    CHECK_EQ_OR_RETURN(ctx->input_size("in_zero_points"), input_size);
+    CHECK_OR_RETURN(ctx->has_input("in_scales", 0));
+    CHECK_EQ_OR_RETURN(ctx->input_size("in_scales"), input_size);
+    CHECK_OR_RETURN(ctx->has_input("weight_scales", 0));
+    CHECK_EQ_OR_RETURN(ctx->input_size("weight_scales"), input_size);
+    CHECK_OR_RETURN(ctx->has_input("weight_accs", 0));
+    CHECK_EQ_OR_RETURN(ctx->input_size("weight_accs"), input_size);
+    if (has_biases) { CHECK_EQ_OR_RETURN(ctx->input_size("biases"), input_size); }
+  }
+  if (has_scales) {
+    CHECK_EQ_OR_RETURN(ctx->input_size("scales"), input_size);
+    CHECK_EQ_OR_RETURN(has_biases, true);
+    CHECK_EQ_OR_RETURN(ctx->input_size("biases"), input_size);
+  }
+  if (has_add_to_outputs) { CHECK_EQ_OR_RETURN(ctx->input_size("_add_to_outputs"), input_size); }
+  CHECK_EQ_OR_RETURN(ctx->output_size("outputs"), input_size);
+
+  const DataType weight_data_type = ctx->InputTensorDesc("bs", 0).data_type();
+  const DataType out_data_type = ctx->Attr<DataType>("out_dtype");
+  for (int64_t i = 0; i < input_size; ++i) {
+    const user_op::TensorDesc& x_desc = ctx->InputTensorDesc("as", i);
+    CHECK_EQ_OR_RETURN(x_desc.data_type(), weight_data_type);
+    CHECK_GE_OR_RETURN(x_desc.shape().NumAxes(), 2);
+    const int64_t k = x_desc.shape().At(x_desc.shape().NumAxes() - 1);
+    const user_op::TensorDesc& weight_desc = ctx->InputTensorDesc("bs", i);
+    CHECK_EQ_OR_RETURN(weight_desc.data_type(), weight_data_type);
+    CHECK_EQ_OR_RETURN(weight_desc.shape().NumAxes(), 2);
+    CHECK_EQ_OR_RETURN(weight_desc.shape().At(1), k);
+    const int64_t n = weight_desc.shape().At(0);
+    if (has_in_zero_points) {
+      const user_op::TensorDesc& in_zero_point = ctx->InputTensorDesc("in_zero_points", i);
+      CHECK_EQ_OR_RETURN(in_zero_point.data_type(), weight_data_type);
+      CHECK_EQ_OR_RETURN(in_zero_point.shape().Count(0), 1);
+      const user_op::TensorDesc& in_scale = ctx->InputTensorDesc("in_scales", i);
+      CHECK_EQ_OR_RETURN(in_scale.data_type(), DataType::kFloat);
+      CHECK_EQ_OR_RETURN(in_scale.shape().Count(0), 1);
+      const user_op::TensorDesc& weight_scale = ctx->InputTensorDesc("weight_scales", i);
+      CHECK_EQ_OR_RETURN(weight_scale.data_type(), out_data_type);
+      CHECK_EQ_OR_RETURN(weight_scale.shape(), Shape({n}));
+      const user_op::TensorDesc& weight_acc = ctx->InputTensorDesc("weight_accs", i);
+      CHECK_EQ_OR_RETURN(weight_acc.data_type(), out_data_type);
+      CHECK_EQ_OR_RETURN(weight_acc.shape(), Shape({n}));
+      if (has_biases) {
+        const user_op::TensorDesc& bias = ctx->InputTensorDesc("biases", i);
+        CHECK_EQ_OR_RETURN(bias.data_type(), out_data_type);
+        CHECK_EQ_OR_RETURN(bias.shape(), Shape({n}));
+      }
+    }
+    if (has_scales) {
+      CHECK_OR_RETURN(ctx->has_input("biases", i));
+      const user_op::TensorDesc& scale = ctx->InputTensorDesc("scales", i);
+      CHECK_EQ_OR_RETURN(scale.data_type(), out_data_type);
+      CHECK_EQ_OR_RETURN(scale.shape(), Shape({n}));
+      const user_op::TensorDesc& bias = ctx->InputTensorDesc("biases", i);
+      CHECK_EQ_OR_RETURN(bias.shape(), Shape({n}));
+      CHECK_EQ_OR_RETURN(bias.data_type(), out_data_type);
+    }
+    user_op::TensorDesc* y_desc = ctx->MutOutputTensorDesc("outputs", i);
+    y_desc->set_data_type(out_data_type);
+    DimVector out_dim_vec = x_desc.shape().dim_vec();
+    out_dim_vec.back() = n;
+    y_desc->set_shape(Shape(out_dim_vec));
+    if (has_add_to_outputs) {
+      const auto& add_to_output = ctx->InputTensorDesc("_add_to_outputs", i);
+      CHECK_EQ_OR_RETURN(add_to_output.data_type(), out_data_type);
+      CHECK_EQ_OR_RETURN(add_to_output.shape(), y_desc->shape());
+    }
+  }
+  return Maybe<void>::Ok();
+}
+
+/*static*/ Maybe<void> GroupedMatmulQuantOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) {
+  return InferLogicalTensorDesc(ctx);
+}
+
+/* static */ Maybe<void> GroupedMatmulQuantOp::GetSbp(user_op::SbpContext* ctx) {
+  {
+    // s0 x b
+    auto builder = ctx->NewBuilder();
+    for (int64_t i = 0; i < ctx->user_op_conf().input_size("as"); ++i) {
+      builder.Split(user_op::OpArg("as", i), 0);
+    }
+    for (int i = 0; i < ctx->user_op_conf().input_size("bs"); ++i) {
+      builder.Broadcast(user_op::OpArg("bs", i));
+    }
+    for (int i = 0; i < ctx->user_op_conf().input_size("in_zero_points"); ++i) {
+      builder.Broadcast(user_op::OpArg("in_zero_points", i));
+    }
+    for (int i = 0; i < ctx->user_op_conf().input_size("in_scales"); ++i) {
+      builder.Broadcast(user_op::OpArg("in_scales", i));
+    }
+    for (int i = 0; i < ctx->user_op_conf().input_size("weight_scales"); ++i) {
+      builder.Broadcast(user_op::OpArg("weight_scales", i));
+    }
+    for (int i = 0; i < ctx->user_op_conf().input_size("weight_accs"); ++i) {
+      builder.Broadcast(user_op::OpArg("weight_accs", i));
+    }
+    for (int i = 0; i < ctx->user_op_conf().input_size("scales"); ++i) {
+      builder.Broadcast(user_op::OpArg("scales", i));
+    }
+    for (int i = 0; i < ctx->user_op_conf().input_size("biases"); ++i) {
+      builder.Broadcast(user_op::OpArg("biases", i));
+    }
+    for (int i = 0; i < ctx->user_op_conf().input_size("_add_to_outputs"); ++i) {
+      builder.Split(user_op::OpArg("_add_to_outputs", i), 0);
+    }
+    for (int i = 0; i < ctx->user_op_conf().output_size("outputs"); ++i) {
+      builder.Split(user_op::OpArg("outputs", i), 0);
+    }
+    builder.Build();
+  }
+
+  {
+    // b x 
s0 + auto builder = ctx->NewBuilder(); + for (int64_t i = 0; i < ctx->user_op_conf().input_size("as"); ++i) { + builder.Broadcast(user_op::OpArg("as", i)); + } + for (int i = 0; i < ctx->user_op_conf().input_size("bs"); ++i) { + builder.Split(user_op::OpArg("bs", i), 0); + } + for (int i = 0; i < ctx->user_op_conf().input_size("in_zero_points"); ++i) { + builder.Broadcast(user_op::OpArg("in_zero_points", i)); + } + for (int i = 0; i < ctx->user_op_conf().input_size("in_scales"); ++i) { + builder.Broadcast(user_op::OpArg("in_scales", i)); + } + for (int i = 0; i < ctx->user_op_conf().input_size("weight_scales"); ++i) { + builder.Split(user_op::OpArg("weight_scales", i), 0); + } + for (int i = 0; i < ctx->user_op_conf().input_size("weight_accs"); ++i) { + builder.Split(user_op::OpArg("weight_accs", i), 0); + } + for (int i = 0; i < ctx->user_op_conf().input_size("scales"); ++i) { + builder.Split(user_op::OpArg("scales", i), 0); + } + for (int i = 0; i < ctx->user_op_conf().input_size("biases"); ++i) { + builder.Split(user_op::OpArg("biases", i), 0); + } + for (int i = 0; i < ctx->user_op_conf().input_size("_add_to_outputs"); ++i) { + builder.Split(user_op::OpArg("_add_to_outputs", i), + ctx->LogicalTensorDesc4InputArgNameAndIndex("as", i).shape().NumAxes() - 1); + } + for (int i = 0; i < ctx->user_op_conf().output_size("outputs"); ++i) { + builder.Split(user_op::OpArg("outputs", i), + ctx->LogicalTensorDesc4InputArgNameAndIndex("as", i).shape().NumAxes() - 1); + } + builder.Build(); + } + + return Maybe::Ok(); +} + +/* static */ Maybe GroupedMatmulQuantOp::InferDataType(user_op::InferContext* ctx) { + const DataType out_data_type = ctx->Attr("out_dtype"); + for (int32_t i = 0; i < ctx->output_size("outputs"); i++) { + user_op::TensorDesc* y_desc = ctx->MutOutputTensorDesc("outputs", i); + y_desc->set_data_type(out_data_type); + } + return Maybe::Ok(); +} + +/*static*/ Maybe GroupedMatmulQuantOp::GetComputeComplexity( + user_op::ComputeComplexityFnContext* ctx) { + return GetComputationCost(ctx); +} + +} // namespace oneflow From 66f50e554ecacd51c23a4e45645bbc02c0ac1f97 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Fri, 8 Sep 2023 14:22:42 +0000 Subject: [PATCH 57/65] fix --- .../core/cuda/layer_norm_min_max_observer.cuh | 212 ++++++++---------- ..._layer_norm_min_max_observer_gpu_kernel.cu | 3 +- 2 files changed, 95 insertions(+), 120 deletions(-) diff --git a/oneflow/core/cuda/layer_norm_min_max_observer.cuh b/oneflow/core/cuda/layer_norm_min_max_observer.cuh index 5a83bfbffdd..5b99707db8c 100644 --- a/oneflow/core/cuda/layer_norm_min_max_observer.cuh +++ b/oneflow/core/cuda/layer_norm_min_max_observer.cuh @@ -32,134 +32,74 @@ namespace cuda { namespace layer_norm { template -inline __device__ void WelfordMinMaxCombine(T val, T* mean, T* m2, T* min, T* max, T* count) { - // Use Welford Online algorithem to compute mean and variance - // For more details you can refer to: - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm - *count += 1; - T delta1 = val - *mean; - *mean += Div(delta1, *count); - T delta2 = val - *mean; - *m2 += delta1 * delta2; +inline __device__ void MinMaxCombine(T val, T* min, T* max) { *min = BinaryFuncMin::Invoke(val, *min); *max = BinaryFuncMax::Invoke(val, *max); } template -inline __device__ void WelfordMinMaxCombine(T b_mean, T b_m2, T b_min, T b_max, T b_count, T* mean, - T* m2, T* min, T* max, T* count) { - if (b_count == 0) { return; } - T new_count = *count + b_count; - T nb_over_n = Div(b_count, new_count); - T 
delta = b_mean - *mean; - *mean += delta * nb_over_n; - *m2 += b_m2 + delta * delta * (*count) * nb_over_n; - *count = new_count; +inline __device__ void MinMaxCombine(T b_min, T b_max, T* min, T* max) { *min = BinaryFuncMin::Invoke(b_min, *min); *max = BinaryFuncMax::Invoke(b_max, *max); } template -__inline__ __device__ void WelfordMinMaxWarpReduce(T thread_mean, T thread_m2, T thread_min, - T thread_max, T thread_count, T* mean, T* m2, - T* min, T* max, T* count) { - *mean = thread_mean; - *m2 = thread_m2; - *count = thread_count; +__inline__ __device__ void MinMaxWarpReduce(T thread_min, T thread_max, T* min, T* max) { *min = thread_min; *max = thread_max; for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { - T b_mean = __shfl_down_sync(0xffffffff, *mean, mask, thread_group_width); - T b_m2 = __shfl_down_sync(0xffffffff, *m2, mask, thread_group_width); T b_min = __shfl_down_sync(0xffffffff, *min, mask, thread_group_width); T b_max = __shfl_down_sync(0xffffffff, *max, mask, thread_group_width); - T b_count = __shfl_down_sync(0xffffffff, *count, mask, thread_group_width); - WelfordMinMaxCombine(b_mean, b_m2, b_min, b_max, b_count, mean, m2, min, max, count); + MinMaxCombine(b_min, b_max, min, max); } } template -__inline__ __device__ void WelfordMinMaxWarpAllReduce(T thread_mean, T thread_m2, T thread_min, - T thread_max, T thread_count, T* mean, T* m2, - T* min, T* max, T* count) { - WelfordMinMaxWarpReduce(thread_mean, thread_m2, thread_min, thread_max, - thread_count, mean, m2, min, max, count); - *mean = __shfl_sync(0xffffffff, *mean, 0, thread_group_width); - *m2 = __shfl_sync(0xffffffff, *m2, 0, thread_group_width); +__inline__ __device__ void MinMaxWarpAllReduce(T thread_min, T thread_max, T* min, T* max) { + MinMaxWarpReduce(thread_min, thread_max, min, max); *min = __shfl_sync(0xffffffff, *min, 0, thread_group_width); *max = __shfl_sync(0xffffffff, *max, 0, thread_group_width); - *count = __shfl_sync(0xffffffff, *count, 0, thread_group_width); } template -__inline__ __device__ void WelfordMinMaxBlockAllReduce(T thread_mean, T thread_m2, T thread_min, - T thread_max, T thread_count, T* result_mean, - T* result_m2, T* result_min, T* result_max, - T* result_count) { - __shared__ T mean_shared[kWarpSize]; - __shared__ T m2_shared[kWarpSize]; +__inline__ __device__ void MinMaxBlockAllReduce(T thread_min, T thread_max, T* result_min, + T* result_max) { __shared__ T min_shared[kWarpSize]; __shared__ T max_shared[kWarpSize]; - __shared__ T count_shared[kWarpSize]; - __shared__ T mean_result_broadcast; - __shared__ T m2_result_broadcast; __shared__ T min_result_broadcast; __shared__ T max_result_broadcast; - __shared__ T count_result_broadcast; const int lid = threadIdx.x % kWarpSize; const int wid = threadIdx.x / kWarpSize; - T warp_mean = 0; - T warp_m2 = 0; T warp_min = detail::numeric_limits::max(); T warp_max = detail::numeric_limits::lowest(); - T warp_count = 0; - WelfordMinMaxWarpReduce(thread_mean, thread_m2, thread_min, thread_max, thread_count, &warp_mean, - &warp_m2, &warp_min, &warp_max, &warp_count); + MinMaxWarpReduce(thread_min, thread_max, &warp_min, &warp_max); + __syncthreads(); if (lid == 0) { - mean_shared[wid] = warp_mean; - m2_shared[wid] = warp_m2; min_shared[wid] = warp_min; max_shared[wid] = warp_max; - count_shared[wid] = warp_count; } __syncthreads(); if (wid == 0) { if (threadIdx.x < blockDim.x / kWarpSize) { - warp_mean = mean_shared[lid]; - warp_m2 = m2_shared[lid]; warp_min = min_shared[lid]; warp_max = max_shared[lid]; - warp_count = 
count_shared[lid]; } else { - warp_mean = static_cast(0); - warp_m2 = static_cast(0); warp_min = detail::numeric_limits::max(); warp_max = detail::numeric_limits::lowest(); - warp_count = static_cast(0); } __syncwarp(); - T block_mean = 0; - T block_m2 = 0; T block_min = detail::numeric_limits::max(); T block_max = detail::numeric_limits::lowest(); - T block_count = 0; - WelfordMinMaxWarpReduce(warp_mean, warp_m2, warp_min, warp_max, warp_count, &block_mean, - &block_m2, &block_min, &block_max, &block_count); + MinMaxWarpReduce(warp_min, warp_max, &block_min, &block_max); if (lid == 0) { - mean_result_broadcast = block_mean; - m2_result_broadcast = block_m2; min_result_broadcast = block_min; max_result_broadcast = block_max; - count_result_broadcast = block_count; } } __syncthreads(); - *result_mean = mean_result_broadcast; - *result_m2 = m2_result_broadcast; *result_min = min_result_broadcast; *result_max = max_result_broadcast; - *result_count = count_result_broadcast; } template::max(); - thread_max[row_id] = detail::numeric_limits::lowest(); thread_count[row_id] = 0; ComputeType* row_buf = buf[row_id]; #pragma unroll @@ -204,8 +140,8 @@ __global__ void LayerNormMinMaxObserverWarpImpl(LOAD load, STORE store, const in #pragma unroll for (int i = 0; i < pack_size; ++i) { row_buf[pack_offset + i] = static_cast(pack[i]); - WelfordMinMaxCombine(row_buf[pack_offset + i], thread_mean + row_id, thread_m2 + row_id, - thread_min + row_id, thread_max + row_id, thread_count + row_id); + WelfordCombine(row_buf[pack_offset + i], thread_mean + row_id, thread_m2 + row_id, + thread_count + row_id); } } for (int pack_id = min_num_packs; pack_id < max_num_packs; ++pack_id) { @@ -217,8 +153,8 @@ __global__ void LayerNormMinMaxObserverWarpImpl(LOAD load, STORE store, const in #pragma unroll for (int i = 0; i < pack_size; ++i) { row_buf[pack_offset + i] = static_cast(pack[i]); - WelfordMinMaxCombine(row_buf[pack_offset + i], thread_mean + row_id, thread_m2 + row_id, - thread_min + row_id, thread_max + row_id, thread_count + row_id); + WelfordCombine(row_buf[pack_offset + i], thread_mean + row_id, thread_m2 + row_id, + thread_count + row_id); } } else { #pragma unroll @@ -228,25 +164,27 @@ __global__ void LayerNormMinMaxObserverWarpImpl(LOAD load, STORE store, const in } ComputeType warp_mean[rows_per_access]; ComputeType warp_m2[rows_per_access]; - ComputeType warp_min[rows_per_access]; - ComputeType warp_max[rows_per_access]; ComputeType warp_count[rows_per_access]; + + ComputeType thread_min[rows_per_access]; + ComputeType thread_max[rows_per_access]; #pragma unroll for (int row_id = 0; row_id < rows_per_access; ++row_id) { + thread_min[row_id] = detail::numeric_limits::max(); + thread_max[row_id] = detail::numeric_limits::lowest(); int global_row_id = row + row_id; ComputeType* row_buf = buf[row_id]; - WelfordMinMaxWarpAllReduce( - thread_mean[row_id], thread_m2[row_id], thread_min[row_id], thread_max[row_id], - thread_count[row_id], warp_mean + row_id, warp_m2 + row_id, warp_min + row_id, - warp_max + row_id, warp_count + row_id); + WelfordWarpAllReduce( + thread_mean[row_id], thread_m2[row_id], thread_count[row_id], warp_mean + row_id, + warp_m2 + row_id, warp_count + row_id); ComputeType row_mean = warp_mean[row_id]; ComputeType row_variance = max(Div(warp_m2[row_id], warp_count[row_id]), static_cast(0.0)); ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); - if (lane_id == 0) { - min_max[global_row_id << 1] = (warp_min[row_id] - row_mean) * row_inv_var; - min_max[(global_row_id << 
1) + 1] = (warp_max[row_id] - row_mean) * row_inv_var; - } + // if (lane_id == 0) { + // mean[global_row_id] = row_mean; + // inv_variance[global_row_id] = row_inv_var; + // } #pragma unroll for (int i = 0; i < max_cols_per_thread; ++i) { row_buf[i] = (row_buf[i] - row_mean) * row_inv_var; @@ -254,16 +192,39 @@ __global__ void LayerNormMinMaxObserverWarpImpl(LOAD load, STORE store, const in #pragma unroll for (int i = 0; i < min_num_packs; ++i) { const int col = (i * thread_group_width + lane_id) * pack_size; - store.template store(row_buf + i * pack_size, global_row_id, col); + size_t pack_offset = i * pack_size; + store.template store(row_buf + pack_offset, global_row_id, col, + row_buf + pack_offset); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + MinMaxCombine(row_buf[pack_offset + i], thread_min + row_id, thread_max + row_id); + } } #pragma unroll for (int i = min_num_packs; i < max_num_packs; ++i) { const int col = (i * thread_group_width + lane_id) * pack_size; if (!padding || col < cols) { - store.template store(row_buf + i * pack_size, global_row_id, col); + size_t pack_offset = i * pack_size; + store.template store(row_buf + pack_offset, global_row_id, col, + row_buf + pack_offset); + for (int i = 0; i < pack_size; ++i) { + MinMaxCombine(row_buf[pack_offset + i], thread_min + row_id, thread_max + row_id); + } } } } + ComputeType warp_min[rows_per_access]; + ComputeType warp_max[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + MinMaxWarpAllReduce(thread_min[row_id], thread_max[row_id], + warp_min + row_id, warp_max + row_id); + int global_row_id = row + row_id; + if (lane_id == 0) { + min_max[global_row_id << 1] = warp_min[row_id]; + min_max[(global_row_id << 1) + 1] = warp_max[row_id]; + } + } } } @@ -443,8 +404,6 @@ __global__ void LayerNormMinMaxObserverBlockSMemImpl(LOAD load, STORE store, con for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { ComputeType thread_mean = 0; ComputeType thread_m2 = 0; - ComputeType thread_min = detail::numeric_limits::max(); - ComputeType thread_max = detail::numeric_limits::lowest(); ComputeType thread_count = 0; for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { LoadType pack[pack_size]; @@ -452,31 +411,41 @@ __global__ void LayerNormMinMaxObserverBlockSMemImpl(LOAD load, STORE store, con #pragma unroll for (int i = 0; i < pack_size; ++i) { buf[i * num_packs + pack_id] = pack[i]; - WelfordMinMaxCombine(static_cast(pack[i]), &thread_mean, &thread_m2, - &thread_min, &thread_max, &thread_count); + WelfordCombine(static_cast(pack[i]), &thread_mean, &thread_m2, &thread_count); } } ComputeType row_mean = 0; ComputeType row_m2 = 0; - ComputeType row_min = detail::numeric_limits::max(); - ComputeType row_max = detail::numeric_limits::lowest(); ComputeType row_count = 0; - WelfordMinMaxBlockAllReduce(thread_mean, thread_m2, thread_min, thread_max, - thread_count, &row_mean, &row_m2, &row_min, &row_max, - &row_count); + WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, + &row_count); ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); - if (threadIdx.x == 0) { - min_max[row << 1] = (row_min - row_mean) * row_inv_var; - min_max[(row << 1) + 1] = (row_max - row_mean) * row_inv_var; - } + // if (threadIdx.x == 0) { + // mean[row] = row_mean; + // inv_variance[row] = row_inv_var; + // } + ComputeType thread_min = detail::numeric_limits::max(); + 
ComputeType thread_max = detail::numeric_limits::lowest(); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { ComputeType pack[pack_size]; #pragma unroll for (int i = 0; i < pack_size; ++i) { pack[i] = (static_cast(buf[i * num_packs + pack_id]) - row_mean) * row_inv_var; } - store.template store(pack, row, pack_id * pack_size); + store.template store(pack, row, pack_id * pack_size, pack); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + MinMaxCombine(static_cast(pack[i]), &thread_min, &thread_max); + } + } + ComputeType row_min = detail::numeric_limits::max(); + ComputeType row_max = detail::numeric_limits::lowest(); + MinMaxBlockAllReduce(thread_min, thread_max, &row_min, &row_max); + if (threadIdx.x == 0) { + min_max[row << 1] = row_min; + min_max[(row << 1) + 1] = row_max; } } } @@ -668,32 +637,29 @@ __global__ void __launch_bounds__(1024) for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { ComputeType thread_mean = 0; ComputeType thread_m2 = 0; - ComputeType thread_min = detail::numeric_limits::max(); - ComputeType thread_max = detail::numeric_limits::lowest(); ComputeType thread_count = 0; for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { LoadType pack[pack_size]; load.template load(pack, row, pack_id * pack_size); #pragma unroll for (int i = 0; i < pack_size; ++i) { - WelfordMinMaxCombine(static_cast(pack[i]), &thread_mean, &thread_m2, - &thread_min, &thread_max, &thread_count); + WelfordCombine(static_cast(pack[i]), &thread_mean, &thread_m2, &thread_count); } } ComputeType row_mean = 0; ComputeType row_m2 = 0; - ComputeType row_min = detail::numeric_limits::max(); - ComputeType row_max = detail::numeric_limits::lowest(); ComputeType row_count = 0; - WelfordMinMaxBlockAllReduce(thread_mean, thread_m2, thread_min, thread_max, - thread_count, &row_mean, &row_m2, &row_min, &row_max, - &row_count); + WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, + &row_count); ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); - if (threadIdx.x == 0) { - min_max[row << 1] = (row_min - row_mean) * row_inv_var; - min_max[(row << 1) + 1] = (row_max - row_mean) * row_inv_var; - } + // if (threadIdx.x == 0) { + // mean[row] = row_mean; + // inv_variance[row] = row_inv_var; + // } + ComputeType thread_min = detail::numeric_limits::max(); + ComputeType thread_max = detail::numeric_limits::lowest(); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { LoadType pack[pack_size]; ComputeType dst_pack[pack_size]; @@ -703,7 +669,15 @@ __global__ void __launch_bounds__(1024) for (int i = 0; i < pack_size; ++i) { dst_pack[i] = (static_cast(pack[i]) - row_mean) * row_inv_var; } - store.template store(dst_pack, row, pack_offset); + store.template store(dst_pack, row, pack_offset, dst_pack); + for (int i = 0; i < pack_size; ++i) { MinMaxCombine(dst_pack[i], &thread_min, &thread_max); } + } + ComputeType row_min = detail::numeric_limits::max(); + ComputeType row_max = detail::numeric_limits::lowest(); + MinMaxBlockAllReduce(thread_min, thread_max, &row_min, &row_max); + if (threadIdx.x == 0) { + min_max[row << 1] = row_min; + min_max[(row << 1) + 1] = row_max; } } } diff --git a/oneflow/user/kernels/fused_layer_norm_min_max_observer_gpu_kernel.cu b/oneflow/user/kernels/fused_layer_norm_min_max_observer_gpu_kernel.cu index ffecc5bdeaa..747b6e5715c 100644 --- 
a/oneflow/user/kernels/fused_layer_norm_min_max_observer_gpu_kernel.cu +++ b/oneflow/user/kernels/fused_layer_norm_min_max_observer_gpu_kernel.cu @@ -32,7 +32,7 @@ struct AffineStore { AffineStore(DST* y, int64_t row_size, const DST* gamma, const DST* beta) : y(y), row_size(row_size), gamma(gamma), beta(beta) {} template - __device__ void store(const SRC* src, int64_t row, int64_t col) { + __device__ void store(const SRC* src, int64_t row, int64_t col, SRC* dst) { cuda::layer_norm::Pack y_pack; cuda::layer_norm::Pack gamma_pack; cuda::layer_norm::Pack beta_pack; @@ -60,6 +60,7 @@ struct AffineStore { } else { y_pack.elem[i] = normalized_i; } + dst[i] = y_pack.elem[i]; } *(reinterpret_cast*>(y) + offset) = y_pack.storage; } From bfd5319ade7ad71e83c65ec00c06c0c5f67cfa45 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Fri, 8 Sep 2023 14:50:38 +0000 Subject: [PATCH 58/65] optimize --- .../dynamic_quantization_gpu_kernel.cu | 5 +- ..._layer_norm_min_max_observer_gpu_kernel.cu | 6 +- oneflow/user/kernels/quantization_utils.cuh | 208 +++++++++++------- 3 files changed, 133 insertions(+), 86 deletions(-) diff --git a/oneflow/user/kernels/dynamic_quantization_gpu_kernel.cu b/oneflow/user/kernels/dynamic_quantization_gpu_kernel.cu index 4f1c48b4d3a..25cde967778 100644 --- a/oneflow/user/kernels/dynamic_quantization_gpu_kernel.cu +++ b/oneflow/user/kernels/dynamic_quantization_gpu_kernel.cu @@ -59,9 +59,8 @@ class GpuDynamicQuantizationKernel final : public user_op::OpKernel { auto stream = ctx->stream()->As()->cuda_stream(); if (per_layer_quantization) { quantization::ReduceMinMaxPerTensor - <<>>(elements, in->dptr(), - min_max); + <<>>(elements, in->dptr(), + min_max); } else { UNIMPLEMENTED() << "dynamic_quantization does not support per-channel quantization"; } diff --git a/oneflow/user/kernels/fused_layer_norm_min_max_observer_gpu_kernel.cu b/oneflow/user/kernels/fused_layer_norm_min_max_observer_gpu_kernel.cu index 747b6e5715c..ca40ac34d09 100644 --- a/oneflow/user/kernels/fused_layer_norm_min_max_observer_gpu_kernel.cu +++ b/oneflow/user/kernels/fused_layer_norm_min_max_observer_gpu_kernel.cu @@ -153,9 +153,9 @@ class GpuFusedLayerNormMinMaxObserverKernel final : public user_op::OpKernel { int8_t upper_bound = (1 << (quantization_bit - 1)) - 1; int8_t lower_bound = -upper_bound - 1; quantization::ComputeScaleAndZeroPointBlock - <<<1, cuda::elementwise::kBlockSize, cuda::elementwise::kBlockSize * element_bytes * 2, - stream>>>(num_instances, min_max, upper_bound, lower_bound, - y_scale->mut_dptr(), y_zero_point->mut_dptr()); + <<<1, cuda::elementwise::kBlockSize, 0, stream>>>( + num_instances, min_max, upper_bound, lower_bound, y_scale->mut_dptr(), + y_zero_point->mut_dptr()); } else { UNIMPLEMENTED(); } diff --git a/oneflow/user/kernels/quantization_utils.cuh b/oneflow/user/kernels/quantization_utils.cuh index 4db60e236c3..9a90428dac8 100644 --- a/oneflow/user/kernels/quantization_utils.cuh +++ b/oneflow/user/kernels/quantization_utils.cuh @@ -26,6 +26,8 @@ limitations under the License. 
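// Note: the AffineStore::store overload above gains a `dst` out-parameter that
// echoes the values actually written to global memory, so the caller can fuse
// min/max observation into the normalization pass instead of re-reading the
// output. A minimal sketch of the pattern; EchoStore is a hypothetical name,
// not part of this patch:
template<typename DST>
struct EchoStore {
  DST* y;
  int64_t row_size;
  template<int N, typename SRC>
  __device__ void store(const SRC* src, int64_t row, int64_t col, SRC* dst) {
#pragma unroll
    for (int i = 0; i < N; ++i) {
      const DST v = static_cast<DST>(src[i]);
      y[row * row_size + col + i] = v;  // write the output element
      dst[i] = static_cast<SRC>(v);     // echo back for fused min/max tracking
    }
  }
};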
namespace oneflow { namespace quantization { +constexpr int kWarpSize = 32; + template __host__ __device__ __forceinline__ int ModDiv(int64_t N) { return N - (N / M * M); @@ -57,16 +59,17 @@ __global__ void ReduceMinMaxPerTensor(const int64_t elements, const T* in_ptr, T using LoadPack = cuda::elementwise::Pack; using MinMaxPack = cuda::elementwise::Pack; - extern __shared__ uint8_t buffer[]; + __shared__ T shared_buffer[kWarpSize << 1]; + MinMaxPack* shared_min_max = reinterpret_cast(shared_buffer); MinMaxPack min_max; min_max.elem[0] = detail::numeric_limits::max(); min_max.elem[1] = detail::numeric_limits::lowest(); - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t tid = (blockDim.x * blockIdx.x) + threadIdx.x; int64_t step = gridDim.x * blockDim.x * pack_size; - for (int64_t idx = gid * pack_size; idx < elements; idx += step) { + for (int64_t idx = tid * pack_size; idx < elements; idx += step) { LoadPack in; in.storage = reinterpret_cast(in_ptr + idx)[0]; #pragma unroll @@ -76,7 +79,7 @@ __global__ void ReduceMinMaxPerTensor(const int64_t elements, const T* in_ptr, T } } int rest = ModDiv(elements); - if (rest > 0 && gid == (gridDim.x * blockDim.x - 1)) { + if (rest > 0 && tid == (gridDim.x * blockDim.x - 1)) { in_ptr += elements - rest; LoadPack in; in.storage = reinterpret_cast(in_ptr)[0]; @@ -87,26 +90,41 @@ __global__ void ReduceMinMaxPerTensor(const int64_t elements, const T* in_ptr, T } } - int64_t tid = threadIdx.x; + for (int mask = kWarpSize >> 1; mask > 0; mask = mask >> 1) { + T b_min = __shfl_down_sync(0xffffffff, min_max.elem[0], mask, kWarpSize); + T b_max = __shfl_down_sync(0xffffffff, min_max.elem[1], mask, kWarpSize); + min_max.elem[0] = BinaryFuncMin::Invoke(b_min, min_max.elem[0]); + min_max.elem[1] = BinaryFuncMax::Invoke(b_max, min_max.elem[1]); + } + __syncthreads(); + + // const int lid = threadIdx.x % kWarpSize; + // const int wid = threadIdx.x / kWarpSize; + // kWarpSize is 32 + const int lid = threadIdx.x & 0x1F; + const int wid = threadIdx.x >> 5; - MinMaxPack* shared_min_max = reinterpret_cast(buffer); - shared_min_max[tid].storage = min_max.storage; + if (lid == 0) { shared_min_max[wid] = min_max; } __syncthreads(); - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { - MinMaxPack min_max0, min_max1; - min_max0.storage = shared_min_max[tid].storage; - min_max1.storage = shared_min_max[tid + s].storage; - min_max0.elem[0] = BinaryFuncMin::Invoke(min_max0.elem[0], min_max1.elem[0]); - min_max0.elem[1] = BinaryFuncMax::Invoke(min_max0.elem[1], min_max1.elem[1]); - shared_min_max[tid].storage = min_max0.storage; + if (wid == 0) { + if (threadIdx.x < blockDim.x >> 5) { + min_max = shared_min_max[lid]; + } else { + min_max.elem[0] = detail::numeric_limits::max(); + min_max.elem[1] = detail::numeric_limits::lowest(); } - __syncthreads(); - } + __syncwarp(); - if (tid == 0) { - reinterpret_cast(min_max_ptr)[blockIdx.x].storage = shared_min_max[0].storage; + for (int mask = kWarpSize >> 1; mask > 0; mask = mask >> 1) { + T b_min = __shfl_down_sync(0xffffffff, min_max.elem[0], mask, kWarpSize); + T b_max = __shfl_down_sync(0xffffffff, min_max.elem[1], mask, kWarpSize); + min_max.elem[0] = BinaryFuncMin::Invoke(b_min, min_max.elem[0]); + min_max.elem[1] = BinaryFuncMax::Invoke(b_max, min_max.elem[1]); + } + if (lid == 0) { + reinterpret_cast(min_max_ptr)[blockIdx.x].storage = min_max.storage; + } } } @@ -116,43 +134,59 @@ __global__ void ComputeScaleAndZeroPointBlock(const int min_max_size, const T* m float* scale_ptr, Q* 
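// Note: ReduceMinMaxPerTensor above replaces the shared-memory tree reduction
// with warp shuffles; the lane/warp indices use the bit tricks threadIdx.x & 0x1F
// (equal to threadIdx.x % 32) and threadIdx.x >> 5 (equal to threadIdx.x / 32).
// The core shuffle pattern as a self-contained sketch, assuming a full 32-lane
// warp and using fminf/fmaxf in place of the BinaryFuncMin/Max functors; after
// the loop, lane 0 holds the warp-wide min and max:
__device__ void WarpReduceMinMax(float* mn, float* mx) {
  for (int offset = 16; offset > 0; offset >>= 1) {
    const float other_mn = __shfl_down_sync(0xffffffff, *mn, offset);
    const float other_mx = __shfl_down_sync(0xffffffff, *mx, offset);
    *mn = fminf(*mn, other_mn);
    *mx = fmaxf(*mx, other_mx);
  }
}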
zero_point_ptr) { using MinMaxPack = cuda::elementwise::Pack; - extern __shared__ uint8_t buffer[]; - MinMaxPack* shared_min_max = reinterpret_cast(buffer); - int64_t tid = threadIdx.x; - { - MinMaxPack min_max; - min_max.elem[0] = detail::numeric_limits::max(); - min_max.elem[1] = detail::numeric_limits::lowest(); + __shared__ T shared_buffer[kWarpSize << 1]; + MinMaxPack* shared_min_max = reinterpret_cast(shared_buffer); + + MinMaxPack min_max; + min_max.elem[0] = detail::numeric_limits::max(); + min_max.elem[1] = detail::numeric_limits::lowest(); #pragma unroll - for (int64_t idx = tid; idx < min_max_size; idx += blockDim.x) { - MinMaxPack in = reinterpret_cast(min_max_ptr)[idx]; - min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[0]); - min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[1]); - } - shared_min_max[tid].storage = min_max.storage; - __syncthreads(); - - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { - MinMaxPack min_max0, min_max1; - min_max0.storage = shared_min_max[tid].storage; - min_max1.storage = shared_min_max[tid + s].storage; - min_max0.elem[0] = BinaryFuncMin::Invoke(min_max0.elem[0], min_max1.elem[0]); - min_max0.elem[1] = BinaryFuncMax::Invoke(min_max0.elem[1], min_max1.elem[1]); - shared_min_max[tid].storage = min_max0.storage; - } - __syncthreads(); - } + for (int64_t idx = threadIdx.x; idx < min_max_size; idx += blockDim.x) { + MinMaxPack in = reinterpret_cast(min_max_ptr)[idx]; + min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[0]); + min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[1]); } - if (threadIdx.x == 0) { - MinMaxPack min_max = shared_min_max[0]; - float min_value = static_cast(min_max.elem[0]); - float max_value = static_cast(min_max.elem[1]); - float scale = (max_value - min_value) / (upper_bound - lower_bound); - int32_t zero_point = lower_bound - __float2int_rn(min_value / scale); - scale_ptr[0] = scale; - zero_point_ptr[0] = static_cast(zero_point); + for (int mask = kWarpSize >> 1; mask > 0; mask = mask >> 1) { + T b_min = __shfl_down_sync(0xffffffff, min_max.elem[0], mask, kWarpSize); + T b_max = __shfl_down_sync(0xffffffff, min_max.elem[1], mask, kWarpSize); + min_max.elem[0] = BinaryFuncMin::Invoke(b_min, min_max.elem[0]); + min_max.elem[1] = BinaryFuncMax::Invoke(b_max, min_max.elem[1]); + } + __syncthreads(); + + // const int lid = threadIdx.x % kWarpSize; + // const int wid = threadIdx.x / kWarpSize; + // kWarpSize is 32 + const int lid = threadIdx.x & 0x1F; + const int wid = threadIdx.x >> 5; + + if (lid == 0) { shared_min_max[wid] = min_max; } + __syncthreads(); + + if (wid == 0) { + if (threadIdx.x < blockDim.x >> 5) { + min_max = shared_min_max[lid]; + } else { + min_max.elem[0] = detail::numeric_limits::max(); + min_max.elem[1] = detail::numeric_limits::lowest(); + } + __syncwarp(); + + for (int mask = kWarpSize >> 1; mask > 0; mask = mask >> 1) { + T b_min = __shfl_down_sync(0xffffffff, min_max.elem[0], mask, kWarpSize); + T b_max = __shfl_down_sync(0xffffffff, min_max.elem[1], mask, kWarpSize); + min_max.elem[0] = BinaryFuncMin::Invoke(b_min, min_max.elem[0]); + min_max.elem[1] = BinaryFuncMax::Invoke(b_max, min_max.elem[1]); + } + if (lid == 0) { + float min_value = static_cast(min_max.elem[0]); + float max_value = static_cast(min_max.elem[1]); + float scale = (max_value - min_value) / (upper_bound - lower_bound); + int32_t zero_point = lower_bound - __float2int_rn(min_value / scale); + scale_ptr[0] = scale; + zero_point_ptr[0] = 
static_cast(zero_point); + } } } @@ -165,16 +199,15 @@ inline __global__ void ComputeScaleAndZeroPointBlock( using MinMaxPack4 = cuda::elementwise::Pack; using MinMaxPack = cuda::elementwise::Pack; - extern __shared__ uint8_t buffer[]; - MinMaxPack* shared_min_max = reinterpret_cast(buffer); - int64_t tid = threadIdx.x; + __shared__ T shared_buffer[kWarpSize << 1]; + MinMaxPack* shared_min_max = reinterpret_cast(shared_buffer); MinMaxPack min_max; min_max.elem[0] = detail::numeric_limits::max(); min_max.elem[1] = detail::numeric_limits::lowest(); #pragma unroll - for (int idx = tid; idx < (min_max_size >> 2); idx += blockDim.x) { + for (int idx = threadIdx.x; idx < (min_max_size >> 2); idx += blockDim.x) { MinMaxPack4 in = reinterpret_cast(min_max_ptr + (idx << 3))[0]; min_max.elem[0] = BinaryFuncMin::Invoke(min_max.elem[0], in.elem[0]); min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[1]); @@ -188,7 +221,7 @@ inline __global__ void ComputeScaleAndZeroPointBlock( int rest = ModDiv<4>(min_max_size); - if (rest > 0 && tid == blockDim.x - 1) { + if (rest > 0 && threadIdx.x == blockDim.x - 1) { int offset = (min_max_size - rest) << 1; MinMaxPack4 in = reinterpret_cast(min_max_ptr + offset)[0]; #pragma unroll @@ -197,30 +230,46 @@ inline __global__ void ComputeScaleAndZeroPointBlock( min_max.elem[1] = BinaryFuncMax::Invoke(min_max.elem[1], in.elem[(i << 1) + 1]); } } + for (int mask = kWarpSize >> 1; mask > 0; mask = mask >> 1) { + T b_min = __shfl_down_sync(0xffffffff, min_max.elem[0], mask, kWarpSize); + T b_max = __shfl_down_sync(0xffffffff, min_max.elem[1], mask, kWarpSize); + min_max.elem[0] = BinaryFuncMin::Invoke(b_min, min_max.elem[0]); + min_max.elem[1] = BinaryFuncMax::Invoke(b_max, min_max.elem[1]); + } + __syncthreads(); - shared_min_max[tid].storage = min_max.storage; + // const int lid = threadIdx.x % kWarpSize; + // const int wid = threadIdx.x / kWarpSize; + // kWarpSize is 32 + const int lid = threadIdx.x & 0x1F; + const int wid = threadIdx.x >> 5; + + if (lid == 0) { shared_min_max[wid] = min_max; } __syncthreads(); - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { - MinMaxPack min_max0, min_max1; - min_max0.storage = shared_min_max[tid].storage; - min_max1.storage = shared_min_max[tid + s].storage; - min_max0.elem[0] = BinaryFuncMin::Invoke(min_max0.elem[0], min_max1.elem[0]); - min_max0.elem[1] = BinaryFuncMax::Invoke(min_max0.elem[1], min_max1.elem[1]); - shared_min_max[tid].storage = min_max0.storage; + if (wid == 0) { + if (threadIdx.x < blockDim.x >> 5) { + min_max = shared_min_max[lid]; + } else { + min_max.elem[0] = detail::numeric_limits::max(); + min_max.elem[1] = detail::numeric_limits::lowest(); } - __syncthreads(); - } + __syncwarp(); - if (threadIdx.x == 0) { - MinMaxPack min_max = shared_min_max[0]; - float min_value = static_cast(min_max.elem[0]); - float max_value = static_cast(min_max.elem[1]); - float scale = (max_value - min_value) / (upper_bound - lower_bound); - int32_t zero_point = lower_bound - __float2int_rn(min_value / scale); - scale_ptr[0] = scale; - zero_point_ptr[0] = static_cast(zero_point); + for (int mask = kWarpSize >> 1; mask > 0; mask = mask >> 1) { + T b_min = __shfl_down_sync(0xffffffff, min_max.elem[0], mask, kWarpSize); + T b_max = __shfl_down_sync(0xffffffff, min_max.elem[1], mask, kWarpSize); + min_max.elem[0] = BinaryFuncMin::Invoke(b_min, min_max.elem[0]); + min_max.elem[1] = BinaryFuncMax::Invoke(b_max, min_max.elem[1]); + } + if (lid == 0) { + float min_value = static_cast(min_max.elem[0]); + 
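// Note: both reduction epilogues above map the observed (min, max) range to
// asymmetric affine quantization parameters:
//   scale      = (max - min) / (upper_bound - lower_bound)
//   zero_point = lower_bound - round(min / scale)
// A worked example assuming 8-bit bounds (upper_bound = 127, lower_bound = -128)
// and an observed activation range of [-0.5, 1.5]:
//   scale      = 2.0 / 255 ~= 0.0078431
//   zero_point = -128 - round(-0.5 / 0.0078431) = -128 - (-64) = -64
// Quantization then computes q = clamp(round(x / scale) + zero_point, -128, 127),
// so x = -0.5 -> round(-63.75) + (-64) = -128 and x = 1.5 -> 191 + (-64) = 127,
// i.e. the observed range spans the full int8 range; dequantization recovers
// x ~= (q - zero_point) * scale.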
float max_value = static_cast(min_max.elem[1]); + float scale = (max_value - min_value) / (upper_bound - lower_bound); + int32_t zero_point = lower_bound - __float2int_rn(min_value / scale); + scale_ptr[0] = scale; + zero_point_ptr[0] = static_cast(zero_point); + } } } @@ -277,9 +326,8 @@ inline void ApplyDynamicQuantization(cudaStream_t stream, const int min_max_size Q lower_bound = -upper_bound - 1; size_t element_bytes = GetSizeOfDataType(GetDataType::value); - ComputeScaleAndZeroPointBlock - <<<1, cuda::elementwise::kBlockSize, cuda::elementwise::kBlockSize * element_bytes * 2, - stream>>>(min_max_size, min_max_ptr, upper_bound, lower_bound, scale_ptr, zero_point_ptr); + ComputeScaleAndZeroPointBlock<<<1, cuda::elementwise::kBlockSize, 0, stream>>>( + min_max_size, min_max_ptr, upper_bound, lower_bound, scale_ptr, zero_point_ptr); constexpr int pack_size = cuda::elementwise::PackSize(); int64_t pack_num = (elements + pack_size - 1) / pack_size; From a153075a0b186abf3340915ee45f652cd70247ef Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 13 Sep 2023 09:26:35 +0000 Subject: [PATCH 59/65] update xformers fmha --- CMakeLists.txt | 4 +-- cmake/third_party/cutlass.cmake | 1 + .../user/kernels/fused_attention_kernels.cu | 34 +++++++++++++------ 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eebc44dfc12..c70af37c06c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,9 +212,9 @@ if(BUILD_PYTHON) endif(BUILD_PYTHON) set(CUTLASS_URL - https://github.com/Oneflow-Inc/cutlass/archive/d47b8883b5e3661b41cc8a7a6f4c240c5524647f.zip) + https://github.com/Oneflow-Inc/cutlass/archive/912d3fbdf2cd087ef09d0afe52dbd49194d958cd.zip) use_mirror(VARIABLE CUTLASS_URL URL ${CUTLASS_URL}) -set(CUTLASS_MD5 7b417720240a443276ce4bb9ef169db1) +set(CUTLASS_MD5 5a0541270b4d33c1f46bec3e7b6e2c89) include(cuda) add_subdirectory(external) diff --git a/cmake/third_party/cutlass.cmake b/cmake/third_party/cutlass.cmake index 46f1a9132ec..d127f4cb214 100644 --- a/cmake/third_party/cutlass.cmake +++ b/cmake/third_party/cutlass.cmake @@ -95,6 +95,7 @@ if(WITH_CUTLASS) "xformers_fmha/iterators/epilogue_predicated_tile_iterator.h" "xformers_fmha/iterators/transpose_warp_iterator.h" "xformers_fmha/iterators/warp_iterator_from_smem.h" + "xformers_fmha/iterators/default_warp_iterator_from_smem.h" "xformers_fmha/iterators/predicated_tile_access_iterator_residual_last.h" "xformers_fmha/kernel_backward.h") diff --git a/oneflow/user/kernels/fused_attention_kernels.cu b/oneflow/user/kernels/fused_attention_kernels.cu index d432a1f30a8..9de1c44b7ac 100644 --- a/oneflow/user/kernels/fused_attention_kernels.cu +++ b/oneflow/user/kernels/fused_attention_kernels.cu @@ -287,13 +287,13 @@ struct Params { }; template + int max_k, bool with_attn_bias> void LaunchCutlassFmha(const Params& params, ep::CudaStream* stream) { // The fmha implementation below is based on xformers's fmha // implementation at: // https://github.com/facebookresearch/xformers/tree/main/xformers/csrc/attention/cuda/fmha using Attention = AttentionKernel; + max_k, false, with_attn_bias>; typename Attention::Params p{}; p.query_ptr = const_cast(reinterpret_cast(params.query_ptr)); p.key_ptr = const_cast(reinterpret_cast(params.key_ptr)); @@ -363,24 +363,36 @@ void LaunchCutlassFmha(const Params& params, ep::CudaStream* stream) { } template + int max_k> void DispatchWithAttnBias(const Params& params, ep::CudaStream* stream) { if (params.attn_bias_ptr != nullptr) { - LaunchCutlassFmha(params, stream); + 
LaunchCutlassFmha( + params, stream); } else { - LaunchCutlassFmha(params, stream); + LaunchCutlassFmha( + params, stream); } } template void DispatchSingleValueIteration(const Params& params, ep::CudaStream* stream) { - if (params.value_head_size <= keys_per_block) { - DispatchWithAttnBias(params, - stream); + if (params.value_head_size <= 8) { + DispatchWithAttnBias(params, + stream); + } else if (params.value_head_size <= 16) { + DispatchWithAttnBias(params, + stream); + } else if (params.value_head_size <= 32) { + DispatchWithAttnBias(params, + stream); + } else if (params.value_head_size <= 64) { + DispatchWithAttnBias(params, + stream); + } else if (params.value_head_size <= 128) { + DispatchWithAttnBias(params, + stream); } else { - DispatchWithAttnBias(params, + DispatchWithAttnBias(params, stream); } } From af2da949d352f765102d1f857801edf557e00cf6 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 13 Sep 2023 12:50:42 +0000 Subject: [PATCH 60/65] Revert "update xformers fmha" This reverts commit a153075a0b186abf3340915ee45f652cd70247ef. --- CMakeLists.txt | 4 +-- cmake/third_party/cutlass.cmake | 1 - .../user/kernels/fused_attention_kernels.cu | 34 ++++++------------- 3 files changed, 13 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c70af37c06c..eebc44dfc12 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,9 +212,9 @@ if(BUILD_PYTHON) endif(BUILD_PYTHON) set(CUTLASS_URL - https://github.com/Oneflow-Inc/cutlass/archive/912d3fbdf2cd087ef09d0afe52dbd49194d958cd.zip) + https://github.com/Oneflow-Inc/cutlass/archive/d47b8883b5e3661b41cc8a7a6f4c240c5524647f.zip) use_mirror(VARIABLE CUTLASS_URL URL ${CUTLASS_URL}) -set(CUTLASS_MD5 5a0541270b4d33c1f46bec3e7b6e2c89) +set(CUTLASS_MD5 7b417720240a443276ce4bb9ef169db1) include(cuda) add_subdirectory(external) diff --git a/cmake/third_party/cutlass.cmake b/cmake/third_party/cutlass.cmake index d127f4cb214..46f1a9132ec 100644 --- a/cmake/third_party/cutlass.cmake +++ b/cmake/third_party/cutlass.cmake @@ -95,7 +95,6 @@ if(WITH_CUTLASS) "xformers_fmha/iterators/epilogue_predicated_tile_iterator.h" "xformers_fmha/iterators/transpose_warp_iterator.h" "xformers_fmha/iterators/warp_iterator_from_smem.h" - "xformers_fmha/iterators/default_warp_iterator_from_smem.h" "xformers_fmha/iterators/predicated_tile_access_iterator_residual_last.h" "xformers_fmha/kernel_backward.h") diff --git a/oneflow/user/kernels/fused_attention_kernels.cu b/oneflow/user/kernels/fused_attention_kernels.cu index 9de1c44b7ac..d432a1f30a8 100644 --- a/oneflow/user/kernels/fused_attention_kernels.cu +++ b/oneflow/user/kernels/fused_attention_kernels.cu @@ -287,13 +287,13 @@ struct Params { }; template + bool single_value_iteration, bool with_attn_bias> void LaunchCutlassFmha(const Params& params, ep::CudaStream* stream) { // The fmha implementation below is based on xformers's fmha // implementation at: // https://github.com/facebookresearch/xformers/tree/main/xformers/csrc/attention/cuda/fmha using Attention = AttentionKernel; + single_value_iteration, false, with_attn_bias>; typename Attention::Params p{}; p.query_ptr = const_cast(reinterpret_cast(params.query_ptr)); p.key_ptr = const_cast(reinterpret_cast(params.key_ptr)); @@ -363,36 +363,24 @@ void LaunchCutlassFmha(const Params& params, ep::CudaStream* stream) { } template + bool single_value_iteration> void DispatchWithAttnBias(const Params& params, ep::CudaStream* stream) { if (params.attn_bias_ptr != nullptr) { - LaunchCutlassFmha( - params, stream); + LaunchCutlassFmha(params, 
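// Note: the DispatchSingleValueIteration ladder introduced above (and reverted
// by this commit) maps the runtime value_head_size onto a compile-time max_k
// template argument, since each instantiation fixes its tile shapes. A generic
// sketch of the dispatch pattern; Fn and the 65536 fallback bound are
// illustrative assumptions, not the exact kernel types used here:
#include <utility>

template<template<int> class Fn, typename... Args>
void DispatchHeadSize(int head_size, Args&&... args) {
  if (head_size <= 8) {
    Fn<8>()(std::forward<Args>(args)...);
  } else if (head_size <= 16) {
    Fn<16>()(std::forward<Args>(args)...);
  } else if (head_size <= 32) {
    Fn<32>()(std::forward<Args>(args)...);
  } else if (head_size <= 64) {
    Fn<64>()(std::forward<Args>(args)...);
  } else if (head_size <= 128) {
    Fn<128>()(std::forward<Args>(args)...);
  } else {
    Fn<65536>()(std::forward<Args>(args)...);  // effectively unbounded fallback
  }
}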
stream); } else { - LaunchCutlassFmha( - params, stream); + LaunchCutlassFmha(params, stream); } } template void DispatchSingleValueIteration(const Params& params, ep::CudaStream* stream) { - if (params.value_head_size <= 8) { - DispatchWithAttnBias(params, - stream); - } else if (params.value_head_size <= 16) { - DispatchWithAttnBias(params, - stream); - } else if (params.value_head_size <= 32) { - DispatchWithAttnBias(params, - stream); - } else if (params.value_head_size <= 64) { - DispatchWithAttnBias(params, - stream); - } else if (params.value_head_size <= 128) { - DispatchWithAttnBias(params, - stream); + if (params.value_head_size <= keys_per_block) { + DispatchWithAttnBias(params, + stream); } else { - DispatchWithAttnBias(params, + DispatchWithAttnBias(params, stream); } } From 26dcc1dd41c2cb1ac8a6c48fe25e49afc9db3521 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 13 Sep 2023 14:08:28 +0000 Subject: [PATCH 61/65] optimize quant --- oneflow/user/kernels/quantization_kernel.cu | 73 +-------------------- oneflow/user/kernels/quantization_utils.cuh | 6 +- 2 files changed, 6 insertions(+), 73 deletions(-) diff --git a/oneflow/user/kernels/quantization_kernel.cu b/oneflow/user/kernels/quantization_kernel.cu index cf675bbf609..36cd74f03ad 100644 --- a/oneflow/user/kernels/quantization_kernel.cu +++ b/oneflow/user/kernels/quantization_kernel.cu @@ -17,6 +17,7 @@ limitations under the License. #include "oneflow/core/device/cuda_util.h" #include "oneflow/core/framework/framework.h" #include "oneflow/core/kernel/kernel_util.cuh" +#include "oneflow/user/kernels/quantization_utils.cuh" namespace oneflow { @@ -98,31 +99,6 @@ __global__ void QuantizationCambricon(const T* in_ptr, const T* shift, const int } } -template -__host__ __device__ int ModDiv(int64_t N) { - return N - (N / M * M); -} - -template<> -__host__ __device__ int ModDiv<2>(int64_t N) { - return N & 0x1; -} - -template<> -__host__ __device__ int ModDiv<4>(int64_t N) { - return N & 0x3; -} - -template<> -__host__ __device__ int ModDiv<8>(int64_t N) { - return N & 0x7; -} - -template<> -__host__ __device__ int ModDiv<16>(int64_t N) { - return N & 0xF; -} - template __global__ void OFPerTensorQuantizationSymmetric(const int64_t elements, const T* in_ptr, const float* scale_ptr, const OutT upper_bound, @@ -149,7 +125,7 @@ __global__ void OFPerTensorQuantizationSymmetric(const int64_t elements, const T reinterpret_cast(out_ptr + idx)[0] = out.storage; } - int rest = ModDiv(elements); + int rest = quantization::ModDiv(elements); if (rest > 0 && tid == (gridDim.x * blockDim.x - 1)) { in_ptr += elements - rest; @@ -162,49 +138,6 @@ __global__ void OFPerTensorQuantizationSymmetric(const int64_t elements, const T } } -template -__global__ void OFPerTensorQuantizationAffine(const int64_t elements, const T* in_ptr, - const float* scale_ptr, const OutT* zero_point_ptr, - const OutT upper_bound, const OutT lower_bound, - OutT* out_ptr) { - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using StoreType = cuda::elementwise::PackType; - using StorePack = cuda::elementwise::Pack; - - int64_t tid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x * pack_size; - - float scale = *scale_ptr; - float zero_point = *zero_point_ptr; - - for (int64_t idx = tid * pack_size; idx < elements; idx += step) { - StorePack out; - LoadPack in; - in.storage = reinterpret_cast(in_ptr + idx)[0]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - out.elem[i] = - 
max(min(__float2int_rn(static_cast(in.elem[i]) / scale + zero_point), upper_bound), - lower_bound); - } - reinterpret_cast(out_ptr + idx)[0] = out.storage; - } - - int rest = ModDiv(elements); - - if (rest > 0 && tid == (gridDim.x * blockDim.x - 1)) { - in_ptr += elements - rest; - out_ptr += elements - rest; -#pragma unroll - for (int i = 0; i < rest; ++i) { - out_ptr[i] = - max(min(__float2int_rn(static_cast(in_ptr[i]) / scale + zero_point), upper_bound), - lower_bound); - } - } -} - template void ApplyOFPerTensorQuantization(user_op::KernelComputeContext* ctx, const std::string& quantization_scheme, @@ -227,7 +160,7 @@ void ApplyOFPerTensorQuantization(user_op::KernelComputeContext* ctx, elements, in->dptr(), scale->dptr(), upper_bound, lower_bound, out->mut_dptr()); } else { - OFPerTensorQuantizationAffine + quantization::ApplyQuantization <<>>( elements, in->dptr(), scale->dptr(), zero_point->dptr(), upper_bound, lower_bound, out->mut_dptr()); diff --git a/oneflow/user/kernels/quantization_utils.cuh b/oneflow/user/kernels/quantization_utils.cuh index 9a90428dac8..7c3c8f09d9e 100644 --- a/oneflow/user/kernels/quantization_utils.cuh +++ b/oneflow/user/kernels/quantization_utils.cuh @@ -285,7 +285,7 @@ __global__ void ApplyQuantization(const int64_t elements, const T* in_ptr, const int64_t tid = (blockDim.x * blockIdx.x) + threadIdx.x; int64_t step = gridDim.x * blockDim.x * pack_size; - float scale = *scale_ptr; + float scale = 1.f / *scale_ptr; float zero_point = *zero_point_ptr; for (int64_t idx = tid * pack_size; idx < elements; idx += step) { @@ -295,7 +295,7 @@ __global__ void ApplyQuantization(const int64_t elements, const T* in_ptr, const #pragma unroll for (int i = 0; i < pack_size; ++i) { out.elem[i] = - max(min(__float2int_rn(static_cast(in.elem[i]) / scale + zero_point), upper_bound), + max(min(__float2int_rn(static_cast(in.elem[i]) * scale + zero_point), upper_bound), lower_bound); } reinterpret_cast(out_ptr + idx)[0] = out.storage; @@ -311,7 +311,7 @@ __global__ void ApplyQuantization(const int64_t elements, const T* in_ptr, const #pragma unroll for (int i = 0; i < rest; ++i) { out_ptr[i] = - max(min(__float2int_rn(static_cast(in.elem[i]) / scale + zero_point), upper_bound), + max(min(__float2int_rn(static_cast(in.elem[i]) * scale + zero_point), upper_bound), lower_bound); } } From 397304f7c425711e01a8c49b807c27cc89b20b6a Mon Sep 17 00:00:00 2001 From: clackhan Date: Fri, 15 Sep 2023 09:10:33 +0000 Subject: [PATCH 62/65] add_cutlass_gemm_array_tuner --- oneflow/core/functional/functional_api.yaml | 6 +- oneflow/core/functional/impl/nn_functor.cpp | 8 +- .../cutlass_gemm_array_operation_cache_key.h | 133 ++++++ .../user/kernels/cutlass_gemm_array_tuner.h | 57 +++ .../kernels/cutlass_gemm_array_tuner_impl.cpp | 384 ++++++++++++++++++ .../kernels/cutlass_gemm_array_tuner_impl.h | 54 +++ .../kernels/grouped_matmul_quant_kernel.cu | 140 ++++++- 7 files changed, 767 insertions(+), 15 deletions(-) create mode 100644 oneflow/user/kernels/cutlass_gemm_array_operation_cache_key.h create mode 100644 oneflow/user/kernels/cutlass_gemm_array_tuner.h create mode 100644 oneflow/user/kernels/cutlass_gemm_array_tuner_impl.cpp create mode 100644 oneflow/user/kernels/cutlass_gemm_array_tuner_impl.h diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index b2ab81c9b3f..51800c755df 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -1080,13 +1080,13 @@ - name: "grouped_matmul_quant" 
signature: [ - 'Tensor (TensorTuple as, TensorTuple bs, TensorTuple scales, TensorTuple biases, + 'TensorTuple (TensorTuple as, TensorTuple bs, TensorTuple scales, TensorTuple biases, Bool transpose_a=False, Bool transpose_b=False, Double alpha=1.0, DataType output_dtype=None) => GroupedMatmulQuant', - 'Tensor (TensorTuple as, TensorTuple bs, TensorTuple in_zero_points, TensorTuple in_scales, TensorTuple weight_scales, + 'TensorTuple (TensorTuple as, TensorTuple bs, TensorTuple in_zero_points, TensorTuple in_scales, TensorTuple weight_scales, TensorTuple weight_accs, Bool transpose_a=False, Bool transpose_b=False, Double alpha=1.0, DataType output_dtype=None) => GroupedMatmulQuant', - 'Tensor (TensorTuple as, TensorTuple bs, TensorTuple in_zero_points, TensorTuple in_scales, TensorTuple weight_scales, + 'TensorTuple (TensorTuple as, TensorTuple bs, TensorTuple in_zero_points, TensorTuple in_scales, TensorTuple weight_scales, TensorTuple weight_accs, TensorTuple biases, Bool transpose_a=False, Bool transpose_b=False, Double alpha=1.0, DataType output_dtype=None) => GroupedMatmulQuant' diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 57cf97bed59..a8687200abc 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -678,10 +678,10 @@ class GroupedMatMulQuantFunctor { for (int n = 1; n < kMaxInputCount; ++n) { grouped_matmul_quant_scale_bias_op_[n] = CHECK_JUST(one::OpBuilder("grouped_matmul_quant") .Input("as", n) - .Input("bs") + .Input("bs", n) .Input("scales", n) .Input("biases", n) - .Output("out", n) + .Output("outputs", n) .Build()); } } @@ -725,7 +725,7 @@ class GroupedMatMulQuantWithFilterScaleFunctor { .Input("in_scales", n) .Input("weight_scales", n) .Input("weight_accs", n) - .Output("out", n) + .Output("outputs", n) .Build()); } } @@ -773,7 +773,7 @@ class GroupedMatMulBiasQuantWithFilterScaleFunctor { .Input("weight_scales", n) .Input("weight_accs", n) .Input("biases", n) - .Output("out", n) + .Output("outputs", n) .Build()); } } diff --git a/oneflow/user/kernels/cutlass_gemm_array_operation_cache_key.h b/oneflow/user/kernels/cutlass_gemm_array_operation_cache_key.h new file mode 100644 index 00000000000..49b07987e53 --- /dev/null +++ b/oneflow/user/kernels/cutlass_gemm_array_operation_cache_key.h @@ -0,0 +1,133 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_CUTLASS + +#ifndef ONEFLOW_USER_KERNELS_CUTLASS_GEMM_ARRAY_OPERATION_CACHE_KEY_H_ +#define ONEFLOW_USER_KERNELS_CUTLASS_GEMM_ARRAY_OPERATION_CACHE_KEY_H_ + +#include "oneflow/core/framework/framework.h" + +#include +#include + +#ifdef WITH_CUTLASS_EXTENSION +#include +#include +#endif // WITH_CUTLASS_EXTENSION + +namespace oneflow { + +struct GemmArrayOperationCacheKey { + cutlass::library::GemmFunctionalKey functional_key; + cutlass::library::GemmArrayConfiguration configuraion; + size_t alignment; + size_t kind; + + GemmArrayOperationCacheKey(const cutlass::library::GemmFunctionalKey& functional_key, + const cutlass::library::GemmArrayConfiguration& configuraion, + const cutlass::library::GemmArrayArguments& arguments) + : functional_key(functional_key), configuraion(configuraion), kind(-1) { + const auto IsAligned = [&](size_t n) { + return configuraion.lda % n == 0 && configuraion.ldb % n == 0 && configuraion.ldc % n == 0 + && configuraion.ldd % n == 0; + }; + alignment = 128 / cutlass::library::sizeof_bits(functional_key.element_A); + for (; alignment > 1; alignment = alignment >> 1) { + if (IsAligned(alignment)) { break; } + } + } + +#ifdef WITH_CUTLASS_EXTENSION + GemmArrayOperationCacheKey(cutlass::library::GemmFunctionalKey functional_key, + const cutlass::library::GemmArrayScaleBiasFusionConfiguration& config, + const cutlass::library::GemmArrayScaleBiasFusionArguments& arguments) + : functional_key(functional_key) { + if (arguments.Scale) { + kind = arguments.Residual ? cutlass::library::SingletonKind::kGemmArrayScaleBiasResidualFusion + : cutlass::library::SingletonKind::kGemmArrayScaleBiasFusion; + } else if (arguments.FilterScale) { + if (arguments.Bias) { + kind = arguments.Residual + ? cutlass::library::SingletonKind::kGemmArrayFilterScaleBiasResidualFusion + : cutlass::library::SingletonKind::kGemmArrayFilterScaleBiasFusion; + } else { + kind = arguments.Residual + ? 
cutlass::library::SingletonKind::kGemmArrayFilterScaleResidualFusion + : cutlass::library::SingletonKind::kGemmArrayFilterScaleFusion; + } + } else { + UNIMPLEMENTED(); + } + configuraion.problem_size = config.problem_size; + configuraion.batch_count = config.batch_count; + configuraion.lda = config.lda; + configuraion.ldb = config.ldb; + configuraion.ldc = 0; + configuraion.ldd = config.ldd; + const auto IsAligned = [&](size_t n) { + return configuraion.lda % n == 0 && configuraion.ldb % n == 0 && config.ldr % n == 0 + && configuraion.ldd % n == 0; + }; + alignment = 128 / cutlass::library::sizeof_bits(functional_key.element_A); + for (; alignment > 1; alignment = alignment >> 1) { + if (IsAligned(alignment)) { break; } + } + } +#endif // WITH_CUTLASS_EXTENSION +}; + +struct GemmArrayConfigurationHasher { + size_t operator()(const cutlass::library::GemmArrayConfiguration& configuraion) const { + size_t hash = 0; + hash = HashCombine(hash, std::hash()(configuraion.problem_size.m())); + hash = HashCombine(hash, std::hash()(configuraion.problem_size.n())); + hash = HashCombine(hash, std::hash()(configuraion.problem_size.k())); + hash = HashCombine(hash, std::hash()(configuraion.batch_count)); + hash = HashCombine(hash, configuraion.lda); + hash = HashCombine(hash, configuraion.ldb); + hash = HashCombine(hash, configuraion.ldc); + hash = HashCombine(hash, configuraion.ldd); + return hash; + } +}; + +struct GemmArrayOperationCacheKeyHasher { + size_t operator()(const GemmArrayOperationCacheKey& key) const { + size_t hash = cutlass::library::GemmFunctionalKeyHasher()(key.functional_key); + hash = HashCombine(hash, GemmArrayConfigurationHasher()(key.configuraion)); + hash = HashCombine(hash, std::hash()(key.alignment)); + hash = HashCombine(hash, std::hash()(key.kind)); + return hash; + } +}; + +inline bool operator==(const cutlass::library::GemmArrayConfiguration& lhs, + const cutlass::library::GemmArrayConfiguration& rhs) { + return lhs.batch_count == rhs.batch_count && lhs.problem_size == rhs.problem_size + && lhs.lda == rhs.lda && lhs.ldb == rhs.ldb && lhs.ldc == rhs.ldc && lhs.ldd == rhs.ldd; +} + +inline bool operator==(const GemmArrayOperationCacheKey& lhs, + const GemmArrayOperationCacheKey& rhs) { + return lhs.functional_key == rhs.functional_key && lhs.configuraion == rhs.configuraion + && lhs.alignment == rhs.alignment && lhs.kind == rhs.kind; +} + +} // namespace oneflow + +#endif // ONEFLOW_USER_KERNELS_CUTLASS_GEMM_ARRAY_OPERATION_CACHE_KEY_H_ + +#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/cutlass_gemm_array_tuner.h b/oneflow/user/kernels/cutlass_gemm_array_tuner.h new file mode 100644 index 00000000000..0892ac1de94 --- /dev/null +++ b/oneflow/user/kernels/cutlass_gemm_array_tuner.h @@ -0,0 +1,57 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_USER_KERNELS_CUTLASS_GEMM_ARRAY_TUNER_H_ +#define ONEFLOW_USER_KERNELS_CUTLASS_GEMM_ARRAY_TUNER_H_ + +#ifdef WITH_CUTLASS + +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/cuda/cuda_stream.h" +#include "oneflow/user/kernels/cutlass_gemm_array_tuner_impl.h" + +#include +#include + +namespace oneflow { + +class CutlassGemmArrayTuner { + public: + CutlassGemmArrayTuner() = default; + + template + const cutlass::library::Operation* FindOperation( + ep::CudaStream* stream, const cutlass::library::GemmFunctionalKey& functional_key, + const Configuration& configuraion, const Arguments& arguments, void* workspace, + size_t workspace_size) { + return GetCutlassGemmArrayTunerImpl()->Find( + stream, functional_key, configuraion, arguments, workspace, workspace_size); + } + + template + const cutlass::library::Operation* GetOperation( + const std::string& name, ep::CudaStream* stream, + const cutlass::library::GemmFunctionalKey& functional_key, const Configuration& configuraion, + const Arguments& arguments, void* workspace, size_t workspace_size) { + return GetCutlassGemmArrayTunerImpl()->Get( + name, stream, functional_key, configuraion, arguments, workspace, workspace_size); + } +}; + +} // namespace oneflow + +#endif // WITH_CUTLASS + +#endif // ONEFLOW_USER_KERNELS_CUTLASS_GEMM_ARRAY_TUNER_H_ diff --git a/oneflow/user/kernels/cutlass_gemm_array_tuner_impl.cpp b/oneflow/user/kernels/cutlass_gemm_array_tuner_impl.cpp new file mode 100644 index 00000000000..68c5450a9b8 --- /dev/null +++ b/oneflow/user/kernels/cutlass_gemm_array_tuner_impl.cpp @@ -0,0 +1,384 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifdef WITH_CUTLASS + +#include "oneflow/user/kernels/cutlass_gemm_array_tuner_impl.h" + +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/cuda/cuda_stream.h" +#include +#include + +#include "oneflow/user/kernels/cutlass_gemm_array_operation_cache_key.h" +#ifdef WITH_CUTLASS_EXTENSION +#include +#include +#endif // WITH_CUTLASS_EXTENSION + +namespace oneflow { + +namespace { + +bool IsWeakerAlginOperation(const cutlass::library::Operation* lhs, + const cutlass::library::Operation* rhs) { + const std::string lhs_name = lhs->description().name; + const std::string rhs_name = rhs->description().name; + size_t lhs_pos = lhs_name.rfind("align"); + if (lhs_pos == std::string::npos) { return false; } + size_t rhs_pos = rhs_name.rfind("align"); + if (rhs_pos == std::string::npos) { return false; } + if (lhs_name.substr(0, lhs_pos) != rhs_name.substr(0, rhs_pos)) { return false; } + size_t align_len = std::strlen("align"); + int lhs_alignment = std::atoi(lhs_name.substr(lhs_pos + align_len).c_str()); + int rhs_alignment = std::atoi(rhs_name.substr(rhs_pos + align_len).c_str()); + return lhs_alignment < rhs_alignment; +} + +size_t GetTensorSize(cutlass::library::NumericTypeID element, cutlass::library::LayoutTypeID layout, + const int row, const int col, const int ldc) { + const size_t element_size = cutlass::library::sizeof_bits(element) / 8; + size_t capacity = 0; + if (layout == cutlass::library::LayoutTypeID::kRowMajor) { + capacity = row * ldc; + } else if (layout == cutlass::library::LayoutTypeID::kColumnMajor) { + capacity = ldc * col; + } else { + UNIMPLEMENTED(); + } + return capacity * element_size; +} + +template +const cutlass::library::Operation* FindFastestOperation( + const Singleton* singleton, const cutlass::library::GemmFunctionalKey& functional_key, + const Configuration& configuraion, const Arguments& arguments, void* workspace, + size_t workspace_size, cudaStream_t stream, int cuda_arch) { + constexpr int turing_warmup_iters = 2; + constexpr int turing_iters = 5; + cudaEvent_t start{}; + cudaEvent_t end{}; + OF_CUDA_CHECK(cudaEventCreate(&start)); + OF_CUDA_CHECK(cudaEventCreate(&end)); + const cutlass::library::Operation* fastest_operation = nullptr; + float fastest_time = 0; + const auto& operations_map = [&]() { + const auto& it = singleton->operation_table.gemm_operations.find(functional_key); + CHECK(it != singleton->operation_table.gemm_operations.cend()); + return it->second; + }(); + + for (const auto& pair : operations_map) { + std::map> operations; + for (auto operation : pair.second) { + operations.emplace(operation->description().name, operation); + } + const cutlass::library::Operation* prev_operation = nullptr; + for (const auto& name_operation : operations) { + const cutlass::library::Operation* operation = name_operation.second; + if (prev_operation != nullptr && IsWeakerAlginOperation(operation, prev_operation)) { + continue; + } + if (operation->description().tile_description.minimum_compute_capability * 10 > cuda_arch + || operation->description().tile_description.maximum_compute_capability * 10 + < cuda_arch) { + continue; + } + auto status = operation->can_implement(&configuraion, &arguments); + if (status != cutlass::Status::kSuccess) { continue; } + const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion); + const size_t device_workspace_size = operation->get_device_workspace_size(&configuraion); + if (device_workspace_size > workspace_size) { continue; } + std::vector 
host_workspace(host_workspace_size, 0); + if (operation->initialize(&configuraion, host_workspace.data(), workspace, stream) + != cutlass::Status::kSuccess) { + continue; + } + + const auto Run = [&]() { + auto init_status = + operation->initialize(&configuraion, host_workspace.data(), workspace, stream); + CHECK(init_status == cutlass::Status::kSuccess); + auto run_status = operation->run(&arguments, host_workspace.data(), workspace, stream); + CHECK(run_status == cutlass::Status::kSuccess); + }; + OF_CUDA_CHECK(cudaStreamSynchronize(stream)); + for (int i = 0; i < turing_warmup_iters; ++i) { Run(); } + OF_CUDA_CHECK(cudaEventRecord(start, stream)); + for (int i = 0; i < turing_iters; ++i) { Run(); } + OF_CUDA_CHECK(cudaEventRecord(end, stream)); + OF_CUDA_CHECK(cudaEventSynchronize(end)); + float time = 0; + OF_CUDA_CHECK(cudaEventElapsedTime(&time, start, end)); + VLOG(3) << operation->description().name << " " << time; + prev_operation = operation; + if (fastest_operation == nullptr || time < fastest_time) { + fastest_operation = operation; + fastest_time = time; + } + } + } + OF_CUDA_CHECK(cudaEventDestroy(start)); + OF_CUDA_CHECK(cudaEventDestroy(end)); + VLOG(3) << "Fastest: " << fastest_operation->description().name << " " << fastest_time; + return fastest_operation; +} + +template +const cutlass::library::Operation* GetOperation( + const Singleton* singleton, const std::string& name, + const cutlass::library::GemmFunctionalKey& functional_key, const Configuration& configuraion, + const Arguments& arguments, void* workspace, size_t workspace_size, cudaStream_t stream, + int cuda_arch) { + const auto& it = singleton->operation_table.gemm_operations.find(functional_key); + if (it == singleton->operation_table.gemm_operations.cend()) { return nullptr; } + const cutlass::library::GemmOperationVectorMap& operations_map = it->second; + for (const auto& pair : operations_map) { + for (auto operation : pair.second) { + if (name != operation->description().name) { continue; } + if (operation->description().tile_description.minimum_compute_capability * 10 > cuda_arch + || operation->description().tile_description.maximum_compute_capability * 10 + < cuda_arch) { + continue; + } + auto status = operation->can_implement(&configuraion, &arguments); + if (status != cutlass::Status::kSuccess) { continue; } + const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion); + const size_t device_workspace_size = operation->get_device_workspace_size(&configuraion); + if (device_workspace_size > workspace_size) { continue; } + std::vector host_workspace(host_workspace_size, 0); + if (operation->initialize(&configuraion, host_workspace.data(), workspace, stream) + != cutlass::Status::kSuccess) { + continue; + } + return operation; + } + } + return nullptr; +} + +void** PrepareTensorArrayMem(int batch_count, size_t size) { + void** array_ptr; + OF_CUDA_CHECK(cudaMalloc(&array_ptr, batch_count * sizeof(void*))); + std::vector host_ptr(batch_count); + for (int i = 0; i < batch_count; ++i) { + void* data_ptr; + OF_CUDA_CHECK(cudaMalloc(&data_ptr, size)); + host_ptr[i] = data_ptr; + } + OF_CUDA_CHECK( + cudaMemcpy(array_ptr, host_ptr.data(), batch_count * sizeof(void*), cudaMemcpyHostToDevice)); + return array_ptr; +} + +void FreeTensorArrayMem(void const* const* array_ptr, int batch_count) { + for (int i = 0; i < batch_count; ++i) { + OF_CUDA_CHECK(cudaFree(const_cast(array_ptr[i]))); + } + OF_CUDA_CHECK(cudaFree(const_cast((const void*)array_ptr))); +} + +} // namespace + +#ifdef 
WITH_CUTLASS_EXTENSION +template<> +class CutlassGemmArrayTunerImpl { + public: + using CacheMap = + std::unordered_map; + + CutlassGemmArrayTunerImpl() {} + + const cutlass::library::Operation* Find( + ep::CudaStream* stream, cutlass::library::GemmFunctionalKey functional_key, + const cutlass::library::GemmArrayScaleBiasFusionConfiguration& configuraion, + const cutlass::library::GemmArrayScaleBiasFusionArguments& arguments, void* workspace, + size_t workspace_size); + + const cutlass::library::Operation* Get( + const std::string& name, ep::CudaStream* stream, + cutlass::library::GemmFunctionalKey functional_key, + const cutlass::library::GemmArrayScaleBiasFusionConfiguration& configuraion, + const cutlass::library::GemmArrayScaleBiasFusionArguments& arguments, void* workspace, + size_t workspace_size); + + private: + std::mutex mutex; + std::unordered_map cache; +}; + +const cutlass::library::Operation* +CutlassGemmArrayTunerImpl:: + Find(ep::CudaStream* stream, cutlass::library::GemmFunctionalKey functional_key, + const cutlass::library::GemmArrayScaleBiasFusionConfiguration& configuraion, + const cutlass::library::GemmArrayScaleBiasFusionArguments& arguments, void* workspace, + size_t workspace_size) { + int dev = 0; + OF_CUDA_CHECK(cudaGetDevice(&dev)); + GemmArrayOperationCacheKey cache_key(functional_key, configuraion, arguments); + { + std::lock_guard lock(mutex); + const auto& device_cache = cache[dev]; + const auto& it = device_cache.find(cache_key); + if (it != device_cache.end()) { return it->second; } + } + cutlass::library::GemmArrayScaleBiasFusionArguments benchmark_arguments = arguments; + void* benchmark_workspace = workspace; + cudaStream_t benchmark_stream = stream->cuda_stream(); +#ifdef WITH_CUDA_GRAPHS + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + if (stream->IsGraphCapturing()) { + OF_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + OF_CUDA_CHECK(cudaStreamCreate(&benchmark_stream)); + OF_CUDA_CHECK(cudaMalloc(&benchmark_workspace, workspace_size)); + + const size_t a_size = GetTensorSize(functional_key.element_A, functional_key.layout_A, + configuraion.problem_size.m(), + configuraion.problem_size.k(), configuraion.lda); + benchmark_arguments.A = PrepareTensorArrayMem(configuraion.batch_count, a_size); + const size_t b_size = GetTensorSize(functional_key.element_B, functional_key.layout_B, + configuraion.problem_size.k(), + configuraion.problem_size.m(), configuraion.ldb); + benchmark_arguments.B = PrepareTensorArrayMem(configuraion.batch_count, b_size); + + if (benchmark_arguments.P != nullptr) { + const size_t size = cutlass::library::sizeof_bits(functional_key.element_A) / 8; + benchmark_arguments.P = PrepareTensorArrayMem(configuraion.batch_count, size); + } + if (benchmark_arguments.InScale != nullptr) { + const size_t size = cutlass::library::sizeof_bits(functional_key.element_scalar) / 8; + benchmark_arguments.InScale = PrepareTensorArrayMem(configuraion.batch_count, size); + } + if (benchmark_arguments.FilterScale != nullptr) { + const size_t size = configuraion.problem_size.n() + * cutlass::library::sizeof_bits(functional_key.element_D) / 8; + benchmark_arguments.FilterScale = PrepareTensorArrayMem(configuraion.batch_count, size); + } + if (benchmark_arguments.FilterAcc != nullptr) { + const size_t size = configuraion.problem_size.n() + * cutlass::library::sizeof_bits(functional_key.element_D) / 8; + benchmark_arguments.FilterAcc = PrepareTensorArrayMem(configuraion.batch_count, size); + } + if (benchmark_arguments.Scale != 
nullptr) { + const size_t scale_size = configuraion.problem_size.n() + * cutlass::library::sizeof_bits(functional_key.element_D) / 8; + benchmark_arguments.Scale = PrepareTensorArrayMem(configuraion.batch_count, scale_size); + } + if (benchmark_arguments.Bias != nullptr) { + const size_t bias_size = configuraion.problem_size.n() + * cutlass::library::sizeof_bits(functional_key.element_D) / 8; + benchmark_arguments.Bias = PrepareTensorArrayMem(configuraion.batch_count, bias_size); + } + if (benchmark_arguments.Residual != nullptr) { + const size_t residual_size = GetTensorSize(functional_key.element_D, functional_key.layout_D, + configuraion.problem_size.m(), + configuraion.problem_size.n(), configuraion.ldr); + benchmark_arguments.Residual = PrepareTensorArrayMem(configuraion.batch_count, residual_size); + } + const size_t d_size = GetTensorSize(functional_key.element_D, functional_key.layout_D, + configuraion.problem_size.m(), + configuraion.problem_size.n(), configuraion.ldd); + benchmark_arguments.D = PrepareTensorArrayMem(configuraion.batch_count, d_size); + } +#endif // WITH_CUDA_GRAPHS + + const cutlass::library::CutlassExtensionSingleton* singleton = + &cutlass::library::CutlassExtensionSingleton::get( + static_cast(cache_key.kind)); + + const cutlass::library::Operation* fastest_operation = FindFastestOperation( + singleton, functional_key, configuraion, benchmark_arguments, benchmark_workspace, + workspace_size, benchmark_stream, stream->cuda_arch()); + +#ifdef WITH_CUDA_GRAPHS + if (stream->IsGraphCapturing()) { + OF_CUDA_CHECK(cudaStreamSynchronize(benchmark_stream)); + OF_CUDA_CHECK(cudaStreamDestroy(benchmark_stream)); + FreeTensorArrayMem(benchmark_arguments.A, configuraion.batch_count); + FreeTensorArrayMem(benchmark_arguments.B, configuraion.batch_count); + if (benchmark_arguments.P != nullptr) { + FreeTensorArrayMem(benchmark_arguments.P, configuraion.batch_count); + } + if (benchmark_arguments.InScale != nullptr) { + FreeTensorArrayMem(benchmark_arguments.InScale, configuraion.batch_count); + } + if (benchmark_arguments.FilterScale != nullptr) { + FreeTensorArrayMem(benchmark_arguments.FilterScale, configuraion.batch_count); + } + if (benchmark_arguments.FilterAcc != nullptr) { + FreeTensorArrayMem(benchmark_arguments.FilterAcc, configuraion.batch_count); + } + if (benchmark_arguments.Scale != nullptr) { + FreeTensorArrayMem(benchmark_arguments.Scale, configuraion.batch_count); + } + if (benchmark_arguments.Bias != nullptr) { + FreeTensorArrayMem(benchmark_arguments.Bias, configuraion.batch_count); + } + if (benchmark_arguments.Residual != nullptr) { + FreeTensorArrayMem(benchmark_arguments.Residual, configuraion.batch_count); + } + FreeTensorArrayMem(benchmark_arguments.D, configuraion.batch_count); + OF_CUDA_CHECK(cudaFree(benchmark_workspace)); + OF_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + } +#endif // WITH_CUDA_GRAPHS + if (fastest_operation != nullptr) { + std::lock_guard lock(mutex); + cache[dev][cache_key] = fastest_operation; + } + return fastest_operation; +} + +const cutlass::library::Operation* +CutlassGemmArrayTunerImpl:: + Get(const std::string& name, ep::CudaStream* stream, + cutlass::library::GemmFunctionalKey functional_key, + const cutlass::library::GemmArrayScaleBiasFusionConfiguration& configuraion, + const cutlass::library::GemmArrayScaleBiasFusionArguments& arguments, void* workspace, + size_t workspace_size) { + int dev = 0; + OF_CUDA_CHECK(cudaGetDevice(&dev)); + + GemmArrayOperationCacheKey cache_key(functional_key, configuraion, 
arguments); + const cutlass::library::CutlassExtensionSingleton* singleton = + &cutlass::library::CutlassExtensionSingleton::get( + static_cast(cache_key.kind)); + + return GetOperation(singleton, name, functional_key, configuraion, arguments, workspace, + workspace_size, stream->cuda_stream(), stream->cuda_arch()); +} +#endif // WITH_CUTLASS_EXTENSION + +template +CutlassGemmArrayTunerImpl* GetCutlassGemmArrayTunerImpl() { + static CutlassGemmArrayTunerImpl impl; + return &impl; +} + +#ifdef WITH_CUTLASS_EXTENSION +template CutlassGemmArrayTunerImpl* +GetCutlassGemmArrayTunerImpl(); +#endif // WITH_CUTLASS_EXTENSION + +} // namespace oneflow + +#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/cutlass_gemm_array_tuner_impl.h b/oneflow/user/kernels/cutlass_gemm_array_tuner_impl.h new file mode 100644 index 00000000000..78e908bbfed --- /dev/null +++ b/oneflow/user/kernels/cutlass_gemm_array_tuner_impl.h @@ -0,0 +1,54 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifdef WITH_CUTLASS + +#ifndef ONEFLOW_USER_KERNELS_CUTLASS_GEMM_ARRAY_TUNER_IMPL_H_ +#define ONEFLOW_USER_KERNELS_CUTLASS_GEMM_ARRAY_TUNER_IMPL_H_ + +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/cuda/cuda_stream.h" + +#include +#include +#include + +namespace oneflow { + +template +class CutlassGemmArrayTunerImpl { + public: + const cutlass::library::Operation* Find(ep::CudaStream* stream, + cutlass::library::GemmFunctionalKey functional_key, + const Configuration& configuraion, + const Arguments& arguments, void* workspace, + size_t workspace_size); + + const cutlass::library::Operation* Get(const std::string& name, ep::CudaStream* stream, + cutlass::library::GemmFunctionalKey functional_key, + const Configuration& configuraion, + const Arguments& arguments, void* workspace, + size_t workspace_size); +}; + +template +CutlassGemmArrayTunerImpl* GetCutlassGemmArrayTunerImpl(); + +} // namespace oneflow + +#endif // ONEFLOW_USER_KERNELS_CUTLASS_GEMM_ARRAY_TUNER_IMPL_H_ + +#endif // WITH_CUTLASS diff --git a/oneflow/user/kernels/grouped_matmul_quant_kernel.cu b/oneflow/user/kernels/grouped_matmul_quant_kernel.cu index 59711b90583..acd7a21ace0 100644 --- a/oneflow/user/kernels/grouped_matmul_quant_kernel.cu +++ b/oneflow/user/kernels/grouped_matmul_quant_kernel.cu @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef WITH_CUTLASS_EXTENSION + #include "oneflow/core/framework/framework.h" #include "oneflow/core/kernel/cuda_graph_support.h" #include "oneflow/core/cuda/elementwise.cuh" @@ -20,6 +22,12 @@ limitations under the License. 
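// Note: the tuner above follows a standard autotuning flow: enumerate the
// candidate operations registered for the functional key, filter by compute
// capability, can_implement() and workspace size, time each candidate with CUDA
// events (2 warmup + 5 timed iterations), and memoize the winner per device
// under a hashed configuration key. When the stream is capturing a CUDA graph,
// it benchmarks on a private stream with freshly allocated dummy tensor arrays
// under cudaStreamCaptureModeRelaxed so the capture is not polluted. A
// self-contained sketch of the timing helper, assuming `run` launches one
// candidate on `stream`:
#include <cuda_runtime.h>
#include <functional>

inline float TimeKernelMs(cudaStream_t stream, const std::function<void()>& run,
                          int warmup_iters = 2, int timed_iters = 5) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaStreamSynchronize(stream);                     // drain pending work first
  for (int i = 0; i < warmup_iters; ++i) { run(); }  // warm up before timing
  cudaEventRecord(start, stream);
  for (int i = 0; i < timed_iters; ++i) { run(); }
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);
  float elapsed_ms = 0.0f;
  cudaEventElapsedTime(&elapsed_ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return elapsed_ms;  // total time across timed_iters launches
}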
#include "oneflow/core/device/cuda_util.h" #include "oneflow/core/common/scalar.h" +#include +#include +#include +#include "oneflow/user/kernels/cutlass_gemm_array_tuner.h" +#include + namespace oneflow { struct GemmProblem { @@ -50,7 +58,7 @@ namespace oneflow { namespace { -constexpr int64_t kMaxProblemBatch = 64; +constexpr int64_t kMaxProblemBatch = 32; template struct Buffer { @@ -69,20 +77,107 @@ struct Buffer { template struct Param { Param(const GemmProblem& problem, std::vector> buffers) - : problem(problem), batch_size(buffers.size()) { + : problem(problem), batch_count(buffers.size()) { std::copy(buffers.cbegin(), buffers.cend(), buffer); - elem_cnt = batch_size * problem.m * problem.n; } GemmProblem problem; Buffer buffer[kMaxProblemBatch]; - int batch_size; - int elem_cnt; + int batch_count; }; template -void ApplyGroup(const GemmProblem& problem, std::vector> ptrs, +__global__ void InitPtr(Param p, void** ptr_arr) { + CUDA_1D_KERNEL_LOOP(i, p.batch_count) { + ptr_arr[i] = const_cast(p.buffer[i].a); + ptr_arr[i + kMaxProblemBatch] = const_cast(p.buffer[i].b); + ptr_arr[i + 2 * kMaxProblemBatch] = const_cast(p.buffer[i].in_zero_point); + ptr_arr[i + 3 * kMaxProblemBatch] = const_cast(p.buffer[i].in_scale); + ptr_arr[i + 4 * kMaxProblemBatch] = const_cast(p.buffer[i].weight_scale); + ptr_arr[i + 5 * kMaxProblemBatch] = const_cast(p.buffer[i].weight_acc); + ptr_arr[i + 6 * kMaxProblemBatch] = const_cast(p.buffer[i].scale); + ptr_arr[i + 7 * kMaxProblemBatch] = const_cast(p.buffer[i].biase); + ptr_arr[i + 8 * kMaxProblemBatch] = const_cast(p.buffer[i]._add_to_output); + ptr_arr[i + 9 * kMaxProblemBatch] = p.buffer[i].output; + } +} + +template +void ApplyGroup(user_op::KernelComputeContext* ctx, const cutlass::library::GemmFunctionalKey& key, + const GemmProblem& problem, bool has_in_zero_points, bool has_sacles, + bool has_biases, bool has_add_to_outputs, std::vector> ptrs, user_op::Tensor* tmp_buffer, ep::Stream* stream) { + void* tmp_ptr = tmp_buffer->mut_dptr(); + void** ptr_arr = reinterpret_cast(tmp_ptr); + void* workspace = tmp_buffer + kMaxProblemBatch * 10 * sizeof(void*); + size_t workspace_size = + tmp_buffer->shape_view().elem_cnt() - kMaxProblemBatch * 10 * sizeof(void*); Param params(problem, ptrs); + RUN_CUDA_KERNEL((InitPtr), stream, params.batch_count, params, ptr_arr); + + cutlass::gemm::GemmCoord problem_size(problem.m, problem.n, problem.k); + + cutlass::library::GemmArrayScaleBiasFusionConfiguration configuraion; + configuraion.problem_size = problem_size; + configuraion.lda = problem_size.k(); + configuraion.ldb = problem_size.k(); + configuraion.ld_filter_scale = 0; + configuraion.ld_filter_acc = 0; + configuraion.ld_scale = 0; + configuraion.ld_bias = 0; + configuraion.ldr = problem_size.n(); + configuraion.ldd = problem_size.n(); + configuraion.batch_count = params.batch_count; + + cutlass::library::GemmArrayScaleBiasFusionArguments arguments; + arguments.A = ptr_arr; + arguments.B = ptr_arr + kMaxProblemBatch; + arguments.D = ptr_arr + 9 * kMaxProblemBatch; + arguments.P = nullptr; + arguments.InScale = nullptr; + arguments.FilterScale = nullptr; + arguments.FilterAcc = nullptr; + arguments.Scale = nullptr; + arguments.Bias = nullptr; + arguments.Residual = nullptr; + if (has_in_zero_points) { + arguments.P = ptr_arr + 2 * kMaxProblemBatch; + arguments.InScale = ptr_arr + 3 * kMaxProblemBatch; + arguments.FilterScale = ptr_arr + 4 * kMaxProblemBatch; + arguments.FilterAcc = ptr_arr + 5 * kMaxProblemBatch; + } + if (has_sacles) { arguments.Scale = ptr_arr 
+ 6 * kMaxProblemBatch; } + if (has_biases) { arguments.Bias = ptr_arr + 7 * kMaxProblemBatch; } + if (has_add_to_outputs) { arguments.Residual = ptr_arr + 8 * kMaxProblemBatch; } + + auto* cuda_stream = stream->As(); + const cutlass::library::Operation* operation = nullptr; + + operation = [&]() -> const cutlass::library::Operation* { + const std::string& tuning_cache = ctx->Attr("tuning_cache"); + if (tuning_cache.empty()) { return nullptr; } + auto tuning_cache_object = nlohmann::json::parse(tuning_cache); + if (!tuning_cache_object.is_object()) { return nullptr; } + auto it = tuning_cache_object.find("cutlass"); + if (it == tuning_cache_object.end()) { return nullptr; } + if (!it->is_string()) { return nullptr; } + const std::string name = *it; + return CutlassGemmArrayTuner().GetOperation(name, cuda_stream, key, configuraion, arguments, + workspace, workspace_size); + }(); + if (!operation) { + operation = CutlassGemmArrayTuner().FindOperation(cuda_stream, key, configuraion, arguments, + workspace, workspace_size); + } + CHECK(operation != nullptr); + const size_t host_workspace_size = operation->get_host_workspace_size(&configuraion); + std::vector host_workspace(host_workspace_size, 0); + + auto init_status = operation->initialize(&configuraion, host_workspace.data(), workspace, + cuda_stream->cuda_stream()); + CHECK(init_status == cutlass::Status::kSuccess); + auto run_status = + operation->run(&arguments, host_workspace.data(), workspace, cuda_stream->cuda_stream()); + CHECK(run_status == cutlass::Status::kSuccess); } template @@ -103,6 +198,32 @@ class GroupedMatmulQuantKernel final : public user_op::OpKernel, public user_op: const bool has_biases = ctx->has_input("biases", 0); const bool has_add_to_outputs = ctx->has_input("_add_to_outputs", 0); + cutlass::library::GemmFunctionalKey key( + cutlass::library::Provider::kCUTLASS, cutlass::library::GemmKind::kGemm, + cutlass::library::NumericTypeID::kS32, // element_compute + cutlass::library::NumericTypeID::kS32, // element_scalar + cutlass::library::NumericTypeID::kS8, // element_A + cutlass::library::LayoutTypeID::kRowMajor, // layout_A + cutlass::library::ComplexTransform::kNone, // transform_A + cutlass::library::NumericTypeID::kS8, // element_B + cutlass::library::LayoutTypeID::kColumnMajor, // layout_B + cutlass::library::ComplexTransform::kNone, // transform_B + cutlass::library::NumericTypeID::kS32, // element_C + cutlass::library::LayoutTypeID::kRowMajor, // layout_C + cutlass::library::NumericTypeID::kS32, // element_D + cutlass::library::LayoutTypeID::kRowMajor // layout_D + ); + + if (GetDataType::value == DataType::kFloat) { + key.element_scalar = cutlass::library::NumericTypeID::kF32; + key.element_C = cutlass::library::NumericTypeID::kF32; + key.element_D = cutlass::library::NumericTypeID::kF32; + } else if (GetDataType::value == DataType::kFloat16) { + key.element_scalar = cutlass::library::NumericTypeID::kF32; + key.element_C = cutlass::library::NumericTypeID::kF16; + key.element_D = cutlass::library::NumericTypeID::kF16; + } + for (int32_t i = 0; i < input_size; ++i) { const user_op::Tensor* a = ctx->Tensor4ArgNameAndIndex("as", i); const user_op::Tensor* b = ctx->Tensor4ArgNameAndIndex("bs", i); @@ -150,7 +271,8 @@ class GroupedMatmulQuantKernel final : public user_op::OpKernel, public user_op: {group.second.begin() + i, group.second.begin() + i + std::min(group.second.size() - i, kMaxProblemBatch)}); - ApplyGroup(group.first, ptrs, tmp_buffer, ctx->stream()); + ApplyGroup(ctx, key, group.first, 
+                      has_add_to_outputs, ptrs, tmp_buffer, ctx->stream());
       }
     }
   }
@@ -165,7 +287,7 @@ class GroupedMatmulQuantKernel final : public user_op::OpKernel, public user_op:
                        && (user_op::HobDataType("bs", 0) == DataType::kInt8)          \
                        && (user_op::HobDataType("outputs", 0) == out_data_type))      \
       .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                   \
-        return kMaxProblemBatch * 7 * sizeof(void*) + 3 * 1024 * 1024;                \
+        return kMaxProblemBatch * 10 * sizeof(void*) + 3 * 1024 * 1024;               \
       });
 
 REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(half, DataType::kFloat16)
@@ -174,3 +296,5 @@ REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(float, DataType::kFloat)
 }  // namespace
 
 }  // namespace oneflow
+
+#endif  // WITH_CUTLASS_EXTENSION

From c1c859b310f0447204c89348bbf2c10638a5c912 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Fri, 15 Sep 2023 18:22:08 +0800
Subject: [PATCH 63/65] fix

---
 oneflow/ir/lib/OneFlow/Transform/OutlineAndFuse.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/oneflow/ir/lib/OneFlow/Transform/OutlineAndFuse.cpp b/oneflow/ir/lib/OneFlow/Transform/OutlineAndFuse.cpp
index 31b7a2e2860..97078d9392a 100644
--- a/oneflow/ir/lib/OneFlow/Transform/OutlineAndFuse.cpp
+++ b/oneflow/ir/lib/OneFlow/Transform/OutlineAndFuse.cpp
@@ -180,7 +180,6 @@ struct GroupMatMulQuantPattern : public mlir::OpRewritePattern<MatmulQuantOp> {
   mlir::LogicalResult matchAndRewrite(MatmulQuantOp op,
                                       mlir::PatternRewriter& rewriter) const override {
     llvm::SmallVector<MatmulQuantOp, 4> all_matmuls{};
-    all_matmuls.push_back(op);
     bool has_in_scale = MatmulQuantOpHasInputScale(op);
     bool has_scale = MatmulQuantOpHasScale(op);
     bool has_bias = MatmulQuantOpHasBias(op);

From 4ba3cc5b99366c6abdfcb2770faf62c46c092977 Mon Sep 17 00:00:00 2001
From: clackhan
Date: Fri, 22 Sep 2023 09:17:43 +0000
Subject: [PATCH 64/65] fuse_min_max_observer_and_matmul_quant

---
 oneflow/core/functional/functional_api.yaml  |  10 +-
 oneflow/core/functional/impl/nn_functor.cpp  |  73 ++++++++++-
 oneflow/ir/include/OneFlow/OneFlowUserOps.td |  15 ++-
 .../lib/OneFlow/PDLL/ForwardOpPatterns.pdll  |  31 +++++
 .../ir/lib/OneFlow/Transform/AutoNHWCOps.cpp |  21 +++
 oneflow/user/kernels/conv_quant_kernels.cu   |  27 ++--
 .../user/kernels/fused_glu_quant_kernel.cu   |   6 +
 .../kernels/grouped_matmul_quant_kernel.cu   |  29 +++--
 oneflow/user/kernels/matmul_quant_kernels.cu |  25 ++--
 .../user/kernels/min_max_observer_kernel.cu  | 123 +++++++++++++++---
 oneflow/user/kernels/redistribute_kernel.cu  | 100 ++++++++++++++
 oneflow/user/ops/fused_glu_quant_op.cpp      |  14 +-
 oneflow/user/ops/grouped_matmul_quant_op.cpp |   4 +-
 oneflow/user/ops/min_max_observer_op.cpp     |  19 ++-
 oneflow/user/ops/redistribute_op.cpp         |  53 ++++++++
 15 files changed, 488 insertions(+), 62 deletions(-)
 create mode 100644 oneflow/user/kernels/redistribute_kernel.cu
 create mode 100644 oneflow/user/ops/redistribute_op.cpp

diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml
index 51800c755df..bac44ee4f88 100644
--- a/oneflow/core/functional/functional_api.yaml
+++ b/oneflow/core/functional/functional_api.yaml
@@ -1064,6 +1064,11 @@
     ]
   bind_python: True
 
+- name: "redistribute"
+  signature:
+    'Tensor (Tensor in) => Redistribute'
+  bind_python: True
+
 - name: "matmul_quant"
   signature:
     [
@@ -1089,7 +1094,10 @@
       'TensorTuple (TensorTuple as, TensorTuple bs, TensorTuple in_zero_points, TensorTuple in_scales, TensorTuple weight_scales,
                     TensorTuple weight_accs, TensorTuple biases, Bool transpose_a=False, Bool transpose_b=False,
-                    Double alpha=1.0, DataType output_dtype=None) =>
GroupedMatmulQuant'
+                    Double alpha=1.0, DataType output_dtype=None) => GroupedMatmulQuant',
+      'TensorTuple (TensorTuple as, TensorTuple bs, TensorTuple in_zero_points, TensorTuple in_scales, TensorTuple weight_scales,
+                    TensorTuple weight_accs, TensorTuple biases, TensorTuple add_to_outputs,
+                    Bool transpose_a=False, Bool transpose_b=False, Double alpha=1.0, DataType output_dtype=None) => GroupedMatmulQuant'
     ]
   bind_python: True

diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp
index a8687200abc..5069c275de2 100644
--- a/oneflow/core/functional/impl/nn_functor.cpp
+++ b/oneflow/core/functional/impl/nn_functor.cpp
@@ -576,6 +576,23 @@ class BatchMatMulFunctor {
   std::shared_ptr<OpExpr> batch_matmul_op_;
 };
 
+class RedistributeFunctor {
+ public:
+  RedistributeFunctor() {
+    redistribute_op_ = CHECK_JUST(one::OpBuilder("redistribute").Input("in").Output("out").Build());
+  }
+  Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& in) const {
+    const auto& in_size = in->shape();
+    const int n = in_size->At(0);
+    const int k = in_size->At(1);
+    CHECK_EQ_OR_RETURN(k % 16, 0);
+    return OpInterpUtil::Dispatch<Tensor>(*redistribute_op_, {in});
+  }
+
+ private:
+  std::shared_ptr<OpExpr> redistribute_op_;
+};
+
 class MatMulQuantFunctor {
  public:
   MatMulQuantFunctor() {
@@ -808,6 +825,58 @@ class GroupedMatMulBiasQuantWithFilterScaleFunctor {
   std::vector<std::shared_ptr<OpExpr>> grouped_matmul_bias_quant_with_filter_bias_op_;
 };
 
+class GroupedMatMulBiasQuantWithFilterScaleResidualFunctor {
+ public:
+  GroupedMatMulBiasQuantWithFilterScaleResidualFunctor() {
+    grouped_matmul_bias_quant_with_filter_bias_op_.resize(kMaxInputCount);
+    for (int n = 1; n < kMaxInputCount; ++n) {
+      grouped_matmul_bias_quant_with_filter_bias_op_[n] =
+          CHECK_JUST(one::OpBuilder("grouped_matmul_quant")
+                         .Input("as", n)
+                         .Input("bs", n)
+                         .Input("in_zero_points", n)
+                         .Input("in_scales", n)
+                         .Input("weight_scales", n)
+                         .Input("weight_accs", n)
+                         .Input("biases", n)
+                         .Input("_add_to_outputs", n)
+                         .Output("outputs", n)
+                         .Build());
+    }
+  }
+  Maybe<TensorTuple> operator()(const TensorTuple& as, const TensorTuple& bs,
+                                const TensorTuple& in_zero_points, const TensorTuple& in_scales,
+                                const TensorTuple& weight_scales, const TensorTuple& weight_accs,
+                                const TensorTuple& biases, const TensorTuple& add_to_outputs,
+                                const bool& transpose_a, const bool& transpose_b,
+                                const double& alpha,
+                                const Optional<Symbol<DType>>& output_dtype) const {
+    CHECK_OR_RETURN(!transpose_a)
+        << "the first input should not be transposed for quantized matmul.";
+    CHECK_OR_RETURN(transpose_b) << "the second input should be transposed for quantized matmul.";
+    CHECK_EQ_OR_RETURN(alpha, 1) << "alpha should be 1 for quantized matmul.";
+    auto& attrs =
+        THREAD_CACHED_MUTABLE_ATTR_MAP("transpose_a", "transpose_b", "alpha", "out_dtype");
+    attrs.SetAllAttrs(transpose_a, transpose_b, alpha,
+                      output_dtype.value_or(DType::Float())->data_type());
+    int input_size = as.size();
+    TensorTuple input(8 * input_size);
+    std::copy(as.begin(), as.end(), input.begin() + 0 * input_size);
+    std::copy(bs.begin(), bs.end(), input.begin() + 1 * input_size);
+    std::copy(in_zero_points.begin(), in_zero_points.end(), input.begin() + 2 * input_size);
+    std::copy(in_scales.begin(), in_scales.end(), input.begin() + 3 * input_size);
+    std::copy(weight_scales.begin(), weight_scales.end(), input.begin() + 4 * input_size);
+    std::copy(weight_accs.begin(), weight_accs.end(), input.begin() + 5 * input_size);
+    std::copy(biases.begin(), biases.end(), input.begin() + 6 * input_size);
+    std::copy(add_to_outputs.begin(), add_to_outputs.end(), input.begin() + 7 * input_size);
+    return OpInterpUtil::Dispatch<TensorTuple>(
+        *grouped_matmul_bias_quant_with_filter_bias_op_[input_size], input, attrs);
+  }
+
+ private:
+  std::vector<std::shared_ptr<OpExpr>> grouped_matmul_bias_quant_with_filter_bias_op_;
+};
+
 class VectorMatrixProductFunctor {
  public:
   VectorMatrixProductFunctor() {
@@ -5805,7 +5874,9 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
   m.add_functor<impl::MatMulFunctor>("MatMul");
   m.add_functor<impl::MatMulQuantFunctor>("MatmulQuant");
-  m.add_functor<impl::GroupedMatMulQuantFunctor,
-                impl::GroupedMatMulBiasQuantWithFilterScaleFunctor>("GroupedMatmulQuant");
+  m.add_functor<impl::GroupedMatMulQuantFunctor,
+                impl::GroupedMatMulBiasQuantWithFilterScaleFunctor,
+                impl::GroupedMatMulBiasQuantWithFilterScaleResidualFunctor>("GroupedMatmulQuant");
+  m.add_functor<impl::RedistributeFunctor>("Redistribute");
   m.add_functor<impl::MatMulNoBroadCastFunctor>("MatMulNoBroadCast");
   m.add_functor<impl::BatchMatMulFunctor>("BatchMatMul");
   m.add_functor<impl::MatrixVectorProductFunctor>("MatrixVectorProduct");

diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td
index a57257ed437..90184f72367 100644
--- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td
+++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td
@@ -5375,6 +5375,19 @@ def OneFlow_ErfcGradOp : OneFlow_BaseOp<"erfc_grad", [NoMemoryEffect, DeclareOpI
   let has_data_type_infer_fn = 1;
 }
 
+def OneFlow_RedistributeOp : OneFlow_BaseOp<"redistribute", [NoMemoryEffect, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
+  let input = (ins
+    OneFlow_Tensor:$in
+  );
+  let output = (outs
+    OneFlow_Tensor:$out
+  );
+  let has_logical_tensor_desc_infer_fn = 1;
+  let has_physical_tensor_desc_infer_fn = 1;
+  let has_get_sbp_fn = 1;
+  let has_data_type_infer_fn = 1;
+}
+
 def OneFlow_MatmulQuantOp : OneFlow_BaseOp<"matmul_quant", [NoMemoryEffect, AttrSizedOperandSegments, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
   let input = (ins
     OneFlow_Tensor:$a,
@@ -8310,7 +8323,7 @@ def OneFlow_FakeQuantizationOp : OneFlow_BaseOp<"fake_quantization", [NoMemoryEf
   let has_input_arg_modify_fn = 1;
 }
 
-def OneFlow_MinMaxObserverOp : OneFlow_BaseOp<"min_max_observer", [NoMemoryEffect, NoGrad, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
+def OneFlow_MinMaxObserverOp : OneFlow_BaseOp<"min_max_observer", [NoMemoryEffect, NoGrad, DeclareOpInterfaceMethods<UserOpCompatibleInterface>, DeclareOpInterfaceMethods<NCHWCompatibleInterface>]> {
   let input = (ins
     OneFlow_Tensor:$in
   );

diff --git a/oneflow/ir/lib/OneFlow/PDLL/ForwardOpPatterns.pdll b/oneflow/ir/lib/OneFlow/PDLL/ForwardOpPatterns.pdll
index 68efa49a0a5..7060eae76c8 100644
--- a/oneflow/ir/lib/OneFlow/PDLL/ForwardOpPatterns.pdll
+++ b/oneflow/ir/lib/OneFlow/PDLL/ForwardOpPatterns.pdll
@@ -249,3 +249,34 @@ Pattern {
     replace dynamic_quantization with (quantization.0, fused_layer_norm_min_max_observer.1, fused_layer_norm_min_max_observer.2);
   };
 }
+
+Pattern {
+  let center: Attr;
+  let scale: Attr;
+  let begin_norm_axis: Attr;
+  let begin_params_axis: Attr;
+  let epsilon: Attr;
+  let quantization_formula: Attr;
+  let quantization_bit: Attr;
+  let quantization_scheme: Attr;
+  let per_layer_quantization: Attr;
+
+  let layer_norm = op<oneflow.layer_norm>(x: Value, beta: Value, gamma: Value)
+      {center = center, scale = scale, begin_norm_axis = begin_norm_axis, begin_params_axis = begin_params_axis, epsilon = epsilon}
+      -> (y: Type, mean: Type, inv_variance: Type);
+  let min_max_observer = op<oneflow.min_max_observer>(layer_norm.0)
+      {quantization_formula = quantization_formula, quantization_bit = quantization_bit, quantization_scheme = quantization_scheme,
+       per_layer_quantization = per_layer_quantization} -> (in_scale: Type, in_zero_point: Type);
+
+  rewrite min_max_observer with {
+    let fused_layer_norm_min_max_observer = op<oneflow.fused_layer_norm_min_max_observer>(x, beta, gamma)
+        {center = center, scale = scale, begin_norm_axis = begin_norm_axis, begin_params_axis = begin_params_axis, epsilon = epsilon,
+         quantization_formula =
quantization_formula, quantization_bit = quantization_bit, quantization_scheme = quantization_scheme,
+         per_layer_quantization = per_layer_quantization,
+         operand_segment_sizes = attr<"array<i32: 1, 1, 1>">} -> (y, in_scale, in_zero_point);
+
+    CopyUserOpAttrs(layer_norm, fused_layer_norm_min_max_observer);
+
+    replace min_max_observer with (fused_layer_norm_min_max_observer.1, fused_layer_norm_min_max_observer.2);
+  };
+}

diff --git a/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp b/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp
index c0a6497b62c..da90ce2b01a 100644
--- a/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp
+++ b/oneflow/ir/lib/OneFlow/Transform/AutoNHWCOps.cpp
@@ -399,6 +399,27 @@ llvm::SmallVector<Value, 4> DynamicQuantizationOp::NchwToNhwc(llvm::SmallVector<
   return results;
 }
 
+bool MinMaxObserverOp::IsNCHW() { return false; }
+
+llvm::DenseSet<Value> MinMaxObserverOp::OperandsToTranspose() { return {this->getIn()}; }
+
+llvm::DenseSet<Value> MinMaxObserverOp::ResultsToTranspose() { return {}; }
+
+llvm::SmallVector<Value, 4> MinMaxObserverOp::NchwToNhwc(llvm::SmallVector<Value, 4> value,
+                                                         PatternRewriter& rewriter) {
+  auto min_max_observer_op = *this;
+  SmallVector<Value, 4> operands{value[0]};
+  auto res = rewriter
+                 .create<MinMaxObserverOp>(min_max_observer_op.getLoc(),
+                                           getNHWCResultTypes(min_max_observer_op),
+                                           operands, min_max_observer_op->getAttrs())
+                 ->getResults();
+  llvm::SmallVector<Value, 4> results;
+  results.push_back(res[0]);
+  results.push_back(res[1]);
+  return results;
+}
+
 }  // namespace oneflow
 
 }  // namespace mlir

diff --git a/oneflow/user/kernels/conv_quant_kernels.cu b/oneflow/user/kernels/conv_quant_kernels.cu
index ef0ad093cc0..2fee0d2d2d5 100644
--- a/oneflow/user/kernels/conv_quant_kernels.cu
+++ b/oneflow/user/kernels/conv_quant_kernels.cu
@@ -148,6 +148,9 @@ class Conv2dQuantKernel final : public user_op::OpKernel, public user_op::CudaGr
         cutlass::library::NumericTypeID::kS8, cutlass::library::LayoutTypeID::kTensorNHWC,
         cutlass::library::NumericTypeID::kS32, cutlass::library::LayoutTypeID::kTensorNHWC,
         cutlass::library::NumericTypeID::kS32, cutlass::library::NumericTypeID::kS32);
+    if (in->data_type() == DataType::kFloat16) {
+      key.element_A = cutlass::library::NumericTypeID::kF16;
+    }
     if (out->data_type() == DataType::kFloat) {
       key.element_C = cutlass::library::NumericTypeID::kF32;
       key.element_compute = cutlass::library::NumericTypeID::kF32;
@@ -173,17 +176,19 @@
   }
 };
 
-REGISTER_USER_KERNEL("conv2d_quant")
-    .SetCreateFn<Conv2dQuantKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
-                     && (user_op::HobAttr<std::string>("data_format") == "channels_last")
-                     && (user_op::HobAttr<int32_t>("groups") == 1)
-                     && (user_op::HobDataType("in", 0) == DataType::kInt8))
-    .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {
-      // use static workspace size
-      return 128 * 1024 * 1024;
-    })
-    .SetPriority(user_op::kKernelPriorityOptimized);
+#define REGISTER_CONV_2D_QUANT_KERNEL(data_type)                                                \
+  REGISTER_USER_KERNEL("conv2d_quant")                                                          \
+      .SetCreateFn<Conv2dQuantKernel>()                                                         \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                          \
+                       && (user_op::HobAttr<std::string>("data_format") == "channels_last")     \
+                       && (user_op::HobAttr<int32_t>("groups") == 1)                            \
+                       && (user_op::HobDataType("in", 0) == data_type)                          \
+                       && (user_op::HobDataType("weight", 0) == DataType::kInt8))               \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 128 * 1024 * 1024; })\
+      .SetPriority(user_op::kKernelPriorityOptimized);
+
+REGISTER_CONV_2D_QUANT_KERNEL(DataType::kInt8)
+REGISTER_CONV_2D_QUANT_KERNEL(DataType::kFloat16)
 
 }  // namespace

diff --git a/oneflow/user/kernels/fused_glu_quant_kernel.cu b/oneflow/user/kernels/fused_glu_quant_kernel.cu
index 802e8373130..27d3582407f 100644
--- a/oneflow/user/kernels/fused_glu_quant_kernel.cu
+++ b/oneflow/user/kernels/fused_glu_quant_kernel.cu
@@ -305,6 +305,9 @@ class GpuFusedGluQuantKernel final : public user_op::OpKernel, public user_op::C
         cutlass::library::NumericTypeID::kS32,  // element_D
         cutlass::library::LayoutTypeID::kRowMajor  // layout_D
     );
+    if (input_x->data_type() == DataType::kFloat16) {
+      key.element_A = cutlass::library::NumericTypeID::kF16;
+    }
     if (data_type == DataType::kFloat) {
       key.element_scalar = cutlass::library::NumericTypeID::kF32;
       key.element_C = cutlass::library::NumericTypeID::kF32;
@@ -354,6 +357,9 @@
 REGISTER_GPU_FUSED_GLU_QUANT_KERNEL(int8_t, float);
 REGISTER_GPU_FUSED_GLU_QUANT_KERNEL(int8_t, half);
 
+REGISTER_GPU_FUSED_GLU_QUANT_KERNEL(half, float);
+REGISTER_GPU_FUSED_GLU_QUANT_KERNEL(half, half);
+
 }  // namespace oneflow
 
 #endif  // CUDA_VERSION >= 11020

diff --git a/oneflow/user/kernels/grouped_matmul_quant_kernel.cu b/oneflow/user/kernels/grouped_matmul_quant_kernel.cu
index acd7a21ace0..522774f7377 100644
--- a/oneflow/user/kernels/grouped_matmul_quant_kernel.cu
+++ b/oneflow/user/kernels/grouped_matmul_quant_kernel.cu
@@ -213,6 +213,10 @@ class GroupedMatmulQuantKernel final : public user_op::OpKernel, public user_op:
         cutlass::library::NumericTypeID::kS32,  // element_D
         cutlass::library::LayoutTypeID::kRowMajor  // layout_D
     );
+    const user_op::Tensor* a = ctx->Tensor4ArgNameAndIndex("as", 0);
+    if (a->data_type() == DataType::kFloat16) {
+      key.element_A = cutlass::library::NumericTypeID::kF16;
+    }
 
     if (GetDataType<T>::value == DataType::kFloat) {
       key.element_scalar = cutlass::library::NumericTypeID::kF32;
@@ -279,19 +283,22 @@
   bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
 };
 
-#define REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(out_cpp_type, out_data_type)              \
-  REGISTER_USER_KERNEL("grouped_matmul_quant")                                            \
-      .SetCreateFn<GroupedMatmulQuantKernel<out_cpp_type>>()                              \
-      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                    \
-                       && (user_op::HobDataType("as", 0) == DataType::kInt8)              \
-                       && (user_op::HobDataType("bs", 0) == DataType::kInt8)              \
-                       && (user_op::HobDataType("outputs", 0) == out_data_type))          \
-      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                       \
-        return kMaxProblemBatch * 10 * sizeof(void*) + 3 * 1024 * 1024;                   \
-      });
+#define REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(a_data_type, out_cpp_type, out_data_type)  \
+  REGISTER_USER_KERNEL("grouped_matmul_quant")                                             \
+      .SetCreateFn<GroupedMatmulQuantKernel<out_cpp_type>>()                               \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
+                       && (user_op::HobDataType("as", 0) == a_data_type)                   \
+                       && (user_op::HobDataType("bs", 0) == DataType::kInt8)               \
+                       && (user_op::HobDataType("outputs", 0) == out_data_type))           \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                        \
+        return kMaxProblemBatch * 10 * sizeof(void*) + 3 * 1024 * 1024;                    \
+      });
 
-REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(half, DataType::kFloat16)
-REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(float, DataType::kFloat)
+REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(DataType::kInt8, half, DataType::kFloat16)
+REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(DataType::kInt8, float, DataType::kFloat)
+
+REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(DataType::kFloat16, half, DataType::kFloat16)
+REGISTER_GROUPED_MATMUL_BIAS_KERNEL_GPU(DataType::kFloat16, float, DataType::kFloat)
 
 }  // namespace

diff --git a/oneflow/user/kernels/matmul_quant_kernels.cu b/oneflow/user/kernels/matmul_quant_kernels.cu
index d4ac0591aa1..a0a9c909cf2 100644
--- a/oneflow/user/kernels/matmul_quant_kernels.cu
+++ b/oneflow/user/kernels/matmul_quant_kernels.cu
@@ -145,6 +145,10 @@ class MatmulQuantKernel final : public user_op::OpKernel {
         cutlass::library::LayoutTypeID::kRowMajor  // layout_D
     );
 
+    if (a->data_type() == DataType::kFloat16) {
+      key.element_A = cutlass::library::NumericTypeID::kF16;
+    }
+
     if (out->data_type() == DataType::kFloat) {
       key.element_scalar = cutlass::library::NumericTypeID::kF32;
      key.element_C = cutlass::library::NumericTypeID::kF32;
@@ -170,16 +174,17 @@
   }
 };
 
-REGISTER_USER_KERNEL("matmul_quant")
-    .SetCreateFn<MatmulQuantKernel>()
-    .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
-                     && (user_op::HobDataType("a", 0) == DataType::kInt8)
-                     && (user_op::HobDataType("b", 0) == DataType::kInt8))
-    .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {
-      // use static workspace size
-      return 128 * 1024 * 1024;
-    })
-    .SetPriority(user_op::kKernelPriorityOptimized);
+#define REGISTER_MATMUL_QUANT_KERNEL(data_type)                                                 \
+  REGISTER_USER_KERNEL("matmul_quant")                                                          \
+      .SetCreateFn<MatmulQuantKernel>()                                                         \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                          \
+                       && (user_op::HobDataType("a", 0) == data_type)                           \
+                       && (user_op::HobDataType("b", 0) == DataType::kInt8))                    \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 128 * 1024 * 1024; })\
+      .SetPriority(user_op::kKernelPriorityOptimized);
+
+REGISTER_MATMUL_QUANT_KERNEL(DataType::kInt8)
+REGISTER_MATMUL_QUANT_KERNEL(DataType::kFloat16)
 
 }  // namespace oneflow

diff --git a/oneflow/user/kernels/min_max_observer_kernel.cu b/oneflow/user/kernels/min_max_observer_kernel.cu
index 786f46d8942..e8ecec8686e 100644
--- a/oneflow/user/kernels/min_max_observer_kernel.cu
+++ b/oneflow/user/kernels/min_max_observer_kernel.cu
@@ -17,6 +17,9 @@ limitations under the License.
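
The observer rework that follows computes per-tensor scales and zero points
consumed by the quant kernels above. The rules it implements reduce to the
following host-side sketch (float shown for clarity; this mirrors
CalScaleZeroPointSymmetric/CalScaleZeroPointAffine below and is not part of
the patch):

    #include <algorithm>
    #include <cmath>

    // Symmetric: zero_point pinned to 0, scale sized to the absolute max.
    inline float SymmetricScale(float min_val, float max_val, int quantization_bit) {
      const float weight_max = std::max(std::fabs(min_val), std::fabs(max_val));
      return weight_max / (std::pow(2.0f, quantization_bit - 1) - 1.0f);  // /127 for int8
    }

    // Affine: full-range scale plus a rounded zero point.
    inline void AffineScaleZeroPoint(float min_val, float max_val, int quantization_bit,
                                     float* scale, float* zero_point) {
      const float denominator = std::pow(2.0f, quantization_bit) - 1.0f;  // 255 for int8
      *scale = (max_val - min_val) / denominator;
      *zero_point = -std::nearbyint(min_val / *scale);
    }
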
#include "oneflow/core/framework/framework.h" #include "oneflow/core/cuda/atomic.cuh" #include "oneflow/core/ep/cuda/cuda_stream.h" +#include "oneflow/core/cuda/elementwise.cuh" +#include "oneflow/user/kernels/quantization_utils.cuh" +#include "oneflow/core/ndarray/binary_func.h" #include @@ -24,6 +27,56 @@ namespace oneflow { namespace { +template +__device__ __forceinline__ T FAbs(T val) { + return fabs(val); +} + +template<> +__device__ __forceinline__ half FAbs(half val) { + return __habs(val); +} + +template +__device__ __forceinline__ T Log2(T val) { + return log2(val); +} + +template<> +__device__ __forceinline__ half Log2(half val) { + return hlog2(val); +} + +template +__device__ __forceinline__ T Nearbyint(T val) { + return nearbyint(val); +} + +template<> +__device__ __forceinline__ half Nearbyint(half val) { + return __half2int_rn(val); +} + +template +__device__ __forceinline__ T Floor(T val) { + return floor(val); +} + +template<> +__device__ __forceinline__ half Floor(half val) { + return __half2int_rd(val); +} + +template +__device__ __forceinline__ T Sub(const T a, const T b) { + return a - b; +} + +template<> +__device__ half Sub(const half a, const half b) { + return __hsub_rn(a, b); +} + // NOTE(Liang Depeng): refer to // https://stackoverflow.com/questions/17371275/implementing-max-reduce-in-cuda template @@ -39,23 +92,23 @@ __global__ void ReduceMaxMinPerLayer(const T* input_ptr, const int64_t elements, shared_min[tid] = -FLT_MAX; while (gid < elements) { - shared_max[tid] = max(shared_max[tid], input_ptr[gid]); - shared_min[tid] = max(shared_min[tid], -input_ptr[gid]); + shared_max[tid] = BinaryFuncMax::Invoke(shared_max[tid], input_ptr[gid]); + shared_min[tid] = BinaryFuncMax::Invoke(shared_min[tid], -input_ptr[gid]); gid += gridDim.x * blockDim.x; } __syncthreads(); gid = (blockDim.x * blockIdx.x) + tid; for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s && gid < elements) { - shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); - shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); + shared_max[tid] = BinaryFuncMax::Invoke(shared_max[tid], shared_max[tid + s]); + shared_min[tid] = BinaryFuncMax::Invoke(shared_min[tid], shared_min[tid + s]); } __syncthreads(); } if (tid == 0) { - cuda::atomic::Max(max_ptr, shared_max[0]); - cuda::atomic::Max(min_ptr, shared_min[0]); + *max_ptr = BinaryFuncMax::Invoke(*max_ptr, shared_max[0]); + *min_ptr = BinaryFuncMax::Invoke(*min_ptr, shared_min[0]); } } @@ -78,23 +131,23 @@ __global__ void ReduceMaxMinPerChannel(const T* input_ptr, const int64_t element int64_t end = panel_size * (cur_channel + 1); while (index < end && index < elements) { - shared_max[tid] = max(shared_max[tid], input_ptr[index]); - shared_min[tid] = max(shared_min[tid], -input_ptr[index]); + shared_max[tid] = BinaryFuncMax::Invoke(shared_max[tid], input_ptr[index]); + shared_min[tid] = BinaryFuncMax::Invoke(shared_min[tid], -input_ptr[index]); index += blockDim.x; } __syncthreads(); for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { - shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); - shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); + shared_max[tid] = BinaryFuncMax::Invoke(shared_max[tid], shared_max[tid + s]); + shared_min[tid] = BinaryFuncMax::Invoke(shared_min[tid], shared_min[tid + s]); } __syncthreads(); } if (tid == 0) { - cuda::atomic::Max(&max_ptr[cur_channel], shared_max[0]); - cuda::atomic::Max(&min_ptr[cur_channel], shared_min[0]); + max_ptr[cur_channel] = 
BinaryFuncMax::Invoke(max_ptr[cur_channel], shared_max[0]); + min_ptr[cur_channel] = BinaryFuncMax::Invoke(min_ptr[cur_channel], shared_min[0]); } // __syncthreads(); @@ -122,8 +175,8 @@ __global__ void CalScaleZeroPointSymmetric(const T* max_ptr, const T* min_ptr, int64_t gid = (blockDim.x * blockIdx.x) + tid; while (gid < elements) { - T weight_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); - T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; + T weight_max = BinaryFuncMax::Invoke(FAbs(max_ptr[gid]), FAbs(min_ptr[gid])); + T denominator = Sub(static_cast(pow(2.0, quantization_bit - 1)), static_cast(1)); scale[gid] = weight_max / denominator; zero_point[gid] = 0; gid += gridDim.x * blockDim.x; @@ -137,11 +190,11 @@ __global__ void CalScaleZeroPointAffine(const T* max_ptr, const T* min_ptr, cons int64_t gid = (blockDim.x * blockIdx.x) + tid; while (gid < elements) { - T denominator = static_cast(pow(2.0, quantization_bit)) - 1; + T denominator = Sub(static_cast(pow(2.0, quantization_bit)), static_cast(1)); T min = -min_ptr[gid]; T s = (max_ptr[gid] - min) / denominator; scale[gid] = s; - zero_point[gid] = -nearbyint(min / s); + zero_point[gid] = -Nearbyint(min / s); gid += gridDim.x * blockDim.x; } } @@ -154,9 +207,9 @@ __global__ void CalScaleZeroPointCambricon(const T* max_ptr, const T* min_ptr, int64_t gid = (blockDim.x * blockIdx.x) + tid; while (gid < elements) { - T weight_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); + T weight_max = BinaryFuncMax::Invoke(FAbs(max_ptr[gid]), FAbs(min_ptr[gid])); // T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; - scale[gid] = floor(log2(weight_max)) - (quantization_bit - 2); + scale[gid] = static_cast(Floor(Log2(weight_max))) - (quantization_bit - 2); zero_point[gid] = 0; gid += gridDim.x * blockDim.x; } @@ -231,6 +284,37 @@ class GpuMinMaxObserverKernel final : public user_op::OpKernel { LAUNCH_CUDA_KERNEL((CalScaleZeroPointCambricon), cuda_stream, channel, 0, max_ptr, min_ptr, channel, static_cast(quantization_bit), scale->mut_dptr(), zero_point->mut_dptr()); + } else if (quantization_formula == "oneflow") { + if (per_layer_quantization) { + constexpr int pack_size = cuda::elementwise::PackSize(); + int64_t pack_num = (elements + pack_size - 1) / pack_size; + int grid_size = 0; + cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + grid_size = grid_size > 2048 ? 
2048 : grid_size;
+
+        size_t element_bytes = GetSizeOfDataType(GetDataType<T>::value);
+        CHECK_GE(tmp_buffer->shape_view().elem_cnt(), grid_size * element_bytes * 2);
+        T* min_max = reinterpret_cast<T*>(tmp_buffer->mut_dptr());
+
+        quantization::ReduceMinMaxPerTensor
+            <<<grid_size, cuda::elementwise::kBlockSize, 0, cuda_stream->cuda_stream()>>>(
+                elements, in->dptr<T>(), min_max);
+        if (quantization_bit == 8) {
+          int8_t upper_bound = (1 << (quantization_bit - 1)) - 1;
+          int8_t lower_bound = -upper_bound - 1;
+
+          quantization::ComputeScaleAndZeroPointBlock
+              <<<1, cuda::elementwise::kBlockSize, 0, cuda_stream->cuda_stream()>>>(
+                  grid_size, min_max, upper_bound, lower_bound, scale->mut_dptr<float>(),
+                  zero_point->mut_dptr<int8_t>());
+        } else {
+          UNIMPLEMENTED();
+        }
+
+      } else {
+        UNIMPLEMENTED() << "min max observer with oneflow quantization_formula only supports "
+                           "per-layer quantization";
+      }
     } else {
       UNIMPLEMENTED();
     }
@@ -250,10 +334,11 @@
           const Shape& in_shape = ctx->InputShape("in", 0);              \
           tmp_buffer_size = in_shape.At(0);                              \
         }                                                                \
-        return 2 * tmp_buffer_size * sizeof(dtype);                      \
+        return 128 * 1024 * 1024;                                        \
       })
 
 REGISTER_MIN_MAX_OBSERVER_KERNEL(float);
 REGISTER_MIN_MAX_OBSERVER_KERNEL(double);
+REGISTER_MIN_MAX_OBSERVER_KERNEL(half);
 
 }  // namespace oneflow

diff --git a/oneflow/user/kernels/redistribute_kernel.cu b/oneflow/user/kernels/redistribute_kernel.cu
new file mode 100644
index 00000000000..48cdd19fa9a
--- /dev/null
+++ b/oneflow/user/kernels/redistribute_kernel.cu
@@ -0,0 +1,100 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#ifdef WITH_CUTLASS_EXTENSION
+
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/kernel/new_kernel_util.h"
+#include "oneflow/core/framework/config_def.h"
+#include "oneflow/core/kernel/cuda_graph_support.h"
+
+namespace oneflow {
+
+namespace {
+
+template<typename T>
+__global__ void RedistributionData(int64_t n, int64_t k, const T* src, T* dst) {
+  const int global_tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  const int global_tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+  for (int64_t j = global_tid_y; j < n; j += blockDim.y * gridDim.y) {
+    for (int64_t i = global_tid_x * 16; i < k; i += blockDim.x * gridDim.x * 16) {
+      for (int m = 0; m < 4; ++m) {
+        dst[j * k + i + (m * 4)] = src[j * k + i + (m * 2)];
+        dst[j * k + i + (m * 4 + 1)] = src[j * k + i + (m * 2 + 1)];
+        dst[j * k + i + (m * 4 + 2)] = src[j * k + i + (8 + m * 2)];
+        dst[j * k + i + (m * 4 + 3)] = src[j * k + i + (8 + m * 2 + 1)];
+      }
+    }
+  }
+}
+
+void GetBlockDims(const int64_t col_size, int* block_dim_x, int* block_dim_y) {
+  const int block_size = 128;
+  if ((col_size / 4) < block_size) {
+    *block_dim_x = std::ceil(static_cast<double>(col_size) / 4);
+    *block_dim_y = (block_size + *block_dim_x - 1) / *block_dim_x;
+  } else {
+    *block_dim_x = block_size;
+    *block_dim_y = 1;
+  }
+}
+
+int GetNumBlocks(const int64_t num_instances, const int64_t instance_per_block) {
+  int max_blocks = (num_instances + instance_per_block - 1) / instance_per_block;
+  return std::min(max_blocks, kCudaMaxBlocksNum);
+}
+
+}  // namespace
+
+template<typename T>
+class RedistributeKernel final : public user_op::OpKernel {
+ public:
+  RedistributeKernel() = default;
+  ~RedistributeKernel() = default;
+
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+
+ private:
+  using user_op::OpKernel::Compute;
+
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
+    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
+    cudaStream_t cuda_stream = ctx->stream()->As<ep::CudaStream>()->cuda_stream();
+    const int n = in->shape_view().At(0);
+    const int k = in->shape_view().At(1);
+    int block_dim_x;
+    int block_dim_y;
+    GetBlockDims(k, &block_dim_x, &block_dim_y);
+    dim3 block_dims = dim3(block_dim_x, block_dim_y);
+    const int num_blocks = GetNumBlocks(n, block_dim_y);
+    RedistributionData<T>
+        <<<num_blocks, block_dims, 0, cuda_stream>>>(n, k, in->dptr<T>(), out->mut_dptr<T>());
+  }
+};
+
+#define REGISTER_REDISTRIBUTE_KERNEL(cpp_type, data_type)                \
+  REGISTER_USER_KERNEL("redistribute")                                   \
+      .SetCreateFn<RedistributeKernel<cpp_type>>()                       \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)   \
+                       && (user_op::HobDataType("in", 0) == data_type));
+
+REGISTER_REDISTRIBUTE_KERNEL(int8_t, DataType::kInt8)
+REGISTER_REDISTRIBUTE_KERNEL(float, DataType::kFloat)
+REGISTER_REDISTRIBUTE_KERNEL(half, DataType::kFloat16)
+
+}  // namespace oneflow
+
+#endif  // WITH_CUTLASS_EXTENSION

diff --git a/oneflow/user/ops/fused_glu_quant_op.cpp b/oneflow/user/ops/fused_glu_quant_op.cpp
index acf153b65b8..9b63898a55a 100644
--- a/oneflow/user/ops/fused_glu_quant_op.cpp
+++ b/oneflow/user/ops/fused_glu_quant_op.cpp
@@ -208,12 +208,16 @@ namespace oneflow {
   bool is_split_mode = false;
   if (ctx->has_input("v", 0)) { is_split_mode = true; }
 
-  CHECK_EQ_OR_RETURN(ctx->InputDType("w", 0), x_dtype)
-      << "data type of \'w\' is not consitant with \'x\'";
-
+  if (ctx->InputDType("w", 0) != x_dtype) {
+    CHECK_EQ_OR_RETURN(x_dtype, out_dtype)
+        << "data type of \'w\' is not consistent with \'out_dtype\'";
+  } else {
+    CHECK_EQ_OR_RETURN(ctx->InputDType("w", 0), x_dtype)
+        << "data type of \'w\' is not consistent with \'x\'";
+  }
   if (is_split_mode) {
-    CHECK_EQ_OR_RETURN(ctx->InputDType("v", 0), x_dtype)
-        << "data type of \'v\' is not consitant with \'x\'";
+    CHECK_EQ_OR_RETURN(ctx->InputDType("v", 0), ctx->InputDType("w", 0))
+        << "data type of \'v\' is not consistent with \'w\'";
   }
 
   // set output data type

diff --git a/oneflow/user/ops/grouped_matmul_quant_op.cpp b/oneflow/user/ops/grouped_matmul_quant_op.cpp
index 06a8ef0ec3c..0caff423fe3 100644
--- a/oneflow/user/ops/grouped_matmul_quant_op.cpp
+++ b/oneflow/user/ops/grouped_matmul_quant_op.cpp
@@ -84,7 +84,9 @@ Maybe<double> GetComputationCost(user_op::ComputeComplexityFnContext* ctx) {
   const DataType out_data_type = ctx->Attr<DataType>("out_dtype");
   for (int64_t i = 0; i < input_size; ++i) {
     const user_op::TensorDesc& x_desc = ctx->InputTensorDesc("as", i);
-    CHECK_EQ_OR_RETURN(x_desc.data_type(), weight_data_type);
+    if (x_desc.data_type() != weight_data_type) {
+      CHECK_EQ_OR_RETURN(x_desc.data_type(), out_data_type);
+    }
     CHECK_GE_OR_RETURN(x_desc.shape().NumAxes(), 2);
     const int64_t k = x_desc.shape().At(x_desc.shape().NumAxes() - 1);
     const user_op::TensorDesc& weight_desc = ctx->InputTensorDesc("bs", i);

diff --git a/oneflow/user/ops/min_max_observer_op.cpp b/oneflow/user/ops/min_max_observer_op.cpp
index c1f2ba715f6..cdc766690da 100644
--- a/oneflow/user/ops/min_max_observer_op.cpp
+++ b/oneflow/user/ops/min_max_observer_op.cpp
@@ -30,6 +30,12 @@ namespace oneflow {
       ctx->SetOutputShape("scale", 0, Shape({in_shape.At(0)}));
       ctx->SetOutputShape("zero_point", 0, Shape({in_shape.At(0)}));
     }
+  } else if (ctx->Attr<std::string>("quantization_formula") == "oneflow") {
+    CHECK_OR_RETURN(ctx->Attr<bool>("per_layer_quantization"))
+        << "min max observer with oneflow quantization_formula only supports per-layer "
+           "quantization";
+    ctx->SetOutputShape("scale", 0, Shape({1}));
+    ctx->SetOutputShape("zero_point", 0, Shape({1}));
   } else {  // quantization_formula == "cambricon"
     ctx->SetOutputShape("scale", 0, Shape({1}));
     ctx->SetOutputShape("zero_point", 0, Shape({1}));
@@ -70,8 +76,17 @@ namespace oneflow {
 }
 
 /* static */ Maybe<void> MinMaxObserverOp::InferDataType(user_op::InferContext* ctx) {
-  ctx->SetOutputDType("scale", 0, ctx->InputDType("in", 0));
-  ctx->SetOutputDType("zero_point", 0, ctx->InputDType("in", 0));
+  if (ctx->Attr<std::string>("quantization_formula") == "oneflow") {
+    if (ctx->Attr<int32_t>("quantization_bit") == 8) {
+      ctx->SetOutputDType("zero_point", 0, DataType::kInt8);
+      ctx->SetOutputDType("scale", 0, DataType::kFloat);
+    } else {
+      OF_UNIMPLEMENTED();
+    }
+  } else {
+    ctx->SetOutputDType("scale", 0, ctx->InputDType("in", 0));
+    ctx->SetOutputDType("zero_point", 0, ctx->InputDType("in", 0));
+  }
   return Maybe<void>::Ok();
 }

diff --git a/oneflow/user/ops/redistribute_op.cpp b/oneflow/user/ops/redistribute_op.cpp
new file mode 100644
index 00000000000..7be467c485d
--- /dev/null
+++ b/oneflow/user/ops/redistribute_op.cpp
@@ -0,0 +1,53 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/framework/op_generated.h"
+
+namespace oneflow {
+
+/* static */ Maybe<void> RedistributeOp::InferLogicalTensorDesc(user_op::InferContext* ctx) {
+  const user_op::TensorDesc& in = ctx->InputTensorDesc("in", 0);
+  CHECK_EQ_OR_RETURN(in.shape().NumAxes(), 2);
+  const int n = in.shape().At(0);
+  const int k = in.shape().At(1);
+
+  CHECK_EQ_OR_RETURN(k % 16, 0);
+
+  user_op::TensorDesc* out = ctx->MutOutputTensorDesc("out", 0);
+  ctx->SetOutputIsDynamic("out", 0, ctx->InputIsDynamic("in", 0));
+  out->set_shape(ctx->InputShape("in", 0));
+  return Maybe<void>::Ok();
+}
+
+/*static*/ Maybe<void> RedistributeOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) {
+  return InferLogicalTensorDesc(ctx);
+}
+
+/* static */ Maybe<void> RedistributeOp::GetSbp(user_op::SbpContext* ctx) {
+  ctx->NewBuilder().Split(user_op::OpArg("in", 0), 0).Split(user_op::OpArg("out", 0), 0).Build();
+  ctx->NewBuilder()
+      .PartialSum(user_op::OpArg("in", 0))
+      .PartialSum(user_op::OpArg("out", 0))
+      .Build();
+  return Maybe<void>::Ok();
+}
+
+/* static */ Maybe<void> RedistributeOp::InferDataType(user_op::InferContext* ctx) {
+  ctx->SetOutputDType("out", 0, ctx->InputTensorDesc("in", 0).data_type());
+  return Maybe<void>::Ok();
+}
+
+}  // namespace oneflow

From 7e154e18ea80d74d1617e4ea1b4b45f8ec838665 Mon Sep 17 00:00:00 2001
From: clackhan
Date: Fri, 22 Sep 2023 09:35:56 +0000
Subject: [PATCH 65/65] refine

---
 oneflow/user/kernels/conv_quant_kernels.cu          | 1 +
 oneflow/user/kernels/fused_glu_quant_kernel.cu      | 1 +
 oneflow/user/kernels/grouped_matmul_quant_kernel.cu | 1 +
 oneflow/user/kernels/matmul_quant_kernels.cu        | 1 +
 4 files changed, 4 insertions(+)

diff --git a/oneflow/user/kernels/conv_quant_kernels.cu b/oneflow/user/kernels/conv_quant_kernels.cu
index 2fee0d2d2d5..5998498145f 100644
--- a/oneflow/user/kernels/conv_quant_kernels.cu
+++ b/oneflow/user/kernels/conv_quant_kernels.cu
@@ -150,6 +150,7 @@ class Conv2dQuantKernel final : public user_op::OpKernel, public user_op::CudaGr
         cutlass::library::NumericTypeID::kS32, cutlass::library::NumericTypeID::kS32);
     if (in->data_type() == DataType::kFloat16) {
       key.element_A = cutlass::library::NumericTypeID::kF16;
+      return;
     }
     if (out->data_type() == DataType::kFloat) {
       key.element_C = cutlass::library::NumericTypeID::kF32;

diff --git a/oneflow/user/kernels/fused_glu_quant_kernel.cu b/oneflow/user/kernels/fused_glu_quant_kernel.cu
index 27d3582407f..cd60160298b 100644
--- a/oneflow/user/kernels/fused_glu_quant_kernel.cu
+++ b/oneflow/user/kernels/fused_glu_quant_kernel.cu
@@ -307,6 +307,7 @@
     );
     if (input_x->data_type() == DataType::kFloat16) {
       key.element_A = cutlass::library::NumericTypeID::kF16;
+      return;
     }
     if (data_type == DataType::kFloat) {
       key.element_scalar = cutlass::library::NumericTypeID::kF32;

diff --git a/oneflow/user/kernels/grouped_matmul_quant_kernel.cu b/oneflow/user/kernels/grouped_matmul_quant_kernel.cu
index 522774f7377..7dd4df7ef62 100644
--- a/oneflow/user/kernels/grouped_matmul_quant_kernel.cu
+++ b/oneflow/user/kernels/grouped_matmul_quant_kernel.cu
@@ -216,6 +216,7 @@
     const user_op::Tensor* a = ctx->Tensor4ArgNameAndIndex("as", 0);
     if (a->data_type() == DataType::kFloat16) {
       key.element_A = cutlass::library::NumericTypeID::kF16;
+      return;
     }
 
     if (GetDataType<T>::value == DataType::kFloat) {

diff --git a/oneflow/user/kernels/matmul_quant_kernels.cu b/oneflow/user/kernels/matmul_quant_kernels.cu
index a0a9c909cf2..11073885848 100644
--- a/oneflow/user/kernels/matmul_quant_kernels.cu
+++ b/oneflow/user/kernels/matmul_quant_kernels.cu
@@ -147,6 +147,7 @@ class MatmulQuantKernel final : public user_op::OpKernel {
 
     if (a->data_type() == DataType::kFloat16) {
       key.element_A = cutlass::library::NumericTypeID::kF16;
+      return;
     }
 
     if (out->data_type() == DataType::kFloat) {
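
For reference, the interleave performed by RedistributionData in patch 64 can be
checked against a plain host-side implementation of the same permutation. This
sketch is illustrative only and not part of the patch series; it mirrors the
kernel's index arithmetic exactly:

    #include <cstdint>

    template<typename T>
    void RedistributeReference(int64_t n, int64_t k, const T* src, T* dst) {
      // Within every 16-wide group, pairs from the low half (s0..s7) and the
      // high half (s8..s15) are interleaved:
      //   dst = [s0 s1 s8 s9 | s2 s3 s10 s11 | s4 s5 s12 s13 | s6 s7 s14 s15]
      for (int64_t j = 0; j < n; ++j) {
        for (int64_t i = 0; i < k; i += 16) {
          for (int m = 0; m < 4; ++m) {
            dst[j * k + i + m * 4 + 0] = src[j * k + i + m * 2 + 0];
            dst[j * k + i + m * 4 + 1] = src[j * k + i + m * 2 + 1];
            dst[j * k + i + m * 4 + 2] = src[j * k + i + 8 + m * 2 + 0];
            dst[j * k + i + m * 4 + 3] = src[j * k + i + 8 + m * 2 + 1];
          }
        }
      }
    }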