Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split batched solver compilation #1629

Open
wants to merge 15 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions common/cuda_hip/solver/batch_bicgstab_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
#define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_

#include "core/solver/batch_bicgstab_kernels.hpp"

#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>
Expand All @@ -25,6 +27,11 @@
namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {


constexpr int max_bicgstab_threads = 1024;


namespace batch_single_kernels {


Expand Down Expand Up @@ -168,12 +175,14 @@ __device__ __forceinline__ void update_x_middle(
template <typename StopType, int n_shared, bool prec_shared_bool,
typename PrecType, typename LogType, typename BatchMatrixType,
typename ValueType>
__global__ void apply_kernel(
const gko::kernels::batch_bicgstab::storage_config sconf,
const int max_iter, const gko::remove_complex<ValueType> tol,
LogType logger, PrecType prec_shared, const BatchMatrixType mat,
const ValueType* const __restrict__ b, ValueType* const __restrict__ x,
ValueType* const __restrict__ workspace = nullptr)
__global__ void __launch_bounds__(max_bicgstab_threads)
apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
const int max_iter, const gko::remove_complex<ValueType> tol,
LogType logger, PrecType prec_shared,
const BatchMatrixType mat,
const ValueType* const __restrict__ b,
ValueType* const __restrict__ x,
ValueType* const __restrict__ workspace = nullptr)
{
using real_type = typename gko::remove_complex<ValueType>;
const auto num_batch_items = mat.num_batch_items;
Expand Down
80 changes: 80 additions & 0 deletions common/cuda_hip/solver/batch_bicgstab_launch.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include "common/cuda_hip/base/batch_struct.hpp"
#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/types.hpp"
#include "common/cuda_hip/matrix/batch_struct.hpp"
#include "core/base/batch_struct.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/solver/batch_bicgstab_kernels.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_bicgstab {


template <typename T>
using settings = gko::kernels::batch_bicgstab::settings<T>;


template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
typename PrecType, typename LogType, typename BatchMatrixType>
void launch_apply_kernel(
std::shared_ptr<const DefaultExecutor> exec,
const gko::kernels::batch_bicgstab::storage_config& sconf,
const settings<remove_complex<ValueType>>& settings, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const device_type<ValueType>* const __restrict__ b_values,
device_type<ValueType>* const __restrict__ x_values,
device_type<ValueType>* const __restrict__ workspace_data,
const int& block_size, const size_t& shared_size);

#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH(_vtype, _n_shared, _prec_shared, \
mat_t, log_t, pre_t, stop_t) \
void launch_apply_kernel<device_type<_vtype>, _n_shared, _prec_shared, \
stop_t<device_type<_vtype>>>( \
std::shared_ptr<const DefaultExecutor> exec, \
const gko::kernels::batch_bicgstab::storage_config& sconf, \
const settings<remove_complex<device_type<_vtype>>>& settings, \
log_t<gko::remove_complex<device_type<_vtype>>>& logger, \
pre_t<device_type<_vtype>>& prec, \
const mat_t<const device_type<_vtype>>& mat, \
const device_type<_vtype>* const __restrict__ b_values, \
device_type<_vtype>* const __restrict__ x_values, \
device_type<_vtype>* const __restrict__ workspace_data, \
const int& block_size, const size_t& shared_size)

#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 0, false)
#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 1, false)
#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 2, false)
#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 3, false)
#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 4, false)
#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 5, false)
#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 6, false)
#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 7, false)
#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 8, false)
#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, false)
#define GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH, _vtype, 9, true)


} // namespace batch_bicgstab
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
69 changes: 69 additions & 0 deletions common/cuda_hip/solver/batch_bicgstab_launch.instantiate.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "common/cuda_hip/solver/batch_bicgstab_launch.hpp"

#include <ginkgo/core/base/exception_helpers.hpp>

#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/solver/batch_bicgstab_kernels.hpp"
#include "core/solver/batch_dispatch.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_bicgstab {


template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
typename PrecType, typename LogType, typename BatchMatrixType>
void launch_apply_kernel(
std::shared_ptr<const DefaultExecutor> exec,
const gko::kernels::batch_bicgstab::storage_config& sconf,
const settings<remove_complex<ValueType>>& settings, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const device_type<ValueType>* const __restrict__ b_values,
device_type<ValueType>* const __restrict__ x_values,
device_type<ValueType>* const __restrict__ workspace_data,
const int& block_size, const size_t& shared_size)
{
batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared>
<<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
sconf, settings.max_iterations,
as_device_type(settings.residual_tol), logger, prec, mat, b_values,
x_values, workspace_data);
}


// begin
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_0_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_1_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_2_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_3_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_4_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_5_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_6_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_7_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_8_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_LAUNCH_9_TRUE);
// end


} // namespace batch_bicgstab
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
23 changes: 15 additions & 8 deletions common/cuda_hip/solver/batch_cg_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_


#include "core/solver/batch_cg_kernels.hpp"

#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>
Expand All @@ -27,6 +29,11 @@
namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {


constexpr int max_cg_threads = 1024;


namespace batch_single_kernels {


Expand Down Expand Up @@ -113,14 +120,14 @@ __device__ __forceinline__ void update_x_and_r(
template <typename StopType, const int n_shared, const bool prec_shared_bool,
typename PrecType, typename LogType, typename BatchMatrixType,
typename ValueType>
__global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
const int max_iter,
const gko::remove_complex<ValueType> tol,
LogType logger, PrecType prec_shared,
const BatchMatrixType mat,
const ValueType* const __restrict__ b,
ValueType* const __restrict__ x,
ValueType* const __restrict__ workspace = nullptr)
__global__ void __launch_bounds__(max_cg_threads)
apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
const int max_iter, const gko::remove_complex<ValueType> tol,
LogType logger, PrecType prec_shared,
const BatchMatrixType mat,
const ValueType* const __restrict__ b,
ValueType* const __restrict__ x,
ValueType* const __restrict__ workspace = nullptr)
{
using real_type = typename gko::remove_complex<ValueType>;
const auto num_batch_items = mat.num_batch_items;
Expand Down
72 changes: 72 additions & 0 deletions common/cuda_hip/solver/batch_cg_launch.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include "common/cuda_hip/base/batch_struct.hpp"
#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/types.hpp"
#include "common/cuda_hip/matrix/batch_struct.hpp"
#include "core/base/batch_struct.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/solver/batch_cg_kernels.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_cg {


template <typename T>
using settings = gko::kernels::batch_cg::settings<T>;


template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
typename PrecType, typename LogType, typename BatchMatrixType>
void launch_apply_kernel(
std::shared_ptr<const DefaultExecutor> exec,
const gko::kernels::batch_cg::storage_config& sconf,
const settings<remove_complex<ValueType>>& settings, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const device_type<ValueType>* const __restrict__ b_values,
device_type<ValueType>* const __restrict__ x_values,
device_type<ValueType>* const __restrict__ workspace_data,
const int& block_size, const size_t& shared_size);

#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \
log_t, pre_t, stop_t) \
void launch_apply_kernel<device_type<_vtype>, _n_shared, _prec_shared, \
stop_t<device_type<_vtype>>>( \
std::shared_ptr<const DefaultExecutor> exec, \
const gko::kernels::batch_cg::storage_config& sconf, \
const settings<remove_complex<_vtype>>& settings, \
log_t<device_type<gko::remove_complex<device_type<_vtype>>>>& logger, \
pre_t<device_type<_vtype>>& prec, \
const mat_t<const device_type<_vtype>>& mat, \
const device_type<_vtype>* const __restrict__ b_values, \
device_type<_vtype>* const __restrict__ x_values, \
device_type<_vtype>* const __restrict__ workspace_data, \
const int& block_size, const size_t& shared_size)

#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false)
#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false)
#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false)
#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false)
#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false)
#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false)
#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true)


} // namespace batch_cg
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
61 changes: 61 additions & 0 deletions common/cuda_hip/solver/batch_cg_launch.instantiate.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "common/cuda_hip/solver/batch_cg_launch.hpp"

#include <ginkgo/core/base/exception_helpers.hpp>

#include "common/cuda_hip/solver/batch_cg_kernels.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/solver/batch_cg_kernels.hpp"
#include "core/solver/batch_dispatch.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace batch_cg {


template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
typename PrecType, typename LogType, typename BatchMatrixType>
void launch_apply_kernel(
std::shared_ptr<const DefaultExecutor> exec,
const gko::kernels::batch_cg::storage_config& sconf,
const settings<remove_complex<ValueType>>& settings, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const device_type<ValueType>* const __restrict__ b_values,
device_type<ValueType>* const __restrict__ x_values,
device_type<ValueType>* const __restrict__ workspace_data,
const int& block_size, const size_t& shared_size)
{
batch_single_kernels::apply_kernel<StopType, n_shared, prec_shared>
<<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
sconf, settings.max_iterations,
as_device_type(settings.residual_tol), logger, prec, mat, b_values,
x_values, workspace_data);
}


// begin
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE);
// split
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE);
// end


} // namespace batch_cg
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
Loading
Loading