Skip to content

Commit

Permalink
Merge pull request #189 from gbalduzz/ct_int-solver
Browse files Browse the repository at this point in the history
Ct int solver
  • Loading branch information
PDoakORNL committed Aug 13, 2020
2 parents 6fcfcb4 + 1a2c079 commit c3e0fb6
Show file tree
Hide file tree
Showing 104 changed files with 3,537 additions and 360 deletions.
36 changes: 30 additions & 6 deletions cmake/dca_config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -201,28 +201,43 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/rng.hpp.in"
################################################################################
# Select the cluster solver.
set(DCA_CLUSTER_SOLVER "CT-AUX" CACHE STRING
"The cluster solver for the DCA(+) loop. Options are: CT-AUX | SS-CT-HYB.")
set_property(CACHE DCA_CLUSTER_SOLVER PROPERTY STRINGS CT-AUX SS-CT-HYB)
"The cluster solver for the DCA(+) loop. Options are: CT-AUX | CT-INT | SS-CT-HYB.")
set_property(CACHE DCA_CLUSTER_SOLVER PROPERTY STRINGS CT-AUX CT-INT SS-CT-HYB)

if (DCA_CLUSTER_SOLVER STREQUAL "CT-INT")
set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::CT_INT)
set(DCA_CLUSTER_SOLVER_INCLUDE "dca/phys/dca_step/cluster_solver/ctint/ctint_cluster_solver.hpp")

set(DCA_USE_CTINT_SUBMATRIX ON CACHE BOOL "Use submatrix updates if the CT-INT solver is selected.")
if(DCA_USE_CTINT_SUBMATRIX)
set(DCA_CLUSTER_SOLVER_TYPE
"dca::phys::solver::CtintClusterSolver<walker_device, ParametersType, true>")
else()
set(DCA_CLUSTER_SOLVER_TYPE
"dca::phys::solver::CtintClusterSolver<walker_device, ParametersType, false>")
endif()

if (DCA_CLUSTER_SOLVER STREQUAL "CT-AUX")
elseif (DCA_CLUSTER_SOLVER STREQUAL "CT-AUX")
set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::CT_AUX)
set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::CtauxClusterSolver<walker_device, ParametersType, DcaDataType, DIST>")
set(DCA_CLUSTER_SOLVER_INCLUDE
"dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp")
"dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp")


elseif (DCA_CLUSTER_SOLVER STREQUAL "SS-CT-HYB")
set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::SS_CT_HYB)
set(DCA_CLUSTER_SOLVER_TYPE "dca::phys::solver::SsCtHybClusterSolver<walker_device, ParametersType, DcaDataType, DIST>")
set(DCA_CLUSTER_SOLVER_INCLUDE
"dca/phys/dca_step/cluster_solver/ss_ct_hyb/ss_ct_hyb_cluster_solver.hpp")
"dca/phys/dca_step/cluster_solver/ss_ct_hyb/ss_ct_hyb_cluster_solver.hpp")

# elseif (DCA_CLUSTER_SOLVER STREQUAL "HTS")
# set(DCA_CLUSTER_SOLVER_NAME dca::phys::solver::HIGH_TEMPERATURE_SERIES)
# set(DCA_CLUSTER_SOLVER_INCLUDE
# "dca/phys/dca_step/cluster_solver/high_temperature_series_expansion/high_temperature_series_expansion_solver.hpp")

else()
message(FATAL_ERROR "Please set DCA_CLUSTER_SOLVER to a valid option: CT-AUX | SS-CT-HYB.")
# Abort the configuration when DCA_CLUSTER_SOLVER matches none of the branches above.
# The listed names must match the values accepted by the STREQUAL checks ("CT-INT", not "CT_INT"),
# and the string is kept on a single line so no newline/indentation ends up in the message.
message(FATAL_ERROR
  "Please set DCA_CLUSTER_SOLVER to a valid option: CT-AUX | CT-INT | SS-CT-HYB.")
endif()

################################################################################
Expand Down Expand Up @@ -312,6 +327,15 @@ configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/mc_options.hpp.in"
"${CMAKE_BINARY_DIR}/include/dca/config/mc_options.hpp" @ONLY)


################################################################################
# Symmetrization
option(DCA_SYMMETRIZE "Apply cluster, time and frequency symmetries to single particle functions."
ON)

if(DCA_SYMMETRIZE)
add_compile_definitions(DCA_WITH_SYMMETRIZATION)
endif()

################################################################################
# Generate applications' config files.
configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/analysis.hpp.in"
Expand Down
1 change: 0 additions & 1 deletion include/dca/config/cmake_options.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ struct CMakeOptions {
// Parallelization
static const std::string dca_with_mpi;
static const std::string dca_with_threaded_solver;
static const std::string dca_threading_library;

// Others
static const std::string dca_cluster_solver;
Expand Down
30 changes: 15 additions & 15 deletions include/dca/linalg/lapack/magma.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,15 +124,15 @@ inline magma_trans_t toMagmaTrans(const char x) {
inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k,
const float alpha, const float* const* a, int* lda,
const float* const* b, int* ldb, const float beta, float** c,
int* ldc, const int batch_count, magma_queue_t& queue) {
int* ldc, const int batch_count, magma_queue_t queue) {
magmablas_sgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b,
ldb, beta, c, ldc, batch_count, queue);
checkErrorsCudaDebug();
}
inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m, int* n, int* k,
const double alpha, const double* const* a, int* lda,
const double* const* b, int* ldb, const double beta, double** c,
int* ldc, const int batch_count, const magma_queue_t& queue) {
int* ldc, const int batch_count, const magma_queue_t queue) {
magmablas_dgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b,
ldb, beta, c, ldc, batch_count, queue);
checkErrorsCudaDebug();
Expand All @@ -142,7 +142,7 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m
const std::complex<float>* const* a, int* lda,
const std::complex<float>* const* b, int* ldb,
const std::complex<float> beta, std::complex<float>** c,
int* ldc, const int batch_count, const magma_queue_t& queue) {
int* ldc, const int batch_count, const magma_queue_t queue) {
using util::castCudaComplex;
magmablas_cgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
*castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b),
Expand All @@ -154,7 +154,7 @@ inline void magmablas_gemm_vbatched(const char transa, const char transb, int* m
const std::complex<double>* const* a, int* lda,
const std::complex<double>* const* b, int* ldb,
const std::complex<double> beta, std::complex<double>** c,
int* ldc, const int batch_count, const magma_queue_t& queue) {
int* ldc, const int batch_count, const magma_queue_t queue) {
using util::castCudaComplex;
magmablas_zgemm_vbatched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
*castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b),
Expand All @@ -168,7 +168,7 @@ inline void magmablas_gemm_vbatched_max_nocheck(const char transa, const char tr
const float* const* b, int* ldb, const float beta,
float** c, int* ldc, const int batch_count,
const int m_max, const int n_max, const int k_max,
magma_queue_t& queue) {
magma_queue_t queue) {
magmablas_sgemm_vbatched_max_nocheck(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha,
a, lda, b, ldb, beta, c, ldc, batch_count, m_max, n_max,
k_max, queue);
Expand All @@ -181,7 +181,7 @@ inline void magmablas_gemm_vbatched_max_nocheck(const char transa, const char tr
const double* const* b, int* ldb, const double beta,
double** c, int* ldc, const int batch_count,
const int m_max, const int n_max, const int k_max,
magma_queue_t& queue) {
magma_queue_t queue) {
magmablas_dgemm_vbatched_max_nocheck(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha,
a, lda, b, ldb, beta, c, ldc, batch_count, m_max, n_max,
k_max, queue);
Expand All @@ -192,7 +192,7 @@ inline void magmablas_gemm_vbatched_max_nocheck(
const char transa, const char transb, int* m, int* n, int* k, const std::complex<float> alpha,
const std::complex<float>* const* a, int* lda, const std::complex<float>* const* b, int* ldb,
const std::complex<float> beta, std::complex<float>** c, int* ldc, const int batch_count,
const int m_max, const int n_max, const int k_max, magma_queue_t& queue) {
const int m_max, const int n_max, const int k_max, magma_queue_t queue) {
using util::castCudaComplex;
magmablas_cgemm_vbatched_max_nocheck(
toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha),
Expand All @@ -205,7 +205,7 @@ inline void magmablas_gemm_vbatched_max_nocheck(
const char transa, const char transb, int* m, int* n, int* k, const std::complex<double> alpha,
const std::complex<double>* const* a, int* lda, const std::complex<double>* const* b, int* ldb,
const std::complex<double> beta, std::complex<double>** c, int* ldc, const int batch_count,
const int m_max, const int n_max, const int k_max, magma_queue_t& queue) {
const int m_max, const int n_max, const int k_max, magma_queue_t queue) {
using util::castCudaComplex;
magmablas_zgemm_vbatched_max_nocheck(
toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, *castCudaComplex(alpha),
Expand All @@ -218,7 +218,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i
const int k, const float alpha, const float* const* a,
const int lda, const float* const* b, const int ldb,
const float beta, float** c, const int ldc,
const int batch_count, magma_queue_t& queue) {
const int batch_count, magma_queue_t queue) {
magmablas_sgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b,
ldb, beta, c, ldc, batch_count, queue);
checkErrorsCudaDebug();
Expand All @@ -227,7 +227,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i
const int k, const double alpha, const double* const* a,
const int lda, const double* const* b, const int ldb,
const double beta, double** c, const int ldc,
const int batch_count, const magma_queue_t& queue) {
const int batch_count, const magma_queue_t queue) {
magmablas_dgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k, alpha, a, lda, b,
ldb, beta, c, ldc, batch_count, queue);
checkErrorsCudaDebug();
Expand All @@ -237,7 +237,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i
const std::complex<float>* const* a, const int lda,
const std::complex<float>* const* b, const int ldb,
const std::complex<float> beta, std::complex<float>** c,
const int ldc, const int batch_count, const magma_queue_t& queue) {
const int ldc, const int batch_count, const magma_queue_t queue) {
using util::castCudaComplex;
magmablas_cgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
*castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), ldb,
Expand All @@ -249,7 +249,7 @@ inline void magmablas_gemm_batched(const char transa, const char transb, const i
const std::complex<double>* const* a, const int lda,
const std::complex<double>* const* b, const int ldb,
const std::complex<double> beta, std::complex<double>** c,
const int ldc, const int batch_count, const magma_queue_t& queue) {
const int ldc, const int batch_count, const magma_queue_t queue) {
using util::castCudaComplex;
magmablas_zgemm_batched(toMagmaTrans(transa), toMagmaTrans(transb), m, n, k,
*castCudaComplex(alpha), castCudaComplex(a), lda, castCudaComplex(b), ldb,
Expand All @@ -276,8 +276,8 @@ inline int get_getri_nb<std::complex<double>>(int n) {
return magma_get_zgetri_nb(n);
}

} // magma
} // linalg
} // dca
} // namespace magma
} // namespace linalg
} // namespace dca

#endif // DCA_LINALG_LAPACK_MAGMA_HPP
13 changes: 11 additions & 2 deletions include/dca/linalg/util/cuda_stream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,14 @@ class CudaStream {
}

CudaStream(const CudaStream& other) = delete;
CudaStream& operator=(const CudaStream& other) = delete;

CudaStream(CudaStream&& other) {
std::swap(stream_, other.stream_);
// Move constructor: takes over the stream handle held by `other`.
// stream_ starts out as nullptr through its default member initializer, so after the swap
// `other` holds nullptr and this object holds the handle `other` previously had.
CudaStream(CudaStream&& other) noexcept {
swap(other);
}
// Move assignment implemented as a swap: this object receives the handle of `other`, and
// `other` receives the handle this object held before the assignment.
// NOTE(review): the previously held stream is presumably released when `other` is destroyed;
// the destructor is not visible here — confirm.
// Returns a reference to this object.
CudaStream& operator=(CudaStream&& other) noexcept {
swap(other);
return *this;
}

void sync() const {
Expand All @@ -49,6 +54,10 @@ class CudaStream {
return stream_;
}

// Exchanges the raw cudaStream_t handles of this object and `other`.
// No stream is created or destroyed by this call.
void swap(CudaStream& other) noexcept {
std::swap(stream_, other.stream_);
}

private:
cudaStream_t stream_ = nullptr;
};
Expand Down
29 changes: 15 additions & 14 deletions include/dca/linalg/util/magma_batched_gemm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "dca/linalg/lapack/magma.hpp"
#include "dca/linalg/util/allocators/vectors_typedefs.hpp"
#include "dca/linalg/util/cuda_event.hpp"
#include "dca/linalg/util/magma_queue.hpp"
#include "dca/linalg/vector.hpp"

namespace dca {
Expand All @@ -30,7 +31,7 @@ template <typename ScalarType>
class MagmaBatchedGemm {
public:
// Creates a plan for a batched gemm.
MagmaBatchedGemm(magma_queue_t queue);
MagmaBatchedGemm(const linalg::util::MagmaQueue& queue);
// Creates a plan for a batched gemm and allocates the memory for the arguments of `size`
// multiplications.
MagmaBatchedGemm(int size, magma_queue_t queue);
Expand All @@ -52,20 +53,19 @@ class MagmaBatchedGemm {
void synchronizeCopy();

private:
magma_queue_t queue_;
const cudaStream_t stream_;
const linalg::util::MagmaQueue& queue_;
CudaEvent copied_;

linalg::util::HostVector<const ScalarType *> a_ptr_, b_ptr_;
linalg::util::HostVector<const ScalarType*> a_ptr_, b_ptr_;
linalg::util::HostVector<ScalarType*> c_ptr_;

linalg::Vector<const ScalarType *, linalg::GPU> a_ptr_dev_, b_ptr_dev_;
linalg::Vector<const ScalarType*, linalg::GPU> a_ptr_dev_, b_ptr_dev_;
linalg::Vector<ScalarType*, linalg::GPU> c_ptr_dev_;
};

template <typename ScalarType>
MagmaBatchedGemm<ScalarType>::MagmaBatchedGemm(magma_queue_t queue)
: queue_(queue), stream_(magma_queue_get_cuda_stream(queue_)) {}
MagmaBatchedGemm<ScalarType>::MagmaBatchedGemm(const linalg::util::MagmaQueue& queue)
: queue_(queue) {}

template <typename ScalarType>
MagmaBatchedGemm<ScalarType>::MagmaBatchedGemm(const int size, magma_queue_t queue)
Expand Down Expand Up @@ -99,10 +99,11 @@ void MagmaBatchedGemm<ScalarType>::execute(const char transa, const char transb,
const int n, const int k, const ScalarType alpha,
const ScalarType beta, const int lda, const int ldb,
const int ldc) {
a_ptr_dev_.setAsync(a_ptr_, stream_);
b_ptr_dev_.setAsync(b_ptr_, stream_);
c_ptr_dev_.setAsync(c_ptr_, stream_);
copied_.record(stream_);
// TODO: store in a buffer if the performance gain is necessary.
a_ptr_dev_.setAsync(a_ptr_, queue_);
b_ptr_dev_.setAsync(b_ptr_, queue_);
c_ptr_dev_.setAsync(c_ptr_, queue_);
copied_.record(queue_);

const int n_batched = a_ptr_.size();
magma::magmablas_gemm_batched(transa, transb, m, n, k, alpha, a_ptr_dev_.ptr(), lda,
Expand All @@ -111,9 +112,9 @@ void MagmaBatchedGemm<ScalarType>::execute(const char transa, const char transb,
assert(cudaPeekAtLastError() == cudaSuccess);
}

} // util
} // linalg
} // dca
} // namespace util
} // namespace linalg
} // namespace dca

#endif // DCA_HAVE_CUDA
#endif // DCA_LINALG_UTIL_MAGMA_BATCHED_GEMM_HPP
57 changes: 49 additions & 8 deletions include/dca/linalg/util/magma_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@
#define DCA_LINALG_UTIL_MAGMA_QUEUE_HPP
#ifdef DCA_HAVE_CUDA

#include <cublas_v2.h>
#include <cuda.h>
#include <magma.h>
#include <cusparse_v2.h>
#include <magma_v2.h>

#include "dca/linalg/util/cuda_stream.hpp"

namespace dca {
namespace linalg {
Expand All @@ -24,28 +28,65 @@ namespace util {
class MagmaQueue {
public:
MagmaQueue() {
magma_queue_create(&queue_);
cublasCreate(&cublas_handle_);
cusparseCreate(&cusparse_handle_);
int device;
cudaGetDevice(&device);
magma_queue_create_from_cuda(device, stream_, cublas_handle_, cusparse_handle_, &queue_);
}

MagmaQueue(const MagmaQueue& rhs) = delete;
MagmaQueue& operator=(const MagmaQueue& rhs) = delete;

// Move constructor: takes over the stream, the cuBLAS/cuSPARSE handles and the MAGMA queue of
// `rhs`, mirroring the move assignment operator.
//
// All members are first default-initialized (queue_ and the two handles to nullptr, stream_ by
// its default constructor) and then exchanged with `rhs` through swap(). This guarantees that
//  - exactly one object owns queue_ afterwards. Initializing queue_ from rhs.queue_ and then
//    swapping the two (as was done before) leaves both objects with the same handle, which is
//    then passed to magma_queue_destroy twice;
//  - stream_ travels together with queue_, which was created on that stream in the default
//    constructor. Otherwise the conversion to cudaStream_t would return a stream different from
//    the one the queue uses.
// After the move `rhs` holds null handles and the stream that was default-constructed here.
MagmaQueue(MagmaQueue&& rhs) noexcept {
  swap(rhs);
}

// Move assignment implemented as a swap: stream, cuBLAS/cuSPARSE handles and MAGMA queue are
// exchanged with `rhs`. The resources this object held before the assignment end up in `rhs`
// and are released by its destructor.
// Returns a reference to this object.
MagmaQueue& operator=(MagmaQueue&& rhs) noexcept {
swap(rhs);
return *this;
}

// Destroys the MAGMA queue first, then the cuBLAS and cuSPARSE handles that were handed to the
// queue when it was created. The return codes of the destroy calls are not checked.
// The handles may be nullptr if this object was swapped with a moved-to object.
~MagmaQueue() {
magma_queue_destroy(queue_);
cublasDestroy(cublas_handle_);
cusparseDestroy(cusparse_handle_);
}

inline operator magma_queue_t() {
operator magma_queue_t() const {
return queue_;
}

cudaStream_t getStream() const {
return magma_queue_get_cuda_stream(queue_);
// Implicit conversion to the CUDA stream owned by this queue.
// It allows the many calls that previously took a stream to take a MagmaQueue instead. This
// keeps the calling code less verbose, but also makes it less obvious which handle is actually
// used. Consider this trade-off carefully before relying on it in new code.
operator cudaStream_t() const {
// NOTE(review): relies on CudaStream being convertible to cudaStream_t — confirm in
// cuda_stream.hpp.
return static_cast<cudaStream_t>(stream_);
}

// Returns a const reference to the CudaStream member of this queue.
// The reference is only valid as long as this MagmaQueue object is alive.
const CudaStream& getStream() const {
return stream_;
}

// Exchanges all four members (stream, cuBLAS handle, cuSPARSE handle, MAGMA queue) with `rhs`.
// The members are swapped together so that each object keeps a consistent set: the queue was
// created from the stream and the two handles in the default constructor.
void swap(MagmaQueue& rhs) noexcept {
std::swap(stream_, rhs.stream_);
std::swap(cublas_handle_, rhs.cublas_handle_);
std::swap(cusparse_handle_, rhs.cusparse_handle_);
std::swap(queue_, rhs.queue_);
}

private:
CudaStream stream_;
magma_queue_t queue_ = nullptr;
cublasHandle_t cublas_handle_ = nullptr;
cusparseHandle_t cusparse_handle_ = nullptr;
};

} // util
} // linalg
} // dca
} // namespace util
} // namespace linalg
} // namespace dca

#endif // DCA_HAVE_CUDA
#endif // DCA_LINALG_UTIL_MAGMA_QUEUE_HPP
Loading

0 comments on commit c3e0fb6

Please sign in to comment.