Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MPS with cuQuantum #2168

Open
wants to merge 36 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
563ae6e
initial layout
MozammilQ Jun 5, 2024
280f868
refactor code
MozammilQ Jun 5, 2024
ae44c69
refactor code
MozammilQ Jun 6, 2024
5b48265
refactor code
MozammilQ Jun 6, 2024
517a554
refactor code
MozammilQ Jun 6, 2024
7e40588
refactor code
MozammilQ Jun 6, 2024
ebf9ca0
refactor code
MozammilQ Jun 6, 2024
a422690
refactor code
MozammilQ Jun 6, 2024
80b59d5
refactor code
MozammilQ Jun 6, 2024
52f1ed4
refactor code
MozammilQ Jun 7, 2024
ed43e71
refactor code
MozammilQ Jun 9, 2024
649a5d7
refactor code
MozammilQ Jun 9, 2024
83e4b5e
Merge branch 'main' into mps-cutensor
doichanj Jun 10, 2024
c33571f
refactor code
MozammilQ Jun 11, 2024
abc5552
Merge branch 'main' into mps-cutensor
doichanj Jun 14, 2024
f0205e3
refactor code
MozammilQ Jun 14, 2024
629f65f
refactor code
MozammilQ Jun 15, 2024
644a822
added release note
MozammilQ Jun 16, 2024
e6f2288
refactor code
MozammilQ Jun 17, 2024
42f983e
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Jun 17, 2024
c24b9e2
refactor code
MozammilQ Jun 18, 2024
34e9502
refactor code
MozammilQ Jun 18, 2024
00f88e9
refactor code; included test
MozammilQ Jun 18, 2024
454f8c0
lint
MozammilQ Jun 18, 2024
985c7f2
added suggestion
MozammilQ Jun 18, 2024
7ffab7d
Merge branch 'main' into mps-cutensor
doichanj Jul 4, 2024
6b0b41d
Merge branch 'main' into mps-cutensor
MozammilQ Aug 30, 2024
34a5e75
fixed a typo
MozammilQ Aug 31, 2024
a1ae308
refactor code
MozammilQ Sep 10, 2024
859e946
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Oct 4, 2024
2ae116e
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Nov 6, 2024
ed9a907
refactor code
MozammilQ Nov 11, 2024
a4dbd12
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Nov 12, 2024
e1e80d1
added cublas for contract
MozammilQ Nov 28, 2024
321230c
fixed typo
MozammilQ Dec 3, 2024
702ecd0
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Dec 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
---
features:
- |
This PR adds the ability to run Matrix Product State Simulation on Nvidia GPUs.

While choosing for the backend for Matrix Product State simulation users can
choose all as usual, but this time when they can choose the device as GPU, the
simulation is accelerated by NVIDIA GPU. To be precise, this PR offloads the
Singular Value Decomposition and matrix multiplication of bigger matrices involved
in Matrix Product State Simulation to GPU.


Example

.. code-block:: python

from qiskit_aer import AerSimulator
from qiskit.circuit import QuantumCircuit
from qiskit.compiler import transpile

num_qubits = 10
shots = 5

qc = QuantumCircuit(num_qubits)
qc.h(0)

for control, target in zip(range(num_qubits-1), range(1, num_qubits)):
qc.cx(control, target)

qc.measure_all()

sim = AerSimulator(method="matrix_product_state", device="GPU")
qc_t = transpile(qc, backend=sim)
job = sim.run(qc_t, shots = shots)

counts = job.result().get_counts()
counts





5 changes: 5 additions & 0 deletions src/simulators/matrix_product_state/matrix_product_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,11 @@ void State::set_config(const Config &config) {

// Set LAPACK SVD
MPS::set_mps_lapack_svd(config.mps_lapack);

// Set device for SVD
#ifdef AER_THRUST_CUDA
MPS::set_mps_device(config.device);
#endif // AER_THRUST_CUDA
}

void State::add_metadata(ExperimentResult &result) const {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "stdlib.h"
#include "string.h"
#include <iostream>
#include <string>
#include <utility>

#include "framework/linalg/almost_equal.hpp"
Expand All @@ -45,6 +46,9 @@ double MPS::json_chop_threshold_ = 1E-8;
std::stringstream MPS::logging_str_;
bool MPS::mps_log_data_ = 0;
bool MPS::mps_lapack_ = false;
#ifdef AER_THRUST_CUDA
std::string MPS::mps_device_;
#endif // AER_THRUST_CUDA

//------------------------------------------------------------------------
// local function declarations
Expand Down Expand Up @@ -627,8 +631,14 @@ void MPS::common_apply_2_qubit_gate(
if (A + 1 != num_qubits_ - 1)
q_reg_[A + 1].mul_Gamma_by_right_Lambda(lambda_reg_[A + 1]);

#ifdef AER_THRUST_CUDA
MPS_Tensor temp =
MPS_Tensor::contract(q_reg_[A], lambda_reg_[A], q_reg_[A + 1],
MPS::mps_device_, num_qubits_, cublas_handle);
#else
MPS_Tensor temp =
MPS_Tensor::contract(q_reg_[A], lambda_reg_[A], q_reg_[A + 1]);
#endif // AER_THRUST_CUDA

switch (gate_type) {
case cx:
Expand Down Expand Up @@ -663,8 +673,14 @@ void MPS::common_apply_2_qubit_gate(

MPS_Tensor left_gamma, right_gamma;
rvector_t lambda;
#ifdef AER_THRUST_CUDA
double discarded_value = MPS_Tensor::Decompose(
temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, MPS::mps_device_,
num_qubits_, cuda_stream, cutensor_handle);
#else
double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda,
right_gamma, MPS::mps_lapack_);
#endif // AER_THRUST_CUDA

if (discarded_value > json_chop_threshold_)
MPS::print_to_log("discarded_value=", discarded_value, ", ");
Expand Down Expand Up @@ -1291,6 +1307,7 @@ MPS_Tensor MPS::state_vec_as_MPS(const reg_t &qubits) {
}

MPS_Tensor MPS::state_vec_as_MPS(uint_t first_index, uint_t last_index) const {

MPS_Tensor temp = q_reg_[first_index];

if (first_index != 0)
Expand All @@ -1303,8 +1320,14 @@ MPS_Tensor MPS::state_vec_as_MPS(uint_t first_index, uint_t last_index) const {
}

for (uint_t i = first_index + 1; i < last_index + 1; i++) {
#ifdef AER_THRUST_CUDA
temp = MPS_Tensor::contract(temp, lambda_reg_[i - 1], q_reg_[i],
MPS::mps_device_, num_qubits_, cublas_handle);
#else
temp = MPS_Tensor::contract(temp, lambda_reg_[i - 1], q_reg_[i]);
#endif // AER_THRUST_CUDA
}

// now temp is a tensor of 2^n matrices
if (last_index != num_qubits_ - 1)
temp.mul_Gamma_by_right_Lambda(lambda_reg_[last_index]);
Expand Down Expand Up @@ -1803,7 +1826,34 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) {
// step 2 - SVD
S.clear();
S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns()));

#ifdef AER_THRUST_CUDA
// Before offloading to the GPU, we need to make sure the size of
// matrices to be operated on should be big enough to make
// the time taken by the initialization of `cutensornet` worth it.
// Size of matrices involved for qubits < 13 is too small to be
// considered to offload to the GPU.
// Even if num_qubits involved in the operation is > 13, still
// a lot of the matrices are too small to be offloaded, so to determine
// the size of matrices to offload, we can look at the number of elements
// it has. The number of elements `8401` and num_qubits > 13 has been
// obtained not on any theoretical basis, but on trial and error.
// The number of elements and number of qubits depends solely on
// difference between the speed of CPU and GPU involved. Even if matrices
// are big, they are not big enough to make speed of PICe a bottleneck.
// In this particular case CPU was `Zen+` and GPU was `NVIDIA Ampere`.
if ((num_qubits_ > 13) && (MPS::mps_device_.compare("GPU") == 0) &&
Copy link
Author

@MozammilQ MozammilQ Jan 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even though MPI acceleration is not available for MPS, still I compiled with AER_MPI=True.
Upon re-testing, the code against different CPUs:
Cascade Lake 4-cores + CUDA Arch 89, SapphireRapids 4-cores, zen+ 8-cores + CUDA Arch 86
In case of very low qubit count like 2,3 etc, GPU version is 1.5-2.5 seconds slower, maybe this is the time for GPU initialization.
The offloading of SVD to the GPU does accelerates the computation( in all cases! )
It will still be faster to offload to CUDA Arch 89 even with SapphireRapids CPU.

((reshaped_matrix.GetRows() * reshaped_matrix.GetColumns()) > 8401)) {

cutensor_csvd_wrapper(reshaped_matrix, U, S, V, cuda_stream,
cutensor_handle);
} else {
csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_);
}
#else
csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_);
#endif // AER_THRUST_CUDA

reduce_zeros(U, S, V, MPS_Tensor::get_max_bond_dimension(),
MPS_Tensor::get_truncation_threshold(), MPS::mps_lapack_);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
#ifndef _aer_matrix_product_state_hpp_
#define _aer_matrix_product_state_hpp_

#include <cstdarg>

#include "framework/json.hpp"
#include "framework/operations.hpp"
#include "framework/utils.hpp"
#include "matrix_product_state_tensor.hpp"
#include <cstdarg>
#include <string>

namespace AER {
namespace MatrixProductState {
Expand Down Expand Up @@ -81,8 +81,29 @@ enum class MPS_swap_direction { SWAP_LEFT, SWAP_RIGHT };

class MPS {
public:
MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) {}
~MPS() {}
MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) {
#ifdef AER_THRUST_CUDA
cuda_stream = NULL;
cutensor_handle = NULL;
cublas_handle = NULL;
if (mps_device_.compare("GPU") == 0) {
cudaStreamCreate(&cuda_stream);
cutensornetCreate(&cutensor_handle);
cublasCreate(&cublas_handle);
cublasSetStream(cublas_handle, cuda_stream);
}
#endif // AER_THRUST_CUDA
}
~MPS() {
#ifdef AER_THRUST_CUDA
if (cutensor_handle)
cutensornetDestroy(cutensor_handle);
if (cublas_handle)
cublasDestroy(cublas_handle);
if (cuda_stream)
cudaStreamDestroy(cuda_stream);
#endif // AER_THRUST_CUDA
}

//--------------------------------------------------------------------------
// Function name: initialize
Expand Down Expand Up @@ -321,6 +342,11 @@ class MPS {
}

static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; }
#ifdef AER_THRUST_CUDA
static void set_mps_device(std::string mps_device) {
mps_device_ = mps_device;
}
#endif // AER_THRUST_CUDA

static uint_t get_omp_threads() { return omp_threads_; }
static uint_t get_omp_threshold() { return omp_threshold_; }
Expand Down Expand Up @@ -544,6 +570,12 @@ class MPS {
std::vector<MPS_Tensor> q_reg_;
std::vector<rvector_t> lambda_reg_;

#ifdef AER_THRUST_CUDA
cudaStream_t cuda_stream;
cutensornetHandle_t cutensor_handle;
cublasHandle_t cublas_handle;
#endif // AER_THRUST_CUDA

struct ordering {
// order_ stores the current ordering of the qubits,
// location_ stores the location of each qubit in the vector. It is derived
Expand All @@ -570,6 +602,9 @@ class MPS {
static bool mps_log_data_;
static MPS_swap_direction mps_swap_direction_;
static bool mps_lapack_;
#ifdef AER_THRUST_CUDA
static std::string mps_device_;
#endif // AER_THRUST_CUDA
};

inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) {
Expand Down
Loading
Loading