From 51a0731d0bac62b4453e54932f99c41cb2d60396 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Tue, 26 Nov 2024 00:29:35 +0100
Subject: [PATCH 1/7] misc bug fixes

which progress toward making the v3 deprecated unit tests run without error.

We also relax exclusivity of multithreading and GPU-acceleration, which was pointless and annoying.
---
 quest/src/api/decoherence.cpp     |  3 +--
 quest/src/api/operations.cpp      | 16 ++++++-------
 quest/src/api/qureg.cpp           |  3 ---
 quest/src/core/autodeployer.cpp   | 40 +++++++++++++++----------------
 quest/src/core/errors.cpp         |  2 +-
 quest/src/core/localiser.cpp      |  7 ++++++
 quest/src/core/utilities.cpp      |  3 +++
 quest/src/core/validation.cpp     | 17 ++-----------
 quest/src/core/validation.hpp     |  2 --
 quest/src/cpu/cpu_subroutines.cpp |  3 +++
 quest/src/gpu/gpu_thrust.cuh      |  3 +++
 11 files changed, 48 insertions(+), 51 deletions(-)

diff --git a/quest/src/api/decoherence.cpp b/quest/src/api/decoherence.cpp
index 3287bbf4..30437c95 100644
--- a/quest/src/api/decoherence.cpp
+++ b/quest/src/api/decoherence.cpp
@@ -123,8 +123,7 @@ void mixQureg(Qureg outQureg, Qureg inQureg, qreal inProb) {
     validate_quregFields(outQureg, __func__);
     validate_quregFields(inQureg, __func__);
     validate_probability(inProb, __func__);
-    validate_quregIsDensityMatrix(outQureg, __func__);
-    validate_quregsCanBeMixed(outQureg, inQureg, __func__);
+    validate_quregsCanBeMixed(outQureg, inQureg, __func__); // checks outQureg is densmatr
 
     qreal outProb = 1 - inProb;
     localiser_densmatr_mixQureg(outProb, outQureg, inProb, inQureg);
diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index 8c9c05dc..f0ce3eac 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -376,8 +376,8 @@ void multiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
     bool onlyMultiply = true;
     qcomp exponent = qcomp(1, 0);
     (qureg.isDensityMatrix)?
-        localiser_statevec_allTargDiagMatr(qureg, matrix, exponent) :
-        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply);
+        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply):
+        localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
 }
 
 void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
@@ -387,8 +387,8 @@ void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp
 
     bool onlyMultiply = true;
     (qureg.isDensityMatrix)?
-        localiser_statevec_allTargDiagMatr(qureg, matrix, exponent) :
-        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply);
+        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply):
+        localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
 }
 
 void applyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
@@ -400,8 +400,8 @@ void applyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
     bool onlyMultiply = false;
     qcomp exponent = qcomp(1, 0);
     (qureg.isDensityMatrix)?
-        localiser_statevec_allTargDiagMatr(qureg, matrix, exponent) :
-        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply);
+        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply):
+        localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
 }
 
 void applyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
@@ -412,8 +412,8 @@ void applyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp ex
 
     bool onlyMultiply = false;
     (qureg.isDensityMatrix)?
-        localiser_statevec_allTargDiagMatr(qureg, matrix, exponent) :
-        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply);
+        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply):
+        localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
 }
 
 
diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp
index 0444b60d..6db094bb 100644
--- a/quest/src/api/qureg.cpp
+++ b/quest/src/api/qureg.cpp
@@ -146,9 +146,6 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib
     // automatically overwrite distrib, GPU, and multithread fields which were left as modeflag::USE_AUTO
     autodep_chooseQuregDeployment(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread, env);
 
-    // throw error if the user had forced multithreading but GPU accel was auto-chosen
-    validate_newQuregNotBothMultithreadedAndGpuAccel(useGpuAccel, useMultithread, caller);
-
     Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread);
 
     // always allocate CPU memory
diff --git a/quest/src/core/autodeployer.cpp b/quest/src/core/autodeployer.cpp
index 7ab13dec..d53f99ec 100644
--- a/quest/src/core/autodeployer.cpp
+++ b/quest/src/core/autodeployer.cpp
@@ -67,7 +67,8 @@ void chooseWhetherToDistributeQureg(int numQubits, int isDensMatr, int &useDistr
     // it's ok if we cannot query RAM; if we'd have exceeded it, it's likely we'll exceed auto-threshold and will still distribute
     } catch (mem::COULD_NOT_QUERY_RAM &e) {}
 
-    // force distribution if GPU deployment is possible but we exceed local VRAM
+    // force distribution if GPU deployment is available but we exceed local VRAM; 
+    // this is preferable over falling back to CPU-only which would be astonishingly slow 
     if (useGpuAccel == 1 || useGpuAccel == modeflag::USE_AUTO) {
         size_t localGpuMem = gpu_getCurrentAvailableMemoryInBytes();
         if (!mem_canQuregFitInMemory(numQubits, isDensMatr, 1, localGpuMem)) {
@@ -76,47 +77,44 @@ void chooseWhetherToDistributeQureg(int numQubits, int isDensMatr, int &useDistr
         }
     }
 
-    // by now, we know that Qureg can definitely fit into a single GPU, or principally fit into RAM,
-    // but we may still wish to distribute it so that multiple Quregs don't choke up memory.
+    // to reach here, we know that Qureg can fit into the remaining memory of a single GPU, or principally 
+    // fit into RAM, but we may still wish to distribute for improved parallelisation and to avoid memory saturation
     int effectiveNumQubitsPerNode = mem_getEffectiveNumStateVecQubitsPerNode(numQubits, isDensMatr, numEnvNodes);
     useDistrib = (effectiveNumQubitsPerNode >= MIN_NUM_LOCAL_QUBITS_FOR_AUTO_QUREG_DISTRIBUTION);
 }
 
 
-void chooseWhetherToGpuAccelQureg(int numQubits, int isDensMatr, int useDistrib, int &useGpuAccel, int numQuregNodes) {
+void chooseWhetherToGpuAccelQureg(int numQubits, int isDensMatr, int &useGpuAccel, int numQuregNodes) {
 
     // if the flag is already set, don't change it
     if (useGpuAccel != modeflag::USE_AUTO)
         return;
 
-    // determine the 'effective number of qubits' each GPU would have to simulate, if distributed
+    // determine the 'effective number of qubits' each GPU would have to simulate, if distributed...
     int effectiveNumQubits = mem_getEffectiveNumStateVecQubitsPerNode(numQubits, isDensMatr, numQuregNodes);
 
-    // choose to GPU accelerate only if that's not too few
+    // and choose to GPU accelerate only if that's not too few
     useGpuAccel = (effectiveNumQubits >= MIN_NUM_LOCAL_QUBITS_FOR_AUTO_QUREG_GPU_ACCELERATION);
 
     // notice there was no automatic disabling of GPU acceleration in the scenario that the local
     // partition exceeded GPU memory. This is because such a scenario would be catastrophically
     // slow and astonish users by leaving GPUs idle in intensive simulation. Instead, we auto-deploy
-    // to GPU and subsequent validation will notice we exceeded GPU memory.
+    // to GPU anyway and subsequent validation will notice we exceeded GPU memory and report an error.
 }
 
 
-void chooseWhetherToMultithreadQureg(int numQubits, int isDensMatr, int useDistrib, int useGpuAccel, int &useMultithread, int numQuregNodes) {
+void chooseWhetherToMultithreadQureg(int numQubits, int isDensMatr, int &useMultithread, int numQuregNodes) {
 
     // if the flag is already set (user-given, or inferred from env), don't change it
     if (useMultithread != modeflag::USE_AUTO)
         return;
 
-    // if GPU-aceleration was chosen, disable auto multithreading...
-    if (useGpuAccel) {
-        useMultithread = 0;
-        return;
-    }
-
-    // otherwise, we're not GPU-accelerating, and should choose to multithread based on Qureg size
+    // otherwise, choose to multithread based on Qureg size
     int effectiveNumQubits = mem_getEffectiveNumStateVecQubitsPerNode(numQubits, isDensMatr, numQuregNodes);
     useMultithread = (effectiveNumQubits >= MIN_NUM_LOCAL_QUBITS_FOR_AUTO_QUREG_MULTITHREADING);
+
+    // note the qureg may be simultaneously GPU-accelerated and so never use its
+    // multithreaded CPU routines, except in functions which accept multiple Quregs
 }
 
 
@@ -125,8 +123,6 @@ void autodep_chooseQuregDeployment(int numQubits, int isDensMatr, int &useDistri
     // preconditions:
     //  - the given configuration is compatible with env (assured by prior validation)
     //  - this means no deployment is forced (=1) which is incompatible with env
-    //  - it also means GPU-acceleration and multithreading are not simultaneously forced
-    //    (although they may still be left automatic and need explicit revision)
 
     // disable any automatic deployments not permitted by env (it's gauranteed we never overwrite =1 to =0)
     if (!env.isDistributed)
@@ -141,11 +137,15 @@ void autodep_chooseQuregDeployment(int numQubits, int isDensMatr, int &useDistri
     if (env.numNodes == 1)
         useDistrib = 0;
 
-    // overwrite any auto options (== modeflag::USE_AUTO)
+    // overwrite useDistrib
     chooseWhetherToDistributeQureg(numQubits, isDensMatr, useDistrib, useGpuAccel, env.numNodes);
     int numQuregNodes = (useDistrib)? env.numNodes : 1;
-    chooseWhetherToGpuAccelQureg(numQubits, isDensMatr, useDistrib, useGpuAccel, numQuregNodes);
-    chooseWhetherToMultithreadQureg(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread, numQuregNodes);
+    
+    // overwrite useGpuAccel
+    chooseWhetherToGpuAccelQureg(numQubits, isDensMatr, useGpuAccel, numQuregNodes);
+
+    // overwrite useMultithread
+    chooseWhetherToMultithreadQureg(numQubits, isDensMatr, useMultithread, numQuregNodes);
 }
 
 
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 1e9d9ad0..c9985709 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -185,7 +185,7 @@ void assert_pairRankIsDistinct(Qureg qureg, int pairRank) {
 
 void assert_bufferSendRecvDoesNotOverlap(qindex sendInd, qindex recvInd, qindex numAmps) {
 
-    if (sendInd + numAmps > recvInd)
+    if (sendInd < recvInd + numAmps)
         raiseInternalError("A distributed function attempted to send and receive portions of the buffer which overlapped.");
 }
 
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index fe624b3c..3abf3ddd 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -1427,6 +1427,13 @@ void localiser_densmatr_twoQubitDepolarising(Qureg qureg, int qubit1, int qubit2
     bool comm1 = doesChannelRequireComm(qureg, qubit1);
     bool comm2 = doesChannelRequireComm(qureg, qubit2);
 
+
+
+    // TODO:
+    // this is bugged, even when non-distributed!!!
+
+
+
     if (comm2 && comm1)
         twoQubitDepolarisingOnPrefixAndPrefix(qureg, qubit1, qubit2, prob);
     if (comm2 && !comm1)
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index 176b53e5..00ed7f2f 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -221,6 +221,9 @@ qindex util_getNumLocalDiagonalAmpsWithBits(Qureg qureg, vector<int> qubits, vec
             if (util_getRankBitOfBraQubit(qubits[i], qureg) != outcomes[i])
                 return 0;
 
+    // TODO:
+    // I THINK THIS MAY BE BUGGED / INCORRECT LOGIC!!
+
     // otherwise, every 2^#qubits local diagonal is consistent with outcomes
     qindex numColsPerNode = powerOf2(qureg.logNumColsPerNode);
     qindex numDiags = numColsPerNode / powerOf2(qubits.size());
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index eceab0e9..67850ea2 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -167,10 +167,6 @@ namespace report {
         "Cannot enable multithreaded processing of a Qureg created in a non-multithreaded QuEST environment.";
 
 
-    string NEW_GPU_QUREG_CANNOT_USE_MULTITHREADING = 
-        "Cannot simultaneously GPU-accelerate and multithread a Qureg. Please disable multithreading, or set it to ${AUTO_DEPLOYMENT_FLAG} for QuEST to automatically disable it when deploying to GPU.";
-
-
     string NEW_QUREG_CANNOT_FIT_INTO_NON_DISTRIB_CPU_MEM =
         "The non-distributed Qureg (isDensity=${IS_DENS}) of ${NUM_QUBITS} qubits would be too large (${QCOMP_BYTES} * ${EXP_BASE}^${NUM_QUBITS} bytes) to fit into a single node's RAM (${RAM_SIZE} bytes). See reportQuESTEnv(), and consider using distribution.";
 
@@ -1430,13 +1426,6 @@ void assertQuregFitsInGpuMem(int numQubits, int isDensMatr, int isDistrib, int i
     }
 }
 
-void validate_newQuregNotBothMultithreadedAndGpuAccel(int useGpu, int useMultithread, const char* caller) {
-
-    // note either or both of useGpu and useMultithread are permitted to be modeflag::USE_AUTO (=-1)
-    tokenSubs vars = {{"${AUTO_DEPLOYMENT_FLAG}", modeflag::USE_AUTO}};
-    assertThat(useGpu != 1 || useMultithread != 1, report::NEW_GPU_QUREG_CANNOT_USE_MULTITHREADING, vars, caller);
-}
-
 void validate_newQuregParams(int numQubits, int isDensMatr, int isDistrib, int isGpuAccel, int isMultithread, QuESTEnv env, const char* caller) {
 
     // some of the below validation involves getting distributed node consensus, which
@@ -1452,8 +1441,6 @@ void validate_newQuregParams(int numQubits, int isDensMatr, int isDistrib, int i
     assertQuregNotDistributedOverTooManyNodes(numQubits, isDensMatr, isDistrib, env, caller);
     assertQuregFitsInCpuMem(numQubits, isDensMatr, isDistrib, env, caller);
     assertQuregFitsInGpuMem(numQubits, isDensMatr, isDistrib, isGpuAccel, env, caller);
-
-    validate_newQuregNotBothMultithreadedAndGpuAccel(isGpuAccel, isMultithread, caller);
 }
 
 void validate_newQuregAllocs(Qureg qureg, const char* caller) {
@@ -1549,8 +1536,8 @@ void assertMatrixTotalNumElemsDontExceedMaxIndex(int numQubits, bool isDense, co
     int maxNumQubits = mem_getMaxNumMatrixQubitsBeforeIndexOverflow(isDense);
 
     string msg = (isDense)?
-        report::NEW_DIAG_MATR_NUM_ELEMS_WOULD_EXCEED_QINDEX :
-        report::NEW_COMP_MATR_NUM_ELEMS_WOULD_EXCEED_QINDEX ;
+        report::NEW_COMP_MATR_NUM_ELEMS_WOULD_EXCEED_QINDEX:
+        report::NEW_DIAG_MATR_NUM_ELEMS_WOULD_EXCEED_QINDEX;
 
     tokenSubs vars = {
         {"${NUM_QUBITS}", numQubits}, 
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 92637457..ca898617 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -95,8 +95,6 @@ void validate_newMaxNumReportedSigFigs(int numSigFigs, const char* caller);
 
 void validate_newQuregParams(int numQubits, int isDensMatr, int isDistrib, int isGpuAccel, int numCpuThreads, QuESTEnv env, const char* caller);
 
-void validate_newQuregNotBothMultithreadedAndGpuAccel(int useGpu, int numThreads, const char* caller);
-
 void validate_newQuregAllocs(Qureg qureg, const char* caller);
 
 
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 89c7e2bf..65678e33 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -1627,6 +1627,9 @@ qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubi
     #pragma omp parallel for reduction(+:prob) if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
+        // TODO:
+        // I BELIEVE THIS IS BUGGED AND/OR HAS INCORRECT LOGIC!
+
         // i = local column index of the nth local pure state which contributes to the probability
         qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numBits, qubitStateMask);
 
diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh
index b3794295..5cb831e3 100644
--- a/quest/src/gpu/gpu_thrust.cuh
+++ b/quest/src/gpu/gpu_thrust.cuh
@@ -642,6 +642,9 @@ qreal thrust_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> q
     auto ampIter = thrust::make_permutation_iterator(getStartPtr(qureg), diagIter);
     auto probIter = thrust::make_transform_iterator(ampIter, probFunctor);
 
+    // TODO:
+    // I BELIEVE THIS IS BUGGED AND/OR HAS INCORRECT LOGIC!
+
     qindex numIts = util_getNumLocalDiagonalAmpsWithBits(qureg, qubits, outcomes);
     qreal prob = thrust::reduce(probIter, probIter + numIts);
     return prob;

From fb46979b13f9a8e69b29e5dde6286759b2ee1421 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 15 Dec 2024 23:07:31 +0100
Subject: [PATCH 2/7] patching density-matrix prob-calc

---
 quest/src/core/localiser.cpp      | 13 +++++++++++--
 quest/src/core/utilities.cpp      | 19 -------------------
 quest/src/core/utilities.hpp      |  2 --
 quest/src/cpu/cpu_subroutines.cpp | 25 +++++++++++++------------
 quest/src/gpu/gpu_thrust.cuh      |  5 +----
 5 files changed, 25 insertions(+), 39 deletions(-)

diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 3abf3ddd..208995bd 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -1762,8 +1762,17 @@ qreal localiser_densmatr_calcProbOfMultiQubitOutcome(Qureg qureg, vector<int> qu
 
     if (doAnyLocalStatesHaveQubitValues(qureg, braQubits, outcomes)) {
 
-        // such nodes need to know all ket qubits (which are all suffix)
-        prob += accel_densmatr_calcProbOfMultiQubitOutcome_sub(qureg, qubits, outcomes);
+        // such nodes need only know the ket qubits/outcomes for which the bra-qubits are in suffix
+        vector<int> ketQubitsWithBraInSuffix;
+        vector<int> ketOutcomesWithBraInSuffix;
+        for (int q=0; q<qubits.size(); q++)
+            if (util_isBraQubitInSuffix(qubits[q], qureg)) {
+                ketQubitsWithBraInSuffix.push_back(qubits[q]);
+                ketOutcomesWithBraInSuffix.push_back(outcomes[q]);
+            }
+
+        prob += accel_densmatr_calcProbOfMultiQubitOutcome_sub(
+            qureg, ketQubitsWithBraInSuffix, ketOutcomesWithBraInSuffix);
     }
 
     // all nodes must sum their probabilities (unless qureg was cloned per-node), for consensus
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index 00ed7f2f..881a3760 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -211,25 +211,6 @@ qindex util_getLocalIndexOfFirstDiagonalAmp(Qureg qureg) {
     return qureg.rank * powerOf2(qureg.logNumColsPerNode);
 }
 
-qindex util_getNumLocalDiagonalAmpsWithBits(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
-    assert_utilsGivenDensMatr(qureg);
-
-    // a corresponding bra-qubit in the prefix with an inconsistent outcome means the node
-    // contains no diagonal basis states consistent with the given outcomes
-    for (size_t i=0; i<qubits.size(); i++)
-        if (!util_isBraQubitInSuffix(qubits[i], qureg))
-            if (util_getRankBitOfBraQubit(qubits[i], qureg) != outcomes[i])
-                return 0;
-
-    // TODO:
-    // I THINK THIS MAY BE BUGGED / INCORRECT LOGIC!!
-
-    // otherwise, every 2^#qubits local diagonal is consistent with outcomes
-    qindex numColsPerNode = powerOf2(qureg.logNumColsPerNode);
-    qindex numDiags = numColsPerNode / powerOf2(qubits.size());
-    return numDiags;
-}
-
 qindex util_getGlobalFlatIndex(Qureg qureg, qindex globalRow, qindex globalCol) {
     assert_utilsGivenDensMatr(qureg);
 
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
index bd747672..0fe1ad97 100644
--- a/quest/src/core/utilities.hpp
+++ b/quest/src/core/utilities.hpp
@@ -68,8 +68,6 @@ qindex util_getLocalIndexOfGlobalIndex(Qureg qureg, qindex globalInd);
 
 qindex util_getLocalIndexOfFirstDiagonalAmp(Qureg qureg);
 
-qindex util_getNumLocalDiagonalAmpsWithBits(Qureg qureg, vector<int> qubits, vector<int> outcomes);
-
 qindex util_getGlobalFlatIndex(Qureg qureg, qindex globalRow, qindex globalCol);
 
 int util_getRankContainingIndex(Qureg qureg, qindex globalInd);
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 65678e33..4a046b45 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -1585,19 +1585,20 @@ qreal cpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubi
     qreal prob = 0;
 
     // each iteration visits one amp per 2^qubits.size() amps
+    // (>=1 since all qubits are in suffix, so qubits.size() <= suffix size) 
     qindex numIts = qureg.numAmpsPerNode / powerOf2(qubits.size());
 
     auto sortedQubits = util_getSorted(qubits); // all in suffix
     auto qubitStateMask = util_getBitMask(qubits, outcomes);
 
     // use template param to compile-time unroll loop in insertBits()
-    SET_VAR_AT_COMPILE_TIME(int, numQubits, NumQubits, qubits.size());
+    SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size());
 
     #pragma omp parallel for reduction(+:prob) if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // i = nth local index where qubits are in the specified outcome state
-        qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numQubits, qubitStateMask);
+        qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numBits, qubitStateMask);
 
         prob += std::norm(qureg.cpuAmps[i]);
     }
@@ -1611,14 +1612,17 @@ qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubi
 
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
+    // note that qubits are only ket qubits for which the corresponding bra-qubit is in the suffix;
+    // this function is not invoked upon nodes where prefix bra-qubits do not correspond to given outcomes
+
     qreal prob = 0;
 
-    // each iteration visits one column (contributing one diagonal amp) per 2^qubits.size() possible
-    qindex numIts = util_getNumLocalDiagonalAmpsWithBits(qureg, qubits, outcomes);
-    qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg);
+    // each iteration visits one relevant diagonal amp (= one column)
+    qindex numIts = powerOf2(qureg.logNumColsPerNode - qubits.size());
     qindex numAmpsPerCol = powerOf2(qureg.numQubits);
+    qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg);
 
-    auto sortedQubits = util_getSorted(qubits); // all in suffix
+    auto sortedQubits = util_getSorted(qubits); // all in suffix, with corresponding bra's all in suffix
     auto qubitStateMask = util_getBitMask(qubits, outcomes);
 
     // use template param to compile-time unroll loop in insertBits()
@@ -1627,13 +1631,10 @@ qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubi
     #pragma omp parallel for reduction(+:prob) if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
-        // TODO:
-        // I BELIEVE THIS IS BUGGED AND/OR HAS INCORRECT LOGIC!
-
-        // i = local column index of the nth local pure state which contributes to the probability
-        qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numBits, qubitStateMask);
+        // i = local statevector index of nth local basis state with a contributing diagonal
+        qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numBits, qubitStateMask); // may be unrolled at compile-time
 
-        // j = local flat index of the diagonal element corresponding to i
+        // j = local, flat, density-matrix index of diagonal amp corresponding to state i
         qindex j = fast_getLocalIndexOfDiagonalAmp(i, firstDiagInd, numAmpsPerCol);
 
         prob += std::real(qureg.cpuAmps[j]);
diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh
index 5cb831e3..b300d8dc 100644
--- a/quest/src/gpu/gpu_thrust.cuh
+++ b/quest/src/gpu/gpu_thrust.cuh
@@ -642,10 +642,7 @@ qreal thrust_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> q
     auto ampIter = thrust::make_permutation_iterator(getStartPtr(qureg), diagIter);
     auto probIter = thrust::make_transform_iterator(ampIter, probFunctor);
 
-    // TODO:
-    // I BELIEVE THIS IS BUGGED AND/OR HAS INCORRECT LOGIC!
-
-    qindex numIts = util_getNumLocalDiagonalAmpsWithBits(qureg, qubits, outcomes);
+    qindex numIts = powerOf2(qureg.logNumColsPerNode - qubits.size());
     qreal prob = thrust::reduce(probIter, probIter + numIts);
     return prob;
 }

From 41868c93ae9100c4bb52cbfc44311d321d5e3309 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Tue, 17 Dec 2024 18:04:44 +0100
Subject: [PATCH 3/7] added missing amps-fit-in-node validation

---
 quest/src/api/decoherence.cpp |  1 +
 quest/src/api/operations.cpp  | 15 ++++++--
 quest/src/core/validation.cpp | 65 +++++++++++++++++++++++++++--------
 quest/src/core/validation.hpp | 19 +++++-----
 4 files changed, 75 insertions(+), 25 deletions(-)

diff --git a/quest/src/api/decoherence.cpp b/quest/src/api/decoherence.cpp
index 30437c95..d887e270 100644
--- a/quest/src/api/decoherence.cpp
+++ b/quest/src/api/decoherence.cpp
@@ -111,6 +111,7 @@ void mixPaulis(Qureg qureg, int qubit, qreal probX, qreal probY, qreal probZ) {
 void mixKrausMap(Qureg qureg, int* qubits, int numQubits, KrausMap map) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
+    validate_mixedAmpsFitInNode(qureg, numQubits, __func__);
     validate_targets(qureg, qubits, numQubits, __func__);
     validate_krausMapIsCPTP(map, __func__); // also checks fields and is-sync
     validate_krausMapMatchesTargets(map, numQubits, __func__);
diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index f0ce3eac..21f80289 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -39,6 +39,7 @@ void validateAndApplyAnyCtrlAnyTargUnitaryMatrix(Qureg qureg, int* ctrls, int* s
     validate_controlsAndTargets(qureg, ctrls, numCtrls, targs, numTargs, caller);
     validate_controlStates(states, numCtrls, caller);
     validate_matrixDimMatchesTargets(matr, numTargs, caller); // also checks fields and is-synced
+    validate_mixedAmpsFitInNode(qureg, numTargs, caller);
     validate_matrixIsUnitary(matr, caller); // harmlessly rechecks fields and is-synced
 
     auto ctrlVec  = util_getVector(ctrls,  numCtrls);
@@ -117,6 +118,7 @@ void multiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix)
     validate_quregFields(qureg, __func__);
     validate_twoTargets(qureg, target1, target2, __func__);
     validate_matrixFields(matrix, __func__); // matrix can be non-unitary
+    validate_mixedAmpsFitInNode(qureg, 2, __func__);
 
     bool conj = false;
     localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, {}, {}, target1, target2, matrix, conj);
@@ -156,6 +158,7 @@ void multiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix
     validate_quregFields(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
     validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
+    validate_mixedAmpsFitInNode(qureg, numTargets, __func__);
 
     bool conj = false;
     localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, util_getVector(targets, numTargets), matrix, conj);
@@ -617,9 +620,14 @@ void applyMultiStateControlledSqrtSwap(Qureg qureg, int* controls, int* states,
     validate_controlsAndTwoTargets(qureg, controls, numControls, target1, target2, __func__);
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
-    // this is likely suboptimal, and there must exist a more 
-    // efficient bespoke strategy for sqrt-SWAP, although given
-    // it is a little esoteric, optimisation is not worthwhile
+    // TODO:
+    // this function exacts sqrtSwap as a dense 2-qubit matrix,
+    // where as bespoke communication and simulation strategy is
+    // clearly possible which we have not supported because the gate
+    // is somewhat esoteric. As such, we must validate mixed-amps fit
+
+    validate_mixedAmpsFitInNode(qureg, 2, __func__); // to throw SqrtSwap error, not generic CompMatr2 error
+
     CompMatr2 matr = getCompMatr2({
         {1, 0, 0, 0},
         {0, .5+.5_i, .5-.5_i, 0},
@@ -1224,6 +1232,7 @@ void applySuperOp(Qureg qureg, SuperOp superop, int* targets, int numTargets) {
     validate_superOpFields(superop, __func__);
     validate_superOpIsSynced(superop, __func__);
     validate_superOpDimMatchesTargs(superop, numTargets, __func__);
+    validate_mixedAmpsFitInNode(qureg, numTargets, __func__);
 
     localiser_densmatr_superoperator(qureg, superop, util_getVector(targets, numTargets));
 }
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 67850ea2..ae0447c0 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -783,6 +783,20 @@ namespace report {
         "The GPU has less available memory (${MEM_AVAIL} bytes) than that needed (${MEM_NEEDED} bytes) to temporarily store the ${NUM_OUTCOMES} outcome probabilities of the specified ${NUM_QUBITS} qubits.";
 
 
+    /*
+    * MISC GATE PARAMETERS
+    */
+
+    string ROTATION_AXIS_VECTOR_IS_ZERO =
+        "The rotation axis vector was all zero, or within epsilion magnitude to the zero vector.";
+
+
+    string CANNOT_FIT_MIXED_STATEVEC_AMPS_INTO_SINGLE_NODE =
+        "Cannot perform this ${NUM_TARGS}-target operation upon a ${NUM_QUREG_QUBITS}-qubit statevector distributed between ${NUM_NODES} nodes, since each node's communication buffer (with capacity for ${NUM_QUREG_AMPS_PER_NODE} amps) cannot simultaneously store the ${NUM_TARG_AMPS} mixed remote amplitudes.";
+
+    string CANNOT_FIT_MIXED_DENSMATR_AMPS_INTO_SINGLE_NODE =
+        "Cannot perform this ${NUM_TARGS}-target operation upon a ${NUM_QUREG_QUBITS}-qubit density-matrix distributed between ${NUM_NODES} nodes, since each node's communication buffer (with capacity for ${NUM_QUREG_AMPS_PER_NODE} amps) cannot simultaneously store the ${NUM_TARG_AMPS} mixed remote amplitudes.";
+
 
     /*
      * CHANNEL PARAMETERS 
@@ -816,7 +830,7 @@ namespace report {
      */
 
     string MIXED_QUREG_NOT_DENSITY_MATRIX =
-        "The first Qureg, which will undergo mixing, must be a density matrx.";
+        "The first Qureg, which will undergo mixing, must be a density matrix.";
 
     string MIXED_QUREGS_HAVE_DIFFERENT_NUM_QUBITS =
         "The given Quregs contain an inconsistent number of qubits (${NUM_A} and ${NUM_B}) and cannot be mixed.";
@@ -3230,19 +3244,6 @@ void validate_controlStates(int* states, int numCtrls, const char* caller) {
 
 
 
-/*
- * ROTATION PARAMETERS
- */
-
-void validate_rotationAxisNotZeroVector(qreal x, qreal y, qreal z, const char* caller) {
-
-    qreal norm = pow(x,2) + pow(y,2) + pow(z,2);
-
-    assertThat(norm > global_validationEpsilon, report::ROTATION_AXIS_VECTOR_IS_ZERO, caller);
-}
-
-
-
 /*
  * MEASUREMENT PARAMETERS
  */
@@ -3299,6 +3300,42 @@ void validate_measurementOutcomesFitInGpuMem(Qureg qureg, int numQubits, const c
 
 
 
+/*
+ * MISC GATE PARAMETERS
+ */
+
+void validate_rotationAxisNotZeroVector(qreal x, qreal y, qreal z, const char* caller) {
+
+    qreal norm = pow(x,2) + pow(y,2) + pow(z,2);
+
+    assertThat(norm > global_validationEpsilon, report::ROTATION_AXIS_VECTOR_IS_ZERO, caller);
+}
+
+void validate_mixedAmpsFitInNode(Qureg qureg, int numTargets, const char* caller) {
+
+    // only relevant to distributed quregs
+    if (!qureg.isDistributed)
+        return;
+
+    qindex numTargAmps = powerOf2(numTargets * (qureg.isDensityMatrix? 2:1));
+
+    tokenSubs vars = {
+        {"${NUM_TARGS}",        numTargets},
+        {"${NUM_TARG_AMPS}",    numTargAmps},
+        {"${NUM_NODES}",               qureg.numNodes},
+        {"${NUM_QUREG_QUBITS}",        qureg.numQubits},
+        {"${NUM_QUREG_AMPS_PER_NODE}", qureg.numAmpsPerNode}
+    };
+
+    string msg = (qureg.isDensityMatrix)?
+        report::CANNOT_FIT_MIXED_DENSMATR_AMPS_INTO_SINGLE_NODE:
+        report::CANNOT_FIT_MIXED_STATEVEC_AMPS_INTO_SINGLE_NODE;
+
+    assertThat(qureg.numAmpsPerNode >= numTargAmps, msg, vars, caller);
+}
+
+
+
 /*
  * CHANNEL PARAMETERS 
  */
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index ca898617..76d26856 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -344,14 +344,6 @@ void validate_controlsAndTwoTargets(Qureg qureg, int* ctrls, int numCtrls, int t
 
 
 
-/*
- * ROTATION PARAMETERS
- */
-
-void validate_rotationAxisNotZeroVector(qreal x, qreal y, qreal z, const char* caller);
-
-
-
 /*
  * MEASUREMENT PARAMETERS
  */
@@ -368,6 +360,16 @@ void validate_measurementOutcomesFitInGpuMem(Qureg qureg, int numQubits, const c
 
 
 
+/*
+ * MISC GATE PARAMETERS
+ */
+
+void validate_rotationAxisNotZeroVector(qreal x, qreal y, qreal z, const char* caller);
+
+void validate_mixedAmpsFitInNode(Qureg qureg, int numTargets, const char* caller);
+
+
+
 /*
  * DECOHERENCE 
  */
@@ -427,6 +429,7 @@ void validate_numInitRandomPureStates(qindex numPureStates,  const char* caller)
 void validate_expecValIsReal(qcomp value, bool isDensMatr, const char* caller);
 
 
+
 /*
  * FILE IO
  */

From 3b1150abede7e93f2c615ac245bbba1022596c0f Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Tue, 17 Dec 2024 18:05:20 +0100
Subject: [PATCH 4/7] patched bug in many-target general matrix gates

---
 quest/src/core/localiser.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 208995bd..339b2d81 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -803,9 +803,15 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, vector<int> ctrls, vect
     //     a communicator which may be inelegant alongside our own distribution scheme.
 
     // perform necessary swaps to move all targets into suffix, each of which invokes communication
-    for (size_t i=0; i<targsA.size(); i++)
-        if (targsA[i] != targsB[i])
-            anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, targsA[i], targsB[i]);
+    for (size_t i=0; i<targsA.size(); i++) {
+
+        if (targsA[i] == targsB[i])
+            continue;
+
+        int suffixTarg = std::min(targsA[i], targsB[i]);
+        int prefixTarg = std::max(targsA[i], targsB[i]);
+        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg);
+    }
 }
 
 

From 28b57b3f8b29dadc131876fdedecabd1e189473d Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 22 Dec 2024 15:49:53 +0100
Subject: [PATCH 5/7] avoid overwriting .isCPTP when eps=0

---
 quest/src/core/errors.cpp     |  6 +++++
 quest/src/core/errors.hpp     |  2 ++
 quest/src/core/utilities.cpp  | 25 ++++--------------
 quest/src/core/validation.cpp | 49 ++++++++++++++++++++++-------------
 quest/src/core/validation.hpp |  2 +-
 5 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index c9985709..8347e202 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -684,6 +684,12 @@ void assert_utilsGivenDensMatr(Qureg qureg) {
         raiseInternalError("A utility function was given a statevector where a density matrix was expected.");
 }
 
+void assert_utilsGivenNonZeroEpsilon(qreal eps) {
+
+    if (eps == 0)
+        raiseInternalError("A utility function (isUnitary, isHermitian, isCPTP) received an epsilon of zero, which should have precluded it being called.");
+}
+
 
 
 /*
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index 215746be..109911c1 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -290,6 +290,8 @@ void assert_utilsGivenStateVec(Qureg qureg);
 
 void assert_utilsGivenDensMatr(Qureg qureg);
 
+void assert_utilsGivenNonZeroEpsilon(qreal eps);
+
 
 
 /*
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index 881a3760..19d9ede0 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -318,10 +318,7 @@ void util_setConj(DiagMatr matrix) {
 // type T can be qcomp** or qcomp*[]
 template <typename T>
 bool isUnitary(T elems, qindex dim, qreal eps) {
-
-    // skip expensive unitarity check if eps is infinite (encoded by 0)
-    if (eps == 0)
-        return true;
+    assert_utilsGivenNonZeroEpsilon(eps);
 
     qreal epsSq = eps * eps;
 
@@ -347,10 +344,7 @@ bool isUnitary(T elems, qindex dim, qreal eps) {
 
 // diagonal version doesn't need templating because array decays to pointer, yay!
 bool isUnitary(qcomp* diags, qindex dim, qreal eps) {
-
-    // skip expensive unitarity check if eps is infinite (encoded by 0)
-    if (eps == 0)
-        return true;
+    assert_utilsGivenNonZeroEpsilon(eps);
 
     // check every element has unit magnitude
     for (qindex i=0; i<dim; i++) {
@@ -421,10 +415,7 @@ bool util_isUnitary(FullStateDiagMatr matrix, qreal eps) {
 // type T can be qcomp** or qcomp*[]
 template <typename T>
 bool isHermitian(T elems, qindex dim, qreal eps) {
-
-    // skip expensive Hermiticity check if eps is infinite (encoded by 0)
-    if (eps == 0)
-        return true;
+    assert_utilsGivenNonZeroEpsilon(eps);
 
     qreal epsSq = eps * eps;
 
@@ -444,10 +435,7 @@ bool isHermitian(T elems, qindex dim, qreal eps) {
 
 // diagonal version doesn't need templating because array decays to pointer, yay!
 bool isHermitian(qcomp* diags, qindex dim, qreal eps) {
-
-    // skip expensive Hermiticity check if eps is infinite (encoded by 0)
-    if (eps == 0)
-        return true;
+    assert_utilsGivenNonZeroEpsilon(eps);
 
     // check every element has a zero (or <eps) imaginary component
     for (qindex i=0; i<dim; i++)
@@ -524,6 +512,7 @@ bool util_isHermitian(PauliStrSum sum, qreal eps) {
  */
 
 bool util_isCPTP(KrausMap map, qreal eps) {
+    assert_utilsGivenNonZeroEpsilon(eps);
 
     // TODO:
     // if KrausMap is GPU-accelerated, we should maybe
@@ -531,10 +520,6 @@ bool util_isCPTP(KrausMap map, qreal eps) {
     // otherwise, if matrix is large, we should potentially
     // use a multithreaded routine
 
-    // skip expensive CPTP check if eps is infinite (encoded by 0)
-    if (eps == 0)
-        return true;
-
     qreal epsSquared = eps * eps;
 
     // each whether each element satisfies Identity = sum dagger(m)*m
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index ae0447c0..6b6b67d1 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -755,14 +755,6 @@ namespace report {
         "The control qubit at index ${INDEX} has an invalidly specified control-state of ${STATE}. Valid states are 0 and 1.";
 
 
-    /*
-    * ROTATION PARAMETERS
-    */
-
-    string ROTATION_AXIS_VECTOR_IS_ZERO =
-        "The rotation axis vector was all zero, or within epsilion magnitude to the zero vector.";
-
-
     /*
     * MEASUREMENT PARAMETERS
     */
@@ -987,8 +979,15 @@ extern "C" {
 /*
  * VALIDATION TOGGLE
  *
- * consulted by assertThat below, or earlier by more expensive
- * validation functions to avoid superfluous compute
+ * which disables all forms of validation (e.g. numerical
+ * Hermiticity of operators, and whether qubit indices are
+ * valid, etc), such that global_validationEpsilon (defined
+ * below) is ignored. Note that many validation functions
+ * will still actually perform the validation checks/compute,
+ * and thereafter decide whether to throw an error within
+ * assertThat(). Ergo expensive validators should check that
+ * global_isValidationEnabled=true before even performing
+ * their checks, to avoid superfluous computation.
  */
 
 static bool global_isValidationEnabled = true;
@@ -1008,8 +1007,12 @@ bool validateconfig_isEnabled() {
 /*
  * VALIDATION PRECISION
  *
- * which influences how strict unitarity, 
- * Hermiticity and CPTP checks are performed
+ * which influences how strict numerical checks are performed,
+ * e.g. checking unitarity/hermiticity/CPTP-ness. This is only
+ * consulted when global_isValidationEnabled=true, and can be
+ * separately disabled by setting epsilon=0, in which case 
+ * permanent properties of structs (like .isCPTP) will not be 
+ * overwritten (so will stay validate_STRUCT_PROPERTY_UNKNOWN_FLAG)
  */
 
 static qreal global_validationEpsilon = DEAULT_VALIDATION_EPSILON;
@@ -1024,6 +1027,12 @@ qreal validateconfig_getEpsilon() {
     return global_validationEpsilon;
 }
 
+bool isNumericalValidationDisabled() {
+    return (
+        global_isValidationEnabled == 0 || 
+        global_validationEpsilon   == 0);
+}
+
 
 
 /*
@@ -2082,8 +2091,8 @@ void ensureMatrixUnitarityIsKnown(T matr) {
 template <class T> 
 void assertMatrixIsUnitary(T matr, const char* caller) {
 
-    // avoid expensive unitarity check if validation is anyway disabled
-    if (!global_isValidationEnabled)
+    // avoid expensive unitarity check (and do not overwrite .isUnitary) if validation is anyway disabled
+    if (isNumericalValidationDisabled())
         return;
 
     // unitarity is determined differently depending on matrix type
@@ -2150,8 +2159,8 @@ void ensureMatrHermiticityIsKnown(T matr) {
 template <class T> 
 void assertMatrixIsHermitian(T matr, const char* caller) {
 
-    // avoid expensive unitarity check if validation is anyway disabled
-    if (!global_isValidationEnabled)
+    // avoid expensive unitarity check (and do not overwrite .isHermitian) if validation is anyway disabled
+    if (isNumericalValidationDisabled())
         return;
 
     // unitarity is determined differently depending on matrix type
@@ -2724,8 +2733,8 @@ void validate_krausMapIsCPTP(KrausMap map, const char* caller) {
     validate_krausMapFields(map, caller);
     validate_krausMapIsSynced(map, caller);
 
-    // avoid expensive CPTP check if validation is anyway disabled
-    if (!global_isValidationEnabled)
+    // avoid expensive CPTP check (and do not overwrite .isCPTP) if validation is anyway disabled
+    if (isNumericalValidationDisabled())
         return;
 
     // evaluate CPTPness if it isn't already known 
@@ -2973,6 +2982,10 @@ void validate_pauliStrSumFields(PauliStrSum sum, const char* caller) {
 
 void valdidate_pauliStrSumIsHermitian(PauliStrSum sum, const char* caller) {
 
+    // avoid expensive hermiticity check (and do not overwrite .isHermitian) if validation is anyway disabled
+    if (isNumericalValidationDisabled())
+        return;
+
     // ensure hermiticity is known (if not; compute it)
     if (*(sum.isHermitian) == validate_STRUCT_PROPERTY_UNKNOWN_FLAG)
         *(sum.isHermitian) = util_isHermitian(sum, global_validationEpsilon);
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 76d26856..d2e17dbe 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -22,7 +22,7 @@ using std::string;
 
 
 /*
- *  MATRIX ATTRIBUTE FLAGS
+ *  PLACEHOLDER STRUCT FIELDS
  */
 
 const int validate_STRUCT_PROPERTY_UNKNOWN_FLAG = -1;

From c04cc436daa6f14827dffcc3c25ec6c133f8c166 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 12 Jan 2025 13:03:54 +0100
Subject: [PATCH 6/7] patched deprecated createPauliStrSum

---
 quest/include/deprecated.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/quest/include/deprecated.h b/quest/include/deprecated.h
index 8a4c01a1..91aff3fa 100644
--- a/quest/include/deprecated.h
+++ b/quest/include/deprecated.h
@@ -964,9 +964,11 @@ static inline PauliStrSum _createPauliStrSumFromCodes(int numQubits, _NoWarnPaul
 
     PauliStr* strings = (PauliStr*) malloc(numTerms * sizeof *strings);
     for (int i=0; i<numTerms; i++) {
-        int codes[100];
+
+        int codes[100]; // assumes numQubits<=100
         for (int j=0; j<numQubits && j<100; j++)
-            codes[i] = (int) allPauliCodes[i*numQubits+j];
+            codes[j] = (int) allPauliCodes[i*numQubits+j];
+
         strings[i] = getPauliStr(codes, targs, numQubits);
     }
 

From 6a6c7c8b5120935295537325fc070156dd2169b7 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 20 Jan 2025 15:32:26 +0100
Subject: [PATCH 7/7] patched two-qubit depolarising

as per https://github.com/TysonRayJones/Distributed-Full-State-Algorithms/issues/7
---
 quest/src/api/decoherence.cpp     |  2 +-
 quest/src/core/accelerator.cpp    |  6 ---
 quest/src/core/accelerator.hpp    |  1 -
 quest/src/core/localiser.cpp      | 28 +++++------
 quest/src/core/utilities.cpp      |  9 ++++
 quest/src/core/utilities.hpp      |  4 +-
 quest/src/cpu/cpu_subroutines.cpp | 77 +++++++++----------------------
 quest/src/cpu/cpu_subroutines.hpp |  1 -
 quest/src/gpu/gpu_kernels.cuh     | 47 ++++++-------------
 quest/src/gpu/gpu_subroutines.cpp | 39 +++++-----------
 quest/src/gpu/gpu_subroutines.hpp |  1 -
 11 files changed, 74 insertions(+), 141 deletions(-)

diff --git a/quest/src/api/decoherence.cpp b/quest/src/api/decoherence.cpp
index d887e270..35f63fa8 100644
--- a/quest/src/api/decoherence.cpp
+++ b/quest/src/api/decoherence.cpp
@@ -111,8 +111,8 @@ void mixPaulis(Qureg qureg, int qubit, qreal probX, qreal probY, qreal probZ) {
 void mixKrausMap(Qureg qureg, int* qubits, int numQubits, KrausMap map) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
-    validate_mixedAmpsFitInNode(qureg, numQubits, __func__);
     validate_targets(qureg, qubits, numQubits, __func__);
+    validate_mixedAmpsFitInNode(qureg, numQubits, __func__);
     validate_krausMapIsCPTP(map, __func__); // also checks fields and is-sync
     validate_krausMapMatchesTargets(map, numQubits, __func__);
 
diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 6d7f0e51..a1443e2d 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -747,12 +747,6 @@ void accel_densmatr_twoQubitDepolarising_subF(Qureg qureg, int qubit1, int qubit
         gpu_densmatr_twoQubitDepolarising_subF(qureg, qubit1, qubit2, prob):
         cpu_densmatr_twoQubitDepolarising_subF(qureg, qubit1, qubit2, prob);
 }
-void accel_densmatr_twoQubitDepolarising_subG(Qureg qureg, int qubit1, int qubit2, qreal prob) {
-
-    (qureg.isGpuAccelerated)?
-        gpu_densmatr_twoQubitDepolarising_subG(qureg, qubit1, qubit2, prob):
-        cpu_densmatr_twoQubitDepolarising_subG(qureg, qubit1, qubit2, prob);
-}
 
 
 
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index d389f59e..aeafe66a 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -237,7 +237,6 @@ void accel_densmatr_twoQubitDepolarising_subC(Qureg qureg, int qubit1, int qubit
 void accel_densmatr_twoQubitDepolarising_subD(Qureg qureg, int qubit1, int qubit2, qreal prob);
 void accel_densmatr_twoQubitDepolarising_subE(Qureg qureg, int qubit1, int qubit2, qreal prob);
 void accel_densmatr_twoQubitDepolarising_subF(Qureg qureg, int qubit1, int qubit2, qreal prob);
-void accel_densmatr_twoQubitDepolarising_subG(Qureg qureg, int qubit1, int qubit2, qreal prob);
 
 void accel_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int qubit, qreal pI, qreal pX, qreal pY, qreal pZ);
 void accel_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int qubit, qreal pI, qreal pX, qreal pY, qreal pZ);
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 339b2d81..7c46f5cb 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -1407,18 +1407,25 @@ void twoQubitDepolarisingOnPrefixAndPrefix(Qureg qureg, int ketQb1, int ketQb2,
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
 
-    // scale 25% of (non-communicated) amps
+    // pack unscaled amps before subsequent scaling
+    qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, {ketQb1,ketQb2}, {braBit1,braBit2});
+
+    // scale all amps
     accel_densmatr_twoQubitDepolarising_subE(qureg, ketQb1, ketQb2, prob);
 
-    // pack and swap 25% of buffer, and use it to modify 25% of local amps
+    // swap the buffer with 3 other nodes to update local amps
     int pairRank1 = util_getRankWithBraQubitFlipped(ketQb1, qureg);
-    exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank1, {ketQb1,ketQb2}, {braBit1,braBit2});
+    int pairRank2 = util_getRankWithBraQubitFlipped(ketQb2, qureg);
+    int pairRank3 = util_getRankWithBraQubitsFlipped({ketQb1,ketQb2}, qureg);
+
+    comm_exchangeSubBuffers(qureg, numPacked, pairRank1);
     accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob);
 
-    // pack and swap another 25% of buffer (we could pack during subE, but we choose not to)
-    int pairRank2 = util_getRankWithBraQubitFlipped(ketQb2, qureg);
-    exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank2, {ketQb1,ketQb2}, {braBit1,braBit2});
-    accel_densmatr_twoQubitDepolarising_subG(qureg, ketQb1, ketQb2, prob);
+    comm_exchangeSubBuffers(qureg, numPacked, pairRank2);
+    accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob);
+
+    comm_exchangeSubBuffers(qureg, numPacked, pairRank3);
+    accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob);
 }
 
 
@@ -1433,13 +1440,6 @@ void localiser_densmatr_twoQubitDepolarising(Qureg qureg, int qubit1, int qubit2
     bool comm1 = doesChannelRequireComm(qureg, qubit1);
     bool comm2 = doesChannelRequireComm(qureg, qubit2);
 
-
-
-    // TODO:
-    // this is bugged, even when non-distributed!!!
-
-
-
     if (comm2 && comm1)
         twoQubitDepolarisingOnPrefixAndPrefix(qureg, qubit1, qubit2, prob);
     if (comm2 && !comm1)
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index 19d9ede0..f42cce80 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -121,6 +121,15 @@ int util_getRankWithBraQubitFlipped(int ketQubit, Qureg qureg) {
     return rankFlip;
 }
 
+int util_getRankWithBraQubitsFlipped(vector<int> ketQubits, Qureg qureg) {
+
+    int rank = qureg.rank;
+    for (int qubit : ketQubits)
+        rank = flipBit(rank, util_getPrefixBraInd(qubit, qureg));
+
+    return rank;
+}
+
 vector<int> util_getBraQubits(vector<int> ketQubits, Qureg qureg) {
 
     vector<int> braInds(0);
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
index 0fe1ad97..f7c70b12 100644
--- a/quest/src/core/utilities.hpp
+++ b/quest/src/core/utilities.hpp
@@ -40,9 +40,11 @@ int util_getRankBitOfQubit(int ketQubit, Qureg qureg);
 int util_getRankBitOfBraQubit(int ketQubit, Qureg qureg);
 
 int util_getRankWithQubitFlipped(int ketQubit, Qureg qureg);
-int util_getRankWithBraQubitFlipped(int ketQubit, Qureg qureg);
 int util_getRankWithQubitsFlipped(vector<int> prefixQubits, Qureg qureg);
 
+int util_getRankWithBraQubitFlipped(int ketQubit, Qureg qureg);
+int util_getRankWithBraQubitsFlipped(vector<int> ketQubits, Qureg qureg);
+
 vector<int> util_getBraQubits(vector<int> ketQubits, Qureg qureg);
 
 vector<int> util_getVector(int* qubits, int numQubits);
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 4a046b45..237209ce 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -1083,11 +1083,11 @@ void cpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2,
     for (qindex n=0; n<numIts; n++) {
 
         // determine whether to modify amp
-        int flag1 = !(getBit(n, ketQb1) ^ getBit(n, braQb1));
-        int flag2 = !(getBit(n, ketQb2) ^ getBit(n, braQb2));
-        int mod   = !(flag1 & flag2);
+        bool flag1 = getBit(n, ketQb1) == getBit(n, braQb1);
+        bool flag2 = getBit(n, ketQb2) == getBit(n, braQb2);
+        bool mod = !(flag1 & flag2);
 
-        // multiply 15/16 of all amps by (1 + c3)
+        // multiply 15/16-th of all amps by (1 + c3)
         qureg.cpuAmps[n] *= 1 + c3 * mod;
     }
 }
@@ -1102,10 +1102,14 @@ void cpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2,
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braQb2 = util_getBraQubit(ketQb2, qureg);
 
+    // factors used in amp -> c1*amp + c2(sum of other three amps)
     auto factors = util_getTwoQubitDepolarisingFactors(prob);
     auto c1 = factors.c1;
     auto c2 = factors.c2;
 
+    // below, we compute term = (sum of all four amps) for brevity, so adjust c1
+    c1 -= c2;
+
     // for brevity
     qcomp* amps = qureg.cpuAmps;
 
@@ -1151,7 +1155,7 @@ void cpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2,
         // decide whether or not to modify nth local
         bool flag1 = getBit(n, ketQb1) == getBit(n, braQb1); 
         bool flag2 = getBit(n, ketQb2) == braBit2;
-        bool mod   = !(flag1 & flag2);
+        bool mod = !(flag1 & flag2);
 
         // scale amp by 1 or (1 + c3)
         qureg.cpuAmps[n] *= 1 + c3 * mod;
@@ -1197,30 +1201,26 @@ void cpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2,
 
 void cpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2, qreal prob) {
 
-    // scale 25% of amps but iterate all
+    // all amplitudes are scaled; 25% by c1 and 75% by 1 + c3
     qindex numIts = qureg.numAmpsPerNode;
 
+    auto factors = util_getTwoQubitDepolarisingFactors(prob);
+    qreal fac0 = 1 + factors.c3;
+    qreal fac1 = factors.c1 - fac0;
+
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
 
-    qreal c3 = util_getTwoQubitDepolarisingFactors(prob).c3;
-
-    // TODO:
-    // are we really inefficiently enumerating all amps and applying a non-unity
-    // factor to only 25%?! Is this because we do not know braBit2 and ergo 
-    // cannot be sure a direct enumeration is accessing indicies in a monotonically
-    // increasing order? Can that really outweigh a 3x slowdown?! Test and fix!
-
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
-
-        // choose whether to modify amp
-        bool flag1 = getBit(n, ketQb1) == braBit1; 
-        bool flag2 = getBit(n, ketQb2) == braBit2;
-        bool mod   = !(flag1 & flag2);
         
-        // multiply amp by 1 or (1 + c3)
-        qureg.cpuAmps[n] *=  1 + c3 * mod;
+        // choose factor by which to sacle amp
+        bool same1 = getBit(n, ketQb1) == braBit1; 
+        bool same2 = getBit(n, ketQb2) == braBit2;
+        bool flag = same1 & same2;
+
+        // scale amp by c1 or (1+c3)
+        qureg.cpuAmps[n] *= fac1 * flag + fac0;;
     }
 }
 
@@ -1236,9 +1236,7 @@ void cpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2,
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
 
-    auto factors = util_getTwoQubitDepolarisingFactors(prob);
-    auto c1 = factors.c1;
-    auto c2 = factors.c2;
+    auto c2 = util_getTwoQubitDepolarisingFactors(prob).c2;
 
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
@@ -1250,42 +1248,11 @@ void cpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2,
         qindex j = n + offset;
 
         // mix local amp with received buffer amp
-        qureg.cpuAmps[i] *= c1;
         qureg.cpuAmps[i] += c2 * qureg.cpuCommBuffer[j];
     }
 }
 
 
-void cpu_densmatr_twoQubitDepolarising_subG(Qureg qureg, int ketQb1, int ketQb2, qreal prob) {
-
-    // modify 25% of local amps, one per iteration
-    qindex numIts = qureg.numAmpsPerNode / 4;
-
-    // received amplitudes may begin at an arbitrary offset in the buffer
-    qindex offset = getBufferRecvInd();
-
-    int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
-    int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
-
-    auto factors = util_getTwoQubitDepolarisingFactors(prob);
-    auto c1 = factors.c1;
-    auto c2 = factors.c2;
-
-    #pragma omp parallel for if(qureg.isMultithreaded)
-    for (qindex n=0; n<numIts; n++) {
-
-        // i = nth local index where suffix ket qubits equal prefix bra qubits
-        qindex i = insertTwoBits(n, ketQb2, braBit2, ketQb1, braBit1);
-
-        // j = nth received amp in buffer
-        qindex j = n + offset;
-
-        // overwrite local amp with buffer amp
-        qureg.cpuAmps[i] = (c2 / c1) * qureg.cpuCommBuffer[j];
-    }
-}
-
-
 
 /*
  * PAULI CHANNEL
diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
index d238552b..9c4186c3 100644
--- a/quest/src/cpu/cpu_subroutines.hpp
+++ b/quest/src/cpu/cpu_subroutines.hpp
@@ -110,7 +110,6 @@ void cpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int qubit1, int qubit2,
 void cpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int qubit1, int qubit2, qreal prob);
 void cpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int qubit1, int qubit2, qreal prob);
 void cpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int qubit1, int qubit2, qreal prob);
-void cpu_densmatr_twoQubitDepolarising_subG(Qureg qureg, int qubit1, int qubit2, qreal prob);
 
 void cpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int qubit, qreal pI, qreal pX, qreal pY, qreal pZ);
 void cpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int qubit, qreal pI, qreal pX, qreal pY, qreal pZ);
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 8fbec202..078c152f 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -787,7 +787,7 @@ __global__ void kernel_densmatr_twoQubitDepolarising_subA(
 
 __global__ void kernel_densmatr_twoQubitDepolarising_subB(
     cu_qcomp* amps, qindex numThreads, 
-    int ketQb1, int ketQb2, int braQb1, int braQb2, qreal c1, qreal c2
+    int ketQb1, int ketQb2, int braQb1, int braQb2, qreal c1alt, qreal c2
 ) {
     GET_THREAD_IND(n, numThreads);
 
@@ -800,10 +800,10 @@ __global__ void kernel_densmatr_twoQubitDepolarising_subB(
     // mix 1/16 of all amps in groups of 4
     cu_qcomp term = amps[i0000] + amps[i0101] + amps[i1010] + amps[i1111];
 
-    amps[i0000] = c1*amps[i0000] + c2*term;
-    amps[i0101] = c1*amps[i0101] + c2*term;
-    amps[i1010] = c1*amps[i1010] + c2*term;
-    amps[i1111] = c1*amps[i1111] + c2*term;
+    amps[i0000] = c1alt*amps[i0000] + c2*term;
+    amps[i0101] = c1alt*amps[i0101] + c2*term;
+    amps[i1010] = c1alt*amps[i1010] + c2*term;
+    amps[i1111] = c1alt*amps[i1111] + c2*term;
 }
 
 
@@ -850,28 +850,23 @@ __global__ void kernel_densmatr_twoQubitDepolarising_subD(
 
 __global__ void kernel_densmatr_twoQubitDepolarising_subE(
     cu_qcomp* amps, qindex numThreads, 
-    int ketQb1, int ketQb2, int braBit1, int braBit2, qreal c3
+    int ketQb1, int ketQb2, int braBit1, int braBit2, qreal fac0, qreal fac1
 ) {
     GET_THREAD_IND(n, numThreads);
 
-    // TODO:
-    // this kernel modifies every amplitude, but I think only
-    // 25% are actually being changed; fix this by dispatching
-    // 25% fewer kernels which go straight to the modified amps
+    // choose factor by which to scale amp
+    bool same1 = getBit(n, ketQb1) == braBit1; 
+    bool same2 = getBit(n, ketQb2) == braBit2;
+    bool flag = (same1 & same2);
 
-    // choose whether to modify amp
-    bool flag1 = getBit(n, ketQb1) == braBit1; 
-    bool flag2 = getBit(n, ketQb2) == braBit2;
-    bool mod   = !(flag1 & flag2);
-    
-    // multiply amp by 1 or (1 + c3)
-    amps[n] *=  1 + c3 * mod;
+    // scale amp by c1 or (1+c3)
+    amps[n] *=  fac1 * flag + fac0;
 }
 
 
 __global__ void kernel_densmatr_twoQubitDepolarising_subF(
     cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, 
-    int ketQb1, int ketQb2, int braBit1, int braBit2, qreal c1, qreal c2
+    int ketQb1, int ketQb2, int braBit1, int braBit2, qreal c2
 ) {
     GET_THREAD_IND(n, numThreads);
 
@@ -879,21 +874,7 @@ __global__ void kernel_densmatr_twoQubitDepolarising_subF(
     qindex i = insertTwoBits(n, ketQb2, braBit2, ketQb1, braBit1);
 
     // mix local amp with received buffer amp
-    amps[i] = c1*amps[i] + c2*buffer[n];
-}
-
-
-__global__ void kernel_densmatr_twoQubitDepolarising_subG(
-    cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, 
-    int ketQb1, int ketQb2, int braBit1, int braBit2, qreal c1, qreal c2
-) {
-    GET_THREAD_IND(n, numThreads);
-
-    // i = nth local index where suffix ket qubits equal prefix bra qubits
-    qindex i = insertTwoBits(n, ketQb2, braBit2, ketQb1, braBit1);
-
-    // overwrite local amp with buffer amp
-    amps[i] = (c2 / c1) * buffer[n];
+    amps[i] += c2*buffer[n];
 }
 
 
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index 98a27530..03d0a7d9 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -984,9 +984,12 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2,
     int braQb2 = util_getBraQubit(ketQb2, qureg);
     auto factors = util_getTwoQubitDepolarisingFactors(prob);
 
+    // each kernel invocation sums all 4 amps together, so adjusts c1
+    qreal altc1 = factors.c1 - factors.c2;
+
     kernel_densmatr_twoQubitDepolarising_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
         toCuQcomps(qureg.gpuAmps), numThreads,
-        ketQb1, ketQb2, braQb1, braQb2, factors.c1, factors.c2
+        ketQb1, ketQb2, braQb1, braQb2, altc1, factors.c2
     );
 
 #else
@@ -1049,11 +1052,14 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2,
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
-    auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3;
+
+    auto factors = util_getTwoQubitDepolarisingFactors(prob);
+    qreal fac0 = 1 + factors.c3;
+    qreal fac1 = factors.c1 - fac0;
 
     kernel_densmatr_twoQubitDepolarising_subE <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
         toCuQcomps(qureg.gpuAmps), numThreads,
-        ketQb1, ketQb2, braBit1, braBit2, c3
+        ketQb1, ketQb2, braBit1, braBit2, fac0, fac1
     );
 
 #else
@@ -1072,34 +1078,11 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2,
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
-    auto factors = util_getTwoQubitDepolarisingFactors(prob);
+    auto c2 = util_getTwoQubitDepolarisingFactors(prob).c2;
 
     kernel_densmatr_twoQubitDepolarising_subF <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads,
-        ketQb1, ketQb2, braBit1, braBit2, factors.c1, factors.c2
-    );
-
-#else
-    error_gpuSimButGpuNotCompiled();
-#endif
-}
-
-
-void gpu_densmatr_twoQubitDepolarising_subG(Qureg qureg, int ketQb1, int ketQb2, qreal prob) {
-
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
-
-    qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
-    qindex offset = getBufferRecvInd();
-
-    int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
-    int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
-    auto factors = util_getTwoQubitDepolarisingFactors(prob);
-
-    kernel_densmatr_twoQubitDepolarising_subG <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
-        toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads,
-        ketQb1, ketQb2, braBit1, braBit2, factors.c1, factors.c2
+        ketQb1, ketQb2, braBit1, braBit2, c2
     );
 
 #else
diff --git a/quest/src/gpu/gpu_subroutines.hpp b/quest/src/gpu/gpu_subroutines.hpp
index 9f773ba8..1b042749 100644
--- a/quest/src/gpu/gpu_subroutines.hpp
+++ b/quest/src/gpu/gpu_subroutines.hpp
@@ -108,7 +108,6 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int qubit1, int qubit2,
 void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int qubit1, int qubit2, qreal prob);
 void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int qubit1, int qubit2, qreal prob);
 void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int qubit1, int qubit2, qreal prob);
-void gpu_densmatr_twoQubitDepolarising_subG(Qureg qureg, int qubit1, int qubit2, qreal prob);
 
 void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI, qreal pX, qreal pY, qreal pZ);
 void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI, qreal pX, qreal pY, qreal pZ);