Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

misc bug patches #515

Merged
merged 7 commits into from
Jan 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions quest/include/deprecated.h
Original file line number Diff line number Diff line change
Expand Up @@ -964,9 +964,11 @@ static inline PauliStrSum _createPauliStrSumFromCodes(int numQubits, _NoWarnPaul

PauliStr* strings = (PauliStr*) malloc(numTerms * sizeof *strings);
for (int i=0; i<numTerms; i++) {
int codes[100];

int codes[100]; // assumes numQubits<=100
for (int j=0; j<numQubits && j<100; j++)
codes[i] = (int) allPauliCodes[i*numQubits+j];
codes[j] = (int) allPauliCodes[i*numQubits+j];

strings[i] = getPauliStr(codes, targs, numQubits);
}

Expand Down
4 changes: 2 additions & 2 deletions quest/src/api/decoherence.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ void mixKrausMap(Qureg qureg, int* qubits, int numQubits, KrausMap map) {
validate_quregFields(qureg, __func__);
validate_quregIsDensityMatrix(qureg, __func__);
validate_targets(qureg, qubits, numQubits, __func__);
validate_mixedAmpsFitInNode(qureg, numQubits, __func__);
validate_krausMapIsCPTP(map, __func__); // also checks fields and is-sync
validate_krausMapMatchesTargets(map, numQubits, __func__);

Expand All @@ -123,8 +124,7 @@ void mixQureg(Qureg outQureg, Qureg inQureg, qreal inProb) {
validate_quregFields(outQureg, __func__);
validate_quregFields(inQureg, __func__);
validate_probability(inProb, __func__);
validate_quregIsDensityMatrix(outQureg, __func__);
validate_quregsCanBeMixed(outQureg, inQureg, __func__);
validate_quregsCanBeMixed(outQureg, inQureg, __func__); // checks outQureg is densmatr

qreal outProb = 1 - inProb;
localiser_densmatr_mixQureg(outProb, outQureg, inProb, inQureg);
Expand Down
31 changes: 20 additions & 11 deletions quest/src/api/operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ void validateAndApplyAnyCtrlAnyTargUnitaryMatrix(Qureg qureg, int* ctrls, int* s
validate_controlsAndTargets(qureg, ctrls, numCtrls, targs, numTargs, caller);
validate_controlStates(states, numCtrls, caller);
validate_matrixDimMatchesTargets(matr, numTargs, caller); // also checks fields and is-synced
validate_mixedAmpsFitInNode(qureg, numTargs, caller);
validate_matrixIsUnitary(matr, caller); // harmlessly rechecks fields and is-synced

auto ctrlVec = util_getVector(ctrls, numCtrls);
Expand Down Expand Up @@ -117,6 +118,7 @@ void multiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix)
validate_quregFields(qureg, __func__);
validate_twoTargets(qureg, target1, target2, __func__);
validate_matrixFields(matrix, __func__); // matrix can be non-unitary
validate_mixedAmpsFitInNode(qureg, 2, __func__);

bool conj = false;
localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, {}, {}, target1, target2, matrix, conj);
Expand Down Expand Up @@ -156,6 +158,7 @@ void multiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix
validate_quregFields(qureg, __func__);
validate_targets(qureg, targets, numTargets, __func__);
validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
validate_mixedAmpsFitInNode(qureg, numTargets, __func__);

bool conj = false;
localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, util_getVector(targets, numTargets), matrix, conj);
Expand Down Expand Up @@ -376,8 +379,8 @@ void multiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
bool onlyMultiply = true;
qcomp exponent = qcomp(1, 0);
(qureg.isDensityMatrix)?
localiser_statevec_allTargDiagMatr(qureg, matrix, exponent) :
localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply);
localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply):
localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
}

void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
Expand All @@ -387,8 +390,8 @@ void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp

bool onlyMultiply = true;
(qureg.isDensityMatrix)?
localiser_statevec_allTargDiagMatr(qureg, matrix, exponent) :
localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply);
localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply):
localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
}

void applyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
Expand All @@ -400,8 +403,8 @@ void applyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
bool onlyMultiply = false;
qcomp exponent = qcomp(1, 0);
(qureg.isDensityMatrix)?
localiser_statevec_allTargDiagMatr(qureg, matrix, exponent) :
localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply);
localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply):
localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
}

void applyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
Expand All @@ -412,8 +415,8 @@ void applyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp ex

bool onlyMultiply = false;
(qureg.isDensityMatrix)?
localiser_statevec_allTargDiagMatr(qureg, matrix, exponent) :
localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply);
localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply):
localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
}


Expand Down Expand Up @@ -617,9 +620,14 @@ void applyMultiStateControlledSqrtSwap(Qureg qureg, int* controls, int* states,
validate_controlsAndTwoTargets(qureg, controls, numControls, target1, target2, __func__);
validate_controlStates(states, numControls, __func__); // permits states==nullptr

// this is likely suboptimal, and there must exist a more
// efficient bespoke strategy for sqrt-SWAP, although given
// it is a little esoteric, optimisation is not worthwhile
// TODO:
// this function effects sqrtSwap as a dense 2-qubit matrix,
// whereas a bespoke communication and simulation strategy is
// clearly possible which we have not supported because the gate
// is somewhat esoteric. As such, we must validate mixed-amps fit

validate_mixedAmpsFitInNode(qureg, 2, __func__); // to throw SqrtSwap error, not generic CompMatr2 error

CompMatr2 matr = getCompMatr2({
{1, 0, 0, 0},
{0, .5+.5_i, .5-.5_i, 0},
Expand Down Expand Up @@ -1224,6 +1232,7 @@ void applySuperOp(Qureg qureg, SuperOp superop, int* targets, int numTargets) {
validate_superOpFields(superop, __func__);
validate_superOpIsSynced(superop, __func__);
validate_superOpDimMatchesTargs(superop, numTargets, __func__);
validate_mixedAmpsFitInNode(qureg, numTargets, __func__);

localiser_densmatr_superoperator(qureg, superop, util_getVector(targets, numTargets));
}
Expand Down
3 changes: 0 additions & 3 deletions quest/src/api/qureg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,6 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib
// automatically overwrite distrib, GPU, and multithread fields which were left as modeflag::USE_AUTO
autodep_chooseQuregDeployment(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread, env);

// throw error if the user had forced multithreading but GPU accel was auto-chosen
validate_newQuregNotBothMultithreadedAndGpuAccel(useGpuAccel, useMultithread, caller);

Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread);

// always allocate CPU memory
Expand Down
6 changes: 0 additions & 6 deletions quest/src/core/accelerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -747,12 +747,6 @@ void accel_densmatr_twoQubitDepolarising_subF(Qureg qureg, int qubit1, int qubit
gpu_densmatr_twoQubitDepolarising_subF(qureg, qubit1, qubit2, prob):
cpu_densmatr_twoQubitDepolarising_subF(qureg, qubit1, qubit2, prob);
}
// dispatches the subG stage of two-qubit depolarising to the GPU or CPU
// backend, according to the Qureg's accelerated deployment
void accel_densmatr_twoQubitDepolarising_subG(Qureg qureg, int qubit1, int qubit2, qreal prob) {

    if (qureg.isGpuAccelerated)
        gpu_densmatr_twoQubitDepolarising_subG(qureg, qubit1, qubit2, prob);
    else
        cpu_densmatr_twoQubitDepolarising_subG(qureg, qubit1, qubit2, prob);
}



Expand Down
1 change: 0 additions & 1 deletion quest/src/core/accelerator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,6 @@ void accel_densmatr_twoQubitDepolarising_subC(Qureg qureg, int qubit1, int qubit
void accel_densmatr_twoQubitDepolarising_subD(Qureg qureg, int qubit1, int qubit2, qreal prob);
void accel_densmatr_twoQubitDepolarising_subE(Qureg qureg, int qubit1, int qubit2, qreal prob);
void accel_densmatr_twoQubitDepolarising_subF(Qureg qureg, int qubit1, int qubit2, qreal prob);
void accel_densmatr_twoQubitDepolarising_subG(Qureg qureg, int qubit1, int qubit2, qreal prob);

void accel_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int qubit, qreal pI, qreal pX, qreal pY, qreal pZ);
void accel_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int qubit, qreal pI, qreal pX, qreal pY, qreal pZ);
Expand Down
40 changes: 20 additions & 20 deletions quest/src/core/autodeployer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ void chooseWhetherToDistributeQureg(int numQubits, int isDensMatr, int &useDistr
// it's ok if we cannot query RAM; if we'd have exceeded it, it's likely we'll exceed auto-threshold and will still distribute
} catch (mem::COULD_NOT_QUERY_RAM &e) {}

// force distribution if GPU deployment is possible but we exceed local VRAM
// force distribution if GPU deployment is available but we exceed local VRAM;
// this is preferable over falling back to CPU-only which would be astonishingly slow
if (useGpuAccel == 1 || useGpuAccel == modeflag::USE_AUTO) {
size_t localGpuMem = gpu_getCurrentAvailableMemoryInBytes();
if (!mem_canQuregFitInMemory(numQubits, isDensMatr, 1, localGpuMem)) {
Expand All @@ -76,47 +77,44 @@ void chooseWhetherToDistributeQureg(int numQubits, int isDensMatr, int &useDistr
}
}

// by now, we know that Qureg can definitely fit into a single GPU, or principally fit into RAM,
// but we may still wish to distribute it so that multiple Quregs don't choke up memory.
// to reach here, we know that Qureg can fit into the remaining memory of a single GPU, or principally
// fit into RAM, but we may still wish to distribute for improved parallelisation and to avoid memory saturation
int effectiveNumQubitsPerNode = mem_getEffectiveNumStateVecQubitsPerNode(numQubits, isDensMatr, numEnvNodes);
useDistrib = (effectiveNumQubitsPerNode >= MIN_NUM_LOCAL_QUBITS_FOR_AUTO_QUREG_DISTRIBUTION);
}


void chooseWhetherToGpuAccelQureg(int numQubits, int isDensMatr, int useDistrib, int &useGpuAccel, int numQuregNodes) {
void chooseWhetherToGpuAccelQureg(int numQubits, int isDensMatr, int &useGpuAccel, int numQuregNodes) {

// if the flag is already set, don't change it
if (useGpuAccel != modeflag::USE_AUTO)
return;

// determine the 'effective number of qubits' each GPU would have to simulate, if distributed
// determine the 'effective number of qubits' each GPU would have to simulate, if distributed...
int effectiveNumQubits = mem_getEffectiveNumStateVecQubitsPerNode(numQubits, isDensMatr, numQuregNodes);

// choose to GPU accelerate only if that's not too few
// and choose to GPU accelerate only if that's not too few
useGpuAccel = (effectiveNumQubits >= MIN_NUM_LOCAL_QUBITS_FOR_AUTO_QUREG_GPU_ACCELERATION);

// notice there was no automatic disabling of GPU acceleration in the scenario that the local
// partition exceeded GPU memory. This is because such a scenario would be catastrophically
// slow and astonish users by leaving GPUs idle in intensive simulation. Instead, we auto-deploy
// to GPU and subsequent validation will notice we exceeded GPU memory.
// to GPU anyway and subsequent validation will notice we exceeded GPU memory and report an error.
}


void chooseWhetherToMultithreadQureg(int numQubits, int isDensMatr, int useDistrib, int useGpuAccel, int &useMultithread, int numQuregNodes) {
void chooseWhetherToMultithreadQureg(int numQubits, int isDensMatr, int &useMultithread, int numQuregNodes) {

// if the flag is already set (user-given, or inferred from env), don't change it
if (useMultithread != modeflag::USE_AUTO)
return;

// if GPU-aceleration was chosen, disable auto multithreading...
if (useGpuAccel) {
useMultithread = 0;
return;
}

// otherwise, we're not GPU-accelerating, and should choose to multithread based on Qureg size
// otherwise, choose to multithread based on Qureg size
int effectiveNumQubits = mem_getEffectiveNumStateVecQubitsPerNode(numQubits, isDensMatr, numQuregNodes);
useMultithread = (effectiveNumQubits >= MIN_NUM_LOCAL_QUBITS_FOR_AUTO_QUREG_MULTITHREADING);

// note the qureg may be simultaneously GPU-accelerated and so never use its
// multithreaded CPU routines, except in functions which accept multiple Quregs
}


Expand All @@ -125,8 +123,6 @@ void autodep_chooseQuregDeployment(int numQubits, int isDensMatr, int &useDistri
// preconditions:
// - the given configuration is compatible with env (assured by prior validation)
// - this means no deployment is forced (=1) which is incompatible with env
// - it also means GPU-acceleration and multithreading are not simultaneously forced
// (although they may still be left automatic and need explicit revision)

// disable any automatic deployments not permitted by env (it's guaranteed we never overwrite =1 to =0)
if (!env.isDistributed)
Expand All @@ -141,11 +137,15 @@ void autodep_chooseQuregDeployment(int numQubits, int isDensMatr, int &useDistri
if (env.numNodes == 1)
useDistrib = 0;

// overwrite any auto options (== modeflag::USE_AUTO)
// overwrite useDistrib
chooseWhetherToDistributeQureg(numQubits, isDensMatr, useDistrib, useGpuAccel, env.numNodes);
int numQuregNodes = (useDistrib)? env.numNodes : 1;
chooseWhetherToGpuAccelQureg(numQubits, isDensMatr, useDistrib, useGpuAccel, numQuregNodes);
chooseWhetherToMultithreadQureg(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread, numQuregNodes);

// overwrite useGpuAccel
chooseWhetherToGpuAccelQureg(numQubits, isDensMatr, useGpuAccel, numQuregNodes);

// overwrite useMultithread
chooseWhetherToMultithreadQureg(numQubits, isDensMatr, useMultithread, numQuregNodes);
}


Expand Down
8 changes: 7 additions & 1 deletion quest/src/core/errors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ void assert_pairRankIsDistinct(Qureg qureg, int pairRank) {

// asserts that a distributed exchange does not send and receive overlapping
// portions of the communication buffer (MPI forbids aliased send/recv regions).
// The half-open regions [sendInd, sendInd+numAmps) and [recvInd, recvInd+numAmps)
// intersect if and only if each begins before the other ends; a one-sided check
// (e.g. sendInd < recvInd + numAmps) misfires whenever the send region lies
// entirely before the receive region, so we test both directions.
void assert_bufferSendRecvDoesNotOverlap(qindex sendInd, qindex recvInd, qindex numAmps) {

    // an empty exchange trivially cannot overlap
    if (numAmps <= 0)
        return;

    // symmetric half-open interval intersection test; correct regardless
    // of whether the send or the receive region comes first in the buffer
    if (sendInd < recvInd + numAmps && recvInd < sendInd + numAmps)
        raiseInternalError("A distributed function attempted to send and receive portions of the buffer which overlapped.");
}

Expand Down Expand Up @@ -684,6 +684,12 @@ void assert_utilsGivenDensMatr(Qureg qureg) {
raiseInternalError("A utility function was given a statevector where a density matrix was expected.");
}

// internal sanity check: epsilon-based validators (isUnitary, isHermitian,
// isCPTP) must never be invoked when the validation epsilon is disabled (zero)
void assert_utilsGivenNonZeroEpsilon(qreal eps) {

    if (eps != 0)
        return;

    raiseInternalError("A utility function (isUnitary, isHermitian, isCPTP) received an epsilon of zero, which should have precluded it being called.");
}



/*
Expand Down
2 changes: 2 additions & 0 deletions quest/src/core/errors.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,8 @@ void assert_utilsGivenStateVec(Qureg qureg);

void assert_utilsGivenDensMatr(Qureg qureg);

void assert_utilsGivenNonZeroEpsilon(qreal eps);



/*
Expand Down
46 changes: 34 additions & 12 deletions quest/src/core/localiser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -803,9 +803,15 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, vector<int> ctrls, vect
// a communicator which may be inelegant alongside our own distribution scheme.

// perform necessary swaps to move all targets into suffix, each of which invokes communication
for (size_t i=0; i<targsA.size(); i++)
if (targsA[i] != targsB[i])
anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, targsA[i], targsB[i]);
for (size_t i=0; i<targsA.size(); i++) {

if (targsA[i] == targsB[i])
continue;

int suffixTarg = std::min(targsA[i], targsB[i]);
int prefixTarg = std::max(targsA[i], targsB[i]);
anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg);
}
}


Expand Down Expand Up @@ -1401,18 +1407,25 @@ void twoQubitDepolarisingOnPrefixAndPrefix(Qureg qureg, int ketQb1, int ketQb2,
int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);

// scale 25% of (non-communicated) amps
// pack unscaled amps before subsequent scaling
qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, {ketQb1,ketQb2}, {braBit1,braBit2});

// scale all amps
accel_densmatr_twoQubitDepolarising_subE(qureg, ketQb1, ketQb2, prob);

// pack and swap 25% of buffer, and use it to modify 25% of local amps
// swap the buffer with 3 other nodes to update local amps
int pairRank1 = util_getRankWithBraQubitFlipped(ketQb1, qureg);
exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank1, {ketQb1,ketQb2}, {braBit1,braBit2});
int pairRank2 = util_getRankWithBraQubitFlipped(ketQb2, qureg);
int pairRank3 = util_getRankWithBraQubitsFlipped({ketQb1,ketQb2}, qureg);

comm_exchangeSubBuffers(qureg, numPacked, pairRank1);
accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob);

// pack and swap another 25% of buffer (we could pack during subE, but we choose not to)
int pairRank2 = util_getRankWithBraQubitFlipped(ketQb2, qureg);
exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank2, {ketQb1,ketQb2}, {braBit1,braBit2});
accel_densmatr_twoQubitDepolarising_subG(qureg, ketQb1, ketQb2, prob);
comm_exchangeSubBuffers(qureg, numPacked, pairRank2);
accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob);

comm_exchangeSubBuffers(qureg, numPacked, pairRank3);
accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob);
}


Expand Down Expand Up @@ -1755,8 +1768,17 @@ qreal localiser_densmatr_calcProbOfMultiQubitOutcome(Qureg qureg, vector<int> qu

if (doAnyLocalStatesHaveQubitValues(qureg, braQubits, outcomes)) {

// such nodes need to know all ket qubits (which are all suffix)
prob += accel_densmatr_calcProbOfMultiQubitOutcome_sub(qureg, qubits, outcomes);
// such nodes need only know the ket qubits/outcomes for which the bra-qubits are in suffix
vector<int> ketQubitsWithBraInSuffix;
vector<int> ketOutcomesWithBraInSuffix;
for (int q=0; q<qubits.size(); q++)
if (util_isBraQubitInSuffix(qubits[q], qureg)) {
ketQubitsWithBraInSuffix.push_back(qubits[q]);
ketOutcomesWithBraInSuffix.push_back(outcomes[q]);
}

prob += accel_densmatr_calcProbOfMultiQubitOutcome_sub(
qureg, ketQubitsWithBraInSuffix, ketOutcomesWithBraInSuffix);
}

// all nodes must sum their probabilities (unless qureg was cloned per-node), for consensus
Expand Down
Loading