From 252e8c3b1a78c1fc78f12b07d0fea1d2457f935a Mon Sep 17 00:00:00 2001
From: Sevag Hanssian <sevagh@protonmail.com>
Date: Wed, 21 Apr 2021 23:57:30 -0400
Subject: [PATCH 01/11] Use cupy scipy.fft on GPU

---
 museval/metrics.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/museval/metrics.py b/museval/metrics.py
index 7bfb78b..6e3bab7 100644
--- a/museval/metrics.py
+++ b/museval/metrics.py
@@ -46,7 +46,9 @@
      1706, IRISA, April 2005."""
 
 import numpy as np
-import scipy.fftpack
+import scipy.fft
+import cupyx
+import cupy
 from scipy.linalg import toeplitz
 from scipy.signal import fftconvolve
 import itertools
@@ -523,19 +525,20 @@ def _compute_reference_correlations(reference_sources, filters_len):
     # zero padding and FFT of references
     reference_sources = _zeropad(reference_sources, filters_len - 1, axis=2)
     n_fft = int(2**np.ceil(np.log2(nsampl + filters_len - 1.)))
-    sf = scipy.fftpack.fft(reference_sources, n=n_fft, axis=2)
+
+    sf = cupy.asnumpy(cupyx.scipy.fft.fft(cupy.asarray(reference_sources), n=n_fft, axis=2))
 
     # compute intercorrelation between sources
     G = np.zeros((nsrc, nsrc, nchan, nchan, filters_len, filters_len))
+
     for ((i, c1), (j, c2)) in itertools.combinations_with_replacement(
         itertools.product(
             list(range(nsrc)), list(range(nchan))
         ),
         2
     ):
+        ssf = np.real(cupy.asnumpy(cupyx.scipy.fft.ifft(cupy.asarray(sf[j, c2] * np.conj(sf[i, c1])))))
 
-        ssf = sf[j, c2] * np.conj(sf[i, c1])
-        ssf = np.real(scipy.fftpack.ifft(ssf))
         ss = toeplitz(
             np.hstack((ssf[0], ssf[-1:-filters_len:-1])),
             r=ssf[:filters_len]
@@ -569,15 +572,16 @@ def _compute_projection_filters(G, sf, estimated_source):
 
     # compute its FFT
     n_fft = int(2**np.ceil(np.log2(nsampl + filters_len - 1.)))
-    sef = scipy.fftpack.fft(estimated_source, n=n_fft)
+
+    sef = cupy.asnumpy(cupyx.scipy.fft.fft(cupy.asarray(estimated_source, dtype=np.float32), n=n_fft))
 
     # compute the cross-correlations between sources and estimates
     D = np.zeros((nsrc, nchan, filters_len, nchan))
+
     for (j, cj, c) in itertools.product(
         list(range(nsrc)), list(range(nchan)), list(range(nchan))
     ):
-        ssef = sf[j, cj] * np.conj(sef[c])
-        ssef = np.real(scipy.fftpack.ifft(ssef))
+        ssef = np.real(cupy.asnumpy(cupyx.scipy.fft.ifft(cupy.asarray(sf[j, cj] * np.conj(sef[c])))))
         D[j, cj, :, c] = np.hstack((ssef[0], ssef[-1:-filters_len:-1]))
 
     # reshape matrices to build the filters

From 309db90046eb5d5803967c87f6a69b66fd026119 Mon Sep 17 00:00:00 2001
From: Sevag Hanssian <sevagh@protonmail.com>
Date: Sat, 24 Apr 2021 11:14:40 -0400
Subject: [PATCH 02/11] Speed up slowest computations with cupy

---
 museval/metrics.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/museval/metrics.py b/museval/metrics.py
index 6e3bab7..2585121 100644
--- a/museval/metrics.py
+++ b/museval/metrics.py
@@ -526,7 +526,7 @@ def _compute_reference_correlations(reference_sources, filters_len):
     reference_sources = _zeropad(reference_sources, filters_len - 1, axis=2)
     n_fft = int(2**np.ceil(np.log2(nsampl + filters_len - 1.)))
 
-    sf = cupy.asnumpy(cupyx.scipy.fft.fft(cupy.asarray(reference_sources), n=n_fft, axis=2))
+    sf = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(reference_sources), n=n_fft, axis=2))
 
     # compute intercorrelation between sources
     G = np.zeros((nsrc, nsrc, nchan, nchan, filters_len, filters_len))
@@ -537,7 +537,7 @@ def _compute_reference_correlations(reference_sources, filters_len):
         ),
         2
     ):
-        ssf = np.real(cupy.asnumpy(cupyx.scipy.fft.ifft(cupy.asarray(sf[j, c2] * np.conj(sf[i, c1])))))
+        ssf = cupy.asnumpy(cupyx.scipy.fft.irfft(cupy.asarray(sf[j, c2] * np.conj(sf[i, c1]))))
 
         ss = toeplitz(
             np.hstack((ssf[0], ssf[-1:-filters_len:-1])),
@@ -573,7 +573,7 @@ def _compute_projection_filters(G, sf, estimated_source):
     # compute its FFT
     n_fft = int(2**np.ceil(np.log2(nsampl + filters_len - 1.)))
 
-    sef = cupy.asnumpy(cupyx.scipy.fft.fft(cupy.asarray(estimated_source, dtype=np.float32), n=n_fft))
+    sef = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(estimated_source, dtype=np.float32), n=n_fft))
 
     # compute the cross-correlations between sources and estimates
     D = np.zeros((nsrc, nchan, filters_len, nchan))
@@ -581,20 +581,23 @@ def _compute_projection_filters(G, sf, estimated_source):
     for (j, cj, c) in itertools.product(
         list(range(nsrc)), list(range(nchan)), list(range(nchan))
     ):
-        ssef = np.real(cupy.asnumpy(cupyx.scipy.fft.ifft(cupy.asarray(sf[j, cj] * np.conj(sef[c])))))
+        ssef = cupy.asnumpy(cupyx.scipy.fft.irfft(cupy.asarray(sf[j, cj] * np.conj(sef[c]))))
         D[j, cj, :, c] = np.hstack((ssef[0], ssef[-1:-filters_len:-1]))
 
     # reshape matrices to build the filters
     D = D.reshape(nsrc * nchan * filters_len, nchan)
     G = _reshape_G(G)
 
+    D_gpu = cupy.asarray(D)
+    G_gpu = cupy.asarray(G)
+
     # Distortion filters
     try:
-        C = np.linalg.solve(G + eps*np.eye(G.shape[0]), D).reshape(
+        C = cupy.asnumpy(cupy.linalg.solve(G_gpu + eps*cupy.eye(G.shape[0]), D_gpu)).reshape(
             nsrc, nchan, filters_len, nchan
         )
     except np.linalg.linalg.LinAlgError:
-        C = np.linalg.lstsq(G, D)[0].reshape(
+        C = cupy.asnumpy(cupy.linalg.lstsq(G_gpu, D_gpu))[0].reshape(
             nsrc, nchan, filters_len, nchan
         )
 

From 4bd943c8f52fdf29df5b4a5e5dbbee2f9bf5e28f Mon Sep 17 00:00:00 2001
From: Sevag Hanssian <sevagh@protonmail.com>
Date: Sun, 25 Apr 2021 12:02:36 -0400
Subject: [PATCH 03/11] Use float32 for better and more stable gpu performance

---
 museval/metrics.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/museval/metrics.py b/museval/metrics.py
index 2585121..8d5b701 100644
--- a/museval/metrics.py
+++ b/museval/metrics.py
@@ -526,7 +526,7 @@ def _compute_reference_correlations(reference_sources, filters_len):
     reference_sources = _zeropad(reference_sources, filters_len - 1, axis=2)
     n_fft = int(2**np.ceil(np.log2(nsampl + filters_len - 1.)))
 
-    sf = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(reference_sources), n=n_fft, axis=2))
+    sf = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(reference_sources, dtype=np.float32), n=n_fft, axis=2))
 
     # compute intercorrelation between sources
     G = np.zeros((nsrc, nsrc, nchan, nchan, filters_len, filters_len))
@@ -538,13 +538,13 @@ def _compute_reference_correlations(reference_sources, filters_len):
         2
     ):
         ssf = cupy.asnumpy(cupyx.scipy.fft.irfft(cupy.asarray(sf[j, c2] * np.conj(sf[i, c1]))))
-
         ss = toeplitz(
             np.hstack((ssf[0], ssf[-1:-filters_len:-1])),
             r=ssf[:filters_len]
         )
         G[j, i, c2, c1] = ss
         G[i, j, c1, c2] = ss.T
+
     return G, sf
 
 
@@ -588,8 +588,8 @@ def _compute_projection_filters(G, sf, estimated_source):
     D = D.reshape(nsrc * nchan * filters_len, nchan)
     G = _reshape_G(G)
 
-    D_gpu = cupy.asarray(D)
-    G_gpu = cupy.asarray(G)
+    D_gpu = cupy.asarray(D, dtype=np.float32)
+    G_gpu = cupy.asarray(G, dtype=np.float32)
 
     # Distortion filters
     try:

From 65f6843c64e4b6df40ba1b4897f3bfa292e0aaa4 Mon Sep 17 00:00:00 2001
From: Sevag Hanssian <sevagh@protonmail.com>
Date: Sun, 25 Apr 2021 13:19:36 -0400
Subject: [PATCH 04/11] Describe cupy accelerations in README

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 6f47ca8..439d4de 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,13 @@
 
 A python package to evaluate source separation results using the [MUSDB18](https://sigsep.github.io/musdb) dataset. This package was part of the [MUS task](https://sisec.inria.fr/home/2018-professionally-produced-music-recordings/) of the [Signal Separation Evaluation Campaign (SISEC)](https://sisec.inria.fr/).
 
+### Note on the sevagh fork
+
+My fork adds considerable speedups by using cupy for offloading the fft, ifft, and linalg executions of BSSv4 to the user's NVIDIA GPU. However, this has several downsides:
+* Switching to using GPU arithmetic and float32 changes output values due to numerical precision differences
+* Managing memory is not straightforward, often requiring the user to disable and re-enable the cupy fft cache between runs
+* Installing GPU libraries is a burden on end users, so it's a bigger conversation to merge such a requirement upstream
+
 ### BSSEval v4
 
 The BSSEval metrics, as implemented in the [MATLAB toolboxes](http://bass-db.gforge.inria.fr/bss_eval/) and their re-implementation in [mir_eval](http://craffel.github.io/mir_eval/#module-mir_eval.separation) are widely used in the audio separation literature. One particularity of BSSEval is to compute the metrics after optimally matching the estimates to the true sources through linear distortion filters. This allows the criteria to be robust to some linear mismatches. Apart from the optional evaluation for all possible permutations of the sources, this matching is the reason for most of the computation cost of BSSEval, especially considering it is done for each evaluation window when the metrics are computed on a framewise basis.

From f3b7540faddc8d2b1bc91cecaf05fd20d96df7c6 Mon Sep 17 00:00:00 2001
From: Sevag Hanssian <sevagh@protonmail.com>
Date: Tue, 27 Apr 2021 11:40:00 -0400
Subject: [PATCH 05/11] Fall back to host numpy/scipy if CUDA oom

---
 museval/metrics.py | 54 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/museval/metrics.py b/museval/metrics.py
index 8d5b701..655d38a 100644
--- a/museval/metrics.py
+++ b/museval/metrics.py
@@ -526,7 +526,10 @@ def _compute_reference_correlations(reference_sources, filters_len):
     reference_sources = _zeropad(reference_sources, filters_len - 1, axis=2)
     n_fft = int(2**np.ceil(np.log2(nsampl + filters_len - 1.)))
 
-    sf = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(reference_sources, dtype=np.float32), n=n_fft, axis=2))
+    try:
+        sf = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(reference_sources, dtype=np.float32), n=n_fft, axis=2))
+    except cupy.cuda.memory.OutOfMemoryError:
+        sf = scipy.fft.rfft(reference_sources, n=nfft, axis=2)
 
     # compute intercorrelation between sources
     G = np.zeros((nsrc, nsrc, nchan, nchan, filters_len, filters_len))
@@ -537,7 +540,11 @@ def _compute_reference_correlations(reference_sources, filters_len):
         ),
         2
     ):
-        ssf = cupy.asnumpy(cupyx.scipy.fft.irfft(cupy.asarray(sf[j, c2] * np.conj(sf[i, c1]))))
+        tmp = sf[j, c2] * np.conj(sf[i, c1])
+        try:
+            ssf = cupy.asnumpy(cupyx.scipy.fft.irfft(cupy.asarray(tmp)))
+        except cupy.cuda.memory.OutOfMemoryError:
+            ssf = scipy.fft.irfft(tmp)
         ss = toeplitz(
             np.hstack((ssf[0], ssf[-1:-filters_len:-1])),
             r=ssf[:filters_len]
@@ -573,7 +580,10 @@ def _compute_projection_filters(G, sf, estimated_source):
     # compute its FFT
     n_fft = int(2**np.ceil(np.log2(nsampl + filters_len - 1.)))
 
-    sef = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(estimated_source, dtype=np.float32), n=n_fft))
+    try:
+        sef = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(estimated_source, dtype=np.float32), n=n_fft))
+    except cupy.cuda.memory.OutOfMemoryError:
+        sef = scipy.fft.rfft(estimated_source, n=n_fft)
 
     # compute the cross-correlations between sources and estimates
     D = np.zeros((nsrc, nchan, filters_len, nchan))
@@ -581,25 +591,39 @@ def _compute_projection_filters(G, sf, estimated_source):
     for (j, cj, c) in itertools.product(
         list(range(nsrc)), list(range(nchan)), list(range(nchan))
     ):
-        ssef = cupy.asnumpy(cupyx.scipy.fft.irfft(cupy.asarray(sf[j, cj] * np.conj(sef[c]))))
+        tmp = sf[j, cj] * np.conj(sef[c])
+        try:
+            ssef = cupy.asnumpy(cupyx.scipy.fft.irfft(cupy.asarray(tmp)))
+        except cupy.cuda.memory.OutOfMemoryError:
+            ssef = scipy.fft.irfft(tmp)
         D[j, cj, :, c] = np.hstack((ssef[0], ssef[-1:-filters_len:-1]))
 
     # reshape matrices to build the filters
     D = D.reshape(nsrc * nchan * filters_len, nchan)
     G = _reshape_G(G)
 
-    D_gpu = cupy.asarray(D, dtype=np.float32)
-    G_gpu = cupy.asarray(G, dtype=np.float32)
-
-    # Distortion filters
     try:
-        C = cupy.asnumpy(cupy.linalg.solve(G_gpu + eps*cupy.eye(G.shape[0]), D_gpu)).reshape(
-            nsrc, nchan, filters_len, nchan
-        )
-    except np.linalg.linalg.LinAlgError:
-        C = cupy.asnumpy(cupy.linalg.lstsq(G_gpu, D_gpu))[0].reshape(
-            nsrc, nchan, filters_len, nchan
-        )
+        D_gpu = cupy.asarray(D, dtype=np.float32)
+        G_gpu = cupy.asarray(G, dtype=np.float32)
+
+        # Distortion filters
+        try:
+            C = cupy.asnumpy(cupy.linalg.solve(G_gpu + eps*cupy.eye(G.shape[0]), D_gpu)).reshape(
+                nsrc, nchan, filters_len, nchan
+            )
+        except np.linalg.linalg.LinAlgError:
+            C = cupy.asnumpy(cupy.linalg.lstsq(G_gpu, D_gpu))[0].reshape(
+                nsrc, nchan, filters_len, nchan
+            )
+    except cupy.cuda.memory.OutOfMemoryError:
+        try:
+            C = np.linalg.solve(G + eps*cupy.eye(G.shape[0]), D).reshape(
+                nsrc, nchan, filters_len, nchan
+            )
+        except np.linalg.linalg.LinAlgError:
+            C = np.linalg.lstsq(G, D)[0].reshape(
+                nsrc, nchan, filters_len, nchan
+            )
 
     # if we asked for one single reference source,
     # return just a nchan X filters_len matrix

From 178f2945d77dd8486526ae52b086963c6827b1cf Mon Sep 17 00:00:00 2001
From: Sevag Hanssian <sevagh@protonmail.com>
Date: Tue, 10 Aug 2021 09:35:17 -0400
Subject: [PATCH 06/11] Make cupy optional and add helper functions

---
 .gitignore               |   1 +
 museval/__init__.py      |   1 +
 museval/metrics.py       | 111 +++++++++++++++++++++++++++++----------
 setup.py                 |   1 +
 tests/test_regression.py |   2 +
 5 files changed, 87 insertions(+), 29 deletions(-)

diff --git a/.gitignore b/.gitignore
index db52ccb..2a9e0ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,3 +87,4 @@ data/*
 !data/fetch.sh
 !data/decode.sh
 Estimates/
+*.json
diff --git a/museval/__init__.py b/museval/__init__.py
index 346e748..d383fe2 100644
--- a/museval/__init__.py
+++ b/museval/__init__.py
@@ -9,6 +9,7 @@
 import pandas as pd
 from . aggregate import TrackStore, MethodStore, EvalStore, json2df
 from . import metrics
+from . metrics import disable_cupy, clear_cupy_cache
 
 
 def _load_track_estimates(track, estimates_dir, output_dir, ext='wav'):
diff --git a/museval/metrics.py b/museval/metrics.py
index 655d38a..34566a8 100644
--- a/museval/metrics.py
+++ b/museval/metrics.py
@@ -47,18 +47,47 @@
 
 import numpy as np
 import scipy.fft
-import cupyx
-import cupy
 from scipy.linalg import toeplitz
 from scipy.signal import fftconvolve
 import itertools
 import collections
 import warnings
 
+use_cupy = False
+try:
+    import cupyx
+    import cupy
+    use_cupy = True
+except ImportError:
+    warnings.warn('cupy not available, falling back to regular numpy', file=sys.stderr)
+
 # The maximum allowable number of sources (prevents insane computational load)
 MAX_SOURCES = 100
 
 
+# allows one to disable cupy even if it's available
+def disable_cupy():
+    global use_cupy
+    use_cupy = False
+
+
+# fft plans take up space; you might need to call this between large tracks
+def clear_cupy_cache():
+    # temporarily disable cupy's fft plan cache so its blocks can be freed
+    fft_cache = cupy.fft.config.get_plan_cache()
+    orig_sz = fft_cache.get_size()
+    orig_memsz = fft_cache.get_memsize()
+
+    # clear the cache
+    fft_cache.set_size(0)
+
+    cupy.get_default_memory_pool().free_all_blocks()
+
+    # re-enable cupy's fft plan cache with its original size limits
+    fft_cache.set_size(orig_sz)
+    fft_cache.set_memsize(orig_memsz)
+
+
 def validate(reference_sources, estimated_sources):
     """Checks that the input data to a metric are valid, and throws helpful
     errors if not.
@@ -526,10 +555,13 @@ def _compute_reference_correlations(reference_sources, filters_len):
     reference_sources = _zeropad(reference_sources, filters_len - 1, axis=2)
     n_fft = int(2**np.ceil(np.log2(nsampl + filters_len - 1.)))
 
-    try:
-        sf = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(reference_sources, dtype=np.float32), n=n_fft, axis=2))
-    except cupy.cuda.memory.OutOfMemoryError:
-        sf = scipy.fft.rfft(reference_sources, n=nfft, axis=2)
+    if use_cupy:
+        try:
+            sf = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(reference_sources, dtype=np.float32), n=n_fft, axis=2))
+        except cupy.cuda.memory.OutOfMemoryError:
+            sf = scipy.fft.rfft(reference_sources, n=n_fft, axis=2)
+    else:
+        sf = scipy.fft.rfft(reference_sources, n=n_fft, axis=2)
 
     # compute intercorrelation between sources
     G = np.zeros((nsrc, nsrc, nchan, nchan, filters_len, filters_len))
@@ -541,10 +573,15 @@ def _compute_reference_correlations(reference_sources, filters_len):
         2
     ):
         tmp = sf[j, c2] * np.conj(sf[i, c1])
-        try:
-            ssf = cupy.asnumpy(cupyx.scipy.fft.irfft(cupy.asarray(tmp)))
-        except cupy.cuda.memory.OutOfMemoryError:
+
+        if use_cupy:
+            try:
+                ssf = cupy.asnumpy(cupyx.scipy.fft.irfft(cupy.asarray(tmp)))
+            except cupy.cuda.memory.OutOfMemoryError:
+                ssf = scipy.fft.irfft(tmp)
+        else:
             ssf = scipy.fft.irfft(tmp)
+
         ss = toeplitz(
             np.hstack((ssf[0], ssf[-1:-filters_len:-1])),
             r=ssf[:filters_len]
@@ -580,9 +617,12 @@ def _compute_projection_filters(G, sf, estimated_source):
     # compute its FFT
     n_fft = int(2**np.ceil(np.log2(nsampl + filters_len - 1.)))
 
-    try:
-        sef = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(estimated_source, dtype=np.float32), n=n_fft))
-    except cupy.cuda.memory.OutOfMemoryError:
+    if use_cupy:
+        try:
+            sef = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(estimated_source, dtype=np.float32), n=n_fft))
+        except cupy.cuda.memory.OutOfMemoryError:
+            sef = scipy.fft.rfft(estimated_source, n=n_fft)
+    else:
         sef = scipy.fft.rfft(estimated_source, n=n_fft)
 
     # compute the cross-correlations between sources and estimates
@@ -592,9 +632,12 @@ def _compute_projection_filters(G, sf, estimated_source):
         list(range(nsrc)), list(range(nchan)), list(range(nchan))
     ):
         tmp = sf[j, cj] * np.conj(sef[c])
-        try:
-            ssef = cupy.asnumpy(cupyx.scipy.fft.irfft(cupy.asarray(tmp)))
-        except cupy.cuda.memory.OutOfMemoryError:
+        if use_cupy:
+            try:
+                ssef = cupy.asnumpy(cupyx.scipy.fft.irfft(cupy.asarray(tmp)))
+            except cupy.cuda.memory.OutOfMemoryError:
+                ssef = scipy.fft.irfft(tmp)
+        else:
             ssef = scipy.fft.irfft(tmp)
         D[j, cj, :, c] = np.hstack((ssef[0], ssef[-1:-filters_len:-1]))
 
@@ -602,22 +645,32 @@ def _compute_projection_filters(G, sf, estimated_source):
     D = D.reshape(nsrc * nchan * filters_len, nchan)
     G = _reshape_G(G)
 
-    try:
-        D_gpu = cupy.asarray(D, dtype=np.float32)
-        G_gpu = cupy.asarray(G, dtype=np.float32)
-
-        # Distortion filters
+    if use_cupy:
         try:
-            C = cupy.asnumpy(cupy.linalg.solve(G_gpu + eps*cupy.eye(G.shape[0]), D_gpu)).reshape(
-                nsrc, nchan, filters_len, nchan
-            )
-        except np.linalg.linalg.LinAlgError:
-            C = cupy.asnumpy(cupy.linalg.lstsq(G_gpu, D_gpu))[0].reshape(
-                nsrc, nchan, filters_len, nchan
-            )
-    except cupy.cuda.memory.OutOfMemoryError:
+            D_gpu = cupy.asarray(D, dtype=np.float32)
+            G_gpu = cupy.asarray(G, dtype=np.float32)
+
+            # Distortion filters
+            try:
+                C = cupy.asnumpy(cupy.linalg.solve(G_gpu + eps*cupy.eye(G.shape[0]), D_gpu)).reshape(
+                    nsrc, nchan, filters_len, nchan
+                )
+            except np.linalg.linalg.LinAlgError:
+                C = cupy.asnumpy(cupy.linalg.lstsq(G_gpu, D_gpu))[0].reshape(
+                    nsrc, nchan, filters_len, nchan
+                )
+        except cupy.cuda.memory.OutOfMemoryError:
+            try:
+                C = np.linalg.solve(G + eps*np.eye(G.shape[0]), D).reshape(
+                    nsrc, nchan, filters_len, nchan
+                )
+            except np.linalg.linalg.LinAlgError:
+                C = np.linalg.lstsq(G, D)[0].reshape(
+                    nsrc, nchan, filters_len, nchan
+                )
+    else:
         try:
-            C = np.linalg.solve(G + eps*cupy.eye(G.shape[0]), D).reshape(
+            C = np.linalg.solve(G + eps*np.eye(G.shape[0]), D).reshape(
                 nsrc, nchan, filters_len, nchan
             )
         except np.linalg.linalg.LinAlgError:
diff --git a/setup.py b/setup.py
index f990a2d..ed8b086 100644
--- a/setup.py
+++ b/setup.py
@@ -59,6 +59,7 @@
         extras_require={  # Optional
             'dev': ['check-manifest'],
             'tests': ['pytest'],
+            'cupy': ['cupy-cuda114'],
             'docs': [
                 'sphinx',
                 'sphinx_rtd_theme',
diff --git a/tests/test_regression.py b/tests/test_regression.py
index b2b6df4..820be53 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -5,6 +5,8 @@
 import museval
 import numpy as np
 
+#museval.disable_cupy()
+
 
 @pytest.fixture()
 def mus():

From 62698095b167dd0ccd6f68e12cc45dab3f03d4d1 Mon Sep 17 00:00:00 2001
From: Sevag Hanssian <sevagh@protonmail.com>
Date: Tue, 10 Aug 2021 10:04:39 -0400
Subject: [PATCH 07/11] Remove float32 dtypes

Needs float64 or it gets large errors in regression tests
---
 museval/metrics.py       | 6 +++---
 tests/test_regression.py | 2 --
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/museval/metrics.py b/museval/metrics.py
index 34566a8..0a4e687 100644
--- a/museval/metrics.py
+++ b/museval/metrics.py
@@ -557,7 +557,7 @@ def _compute_reference_correlations(reference_sources, filters_len):
 
     if use_cupy:
         try:
-            sf = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(reference_sources, dtype=np.float32), n=n_fft, axis=2))
+            sf = cupy.asnumpy(cupyx.scipy.fft.rfft(cupy.asarray(reference_sources), n=n_fft, axis=2))
         except cupy.cuda.memory.OutOfMemoryError:
             sf = scipy.fft.rfft(reference_sources, n=n_fft, axis=2)
     else:
@@ -647,8 +647,8 @@ def _compute_projection_filters(G, sf, estimated_source):
 
     if use_cupy:
         try:
-            D_gpu = cupy.asarray(D, dtype=np.float32)
-            G_gpu = cupy.asarray(G, dtype=np.float32)
+            D_gpu = cupy.asarray(D)
+            G_gpu = cupy.asarray(G)
 
             # Distortion filters
             try:
diff --git a/tests/test_regression.py b/tests/test_regression.py
index 820be53..b2b6df4 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -5,8 +5,6 @@
 import museval
 import numpy as np
 
-#museval.disable_cupy()
-
 
 @pytest.fixture()
 def mus():

From d527421da3b03477a14f379e92e3c9110f45d24a Mon Sep 17 00:00:00 2001
From: Sevag Hanssian <sevagh@protonmail.com>
Date: Thu, 19 Aug 2021 09:50:16 -0400
Subject: [PATCH 08/11] Remove unnecessary fork description in README

---
 README.md | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/README.md b/README.md
index 439d4de..6f47ca8 100644
--- a/README.md
+++ b/README.md
@@ -6,13 +6,6 @@
 
 A python package to evaluate source separation results using the [MUSDB18](https://sigsep.github.io/musdb) dataset. This package was part of the [MUS task](https://sisec.inria.fr/home/2018-professionally-produced-music-recordings/) of the [Signal Separation Evaluation Campaign (SISEC)](https://sisec.inria.fr/).
 
-### Note on the sevagh fork
-
-My fork adds considerable speedups by using cupy for offloading the fft, ifft, and linalg executions of BSSv4 to the user's NVIDIA GPU. However, this has several downsides:
-* Switching to using GPU arithmetic and float32 changes output values due to numerical precision differences
-* Managing memory is not straightforward, often requiring the user to disable and re-enable the cupy fft cache between runs
-* Installing GPU libraries is a burden on end users, so it's a bigger conversation to merge such a requirement upstream
-
 ### BSSEval v4
 
 The BSSEval metrics, as implemented in the [MATLAB toolboxes](http://bass-db.gforge.inria.fr/bss_eval/) and their re-implementation in [mir_eval](http://craffel.github.io/mir_eval/#module-mir_eval.separation) are widely used in the audio separation literature. One particularity of BSSEval is to compute the metrics after optimally matching the estimates to the true sources through linear distortion filters. This allows the criteria to be robust to some linear mismatches. Apart from the optional evaluation for all possible permutations of the sources, this matching is the reason for most of the computation cost of BSSEval, especially considering it is done for each evaluation window when the metrics are computed on a framewise basis.

From 551d4d99f08ff0cd1f2f13470d5e64110c5436f8 Mon Sep 17 00:00:00 2001
From: Sevag Hanssian <sevagh@protonmail.com>
Date: Wed, 9 Feb 2022 13:53:27 -0500
Subject: [PATCH 09/11] Import sys to correct warning message

---
 museval/metrics.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/museval/metrics.py b/museval/metrics.py
index 0a4e687..33df8aa 100644
--- a/museval/metrics.py
+++ b/museval/metrics.py
@@ -52,6 +52,7 @@
 import itertools
 import collections
 import warnings
+import sys
 
 use_cupy = False
 try:

From cef0f91975f4ed7d9297b2cdb779d57bf25c34a0 Mon Sep 17 00:00:00 2001
From: Sevag Hanssian <sevagh@protonmail.com>
Date: Fri, 10 Feb 2023 08:57:38 -0500
Subject: [PATCH 10/11] Fix aggrate->aggregate typo (nit)

---
 museval/aggregate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/museval/aggregate.py b/museval/aggregate.py
index d414ced..a6e8870 100644
--- a/museval/aggregate.py
+++ b/museval/aggregate.py
@@ -258,7 +258,7 @@ def save(self, path):
 
     def __repr__(self):
         targets = self.df['target'].unique()
-        out = "Aggrated Scores ({} over frames, {} over tracks)\n".format(
+        out = "Aggregated Scores ({} over frames, {} over tracks)\n".format(
             self.frames_agg, self.tracks_agg
         )
         for target in targets:

From e5b49ff1a1460453566da25b8c054cc25a65e964 Mon Sep 17 00:00:00 2001
From: Sevag Hanssian <sevagh@protonmail.com>
Date: Sat, 18 Feb 2023 11:09:02 -0500
Subject: [PATCH 11/11] Patch incorrect use of warn fn

---
 museval/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/museval/metrics.py b/museval/metrics.py
index 33df8aa..20eff98 100644
--- a/museval/metrics.py
+++ b/museval/metrics.py
@@ -60,7 +60,7 @@
     import cupy
     use_cupy = True
 except ImportError:
-    warnings.warn('cupy not available, falling back to regular numpy', file=sys.stderr)
+    warnings.warn('cupy not available, falling back to regular numpy')
 
 # The maximum allowable number of sources (prevents insane computational load)
 MAX_SOURCES = 100