diff --git a/.clang-format b/.clang-format
index 7cf221dc1..5f3510b8c 100644
--- a/.clang-format
+++ b/.clang-format
@@ -104,19 +104,32 @@ IfMacros:
 IncludeBlocks:   Regroup
 IncludeCategories:
   - Regex:           '^<ext/.*\.h>'
-    Priority:        2
+    Priority:        4
     SortPriority:    0
     CaseSensitive:   false
-  - Regex:           '^<.*\.h>'
+  # it's critical that <hip/hip_runtime.h> is included before the
+  # <hip/hip_cooperative_groups.h> header
+  # -> I spent a LOT of time trying to get SortPriority to work properly, and
+  #    I've concluded that SortPriority is probably buggy or the documentation isn't
+  #    accurate (or I'm missing something obvious)
+  - Regex:           '^<hip/hip_runtime.h>$'
     Priority:        1
     SortPriority:    0
+    CaseSensitive:   true
+  - Regex:           '^<hip/.*>$'
+    Priority:        2
+    SortPriority:    0
+    CaseSensitive:   true
+  - Regex:           '^<.*\.h>'
+    Priority:        3
+    SortPriority:    0
     CaseSensitive:   false
   - Regex:           '^<.*'
-    Priority:        2
+    Priority:        4
     SortPriority:    0
     CaseSensitive:   false
   - Regex:           '.*'
-    Priority:        3
+    Priority:        5
     SortPriority:    0
     CaseSensitive:   false
 IncludeIsMainRegex: '([-_](test|unittest))?$'
diff --git a/.github/workflows/compilation_checks.yml b/.github/workflows/compilation_checks.yml
index c9c41f785..fef2d4edf 100644
--- a/.github/workflows/compilation_checks.yml
+++ b/.github/workflows/compilation_checks.yml
@@ -26,7 +26,7 @@ jobs:
         make-type: [hydro, gravity, disk, particles, cosmology, mhd, dust, cooling]
         container:
           #- {name: "CUDA", link: "docker://chollahydro/cholla:cuda_github"}
-          - {name: "HIP", link: "docker://chollahydro/cholla:rocm_github"}
+          - {name: "HIP", link: "ghcr.io/cholla-hydro/cholla-rocm:sha-a08bd2c"}
     env:  # Set environment variables
       CHOLLA_MAKE_TYPE: ${{ matrix.make-type }}
 
diff --git a/builds/make.type.disk b/builds/make.type.disk
index 5de7f1891..568b8990a 100644
--- a/builds/make.type.disk
+++ b/builds/make.type.disk
@@ -22,7 +22,7 @@ DFLAGS += -DGRAVITY_5_POINTS_GRADIENT
 
 DFLAGS    += -DMPI_CHOLLA
 DFLAGS    += -DPRECISION=2
-DFLAGS    += -DPLMC
+DFLAGS    += -DPLMP
 DFLAGS    += -DHLLC
 DFLAGS    += -DVL
 
diff --git a/builds/make.type.starblast b/builds/make.type.starblast
index 582f4cc44..26127c2ae 100644
--- a/builds/make.type.starblast
+++ b/builds/make.type.starblast
@@ -25,7 +25,7 @@ DFLAGS += -DANALYSIS
 DFLAGS    += -DCUDA
 DFLAGS    += -DMPI_CHOLLA 
 DFLAGS    += -DPRECISION=2
-DFLAGS    += -DPLMC
+DFLAGS    += -DPLMP
 DFLAGS    += -DHLLC
 DFLAGS    += -DVL
 
diff --git a/docs/sphinx/Physics/DiskModel.md b/docs/sphinx/Physics/DiskModel.md
new file mode 100644
index 000000000..cd826e895
--- /dev/null
+++ b/docs/sphinx/Physics/DiskModel.md
@@ -0,0 +1,335 @@
+# Disk Model
+## Overview
+
+One of Cholla's flagship applications is Idealized Galaxy Simulations of disk galaxies.
+These simulations can involve choices related to several physical concepts: (i) modeling of gravity, (ii) radiative cooling, and (iii) modeling stellar feedback. These modules are all highly related.
+You also need to consider 2 simulation phases: initial-conditions and evolution over a timestep.
+
+### Primer on Gravitational Potentials
+
+A central ingredient of any galaxy simulation is the physical model of the gravitational potential. This is a complex topic that is relevant in the context of initial conditions **AND** in the context of evolving the simulation over a timestep. The precise treatment of the potential depends on the context **AND** on Cholla's configuration.
+
+It is instructive to first define all possible components of the total potential. Throughout the rest of this page, we will subsequently describe how treatment of these potentials can vary. The total potential is given by: 
+
+:::{math}
+
+\Phi_{\rm tot} = \Phi_{\rm dm} + \Phi_{\rm stars,old} + \Phi_{\rm stars,young} + \Phi_{\rm gas}
+:::
+
+where
+-  {math}`\Phi_{\rm dm}` corresponds to the dark matter halo  (always a static potential)
+-  {math}`\Phi_{\rm stars,old}` corresponds to older stellar populations in the disk (always a static potential)
+- {math}`\Phi_{\rm stars,young}` corresponds to the younger stellar populations (computed with self-gravity from particles)
+- {math}`\Phi_{\rm gas}` corresponds to the gas (computed with self-gravity)
+
+### Primer on Radiative Cooling
+
+At the time of writing, simulations must always be initialized with CIE cooling (cooling shuts off below 1e4 K) in order to prevent immediate collapse of the gas disk.
+
+To use other cooling mechanisms, we currently recommend that you:
+
+- start running the simulation with CIE cooling
+
+- after enough time has passed to start driving turbulence in the gas disk (commonly a few 10s of Myr), you can then restart the simulation with a different cooling recipe.
+
+### Primer on Feedback and Star Formation:
+
+We provide a basic particle-based feedback approach.
+At the moment, all star particles must be initialized at startup (see below).
+In the future, the goal is to introduce self-consistent star formation.
+
+## Configurations
+
+Historically, there have been 3 main physics-module configurations for running an Idealized Galaxy Problem:
+1. Standalone Static Gravity:
+  - In this mode, standalone static gravity is used and particles are **NOT** supported.
+  - Effectively, this mode assumes that {math}`\Phi_{\rm stars,young}=0` and {math}`\Phi_{\rm gas}=0`.
+  - **Purpose:** A simpler model that is easy to customize
+:::{warning}
+This configuration has not been tested for quite a while in the dev branch.
+:::
+
+:::{caution}
+At the time of writing, the logic modelling the static potential is duplicated between the initial conditions and the source-term calculation.
+It is the user's responsibility to ensure that consistent models of the potential are used in both parts of the code.
+:::
+
+2. Normal Self Gravity
+  - use the self-gravity module to model full gravitational potential while updating timesteps.
+  - We provide details about assumptions that the gravity-solver makes to compute contributions from self-gravity [down below](#analytic-self-gravity-estimate)
+  - approximations for the total potential are used during initial conditions
+  - **Purpose:** to model the disk of a galaxy as self-consistently as possible (it's currently unclear whether collapsing gas clouds in simulations without self-consistent star formation are an issue)
+3. Particle-Self-Gravity
+  - uses the self-gravity module to effectively achieve a hybrid between the other 2 modes.
+  - enable this mode by setting the `gravity.gas_only_use_static_grav` runtime parameter to `true`
+  - The gravitational forces are handled differently for star particles and gas:
+    - gravity source terms for the gas are only computed from the static potentials: {math}`(\Phi_{\rm dm} + \Phi_{\rm stars,old})`.
+    - the star particles experience forces from all components {math}`(\Phi_{\rm dm} + \Phi_{\rm stars,old} + \Phi_{\rm stars,young} + \Phi_{\rm gas})`.
+    - This merits some emphasis: star particles do indeed experience gravity from the gas (as well as from themselves) while the gas does **not** experience its own self-gravity. This choice is motivated by the assumptions we make for the gravity-solver to compute contributions from self-gravity [down below](#analytic-self-gravity-estimate)
+  - **Purpose:** this is intended to be an intermediate point between both of the other modes that lets us confidently experiment with a 3-phase ISM without self-consistent star formation.
+
+
+
+
+
+
+
+## Analytic Gravitational Potentials
+
+In this section, we discuss the analytic formulae used for modeling Gravitational Potentials. It may be instructive to look back at our [definitions of various components](#primer-on-gravitational-potentials).
+### Static Potentials
+
+First, let's consider static potentials. 
+
+All configurations of the idealized Galaxy Simulations:
+- currently assume that you are using a static Gravitational Potential produced by 2 components: (i) a component contributed by older stellar populations, {math}`\Phi_{\rm stars,old}`, and (ii) a component contributed by dark matter in the halo, {math}`\Phi_{\rm dm}`.
+- use these potentials for initializations and for evolving the simulation over a timestep.
+
+:::{important}
+[As we noted above](#configurations), when using the standalone Static-Gravity module, the logic implementing the static potential is duplicated between the initial conditions and the source-term calculations; it's the user's responsibility to ensure logic remains consistent in both parts of the code. 
+This is **NOT** a concern when using the Self-Gravity module
+:::
+
+Consider the parameterization of a Miyamoto-Nagai potential (this will be important [again](#analytic-self-gravity-estimate), shortly)
+
+:::{math}
+\Phi_{\rm MN}(R,z; M,a,b)\equiv \frac{-G M}{\sqrt{R^2 + (a + \sqrt{z^2 + b^2})^2}}.
+:::
+
+At the time of writing, we always define {math}`\Phi_{\rm stars,old}(R,z)=\Phi_{\rm MN}(R,z; M_{\rm stars}, R_{\rm stars}, z_{\rm stars})`, where {math}`R` & {math}`z` are cylindrical coordinates, {math}`R_{\rm stars}` & {math}`z_{\rm stars}` are the scale radius/height, and {math}`M_{\rm stars}` is the mass of the disk.
+Currently, these values are hardcoded where we initialize the `ClusteredDiskGalaxy` struct that holds the Milky Way properties (in {repository-file}`src/model/particles/disk_galaxy.cu`).
+
+At the time of writing, {math}`\Phi_{\rm dm}` is always assumed to be an NFW profile, or
+
+:::{math}
+\Phi_{\rm dm}(r) = \frac{-G M_{\rm vir}}{r[\ln(1+c) - c/(1+c)]}\ \ln \left(1 +\frac{r}{R_{\rm dm}}\right),
+:::
+
+where {math}`r=\sqrt{R^2+z^2}` is radius in spherical coordinates, {math}`M_{\rm vir}` is the dark matter mass, {math}`c` is halo concentration, and {math}`R_{\rm dm}` is the scale length of the halo.
+
+
+### Analytic Self-Gravity Estimate
+
+Now we turn our attention to an analytic estimate related to self-gravity. Be aware: when updating a simulation over a timestep, the gravitational forces acting on particles and the fluid are **NEVER** directly computed from this formula.
+
+Specifically, we define {math}`\hat{\Phi}_{\rm gas,disk}(R,z)`. As will [become clear below](#gas-initial-conditions), our gas disk is roughly characterized by a double-exponential density profile 
+
+:::{math}
+\rho_{\rm d}(R,z) = \rho_{{\rm d},0} \exp(-R/R_{\rm d}) \exp(-|z|/z_{\rm d}).
+:::
+
+The potential for this density profile does **NOT** have an analytic form. Instead, we use the tables from [Smith+15](https://ui.adsabs.harvard.edu/abs/2015MNRAS.448.2934S) to define {math}`\hat{\Phi}_{\rm gas,disk}(R,z)` as the superposition of 3 Miyamoto-Nagai disk potentials that approximate the disk potential.
+
+This approximation comes up in 2 cases:
+- During initialization, we assume that {math}`\hat{\Phi}_{\rm gas,disk}(R,z)` is a good approximation for the self gravity of gas disk and star particles when computing circular velocities.
+- In the context of the self-gravity solver, we assume that {math}`\hat{\Phi}_{\rm gas,disk}(R,z)` is a good approximation for the total potential {math}`(\Phi_{\rm gas,disk}(R,z) + \Phi_{\rm gas,halo}(R,z) + \Phi_{\rm stars,young}(R,z))`  at the boundaries of the simulation domain. As discussed on the [[Gravity]] page of the documentation, if this is a bad approximation, then artifacts will arise in the simulation
+
+The primary limitation to this approximation for {math}`\hat{\Phi}_{\rm gas,disk}(R,z)` is that we don't account for the fact that the gas disk is truncated. Truncation has 2 impacts:
+- less importantly: the shape of truncation can be important for the shape of the potential near the truncation radius
+- more importantly: {math}`\hat{\Phi}_{\rm gas,disk}(R,z)` includes the contributions to the gravitational potential from material outside of the simulation domain (if that sounds "wrong" to you, that's because most of your physical intuition comes from spherical symmetry).
+
+## Gas Initial Conditions
+
+The ``"Disk_3D"`` and ``"Disk_3D_particles"`` initial conditions will both initialize the gas disk. Both cases initialize an isothermal gas disk within a halo. The latter also subsequently initializes particles.
+
+Cholla offers machinery to set up an idealized galaxy simulation. Specifically, it initializes the properties of a gas disk inside of a gas halo. The conditions broadly match the description provided by [Schneider & Robertson 2018](https://ui.adsabs.harvard.edu/abs/2018ApJ...860..135S) (there are some extensions when using self-gravity).
+
+Ideally, our initial conditions would be in hydrostatic equilibrium (i.e. the properties of a simulation would not change if we hit "go"). For a handful of reasons (related to the gas in the halo and the boundary conditions), we can't actually achieve such initial conditions, but we pick ICs that are stable.
+
+To accomplish this, it is useful to consider gas in the disk and gas in the halo. Let  {math}`\rho_{\rm disk}(R, z)` and {math}`P_{\rm disk}(R,z)` refer to the axis-symmetric profiles for the disk gas. We define {math}`\rho_{\rm halo}(r)` and {math}`P_{\rm halo}(r)` as the spherical profiles for gas in the halo. We will provide precise definitions for these profiles momentarily. To ensure a smooth transition between disk and halo, we define the simulation's initial density and pressure profiles as 
+
+:::{math}
+:nowrap:
+
+\begin{eqnarray}
+\rho(R,z) = \rho_{\rm disk}(R, z) + \rho_{\rm halo}(\sqrt{R^2+z^2}) \\
+P(R,z) = P_{\rm disk}(R, z) + P_{\rm halo}(\sqrt{R^2+z^2}).
+\end{eqnarray}
+:::
+
+It's instructive to note that while {math}`\rho_{\rm disk}` & {math}`P_{\rm disk}` are both 0 "outside of the disk," {math}`\rho_{\rm halo}` & {math}`P_{\rm halo}` are **always** positive.
+
+At all spatial locations within the gas disk (i.e. where {math}`\rho_{\rm disk}>0`), we introduce a rotational velocity, {math}`v_{\rm rot}`, to support the disk, such that
+
+:::{math}
+
+\frac{v_{\rm rot}^2 (R, z)}{R} = \frac{\partial\Phi_{\rm cyl,est}}{\partial R} + \frac{1}{\rho(R,z)} \frac{\partial P}{\partial R},
+:::
+
+where {math}`\Phi_{\rm cyl,est}` is our best estimate of the total potential (this is defined below).
+
+:::{note}
+Earlier versions of Cholla **_only_** considered the density and pressure of the disk's gas when it computed {math}`v_{\rm rot}` (in other words, it didn't consider gas from the halo).
+:::
+
+### A quick primer on Solving Density Profiles in Hydrostatic Eq
+
+It is instructive to briefly consider the generic solution for density profiles in Hydrostatic Equilibrium. In 1D, hydrostatic equilibrium requires that:
+
+:::{math}
+
+\frac{1}{\rho} \frac{\partial P}{\partial x} + \frac{\partial \Phi}{\partial x} = 0.
+
+:::
+
+In spherical symmetry, we can replace {math}`x` with {math}`r`. If you are considering vertical stratification of a planar "atmosphere", you can replace {math}`x` with {math}`z`.
+
+In all cases we assume {math}`\Phi` is independent of {math}`\rho`. To derive a solution, we need an equation of state (EoS) relating {math}`P` and {math}`\rho`. Solutions are typically derived using the generic polytropic equation of state: {math}`P = K\rho^\Gamma `, where
+- {math}`K` is a constant 
+- {math}`\Gamma` is a constant called the "polytropic index" (sometimes "polytropic index" refers to a separate, related quantity). The EoS is isothermal (temperature is constant) for {math}`\Gamma=1` and isentropic (entropy is constant) for {math}`\Gamma=\gamma`.
+It is instructive to note that {math}`c_s^2 = \partial P / \partial \rho = K \Gamma \rho^{\Gamma - 1} `
+
+The solution is:
+
+:::{math}
+:nowrap:
+
+\begin{eqnarray}
+\rho(x) &=& \rho(x_0) \exp\left(-\frac{\Phi(x) - \Phi(x_0)}{c_s^2}\right)&,&\ \Gamma = 1 \\
+\rho(x) &=& \rho(x_0) \left[ 1-(\Gamma - 1) \frac{\Phi(x) - \Phi(x_0)}{c_s^2(x_0)}\right]^{1/(\Gamma-1)}&,&\ {\rm otherwise}
+\end{eqnarray}
+
+:::
+
+### Halo Gas
+
+Let's consider the halo gas profile.
+
+As noted in [Schneider & Robertson 2018](https://ui.adsabs.harvard.edu/abs/2018ApJ...860..135S), "in setting the halo gas distribution the potential is taken to be spherically symmetric." In other words, the halo gas profile uses a potential called {math}`\Phi_{\rm sph}(r) = \Phi_{\rm stars,old}(R=0,z=r) + \Phi_{\rm dm}(r)` (it **NEVER** includes contributions from self-gravity). We use an isentropic profile:
+
+:::{math}
+
+\rho_{\rm h}(r) = \rho_{\rm h}(r_0)\left[ 1-(\gamma - 1) \frac{\Phi_{\rm sph}(r) - \Phi_{\rm sph}(r_0)}{c_{s,{\rm h}}^2(r_0)}\right]^{1/(\gamma-1)}.
+
+:::
+
+In the above equation
+- {math}`r_0` is the "cooling radius." This effectively is where we choose to normalize the profile. Essentially, the profile is parameterized by specifying {math}`\rho_{\rm h}(r_0)`.
+- {math}`c_{s,{\rm h}}^2(r_0)` is the adiabatic sound speed at the cooling radius, or {math}`c_{s,{\rm h}}(r_0)= \sqrt{\gamma k_B T_{\rm h}(r_0) / (\mu m_{\rm H})}`.
+
+:::{note}
+Earlier versions of the code historically defined {math}`c_{s,{\rm h}}` as the isothermal sound speed. Typically, this means that the profile never reached the associated temperature by {math}`r_0`.
+:::
+
+### Gas Disk Profile
+
+The gas disk is initialized with an isothermal profile. An exponential gas disk surface density profile is {math}`\Sigma(R) = \Sigma_0 \exp(-R / R_{\rm d})`. In practice, we truncate the surface density. The modified surface density equation is given by:
+
+:::{math}
+
+\Sigma(R) = \Sigma_0 \exp(-R / R_{\rm d}) (1 - f(R)),
+:::
+
+where {math}`f(R)` is a modified form of the logistic function, or 
+
+:::{math}
+
+f(R) = 0.5 + 0.5 \tanh\left(\frac{R -R_{\rm trunc}}{2\alpha}\right).
+:::
+
+At this time {math}`R_{\rm trunc}` is computed based on the domain, and {math}`\alpha` is hardcoded (they should probably be runtime parameters).
+
+:::{note}
+Earlier versions accomplished tapering in a slightly different manner
+:::
+
+For fixed {math}`R`, we can solve for a given hydrostatic column through the disk. The solution is governed by the following pair of equations:
+
+:::{math}
+:nowrap:
+
+\begin{eqnarray}
+\Sigma(R) &=& 2\int_{0}^\infty\rho_{\rm disk} (R, z)dz \\
+\rho_{\rm disk}(R, z) &=& \rho_{\rm disk} (R, z=0)\ \exp\left(-\frac{\Phi_{\rm vert}(R,z) - \Phi_{\rm vert}(R,z=0)}{c_s^2}\right)
+\end{eqnarray}
+:::
+
+Recall that the latter equation [is the solution for an isothermal fluid in Hydrostatic equilibrium](#a-quick-primer-on-solving-density-profiles-in-hydrostatic-eq).
+
+#### Ignoring Gas Self-Gravity
+
+When ignoring gas self gravity, these equations are easy to solve since {math}`\Phi_{\rm vert}` is just the analytic equation: {math}`\Phi_{\rm vert}=\Phi_{\rm dm} + \Phi_{\rm stars,old}`. These equations become:
+
+:::{math}
+:nowrap:
+
+\begin{eqnarray}
+\rho_{\rm disk}(R, z=0) &=&  \frac{\Sigma(R)}{2} \left[\int_{0}^\infty\exp\left(-\frac{\Phi_{\rm vert}(R,z) - \Phi_{\rm vert}(R,z=0)}{c_s^2}\right)dz\right]^{-1} \\
+\rho_{\rm disk}(R, z) &=& \rho_{\rm disk} (R, z=0)\ \exp\left(-\frac{\Phi_{\rm vert}(R,z) - \Phi_{\rm vert}(R,z=0)}{c_s^2}\right)
+\end{eqnarray}
+:::
+
+#### Including Disk-Gas Self-Gravity
+
+This case is a little more complex. In this case, {math}`\Phi_{\rm vert} = \Phi_{\rm gas, disk} + \Phi_{\rm dm} + \Phi_{\rm stars,old}`. We describe the {math}`\Phi_{\rm gas, disk}` with the simplified form of Poisson's Equation for an axisymmetric thin disc or
+
+:::{math}
+
+\frac{d^2\Phi_{\rm gas, disk}}{dz^2} = 4 \pi G \rho_{\rm disk}.
+:::
+
+For more details about why it's okay to neglect variations along {math}`R`, see the discussion of equation 14 and Appendix E in [Wang+ 2010](https://ui.adsabs.harvard.edu/abs/2010MNRAS.407..705W/abstract). For convenience, we define {math}`\Phi_{\rm gas, disk}(R,z=0) = 0` (we can do this since we only think about a single {math}`R` at a time).
+
+Let's also define the variable {math}`u` as:
+
+:::{math}
+
+u(R, z)=\int_{0}^z\frac{\rho_{\rm disk} (R, z)}{\rho_{\rm disk} (R, 0)} dz = \int_{0}^z \exp\left(-\frac{\Phi_{\rm vert}(R,z) - \Phi_{\rm vert}(R,z=0)}{c_s^2}\right) dz.
+:::
+
+In other words, {math}`\rho_{\rm disk} (R, 0) = \Sigma(R)/(2 u(R,\infty))`.
+
+**What does that do for us?** We can recast our various equations as:
+
+:::{math}
+:nowrap:
+
+\begin{eqnarray}
+\frac{d}{dz} \frac{d\Phi_{\rm gas, disk}}{dz} &=& 4 \pi G \rho_{\rm disk} (R, 0)\ \exp\left(-\frac{\Phi_{\rm gas, disk}(R,z) + (\Phi_{\rm dm}(R,z)-\Phi_{\rm dm}(R,0)) + (\Phi_{\rm stars,old}(R, z) - \Phi_{\rm stars,old}(R, 0))}{c_s^2}\right) \\
+\frac{d}{dz} \Phi_{\rm gas, disk} &=& \frac{d\Phi_{\rm gas, disk}}{dz} \\
+\frac{d}{dz} u &=& \exp\left(-\frac{\Phi_{\rm gas, disk}(R,z) + (\Phi_{\rm dm}(R,z)-\Phi_{\rm dm}(R,0)) + (\Phi_{\rm stars,old}(R, z) - \Phi_{\rm stars,old}(R, 0))}{c_s^2}\right)
+\end{eqnarray}
+:::
+
+We have written the equations in this form to highlight that this is an "initial value problem." To solve for the {math}`\Phi_{\rm gas, disk}(R,z)` and the density profile (again, this is all at fixed {math}`R`), we follow this procedure:
+1. Start with an initial guess for {math}`\rho_{\rm disk} (R, 0)`
+2. Use our guess for {math}`\rho_{\rm disk} (R, 0)` to integrate the system of 3 differential equations from {math}`z = 0` out to a "large z" (where {math}`\rho_{\rm disk}` is essentially 0).
+   - we know that {math}`\frac{d\Phi_{\rm gas, disk}}{dz}(R,z=0) = 0` (since it's an extremum), {math}`\Phi_{\rm gas, disk}(R,z=0)=0` (we picked this earlier) and {math}`u(R,z=0)=0` (by definition)
+   - this directly gives us {math}`\frac{d\Phi_{\rm gas, disk}}{dz}(R,z)`, {math}`\Phi_{\rm gas, disk}(R,z)`, and {math}`u(R,z)`
+3. We compute a new guess for {math}`\rho_{\rm disk} (R, 0)` from {math}`\Sigma(R) / (2 * u(R, \infty))`
+4. If our new guess was "close enough" to our previous guess, we're done. Otherwise, go back to step 2 using our newest guess.
+
+## Particle ICs
+
+Particles are only initialized if you use the ``"Disk_3D_particles"`` initial conditions.
+
+We use the Kennicutt–Schmidt law to determine the distribution of particles with respect to {math}`r_{\rm cyl}`
+
+:::{math}
+
+\Sigma_{SFR}(R) = a * \Sigma_{\rm gas}^{k_s},
+:::
+
+where `a` is some arbitrary normalization constant and {math}`k_s` is usually 1.4. We can combine this with the formula for the gas surface density, {math}`\Sigma_{\rm gas}(R) = \Sigma_{{\rm gas},0} \exp(-R / R_{\rm d})`, to get a more detailed formula for star-formation surface density:
+
+:::{math}
+
+\Sigma_{\rm SFR}(R) = a * \Sigma_{{\rm gas},0}^{k_s} * \exp(-k_s\, r_{\rm cyl} / R_{\rm gas-scale-length}).
+:::
+
+Essentially, we can use this to compute the incremental rate of star-formation {math}`d{\rm SFR}` in the disk between {math}`r_{\rm cyl}` and {math}`(r_{\rm cyl} + dr_{\rm cyl})`. If we consider some duration of time (in practice set by the `tout` runtime parameter), you can work out an expected number of stars formed between {math}`r_{\rm cyl}` and {math}`(r_{\rm cyl} + dr_{\rm cyl})` during that duration. It's straight-forward to convert this function into a PDF.
+
+:::{note}
+I can definitely elaborate more -- I found the relevant notes on this topic
+:::
+
+At startup we sample this PDF to determine the {math}`r_{\rm cyl}` at which all particles are expected to form. We create star-particles at these radii and distribute the "turn-on times" from a time a little before we start the simulation until the time specified by the `tout` parameter. We use Poisson sampling to distribute these "turn-on times," to target a particular SFR.
+At the time of writing, the SFR is hardcoded within the `disk_stellar_cluster_init_` C++ function (in {repository-file}`src/particles/particles_3D.cpp`).
+
+:::{todo}
+Adjust the location where SFR is set to be co-located with other parameters.
+(Ideally, it would be a runtime parameter)
+:::
+
+
+
diff --git a/docs/sphinx/Physics/Dual-Energy-Formalism.md b/docs/sphinx/Physics/Dual-Energy-Formalism.md
index 01f0b1e2b..684c7f458 100644
--- a/docs/sphinx/Physics/Dual-Energy-Formalism.md
+++ b/docs/sphinx/Physics/Dual-Energy-Formalism.md
@@ -12,7 +12,7 @@ Problems arise in simulations with large Mach numbers (such as cosmological simu
 
 The solution to this numerical problem is to use the "dual-energy formalism" (more details are provided in [Bryan+ 2014](https://ui.adsabs.harvard.edu/abs/2014ApJS..211...19B)). The core idea is to track an extra separately-advected "thermal energy" field at each cell-location, in addition to the total energy field and use this "thermal energy" field in cases where {math}`(E - E_{\rm thermal})` provides insufficient precision. 
 
-The dual-energy formalism is parameterized by two parameters, {math}`\eta_1` and {math}`\eta_2`. It's easiest to understand their meaning by discussing how they are used. The [Bryan+2014](https://ui.adsabs.harvard.edu/abs/2014ApJS..211...19B) paper describes two main steps:
+The dual-energy formalism is parameterized by two parameters, {math}`\eta_1` and {math}`\eta_2`. It's easiest to understand their meaning by discussing how they are used. The [Bryan+ 2014](https://ui.adsabs.harvard.edu/abs/2014ApJS..211...19B) paper describes two main steps:
 1. During a given timestep, when we want to compute thermal pressure, we compare quotient of the "thermal energy" field divided by {math}`E` to {math}`\eta_1`.
     - When the ratio is smaller than {math}`\eta_1` we use the "thermal energy" field. When it exceeds {math}`\eta_1`, we use {math}`(E-E_{\rm kinetic})`.
     - In effect, {math}`\eta_1` directly parameterizes the precision where the dual-energy formalism kicks in.
@@ -21,7 +21,7 @@ The dual-energy formalism is parameterized by two parameters, {math}`\eta_1` and
     - To motivate this step, it's important to understand that when we separately advect the "thermal energy" and add the {math}`-p(\nabla \cdot {\bf v})\Delta t/ \rho` source term, we are effectively ignoring the effects of shock heating.
     - Consequently, we might want to overwrite the "thermal energy" to capture the effects of shock heating. 
     - The precise condition that dictates when we overwrite the "thermal energy" field involves a comparison of {math}`\eta_2` and the values in neighboring cells. When {math}`\eta_2` is too high, we would effectively exclude shock-heating from weaker shocks. When {math}`\eta_2` is too low we may include spurious heating that is introduced by the truncation error of {math}`(E-E_{\rm kinetic})`.
-    - **NOTE:** [Bryan+2014](https://ui.adsabs.harvard.edu/abs/2014ApJS..211...19B) call this step "synchronization" - we find that name somewhat confusing since it may imply a bidirectional update (updating both "thermal energy" and {math}`E`).
+    - **NOTE:** [Bryan+ 2014](https://ui.adsabs.harvard.edu/abs/2014ApJS..211...19B) call this step "synchronization" - we find that name somewhat confusing since it may imply a bidirectional update (updating both "thermal energy" and {math}`E`).
 
 In practice, Cholla does something slightly different:
 1. It implements step 1 exactly as described above.
diff --git a/docs/sphinx/Physics/Feedback.md b/docs/sphinx/Physics/Feedback.md
index 7e8f3f6db..520ce6d95 100644
--- a/docs/sphinx/Physics/Feedback.md
+++ b/docs/sphinx/Physics/Feedback.md
@@ -1,9 +1,168 @@
 # Particle-based Feedback
 
-Models supernova (SN) feedback from star cluster particles as a Poisson processes following the prescription in [Kim & Ostriker (2015)](https://ui.adsabs.harvard.edu/abs/2015ApJ...815...67K/abstract). Energy or momentum is injected into the interstellar medium depending on whether the SN is sufficiently numerically resolved.
+This page covers particle-based feedback. Essentially particles are used to model star-clusters and periodically they are scheduled to inject mass/material/energy as feedback into a fluid field. When an event occurs, mass is removed from a star cluster. The cells of the simulated grid that are modified are known as the event's "stencil."
 
-## Required Compilation Flags
-* `SUPERNOVA` - Supernova rate (SNR) information from a Starburst99 generated *.snr file can be read in by specifying the path as the value of the `snr_filename` parameter.  If this parameter is not set, then a default constant SNR is used.  The default SNR corresponds to 1 supernova per {math}`100~\mathrm{M}_\odot` of cluster mass, spread out over 36 Myr, starting when the cluster is 4 Myr old.  A sample Starburst99 file is included in the source code at `src/particles/starburst99_snr.txt`.  The sample represents a {math}`10^6~\mathrm{M}_\odot` fixed mass cluster, created using a Kroupa initial mass function, and with an {math}`8~\mathrm{M}_\odot` supernova cutoff.
+At this time, we only support supernovae. (Foundations for stellar-wind feedback can also be found in the codebase, but significant work is needed to fully implement it).
+
+## Enabling Particle-Based Feedback
+At this time, the user needs to define the `FEEDBACK` macro (we can convert that to a runtime parameter with minimal effort).
+
+The user is also responsible for defining the following macros to properly configure the particles:
 * `PARTICLES_GPU` - Particle-based feedback requires that the particle data be on the GPUs.
 * `PARTICLE_AGE` - Feedback varies with cluster age.
-* `PARTICLE_IDS` - Cluster IDs are used to prevent possible correlations or biases when generating random numbers used by the feedback algorithm.
\ No newline at end of file
+* `PARTICLE_IDS` - Cluster IDs are used to prevent possible correlations or biases when generating random numbers used by the feedback algorithm.
+
+## Runtime Parameters
+At the time of writing, the runtime parameters for configuring feedback are:
+- {par:param}`feedback.boundary_strategy`
+- {par:param}`feedback.snr_filename`
+- {par:param}`feedback.sn_model`
+- {par:param}`feedback.sn_rate`
+
+You can find a more complete description of these parameters {ref}`here <Reference-Feedback-Runtime-Params>`.
+
+## Supernovae Rate 
+
+This is controlled by the {par:param}`feedback.sn_rate` parameter.
+This topic is split into 2 parts: (i) {ref}`general-rate <general-SNe-rate>` and (ii) [handling events from neighboring particles](#handling-events-from-neighboring-particles).
+
+(general-SNe-rate)=
+### General Rate
+
+Setting {par:param}`feedback.sn_rate` to `"immediate_sn"` is primarily used for testing-purposes.
+
+This discussion assumes that you set the {par:param}`feedback.sn_rate` parameter to `"table"`. Our treatment of the SN rate is inspired by [Kim & Ostriker (17)](https://ui.adsabs.harvard.edu/abs/2017ApJ...846..133K/abstract). For clusters with a mean age bounded by {math}`[t_m, t_m+dt)`, we consider the specific supernovae rate (i.e. the number of supernovae, {math}`\mathcal{N}_{\rm SN}`, per cluster mass {math}`M_{\rm cl}`)
+{math}`\xi_{\rm SN}(t_{\rm m}) = \frac{d}{dt}\left(\frac{\mathcal{N}_{\rm SN}}{M_{\rm cl}}\right).`
+For a star-cluster that currently has mass {math}`M_{\rm cl}^\prime` and mean age {math}`t_{\rm m}^\prime`, we draw the number of supernovae in the cluster over a global simulation timestep {math}`\delta t` from a Poisson distribution with expected value {math}`(\delta t M_{\rm cl}^\prime\xi_{\rm SN}(t_{\rm m}^\prime))`.  This treatment
+- means that a given cluster can have multiple SNe during a single timestep (this is reflected by our prescriptions)
+- clearly makes some assumptions (e.g. {math}`\xi_{\rm SN}(t_{\rm m})` is roughly constant over {math}`\delta t`, changes in {math}`M_{\rm cl}` during a timestep don't dramatically impact the probability of subsequent events)
+
+As in [Kim & Ostriker (17)](https://ui.adsabs.harvard.edu/abs/2017ApJ...846..133K/abstract), we use the table of results from `STARBURST99` for a fully sampled Kroupa IMF to infer {math}`\xi_{\rm SN}(t_{\rm m})`. In practice, Cholla reads in the `STARBURST99` results from the file specified by the {par:param}`feedback.snr_filename` parameter to pre-tabulate {math}`\xi_{\rm SN}(t_{\rm m})` at simulation startup.
+
+:::{important}
+We seed the PRNG to try to make a given simulation deterministic.
+If you use exactly the same version of the code, in the exact same configuration, the supernova rate *should* remain consistent (useful in the context of restarts and debugging).
+However, note that seeding depends on particle ids and the number of simulation timesteps (from t=0).
+Additionally, the algorithms for sampling a Poisson distribution may vary between CUDA & HIP.
+:::
+
+### Handling events from neighboring particles
+
+In this subsection, we address the question:
+
+> "How do we handle the scenario when 2 or more particles are scheduled to undergo feedback during the same timestep and have overlapping stencils?" 
+
+This is an **extremely** important question for a code where separate threads are applying feedback for separate particles to a single block in parallel (it's also relevant in any code if a single feedback stencil is allowed to modify cells in multiple blocks).  A naive implementation will lead to race-conditions.
+
+**Our solution:** we "sequence" the supernova events within a single timestep based on particle id. Essentially when multiple particles have events that can modify common cells, we sequentially apply feedback (the order is dictated by particle id). This behavior is simple and deterministic, and importantly it's well-defined in arbitrarily complex scenarios (e.g. consider a small cluster of 5 particles with overlapping stencils, but there are no cells that are overlapped by more than 3 stencils).
+
+:::{note}
+While this "sequencing" solution isn't very "physical," it only makes a practical difference when stencils overlap.
+If it is coming into play frequently in a given simulation, we may want to consider alternatives, like shorter timesteps or subcycling (after all, the probability that 2 nearby star clusters would have a supernova at exactly the same time is extremely low)
+:::
+
+The main alternative that we could revisit is implementing the prescriptions as atomic operations. If we want to adopt this solution in the future, there are a number of important considerations that need to be addressed. These details are highlighted in the next subsection.
+
+
+#### ASIDE: Important considerations for atomic "conflict-resolution"
+
+Essentially we walk through 5 increasingly complex scenarios:
+1. **Pure thermal energy-injecting prescription:** if all SN events **_only_** inject energy this is easy to accomplish with atomics.
+2. **Thermal Energy and Mass injection:** is more complex if only because kinetic energy density scales linearly with mass density while thermal energy density is independent of mass density.
+   - it becomes more complex if you modify the gas momentum to account for the fact that you're injecting the mass in the particle's reference frame.
+   - In this scenario, you need to (i) subtract off the kinetic from the entire total energy density field before anything else, (ii) atomically modify the fluid fields, (iii) add kinetic energy (using the new values) back to the total energy density field
+3. **Thermal Energy, Momentum, and Mass injection:** in practice this isn't any more complex than the last case (the same strategy applies)
+4. **Dynamically adjusting the prescription based on local conditions:** theoretically you can determine the kind of prescription each particle will use first, and then apply the same procedure that was used in the last 2 cases. In practice, this may not be sensible (e.g. if deciding between resolved vs unresolved)
+5. **Prescriptions that can average values within stencils in addition to injections:** I don't think there is a robust, straight-forward way to make this work with atomics (that also scales to arbitrarily complex scenarios)
+
+(SNe-Prescription-Descriptions)=
+## Prescription
+
+This section describes the actual prescriptions. First we offer some broad background context. Then, we offer a high-level description of the available prescriptions. Subsequently, we provide additional detail about the stencils that are actually used and the magnitudes of the injected quantities.
+### Background Context
+
+Broadly speaking, it's useful to introduce the distinction between a "resolved" prescription that primarily injects thermal energy, and an "unresolved" prescription that primarily injects momentum. In slightly more detail:
+- a simulation with infinite resolution would always use "resolved" prescriptions that inject thermal energy.
+- Injecting thermal energy is problematic at coarser resolutions because overcooling prevents us from resolving a supernova remnant's evolution. Conventional wisdom holds that it's better to simply inject the final momentum that we expect the remnant to produce after the remnant would have enough time to evolve to spatial scales that are large enough to be resolved.
+- both kinds of prescriptions may inject mass. Properly accounting for a star-particle's reference frame means that mass injection implicitly involves some level of modifying the momentum of the gas.
+
+It is also possible to come up with other schemes that inject both thermal energy and momentum  (as in [Kim & Ostriker 17](https://ui.adsabs.harvard.edu/abs/2017ApJ...846..133K/abstract)).
+
+:::{note}
+In the context of Cholla, we adopt slightly more precise definitions of "resolved" and "unresolved."
+:::
+
+## High-Level Description of Prescriptions in Cholla:
+
+At the time of writing, Cholla adopts the following definitions of "resolved" and "unresolved" feedback:
+- resolved feedback always injects thermal energy and transfers mass.
+- unresolved feedback is a 2-step prescription that first overrides the density and momentum for the full cells that have at least partial overlap with the stencil with the average value (across the region being overwritten). Then, the final momentum is injected and mass is transferred.
+In both cases, we properly account for the particle's reference frame (when transferring mass and injecting momentum). We also propagate changes to total energy density.
+
+Presently, Cholla supports pure resolved-feedback prescriptions and hybrid-prescriptions. In a hybrid prescription:
+- we compute the total mass in all cells that at least partially overlap with the stencil and divide it by the total volume of those cells to get an average mass density. We convert that to a number density {math}`n_0`.
+- We can plug this into {math}`r_{\rm sf} = 22.6\, {\rm pc}\ N_{\rm SN}^{0.29} n_0^{-0.42}` to get the radius of shell formation. This comes from equation 8 of [Kim & Ostriker (15)](https://ui.adsabs.harvard.edu/abs/2015ApJ...802...99K/abstract) (the {math}`N_{\rm SN}^{0.29}`  term comes from replacing {math}`E_{51}^{0.29}`)
+- When {math}`r_{\rm sf} > 3\Delta x`, we use the resolved prescription. Otherwise, we use the unresolved prescription
+
+Here we consider the values of {par:param}`feedback.sn_model`. There are currently a couple of flavors (some of them are for experimental purposes). Some flavors are just resolved, while others are hybrid prescriptions. The stencils also vary [(we describe the stencils below)](#stencil-descriptions).
+
+- {par:param}`feedback.sn_model` = `"resolvedCiC"`
+  - kind: resolved feedback
+  - stencil: we use standard 8-cell Cloud-in-Cell Interpolation
+- {par:param}`feedback.sn_model` = `"resolved27cell"`
+  - kind: resolved feedback
+  - stencil: we use a spherical 27-cell stencil, with supersampling to calculate overlap
+  - **EXPERIMENTAL**
+- {par:param}`feedback.sn_model` = `"legacy"`
+  - kind: hybrid
+  - stencil: resolved feedback uses standard 8-cell Cloud-in-Cell (it's **exactly** like `"resolvedCiC"`).
+    Unresolved feedback uses a legacy 27-cell stencil based on CiC Interpolation (i.e. the stencil is effectively a cube)
+  - **IMPORTANT:** please see the description of the momentum stencil before you pick this choice.
+- {par:param}`feedback.sn_model` = `"legacyAlt"`
+  - kind: hybrid
+  - stencil: resolved feedback uses standard 8-cell Cloud-in-Cell (it's **exactly** like `"resolvedCiC"`). Unresolved feedback uses a spherical 27-cell stencil, with supersampling to calculate overlap
+  - **EXPERIMENTAL:** (this is probably a better choice than `"legacy"` since the momentum-stencil is better defined)
+
+:::{todo}
+We may want to drop a few unneeded models, and rename some options
+:::
+
+### Stencil Descriptions
+
+The stencil is tied to the way we distribute the source terms among cells from a feedback event.
+- as part of a stencil calculation, we might calculate the fraction of a cell's volume that is enclosed within the stencil-volume
+- when it comes to mass, we usually try to inject a constant amount of mass (or thermal energy) per unit volume throughout the stencil's volume.
+- momentum is trickier since it's a vector (with 3 components). In general, we try to (i) distribute momentum density as evenly as possible and (ii) ensure that we don't introduce any net momentum (in the reference frame of the source star-cluster). But there are some thorny questions (that we won't directly address):
+  - "what do we do when there is cancellation?" (obviously less of an issue for a larger stencil)
+  - "do we want the magnitude of the momentum vector to be constant?" Presumably, the answer is yes, but "what about non-spherical deposition regions?"
+
+Types of Stencils:
+- Cloud-in-Cell Interpolation:
+  - essentially, we treat the deposition volume as a cube with side-length {math}`\Delta x` (the volume is centered on the particle). For context, a sphere with radius {math}`\Delta x / 2` is about half the volume.
+  - This is way too small for dealing with momentum
+- Legacy-27 stencil based on CiC
+  - in broad strokes, the idea is to "divid[e] a scalar quantity between 3x3x3 cells is done by imagining a 2x2x2 cell volume around the SN"
+  - it's unclear how the vector deposition strategy was derived (the original author can't figure out how he arrived at the answer)
+  - In this case, the deposition region is essentially a cube where each side has a width {math}`2\Delta x`
+- spherical 27-cell stencil, with supersampling to calculate overlap
+  - essentially, we treat the deposition volume as a sphere with radius  {math}`\Delta x`. We compute the fractional overlap of cells with the sphere using supersampling. 
+  - momentum-deposition is based on estimating the integral of the radial unit-vector over a cell.
+
+Future work may want to consider the use of larger stencils.
+### Magnitude Description
+
+Every supernova transfers {math}`10 M_\odot` from the star particle.
+
+Resolved feedback injects  {math}`E_{\rm SN}=10^{51}\ {\rm erg}` of thermal energy
+
+:::{NOTE}
+ASIDE: In our simulations that inject {math}`2\times10^{51} {\rm erg}`, I'm pretty sure that I simply modified the hardcoded value.
+Because of the way that the code was originally written I didn't realize that the equations for {math}`r_{\rm sf}` and {math}`p_{\rm final}` implicitly hid their injection-energy dependence.
+Thus, in those simulations I don't think we accounted for that energy dependence.
+:::
+
+At the time of writing, unresolved prescriptions use a final momentum of {math}`p_{\rm final}=2.8\times 10^5 M_\odot\ {\rm km}\ {\rm s}^{-1} n_0^{-0.17} N_{\rm SN}^{0.93}`
+-  everything but the {math}`N_{\rm SN}^{0.93}`  term comes from equation 34 of [Kim & Ostriker (15)](https://ui.adsabs.harvard.edu/abs/2015ApJ...802...99K/abstract), which is a direct fit to a number of different simulations of a supernova remnant ([Kim & Ostriker 17](https://ui.adsabs.harvard.edu/abs/2017ApJ...846..133K/abstract) also use this formula).
+- the {math}`N_{\rm SN}^{0.93}`  term comes from the {math}`E_{51}^{0.93}` term in equation 17 of [Kim & Ostriker (15)](https://ui.adsabs.harvard.edu/abs/2015ApJ...802...99K/abstract).
+- In this equation, Cholla plugs in the value of {math}`n_0` that was computed while determining if a supernova is resolved. I believe that Cholla's choice of mean molecular weight differs from [Kim & Ostriker 15](https://ui.adsabs.harvard.edu/abs/2015ApJ...802...99K/abstract) and [Kim & Ostriker 17](https://ui.adsabs.harvard.edu/abs/2017ApJ...846..133K/abstract).
+
diff --git a/docs/sphinx/Physics/Gravity.md b/docs/sphinx/Physics/Gravity.md
index 11e62bc7d..d944761e4 100644
--- a/docs/sphinx/Physics/Gravity.md
+++ b/docs/sphinx/Physics/Gravity.md
@@ -1,6 +1,24 @@
-# Static Gravity
+# Gravity
+## Overview
 
-Static gravity is activated using the ```STATIC_GRAV``` macro, and is used in several of the example problems provided with Cholla. Static gravity is a simple prescription that does not require any other gravity flags, but does require the input file parameter "custom_grav" to specify the analytic function that will be applied (dev branch only). Static gravity is applied as momentum and energy source terms in [src/hydro/hydro_cuda.cu](https://github.com/cholla-hydro/cholla/blob/dev/src/hydro/hydro_cuda.cu) and the analytic functions are defined in [src/gravity/static_grav.h](https://github.com/cholla-hydro/cholla/blob/dev/src/gravity/static_grav.h). As of 10-27-2023 on the main branch, the static gravitational field is hard-coded to provide a Milky Way-like model or an M82-like model. On the dev branch, the input parameter flags correspond to:
+Cholla primarily implements 2 gravity-related modules:
+- [Standalone Static Gravity](#static-gravity)
+- [Self-Gravity](#self-gravity-fft-based)
+
+:::{note}
+At the time of writing, these 2 modules are incompatible with each other.
+If you want to consider a static potential with the self-gravity solver, you currently need to write separate machinery for the self-gravity solver.
+Honestly, this seems to be a historical artifact that would be straight-forward to fix (most other simulation codes treat static and self-gravity separately)
+:::
+
+Historically, Cholla also supported a SOR-based Self-Gravity, but that has not been maintained for a long time (and may be removed in the future)
+
+## Static Gravity
+
+Static gravity is activated using the ``STATIC_GRAV`` macro, and is used in several of the example problems provided with Cholla.
+Static gravity is a simple prescription that does not require any other gravity flags, but does require the input file parameter ``custom_grav`` to specify the analytic function that will be applied (dev branch only).
+Static gravity is applied as momentum and energy source terms in [src/hydro/hydro_cuda.cu](https://github.com/cholla-hydro/cholla/blob/dev/src/hydro/hydro_cuda.cu) and the analytic functions are defined in [src/gravity/static_grav.h](https://github.com/cholla-hydro/cholla/blob/dev/src/gravity/static_grav.h).
+As of 10-27-2023 on the main branch, the static gravitational field is hard-coded to provide a Milky Way-like model or an M82-like model. On the dev branch, the input parameter flags correspond to:
 
 **1D:**
 * 1: a MW-like Miyamoto-Nagai disk + NFW halo potential (assumed z = 0)
@@ -16,33 +34,125 @@ Static gravity is activated using the ```STATIC_GRAV``` macro, and is used in se
 * 2: An M82-like Miyamoto-Nagai disk + NFW halo potential
 
 
+
+
 ## Self Gravity: FFT-based
 
-In addition to static gravity, Cholla has an FFT-based self gravity solver. Only one or the other may be used. The self-gravity solver is turned on with the ```GRAVITY``` macro in the makefile. The default behavior in the [make.type.gravity](https://github.com/cholla-hydro/cholla/blob/dev/builds/make.type.gravity) build (and builds that depend on it) is also to turn on the ```GRAVITY_GPU``` macro, which ensures that gravity fields reside on the GPU (required for gpu-based MPI communications), and the ```PARIS``` macro, which specifies that the Poisson solve will be carried out on the GPU by the cuFFT or rocFFT libraries. Cholla does also have CPU-based gravity solvers, although they are not currently maintained. Definitions of other macros options associated with the gravity solver are given below.
+In addition to static gravity, Cholla has an FFT-based self gravity solver.
+Only one or the other may be used.
+The self-gravity solver is turned on with the ``GRAVITY`` macro in the makefile.
+The default behavior in the [make.type.gravity](https://github.com/cholla-hydro/cholla/blob/dev/builds/make.type.gravity) build (and builds that depend on it) is also to turn on the ``GRAVITY_GPU`` macro, which ensures that gravity fields reside on the GPU (required for gpu-based MPI communications), and the ``PARIS`` macro, which specifies that the Poisson solve will be carried out on the GPU by the cuFFT or rocFFT libraries.
+Cholla does also have CPU-based gravity solvers, although they are not currently maintained.
+Definitions of other macro options associated with the gravity solver are given below.
+
+In general, this module relies on a particle-mesh scheme. Broadly speaking, for each timestep Cholla:
+- constructs a total density field that includes contributions of all gravitating dynamic mass. This typically includes the mass density of gas and the mass from particles.[^1]
+   - this density field **NEVER** includes mass from a static background gravitational potential
+- passes this total density field to the Gravity solver to compute the associated Gravitational Potential
+- subsequently uses this information (and potentially information about a static background potential) to account for gravitational forces
+
+## Macro flags associated with self-gravity:
+
+``GRAVITY``: Turns on self-gravity. Necessary for particle-only simulations.
+
+``GRAVITY_GPU``: Specifies that fields required by gravity are allocated on the GPU.
+
+``PARIS``: Use the Paris 3D GPU-based Poisson solver to calculate the gravitational potential on a periodic domain.
+
+``GRAVITY_5_POINTS_GRADIENT``: Use a 5-point stencil to calculate the gradient of the potential for gravity source terms (default behavior is a 3-point stencil)
+
+``GRAVITY_ANALYTIC_COMP``: Add an analytic component to the gravitational potential. As of 10-27-2023, this is hard-coded to a Milky Way galaxy model in the function `Setup_Analytic_Potential` from [gravity_functions.cpp](https://github.com/cholla-hydro/cholla/blob/dev/src/gravity/gravity_functions.cpp). 
+
+``PARIS_3PT``: Use a 3-point gradient for the divergence operator approximation in Paris (default behavior is to use a spectral method)
+
+``PARIS_5PT``: Use a 5-point gradient for the divergence operator approximation in Paris
+
+``PARIS_GALACTIC``: Use the Paris Poisson solver on a domain with analytic boundaries set to match the selected model in the DiskGalaxy class.
+As of 10-27-2023, this is hard-coded to a Milky Way galaxy model in the function `Compute_Gravitational_Potential` from [gravity_functions.cpp](https://github.com/cholla-hydro/cholla/blob/dev/src/gravity/gravity_functions.cpp) and in `Compute_Potential_Isolated_Boundary` from [gravity_boundaries.cpp](https://github.com/cholla-hydro/cholla/blob/dev/src/gravity/gravity_boundaries.cpp).
+
+``PARIS_GALACTIC_3PT``: Same as ``PARIS_3PT``, but for the analytic boundary version (``PARIS_GALACTIC``)
+
+``PARIS_GALACTIC_5PT``: Same as ``PARIS_5PT``, but for the analytic boundary version (``PARIS_GALACTIC``)
+
+``PARIS_GALACTIC_TEST``: Turn on to test whether Paris returns the same gravitational potential as the SOR solver.
+Doesn't work with ``GRAVITY_GPU``, should probably be deprecated.
+
+### Runtime Parameters associated with Static Gravity
+
+At the time of writing, the only runtime parameter is {par:param}`gravity.gas_only_use_static_grav`
+
+
+### Boundary Conditions: Context (and the trivial-case)
+
+*General Background:* Boundary conditions are a messy part of any gravity solver. In general, the choice to use a method that solves gravity in Real space or Fourier space introduces complexity for handling periodic or non-periodic boundaries. As a general rule of thumb:
+- real-space gravity solvers (e.g. tree-methods/multipole-methods) handle non-periodic boundaries trivially.
+  They commonly resort to using Ewald summation for periodic boundaries.
+- Because Fourier-space fundamentally represents real-space fields in terms of periodic basis functions, Fourier-space solvers find the scenarios with periodic boundaries to be relatively trivial.
+  Non-periodic boundaries are much more tricky to handle.
+
+Cholla's FFT-solver very much adheres to this "rule of thumb." Consequently, it trivially handles simulations with periodic boundaries (e.g. cosmological simulations).
+We discuss the non-periodic case below.
+
+## Non-Periodic Boundary Strategy
+
+Before digging in, we briefly introduce some (fairly standard) notation:  {math}`{\bf x}` denotes a spatial position, {math}`\rho({\bf x})` is a mass density field in the simulation (in this particular discussion, we include contributions from gas **AND** particles), {math}`{\bf g}({\bf x})` is the gravitational acceleration, and {math}`\phi({\bf x})` is the gravitational potential
+
+### The Strategy's Ingredients
+Our solution for the non-periodic case involves 2 ingredients: a modified scheme for the Poisson-solve and a static estimate for the potential produced by {math}`\rho_{\rm tot}`
+
+1) **Modified scheme for Poisson-Solve:** This scheme makes use of Discrete Sine Transforms (implemented in terms of FFT machinery).
+Instead of requiring periodic boundaries, this variant requires the following boundary conditions:
+    1. {math}`{\bf g}({\bf x}_{\rm bound}) = {\bf 0}`, or equivalently {math}`{\bf \nabla} \phi \rvert_{{\bf x}_{\rm bound}} = {\bf 0}`
+    2. {math}`\rho({\bf x}_{\rm bound}) = 0`, or equivalently {math}`\nabla^2\phi\rvert_{{\bf x}_{\rm bound}} = 0`
+    While this modified-scheme no longer requires periodic boundary conditions, it clearly is **NOT** the full solution (only pathological scenarios can satisfy condition 1).
+
+2) **A static estimate for the gravitational potential produced by {math}`\rho_{\rm tot}`**: In more detail, we actually need a static estimate for the density-potential pair
+   - Added context: in case you aren't familiar with the concept, every valid gravitational potential profile is associated with a unique density profile. Thus we describe the pair of profiles as a "density-potential" pair [^density-potential-pair]
+   - Let's denote these quantities as {math}`\rho_{\rm estimate}({\bf x})` and {math}`\phi_{\rm estimate}({\bf x})`
+   - At the time of writing, the solver requires {math}`\rho_{\rm estimate}({\bf x})` and {math}`\phi_{\rm estimate}({\bf x})` to have analytic formulae. (However, the implementation could be generalized to use numerically computed profiles)
+
+### Tying things together:
+
+Let's briefly review the "inputs" and "outputs":
+- at startup, the gravity solver is initialized so that it "knows" {math}`\rho_{\rm estimate}({\bf x})` and {math}`\phi_{\rm estimate}({\bf x})`
+- during each timestep Cholla provides the solver with the density-field of the dynamical mass {math}`\rho_{\rm tot}({\bf x})` and expects it to output the associated potential {math}`\phi_{\rm tot}({\bf x})`
 
-Macro flags associated with self-gravity:
+We now describe the solver's procedure:
+1. Compute {math}`\rho_{\rm offset}({\bf x})=\rho_{\rm tot}({\bf x}) - \rho_{\rm estimate}({\bf x})`
+2. Use the Poisson-solve to compute the gravitational potential {math}`\phi_{\rm offset}\left({\bf x}\right)` from {math}`\rho_{\rm offset}\left({\bf x}\right)`.
+3. Compute {math}`\phi_{\rm tot}({\bf x})=\phi_{\rm offset}({\bf x}) + \phi_{\rm estimate}({\bf x})`
 
-```GRAVITY```: Turns on self-gravity. Necessary for particle-only simulations.
+This procedure implicitly leverages 2 fundamental concepts. First, density-potential pairs are additive (e.g. if you have 2 point-masses, you can add together their potentials to get the total gravitational potential). Second, you might notice here that {math}`\rho_{\rm offset}({\bf x})` can have negative values. While the concept of negative density is unintuitive (since negative mass is somewhat meaningless), there aren't any mathematical issues with computing gravitational potential from negative densities.[^negative-density]
 
-```GRAVITY_GPU```: Specifies that fields required by gravity are allocated on the GPU.
+It's instructive to revisit boundary conditions for the Poisson-solve and consider them in the context of the procedure's second step:
+1. {math}`{\bf g}_{\rm offset}({\bf x}_{\rm bound}) = {\bf \nabla} \phi_{\rm offset} \rvert_{{\bf x}_{\rm bound}} =  {\bf \nabla} \phi_{\rm tot} \rvert_{{\bf x}_{\rm bound}} - {\bf \nabla} \phi_{\rm estimate} \rvert_{{\bf x}_{\rm bound}}`
+2. {math}`\rho_{\rm offset}({\bf x}_{\rm bound})=\rho_{\rm tot}({\bf x}_{\rm bound}) - \rho_{\rm estimate}({\bf x}_{\rm bound})`
 
-```PARIS```: Use the Paris 3D GPU-based Poisson solver to calculate the gravitational potential on a periodic domain.
+This places important requirements on {math}`\rho_{\rm estimate}({\bf x})` and {math}`\phi_{\rm estimate}({\bf x})`:
+- the boundary requirements of the Poisson-solve are satisfied as long as BOTH {math}`{\bf \nabla} \phi_{\rm estimate}` is a good estimate for {math}`{\bf \nabla} \phi_{\rm tot}` at the boundaries **AND**  {math}`\rho_{\rm estimate}` is a good estimate for {math}`\rho_{\rm tot}` at the boundaries
+- as long as the above conditions  are met, it is okay for {math}`\rho_{\rm estimate}({\bf x})` to have large deviations from {math}`\rho_{\rm tot}({\bf x})`.
+  This can only happen far from the boundaries.
 
-```GRAVITY_5_POINTS_GRADIENT```: Use a 5-point stencil to calculate the gradient of the potential for gravity source terms (default behavior is a 3-point stencil)
+:::{warning}
+If {math}`{\bf \nabla} \phi_{\rm estimate}` isn't a good estimate for {math}`{\bf \nabla} \phi_{\rm tot}` at the boundaries **OR**  {math}`\rho_{\rm estimate}` isn't a good estimate for {math}`\rho_{\rm tot}` at the boundaries, then artifacts and inaccuracies will be introduced into the solution.
+:::
 
-```GRAVITY_ANALYTIC_COMP```: Add an analytic component to the gravitational potential. As of 10-27-2023, this is hard-coded to a Milky Way galaxy model in the function `Setup_Analytic_Potential` from [gravity_functions.cpp](https://github.com/cholla-hydro/cholla/blob/dev/src/gravity/gravity_functions.cpp). 
 
-```PARIS_3PT```: Use a 3-point gradient for the divergence operator approximation in Paris (default behavior is to use a spectral method)
+## Support for Problem-types with Non-Periodic Boundaries
 
-```PARIS_5PT```: Use a 5-point gradient for the divergence operator approximation in Paris
+Currently, the only scenario that Cholla supports with non-periodic boundaries is idealized Galaxy simulations without.
 
-```PARIS_GALACTIC```: Use the Paris Poisson solver on a domain with analytic boundaries set to match the selected model in the DiskGalaxy class. As of 10-27-2023, this is hard-coded to a Milky Way galaxy model in the function `Compute_Gravitational_Potential` from [gravity_functions.cpp](https://github.com/cholla-hydro/cholla/blob/dev/src/gravity/gravity_functions.cpp) and in `Compute_Potential_Isolated_Boundary` from [gravity_boundaries.cpp](https://github.com/cholla-hydro/cholla/blob/dev/src/gravity/gravity_boundaries.cpp).
+At the time of writing, the density-potential pairs used in these problems have some **minor** "flaws":
+- they don't account for the gravitational potential from halo gas
+- they don't account for the potential of star particles
+- they make use of density-potential pairs that don't account for disk-truncation
 
-```PARIS_GALACTIC_3PT```: Same as above but for the analytic boundary version
+Of these 3 "flaws," the third is probably most troubling (since the estimated density-potential pair models contributions from material beyond the edges of the simulation domain). However, these flaws are probably not a concern since everything seems qualitatively reasonable (it's hard to quantify this).
 
-```PARIS_GALACTIC_5PT```: Same as above but for the analytic boundary version
+[^1]: In more detail, we use the fairly standard second-order cloud-in-cell (CIC) interpolation technique to distribute the particle mass onto the field. This total density field is the primary input for this solver.
 
-```PARIS_GALACTIC_TEST```: Turn on to test whether Paris returns the same gravitational potential as the SOR solver. Doesn't work with GRAVITY_GPU, should probably be deprecated.
+[^negative-density]: If you are still struggling to wrap your head around this, it may be instructive to recall the parallels with electrostatics. In that context, Poisson's equation, with different physical constants, relates electrostatic potential and charge density (which can be negative).
 
-## Self Gravity: SOR based
-To-do: Describe the SOR solver
+[^density-potential-pair]: For the uninitiated, a given density profile always has a unique gravitational potential (modulo a constant offset in the potential).  Some simple examples of well known potential pairs include:
+   - point mass: As we all know, the density profile of a point mass (of mass {math}`M`) is associated with {math}`\phi(r) = -GM/r`
+   - Plummer-Kuzmin disk potential (where {math}`R` is cylindrical radius): an infinitely thin disk with surface density profile {math}`\Sigma(R) = a M/ (2\pi (R^2 + a^2)^{3/2})` has the potential {math}`\phi(R,z) = -GM/\sqrt{R^2 + (a + |z|)^2}`
diff --git a/docs/sphinx/Physics/index.md b/docs/sphinx/Physics/index.md
index 2ca113faf..dc046a0e6 100644
--- a/docs/sphinx/Physics/index.md
+++ b/docs/sphinx/Physics/index.md
@@ -9,6 +9,7 @@ Port over more from the wiki
 
 CoolingChemistry.md
 Cosmology.md
+DiskModel.md
 Dual-Energy-Formalism.md
 Dust.md
 Feedback.md
diff --git a/docs/sphinx/Reference/Parameters.md b/docs/sphinx/Reference/Parameters.md
index 16811c44d..1fc0ebfff 100644
--- a/docs/sphinx/Reference/Parameters.md
+++ b/docs/sphinx/Reference/Parameters.md
@@ -20,3 +20,19 @@ These parameters should all be specified in the `[chemistry]` parameter table.
 :::{include} param/Chemistry.md
 :::
 
+(Reference-Feedback-Runtime-Params)=
+## Feedback
+
+These parameters should all be specified in the `[feedback]` parameter table.
+
+:::{include} param/Feedback.md
+:::
+
+
+## Gravity
+
+These parameters should all be specified in the `[gravity]` parameter table.
+
+:::{include} param/Gravity.md
+:::
+
diff --git a/docs/sphinx/Reference/param/Feedback.md b/docs/sphinx/Reference/param/Feedback.md
new file mode 100644
index 000000000..b9cc0b774
--- /dev/null
+++ b/docs/sphinx/Reference/param/Feedback.md
@@ -0,0 +1,57 @@
+
+:::{par:parameter} feedback.boundary_strategy
+
+:Summary: *Specify handling in the scenario when a feedback event is scheduled to occur for a particle near the edge of the grid-block that contains the particle and the stencil overlaps with cells outside of the grid-block (this includes ghost cells).*
+:Type: {par:typefmt}`str`
+:Default: *None: must be provided*
+
+Valid options include:
+
+- `"ignore_issues"`: as its name implies, this choice ignores the issue (essentially we lose some fraction of the injected "stuff").
+- `"snap"`: compute the stencils as if the source-particle positions were snapped to the closest position in the grid-block such that the stencil only includes cells within a block.
+
+In the future, a more robust strategy will involve MPI communication (and/or ghost particles).
+
+:::
+
+---
+
+:::{par:parameter} feedback.snr_filename
+
+:Summary: path to the table used to determine the supernova rate (for the `"table"` rates)
+:Type:    {par:typefmt}`str`
+:Default: *None*
+
+This parameter is meaningless if {par:param}`feedback.sn_rate` isn't set to `"table"`.
+
+If this parameter is not set, then a default constant SNR is used.
+The default SNR corresponds to 1 supernova per {math}`100 M_\odot` of cluster mass, spread out over 36 Myr, starting when the cluster is 4 Myr old.
+A sample Starburst99 file is included in the source code at `src/particles/starburst99_snr.txt`.
+The sample represents a {math}`10^6 M_\odot` fixed mass cluster, created using a Kroupa initial mass function, and with an {math}`8 \mathrm{M}_\odot` supernova cutoff.
+More details are provided {ref}`here <general-SNe-rate>`.
+
+:::
+
+---
+
+:::{par:parameter} feedback.sn_model
+
+:Summary: Specifies the name of the supernova model
+:Type: {par:typefmt}`str`
+:Default: *None*
+
+More details are provided {ref}`here <SNe-Prescription-Descriptions>`.
+
+:::
+
+---
+
+:::{par:parameter} feedback.sn_rate
+
+:Summary: Specifies the kind of supernova rate
+:Type: {par:typefmt}`str`
+:Default: `"table"`
+
+When `"table"` (the default value) is specified, Cholla determines the rate from a table.
+`"immediate_sn"` schedules a single supernova to occur, immediately after the simulation starts.
+More details are provided {ref}`here <general-SNe-rate>`.
diff --git a/docs/sphinx/Reference/param/Gravity.md b/docs/sphinx/Reference/param/Gravity.md
new file mode 100644
index 000000000..072e87eaf
--- /dev/null
+++ b/docs/sphinx/Reference/param/Gravity.md
@@ -0,0 +1,9 @@
+
+:::{par:parameter} gravity.gas_only_use_static_grav
+
+:Summary: *Specify whether the gas is only sensitive to the analytic potential or to the analytic potential and self-gravity.*
+:Type: {par:typefmt}`bool`
+:Default: `false`
+
+:::
+
diff --git a/examples/3D/disk_particle.txt b/examples/3D/disk_particle.txt
index 939cefa47..f742aae17 100644
--- a/examples/3D/disk_particle.txt
+++ b/examples/3D/disk_particle.txt
@@ -19,10 +19,6 @@ n_hydro=10
 gamma=1.66666667
 # name of initial conditions
 init=Disk_3D_particles
-snr_filename=./src/feedback/starburst99_snr.txt
-sw_filename=./src/feedback/starburst99_sw.txt
-feedback_sn_model=legacy
-feedback_boundary_strategy=snap
 bc_potential_type=1
 # domain properties
 xmin=-2
@@ -51,3 +47,9 @@ ddelta_dt=-0.001
 # path to output directory
 outdir=./
 prng_seed=41
+
+
+feedback.snr_filename=./src/feedback/starburst99_snr.txt
+feedback.sw_filename=./src/feedback/starburst99_sw.txt
+feedback.sn_model=legacy
+feedback.boundary_strategy=snap
diff --git a/examples/3D/isolated_star_particle.txt b/examples/3D/isolated_star_particle.txt
index 1683e019c..3987e87df 100644
--- a/examples/3D/isolated_star_particle.txt
+++ b/examples/3D/isolated_star_particle.txt
@@ -18,8 +18,8 @@ n_hydro=10
 gamma=1.66666667
 # name of initial conditions
 init=Isolated_Stellar_Cluster
-snr_filename=./src/feedback/starburst99_snr.txt
-sw_filename=./src/feedback/starburst99_sw.txt
+feedback.snr_filename=./src/feedback/starburst99_snr.txt
+feedback.sw_filename=./src/feedback/starburst99_sw.txt
 bc_potential_type=1
 # domain properties
 xmin=-0.5
diff --git a/src/feedback/feedback.cu b/src/feedback/feedback.cu
index 833ae8859..a92e5a0fc 100644
--- a/src/feedback/feedback.cu
+++ b/src/feedback/feedback.cu
@@ -1,1128 +1,202 @@
-#if defined(FEEDBACK) && defined(PARTICLES_GPU) && defined(PARTICLE_AGE) && defined(PARTICLE_IDS)
 
-  #include <math.h>
-  #include <stdio.h>
-  #include <stdlib.h>
-  #include <unistd.h>
-
-  #include <cstring>
-  #include <fstream>
-  #include <sstream>
-  #include <vector>
-
-  #include "../global/global.h"
-  #include "../global/global_cuda.h"
-  #include "../grid/grid3D.h"
-  #include "../io/io.h"
-  #include "feedback.h"
-
-  #define FEED_INFO_N     8
-  #define i_RES           1
-  #define i_UNRES         2
-  #define i_ENERGY        3
-  #define i_MOMENTUM      4
-  #define i_UNRES_ENERGY  5
-  #define i_WIND_MOMENTUM 6
-  #define i_WIND_ENERGY   7
-
-  // the starburst 99 total stellar mass input
-  // stellar wind momentum fluxes and SN rates
-  // must be divided by this to get per solar
-  // mass values.
-  #define S_99_TOTAL_MASS 1e6
-
-  #define TPB_FEEDBACK 128
-  // seed for poisson random number generator
-  #define FEEDBACK_SEED 42
-
-namespace feedback
-{
-Real *dev_snr, snr_dt, time_sn_start, time_sn_end;
-Real *dev_sw_p, *dev_sw_e, sw_dt, time_sw_start, time_sw_end;
-int snr_n;
-}  // namespace feedback
-
-  #ifndef O_HIP
-inline __device__ double atomicMax(double* address, double val)
-{
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old             = *address_as_ull, assumed;
-  do {
-    assumed = old;
-    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(fmax(val, __longlong_as_double(assumed))));
-  } while (assumed != old);
-  return __longlong_as_double(old);
-}
-  #endif  // O_HIP
-
-inline __device__ Real Calc_Timestep(Real gamma, Real* density, Real* momentum_x, Real* momentum_y, Real* momentum_z,
-                                     Real* energy, int index, Real dx, Real dy, Real dz)
-{
-  Real dens = fmax(density[index], 1e-5);  // the minimum value is arbitrary. But, it doesn't really matter what we pick
-                                           // since we're removing this entire function in GH-PR #386
-  Real d_inv = 1.0 / dens;
-  Real vx    = momentum_x[index] * d_inv;
-  Real vy    = momentum_y[index] * d_inv;
-  Real vz    = momentum_z[index] * d_inv;
-  Real P     = fmax((energy[index] - 0.5 * dens * (vx * vx + vy * vy + vz * vz)) * (gamma - 1.0), TINY_NUMBER);
-  Real cs    = sqrt(gamma * P * d_inv);
-  return fmax(fmax((fabs(vx) + cs) / dx, (fabs(vy) + cs) / dy), (fabs(vz) + cs) / dz);
-}
-
-/** the prescription for dividing a scalar quantity between 3x3x3 cells is done
-   by imagining a 2x2x2 cell volume around the SN.  These fractions, then,
-   represent the linear extent of this volume into the cell in question. For i=0
-   this should be 1*1/2. For i=-1 this should be (1-dx)*1/2. For i=+1 this
-   should be dx*1/2. In the above the 1/2 factor is normalize over 2
-   cells/direction.
-  */
-inline __device__ Real Frac(int i, Real dx) { return (-0.5 * i * i - 0.5 * i + 1 + i * dx) * 0.5; }
-
-inline __device__ Real D_Frac(int i, Real dx)
-{
-  return (dx > 0.5) * i * (1 - 2 * dx) + ((i + 1) * dx + 0.5 * (i - 1)) - 3 * (i - 1) * (i + 1) * (0.5 - dx);
-}
-
-/** This function used for debugging potential race conditions.  Feedback from neighboring
-    particles could simultaneously alter one hydro cell's conserved quantities.
- */
-inline __device__ bool Particle_Is_Alone(Real* pos_x_dev, Real* pos_y_dev, Real* pos_z_dev, part_int_t n_local,
-                                         int gtid, Real dx)
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <cstring>
+#include <fstream>
+#include <memory>
+#include <sstream>
+#include <vector>
+
+#include "../feedback/kernel.h"
+#include "../feedback/prescription.h"
+#include "../feedback/ratecalc.h"
+#include "../global/global.h"
+#include "../global/global_cuda.h"
+#include "../grid/grid3D.h"
+#include "../io/io.h"
+#include "../utils/DeviceVector.h"
+#include "../utils/error_handling.h"
+#include "feedback.h"
+
+/* determine the number of supernovae during the current step */
+__global__ void Get_SN_Count_Kernel(part_int_t n_local, part_int_t* id_dev, Real* mass_dev, Real* age_dev,
+                                    const feedback_details::CycleProps cycle_props, const feedback::SNRateCalc snr_calc,
+                                    int* num_SN_dev)
 {
-  Real x0 = pos_x_dev[gtid];
-  Real y0 = pos_y_dev[gtid];
-  Real z0 = pos_z_dev[gtid];
-  // Brute force loop to see if particle is alone
-  for (int i = 0; i < n_local; i++) {
-    if (i == gtid) continue;
-    if (abs(x0 - pos_x_dev[i]) > dx) continue;
-    if (abs(y0 - pos_y_dev[i]) > dx) continue;
-    if (abs(z0 - pos_z_dev[i]) > dx) continue;
-    // If we made it here, something is too close.
-    return false;
+  // All threads across the grid will iterate over the list of particles
+  // - this is a grid-strided loop, a common idiom that makes the kernel more flexible
+  // - If there are more local particles than threads, some threads will visit more than 1 particle
+  const int start       = blockIdx.x * blockDim.x + threadIdx.x;
+  const int loop_stride = blockDim.x * gridDim.x;
+  for (int i = start; i < n_local; i += loop_stride) {
+    // note age_dev is actually the time of birth
+    Real age            = cycle_props.t - age_dev[i];
+    Real average_num_sn = snr_calc.Get_SN_Rate(age) * mass_dev[i] * cycle_props.dt;
+    num_SN_dev[i]       = snr_calc.Get_Number_Of_SNe_In_Cluster(average_num_sn, cycle_props.n_step, id_dev[i]);
   }
-  return true;
 }
 
-inline __device__ Real Get_Average_Density(Real* density, int xi, int yi, int zi, int nx_grid, int ny_grid, int n_ghost)
-{
-  Real d_average = 0.0;
-  for (int i = -1; i < 2; i++) {
-    for (int j = -1; j < 2; j++) {
-      for (int k = -1; k < 2; k++) {
-        d_average +=
-            density[(xi + n_ghost + i) + (yi + n_ghost + j) * nx_grid + (zi + n_ghost + k) * nx_grid * ny_grid];
-      }
-    }
-  }
-  return d_average / 27;
-}
+namespace
+{  // anonymous namespace
 
-inline __device__ Real Get_Average_Number_Density_CGS(Real* density, int xi, int yi, int zi, int nx_grid, int ny_grid,
-                                                      int n_ghost)
-{
-  return Get_Average_Density(density, xi, yi, zi, nx_grid, ny_grid, n_ghost) * DENSITY_UNIT / (MU * MP);
-}
-
-  #ifndef NO_SN_FEEDBACK
-/**
- * @brief
- * -# Read in SN rate data from Starburst 99. If no file exists, assume a
- * constant rate.
- *
- * @param P pointer to parameters struct. Passes in starburst 99 filename and
- * random number gen seed.
+/* This functor is the callback used in the main part of cholla
  */
-void feedback::Init_State(struct Parameters* P)
-{
-  chprintf("feedback::Init_State start\n");
-  std::string snr_filename(P->snr_filename);
-  if (not snr_filename.empty()) {
-    chprintf("Specified a SNR filename %s.\n", snr_filename.data());
-
-    // read in array of supernova rate values.
-    std::ifstream snr_in(snr_filename);
-    if (!snr_in.is_open()) {
-      chprintf("ERROR: but couldn't read SNR file.\n");
-      exit(-1);
-    }
-
-    std::vector<Real> snr_time;
-    std::vector<Real> snr;
-
-    const int N_HEADER    = 7;    // S'99 has 7 rows of header information
-    const char* s99_delim = " ";  // S'99 data separator
-    std::string line;
-    int line_counter = 0;
-
-    while (snr_in.good()) {
-      std::getline(snr_in, line);
-      if (line_counter++ < N_HEADER) continue;  // skip header processing
-
-      int i      = 0;
-      char* data = strtok(line.data(), s99_delim);
-      while (data != nullptr) {
-        if (i == 0) {
-          // in the following divide by # years per kyr (1000)
-          snr_time.push_back(std::stof(std::string(data)) / 1000);
-        } else if (i == 1) {
-          snr.push_back(pow(10, std::stof(std::string(data))) * 1000 / S_99_TOTAL_MASS);
-        }
-
-        if (i > 0) {
-          break;  // only care about the first 2 items.  Once i = 1 can break
-                  // here.
-        }
-        data = strtok(nullptr, s99_delim);
-        i++;
-      }
-    }
-
-    time_sn_end   = snr_time[snr_time.size() - 1];
-    time_sn_start = snr_time[0];
-    // the following is the time interval between data points
-    // (i.e. assumes regular temporal spacing)
-    snr_dt = (time_sn_end - time_sn_start) / (snr.size() - 1);
-
-    GPU_Error_Check(cudaMalloc((void**)&dev_snr, snr.size() * sizeof(Real)));
-    GPU_Error_Check(cudaMemcpy(dev_snr, snr.data(), snr.size() * sizeof(Real), cudaMemcpyHostToDevice));
-
-  } else {
-    chprintf("No SN rate file specified.  Using constant rate\n");
-    time_sn_start = DEFAULT_SN_START;
-    time_sn_end   = DEFAULT_SN_END;
-  }
-}
-  #endif  // NO_SN_FEEDBACK
-
-  #ifndef NO_WIND_FEEDBACK
-/**
- * @brief
- * Read in Stellar wind data from Starburst 99. If no file exists, assume a
- * constant rate.
- *
- *
- * @param P pointer to parameters struct. Passes in starburst 99 filepath
- */
-void feedback::Init_Wind_State(struct Parameters* P)
-{
-  chprintf("Init_Wind_State start\n");
-  std::string sw_filename(P->sw_filename);
-  if (sw_filename.empty()) {
-    chprintf("must specify a stellar wind file.\n");
-    exit(-1);
-  }
-
-  chprintf("Specified a stellar wind filename %s.\n", sw_filename.data());
-
-  // read in array of supernova rate values.
-  std::ifstream sw_in(sw_filename);
-  if (!sw_in.is_open()) {
-    chprintf("ERROR: couldn't read stellar wind file.\n");
-    exit(-1);
-  }
-
-  std::vector<Real> sw_time;
-  std::vector<Real> sw_p;
-  std::vector<Real> sw_e;
-
-  const int N_HEADER_LINES = 7;  // S'99 has 7 rows of header information
-  const int COL_TIME       = 0;
-  const int COL_POWER      = 1;
-  const int COL_ALL_P_FLUX = 7;
-
-  const char* s99_delim = " ";  // S'99 data separator
-  std::string line;
-  int line_counter = 0;
-
-  while (sw_in.good()) {
-    std::getline(sw_in, line);
-    if (line_counter++ < N_HEADER_LINES) continue;  // skip header processing
-
-    int i      = 0;
-    char* data = strtok(line.data(), s99_delim);
-    while (data != nullptr) {
-      if (i == COL_TIME) {
-        // in the following divide by # years per kyr (1000)
-        sw_time.push_back(std::stof(std::string(data)) / 1000);
-      } else if (i == COL_POWER) {
-        sw_e.push_back(std::stof(std::string(data)));
-      } else if (i == COL_ALL_P_FLUX) {
-        sw_p.push_back(std::stof(std::string(data)));
-      }
-      data = strtok(nullptr, s99_delim);
-      i++;
-    }
-  }
-
-  time_sw_end   = sw_time[sw_time.size() - 1];
-  time_sw_start = sw_time[0];
-  // the following is the time interval between data points
-  // (i.e. assumes regular temporal spacing)
-  sw_dt = (time_sw_end - time_sw_start) / (sw_p.size() - 1);
-  chprintf("wind t_s %.5e, t_e %.5e, delta T %0.5e\n", time_sw_start, time_sw_end, sw_dt);
-
-  GPU_Error_Check(cudaMalloc((void**)&dev_sw_p, sw_p.size() * sizeof(Real)));
-  GPU_Error_Check(cudaMemcpy(dev_sw_p, sw_p.data(), sw_p.size() * sizeof(Real), cudaMemcpyHostToDevice));
-
-  GPU_Error_Check(cudaMalloc((void**)&dev_sw_e, sw_e.size() * sizeof(Real)));
-  GPU_Error_Check(cudaMemcpy(dev_sw_e, sw_e.data(), sw_e.size() * sizeof(Real), cudaMemcpyHostToDevice));
-
-  chprintf("first 40 stellar wind momentum values:\n");
-  for (int i = 0; i < 40; i++) {
-    chprintf("%0.5e  %5f %5f \n", sw_time.at(i), sw_e.at(i), sw_p.at(i));
-  }
-}
-
-  #endif  // NO_WIND_FEEDBACK
-
-/**
- * @brief Get the Starburst 99 stellar wind momentum flux per solar mass.
- *
- * @param t cluster age in kyr
- * @param dev_sw_p device array of log base 10 momentum flux values in dynes.
- * @param sw_dt time interval between table data points in kyr.
- * @param t_start cluster age when flux becomes non-negligible (kyr).
- * @param t_end  cluster age when stellar winds turn off (kyr).
- * @return flux (in Cholla force units) per solar mass.
- */
-__device__ Real Get_Wind_Flux(Real t, Real* dev_sw_p, Real sw_dt, Real t_start, Real t_end)
-{
-  if (t < t_start || t >= t_end) return 0;
-
-  int index        = (int)((t - t_start) / sw_dt);
-  Real log_p_dynes = dev_sw_p[index] + (t - index * sw_dt) * (dev_sw_p[index + 1] - dev_sw_p[index]) / sw_dt;
-  return pow(10, log_p_dynes) / FORCE_UNIT / S_99_TOTAL_MASS;
-}
-
-/**
- * @brief Get the Starburst 99 stellar wind emitted power per solar mass.
- *
- * @param t cluster age in kyr
- * @param dev_sw_e device array of log base 10 power (erg/s).
- * @param sw_dt time interval between table data points in kyr.
- * @param t_start cluster age when power becomes non-negligible (kyr).
- * @param t_end  cluster age when stellar winds turn off (kyr).
- * @return power (in Cholla units) per solar mass.
- */
-__device__ Real Get_Wind_Power(Real t, Real* dev_sw_e, Real sw_dt, Real t_start, Real t_end)
-{
-  if (t < t_start || t >= t_end) return 0;
-
-  int index  = (int)((t - t_start) / sw_dt);
-  Real log_e = dev_sw_e[index] + (t - index * sw_dt) * (dev_sw_e[index + 1] - dev_sw_e[index]) / sw_dt;
-  Real e     = pow(10, log_e) / (MASS_UNIT * VELOCITY_UNIT * VELOCITY_UNIT) * TIME_UNIT / S_99_TOTAL_MASS;
-  return e;
-}
-
-/**
- * @brief Get the mass flux associated with stellar wind momentum flux
- *        and stellar wind power scaled per cluster mass.
- *
- * @param flux
- * @return mass flux in g/s per solar mass
- */
-__device__ Real Get_Wind_Mass(Real flux, Real power)
-{
-  if (flux <= 0 || power <= 0) return 0;
-  return flux * flux / power / 2;
-}
-
-/**
- * @brief returns SNR from starburst 99 (or default analytical rate).
- *        Time is in kyr.  Does a basic interpolation of S'99 table
- *        values.
- *
- * @param t   The cluster age.
- * @param dev_snr  device array with rate info
- * @param snr_dt  time interval between table data.  Constant value.
- * @param t_start cluster age when SNR is greater than zero.
- * @param t_end   cluster age when SNR drops to zero.
- * @return double number of SNe per kyr per solar mass
- */
-__device__ Real Get_SN_Rate(Real t, Real* dev_snr, Real snr_dt, Real t_start, Real t_end)
-{
-  if (t < t_start || t >= t_end) return 0;
-  if (dev_snr == nullptr) return feedback::DEFAULT_SNR;
-
-  int index = (int)((t - t_start) / snr_dt);
-  return dev_snr[index] + (t - index * snr_dt) * (dev_snr[index + 1] - dev_snr[index]) / snr_dt;
-}
-
-/**
- * @brief Get an actual number of SNe given the expected number.
- * Both the simulation step number and cluster ID is used to
- * set the state of the random number generator in a unique and
- * deterministic way.
- *
- * @param ave_num_sn expected number of SN, based on cluster
- * age, mass and time step.
- * @param n_step sim step number
- * @param cluster_id
- * @return number of supernovae
- */
-inline __device__ int Get_Number_Of_SNe_In_Cluster(Real ave_num_sn, int n_step, part_int_t cluster_id)
-{
-  feedback_prng_t state;
-  curand_init(FEEDBACK_SEED, 0, 0, &state);
-  unsigned long long skip = n_step * 10000 + cluster_id;
-  skipahead(skip, &state);  // provided by curand
-  return (int)curand_poisson(&state, ave_num_sn);
-}
-
-__device__ Real Apply_Resolved_SN(Real pos_x, Real pos_y, Real pos_z, Real xMin, Real yMin, Real zMin, Real dx, Real dy,
-                                  Real dz, int nx_g, int ny_g, int n_ghost, int n_cells, Real gamma,
-                                  Real* conserved_device, short time_direction, Real feedback_density,
-                                  Real feedback_energy)
-{
-  // For 2x2x2, a particle between 0-0.5 injects onto cell - 1
-  int indx_x = (int)floor((pos_x - xMin - 0.5 * dx) / dx);
-  int indx_y = (int)floor((pos_y - yMin - 0.5 * dy) / dy);
-  int indx_z = (int)floor((pos_z - zMin - 0.5 * dz) / dz);
-
-  Real cell_center_x = xMin + indx_x * dx + 0.5 * dx;
-  Real cell_center_y = yMin + indx_y * dy + 0.5 * dy;
-  Real cell_center_z = zMin + indx_z * dz + 0.5 * dz;
-
-  Real delta_x = 1 - (pos_x - cell_center_x) / dx;
-  Real delta_y = 1 - (pos_y - cell_center_y) / dy;
-  Real delta_z = 1 - (pos_z - cell_center_z) / dz;
-
-  Real* density    = conserved_device;
-  Real* momentum_x = &conserved_device[n_cells * grid_enum::momentum_x];
-  Real* momentum_y = &conserved_device[n_cells * grid_enum::momentum_y];
-  Real* momentum_z = &conserved_device[n_cells * grid_enum::momentum_z];
-  Real* energy     = &conserved_device[n_cells * grid_enum::Energy];
-  Real* gasEnergy  = &conserved_device[n_cells * grid_enum::GasEnergy];
-
-  Real local_dti = 0;
-
-  for (int i = 0; i < 2; i++) {
-    for (int j = 0; j < 2; j++) {
-      for (int k = 0; k < 2; k++) {
-        int indx    = (indx_x + i + n_ghost) + (indx_y + j + n_ghost) * nx_g + (indx_z + k + n_ghost) * nx_g * ny_g;
-        Real x_frac = i * (1 - delta_x) + (1 - i) * delta_x;
-        Real y_frac = j * (1 - delta_y) + (1 - j) * delta_y;
-        Real z_frac = k * (1 - delta_z) + (1 - k) * delta_z;
-
-        atomicAdd(&density[indx], x_frac * y_frac * z_frac * feedback_density);
-        atomicAdd(&gasEnergy[indx], x_frac * y_frac * z_frac * feedback_energy);
-        atomicAdd(&energy[indx], x_frac * y_frac * z_frac * feedback_energy);
-
-        if (time_direction > 0) {
-          Real cell_dti = Calc_Timestep(gamma, density, momentum_x, momentum_y, momentum_z, energy, indx, dx, dy, dz);
-
-          local_dti = fmax(local_dti, cell_dti);
-        }
-      }  // k loop
-    }    // j loop
-  }      // i loop
-
-  return local_dti;
-}
-
-__device__ Real Apply_Unresolved_SN(Real pos_x, Real pos_y, Real pos_z, Real xMin, Real yMin, Real zMin, Real dx,
-                                    Real dy, Real dz, int nx_g, int ny_g, int n_ghost, int n_cells, Real gamma,
-                                    Real* conserved_device, short time_direction, Real feedback_density,
-                                    Real feedback_momentum, Real feedback_energy, int indx_x, int indx_y, int indx_z)
-{
-  Real delta_x = (pos_x - xMin - indx_x * dx) / dx;
-  Real delta_y = (pos_y - yMin - indx_y * dy) / dy;
-  Real delta_z = (pos_z - zMin - indx_z * dz) / dz;
-
-  Real local_dti = 0;
-
-  Real* density    = conserved_device;
-  Real* momentum_x = &conserved_device[n_cells * grid_enum::momentum_x];
-  Real* momentum_y = &conserved_device[n_cells * grid_enum::momentum_y];
-  Real* momentum_z = &conserved_device[n_cells * grid_enum::momentum_z];
-  Real* energy     = &conserved_device[n_cells * grid_enum::Energy];
-  Real* gas_energy = &conserved_device[n_cells * grid_enum::GasEnergy];
-
-  Real x_frac, y_frac, z_frac;
-  Real mag = 0;
-  for (int i = -1; i < 2; i++) {
-    for (int j = -1; j < 2; j++) {
-      for (int k = -1; k < 2; k++) {
-        x_frac = D_Frac(i, delta_x) * Frac(j, delta_y) * Frac(k, delta_z);
-        y_frac = Frac(i, delta_x) * D_Frac(j, delta_y) * Frac(k, delta_z);
-        z_frac = Frac(i, delta_x) * Frac(j, delta_y) * D_Frac(k, delta_z);
-
-        mag += sqrt(x_frac * x_frac + y_frac * y_frac + z_frac * z_frac);
-      }
-    }
-  }
-
-  for (int i = -1; i < 2; i++) {
-    for (int j = -1; j < 2; j++) {
-      for (int k = -1; k < 2; k++) {
-        // index in array of conserved quantities
-        int indx = (indx_x + i + n_ghost) + (indx_y + j + n_ghost) * nx_g + (indx_z + k + n_ghost) * nx_g * ny_g;
-
-        x_frac = D_Frac(i, delta_x) * Frac(j, delta_y) * Frac(k, delta_z);
-        y_frac = Frac(i, delta_x) * D_Frac(j, delta_y) * Frac(k, delta_z);
-        z_frac = Frac(i, delta_x) * Frac(j, delta_y) * D_Frac(k, delta_z);
-
-        Real px = x_frac * feedback_momentum;
-        Real py = y_frac * feedback_momentum;
-        Real pz = z_frac * feedback_momentum;
-        Real d  = sqrt(x_frac * x_frac + y_frac * y_frac + z_frac * z_frac) / mag * feedback_density;
-        Real e  = sqrt(x_frac * x_frac + y_frac * y_frac + z_frac * z_frac) / mag * feedback_energy;
-
-        atomicAdd(&momentum_x[indx], px);
-        atomicAdd(&momentum_y[indx], py);
-        atomicAdd(&momentum_z[indx], pz);
-        atomicAdd(&energy[indx], e);
-        atomicAdd(&density[indx], d);
-
-        gas_energy[indx] = energy[indx] - (momentum_x[indx] * momentum_x[indx] + momentum_y[indx] * momentum_y[indx] +
-                                           momentum_z[indx] * momentum_z[indx]) /
-                                              (2 * density[indx]);
-
-        if (time_direction > 0) {
-          Real cell_dti = Calc_Timestep(gamma, density, momentum_x, momentum_y, momentum_z, energy, indx, dx, dy, dz);
-          local_dti     = fmax(local_dti, cell_dti);
-        }
-      }  // k loop
-    }    // j loop
-  }      // i loop
-
-  return local_dti;
-}
-
-__device__ Real Apply_Wind(Real pos_x, Real pos_y, Real pos_z, Real xMin, Real yMin, Real zMin, Real dx, Real dy,
-                           Real dz, int nx_g, int ny_g, int n_ghost, int n_cells, Real gamma, Real* conserved_device,
-                           short time_direction, Real feedback_density, Real feedback_momentum, Real feedback_energy,
-                           int n_step, part_int_t id, int loop, int indx_x, int indx_y, int indx_z)
-{
-  Real delta_x = (pos_x - xMin - indx_x * dx) / dx;
-  Real delta_y = (pos_y - yMin - indx_y * dy) / dy;
-  Real delta_z = (pos_z - zMin - indx_z * dz) / dz;
-
-  Real local_dti = 0;
-  Real f_energy, x_frac, y_frac, z_frac, f_dens;
-
-  Real* density    = conserved_device;
-  Real* momentum_x = &conserved_device[n_cells * grid_enum::momentum_x];
-  Real* momentum_y = &conserved_device[n_cells * grid_enum::momentum_y];
-  Real* momentum_z = &conserved_device[n_cells * grid_enum::momentum_z];
-  Real* energy     = &conserved_device[n_cells * grid_enum::Energy];
-  Real* gas_energy = &conserved_device[n_cells * grid_enum::GasEnergy];
-
-  // loop over the 27 cells to add up all the allocated feedback
-  // momentum magnitudes.  For each cell allocate density and
-  // energy based on the ratio of allocated momentum to this overall sum.
-  Real mag = 0;
-  for (int i = -1; i < 2; i++) {
-    for (int j = -1; j < 2; j++) {
-      for (int k = -1; k < 2; k++) {
-        x_frac = D_Frac(i, delta_x) * Frac(j, delta_y) * Frac(k, delta_z);
-        y_frac = Frac(i, delta_x) * D_Frac(j, delta_y) * Frac(k, delta_z);
-        z_frac = Frac(i, delta_x) * Frac(j, delta_y) * D_Frac(k, delta_z);
-
-        mag += sqrt(x_frac * x_frac + y_frac * y_frac + z_frac * z_frac);
-      }
-    }
-  }
-
-  for (int i = -1; i < 2; i++) {
-    for (int j = -1; j < 2; j++) {
-      for (int k = -1; k < 2; k++) {
-        // index in array of conserved quantities
-        int indx = (indx_x + i + n_ghost) + (indx_y + j + n_ghost) * nx_g + (indx_z + k + n_ghost) * nx_g * ny_g;
-
-        x_frac = D_Frac(i, delta_x) * Frac(j, delta_y) * Frac(k, delta_z);
-        y_frac = Frac(i, delta_x) * D_Frac(j, delta_y) * Frac(k, delta_z);
-        z_frac = Frac(i, delta_x) * Frac(j, delta_y) * D_Frac(k, delta_z);
-
-        Real px  = x_frac * feedback_momentum;
-        Real py  = y_frac * feedback_momentum;
-        Real pz  = z_frac * feedback_momentum;
-        f_dens   = sqrt(x_frac * x_frac + y_frac * y_frac + z_frac * z_frac) / mag * feedback_density;
-        f_energy = sqrt(x_frac * x_frac + y_frac * y_frac + z_frac * z_frac) / mag * feedback_energy;
-
-        atomicAdd(&density[indx], f_dens);
-        atomicAdd(&momentum_x[indx], px);
-        atomicAdd(&momentum_y[indx], py);
-        atomicAdd(&momentum_z[indx], pz);
-        atomicAdd(&energy[indx], f_energy);
-
-        gas_energy[indx] = energy[indx] - (momentum_x[indx] * momentum_x[indx] + momentum_y[indx] * momentum_y[indx] +
-                                           momentum_z[indx] * momentum_z[indx]) /
-                                              (2 * density[indx]);
-
-        /*
-        energy[indx] = ( momentum_x[indx] * momentum_x[indx] +
-                         momentum_y[indx] * momentum_y[indx] +
-                         momentum_z[indx] * momentum_z[indx] ) /
-                       2 / density[indx] + gasEnergy[indx];
-        */
-        if (time_direction > 0) {
-          Real cell_dti = Calc_Timestep(gamma, density, momentum_x, momentum_y, momentum_z, energy, indx, dx, dy, dz);
-          local_dti     = fmax(local_dti, cell_dti);
-        }
-
-      }  // k loop
-    }    // j loop
-  }      // i loop
-
-  return local_dti;
-}
-
-__device__ void SN_Feedback(Real pos_x, Real pos_y, Real pos_z, Real age, Real* mass_dev, part_int_t* id_dev, Real xMin,
-                            Real yMin, Real zMin, Real xMax, Real yMax, Real zMax, Real dx, Real dy, Real dz, int nx_g,
-                            int ny_g, int nz_g, int n_ghost, int n_step, Real t, Real dt, Real* dti, Real* dev_snr,
-                            Real snr_dt, Real time_sn_start, Real time_sn_end, Real* prev_dens, short time_direction,
-                            Real* s_info, Real* conserved_dev, Real gamma, int loop, int indx_x, int indx_y, int indx_z)
-{
-  int tid  = threadIdx.x;
-  int gtid = blockIdx.x * blockDim.x + tid;
-
-  Real dV = dx * dy * dz;
-  Real feedback_density, feedback_momentum, feedback_energy;
-  Real local_dti = 0.0;
-  int n_cells    = nx_g * ny_g * nz_g;
-
-  Real average_num_sn = Get_SN_Rate(age, dev_snr, snr_dt, time_sn_start, time_sn_end) * mass_dev[gtid] * dt;
-  int N               = Get_Number_Of_SNe_In_Cluster(average_num_sn, n_step, id_dev[gtid]) * time_direction;
-  /*
-  if (gtid == 0) {
-    kernel_printf("SNUMBER n_step: %d, id: %lld, N: %d\n", n_step, id_dev[gtid], N);
+template <typename FeedbackModel>
+struct ClusterFeedbackMethod {
+  ClusterFeedbackMethod() = delete;
+
+  ClusterFeedbackMethod(FeedbackAnalysis& analysis, bool use_snr_calc, feedback::SNRateCalc snr_calc,
+                        feedback_details::BoundaryStrategy bdry_strat)
+      : analysis(analysis),
+        use_snr_calc_(use_snr_calc),
+        snr_calc_(snr_calc),
+        bdry_strat_(bdry_strat),
+        lazy_ov_scheduler_(nullptr)
+  {
   }
-  */
-
-  // no sense doing anything if there was no SN
-  if (N != 0) {
-    Real n_0;
-    if (time_direction == -1) {
-      n_0 = prev_dens[gtid];
-    } else {
-      Real* density             = conserved_dev;
-      n_0                       = Get_Average_Number_Density_CGS(density, indx_x, indx_y, indx_z, nx_g, ny_g, n_ghost);
-      prev_dens[gtid]           = n_0;
-      s_info[FEED_INFO_N * tid] = 1. * N;
-    }
-
-    feedback_energy  = N * feedback::ENERGY_PER_SN / dV;
-    feedback_density = N * feedback::MASS_PER_SN / dV;
-
-    Real shell_radius = feedback::R_SH * pow(n_0, -0.46) * pow(fabsf(N), 0.29);
-  #ifdef ONLY_RESOLVED
-    bool is_resolved = true;
-  #else
-    bool is_resolved = 3 * max(dx, max(dy, dz)) <= shell_radius;
-  #endif
-
-    if (is_resolved) {
-      // inject energy and density
-      if (time_direction > 0) {
-        s_info[FEED_INFO_N * tid + i_RES]    = 1. * N;
-        s_info[FEED_INFO_N * tid + i_ENERGY] = feedback_energy * dV;
-      }
-      local_dti = Apply_Resolved_SN(pos_x, pos_y, pos_z, xMin, yMin, zMin, dx, dy, dz, nx_g, ny_g, n_ghost, n_cells,
-                                    gamma, conserved_dev, time_direction, feedback_density, feedback_energy);
-    } else {
-      // inject momentum and density
-      feedback_momentum =
-          time_direction * feedback::FINAL_MOMENTUM * pow(n_0, -0.17) * pow(fabsf(N), 0.93) / dV / sqrt(3.0);
-      if (time_direction > 0) {
-        s_info[FEED_INFO_N * tid + i_UNRES]        = 1. * N;
-        s_info[FEED_INFO_N * tid + i_MOMENTUM]     = feedback_momentum * dV * sqrt(3.0);
-        s_info[FEED_INFO_N * tid + i_UNRES_ENERGY] = feedback_energy * dV;
-      }
-      local_dti = Apply_Unresolved_SN(pos_x, pos_y, pos_z, xMin, yMin, zMin, dx, dy, dz, nx_g, ny_g, n_ghost, n_cells,
-                                      gamma, conserved_dev, time_direction, feedback_density, feedback_momentum,
-                                      feedback_energy, indx_x, indx_y, indx_z);
-    }
-  }
-
-  if (time_direction > 0) atomicMax(dti, local_dti);
-}
-
-__device__ void Wind_Feedback(Real pos_x, Real pos_y, Real pos_z, Real age, Real* mass_dev, part_int_t* id_dev,
-                              Real xMin, Real yMin, Real zMin, Real xMax, Real yMax, Real zMax, Real dx, Real dy,
-                              Real dz, int nx_g, int ny_g, int nz_g, int n_ghost, int n_step, Real t, Real dt,
-                              Real* dti, Real* dev_sw_p, Real* dev_sw_e, Real sw_dt, Real time_sw_start,
-                              Real time_sw_end, short time_direction, Real* s_info, Real* conserved_dev, Real gamma,
-                              int loop, int indx_x, int indx_y, int indx_z)
-{
-  int tid  = threadIdx.x;
-  int gtid = blockIdx.x * blockDim.x + tid;
-
-  Real dV = dx * dy * dz;
-  Real feedback_density, feedback_momentum, feedback_energy;
-  Real local_dti = 0.0;
-  int n_cells    = nx_g * ny_g * nz_g;
-
-  if (age < 0 || age > time_sw_end) return;
-  feedback_momentum = Get_Wind_Flux(age, dev_sw_p, sw_dt, time_sw_start, time_sw_end);
-  // no sense in proceeding if there is no feedback.
-  if (feedback_momentum == 0) return;
-  feedback_energy  = Get_Wind_Power(age, dev_sw_e, sw_dt, time_sw_start, time_sw_end);
-  feedback_density = Get_Wind_Mass(feedback_momentum, feedback_energy);
-
-  // feedback_momentum now becomes momentum component along one direction.
-  feedback_momentum *= mass_dev[gtid] * dt / dV / sqrt(3.0) * time_direction;
-  feedback_density *= mass_dev[gtid] * dt / dV * time_direction;
-  feedback_energy *= mass_dev[gtid] * dt / dV * time_direction;
-
-  /* TODO refactor into separate kernel call
-  if (time_direction > 0) {
-    mass_dev[gtid]   -= feedback_density * dV;
-  }*/
-
-  if (time_direction > 0) {
-    // we log net momentum, not momentum density, and magnitude (not the
-    // component along a direction)
-    s_info[FEED_INFO_N * tid + i_WIND_MOMENTUM] = feedback_momentum * dV * sqrt(3.0);
-    s_info[FEED_INFO_N * tid + i_WIND_ENERGY]   = feedback_energy * dV;
-  }
-
-  local_dti = Apply_Wind(pos_x, pos_y, pos_z, xMin, yMin, zMin, dx, dy, dz, nx_g, ny_g, n_ghost, n_cells, gamma,
-                         conserved_dev, time_direction, feedback_density, feedback_momentum, feedback_energy, n_step,
-                         id_dev[gtid], loop, indx_x, indx_y, indx_z);
-
-  if (time_direction > 0) atomicMax(dti, local_dti);
-}
-
-__device__ void Cluster_Feedback_Helper(part_int_t n_local, Real* pos_x_dev, Real* pos_y_dev, Real* pos_z_dev,
-                                        Real* age_dev, Real* mass_dev, part_int_t* id_dev, Real xMin, Real yMin,
-                                        Real zMin, Real xMax, Real yMax, Real zMax, Real dx, Real dy, Real dz, int nx_g,
-                                        int ny_g, int nz_g, int n_ghost, int n_step, Real t, Real dt, Real* dti,
-                                        Real* dev_snr, Real snr_dt, Real time_sn_start, Real time_sn_end,
-                                        Real* prev_dens, Real* dev_sw_p, Real* dev_sw_e, Real sw_dt, Real time_sw_start,
-                                        Real time_sw_end, short time_direction, Real* s_info, Real* conserved_dev,
-                                        Real gamma, int loop)
-{
-  int tid  = threadIdx.x;
-  int gtid = blockIdx.x * blockDim.x + tid;
-  // Bounds check on particle arrays
-  if (gtid >= n_local) return;
-
-  Real pos_x    = pos_x_dev[gtid];
-  Real pos_y    = pos_y_dev[gtid];
-  Real pos_z    = pos_z_dev[gtid];
-  bool in_local = (pos_x >= xMin && pos_x < xMax) && (pos_y >= yMin && pos_y < yMax) && (pos_z >= zMin && pos_z < zMax);
-  // Particle is outside bounds, exit
-  if (!in_local) return;
-
-  int indx_x  = (int)floor((pos_x - xMin) / dx);
-  int indx_y  = (int)floor((pos_y - yMin) / dy);
-  int indx_z  = (int)floor((pos_z - zMin) / dz);
-  bool ignore = indx_x < 0 || indx_y < 0 || indx_z < 0 || indx_x >= nx_g - 2 * n_ghost ||
-                indx_y >= ny_g - 2 * n_ghost || indx_z >= nz_g - 2 * n_ghost;
-  // Ignore this particle, exit
-  if (ignore) return;
-
-  // bool is_alone = Particle_Is_Alone(pos_x_dev, pos_y_dev, pos_z_dev, n_local, gtid, 6*dx);
-  // if (is_alone) kernel_printf(" particle not alone: step %d, id %ld\n", n_step, id_dev[gtid]);
-  // if (!is_alone) return;
-
-  // note age_dev is actually the time of birth
-  Real age = t - age_dev[gtid];
-
-  bool is_sn_feedback = false;
-  bool is_wd_feedback = false;
-  #ifndef NO_SN_FEEDBACK
-  is_sn_feedback = true;
-  #endif
-  #ifndef NO_WIND_FEEDBACK
-  is_wd_feedback = true;
-  #endif
-
-  // when applying different types of feedback, undoing the step requires
-  // reverising the order
-  if (time_direction > 0) {
-    if (is_sn_feedback) {
-      SN_Feedback(pos_x, pos_y, pos_z, age, mass_dev, id_dev, xMin, yMin, zMin, xMax, yMax, zMax, dx, dy, dz, nx_g,
-                  ny_g, nz_g, n_ghost, n_step, t, dt, dti, dev_snr, snr_dt, time_sn_start, time_sn_end, prev_dens,
-                  time_direction, s_info, conserved_dev, gamma, loop, indx_x, indx_y, indx_z);
-    }
-    if (is_wd_feedback) {
-      Wind_Feedback(pos_x, pos_y, pos_z, age, mass_dev, id_dev, xMin, yMin, zMin, xMax, yMax, zMax, dx, dy, dz, nx_g,
-                    ny_g, nz_g, n_ghost, n_step, t, dt, dti, dev_sw_p, dev_sw_e, sw_dt, time_sw_start, time_sw_end,
-                    time_direction, s_info, conserved_dev, gamma, loop, indx_x, indx_y, indx_z);
-    }
-  } else {
-    if (is_wd_feedback) {
-      Wind_Feedback(pos_x, pos_y, pos_z, age, mass_dev, id_dev, xMin, yMin, zMin, xMax, yMax, zMax, dx, dy, dz, nx_g,
-                    ny_g, nz_g, n_ghost, n_step, t, dt, dti, dev_sw_p, dev_sw_e, sw_dt, time_sw_start, time_sw_end,
-                    time_direction, s_info, conserved_dev, gamma, loop, indx_x, indx_y, indx_z);
-    }
-    if (is_sn_feedback) {
-      SN_Feedback(pos_x, pos_y, pos_z, age, mass_dev, id_dev, xMin, yMin, zMin, xMax, yMax, zMax, dx, dy, dz, nx_g,
-                  ny_g, nz_g, n_ghost, n_step, t, dt, dti, dev_snr, snr_dt, time_sn_start, time_sn_end, prev_dens,
-                  time_direction, s_info, conserved_dev, gamma, loop, indx_x, indx_y, indx_z);
-    }
-  }
-
-  return;
-}
-
-__global__ void Cluster_Feedback_Kernel(part_int_t n_local, part_int_t* id_dev, Real* pos_x_dev, Real* pos_y_dev,
-                                        Real* pos_z_dev, Real* mass_dev, Real* age_dev, Real xMin, Real yMin, Real zMin,
-                                        Real xMax, Real yMax, Real zMax, Real dx, Real dy, Real dz, int nx_g, int ny_g,
-                                        int nz_g, int n_ghost, Real t, Real dt, Real* dti, Real* info, Real* density,
-                                        Real gamma, Real* prev_dens, short time_direction, Real* dev_snr, Real snr_dt,
-                                        Real time_sn_start, Real time_sn_end, Real* dev_sw_p, Real* dev_sw_e,
-                                        Real sw_dt, Real time_sw_start, Real time_sw_end, int n_step, int loop)
-{
-  int tid = threadIdx.x;
-
-  // for collecting SN feedback information
-  __shared__ Real s_info[FEED_INFO_N * TPB_FEEDBACK];
-  s_info[FEED_INFO_N * tid]     = 0;  // number of supernovae
-  s_info[FEED_INFO_N * tid + 1] = 0;  // number of resolved events
-  s_info[FEED_INFO_N * tid + 2] = 0;  // number of unresolved events
-  s_info[FEED_INFO_N * tid + 3] = 0;  // resolved energy
-  s_info[FEED_INFO_N * tid + 4] = 0;  // unresolved momentum
-  s_info[FEED_INFO_N * tid + 5] = 0;  // unresolved KE added via momentum
-  s_info[FEED_INFO_N * tid + 6] = 0;  // wind momentum
-  s_info[FEED_INFO_N * tid + 7] = 0;  // wind energy added
-
-  Cluster_Feedback_Helper(n_local, pos_x_dev, pos_y_dev, pos_z_dev, age_dev, mass_dev, id_dev, xMin, yMin, zMin, xMax,
-                          yMax, zMax, dx, dy, dz, nx_g, ny_g, nz_g, n_ghost, n_step, t, dt, dti, dev_snr, snr_dt,
-                          time_sn_start, time_sn_end, prev_dens, dev_sw_p, dev_sw_e, sw_dt, time_sw_start, time_sw_end,
-                          time_direction, s_info, density, gamma, loop);
-
-  __syncthreads();
-
-  // reduce the info from all the threads in the block
-  for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      s_info[FEED_INFO_N * tid] += s_info[FEED_INFO_N * (tid + s)];
-      s_info[FEED_INFO_N * tid + 1] += s_info[FEED_INFO_N * (tid + s) + 1];
-      s_info[FEED_INFO_N * tid + 2] += s_info[FEED_INFO_N * (tid + s) + 2];
-      s_info[FEED_INFO_N * tid + 3] += s_info[FEED_INFO_N * (tid + s) + 3];
-      s_info[FEED_INFO_N * tid + 4] += s_info[FEED_INFO_N * (tid + s) + 4];
-      s_info[FEED_INFO_N * tid + 5] += s_info[FEED_INFO_N * (tid + s) + 5];
-      s_info[FEED_INFO_N * tid + 6] += s_info[FEED_INFO_N * (tid + s) + 6];
-      s_info[FEED_INFO_N * tid + 7] += s_info[FEED_INFO_N * (tid + s) + 7];
-    }
-    __syncthreads();
-  }
-
-  // atomicAdd reduces across all blocks
-  if (tid == 0) {
-    atomicAdd(info, s_info[0]);
-    atomicAdd(info + 1, s_info[1]);
-    atomicAdd(info + 2, s_info[2]);
-    atomicAdd(info + 3, s_info[3]);
-    atomicAdd(info + 4, s_info[4]);
-    atomicAdd(info + 5, s_info[5]);
-    atomicAdd(info + 6, s_info[6]);
-    atomicAdd(info + 7, s_info[7]);
-  }
-}
-
-__global__ void Adjust_Cluster_Mass_Kernel(part_int_t n_local, Real* pos_x_dev, Real* pos_y_dev, Real* pos_z_dev,
-                                           Real* age_dev, Real* mass_dev, part_int_t* id_dev, Real xMin, Real yMin,
-                                           Real zMin, Real xMax, Real yMax, Real zMax, Real dx, Real dy, Real dz,
-                                           int nx_g, int ny_g, int nz_g, int n_ghost, int n_step, Real t, Real dt,
-                                           Real* dev_snr, Real snr_dt, Real time_sn_start, Real time_sn_end,
-                                           Real* dev_sw_p, Real* dev_sw_e, Real sw_dt, Real time_sw_start,
-                                           Real time_sw_end)
-{
-  int tid  = threadIdx.x;
-  int gtid = blockIdx.x * blockDim.x + tid;
-  // Bounds check on particle arrays
-  if (gtid >= n_local) return;
-
-  Real pos_x    = pos_x_dev[gtid];
-  Real pos_y    = pos_y_dev[gtid];
-  Real pos_z    = pos_z_dev[gtid];
-  bool in_local = (pos_x >= xMin && pos_x < xMax) && (pos_y >= yMin && pos_y < yMax) && (pos_z >= zMin && pos_z < zMax);
-  // Particle is outside bounds, exit
-  if (!in_local) return;
 
-  int indx_x  = (int)floor((pos_x - xMin) / dx);
-  int indx_y  = (int)floor((pos_y - yMin) / dy);
-  int indx_z  = (int)floor((pos_z - zMin) / dz);
-  bool ignore = indx_x < 0 || indx_y < 0 || indx_z < 0 || indx_x >= nx_g - 2 * n_ghost ||
-                indx_y >= ny_g - 2 * n_ghost || indx_z >= nz_g - 2 * n_ghost;
-  // Ignore this particle, exit
-  if (ignore) return;
+  /* Actually apply the stellar feedback (SNe and stellar winds) */
+  void operator()(Grid3D& G);
 
-  // bool is_alone = Particle_Is_Alone(pos_x_dev, pos_y_dev, pos_z_dev, n_local, gtid, 6*dx);
-  // if (is_alone) kernel_printf(" particle not alone: step %d, id %ld\n", n_step, id_dev[gtid]);
-  // if (!is_alone) return;
+ private:  // attributes
+  FeedbackAnalysis& analysis;
+  /* When false, ignore the snr_calc_ attribute. Instead, assume all clusters undergo a single
+   * supernova during the very first cycle and then never have a supernova again. */
+  const bool use_snr_calc_;
+  feedback::SNRateCalc snr_calc_;
+  /* Specifies the handling of feedback for particles along the boundaries */
+  feedback_details::BoundaryStrategy bdry_strat_;
+  /* Handles the scheduling of feedback from separate particles with overlapping stencils (lazily initialized) */
+  std::shared_ptr<feedback_details::OverlapScheduler> lazy_ov_scheduler_;
+};
 
-  Real age = t - age_dev[gtid];
-
-  #ifndef NO_SN_FEEDBACK
-  Real average_num_sn = Get_SN_Rate(age, dev_snr, snr_dt, time_sn_start, time_sn_end) * mass_dev[gtid] * dt;
-  int N               = Get_Number_Of_SNe_In_Cluster(average_num_sn, n_step, id_dev[gtid]);
-  mass_dev[gtid] -= N * feedback::MASS_PER_SN;
-  #endif
-
-  #ifndef NO_WIND_FEEDBACK
-  Real feedback_momentum  = Get_Wind_Flux(age, dev_sw_p, sw_dt, time_sw_start, time_sw_end);
-  Real feedback_energy    = Get_Wind_Power(age, dev_sw_e, sw_dt, time_sw_start, time_sw_end);
-  Real feedback_mass_rate = Get_Wind_Mass(feedback_momentum, feedback_energy);
-
-  mass_dev[gtid] -= feedback_mass_rate * dt;
-  #endif
-}
-
-__device__ void Set_Average_Density(int indx_x, int indx_y, int indx_z, int nx_g, int ny_g, int n_ghost, Real* density,
-                                    Real ave_dens)
-{
-  for (int i = -1; i < 2; i++) {
-    for (int j = -1; j < 2; j++) {
-      for (int k = -1; k < 2; k++) {
-        int indx = (indx_x + i + n_ghost) + (indx_y + j + n_ghost) * nx_g + (indx_z + k + n_ghost) * nx_g * ny_g;
-
-        density[indx] = ave_dens;
-      }
-    }
-  }
-}
-
-__global__ void Set_Ave_Density_Kernel(part_int_t n_local, Real* pos_x_dev, Real* pos_y_dev, Real* pos_z_dev,
-                                       Real* mass_dev, Real* age_dev, part_int_t* id_dev, Real xMin, Real yMin,
-                                       Real zMin, Real xMax, Real yMax, Real zMax, Real dx, Real dy, Real dz, int nx_g,
-                                       int ny_g, int nz_g, int n_ghost, Real t, Real dt, Real* density, Real* dev_snr,
-                                       Real snr_dt, Real time_sn_start, Real time_sn_end, Real time_sw_start,
-                                       Real time_sw_end, int n_step)
-{
-  int tid  = threadIdx.x;
-  int gtid = blockIdx.x * blockDim.x + tid;
-  // Bounds check on particle arrays
-  if (gtid >= n_local) return;
-
-  Real pos_x    = pos_x_dev[gtid];
-  Real pos_y    = pos_y_dev[gtid];
-  Real pos_z    = pos_z_dev[gtid];
-  bool in_local = (pos_x >= xMin && pos_x < xMax) && (pos_y >= yMin && pos_y < yMax) && (pos_z >= zMin && pos_z < zMax);
-  // Particle is outside bounds, exit
-  if (!in_local) return;
-
-  int indx_x  = (int)floor((pos_x - xMin) / dx);
-  int indx_y  = (int)floor((pos_y - yMin) / dy);
-  int indx_z  = (int)floor((pos_z - zMin) / dz);
-  bool ignore = indx_x < 0 || indx_y < 0 || indx_z < 0 || indx_x >= nx_g - 2 * n_ghost ||
-                indx_y >= ny_g - 2 * n_ghost || indx_z >= nz_g - 2 * n_ghost;
-  // Ignore this particle, exit
-  if (ignore) return;
-
-  // bool is_alone = Particle_Is_Alone(pos_x_dev, pos_y_dev, pos_z_dev, n_local, gtid, 6*dx);
-  // if (is_alone) kernel_printf(" particle not alone: step %d, id %ld\n", n_step, id_dev[gtid]);
-  // if (!is_alone) return;
-
-  bool is_sn_feedback   = false;
-  bool is_wind_feedback = false;
-  #ifndef NO_SN_FEEDBACK
-  is_sn_feedback = true;
-  #endif
-  #ifndef NO_WIND_FEEDBACK
-  is_wind_feedback = true;
-  #endif
-
-  Real ave_dens;
-  Real age = t - age_dev[gtid];
-  if (is_wind_feedback) {
-    if (time_sw_start <= age && age <= time_sw_end) {
-      ave_dens = Get_Average_Density(density, indx_x, indx_y, indx_z, nx_g, ny_g, n_ghost);
-      Set_Average_Density(indx_x, indx_y, indx_z, nx_g, ny_g, n_ghost, density, ave_dens);
-      // since we've set the average density, no need to keep
-      // checking whether we should do so.
-      return;
-    }
-  }
-  if (is_sn_feedback) {
-    if (time_sn_start <= age && age <= time_sn_end) {
-      Real average_num_sn = Get_SN_Rate(age, dev_snr, snr_dt, time_sn_start, time_sn_end) * mass_dev[gtid] * dt;
-      int N               = Get_Number_Of_SNe_In_Cluster(average_num_sn, n_step, id_dev[gtid]);
-      /*
-      if (gtid == 0) {
-        kernel_printf("AVEDENS n_step: %d, id: %lld, N: %d\n", n_step, id_dev[gtid], N);
-      }*/
-      Real n_0          = Get_Average_Number_Density_CGS(density, indx_x, indx_y, indx_z, nx_g, ny_g, n_ghost);
-      Real shell_radius = feedback::R_SH * pow(n_0, -0.46) * pow(N, 0.29);
-  #ifdef ONLY_RESOLVED
-      bool is_resolved = true;
-  #else
-      bool is_resolved = 3 * max(dx, max(dy, dz)) <= shell_radius;
-  #endif
-
-      // resolved SN feedback does not average densities.
-      if (!is_resolved && N > 0) {
-        ave_dens = n_0 * MU * MP / DENSITY_UNIT;
-        Set_Average_Density(indx_x, indx_y, indx_z, nx_g, ny_g, n_ghost, density, ave_dens);
-      }
-    }
-  }
-}
+}  // namespace
 
 /**
  * @brief Stellar feedback function (SNe and stellar winds)
  *
  * @param G
- * @param analysis
- * @return Real
  */
-Real feedback::Cluster_Feedback(Grid3D& G, FeedbackAnalysis& analysis)
+template <typename FeedbackModel>
+void ClusterFeedbackMethod<FeedbackModel>::operator()(Grid3D& G)
 {
+#if !(defined(PARTICLES_GPU) && defined(PARTICLE_AGE) && defined(PARTICLE_IDS))
+  CHOLLA_ERROR("This function can't be called with the current compiler flags");
+#else
   #ifdef CPU_TIME
   G.Timer.Feedback.Start();
   #endif
 
-  if (G.H.dt == 0) return 0.0;
-
-  Real h_dti = 0.0;
-  int time_direction, ngrid;
-  Real h_info[FEED_INFO_N] = {0, 0, 0, 0, 0, 0, 0, 0};
-  Real info[FEED_INFO_N];
-  Real *d_dti, *d_info;
-  // require d_prev_dens in case we have to undo feedback if the time
-  // step is too large.
-  Real* d_prev_dens;
-
-  // only apply feedback if we have clusters
-  if (G.Particles.n_local > 0) {
-    GPU_Error_Check(cudaMalloc(&d_dti, sizeof(Real)));
-    GPU_Error_Check(cudaMemcpy(d_dti, &h_dti, sizeof(Real), cudaMemcpyHostToDevice));
-    GPU_Error_Check(cudaMalloc(&d_prev_dens, G.Particles.n_local * sizeof(Real)));
-    GPU_Error_Check(cudaMemset(d_prev_dens, 0, G.Particles.n_local * sizeof(Real)));
-
-    ngrid = (G.Particles.n_local - 1) / TPB_FEEDBACK + 1;
-    GPU_Error_Check(cudaMalloc((void**)&d_info, FEED_INFO_N * sizeof(Real)));
-
-    // before applying feedback, set gas density around clusters to the
-    // average value from the 27 neighboring cells.  We don't want to
-    // do this during application of feedback since "undoing it" in the
-    // event that the time step is too large becomes difficult.
-    hipLaunchKernelGGL(Set_Ave_Density_Kernel, ngrid, TPB_FEEDBACK, 0, 0, G.Particles.n_local, G.Particles.pos_x_dev,
-                       G.Particles.pos_y_dev, G.Particles.pos_z_dev, G.Particles.mass_dev, G.Particles.age_dev,
-                       G.Particles.partIDs_dev, G.H.xblocal, G.H.yblocal, G.H.zblocal, G.H.xblocal_max, G.H.yblocal_max,
-                       G.H.zblocal_max, G.H.dx, G.H.dy, G.H.dz, G.H.nx, G.H.ny, G.H.nz, G.H.n_ghost, G.H.t, G.H.dt,
-                       G.C.d_density, dev_snr, snr_dt, time_sn_start, time_sn_end, time_sw_start, time_sw_end,
-                       G.H.n_step);
+  if (fmax(fabs(G.H.dy - G.H.dx), fabs(G.H.dz - G.H.dx)) > fabs(1e-15 * G.H.dx)) {
+    CHOLLA_ERROR("dx, dy, dz must all approximately be the same with the current feedback prescriptions");
   }
 
-  int loop_counter = 0;
-
-  do {
-    time_direction = 1;
-    loop_counter++;
-
-    if (G.Particles.n_local > 0) {
-      // always reset d_info to 0 since otherwise do/while looping could add
-      // values that should have been reverted.
-      cudaMemset(d_info, 0, FEED_INFO_N * sizeof(Real));
-      cudaMemset(d_dti, 0, sizeof(Real));
-      hipLaunchKernelGGL(Cluster_Feedback_Kernel, ngrid, TPB_FEEDBACK, 0, 0, G.Particles.n_local,
-                         G.Particles.partIDs_dev, G.Particles.pos_x_dev, G.Particles.pos_y_dev, G.Particles.pos_z_dev,
-                         G.Particles.mass_dev, G.Particles.age_dev, G.H.xblocal, G.H.yblocal, G.H.zblocal,
-                         G.H.xblocal_max, G.H.yblocal_max, G.H.zblocal_max, G.H.dx, G.H.dy, G.H.dz, G.H.nx, G.H.ny,
-                         G.H.nz, G.H.n_ghost, G.H.t, G.H.dt, d_dti, d_info, G.C.d_density, gama, d_prev_dens,
-                         time_direction, dev_snr, snr_dt, time_sn_start, time_sn_end, dev_sw_p, dev_sw_e, sw_dt,
-                         time_sw_start, time_sw_end, G.H.n_step, loop_counter);
-
-      GPU_Error_Check(cudaMemcpy(&h_dti, d_dti, sizeof(Real), cudaMemcpyDeviceToHost));
-    }
-
-  #ifdef MPI_CHOLLA
-    h_dti = ReduceRealMax(h_dti);
-    MPI_Barrier(world);
-  #endif  // MPI_CHOLLA
-    if (h_dti != 0) {
-      chprintf("+++++++  feed dt = %.12e, H.dt = %.12e\n", C_cfl / h_dti, G.H.dt);
-    }
+  if (G.H.dt == 0) return;
 
-    if (h_dti != 0 && (C_cfl / h_dti < G.H.dt)) {
-      // timestep too big: need to undo the last operation
-      time_direction = -1;
-      if (G.Particles.n_local > 0) {
-        hipLaunchKernelGGL(Cluster_Feedback_Kernel, ngrid, TPB_FEEDBACK, 0, 0, G.Particles.n_local,
-                           G.Particles.partIDs_dev, G.Particles.pos_x_dev, G.Particles.pos_y_dev, G.Particles.pos_z_dev,
-                           G.Particles.mass_dev, G.Particles.age_dev, G.H.xblocal, G.H.yblocal, G.H.zblocal,
-                           G.H.xblocal_max, G.H.yblocal_max, G.H.zblocal_max, G.H.dx, G.H.dy, G.H.dz, G.H.nx, G.H.ny,
-                           G.H.nz, G.H.n_ghost, G.H.t, G.H.dt, d_dti, d_info, G.C.d_density, gama, d_prev_dens,
-                           time_direction, dev_snr, snr_dt, time_sn_start, time_sn_end, dev_sw_p, dev_sw_e, sw_dt,
-                           time_sw_start, time_sw_end, G.H.n_step, loop_counter);
-
-        GPU_Error_Check(cudaDeviceSynchronize());
-      }
+  // h_info is used to store feedback summary info on the host. The following
+  // syntax sets all entries to 0 -- important if a process has no particles
+  // (this is valid C++ syntax, but historically wasn't valid C syntax)
+  Real h_info[FBInfoLUT::LEN] = {};
 
-      G.H.dt = C_cfl / h_dti;
-      if (loop_counter > 2) {  // avoid excessive looping
-        G.H.dt = 0.9 * C_cfl / h_dti;
+  // only apply feedback if we have clusters
+  if (G.Particles.n_local > 0) {
+    // compute the grid-size or the number of thread-blocks per grid. The number of threads in a block is
+    // given by TPB_FEEDBACK
+    int ngrid = (G.Particles.n_local - 1) / TPB_FEEDBACK + 1;
+
+    // setup some standard argument packs:
+    const feedback_details::ParticleProps particle_props{
+        G.Particles.n_local,   G.Particles.partIDs_dev, G.Particles.pos_x_dev, G.Particles.pos_y_dev,
+        G.Particles.pos_z_dev, G.Particles.vel_x_dev,   G.Particles.vel_y_dev, G.Particles.vel_z_dev,
+        G.Particles.mass_dev,  G.Particles.age_dev};
+
+    const feedback_details::FieldSpatialProps spatial_props{
+        G.H.xblocal, G.H.yblocal, G.H.zblocal, G.H.xblocal_max, G.H.yblocal_max, G.H.zblocal_max, G.H.dx,
+        G.H.dy,      G.H.dz,      G.H.nx,      G.H.ny,          G.H.nz,          G.H.n_ghost,
+    };
+
+    const feedback_details::CycleProps cycle_props{G.H.t, G.H.dt, G.H.n_step};
+
+    // Declare/allocate device buffer for holding the number of supernovae per particle in the current cycle
+    // (The following behavior can be accomplished without any memory allocations if we employ templates)
+    cuda_utilities::DeviceVector<int> d_num_SN(G.Particles.n_local, true);  // initialized to 0
+
+    if (use_snr_calc_) {
+      hipLaunchKernelGGL(Get_SN_Count_Kernel, ngrid, TPB_FEEDBACK, 0, 0, G.Particles.n_local, G.Particles.partIDs_dev,
+                         G.Particles.mass_dev, G.Particles.age_dev, cycle_props, snr_calc_, d_num_SN.data());
+      GPU_Error_Check(cudaDeviceSynchronize());
+    } else {
+      // in this branch, ``this->use_snr_calc_ == false``. This means that we assume all particles undergo
+      // a supernova during the very first cycle. Then there is never another supernova
+      if (G.H.n_step == 0) {
+        std::vector<int> tmp(G.Particles.n_local, 1);
+        GPU_Error_Check(
+            cudaMemcpy(d_num_SN.data(), tmp.data(), sizeof(int) * G.Particles.n_local, cudaMemcpyHostToDevice));
+      } else {
+        // do nothing - the number of supernovae is already zero
       }
     }
-  } while (time_direction == -1);
-
-  // TODO reduce cluster mass
-  if (G.Particles.n_local > 0) {
-    hipLaunchKernelGGL(Adjust_Cluster_Mass_Kernel, ngrid, TPB_FEEDBACK, 0, 0, G.Particles.n_local,
-                       G.Particles.pos_x_dev, G.Particles.pos_y_dev, G.Particles.pos_z_dev, G.Particles.age_dev,
-                       G.Particles.mass_dev, G.Particles.partIDs_dev, G.H.xblocal, G.H.yblocal, G.H.zblocal,
-                       G.H.xblocal_max, G.H.yblocal_max, G.H.zblocal_max, G.H.dx, G.H.dy, G.H.dz, G.H.nx, G.H.ny,
-                       G.H.nz, G.H.n_ghost, G.H.n_step, G.H.t, G.H.dt, dev_snr, snr_dt, time_sn_start, time_sn_end,
-                       dev_sw_p, dev_sw_e, sw_dt, time_sw_start, time_sw_end);
-  }
-  /*
-  part_int_t n_local, Real* pos_x_dev,
-  Real* pos_y_dev, Real* pos_z_dev, Real* age_dev, Real* mass_dev,
-  part_int_t* id_dev, Real xMin, Real yMin, Real zMin, Real xMax, Real yMax,
-  Real zMax, Real dx, Real dy, Real dz, int nx_g, int ny_g, int nz_g,
-  int n_ghost, int n_step, Real t, Real dt, Real* dev_snr,
-  Real snr_dt, Real time_sn_start, Real time_sn_end,
-  Real* dev_sw_p, Real* dev_sw_e, Real sw_dt, Real time_sw_start,
-  Real time_sw_end*/
 
-  chprintf("*******  looped %d time(s)\n", loop_counter);
+    if (lazy_ov_scheduler_ == nullptr) {
+      lazy_ov_scheduler_ = std::make_shared<feedback_details::OverlapScheduler>(
+          feedback_details::OverlapStrat::sequential, spatial_props.nx_g, spatial_props.ny_g, spatial_props.nz_g);
+    }
 
-  if (G.Particles.n_local > 0) {
-    GPU_Error_Check(cudaMemcpy(&h_info, d_info, FEED_INFO_N * sizeof(Real), cudaMemcpyDeviceToHost));
-    GPU_Error_Check(cudaFree(d_dti));
-    GPU_Error_Check(cudaFree(d_info));
-    GPU_Error_Check(cudaFree(d_prev_dens));
+    feedback_details::Exec_Cluster_Feedback_Kernel<FeedbackModel>(particle_props, spatial_props, cycle_props, h_info,
+                                                                  G.C.d_density, d_num_SN.data(), *lazy_ov_scheduler_,
+                                                                  bdry_strat_);
   }
 
+  // now sum the feedback summary info over all ranks into info (with MPI, only the reduction root's copy is filled)
   #ifdef MPI_CHOLLA
-  MPI_Reduce(&h_info, &info, FEED_INFO_N, MPI_CHREAL, MPI_SUM, root, world);
+  Real info[FBInfoLUT::LEN];
+  MPI_Reduce(&h_info, &info, FBInfoLUT::LEN, MPI_CHREAL, MPI_SUM, root, world);
   #else
-  info = h_info;
+  Real* info = h_info;
   #endif
 
   #ifdef MPI_CHOLLA  // only do stats gathering on root rank
   if (procID == 0) {
   #endif
 
-    analysis.countSN += (long)info[feedback::SN];
-    analysis.countResolved += (long)info[feedback::RESOLVED];
-    analysis.countUnresolved += (long)info[feedback::NOT_RESOLVED];
-    analysis.totalEnergy += info[feedback::ENERGY];
-    analysis.totalMomentum += info[feedback::MOMENTUM];
-    analysis.totalUnresEnergy += info[feedback::UNRES_ENERGY];
-    analysis.totalWindMomentum += info[i_WIND_MOMENTUM];
-    analysis.totalWindEnergy += info[i_WIND_ENERGY];
+    analysis.countSN += (long)info[FBInfoLUT::countSN];
+    analysis.countResolved += (long)info[FBInfoLUT::countResolved];
+    analysis.countUnresolved += (long)info[FBInfoLUT::countUnresolved];
+    analysis.totalEnergy += info[FBInfoLUT::totalEnergy];
+    analysis.totalMomentum += info[FBInfoLUT::totalMomentum];
+    analysis.totalUnresEnergy += info[FBInfoLUT::totalUnresEnergy];
+    analysis.totalWindMomentum += info[FBInfoLUT::totalWindMomentum];
+    analysis.totalWindEnergy += info[FBInfoLUT::totalWindEnergy];
 
     chprintf("iteration %d, t %.4e, dt %.4e", G.H.n_step, G.H.t, G.H.dt);
 
   #ifndef NO_SN_FEEDBACK
     Real global_resolved_ratio = 0.0;
     if (analysis.countResolved > 0 || analysis.countUnresolved > 0) {
-      global_resolved_ratio = analysis.countResolved / double(analysis.countResolved + analysis.countUnresolved);
+      global_resolved_ratio = (Real)analysis.countResolved / (analysis.countResolved + analysis.countUnresolved);
     }
-    chprintf(": number of SN: %d,(R: %d, UR: %d)\n", (int)info[feedback::SN], (long)info[feedback::RESOLVED],
-             (long)info[feedback::NOT_RESOLVED]);
+    chprintf(": number of SN: %d,(R: %ld, UR: %ld)\n", (int)info[FBInfoLUT::countSN],
+             (long)info[FBInfoLUT::countResolved], (long)info[FBInfoLUT::countUnresolved]);
-    chprintf("    cummulative: #SN: %d, ratio of resolved (R: %d, UR: %d) = %.3e\n", (long)analysis.countSN,
+    chprintf("    cummulative: #SN: %ld, ratio of resolved (R: %ld, UR: %ld) = %.3e\n", (long)analysis.countSN,
              (long)analysis.countResolved, (long)analysis.countUnresolved, global_resolved_ratio);
-    chprintf("    sn  r energy  : %.5e erg, cumulative: %.5e erg\n", info[feedback::ENERGY] * FORCE_UNIT * LENGTH_UNIT,
-             analysis.totalEnergy * FORCE_UNIT * LENGTH_UNIT);
+    chprintf("    sn  r energy  : %.5e erg, cumulative: %.5e erg\n",
+             info[FBInfoLUT::totalEnergy] * FORCE_UNIT * LENGTH_UNIT, analysis.totalEnergy * FORCE_UNIT * LENGTH_UNIT);
     chprintf("    sn ur energy  : %.5e erg, cumulative: %.5e erg\n",
-             info[feedback::UNRES_ENERGY] * FORCE_UNIT * LENGTH_UNIT,
+             info[FBInfoLUT::totalUnresEnergy] * FORCE_UNIT * LENGTH_UNIT,
              analysis.totalUnresEnergy * FORCE_UNIT * LENGTH_UNIT);
     chprintf("    sn momentum  : %.5e SM km/s, cumulative: %.5e SM km/s\n",
-             info[feedback::MOMENTUM] * VELOCITY_UNIT / 1e5, analysis.totalMomentum * VELOCITY_UNIT / 1e5);
+             info[FBInfoLUT::totalMomentum] * VELOCITY_UNIT / 1e5, analysis.totalMomentum * VELOCITY_UNIT / 1e5);
   #endif  // NO_SN_FEEDBACK
 
   #ifndef NO_WIND_FEEDBACK
     chprintf("    wind momentum: %.5e S.M. km/s,  cumulative: %.5e S.M. km/s\n",
-             info[i_WIND_MOMENTUM] * VELOCITY_UNIT / 1e5, analysis.totalWindMomentum * VELOCITY_UNIT / 1e5);
-    chprintf("    wind energy  : %.5e erg,  cumulative: %.5e erg\n", info[i_WIND_ENERGY] * FORCE_UNIT * LENGTH_UNIT,
+             info[FBInfoLUT::totalWindMomentum] * VELOCITY_UNIT / 1e5,
+             analysis.totalWindMomentum * VELOCITY_UNIT / 1e5);
+    chprintf("    wind energy  : %.5e erg,  cumulative: %.5e erg\n",
+             info[FBInfoLUT::totalWindEnergy] * FORCE_UNIT * LENGTH_UNIT,
              analysis.totalWindEnergy * FORCE_UNIT * LENGTH_UNIT);
   #endif  // NO_WIND_FEEDBACK
 
@@ -1133,8 +207,83 @@ Real feedback::Cluster_Feedback(Grid3D& G, FeedbackAnalysis& analysis)
   #ifdef CPU_TIME
   G.Timer.Feedback.End();
   #endif
-
-  return h_dti;
+#endif  // the ifdef statement for Particle-stuff
 }
 
-#endif  // FEEDBACK & PARTICLES_GPU & PARTICLE_IDS & PARTICLE_AGE
+std::function<void(Grid3D&)> feedback::configure_feedback_callback(struct Parameters& P, ParameterMap& pmap,
+                                                                   FeedbackAnalysis& analysis)
+{
+#if !(defined(FEEDBACK) && defined(PARTICLES_GPU) && defined(PARTICLE_AGE) && defined(PARTICLE_IDS))
+  const bool supports_feedback = false;
+#else
+  const bool supports_feedback = true;
+#endif
+
+  // retrieve the supernova-feedback model name (an empty string means that it wasn't specified)
+  const std::string sn_model = pmap.value_or("feedback.sn_model", "");
+
+  // handle the case when there is no feedback (or if the code can't support feedback)
+  if (sn_model == "none" or (sn_model.empty() and (not supports_feedback))) {
+    return {};  // return an empty (non-callable) std::function
+  } else if (not supports_feedback) {
+    CHOLLA_ERROR("The way that cholla was compiled does not currently support feedback");
+  } else if (sn_model.empty()) {
+    CHOLLA_ERROR(
+        "The feedback.sn_model parameter was not specified. It must be "
+        "specified when cholla has been compiled with support for feedback.");
+  }
+
+  // parse the supernova-rate-model (the feedback.sn_rate parameter) to initialize some values
+  SNRateCalc snr_calc{};
+  bool use_snr_calc = false;
+
+  const std::string sn_rate_model = pmap.value_or("feedback.sn_rate", "table");
+  if (sn_rate_model == "table") {
+    use_snr_calc = true;
+    snr_calc     = feedback::SNRateCalc(pmap);
+  } else if (sn_rate_model == "immediate_sn") {
+    use_snr_calc = false;
+  } else {
+    CHOLLA_ERROR("Unrecognized option passed to feedback.sn_rate: %s", sn_rate_model.c_str());
+  }
+
+  // parse the boundary-strategy to initialize some values
+  // - a descriptive error is automatically produced if an error occurs
+  const std::string bndy_strat_name = pmap.value<std::string>("feedback.boundary_strategy");
+  feedback_details::BoundaryStrategy bndy_strat;
+  if (bndy_strat_name == "ignore_issues") {
+    bndy_strat = feedback_details::BoundaryStrategy::excludeGhostParticle_ignoreStencilIssues;
+  } else if (bndy_strat_name == "snap") {
+    bndy_strat = feedback_details::BoundaryStrategy::excludeGhostParticle_snapActiveStencil;
+  } else {
+    CHOLLA_ERROR("Unrecognized option passed to feedback.boundary_strategy: %s", bndy_strat_name.c_str());
+  }
+
+  // now lets initialize ClusterFeedbackMethod<> (selected by the value of feedback.sn_model) and return
+  std::function<void(Grid3D&)> out;
+  // what are better names for legacy and legacyAlt?
+  // -> Hybrid_CiCResolved_27Unresolved
+  // -> Hybrid_27Resolved_27Unresolved
+  if (sn_model == "legacy") {
+    // this uses 8 Cell CiC Resolved Feedback and Orlando's 27 cell unresolved feedback
+    out = ClusterFeedbackMethod<fb_prescription::CiCLegacyResolvedAndUnresolvedPrescription>(analysis, use_snr_calc,
+                                                                                             snr_calc, bndy_strat);
+  } else if (sn_model == "legacyAlt") {
+    // this uses 8 Cell CiC Resolved Feedback and an alternate (slightly easier to understand)
+    // 27 cell unresolved feedback prescription
+    out = ClusterFeedbackMethod<fb_prescription::HybridResolvedAndUnresolvedPrescription>(analysis, use_snr_calc,
+                                                                                          snr_calc, bndy_strat);
+  } else if (sn_model == "resolvedCiC") {
+    out =
+        ClusterFeedbackMethod<fb_prescription::CiCResolvedSNPrescription>(analysis, use_snr_calc, snr_calc, bndy_strat);
+  } else if (sn_model == "resolved27cell") {
+    out = ClusterFeedbackMethod<fb_prescription::Sphere27ResolvedSNPrescription>(analysis, use_snr_calc, snr_calc,
+                                                                                 bndy_strat);
+  } else if (sn_model == "resolvedExperimentalBinarySphere") {
+    out = ClusterFeedbackMethod<fb_prescription::SphereBinaryResolvedSNPrescription>(analysis, use_snr_calc, snr_calc,
+                                                                                     bndy_strat);
+  } else {
+    CHOLLA_ERROR("Unrecognized option passed to feedback.sn_model: %s", sn_model.c_str());
+  }
+  return out;
+}
diff --git a/src/feedback/feedback.h b/src/feedback/feedback.h
index a07307ba0..757904058 100644
--- a/src/feedback/feedback.h
+++ b/src/feedback/feedback.h
@@ -1,46 +1,52 @@
+/*!
+ * \file feedback.h
+ * \brief Contains the public interface for using feedback. None of the implementation details are exposed.
+ *
+ */
+
 #pragma once
-#if defined(PARTICLES_GPU) && defined(FEEDBACK)
 
-  #include "../analysis/feedback_analysis.h"
-  #include "../global/global.h"
-  #ifdef O_HIP
-    #include <hiprand.h>
-    #include <hiprand_kernel.h>
-  #else
-    #include <curand.h>
-    #include <curand_kernel.h>
-  #endif  // O_HIP
+#include <functional>
+
+// since this is a public header and we want to hide all implementation details, we
+// explicitly avoid including other headers from the feedback directory. This lets
+// main.cpp ALWAYS include this header, regardless of which macro flags are defined
+#include "../analysis/feedback_analysis.h"
+#include "../global/global.h"
+#include "../io/ParameterMap.h"
 
-typedef curandStateMRG32k3a_t feedback_prng_t;
+// we define the following as a struct so we can satisfy rules about namespace and struct
+// naming. But you should think of it as a namespace
+struct FBInfoLUT {
+  // this enum acts like a lookup table (LUT). It maps the names of analysis statistics to contiguous
+  // indices (used to index the info array). LEN specifies the number of named analysis statistics
+  enum {
+    countSN = 0,
+    countResolved,
+    countUnresolved,
+    totalEnergy,
+    totalMomentum,
+    totalUnresEnergy,
+    totalWindMomentum,
+    totalWindEnergy,
+    // make sure the following is always the last entry so that it reflects the number of entries
+    LEN
+  };
+};
 
 namespace feedback
 {
-const int SN = 0, RESOLVED = 1, NOT_RESOLVED = 2, ENERGY = 3, MOMENTUM = 4, UNRES_ENERGY = 5;
-
-// supernova rate: 1SN / 100 solar masses per 36 Myr
-static const Real DEFAULT_SNR   = 2.8e-7;
 static const Real ENERGY_PER_SN = 1e51 / MASS_UNIT * TIME_UNIT * TIME_UNIT / LENGTH_UNIT / LENGTH_UNIT;
 // 10 solarMasses per SN
 static const Real MASS_PER_SN = 10.0;
 // 2.8e5 M_s km/s * n_0^{-0.17} -> eq.(34) Kim & Ostriker (2015)
 static const Real FINAL_MOMENTUM = 2.8e5 / LENGTH_UNIT * 1e5 * TIME_UNIT;
-// 30.2 pc * n_0^{-0.46} -> eq.(31) Kim & Ostriker (2015)
-static const Real R_SH = 0.0302;
-// default value for when SNe stop (40 Myr)
-static const Real DEFAULT_SN_END = 40000;
-// default value for when SNe start (4 Myr)
-static const Real DEFAULT_SN_START = 4000;
 
-extern Real *dev_snr, snr_dt, time_sn_end, time_sn_start;
-extern Real *dev_sw_p, *dev_sw_e, sw_dt, time_sw_start, time_sw_end;
+/* construct the feedback function (or not based on the specified parameters & compilation mode)
+ *
+ * \note
+ * we could probably define the following function regardless of the defined compiler flags */
+std::function<void(Grid3D&)> configure_feedback_callback(struct Parameters& P, ParameterMap& pmap,
+                                                         FeedbackAnalysis& analysis);
 
-  #ifndef NO_SN_FEEDBACK
-void Init_State(Parameters* P);
-  #endif
-  #ifndef NO_WIND_FEEDBACK
-void Init_Wind_State(Parameters* P);
-  #endif
-Real Cluster_Feedback(Grid3D& G, FeedbackAnalysis& sn_analysis);
 }  // namespace feedback
-
-#endif  // PARTICLES_GPU && FEEDBACK
diff --git a/src/feedback/feedback_tests.cu b/src/feedback/feedback_tests.cu
new file mode 100644
index 000000000..30cb4e4cf
--- /dev/null
+++ b/src/feedback/feedback_tests.cu
@@ -0,0 +1,1736 @@
+/* This provides unit-tests for extracted easily-testable components of the feedback module.
+ *
+ * This mainly includes things like the deposition stencil
+ */
+
+#include <array>
+#include <cmath>
+#include <cstdio>
+#include <map>
+#include <memory>
+#include <optional>
+#include <utility>
+#include <vector>
+
+// External Includes
+#include <gtest/gtest.h>  // Include GoogleTest and related libraries/headers
+
+#include "../feedback/kernel.h"
+#include "../feedback/prescription.h"
+#include "../global/global.h"
+#include "../utils/DeviceVector.h"
+#include "../utils/basic_structs.h"
+#include "../utils/error_handling.h"
+#include "../utils/gpu.hpp"  // gpuFor, GPU_Error_Check
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Define some general-purpose testing tools. These may be a little overkill for this particular file.
+// It may make sense to move these to the testing_utils file and namespace.
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace  // Anonymous namespace
+{
+
+template <typename T>
+std::string array_to_string_1D_helper_(T* arr, int len)
+{
+  std::string out;
+  for (int ix = 0; ix < len; ix++) {
+    if (ix != 0) out += ", ";  // put delimiter before every element except the first
+    out += std::to_string(arr[ix]);
+  }
+  return out;
+}
+
+template <typename T>
+std::string array_to_string(T* arr, int len)
+{
+  return std::string("{") + array_to_string_1D_helper_(arr, len) + std::string("}");  // brace-enclosed 1D list
+}
+
+struct Extent3D {
+  int nx;  // number of elements along the x-axis (the fastest-varying index)
+  int ny;  // number of elements along the y-axis
+  int nz;  // number of elements along the z-axis (the slowest-varying index)
+};
+
+/* Convert a 3D array (where x is the fastest-varying index) to a string of nested brace-enclosed lists.
+ *
+ * The indent_size arg is adopted from numpy's similar indent arg in array2string. As in numpy,
+ * we don't apply indent on first line
+ */
+template <typename T>
+std::string array_to_string(T* arr, Extent3D extent, unsigned int indent_size = 0)
+{
+  const std::string common_line_prefix = std::string(indent_size, ' ');
+
+  std::string out;
+  for (int iz = 0; iz < extent.nz; iz++) {
+    if (iz != 0) out += ",\n\n";  // add delimiter (with a blank line) before every z-slice except the first
+
+    for (int iy = 0; iy < extent.ny; iy++) {
+      if (iy != 0) out += ",\n";  // add delimiter before every row except the first
+
+      if ((iz == 0) and (iy == 0)) {
+        // explicitly don't insert the indents on very first line
+        out += "{{{";
+      } else if (iy == 0) {
+        out += common_line_prefix + " {{";
+      } else {
+        out += common_line_prefix + "  {";
+      }
+
+      // ToDo: replace following loop with out += array_to_string_1D_helper_(arr + extent.nx * (iy + extent.ny * iz),
+      // extent.nx);
+      for (int ix = 0; ix < extent.nx; ix++) {
+        if (ix != 0) out += ", ";  // put delimiter before every element except the first
+
+        out += std::to_string(arr[ix + extent.nx * (iy + extent.ny * iz)]);
+      }
+      out += '}';
+    }
+    out += '}';
+  }
+  return out + '}';
+}
+
+/* converts a 3 element mathematical vector to a string of the form "(x, y, z)" */
+template <typename T>
+std::string Vec3_to_String(T* arr3D)
+{
+  return (std::string("(") + std::to_string(arr3D[0]) + ", " + std::to_string(arr3D[1]) + ", " +
+          std::to_string(arr3D[2]) + ")");
+}
+
+template <typename T>
+bool isclose(T actual, T desired, double rtol, double atol = 0.0, bool equal_nan = false)
+{
+  if (equal_nan and std::isnan(actual) and std::isnan(desired)) return true;
+  // like numpy.isclose: an infinity is only close to an identical infinity (note: inf - inf is NaN)
+  if (actual == desired) return true;
+  if (std::isinf(actual) or std::isinf(desired)) return false;
+  // tolerance criterion for the remaining values (any remaining NaN makes the comparison false)
+  return fabs(actual - desired) <= (atol + rtol * fabs(desired));
+}
+
+// Triggers a fatal gtest failure (with a summary of the mismatch) if ``actual`` & ``desired`` aren't all close.
+// -> based on signature of numpy's testing.assert_allclose function (a little overkill, for right now)
+template <typename T>
+void assert_allclose(T* actual, T* desired, Extent3D extent, double rtol, double atol = 0.0, bool equal_nan = false,
+                     const std::string& err_msg = "")
+{
+  auto is_close = [equal_nan, atol, rtol](double actual, double desired) -> bool {
+    return isclose(actual, desired, rtol, atol, equal_nan);
+  };
+
+  int count_notclose = 0;
+
+  // on device code, we want to swap the iteration order
+  for (int iz = 0; iz < extent.nz; iz++) {
+    for (int iy = 0; iy < extent.ny; iy++) {
+      for (int ix = 0; ix < extent.nx; ix++) {
+        int ind3D = ix + extent.nx * (iy + extent.ny * iz);
+        count_notclose += not is_close(actual[ind3D], desired[ind3D]);
+      }
+    }
+  }
+
+  if (count_notclose == 0) return;
+
+  // make another pass through - this time gather information to provide an informative error message
+  int first_bad_index[3] = {-1, 0, 0};
+  double max_abs_diff    = 0.0;
+  double max_rel_diff    = 0.0;
+
+  for (int iz = 0; iz < extent.nz; iz++) {
+    for (int iy = 0; iy < extent.ny; iy++) {
+      for (int ix = 0; ix < extent.nx; ix++) {
+        int ind3D    = ix + extent.nx * (iy + extent.ny * iz);
+        max_abs_diff = std::fmax(max_abs_diff, std::fabs(actual[ind3D] - desired[ind3D]));
+
+        if (desired[ind3D] != 0) {
+          double cur_rel_diff = (actual[ind3D] - desired[ind3D]) / double(desired[ind3D]);
+          max_rel_diff        = std::fmax(max_rel_diff, std::fabs(cur_rel_diff));
+        }
+
+        if (first_bad_index[0] == -1 and (not is_close(actual[ind3D], desired[ind3D]))) {
+          first_bad_index[0] = ix;
+          first_bad_index[1] = iy;
+          first_bad_index[2] = iz;
+        }
+      }
+    }
+  }
+
+  std::size_t total_size = std::size_t(extent.nz) * std::size_t(extent.ny) * std::size_t(extent.nx);
+  int bad_ind3D          = first_bad_index[0] + extent.nx * (first_bad_index[1] + extent.ny * first_bad_index[2]);
+
+  FAIL() << "Not equal to tolerance rtol=" << rtol << ", atol=" << atol << '\n'
+         << err_msg << '\n'
+         << "Mismatched elements: " << count_notclose << " / " << total_size << '\n'
+         << "Max absolute difference: " << max_abs_diff << '\n'
+         << "Max relative difference: " << max_rel_diff << '\n'
+         << "First bad index: " << Vec3_to_String(first_bad_index) << '\n'
+         << "    actual: " << actual[bad_ind3D] << ", desired: " << desired[bad_ind3D] << "\n";
+}
+
+}  // anonymous namespace
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Define some tools for testing deposition-stencils
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/* Updates elements in the ``out_data`` 3D-array with the fraction of the volume that is enclosed by
+ * the specified ``stencil``, that is centered at the given position.
+ *
+ * \note
+ * Right now, this function should only be executed by a single thread-block with a single thread. This
+ * choice reflects the fact that a single thread is historically assigned to a single particle.
+ */
+template <typename Stencil>
+__global__ void Stencil_Overlap_Kernel_(Real* out_data, hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g,
+                                        Stencil stencil, StencilEvalKind eval)
+{
+  // first, define the lambda function that actually updates out_data
+  auto update_entry_fn = [out_data](double dV, int indx3D) -> void { out_data[indx3D] = dV; };
+
+  // second, execute update_entry at each location where the stencil overlaps with the cells
+  switch (eval) {
+    case StencilEvalKind::enclosed_stencil_vol_frac:
+      stencil.for_each(pos_indU, nx_g, ny_g, update_entry_fn);
+      break;
+    case StencilEvalKind::enclosed_cell_vol_frac:
+      stencil.for_each_enclosedCellVol(pos_indU, nx_g, ny_g, update_entry_fn);
+      break;
+    case StencilEvalKind::for_each_overlap_zone:
+      stencil.for_each_overlap_zone(pos_indU, nx_g, ny_g, [out_data](int indx3D) -> void { out_data[indx3D] = 1.0; });
+      break;
+  }
+}
+
+/* Utility function used in multiple tests that evaluates overlap values with a grid of cells (on the device)
+ * and returns the results after copying them back to the host.
+ *
+ * \param pos_indxU A 3 element array specifying the position of the (center of the) stencil
+ * \param full_extent describes the extent of the array that the stencil will be evaluated on. (It MUST
+ *     include contributions from the ghost_depth)
+ * \param n_ghost the number of ghost-zones (not currently referenced within this function's body)
+ * \param stencil instance of the stencil that should be tested
+ * \param eval Specifies the precise calculation that will be performed.
+ */
+template <typename Stencil>
+std::vector<double> eval_stencil_overlap_(const Real* pos_indxU, Extent3D full_extent, int n_ghost, Stencil stencil,
+                                          StencilEvalKind eval = StencilEvalKind::enclosed_stencil_vol_frac)
+{
+  cuda_utilities::DeviceVector<Real> data(full_extent.nx * full_extent.ny * full_extent.nz, true);  // initialize to 0
+
+  // launch the kernel (with a single thread in a single block)
+  const int num_blocks        = 1;
+  const int threads_per_block = 1;
+
+  hydro_utilities::VectorXYZ<Real> pos_indU{pos_indxU[0], pos_indxU[1], pos_indxU[2]};
+
+  hipLaunchKernelGGL(Stencil_Overlap_Kernel_, num_blocks, threads_per_block, 0, 0, data.data(), pos_indU,
+                     full_extent.nx, full_extent.ny, stencil, eval);
+
+  GPU_Error_Check(cudaDeviceSynchronize());
+  std::vector<double> out(full_extent.nx * full_extent.ny * full_extent.nz);
+  data.cpyDeviceToHost(out);
+  return out;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Define miscellaneous tools
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace
+{
+
+/* Struct that specifies 1D spatial properties. This is used to help parameterize tests */
+struct AxProps {
+  int num_cells;   /*!< number of cells along the given axis (excluding ghost zone)*/
+  Real min;        /*!< the position of the left edge of left-most (non-ghost) cell, in code units */
+  Real cell_width; /*!< Cell width, in code units along cur axis. Must be positive */
+};
+
+}  // anonymous namespace
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Define some tests that check some expected trends as we slowly move a stencil to the right along the
+// x-axis. These tests could definitely be generalized (so that they are performed along other axes)
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// records the first and last index along an axis where the stencil has non-zero overlap as well
+// as the overlap values at those indices
+struct OverlapRange {
+  int first_indx, last_indx;
+  double first_overlap, last_overlap;
+};
+
+// iterate over the x-axis (at fixed y_ind and z_ind) of an array that holds the overlap values computed
+// with respect to the stencil. Return the OverlapRange object representing the first contiguous range of
+// cells with non-zero values (an empty optional is returned when there isn't any overlap)
+std::optional<OverlapRange> find_ovrange_(double* v, int y_ind, int z_ind, Extent3D full_extent)
+{
+  for (int ix = 0; ix < full_extent.nx; ix++) {
+    double overlap_frac = v[ix + full_extent.nx * (y_ind + full_extent.ny * z_ind)];
+    if (overlap_frac > 0) {
+      // launch inner-loop to search for the end of the overlap-range
+      double prev_inner_overlap = overlap_frac;
+      for (int inner_ix = (ix + 1); inner_ix < full_extent.nx; inner_ix++) {
+        double inner_overlap = v[inner_ix + full_extent.nx * (y_ind + full_extent.ny * z_ind)];
+        if (inner_overlap == 0.0) return {{ix, inner_ix - 1, overlap_frac, prev_inner_overlap}};
+        prev_inner_overlap = inner_overlap;
+      }
+      // if we got to this point, this means that the last cell (full_extent.nx - 1) overlaps with stencil
+      return {{ix, full_extent.nx - 1, overlap_frac, prev_inner_overlap}};
+    }
+  }
+
+  return {};  // there is no overlap
+}
+
+/* test some expected trends as we slowly move a stencil to the right along the x-axis.
+ *
+ * \param n_ghost the number of ghost-zones to use in the calculation
+ * \param tot_vol_atol the acceptable absolute error bound between the empirical
+ *                     total-overlap value and 1.0 (the expected value)
+ * \param ignore_monotonicity_comparisons when true, skip the checks on how the edge-cell overlaps change
+ */
+template <typename Stencil>
+void sliding_stencil_test(int n_ghost, double tot_vol_atol = 0.0, bool ignore_monotonicity_comparisons = false)
+{
+  Extent3D full_extent{2 * n_ghost + 4,   // x-axis
+                       2 * n_ghost + 3,   // y-axis
+                       2 * n_ghost + 3};  // z-axis
+
+  // determine the centers of the stencil
+  Real dummy                              = 1.1 + n_ghost;
+  std::vector<Real> sliding_ax_indxU_vals = {
+      1.50 + n_ghost, 1.75 + n_ghost, 2.00 + n_ghost, 2.25 + n_ghost, 2.50 + n_ghost,
+  };
+
+  if (Stencil::max_enclosed_neighbors > 1) {
+    int min_width = 2 * n_ghost + 1 + 2 * Stencil::max_enclosed_neighbors;
+    full_extent   = {min_width + 1,  // x-axis
+                     min_width,      // y-axis
+                     min_width};     // z-axis
+    dummy         = 0.1 + n_ghost + Stencil::max_enclosed_neighbors;
+    sliding_ax_indxU_vals.clear();
+    sliding_ax_indxU_vals = {
+        n_ghost + Stencil::max_enclosed_neighbors + 0.5, n_ghost + Stencil::max_enclosed_neighbors + 0.75,
+        n_ghost + Stencil::max_enclosed_neighbors + 1.0, n_ghost + Stencil::max_enclosed_neighbors + 1.25,
+        n_ghost + Stencil::max_enclosed_neighbors + 1.5};
+  }
+
+  // evaluate the stencil at each location and store the fractional overlap grid in overlap_results
+  std::vector<std::vector<double>> overlap_results{};
+  for (const auto sliding_ax_indxU_val : sliding_ax_indxU_vals) {
+    Real pos_indxU[3] = {sliding_ax_indxU_val, dummy, dummy};
+
+    overlap_results.push_back(
+        eval_stencil_overlap_(pos_indxU, full_extent, n_ghost, Stencil{}, StencilEvalKind::enclosed_cell_vol_frac));
+
+    std::vector<double> stencil_overlap_frac =
+        eval_stencil_overlap_(pos_indxU, full_extent, n_ghost, Stencil{}, StencilEvalKind::enclosed_stencil_vol_frac);
+
+    // int num_indents = 2;
+    // std::string tmp_arr = array_to_string(overlap_results.back().data(), full_extent, num_indents);
+    // printf("\n  %s\n:", tmp_arr.c_str());
+
+    // sanity check: ensure non-zero vals sum to 1
+    const std::string center_str = Vec3_to_String(pos_indxU);
+    double total_overlap         = 0.0;
+    for (const double& overlap : stencil_overlap_frac) {
+      total_overlap += overlap;
+    }
+    // compare the sum against 1.0, using the caller-specified absolute tolerance
+    ASSERT_NEAR(total_overlap, 1.0, tot_vol_atol) << "the sum of the stencil-vol-frac is not 1.0, "
+                                                  << "when the stencil is centered at " << center_str;
+  }
+
+  // perform some checks based on the ranges of cells with overlap:
+  // - To make this test as generic as possible,
+  //   -> need to handle cases (especially for super-sampled stencil) where a given y_ind/z_ind near
+  //      the edge of the stencil won't have any overlap at all, except when the stencil is positioned
+  //      at very particular x-values (basically it has to do with the distance of the subgrid to
+  //      the stencil-center)
+
+  // const int y_ind = int(dummy);
+  // const int z_ind = int(dummy);
+
+  for (int y_ind = 0; y_ind < full_extent.ny; y_ind++) {
+    for (int z_ind = 0; z_ind < full_extent.nz; z_ind++) {
+      // basically, we setup a separate test each time we encounter this inner loop
+      std::optional<OverlapRange> prev_ovrange;
+      for (std::vector<double>& overlap_result : overlap_results) {
+        std::optional<OverlapRange> cur_ovrange = find_ovrange_(overlap_result.data(), y_ind, z_ind, full_extent);
+
+        // printf("%zu, first_overlap: (%d, %g) last_overlap: (%d, %g)\n",
+        //        i, cur_ovrange.value().first_indx, cur_ovrange.value().first_overlap,
+        //        cur_ovrange.value().last_indx, cur_ovrange.value().last_overlap);
+
+        if (bool(prev_ovrange) and bool(cur_ovrange)) {  // make comparisons to previous stencil position
+
+          // it's ok to disable the following lints here since we explicitly confirmed
+          // that these optionals are not empty
+          // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
+          const OverlapRange& prev = *prev_ovrange;
+          // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
+          const OverlapRange& cur = *cur_ovrange;
+
+          // as the stencil moves rightwards, we expect the first_overlap and last_overlap
+          // indices to generally increase
+          ASSERT_GE(cur.first_indx, prev.first_indx);
+          ASSERT_GE(cur.last_indx, prev.last_indx);
+
+          if (not ignore_monotonicity_comparisons) {
+            // these checks should be skipped for the sphere-binary-stencil
+
+            // if first_ind is the same for both the current and previous stencil position, check that the overlap
+            // fraction of that pixel has not increased
+            if (cur.first_indx == prev.first_indx) {
+              ASSERT_LE(cur.first_overlap, prev.first_overlap);
+            }
+
+            // if last_indx is the same for both the current and previous stencil position, confirm that the
+            // overlap fraction of that pixel has not decreased
+            if (cur.last_indx == prev.last_indx) {
+              ASSERT_GE(cur.last_overlap, prev.last_overlap);
+            }
+          }
+        }
+        prev_ovrange = cur_ovrange;
+      }
+    }
+  }
+}
+
+TEST(tALLFeedbackCiCStencil, SlidingTest) { sliding_stencil_test<fb_stencil::CIC>(0); }
+
+TEST(tALLFeedbackSphere27Stencil, SlidingTest)
+{
+  // primary stencil size we would use
+  sliding_stencil_test<fb_stencil::Sphere27<2>>(0, 2e-16);
+  // just testing this case because we can
+  sliding_stencil_test<fb_stencil::Sphere27<4>>(0, 3e-16);
+}
+
+TEST(tALLFeedbackSphereBinaryStencil, SlidingTest)
+{
+  // we have to ignore the part of the test where we check that the enclosed stencil fraction monotonically
+  // increases and decreases (we could refactor the test further to check a different version of that same
+  // behavior)
+  sliding_stencil_test<fb_stencil::SphereBinary<3>>(0, 2e-15, true);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Define some tests where we check our expectations about the total volume enclosed by the stencil
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// A helper-tool that comes up with the standard stencil setup that is necessary
+// for evaluating a stencil (without overlapping beyond the edge of a grid)
+template <typename Stencil>
+struct StencilTestGridSetup {
+  std::vector<hydro_utilities::VectorXYZ<Real>> center_pos_indU_list;
+  Extent3D full_extent;
+};
+
+template <typename Stencil>
+static StencilTestGridSetup<Stencil> Build_Stencil_Test_Grid_Setup(
+    const std::vector<hydro_utilities::VectorXYZ<Real>>& center_offset_from_cellEdge_LIST, int n_ghost)
+{
+  StencilTestGridSetup<Stencil> out;
+
+  // compute the centers of the stencil and the extent of the grid for evaluating the stencil
+  for (const hydro_utilities::VectorXYZ<Real>& center_offset_from_cellEdge : center_offset_from_cellEdge_LIST) {
+    hydro_utilities::VectorXYZ<Real> pos_indU{};
+    for (std::size_t i = 0; i < 3; i++) {
+      // confirm the offset falls in the range 0 <= center_offset < 1
+      CHOLLA_ASSERT(center_offset_from_cellEdge[i] >= 0.0, "Test parameter is flawed");
+      CHOLLA_ASSERT(center_offset_from_cellEdge[i] < 1.0, "Test parameter is flawed");
+
+      pos_indU[i] = Stencil::max_enclosed_neighbors + center_offset_from_cellEdge[i];
+    }
+    out.center_pos_indU_list.push_back(pos_indU);
+  }
+
+  // choose an extent value that we know will work (even when n_ghost is zero!)
+  out.full_extent =
+      Extent3D{1 + 2 * (n_ghost + Stencil::max_enclosed_neighbors), 1 + 2 * (n_ghost + Stencil::max_enclosed_neighbors),
+               1 + 2 * (n_ghost + Stencil::max_enclosed_neighbors)};
+  return out;
+}
+
+template <typename Stencil>
+void stencil_volume_check(hydro_utilities::VectorXYZ<Real> center_offset_from_cellEdge, int n_ghost, Stencil stencil,
+                          double expected_vol, double vol_rtol = 0.0, double stencil_overlap_rtol = 0.0)
+{
+  // compute the center of the stencil and the extent of the grid for evaluating the stencil
+  StencilTestGridSetup<Stencil> setup = Build_Stencil_Test_Grid_Setup<Stencil>({center_offset_from_cellEdge}, n_ghost);
+  hydro_utilities::VectorXYZ<Real> pos_indU = setup.center_pos_indU_list[0];
+  Extent3D full_extent                      = setup.full_extent;
+
+  // now gather the amount of cell-volume enclosed by the stencil
+  std::vector<double> enclosed_cell_vol =
+      eval_stencil_overlap_(pos_indU.data(), full_extent, n_ghost, stencil, StencilEvalKind::enclosed_cell_vol_frac);
+  // compute the total stencil volume (by summing the enclosed volume over all cells)
+  double vtot = 0.0;
+  for (double val : enclosed_cell_vol) {
+    vtot += val;
+  }
+
+  // now perform the check on the total cell volume
+  EXPECT_TRUE(isclose(vtot, expected_vol, vol_rtol, 0.0, false))
+      << "stencil volume, " << vtot << ", does NOT match the expected "
+      << "volume, " << expected_vol << ", to within the relative tolerance "
+      << "of " << vol_rtol << ". The relative error is: " << (vtot - expected_vol) / expected_vol;
+
+  ASSERT_GT(vtot, 0.0);  // this is mostly a sanity check! (it also guards the division by vtot down below)
+
+  // now let's confirm consistency with the calculation of the total stencil volume
+  // (otherwise this test is meaningless)
+  for (double& val : enclosed_cell_vol) {
+    val /= vtot;
+  }
+
+  std::vector<double> enclosed_stencil_vol =
+      eval_stencil_overlap_(pos_indU.data(), full_extent, n_ghost, stencil, StencilEvalKind::enclosed_stencil_vol_frac);
+  assert_allclose(enclosed_cell_vol.data(), enclosed_stencil_vol.data(), full_extent, stencil_overlap_rtol, 0.0, false,
+                  "the grid of stencil-overlap-vol-fracs computed from the grid on cellvol-fracs is "
+                  "inconsistent with the directly computed grid of stencil-overlap-vol-fracs");
+}
+
+TEST(tALLFeedbackCiCStencil, StencilVolumeTest)
+{
+  stencil_volume_check(hydro_utilities::VectorXYZ<Real>{0.0, 0.0, 0.0}, 0, fb_stencil::CIC{},
+                       /* expected_vol = */ 1.0, /* vol_rtol = */ 0.0, /*stencil_overlap_rtol =*/0.0);
+  stencil_volume_check(hydro_utilities::VectorXYZ<Real>{0.5, 0.5, 0.5}, 0, fb_stencil::CIC{},
+                       /* expected_vol = */ 1.0, /* vol_rtol = */ 0.0, /*stencil_overlap_rtol =*/0.0);
+}
+
+TEST(tALLFeedbackSphere27Stencil, StencilVolumeTest)
+{
+  const double radius       = 1;  // in units of cell_widths
+  const double expected_vol = 4 * 3.141592653589793 * (radius * radius * radius) / 3;
+
+  stencil_volume_check(hydro_utilities::VectorXYZ<Real>{0.0, 0.0, 0.0}, 0, fb_stencil::Sphere27<2>{}, expected_vol,
+                       /* vol_rtol = */ 0.05, /*stencil_overlap_rtol =*/0.0);
+  stencil_volume_check(hydro_utilities::VectorXYZ<Real>{0.125, 0.0, 0.0}, 0, fb_stencil::Sphere27<2>{}, expected_vol,
+                       /* vol_rtol = */ 0.0004, /*stencil_overlap_rtol =*/0.0);
+  stencil_volume_check(hydro_utilities::VectorXYZ<Real>{0.5, 0.5, 0.5}, 0, fb_stencil::Sphere27<2>{}, expected_vol,
+                       /* vol_rtol = */ 0.05, /*stencil_overlap_rtol =*/0.0);
+}
+
+/* Something is funky! as you increase the radius, my intuition tells me that the relative error
+ * should improve, but that does not seem to be the case*/
+// TEST(tALLFeedbackSphereBinaryStencil, StencilVolumeTest)
+//{
+//   const double radius = 3; // in units of cell_widths
+//   const double expected_vol = 4 * 3.141592653589793 * (radius * radius * radius) / 3.0;
+//
+//   stencil_volume_check(hydro_utilities::VectorXYZ<Real>{0.0,0.0,0.0}, 0, fb_stencil::SphereBinary<3>{},
+//                        expected_vol, /* vol_rtol = */ 0.0, /*stencil_overlap_rtol =*/ 0.0);
+//   stencil_volume_check(hydro_utilities::VectorXYZ<Real>{0.125,0.0,0.0}, 0, fb_stencil::SphereBinary<3>{},
+//                        expected_vol, /* vol_rtol = */ 0.0, /*stencil_overlap_rtol =*/ 0.0);
+//   stencil_volume_check(hydro_utilities::VectorXYZ<Real>{0.5,0.5,0.5}, 0, fb_stencil::SphereBinary<3>{},
+//                        expected_vol, /* vol_rtol = */ 0.0, /*stencil_overlap_rtol =*/ 0.0);
+// }
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Define some tests where we check consistency between the different flavors of for_each
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Typed-test fixture whose only job is to carry the stencil type under test. A test body retrieves
+// the type through `TestFixture::StencilT`.
+template <typename T>
+class tALLFeedbackStencil : public testing::Test  // NOLINT(readability-identifier-naming)
+{
+ public:
+  using StencilT = T;
+};
+
+// the list of stencil types that each TYPED_TEST of the tALLFeedbackStencil suite is instantiated with
+using MyStencilTypes =
+    ::testing::Types<fb_stencil::CIC, fb_stencil::LegacyCIC27, fb_stencil::Sphere27<2>, fb_stencil::SphereBinary<3>>;
+TYPED_TEST_SUITE(tALLFeedbackStencil, MyStencilTypes);
+
+TYPED_TEST(tALLFeedbackStencil, ForEachFlavorConsistency)
+{
+  using Stencil = typename TestFixture::StencilT;
+
+  const int n_ghost = 0;
+
+  // determines the central positions of the stencils
+  std::vector<hydro_utilities::VectorXYZ<Real>> center_offset_from_cellEdge_List = {
+      {0.00, 0.00, 0.00}, {0.25, 0.00, 0.00}, {0.5, 0.0, 0.0},    {0.75, 0.00, 0.00}, {0.99, 0.00, 0.00},
+      {0.00, 0.25, 0.00}, {0.0, 0.5, 0.0},    {0.00, 0.75, 0.00}, {0.00, 0.99, 0.00}, {0.00, 0.00, 0.25},
+      {0.0, 0.0, 0.5},    {0.00, 0.00, 0.75}, {0.00, 0.00, 0.99}, {0.25, 0.25, 0.25}, {0.5, 0.5, 0.5},
+      {0.75, 0.75, 0.75}, {0.99, 0.99, 0.99},
+  };
+
+  // compute the center of the stencil and the extent of the grid for evaluating the stencil
+  StencilTestGridSetup<Stencil> setup =
+      Build_Stencil_Test_Grid_Setup<Stencil>(center_offset_from_cellEdge_List, n_ghost);
+  const Extent3D extent = setup.full_extent;
+
+  // (name, value) pairs for each flavor of for_each. The first entry acts as the reference that
+  // all of the other flavors are compared against
+  const std::vector<std::pair<std::string, StencilEvalKind>> flavor_pairs = {
+      {"enclosed_stencil_vol_frac", StencilEvalKind::enclosed_stencil_vol_frac},
+      {"enclosed_cell_vol_frac", StencilEvalKind::enclosed_cell_vol_frac},
+      {"for_each_overlap_zone", StencilEvalKind::for_each_overlap_zone}};
+
+  for (hydro_utilities::VectorXYZ<Real> pos_indU : setup.center_pos_indU_list) {
+    // execute the reference flavor of for_each
+    std::vector<Real> ref_rslt =
+        eval_stencil_overlap_(pos_indU.data(), extent, n_ghost, Stencil{}, flavor_pairs[0].second);
+
+    for (std::size_t i = 1; i < flavor_pairs.size(); i++) {
+      // execute the current flavor of for_each
+      std::vector<Real> cur_rslt =
+          eval_stencil_overlap_(pos_indU.data(), extent, n_ghost, Stencil{}, flavor_pairs[i].second);
+
+      // both flavors must agree about which cells have any overlap with the stencil
+      for (int iz = 0; iz < extent.nz; iz++) {
+        for (int iy = 0; iy < extent.ny; iy++) {
+          for (int ix = 0; ix < extent.nx; ix++) {
+            const int ind3D = ix + extent.nx * (iy + extent.ny * iz);
+
+            const bool flavors_agree = (ref_rslt[ind3D] > 0.0) == (cur_rslt[ind3D] > 0.0);
+            if (flavors_agree) continue;
+
+            int index[3] = {ix, iy, iz};
+            FAIL() << "Encountered an inconsistency when comparing the '" << flavor_pairs[0].first
+                   << "' flavor of for_each against the '" << flavor_pairs[i].first
+                   << "' flavor at (ix, iy, iz) = " << Vec3_to_String(index)
+                   << ". Both should be zero or neither should be zero.";
+          }
+        }
+      }
+    }
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Define some machinery to help with testing the full feedback functionality where we check our expectations about the
+// total volume enclosed by the stencil
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace
+{
+
+// Looks up `key` in `m`: returns the associated value when the key is present and an empty
+// optional when it is not.
+std::optional<Real> try_get_(const std::map<std::string, Real>& m, const std::string& key)
+{
+  const auto it = m.find(key);
+  if (it == m.end()) {
+    return std::nullopt;
+  }
+  return std::optional<Real>(it->second);
+}
+
+// `idual` is true when this file is compiled with the DE macro defined and false otherwise
+#ifdef DE
+const bool idual = true;
+#else
+const bool idual = false;
+#endif
+
+/* Fills the hydro fields stored in `data` with spatially uniform values.
+ *
+ * \param data buffer holding the fields back-to-back. The field with index `f` (an entry of
+ *     grid_enum) occupies the elements from `f * field_size` up to `(f + 1) * field_size`
+ * \param field_size number of cells in a single field
+ * \param dflt_vals optional overrides for the uniform values. The recognized keys are "density",
+ *     "thermal_edens", "velocity_x", "velocity_y" and "velocity_z". A key that is missing falls
+ *     back to the default noted in the function body.
+ *
+ * Only the density, momentum, total-energy and (when compiled with DE) GasEnergy fields are written.
+ */
+void init_field_vals_(cuda_utilities::DeviceVector<Real>& data, std::size_t field_size,
+                      const std::map<std::string, Real>& dflt_vals)
+{
+  Real* ptr = data.data();
+
+  // default density should be 0.1 particles per cc
+  // default thermal energy density should correspond to pressure of 1e3 K / cm**3 (for a gamma of 5/3)
+  const Real density       = try_get_(dflt_vals, "density").value_or(1482737.17012665);
+  const Real thermal_edens = try_get_(dflt_vals, "thermal_edens").value_or(0.00021335 * 1.5);
+  const Real vx            = try_get_(dflt_vals, "velocity_x").value_or(0.0);
+  const Real vy            = try_get_(dflt_vals, "velocity_y").value_or(0.0);
+  const Real vz            = try_get_(dflt_vals, "velocity_z").value_or(0.0);
+  // total energy density = thermal energy density + kinetic energy density
+  const Real tot_edens = thermal_edens + 0.5 * density * (vx * vx + vy * vy + vz * vz);
+
+  // every field (including density) is addressed through its grid_enum index
+  auto loop_fn = [=] __device__(int index) {
+    ptr[grid_enum::density * field_size + index]    = density;
+    ptr[grid_enum::momentum_x * field_size + index] = density * vx;
+    ptr[grid_enum::momentum_y * field_size + index] = density * vy;
+    ptr[grid_enum::momentum_z * field_size + index] = density * vz;
+    ptr[grid_enum::Energy * field_size + index]     = tot_edens;
+#ifdef DE
+    ptr[grid_enum::GasEnergy * field_size + index] = thermal_edens;
+#endif
+  };
+
+  gpuFor(field_size, loop_fn);
+}
+
+// Owns the buffer that holds the (initially uniform) hydro fields used by a test.
+// make sure this lives for the lifetime of the test where the data gets used!
+struct TestFieldData {
+ private:  // attributes
+  // when cuda_utilities::DeviceVector is updated so that it can be moved in the future, it won't
+  // be necessary to wrap data_ in a unique_ptr.
+  std::unique_ptr<cuda_utilities::DeviceVector<Real>> data_;
+
+  Extent3D single_field_extent_;  // must include ghost zones
+
+ public:
+  // Allocates room for grid_enum::num_fields fields (each one has the shape specified by
+  // single_field_extent) and then initializes the values with init_field_vals_. The entries of
+  // dflt_vals are forwarded to init_field_vals_ in order to override the default field values.
+  TestFieldData(Extent3D single_field_extent, std::map<std::string, Real> dflt_vals)
+      : data_(nullptr), single_field_extent_(single_field_extent)
+  {
+    const std::size_t single_field_size = single_field_extent.nx * single_field_extent.ny * single_field_extent.nz;
+    // originally, we set n_fields equal to `5 + idual`. Unfortunately, issues arose when cholla
+    // was compiled with passive scalars due to our use of grid_enum
+    std::size_t n_fields = grid_enum::num_fields;
+    data_                = std::make_unique<cuda_utilities::DeviceVector<Real>>(n_fields * single_field_size);
+    init_field_vals_(*data_, single_field_size, dflt_vals);
+  }
+
+  TestFieldData(TestFieldData&&)            = default;
+  TestFieldData& operator=(TestFieldData&&) = default;
+
+  // pointer to the start of the buffer holding all of the fields
+  Real* dev_ptr() { return data_->data(); }
+
+  // shape of a single field
+  Extent3D single_field_extent() { return single_field_extent_; }
+
+  // returns a host-side copy of the entire buffer (i.e. every field)
+  std::vector<Real> host_copy()
+  {
+    std::vector<Real> out(this->data_->size());
+    this->data_->cpyDeviceToHost(out);
+    return out;
+  }
+
+  // prints the contents of each hydro field (a debugging aid)
+  // this is inefficient! But should get the job done!
+  void print_debug_info()
+  {
+    std::vector<Real> tmp        = this->host_copy();
+    Extent3D single_field_extent = this->single_field_extent_;
+
+    // formats & prints the field that is stored at index `field_index`, labelling it with `name`
+    auto print_fn = [single_field_extent, &tmp](int field_index, const std::string& name) {
+      std::size_t field_offset = single_field_extent.nx * single_field_extent.ny * single_field_extent.nz;
+
+      std::size_t output_indent_offset = name.size() + 2;
+      std::string arr_str =
+          array_to_string(tmp.data() + field_index * field_offset, single_field_extent, output_indent_offset);
+      printf("%s: %s\n", name.c_str(), arr_str.c_str());
+    };
+
+    print_fn(grid_enum::density, "density");
+    print_fn(grid_enum::momentum_x, "momentum_x");
+    print_fn(grid_enum::momentum_y, "momentum_y");
+    print_fn(grid_enum::momentum_z, "momentum_z");
+    print_fn(grid_enum::Energy, "etot_dens");
+#ifdef DE
+    print_fn(grid_enum::GasEnergy, "ethermal_dens");
+#endif
+  }
+
+  /* copy TestFieldData into a new object and shift the reference frame
+   *
+   * bulk_velocity is the velocity of the new reference frame, as measured in the current frame.
+   * The density (and the GasEnergy field, when compiled with DE) are copied unchanged, while the
+   * momentum and total-energy fields are recomputed. Any other field of the returned object keeps
+   * whatever value it was given when the object was constructed.
+   */
+  TestFieldData change_ref_frame(hydro_utilities::VectorXYZ<Real> bulk_velocity)
+  {
+    const Extent3D& single_field_extent = this->single_field_extent_;
+
+    // the returned object starts out with the default field values; the loop below overwrites them
+    TestFieldData out = TestFieldData(single_field_extent, {});
+
+    const Real* in_ptr = this->data_->data();
+    Real* out_ptr      = out.data_->data();
+
+    const std::size_t field_size = single_field_extent.nx * single_field_extent.ny * single_field_extent.nz;
+
+    auto loop_fn = [in_ptr, out_ptr, field_size, bulk_velocity] __device__(int index) {
+      Real density   = in_ptr[index];
+      Real old_mom_x = in_ptr[grid_enum::momentum_x * field_size + index];
+      Real old_mom_y = in_ptr[grid_enum::momentum_y * field_size + index];
+      Real old_mom_z = in_ptr[grid_enum::momentum_z * field_size + index];
+
+      Real old_KE_dens = 0.5 * ((old_mom_x * old_mom_x) + (old_mom_y * old_mom_y) + (old_mom_z * old_mom_z)) / density;
+
+      // subtracting the frame velocity from the gas velocity shifts the momentum density by density * velocity
+      Real new_mom_x = old_mom_x - (bulk_velocity[0] * density);
+      Real new_mom_y = old_mom_y - (bulk_velocity[1] * density);
+      Real new_mom_z = old_mom_z - (bulk_velocity[2] * density);
+
+      Real new_KE_dens = 0.5 * ((new_mom_x * new_mom_x) + (new_mom_y * new_mom_y) + (new_mom_z * new_mom_z)) / density;
+
+      // only the kinetic part of the total energy density changes between frames
+      Real new_e = in_ptr[grid_enum::Energy * field_size + index] + (new_KE_dens - old_KE_dens);
+
+      out_ptr[index]                                      = density;
+      out_ptr[grid_enum::momentum_x * field_size + index] = new_mom_x;
+      out_ptr[grid_enum::momentum_y * field_size + index] = new_mom_y;
+      out_ptr[grid_enum::momentum_z * field_size + index] = new_mom_z;
+      out_ptr[grid_enum::Energy * field_size + index]     = new_e;
+#ifdef DE
+      out_ptr[grid_enum::GasEnergy * field_size + index] = in_ptr[grid_enum::GasEnergy * field_size + index];
+#endif
+    };
+
+    gpuFor(field_size, loop_fn);
+    return out;
+  }
+};
+
+/* Compares the hydro fields held by 2 TestFieldData instances with assert_allclose.
+ *
+ * \param actual_test_field_data the fields being checked
+ * \param ref_test_field_data the fields that provide the reference values
+ * \param only_thermale_and_density when false, the density, momentum and total-energy fields are
+ *     compared directly. When true, the momentum and total-energy comparisons are replaced by a
+ *     comparison of (total energy density - kinetic energy density).
+ * \param rtol, atol tolerances that are forwarded to assert_allclose
+ *
+ * When compiled with DE, the GasEnergy field is always compared as well. A fatal failure is
+ * reported (before any field is touched) when the 2 instances don't share the same shape.
+ */
+void assert_fielddata_allclose(TestFieldData& actual_test_field_data, TestFieldData& ref_test_field_data,
+                               bool only_thermale_and_density = false, double rtol = 0.0, double atol = 0.0)
+{
+  std::vector<Real> ref_data    = ref_test_field_data.host_copy();
+  std::vector<Real> actual_data = actual_test_field_data.host_copy();
+
+  // the comparisons below index both buffers with the reference's extent, so the extents and the
+  // buffer sizes have to agree (otherwise we would read past the end of one of the buffers)
+  Extent3D extent        = ref_test_field_data.single_field_extent();
+  Extent3D actual_extent = actual_test_field_data.single_field_extent();
+  ASSERT_EQ(actual_extent.nx, extent.nx) << "actual & reference field data differ in extent along x";
+  ASSERT_EQ(actual_extent.ny, extent.ny) << "actual & reference field data differ in extent along y";
+  ASSERT_EQ(actual_extent.nz, extent.nz) << "actual & reference field data differ in extent along z";
+  ASSERT_EQ(actual_data.size(), ref_data.size()) << "actual & reference field data differ in total size";
+
+  const std::size_t single_field_size = extent.nx * extent.ny * extent.nz;
+
+  const std::map<std::string, int> field_index_map = {
+      {"density", grid_enum::density},         {"momentum_x", grid_enum::momentum_x},
+      {"momentum_y", grid_enum::momentum_y},   {"momentum_z", grid_enum::momentum_z},
+#ifdef DE
+      {"ethermal_dens", grid_enum::GasEnergy},
+#endif
+      {"etot_dens", grid_enum::Energy}};
+
+  // compares a pair of arrays. When a pointer is a nullptr, the pointer to the field called `name`
+  // is looked up in the corresponding host copy
+  auto compare = [&](const std::string& name, Real* actual, Real* ref) {
+    if (actual == nullptr) actual = actual_data.data() + single_field_size * field_index_map.at(name);
+    if (ref == nullptr) ref = ref_data.data() + single_field_size * field_index_map.at(name);
+    std::string err_msg = "problem comparing the field: " + name;
+    assert_allclose(actual, ref, extent, rtol, atol, false, err_msg);
+  };
+
+  compare("density", nullptr, nullptr);
+  if (not only_thermale_and_density) {
+    compare("momentum_x", nullptr, nullptr);
+    compare("momentum_y", nullptr, nullptr);
+    compare("momentum_z", nullptr, nullptr);
+    compare("etot_dens", nullptr, nullptr);
+  } else {
+    // compute thermal_energy density (first for the actual data and then for the reference data):
+    std::vector<std::vector<Real>> ethermal_l{};
+    for (Real* field_ptr : {actual_data.data(), ref_data.data()}) {
+      Real* dens      = field_ptr + single_field_size * grid_enum::density;
+      Real* mom_x     = field_ptr + single_field_size * grid_enum::momentum_x;
+      Real* mom_y     = field_ptr + single_field_size * grid_enum::momentum_y;
+      Real* mom_z     = field_ptr + single_field_size * grid_enum::momentum_z;
+      Real* tot_edens = field_ptr + single_field_size * grid_enum::Energy;
+
+      std::vector<Real> thermal_energy(single_field_size);
+      for (std::size_t i = 0; i < single_field_size; i++) {
+        Real ke_dens      = 0.5 * (mom_x[i] * mom_x[i] + mom_y[i] * mom_y[i] + mom_z[i] * mom_z[i]) / dens[i];
+        thermal_energy[i] = tot_edens[i] - ke_dens;
+      }
+      ethermal_l.push_back(std::move(thermal_energy));
+    }
+    compare("(etot_dens - ke_dens)", ethermal_l[0].data(), ethermal_l[1].data());
+  }
+#ifdef DE
+  compare("ethermal_dens", nullptr, nullptr);
+#endif
+}
+
+// Owns the buffers that hold the properties of the particles used by a test.
+// make sure this lives for the lifetime of the test where the data gets used!
+struct TestParticleData {
+ private:  // attributes
+  // when cuda_utilities::DeviceVector is updated so that it can be moved in the future, it won't
+  // be necessary to wrap particle_ids_ in a unique_ptr.
+  std::unique_ptr<cuda_utilities::DeviceVector<part_int_t>> particle_ids_;
+  // maps the name of each floating-point property (e.g. "pos_x", "mass") to its buffer
+  std::map<std::string, cuda_utilities::DeviceVector<Real>> general_data_;
+
+ public:
+  /* \param pos_vec the position of each particle (there is 1 particle per entry). The id of each
+   *     particle is its index in this vector.
+   * \param other_props optional values that are assigned to every particle. The recognized keys
+   *     are "vel_x", "vel_y", "vel_z", "mass" and "age". A key that is missing falls back to the
+   *     default noted in the constructor body.
+   */
+  TestParticleData(const std::vector<hydro_utilities::VectorXYZ<Real>>& pos_vec,
+                   const std::map<std::string, Real>& other_props)
+      : particle_ids_(nullptr)
+  {
+    const std::size_t count = pos_vec.size();
+
+    std::vector<std::string> array_names = {"pos_x", "pos_y", "pos_z", "vel_x", "vel_y", "vel_z", "mass", "age"};
+
+    // these values are shared by every particle, so each one is looked up exactly once (rather than
+    // once per particle)
+    const std::map<std::string, Real> uniform_vals = {
+        {"vel_x", try_get_(other_props, "vel_x").value_or(0.0)},
+        {"vel_y", try_get_(other_props, "vel_y").value_or(0.0)},
+        {"vel_z", try_get_(other_props, "vel_z").value_or(0.0)},
+        {"mass", try_get_(other_props, "mass").value_or(1e3)},  // defaults to 1e3 solar masses
+        {"age", try_get_(other_props, "age").value_or(-1e4)},   // defaults to -10 kyr (recall this is
+                                                                // really the formation time)
+    };
+
+    // initialize host copy of each vector
+    std::vector<part_int_t> host_particle_ids(count);
+    std::map<std::string, std::vector<Real>> host_data{};
+    for (const std::string& name : array_names) {
+      host_data.emplace(name, count);
+    }
+
+    // now fill in the local vectors, starting with the properties that are uniform
+    for (const auto& kv_pair : uniform_vals) {
+      host_data.at(kv_pair.first).assign(count, kv_pair.second);
+    }
+
+    // the ids and positions are the only properties that vary from particle to particle
+    std::vector<Real>& host_pos_x = host_data.at("pos_x");
+    std::vector<Real>& host_pos_y = host_data.at("pos_y");
+    std::vector<Real>& host_pos_z = host_data.at("pos_z");
+    for (std::size_t i = 0; i < count; i++) {
+      host_particle_ids[i] = part_int_t(i);
+
+      host_pos_x[i] = pos_vec[i][0];
+      host_pos_y[i] = pos_vec[i][1];
+      host_pos_z[i] = pos_vec[i][2];
+    }
+
+    // now copy host vector contents to the device
+    this->particle_ids_ = std::make_unique<cuda_utilities::DeviceVector<part_int_t>>(pos_vec.size(), false);
+    this->particle_ids_->cpyHostToDevice(host_particle_ids);
+    for (const std::string& name : array_names) {
+      this->general_data_.emplace(name, count);
+      this->general_data_.at(name).cpyHostToDevice(host_data.at(name));
+    }
+
+    // now host-vectors are automatically deallocated
+  }
+
+  TestParticleData(TestParticleData&&)            = default;
+  TestParticleData& operator=(TestParticleData&&) = default;
+
+  // the total number of particles held by this object
+  part_int_t num_particles() { return part_int_t(particle_ids_->size()); }
+
+  // packages pointers to the buffers of every particle property into a ParticleProps instance
+  feedback_details::ParticleProps particle_props()
+  {
+    return {
+        num_particles(),  // number of local particles
+        particle_ids_->data(),
+        general_data_.at("pos_x").data(),
+        general_data_.at("pos_y").data(),
+        general_data_.at("pos_z").data(),
+        general_data_.at("vel_x").data(),
+        general_data_.at("vel_y").data(),
+        general_data_.at("vel_z").data(),
+        general_data_.at("mass").data(),
+        general_data_.at("age").data(),
+    };
+  }
+
+  // just like particle_props, but the result describes only the particle at `index` (each pointer
+  // is offset by `index` and the particle count is set to 1)
+  feedback_details::ParticleProps props_of_single_particle(int index)
+  {
+    CHOLLA_ASSERT((index >= 0) and (index < this->num_particles()), "Invalid Particle Index was specified!");
+    return {
+        1,  // number of local particles
+        particle_ids_->data() + index,
+        general_data_.at("pos_x").data() + index,
+        general_data_.at("pos_y").data() + index,
+        general_data_.at("pos_z").data() + index,
+        general_data_.at("vel_x").data() + index,
+        general_data_.at("vel_y").data() + index,
+        general_data_.at("vel_z").data() + index,
+        general_data_.at("mass").data() + index,
+        general_data_.at("age").data() + index,
+    };
+  }
+
+  // returns a host-side copy of the particle ids
+  std::vector<part_int_t> host_copy_particle_ids()
+  {
+    std::vector<part_int_t> out(this->num_particles());
+    this->particle_ids_->cpyDeviceToHost(out);
+    return out;
+  }
+
+  // returns host-side copies of every floating-point property (keyed by the property's name)
+  std::map<std::string, std::vector<Real>> host_copy_general()
+  {
+    const std::size_t particle_count = this->num_particles();
+    std::map<std::string, std::vector<Real>> out;
+    for (auto& kv_pair : this->general_data_) {
+      const std::string& key                  = kv_pair.first;
+      cuda_utilities::DeviceVector<Real>& vec = kv_pair.second;
+
+      out.emplace(key, particle_count);
+      vec.cpyDeviceToHost(out.at(key));
+    }
+    return out;
+  }
+
+  // prints the ids and every other property of the particles (a debugging aid)
+  // this is inefficient! But should get the job done!
+  void print_debug_info()
+  {
+    auto print_fn = [](const std::string& name, auto& host_vec) {
+      std::string arr_str = array_to_string(host_vec.data(), int(host_vec.size()));
+      printf("%s: %s\n", name.c_str(), arr_str.c_str());
+    };
+
+    std::vector<part_int_t> particle_ids = this->host_copy_particle_ids();
+    print_fn("ids", particle_ids);
+    std::map<std::string, std::vector<Real>> general_data = this->host_copy_general();
+    for (auto& kv_pair : general_data) {
+      print_fn(kv_pair.first, kv_pair.second);
+    }
+  }
+};
+
+// Bundles together everything that run_full_feedback_ hands back to a test
+struct FeedbackResults {
+  TestFieldData test_field_data;        // the gas fields, after feedback was applied
+  TestParticleData test_particle_data;  // the particle properties, after feedback was applied
+  std::vector<Real> info;               // summary statistics (indexed by the entries of FBInfoLUT)
+};
+
+/* Constructs uniform gas fields and a set of star particles, executes the feedback kernel for the
+ * specified Prescription (every particle is assigned 1 supernova), and returns the resulting
+ * fields, particle properties and summary statistics.
+ *
+ * \param n_ghost number of ghost zones
+ * \param prop_l properties of the active zone along each of the 3 axes
+ * \param particle_pos_vec the position of each star particle
+ * \param ov_strat strategy handed to the OverlapScheduler
+ * \param separate_launch_per_particle when true, the kernel is executed once per particle (the
+ *     summary statistics of the launches are summed). Otherwise, it is executed a single time
+ * \param bdry_strat boundary strategy that is forwarded to the kernel
+ * \param maybe_init_density, maybe_init_internal_edens, maybe_bulk_vel optional overrides of the
+ *     initial gas state. The bulk velocity is also used as the velocity of every particle
+ */
+template <typename Prescription = fb_prescription::CiCResolvedSNPrescription>
+FeedbackResults run_full_feedback_(const int n_ghost, const std::vector<AxProps>& prop_l,
+                                   const std::vector<hydro_utilities::VectorXYZ<Real>>& particle_pos_vec,
+                                   feedback_details::OverlapStrat ov_strat, bool separate_launch_per_particle,
+                                   feedback_details::BoundaryStrategy bdry_strat =
+                                       feedback_details::BoundaryStrategy::excludeGhostParticle_ignoreStencilIssues,
+                                   const std::optional<Real> maybe_init_density        = std::optional<Real>(),
+                                   const std::optional<Real> maybe_init_internal_edens = std::optional<Real>(),
+                                   const std::optional<hydro_utilities::VectorXYZ<Real>> maybe_bulk_vel =
+                                       std::optional<hydro_utilities::VectorXYZ<Real>>())
+{
+  const std::size_t num_particles = particle_pos_vec.size();
+
+  // resolve the optional test-specific field/particle values
+  const Real init_density        = maybe_init_density.value_or(1482737.17012665);  // should be 0.1 particles per cc
+  const Real init_internal_edens = maybe_init_internal_edens.value_or(0.00021335 * 1.5);
+  const hydro_utilities::VectorXYZ<Real> bulk_vel =
+      maybe_bulk_vel.value_or(hydro_utilities::VectorXYZ<Real>{0.0, 0.0, 0.0});
+
+  feedback_details::FieldSpatialProps spatial_props{
+      // left-edges of active zone:
+      prop_l[0].min,
+      prop_l[1].min,
+      prop_l[2].min,
+      // right-edges of active zone:
+      prop_l[0].min + prop_l[0].cell_width * prop_l[0].num_cells,
+      prop_l[1].min + prop_l[1].cell_width * prop_l[1].num_cells,
+      prop_l[2].min + prop_l[2].cell_width * prop_l[2].num_cells,
+      // cell_widths
+      prop_l[0].cell_width,
+      prop_l[1].cell_width,
+      prop_l[2].cell_width,
+      // cells along each axis (including ghost zone)
+      prop_l[0].num_cells + 2 * n_ghost,  // cells along x (with ghosts)
+      prop_l[1].num_cells + 2 * n_ghost,  // cells along y (with ghosts)
+      prop_l[2].num_cells + 2 * n_ghost,  // cells along z (with ghosts)
+      // number of ghost zones:
+      n_ghost,
+  };
+
+  // set up the gas fields (uniformly initialized)
+  const Extent3D full_extent{spatial_props.nx_g, spatial_props.ny_g, spatial_props.nz_g};
+  TestFieldData test_field_data(full_extent, {{"density", init_density},
+                                              {"velocity_x", bulk_vel[0]},
+                                              {"velocity_y", bulk_vel[1]},
+                                              {"velocity_z", bulk_vel[2]},
+                                              {"thermal_edens", init_internal_edens}});
+
+  // set up the particles (they move with the bulk velocity of the gas)
+  TestParticleData test_particle_data(particle_pos_vec,
+                                      {{"vel_x", bulk_vel[0]}, {"vel_y", bulk_vel[1]}, {"vel_z", bulk_vel[2]}});
+
+  // buffer holding the number of supernovae per particle in the current cycle (initialized to 0)
+  cuda_utilities::DeviceVector<int> d_num_SN(particle_pos_vec.size(), true);
+  {
+    // overwrite the buffer so that there is one SN per particle
+    std::vector<int> one_SN_per_particle(num_particles, 1);
+    d_num_SN.cpyHostToDevice(one_SN_per_particle);
+  }
+
+  // holds the summary-info (the counters must start out at 0)
+  std::vector<Real> info(FBInfoLUT::LEN, 0.0);
+
+  // give some dummy vals:
+  const feedback_details::CycleProps cycle_props{0.0,  // current time
+                                                 0.1,  // length of current timestep
+                                                 1};   // the current cycle-number
+
+  feedback_details::OverlapScheduler ov_scheduler(ov_strat, spatial_props.nx_g, spatial_props.ny_g, spatial_props.nz_g);
+
+  if (not separate_launch_per_particle) {
+    // handle every particle with a single launch
+    feedback_details::Exec_Cluster_Feedback_Kernel<Prescription>(test_particle_data.particle_props(), spatial_props,
+                                                                 cycle_props, info.data(), test_field_data.dev_ptr(),
+                                                                 d_num_SN.data(), ov_scheduler, bdry_strat);
+  } else {
+    // use a separate launch for each particle & accumulate the summary-info of every launch
+    for (std::size_t i = 0; i < num_particles; i++) {
+      std::array<Real, FBInfoLUT::LEN> info_tmp;
+      info_tmp.fill(0.0);
+
+      feedback_details::Exec_Cluster_Feedback_Kernel<Prescription>(
+          test_particle_data.props_of_single_particle(int(i)), spatial_props, cycle_props, info_tmp.data(),
+          test_field_data.dev_ptr(), d_num_SN.data() + i, ov_scheduler, bdry_strat);
+
+      for (std::size_t j = 0; j < info_tmp.size(); j++) {
+        info[j] += info_tmp[j];
+      }
+    }
+  }
+
+  return FeedbackResults{std::move(test_field_data), std::move(test_particle_data), std::move(info)};
+}
+
+}  // anonymous namespace
+
+// returns true when val has no fractional part (i.e. truncation leaves the value unchanged)
+bool is_integer_(Real val)
+{
+  const Real truncated = std::trunc(val);
+  return truncated == val;
+}
+
+// Performs sanity checks on a vector of summary statistics (indexed by the entries of FBInfoLUT):
+// - the vector must hold exactly FBInfoLUT::LEN entries and none of them may be negative
+// - the 3 supernova counters must hold integer values
+// - the total count must equal the sum of the resolved and unresolved counts
+// The first violation is reported as a fatal GoogleTest failure (the remaining checks are skipped).
+void basic_infosummary_checks_(const std::vector<Real>& info)
+{
+  ASSERT_EQ(info.size(), FBInfoLUT::LEN);
+
+  // we may need to revisit the following if we ever add more summary-stats
+  for (int i = 0; i < FBInfoLUT::LEN; i++) {
+    ASSERT_GE(info[i], 0.0);
+  }
+
+  ASSERT_TRUE(is_integer_(info[FBInfoLUT::countSN]));
+  ASSERT_TRUE(is_integer_(info[FBInfoLUT::countResolved]));
+  ASSERT_TRUE(is_integer_(info[FBInfoLUT::countUnresolved]));
+  ASSERT_EQ(info[FBInfoLUT::countSN], info[FBInfoLUT::countResolved] + info[FBInfoLUT::countUnresolved]);
+}
+
+// Runs the basic sanity checks on both vectors of summary statistics and then requires that any
+// entry holding an integer value in both `actual` and `ref` is exactly equal in the two vectors
+void check_infosummary_int_equality_(const std::vector<Real>& actual, const std::vector<Real>& ref)
+{
+  basic_infosummary_checks_(actual);
+  basic_infosummary_checks_(ref);
+
+  // reports whether val holds an integer value whose magnitude doesn't exceed the threshold chosen
+  // for the width of Real
+  auto holds_lossless_integer = [](Real val) -> bool {
+    // in this case Real is a 32 bit float and may encode an integer that can't
+    // be losslessly represented
+    if ((sizeof(Real) == 4) and (fabs(val) > 16777217)) return false;
+
+    // in this case Real is a 64 bit float and may encode an integer that can't
+    // be losslessly represented
+    if ((sizeof(Real) == 8) and (fabs(val) > 9007199254740992)) return false;
+
+    return std::trunc(val) == val;
+  };
+
+  for (int i = 0; i < FBInfoLUT::LEN; i++) {
+    // entries are only compared when both of them hold an integer
+    if (not holds_lossless_integer(actual[i])) continue;
+    if (not holds_lossless_integer(ref[i])) continue;
+
+    ASSERT_EQ(actual[i], ref[i]);
+  }
+}
+
+// Requires that 2 sets of feedback results exactly match (i.e. with tolerances of 0). This covers
+// the particle ids, every other particle property, the gas fields, and the integer-valued summary
+// statistics.
+void check_equality(FeedbackResults& rslt_actual, FeedbackResults& rslt_ref)
+{
+  const int num_particles = int(rslt_ref.test_particle_data.num_particles());
+
+  // each particle property is treated as a 1D array with an entry for each particle
+  const Extent3D particle_extent{num_particles, 1, 1};
+
+  // compare the particle ids
+  std::vector<part_int_t> actual_particle_ids = rslt_actual.test_particle_data.host_copy_particle_ids();
+  std::vector<part_int_t> ref_particle_ids    = rslt_ref.test_particle_data.host_copy_particle_ids();
+  assert_allclose(actual_particle_ids.data(), ref_particle_ids.data(), particle_extent, 0.0, 0.0, false,
+                  "problem comparing the particle_ids");
+
+  // compare all of the other particle properties
+  std::map<std::string, std::vector<Real>> actual_data = rslt_actual.test_particle_data.host_copy_general();
+  std::map<std::string, std::vector<Real>> ref_data    = rslt_ref.test_particle_data.host_copy_general();
+  for (auto& kv_pair : actual_data) {
+    const std::string& key        = kv_pair.first;
+    std::vector<Real>& actual_vec = kv_pair.second;
+    std::vector<Real>& ref_vec    = ref_data.at(key);
+
+    const std::string err_msg = "problem comparing the particle property: " + key;
+    assert_allclose(actual_vec.data(), ref_vec.data(), particle_extent, 0.0, 0.0, false, err_msg);
+  }
+
+  // compare the gas fields
+  assert_fielddata_allclose(rslt_actual.test_field_data, rslt_ref.test_field_data, false, 0.0, 0.0);
+
+  // perform a check of the info-summary-statistics
+  check_infosummary_int_equality_(rslt_actual.info, rslt_ref.info);
+}
+
+// Typed-test fixture whose only job is to carry the feedback prescription under test. A test body
+// retrieves the type through `TestFixture::PrescriptionT`.
+template <typename T>
+class tALLFeedbackFull : public testing::Test  // NOLINT(readability-identifier-naming)
+{
+ public:
+  using PrescriptionT = T;
+};
+
+// the list of prescription types that each TYPED_TEST of the tALLFeedbackFull suite is instantiated with
+using MyPrescriptionTypes = ::testing::Types<fb_prescription::CiCResolvedSNPrescription,
+                                             fb_prescription::CiCLegacyResolvedAndUnresolvedPrescription>;
+TYPED_TEST_SUITE(tALLFeedbackFull, MyPrescriptionTypes);
+
+// This test confirms that we get identical results when we inject feedback for a collection of
+// supernovae, that all sit directly on top of each other, in 2 different ways:
+// - we launch a separate kernel for each supernova
+// - we use the OverlapStrat functionality to launch a single kernel, and within that kernel the
+//   feedback of the supernovae is applied sequentially
+TYPED_TEST(tALLFeedbackFull, CheckingOverlapStrat)
+{
+  using Prescription = typename TestFixture::PrescriptionT;
+
+  // describe the grid: 5 cells along each axis and no ghost zones
+  const Real dx                        = 1.0 / 256.0;
+  const int n_ghost                    = 0;
+  const std::vector<AxProps> ax_prop_l = {{5, 0.0, dx}, {5, 0.0, dx}, {5, 0.0, dx}};
+
+  // initialize 50 star particles directly atop each other
+  const std::vector<hydro_utilities::VectorXYZ<Real>> particle_pos_vec(50, {2.4 * dx, 2.4 * dx, 2.4 * dx});
+
+  // both runs share the same boundary strategy
+  const feedback_details::BoundaryStrategy bdry_strat =
+      feedback_details::BoundaryStrategy::excludeGhostParticle_ignoreStencilIssues;
+
+  // Get the reference answer - here a separate kernel is launched (one after another) to handle
+  // the feedback of each individual particle
+  FeedbackResults rslt_ref = run_full_feedback_<Prescription>(
+      n_ghost, ax_prop_l, particle_pos_vec, feedback_details::OverlapStrat::ignore, true, bdry_strat);
+
+  // Get the answer that is being checked - here all of the particles are handled by 1 kernel launch
+  FeedbackResults rslt_actual = run_full_feedback_<Prescription>(
+      n_ghost, ax_prop_l, particle_pos_vec, feedback_details::OverlapStrat::sequential, false, bdry_strat);
+
+  /*
+    printf("\nLooking at the OverlapStrat::ignore approach:\n");
+    rslt_ref.test_field_data.print_debug_info();
+    rslt_ref.test_particle_data.print_debug_info();
+    printf("\nLooking at the OverlapStrat::sequential approach:\n");
+    rslt_actual.test_field_data.print_debug_info();
+    rslt_actual.test_particle_data.print_debug_info();
+  */
+
+  check_equality(rslt_actual, rslt_ref);
+
+  // every particle should have contributed exactly 1 supernova
+  ASSERT_EQ(rslt_actual.info[FBInfoLUT::countSN], particle_pos_vec.size());
+}
+
+// Holds the totals computed by calc_inject_summary_. Each member is a sum over all cells of the
+// difference between a cell's current value and the initial uniform value, multiplied by the cell volume
+struct InjectSummary {
+  Real mass;            // from the excess density
+  Real net_mom_x;       // from the x-component of the excess momentum density
+  Real net_mom_y;       // from the y-component of the excess momentum density
+  Real net_mom_z;       // from the z-component of the excess momentum density
+  Real abs_mom_mag;     // from the magnitude of the excess momentum density vector
+  Real thermal_energy;  // from the excess of (total energy density - kinetic energy density)
+};
+
+/* Computes how much of each quantity has been injected, as measured in the specified reference
+ * frame (ref_frame_vel is measured with respect to the original reference frame of the
+ * simulation).
+ *
+ * The injected amounts are defined relative to the initial uniform state described by
+ * init_density, init_internal_edens and init_bulk_vel. Each total is the sum over every cell of
+ * the excess value, multiplied by cell_vol.
+ */
+InjectSummary calc_inject_summary_(TestFieldData& field_data, Real init_density, Real init_internal_edens,
+                                   hydro_utilities::VectorXYZ<Real> init_bulk_vel, Real cell_vol,
+                                   hydro_utilities::VectorXYZ<Real> ref_frame_vel = {0.0, 0.0, 0.0})
+{
+  const Extent3D extent               = field_data.single_field_extent();
+  const std::size_t single_field_size = extent.nx * extent.ny * extent.nz;
+
+  // fetch a host copy of the fields (shifting them to the requested frame, when necessary)
+  const bool frame_is_unchanged =
+      (ref_frame_vel[0] == 0.0) and (ref_frame_vel[1] == 0.0) and (ref_frame_vel[2] == 0.0);
+  const std::vector<Real> host_fields =
+      frame_is_unchanged ? field_data.host_copy() : field_data.change_ref_frame(ref_frame_vel).host_copy();
+
+  // the initial bulk velocity, as measured in the requested frame
+  hydro_utilities::VectorXYZ<Real> bulk_vel{init_bulk_vel[0] - ref_frame_vel[0], init_bulk_vel[1] - ref_frame_vel[1],
+                                            init_bulk_vel[2] - ref_frame_vel[2]};
+
+  // pointers to the start of each field
+  const Real* dens_field   = host_fields.data() + single_field_size * grid_enum::density;
+  const Real* mom_x_field  = host_fields.data() + single_field_size * grid_enum::momentum_x;
+  const Real* mom_y_field  = host_fields.data() + single_field_size * grid_enum::momentum_y;
+  const Real* mom_z_field  = host_fields.data() + single_field_size * grid_enum::momentum_z;
+  const Real* etot_field   = host_fields.data() + single_field_size * grid_enum::Energy;
+
+  InjectSummary out{0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+  for (std::size_t i = 0; i < single_field_size; i++) {
+    const Real dens      = dens_field[i];
+    const Real mom_x     = mom_x_field[i];
+    const Real mom_y     = mom_y_field[i];
+    const Real mom_z     = mom_z_field[i];
+    const Real tot_edens = etot_field[i];
+
+    const Real ke_dens        = 0.5 * (mom_x * mom_x + mom_y * mom_y + mom_z * mom_z) / dens;
+    const Real thermal_energy = tot_edens - ke_dens;
+
+    // compute how different the momentum density is from the initial value
+    const Real excess_mom_x = (mom_x - init_density * bulk_vel[0]);
+    const Real excess_mom_y = (mom_y - init_density * bulk_vel[1]);
+    const Real excess_mom_z = (mom_z - init_density * bulk_vel[2]);
+
+    out.mass += (dens - init_density);
+    out.net_mom_x += excess_mom_x;
+    out.net_mom_y += excess_mom_y;
+    out.net_mom_z += excess_mom_z;
+    out.abs_mom_mag +=
+        sqrt((excess_mom_x * excess_mom_x) + (excess_mom_y * excess_mom_y) + (excess_mom_z * excess_mom_z));
+    out.thermal_energy += (thermal_energy - init_internal_edens);
+  }
+
+  // convert the summed densities into totals
+  out.mass *= cell_vol;
+  out.net_mom_x *= cell_vol;
+  out.net_mom_y *= cell_vol;
+  out.net_mom_z *= cell_vol;
+  out.abs_mom_mag *= cell_vol;
+  out.thermal_energy *= cell_vol;
+
+  return out;
+}
+
+/*! Confirms that the info vector records exactly 1 SNe event of the expected kind
+ *
+ * \param info the summary statistics (indexed by the entries of FBInfoLUT)
+ * \param resolved whether the single recorded event is expected to be resolved (true) or
+ *     unresolved (false)
+ * \returns an AssertionSuccess when the expectation holds. Otherwise, returns an AssertionFailure
+ *     carrying a message that describes the first problem that was encountered.
+ */
+testing::AssertionResult check_encodes_single_expected_event(const std::vector<Real>& info, bool resolved)
+{
+  const Real n_resolved   = info[FBInfoLUT::countResolved];
+  const Real n_unresolved = info[FBInfoLUT::countUnresolved];
+
+  if ((n_resolved + n_unresolved) != info[FBInfoLUT::countSN]) {
+    return testing::AssertionFailure() << "failed sanity check: the summary statistics recorded at FBInfoLUT::countSN, "
+                                       << "FBInfoLUT::countResolved, and FBInfoLUT::countUnresolved are inconsistent";
+  }
+
+  // this branch is triggered when no SNe occurred, as well as when more than 1 SNe occurred
+  if (info[FBInfoLUT::countSN] != 1) {
+    return testing::AssertionFailure() << "failed sanity check: summary stats indicate that the number of SNe that "
+                                       << "occurred is not exactly 1";
+  }
+
+  if ((n_unresolved != 0) and resolved) {
+    return testing::AssertionFailure() << "an unresolved SNe was recorded, we expect to be testing resolved feedback";
+  }
+
+  if ((n_resolved != 0) and not resolved) {
+    return testing::AssertionFailure() << "a resolved SNe was recorded, we expect to be testing unresolved feedback";
+  }
+
+  return testing::AssertionSuccess();
+}
+
+// in this test, we look into the actual injected amounts (for a single SNe from 1 star particle)!
+// - for resolved feedback, we check the total injected mass & thermal energy (& that no momentum is injected)
+// - for unresolved feedback, we check the total injected mass & momentum (& that no thermal energy is injected)
+template <typename Prescription>
+void test_injection_magnitudes_(bool resolved, const hydro_utilities::VectorXYZ<Real>& bulk_vel)
+{
+  const int n_ghost                    = 0;
+  const Real dx                        = 1.0 / 256.0;
+  const std::vector<AxProps> ax_prop_l = {{5, 0.0, dx}, {5, 0.0, dx}, {5, 0.0, dx}};
+
+  // initialize 1 star particle
+  const std::vector<hydro_utilities::VectorXYZ<Real>> particle_pos_vec(1, {2.4 * dx, 2.4 * dx, 2.4 * dx});
+
+  const Real density = (resolved) ? 1e7 : 1e9;  // solar-masses per kpc**3
+
+  // default thermal energy density should correspond to pressure of 1e4 K / cm**3 (for a gamma of 5/3)
+  const Real internal_edens = 0.0021335 * 1.5;
+
+  // launch the feedback
+  FeedbackResults rslt = run_full_feedback_<Prescription>(
+      n_ghost, ax_prop_l, particle_pos_vec, feedback_details::OverlapStrat::ignore, true,
+      feedback_details::BoundaryStrategy::excludeGhostParticle_ignoreStencilIssues, {density}, {internal_edens},
+      {bulk_vel});
+
+  // this is mostly just a sanity check to make sure that this test-case gets updated if the
+  // criteria for switching between resolved and unresolved feedback changes significantly
+  // (this has definitely happened in the past!)
+  ASSERT_TRUE(check_encodes_single_expected_event(rslt.info, resolved));
+
+  // compute the summary properties in the reference frame of the particle that
+  // underwent the feedback process
+  // - in the current implementation of this test-logic, we chose to make the particle
+  //   velocity match the bulk_vel (the bulk velocity of the gas)
+  const hydro_utilities::VectorXYZ<Real> ref_frame_vel = bulk_vel;
+  InjectSummary summary =
+      calc_inject_summary_(rslt.test_field_data, density, internal_edens, bulk_vel, dx * dx * dx, ref_frame_vel);
+
+  EXPECT_NEAR(feedback::MASS_PER_SN, summary.mass, 3.6e-15 * feedback::MASS_PER_SN)
+      << "The injected mass doesn't match the hard-coded constant";
+
+  if (rslt.info[FBInfoLUT::countResolved] > 0) {
+    Real rtol = 0.0;  // i.e. the energy checks below demand exact equality
+    Real atol = 0.0;
+    if ((bulk_vel[0] != 0) or (bulk_vel[1] != 0) or (bulk_vel[2] != 0)) {
+      // we need a tolerance in this case because the injection logic had to
+      // slightly alter the momentum values since we injected mass
+      atol = 2e-19;
+    }
+
+    EXPECT_NEAR(0.0, summary.net_mom_x, atol);
+    EXPECT_NEAR(0.0, summary.net_mom_y, atol);
+    EXPECT_NEAR(0.0, summary.net_mom_z, atol);
+    EXPECT_NEAR(0.0, summary.abs_mom_mag, atol);
+    EXPECT_NEAR(feedback::ENERGY_PER_SN, summary.thermal_energy, rtol * feedback::ENERGY_PER_SN);
+
+    // sanity check!
+    EXPECT_NEAR(feedback::ENERGY_PER_SN, rslt.info[FBInfoLUT::totalEnergy], rtol * feedback::ENERGY_PER_SN);
+  } else {
+    // NOTE: feedback::FINAL_MOMENTUM does NOT directly specify the injected radial-momentum
+    //       (the radial momentum also depends on the local conditions)
+
+    Real rtol = 3e-16;
+    Real atol = 9e-18;
+    if ((bulk_vel[0] != 0) or (bulk_vel[1] != 0) or (bulk_vel[2] != 0)) {
+      atol = 5e-17;
+    }
+
+    EXPECT_NEAR(0.0, summary.net_mom_x, atol);
+    EXPECT_NEAR(0.0, summary.net_mom_y, atol);
+    EXPECT_NEAR(0.0, summary.net_mom_z, atol);
+    // for thermal energy, we theoretically need tolerance since we added and subtracted
+    // kinetic energy
+    EXPECT_NEAR(0.0, summary.thermal_energy, atol);
+    EXPECT_NEAR(rslt.info[FBInfoLUT::totalMomentum], summary.abs_mom_mag, rtol * rslt.info[FBInfoLUT::totalMomentum]);
+
+    // sanity checks!
+    EXPECT_EQ(rslt.info[FBInfoLUT::totalUnresEnergy], 0.0);
+  }
+}
+
+TYPED_TEST(tALLFeedbackFull, InjectionMagnitudesResolved)
+{
+  using Prescription = typename TestFixture::PrescriptionT;
+  if (not Prescription::has_resolved_prescription) return;  // nothing to test for this prescription
+  const hydro_utilities::VectorXYZ<Real> no_bulk_vel{0.0, 0.0, 0.0};
+  test_injection_magnitudes_<Prescription>(true, no_bulk_vel);
+}
+
+TYPED_TEST(tALLFeedbackFull, InjectionMagnitudesResolvedBulkV)
+{
+  using Prescription = typename TestFixture::PrescriptionT;
+  if (not Prescription::has_resolved_prescription) return;  // nothing to test for this prescription
+
+  const hydro_utilities::VectorXYZ<Real> gas_vel = {0.000205, 0.0, 0.0};  // roughly 200 km/s
+  test_injection_magnitudes_<Prescription>(true, gas_vel);
+}
+
+TYPED_TEST(tALLFeedbackFull, InjectionMagnitudesUnresolved)
+{
+  using Prescription = typename TestFixture::PrescriptionT;
+  if (not Prescription::has_unresolved_prescription) return;  // nothing to test for this prescription
+  const hydro_utilities::VectorXYZ<Real> no_bulk_vel{0.0, 0.0, 0.0};
+  test_injection_magnitudes_<Prescription>(false, no_bulk_vel);
+}
+
+TYPED_TEST(tALLFeedbackFull, InjectionMagnitudesUnresolvedBulkV)
+{
+  using Prescription = typename TestFixture::PrescriptionT;
+  if (not Prescription::has_unresolved_prescription) return;  // nothing to test for this prescription
+
+  const hydro_utilities::VectorXYZ<Real> gas_vel = {0.000205, 0.0, 0.0};  // roughly 200 km/s
+  test_injection_magnitudes_<Prescription>(false, gas_vel);
+}
+
+// in this test, we run a case without bulk-velocity and then we run a case with
+// bulk velocity. Then, we check consistency between the 2 runs!
+//
+// - at this point, this may seem a little redundant with some of the other tests,
+//   but this test does test some extra properties.
+// - while the other tests implicitly check the consistency in the total magnitude
+//   of the prescriptions, this checks consistency in how the injection is
+//   distributed
+TYPED_TEST(tALLFeedbackFull, ComparingFrameInvariance)
+{
+  using Prescription = typename TestFixture::PrescriptionT;
+
+  const int n_ghost                    = 0;
+  const Real dx                        = 1.0 / 256.0;
+  const std::vector<AxProps> ax_prop_l = {{5, 0.0, dx}, {5, 0.0, dx}, {5, 0.0, dx}};
+
+  // initialize a single star particle (the same position is used in both runs)
+  const std::vector<hydro_utilities::VectorXYZ<Real>> particle_pos_vec(1, {2.4 * dx, 2.4 * dx, 2.4 * dx});
+
+  [[maybe_unused]] const Real init_density = 1e9;  // solar-masses per kpc**3
+
+  // Get the reference answer (in the reference frame where there is no bulk velocity)
+  [[maybe_unused]] hydro_utilities::VectorXYZ<Real> bulk_vel_NULLCASE = {0.0, 0.0, 0.0};
+  FeedbackResults rslt_ref                                            = run_full_feedback_<Prescription>(
+      n_ghost, ax_prop_l, particle_pos_vec, feedback_details::OverlapStrat::ignore, true,
+      feedback_details::BoundaryStrategy::excludeGhostParticle_ignoreStencilIssues, {init_density}, {},
+      {bulk_vel_NULLCASE});
+
+  [[maybe_unused]] hydro_utilities::VectorXYZ<Real> bulk_vel_ALT = {0.000205, 0.0, 0.0};  // roughly 200 km/s
+  FeedbackResults rslt_actual                                    = run_full_feedback_<Prescription>(
+      n_ghost, ax_prop_l, particle_pos_vec, feedback_details::OverlapStrat::ignore, true,
+      feedback_details::BoundaryStrategy::excludeGhostParticle_ignoreStencilIssues, {init_density}, {}, {bulk_vel_ALT});
+
+  // shift the reference-frame of the case with the bulk velocity
+  TestFieldData actual_field_shifted = rslt_actual.test_field_data.change_ref_frame(
+      hydro_utilities::VectorXYZ<Real>{1 * bulk_vel_ALT[0], 1 * bulk_vel_ALT[1], 1 * bulk_vel_ALT[2]});
+
+  /*
+    printf("\nLooking at the case without bulk velocity:\n");
+    rslt_ref.test_field_data.print_debug_info();
+    rslt_ref.test_particle_data.print_debug_info();
+    printf("\nLooking at the case with bulk velocity:\n");
+    rslt_actual.test_field_data.print_debug_info();
+    rslt_actual.test_particle_data.print_debug_info();
+    printf("\nLooking at the second case after shifting reference frame:\n");
+    actual_field_shifted.print_debug_info();
+  */
+
+  double rtol = 2.0e-9;  // it would be nice to specify tolerances on a per-field basis
+  double atol = 5e-7;
+  assert_fielddata_allclose(actual_field_shifted, rslt_ref.test_field_data, false, rtol, atol);
+
+  ASSERT_EQ(rslt_actual.info[FBInfoLUT::countSN], particle_pos_vec.size());
+
+  if (Prescription::has_unresolved_prescription) {
+    // this is mostly just a sanity check to make sure that this test-case gets updated if the
+    // criteria for switching between resolved and unresolved feedback changes significantly
+    ASSERT_TRUE(rslt_actual.info[FBInfoLUT::countUnresolved] > 0) << "something is wrong, we expected to be "
+                                                                  << "testing unresolved feedback!";
+  }
+
+  check_infosummary_int_equality_(rslt_actual.info, rslt_ref.info);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Some machinery to help with testing the full feedback functionality
+// where we check our expectations about the total volume enclosed by the stencil
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Encodes a 1D position as an offset measured from one of the 4 outer edges (left/right edge of active/ghost zone)
+struct OuterEdgeOffset {
+  Real offset;       // displacement from the referenced edge
+  bool active_edge;  // true: outer edge of the active zone; false: outer edge of the ghost zone
+  bool left_edge;    // true: left outer edge; false: right outer edge
+
+  static OuterEdgeOffset L_Active(Real offset) { return OuterEdgeOffset(offset, true, true); }
+  static OuterEdgeOffset R_Active(Real offset) { return OuterEdgeOffset(offset, true, false); }
+  static OuterEdgeOffset L_Ghost(Real offset) { return OuterEdgeOffset(offset, false, true); }
+  static OuterEdgeOffset R_Ghost(Real offset) { return OuterEdgeOffset(offset, false, false); }
+
+ private:
+  // only reachable through the named factories above (keeps call-sites self-describing)
+  OuterEdgeOffset(Real offset_val, bool is_active_edge, bool is_left_edge)
+      : offset(offset_val), active_edge(is_active_edge), left_edge(is_left_edge)
+  {
+  }
+};
+
+// NOLINTBEGIN(misc-no-recursion)
+/* Convert a position specified in terms of OuterEdgeOffset vals to a concrete position. The result is in
+ * index units (measured from the left outer edge of the ghost zone) when to_posIndU is true. Otherwise (the
+ * default), it's converted to a position in code units using the cell_width & min of each axis. */
+hydro_utilities::VectorXYZ<Real> offset_to_concrete(hydro_utilities::VectorXYZ<OuterEdgeOffset> pos,
+                                                    const std::vector<AxProps>& ax_prop_l, int n_ghost,
+                                                    bool to_posIndU = false)
+{
+  if (to_posIndU) {
+    const Extent3D full_extent = {ax_prop_l[0].num_cells + 2 * n_ghost,   // x-axis
+                                  ax_prop_l[1].num_cells + 2 * n_ghost,   // y-axis
+                                  ax_prop_l[2].num_cells + 2 * n_ghost};  // z-axis
+    auto fn                    = [n_ghost](OuterEdgeOffset& arg, int full_ax_len) -> Real {
+      if (arg.active_edge and arg.left_edge) {
+        return n_ghost + arg.offset;
+      } else if (arg.active_edge and (not arg.left_edge)) {
+        // the active-zone has (full_ax_len - 2*n_ghost) elements. But the right
+        // edge of the active zone stops at ((full_ax_len - 2*n_ghost) + n_ghost)
+        return arg.offset + (full_ax_len - n_ghost);
+      } else if ((not arg.active_edge) and arg.left_edge) {
+        return arg.offset;
+      } else {  // ((not arg.active_edge) and (not arg.left_edge))
+        return full_ax_len + arg.offset;
+      }
+    };
+
+    return hydro_utilities::VectorXYZ<Real>{fn(pos[0], full_extent.nx), fn(pos[1], full_extent.ny),
+                                            fn(pos[2], full_extent.nz)};
+  } else {
+    hydro_utilities::VectorXYZ<Real> pos_indU = offset_to_concrete(pos, ax_prop_l, n_ghost, true);
+    return hydro_utilities::VectorXYZ<Real>{(pos_indU[0] - n_ghost) * ax_prop_l[0].cell_width + ax_prop_l[0].min,
+                                            (pos_indU[1] - n_ghost) * ax_prop_l[1].cell_width + ax_prop_l[1].min,
+                                            (pos_indU[2] - n_ghost) * ax_prop_l[2].cell_width + ax_prop_l[2].min};
+  }
+}
+// NOLINTEND(misc-no-recursion)
+
+/* Builds the list of all (x,y,z) combinations of values drawn from `target_vals` and `filler_vals`,
+ * skipping every combination that is made up exclusively of entries from `filler_vals`.
+ *
+ * Combinations are ordered as if `filler_vals` were appended to `target_vals` and the 3 components
+ * were looped over with x varying slowest & z varying fastest. */
+template <typename T>
+std::vector<hydro_utilities::VectorXYZ<T>> build_pos_permutation_l_(const std::vector<T>& target_vals,
+                                                                    const std::vector<T>& filler_vals)
+{
+  const std::size_t n_target = target_vals.size();
+  const std::size_t n_vals   = n_target + filler_vals.size();
+
+  // look up a value as if filler_vals were appended to the end of target_vals
+  auto lookup = [&](std::size_t idx) -> T {
+    return (idx < n_target) ? target_vals[idx] : filler_vals[idx - n_target];
+  };
+
+  std::vector<hydro_utilities::VectorXYZ<T>> combos;
+  for (std::size_t ix = 0; ix < n_vals; ix++) {
+    for (std::size_t iy = 0; iy < n_vals; iy++) {
+      for (std::size_t iz = 0; iz < n_vals; iz++) {
+        // keep the combination if at least 1 component comes from target_vals
+        const bool uses_target = (ix < n_target) or (iy < n_target) or (iz < n_target);
+        if (uses_target) {
+          combos.push_back(hydro_utilities::VectorXYZ<T>{lookup(ix), lookup(iy), lookup(iz)});
+        }
+      }
+    }
+  }
+
+  return combos;
+}
+
+enum struct BoundaryRelatedExpectation { no_update, only_active_zone_update, update_with_change_to_ghost };
+
+// Returns whether the modified cells (values != init_density in the 1st field of rslt) match `expectation`.
+// (maybe it would be better to return which kind of update occurred & let the caller compare it to the expectation)
+bool matches_expectation_(FeedbackResults& rslt, Real init_density, int n_ghost, BoundaryRelatedExpectation expectation)
+{
+  std::vector<Real> ref_data = rslt.test_field_data.host_copy();
+
+  Extent3D extent = rslt.test_field_data.single_field_extent();
+
+  int active_zone_end[3] = {extent.nx - n_ghost, extent.ny - n_ghost, extent.nz - n_ghost};
+
+  bool any_update       = false;  // was any cell modified?
+  bool any_ghost_update = false;  // was any cell that lies outside the active zone modified?
+  for (int iz = 0; iz < extent.nz; iz++) {
+    for (int iy = 0; iy < extent.ny; iy++) {
+      for (int ix = 0; ix < extent.nx; ix++) {
+        int ind3D        = ix + extent.nx * (iy + extent.ny * iz);  // ix varies fastest
+        bool is_modified = ref_data[ind3D] != init_density;
+        any_update       = any_update or is_modified;
+
+        bool is_ghost = ((ix < n_ghost) or (ix >= active_zone_end[0]) or (iy < n_ghost) or (iy >= active_zone_end[1]) or
+                         (iz < n_ghost) or (iz >= active_zone_end[2]));
+        any_ghost_update = any_ghost_update or (is_ghost and is_modified);
+      }
+    }
+  }
+
+  switch (expectation) {
+    case BoundaryRelatedExpectation::no_update:
+      return (!any_update) and !any_ghost_update;
+    case BoundaryRelatedExpectation::only_active_zone_update:
+      return any_update and !any_ghost_update;
+    case BoundaryRelatedExpectation::update_with_change_to_ghost:
+      return any_update and any_ghost_update;
+    default:
+      return false;
+  }
+}
+
+// Checks how feedback treats particles located near the domain edges when using the specified boundary strategy
+template <typename Prescription>
+void run_bdry_test_(feedback_details::BoundaryStrategy boundry_strat)
+{
+  const int max_enclosed_neighbors = 1;  // TODO: generalize this!
+
+  const Real dx = 1.0 / 256.0;
+
+  // using the following number of ghost cells guarantees that the feedback stencil won't extend past
+  // the outer edge of the ghost zone, even when a particle is centered in the ghost-zone cell that is
+  // closest to the active zone
+  const int n_ghost = 1 + max_enclosed_neighbors;
+
+  // to guarantee that we can place a particle at the center of a cell and avoid having the stencil
+  // overlap with the ghost zone, we adopt the following minimum number of active zones.
+  const int min_active_zones = 1 + 2 * max_enclosed_neighbors;
+
+  const std::vector<AxProps> ax_prop_l = {
+      {min_active_zones, 0.0, dx}, {min_active_zones + 1, 0.0, dx}, {min_active_zones + 2, 0.0, dx}};
+
+  std::vector<OuterEdgeOffset> good_pos_l = {OuterEdgeOffset::L_Active(max_enclosed_neighbors + 0.5)};
+
+  // particle is outside the ghost zone and the active zone
+  std::vector<OuterEdgeOffset> outside_ghost_and_active = {OuterEdgeOffset::L_Ghost(-0.5),
+                                                           OuterEdgeOffset::R_Ghost(0.5)};
+
+  // particle is inside the ghost-zone (don't worry, the stencil doesn't extend beyond the outer
+  // edge of the ghost zone)
+  std::vector<OuterEdgeOffset> in_ghost = {OuterEdgeOffset::L_Active(-0.5), OuterEdgeOffset::R_Active(0.5)};
+
+  // particle is inside the active-zone, but stencil overlaps with ghost
+  // - we explicitly don't locate these exactly halfway between cells in order to ensure overlap
+  //   in the CiC27-case
+  std::vector<OuterEdgeOffset> ghost_overlap = {OuterEdgeOffset::L_Active(0.49), OuterEdgeOffset::R_Active(-0.49)};
+
+  const Real init_density = 1000.0;  // solar-masses per kpc**3
+
+  // let's check what happens when we consider a case with a point outside of the active and
+  // ghost zones
+  {
+    std::vector<hydro_utilities::VectorXYZ<OuterEdgeOffset>> pos_l =
+        build_pos_permutation_l_(outside_ghost_and_active, good_pos_l);
+
+    for (const hydro_utilities::VectorXYZ<OuterEdgeOffset>& offset_pos : pos_l) {
+      hydro_utilities::VectorXYZ<Real> cur_pos = offset_to_concrete(offset_pos, ax_prop_l, n_ghost);
+      // printf("%f, %f, %f\n", cur_pos[0],cur_pos[1],cur_pos[2]);
+
+      FeedbackResults rslt = run_full_feedback_<Prescription>(
+          n_ghost, ax_prop_l, {cur_pos}, feedback_details::OverlapStrat::ignore, true, boundry_strat, {init_density});
+      ASSERT_EQ(rslt.info[FBInfoLUT::countSN], 0);
+      if (not matches_expectation_(rslt, init_density, n_ghost, BoundaryRelatedExpectation::no_update)) {
+        FAIL() << "When a particle is placed outside of the active and ghost zones, feedback from "
+               << "it should not be applied";
+      }
+    }
+  }
+
+  // confirm that when we place a particle outside the active zone (but in the ghost zone) that
+  // feedback is skipped
+  {
+    std::vector<hydro_utilities::VectorXYZ<OuterEdgeOffset>> pos_l = build_pos_permutation_l_(in_ghost, good_pos_l);
+
+    for (const hydro_utilities::VectorXYZ<OuterEdgeOffset>& offset_pos : pos_l) {
+      hydro_utilities::VectorXYZ<Real> cur_pos = offset_to_concrete(offset_pos, ax_prop_l, n_ghost);
+      // printf("%f, %f, %f\n", cur_pos[0],cur_pos[1],cur_pos[2]);
+
+      FeedbackResults rslt = run_full_feedback_<Prescription>(
+          n_ghost, ax_prop_l, {cur_pos}, feedback_details::OverlapStrat::ignore, true, boundry_strat, {init_density});
+      ASSERT_EQ(rslt.info[FBInfoLUT::countSN], 0);
+      if (not matches_expectation_(rslt, init_density, n_ghost, BoundaryRelatedExpectation::no_update)) {
+        FAIL() << "When a particle is placed outside of the active zone (but in the ghost zone), "
+               << "feedback from it should NOT be applied";
+      }
+    }
+  }
+
+  // printf("\n");
+
+  // confirm the behavior when we place a particle inside the active zone with a stencil that overlaps
+  // with the ghost zone.
+  {
+    std::vector<hydro_utilities::VectorXYZ<OuterEdgeOffset>> pos_l =
+        build_pos_permutation_l_(ghost_overlap, good_pos_l);
+
+    BoundaryRelatedExpectation expectation;
+    std::string explanation;
+    if (boundry_strat == feedback_details::BoundaryStrategy::excludeGhostParticle_ignoreStencilIssues) {
+      expectation = BoundaryRelatedExpectation::update_with_change_to_ghost;
+      explanation =
+          ("For BoundaryStrategy::excludeGhostParticle_ignoreStencilIssues, when a particle is "
+           "placed in the active-zone with a stencil overlapping with the ghost-zone, we "
+           "expect the relevant locations in the ghost zone to be appropriately updated ");
+    } else if (boundry_strat == feedback_details::BoundaryStrategy::excludeGhostParticle_snapActiveStencil) {
+      expectation = BoundaryRelatedExpectation::only_active_zone_update;
+      explanation =
+          ("For BoundaryStrategy::excludeGhostParticle_snapActiveStencil, when a particle is "
+           "placed in the active-zone with a stencil overlapping with the ghost-zone, we "
+           "expect the particle position is temporarily changed so that no ghost-zone values are "
+           "modified.");
+    } else {
+      CHOLLA_ERROR("This test doesn't know how to handle the specified bdry_strat.");
+    }
+
+    for (const hydro_utilities::VectorXYZ<OuterEdgeOffset>& offset_pos : pos_l) {
+      hydro_utilities::VectorXYZ<Real> cur_pos = offset_to_concrete(offset_pos, ax_prop_l, n_ghost);
+      // printf("%f, %f, %f\n", cur_pos[0],cur_pos[1],cur_pos[2]);
+
+      FeedbackResults rslt = run_full_feedback_<Prescription>(
+          n_ghost, ax_prop_l, {cur_pos}, feedback_details::OverlapStrat::ignore, true, boundry_strat, {init_density});
+      // rslt.test_field_data.print_debug_info();
+      ASSERT_EQ(rslt.info[FBInfoLUT::countSN], 1);
+      if (not matches_expectation_(rslt, init_density, n_ghost, expectation)) {
+        FAIL() << explanation;
+      }
+
+      // NOTE: we can potentially perform a more rigorous test here. This involves running
+      // run_full_feedback_ again, but this time we set n_ghost to zero and expand active_zone
+      // to a larger value (such that the total field size is unchanged...)
+    }
+  }
+
+  // Finally confirm the behavior when we place a particle inside the active zone, when the stencil
+  // has no overlap with the ghost zone. (This is more of a sanity-check on the testing machinery
+  // than on anything else...)
+
+  {
+    hydro_utilities::VectorXYZ<OuterEdgeOffset> offset_pos = {good_pos_l[0], good_pos_l[0], good_pos_l[0]};
+    hydro_utilities::VectorXYZ<Real> cur_pos               = offset_to_concrete(offset_pos, ax_prop_l, n_ghost);
+    // printf("%f, %f, %f\n", cur_pos[0],cur_pos[1],cur_pos[2]);
+    FeedbackResults rslt = run_full_feedback_<Prescription>(
+        n_ghost, ax_prop_l, {cur_pos}, feedback_details::OverlapStrat::ignore, true, boundry_strat, {init_density});
+    ASSERT_EQ(rslt.info[FBInfoLUT::countSN], 1);
+    if (not matches_expectation_(rslt, init_density, n_ghost, BoundaryRelatedExpectation::only_active_zone_update)) {
+      FAIL() << "When a particle is placed inside the active-zone (such that the stencil "
+             << "doesn't overlap with the ghost zone), then only the active zone should be updated";
+    }
+  }
+}
+
+// the next 2 test-cases check the behavior of feedback when particles are near the boundary
+TYPED_TEST(tALLFeedbackFull, BoundaryExcludeGhostParticleIgnoreStencilIssues)
+{
+  using feedback_details::BoundaryStrategy;
+  run_bdry_test_<typename TestFixture::PrescriptionT>(BoundaryStrategy::excludeGhostParticle_ignoreStencilIssues);
+}
+
+TYPED_TEST(tALLFeedbackFull, BoundaryExcludeGhostParticleSnapActiveStencil)
+{
+  using feedback_details::BoundaryStrategy;
+  run_bdry_test_<typename TestFixture::PrescriptionT>(BoundaryStrategy::excludeGhostParticle_snapActiveStencil);
+}
\ No newline at end of file
diff --git a/src/feedback/kernel.h b/src/feedback/kernel.h
new file mode 100644
index 000000000..633f51059
--- /dev/null
+++ b/src/feedback/kernel.h
@@ -0,0 +1,758 @@
+#pragma once
+/* Define the main kernel that does most of the heavy lifting.
+ *
+ * This is primarily defined in a header so that we can directly test it.
+ */
+
+#include <climits>
+#include <type_traits>
+
+#include "../feedback/feedback.h"
+#include "../global/global.h"
+#include "../utils/DeviceVector.h"
+#include "../utils/basic_structs.h"
+#include "../utils/error_handling.h"
+#include "../utils/gpu.hpp"
+#include "../utils/reduction_utilities.h"
+
+// uncomment the following line for debugging
+// #define FEEDBACK_LOG_INDIVIDUAL 1
+
+#ifndef FEEDBACK_LOG_INDIVIDUAL
+  #define FEEDBACK_LOG_INDIVIDUAL 0
+#endif
+
+#define TPB_FEEDBACK 128
+
+// first we define some basic details, this could theoretically go into a separate file
+// ====================================================================================
+
+// The following is only here to simplify testing. In the future it may make sense to move it to a different header
+namespace feedback_details
+{
+
+/* Group together all of the particle-property arguments */
+struct ParticleProps {
+  part_int_t n_local;        // presumably the number of local particles - TODO confirm
+  const part_int_t* id_dev;  // NOTE(review): the "_dev" suffix presumably denotes device pointers - confirm
+  const Real* pos_x_dev;
+  const Real* pos_y_dev;
+  const Real* pos_z_dev;
+  const Real* vel_x_dev;
+  const Real* vel_y_dev;
+  const Real* vel_z_dev;
+  Real* mass_dev;  // the only array that can be modified through this struct
+  const Real* age_dev;
+};
+
+/* Group together all of the arguments describing the spatial field structure. */
+struct FieldSpatialProps {
+  Real xMin;
+  Real yMin;
+  Real zMin;
+  Real xMax;
+  Real yMax;
+  Real zMax;
+  Real dx;
+  Real dy;
+  Real dz;
+  int nx_g;  // NOTE(review): the "_g" suffix presumably means that ghost cells are counted - confirm
+  int ny_g;
+  int nz_g;
+  int n_ghost;
+};
+
+/* Groups properties of the simulation's current (global) iteration cycle */
+struct CycleProps {
+  Real t;     /*!< The current time */
+  Real dt;    /*!< Size of the current timestep */
+  int n_step; /*!< This is the current step of the simulation */
+};
+
+}  // namespace feedback_details
+
+// N
+
+namespace feedback_details
+{
+
+/* Specifies the strategy for handling star-particles near the boundary of the active zone */
+enum struct BoundaryStrategy {
+  excludeGhostParticle_ignoreStencilIssues, /*!< Ignore particles in the ghost-zone. Ignore that there could be any
+                                             *   problems with a stencil that overlaps with the region beyond the
+                                             *   ghost zone.
+                                             */
+  excludeGhostParticle_snapActiveStencil    /*!< Ignore particles in the ghost-zone. Temporarily (only during feedback)
+                                             *   snap the positions to the closest location where feedback will
+                                             *   only affect the active zone.
+                                             */
+};
+
+}  // namespace feedback_details
+
+// STENCIL OVERLAPS
+// ----------------
+// We refer to the pattern of cells around a particle that are affected by a given feedback prescription
+// as a stencil
+//
+// Because we use separate GPU threads to simultaneously process the impact of feedback from separate
+// particles on the local fluid fields, care must be taken when nearby particles have overlapping
+// stencils.
+//
+// Our chosen approach
+// -------------------
+// When there is overlap, we essentially handle feedback one-at-a-time where the order is based on some
+// deterministic particle property (nominally the particle id).
+//
+// Essentially, we first pre-register the stencils of all particles undergoing feedback in a "mask".
+// Then we make a pass through all of the particles with pending feedback.
+// - For each particle with pending feedback, we use the previously-constructed "mask" to check if
+//   its stencil overlaps with a stencil of a different particle (also with pending feedback) that
+//   has a larger particle id.
+//   - if it doesn't, we perform feedback now!
+//   - otherwise, we defer feedback until the next "pass" (and register its stencil in the "mask" used
+//     for the next pass)
+// We make additional passes until we have applied all feedback.
+//
+// In the best case scenario (no overlap), we just do one pass. In the pathological worst case (all
+// particles are directly on top of each other), the number of passes is equal to the number of
+// particles with pending feedback.
+//
+// Alternative Options
+// -------------------
+// For a small subset of prescriptions, it's possible to handle overlapping regions by using atomic
+// operations to update the fluid fields. This only works if you know how much each quantity will change
+// ahead of time (i.e. the changes are independent of the current values). However, because we primarily
+// parameterize the fluid's properties in terms of its mass density, momenta densities, and total energy
+// densities, the prescriptions that can be handled in this way are VERY limited. This approach is only
+// applicable for prescriptions
+// - that only inject a fixed amount of thermal energy density
+// - OR are fairly pathological. Here are 2 simple examples:
+//   1. if the prescription injects momentum (while not affecting mass), then assumptions need to be made
+//      that resulting changes to the kinetic energy are exactly balanced by changes to the thermal energy
+//      (this is because you can't know how the kinetic energy density will change unless you know the
+//      current momentum and current density).
+//   2. if the prescription injects mass (while not affecting momentum -- effectively reducing the bulk
+//      velocity), then a similar assumption needs to be made: any changes to the kinetic energy density
+//      are exactly balanced by changes to the thermal energy
+// NOTE: If we adopted a strategy where we first convert the total energy to the thermal energy, then apply
+// all feedback, and finally recompute the total energy, this would open up more options. It's worth
+// considering in the future (although it is less flexible than our chosen approach).
+
+// The following definitions are only in a header file to simplify testing.
+namespace feedback_details
+{
+
+// this is a temporary class meant to mimic unique_ptr. Longer term, I plan to use a class that mimics shared_ptr
+// - the ideology of a unique-ptr existing both on the host and a device is a little weird - but it is sound as
+//   long as the unique-ptr itself does not persist on the device outside of kernel calls (of course the data
+//   unique-ptr can/will persist on the device for longer periods of time)
+template <typename T>
+class SimpleUniqueDevPtr
+{
+  static_assert(std::is_trivially_copyable_v<T> and (!std::is_pointer_v<T>) and (!std::is_reference_v<T>));
+  T* ptr_;
+
+ public:
+  /* default constructor. Makes an empty unique pointer */
+  __host__ __device__ SimpleUniqueDevPtr() : ptr_(nullptr) {}
+  __host__ __device__ SimpleUniqueDevPtr(std::nullptr_t) : SimpleUniqueDevPtr() {}
+
+  /* Allocates a new unique pointer that holds ``count`` entries of ``T`` (the entries aren't initialized here) */
+  __host__ SimpleUniqueDevPtr(std::size_t count)
+  {
+    CHOLLA_ASSERT(count > 0, "count must be a positive integer");
+    GPU_Error_Check(cudaMalloc(&ptr_, count * sizeof(T)));
+  }
+
+  /* destructor. The memory is only deallocated when this is executed on the host */
+  __host__ __device__ ~SimpleUniqueDevPtr()
+  {
+#if !((defined(__HIP_DEVICE_COMPILE__) && defined(O_HIP)) || (defined(__CUDA_ARCH__) && !defined(O_HIP)))
+    GPU_Error_Check(cudaDeviceSynchronize());  // ensure we can't deallocate a ptr that a kernel is currently using
+    if (ptr_ != nullptr) GPU_Error_Check(cudaFree(ptr_));
+#endif
+  }
+
+  SimpleUniqueDevPtr(const SimpleUniqueDevPtr<T>&)               = delete;
+  SimpleUniqueDevPtr<T>& operator=(const SimpleUniqueDevPtr<T>&) = delete;
+  SimpleUniqueDevPtr(SimpleUniqueDevPtr<T>&& other) noexcept : ptr_(other.ptr_) { other.ptr_ = nullptr; }
+  SimpleUniqueDevPtr<T>& operator=(SimpleUniqueDevPtr<T>&& other) noexcept
+  {
+    this->swap(other);  // other now holds the ptr previously held by this (released when other is destroyed)
+    return *this;
+  }
+
+  /* array-element-access of the underlying pointer (invokes undefined behavior when ``this`` is empty) */
+  __device__ __forceinline__ T& operator[](std::ptrdiff_t idx) const noexcept { return ptr_[idx]; }
+
+  /* dereference the stored pointer (invokes undefined behavior when ``this`` is empty) */
+  __device__ __forceinline__ T& operator*() const noexcept { return *ptr_; }
+
+  /* accessor-method that retrieves the stored pointer */
+  __host__ __device__ __forceinline__ T* get() const noexcept { return ptr_; }
+
+  /* Provides support for checking whether ``this`` is empty. */
+  __host__ __device__ __forceinline__ explicit operator bool() const noexcept { return ptr_ != nullptr; }
+
+  /* swap the contents of ``this`` with ``other`` */
+  __host__ __device__ void swap(SimpleUniqueDevPtr<T>& other) noexcept
+  {
+    T* tmp     = this->ptr_;
+    this->ptr_ = other.ptr_;
+    other.ptr_ = tmp;
+  }
+};
+
+/* Specifies the strategy for handling star-particles with overlapping stencils */
+enum struct OverlapStrat {
+  ignore,    /*!< simply ignore that there are overlaps. Schedule everything at once (useful for profiling) */
+  sequential /*!< Process feedback for all overlapping stencils (in order of the increasing particle ids) */
+};
+
+/* Class that implements most of the logic (and tracks associated data) for scheduling feedback from particles.
+ * It's essentially a state-machine.
+ *
+ * As a shorthand, lets refer to the collection of particles who need to have their feedback during a given
+ * simulation cycle, the "designated-feedback-particles"
+ *
+ * When configured with OverlapStrat::sequential (the primary use-case of this class), "designated-feedback-particles"
+ * with overlapping stencils will be scheduled sequentially. The intention is for an instance of this class, lets
+ * call it `ov_scheduler`, to be used in the following control flow:
+ *
+ *  - at the start of the relevant kernel, call `ov_scheduler.Reset_State`
+ *  - then iterate over all of the "designated-feedback-particles" (all particles that need to have their feedback
+ *    applied in the current kernel call). For each of these particles, call
+ *    `ov_scheduler.Register_Pending_Particle_Feedback`
+ *  - now enter a while-loop where `ov_scheduler.Prepare_Next_Pass` is evaluated as the condition-expression
+ *     - we refer to each evaluation of the loop body as a "pass".
+ *     - The loop-body should consist of iterating over all of the "designated-feedback-particles" (in other words, each
+ *       evaluation of the loop body corresponds to a "pass" through the "designated-feedback-particles"). For each of
+ *       these particles, evaluate `ov_scheduler.Is_Scheduled_And_Update`
+ *     - that method will return whether that particle is scheduled to have its feedback applied right now. It may also
+ *       update the internal state of `ov_scheduler` to prepare for future passes.
+ *
+ * \note
+ * The code would probably be faster if we specified OverlapStrat as a template argument, but everything is already
+ * fairly template-heavy.
+ *
+ * \note
+ * Problems could arise if you passed particle information to this class's methods (after you have reset the state)
+ * that aren't part of the "designated-feedback-particles".
+ */
+class OverlapScheduler
+{
+ private:
+  /* the overlap strategy */
+  OverlapStrat strat_ = OverlapStrat::ignore;
+  /* counts the number of "passes" */
+  part_int_t pass_count_ = 0;
+  /* the number of entries in each "mask" (this is 0 when the masks aren't allocated) */
+  std::size_t mask_size_ = 0;
+
+  // the following attributes are all pointers to global device memory
+
+  ///@{
+  /* the "masks" are pointers to buffers that each have the same size as a fluid-field (including ghost zones) - each
+   * value corresponds to a distinct cell in the simulation.
+   *
+   * In each mask, the value at each location will either store ``OverlapScheduler::DFLT_VAL`` or the minimum particle
+   * id of a particle with pending feedback whose stencil "may" overlap with the location.
+   *
+   * In a given "pass", the values in curPass_mask_ are used to identify which particles can have their feedback applied
+   * in the current pass and the values of nextPass_mask_ are updated to help determine which particles can have their
+   * feedback applied in a future pass.
+   */
+  SimpleUniqueDevPtr<part_int_t> curPass_mask_  = nullptr;
+  SimpleUniqueDevPtr<part_int_t> nextPass_mask_ = nullptr;
+  ///@}
+
+  /* pointer to a global memory address that is used to track whether there are any particles with pending feedback that
+   * will need to be applied in a future "pass".
+   *
+   * At the start of each "pass" this is initialized to zero and it is gradually updated over the course of the "pass"
+   * (as particles with pending feedback are encountered that must be handled in a future "pass"). If this has a
+   * non-zero value at the end of a given "pass", then another "pass" is required.
+   */
+  SimpleUniqueDevPtr<int> any_pending_particles_;
+
+ public:
+  /* the default value stored in a "mask".
+   *
+   * this is the max value representable by a 64-bit signed integer.
+   */
+  static inline constexpr part_int_t DFLT_VAL = 9223372036854775807;
+
+  /* default constructor */
+  __host__ __device__ OverlapScheduler()
+      : strat_(OverlapStrat::ignore),
+        pass_count_(0),
+        mask_size_(0),
+        curPass_mask_(nullptr),
+        nextPass_mask_(nullptr),
+        any_pending_particles_(nullptr)
+  {
+  }
+
+  /* Main constructor of OverlapScheduler.
+   *
+   * \param strat the strategy used for handling overlapping stencils
+   * \param ng_x,ng_y,ng_z the number of cells along each axis of a fluid-field (including ghost zones). When strat is
+   *     OverlapStrat::sequential, two masks that each hold ``ng_x * ng_y * ng_z`` entries are allocated on the device.
+   */
+  __host__ OverlapScheduler(OverlapStrat strat, int ng_x, int ng_y, int ng_z) : OverlapScheduler()
+  {
+    this->strat_          = strat;
+    std::size_t mask_size = std::size_t(ng_x) * std::size_t(ng_y) * std::size_t(ng_z);  // widen before multiplying
+
+    switch (strat) {
+      case OverlapStrat::ignore:
+        // this is all redundant, but we opt for explicitness
+        this->pass_count_            = 0;
+        this->mask_size_             = 0;
+        this->curPass_mask_          = nullptr;
+        this->nextPass_mask_         = nullptr;
+        this->any_pending_particles_ = nullptr;
+        break;
+      case OverlapStrat::sequential:
+        this->pass_count_            = 0;
+        this->mask_size_             = mask_size;
+        this->curPass_mask_          = SimpleUniqueDevPtr<part_int_t>(mask_size);
+        this->nextPass_mask_         = SimpleUniqueDevPtr<part_int_t>(mask_size);
+        this->any_pending_particles_ = SimpleUniqueDevPtr<int>(1);
+        break;
+    }
+  }
+
+  /* This must be called at the start of the kernel call where an OverlapScheduler will be used
+   *
+   * \note
+   * This is a collective operation that must be executed by all threads (throughout the entire
+   * grid) at once. Deadlocks will occur if this is executed in a conditional branch that some
+   * threads can't reach.
+   */
+  __device__ void Reset_State(const cooperative_groups::grid_group& g)
+  {
+    this->pass_count_ = 0;
+
+    if (this->strat_ != OverlapStrat::ignore) {
+      if (g.thread_rank() == 0) *(this->any_pending_particles_) = 0;
+      // in the above operation, it shouldn't logically matter whether one or more threads modify
+      // any_pending_particles_'s contents (but I suspect that it may affect performance)
+
+      OverlapScheduler::clear_mask(this->nextPass_mask_.get(), this->mask_size_);
+
+      g.sync();  // this sync is required to ensure that all threads across the grid are done
+                 // clearing the mask (before we start mutating the mask)
+    }
+  }
+
+  /* try to prepare for the next pass through particles with pending feedback.
+   *
+   * \returns true if there are any particles with pending feedback remaining.
+   *
+   * \note
+   * This is a collective operation that must be executed by all threads (throughout the entire
+   * grid) at once. Deadlocks will occur if this is executed in a conditional branch that some
+   * threads can't reach.
+   */
+  __device__ bool Prepare_Next_Pass(const cooperative_groups::grid_group& g)
+  {
+    if (this->strat_ == OverlapStrat::ignore) {  // in this scenario, only a single pass is required!
+      if (this->pass_count_ != 0) return false;
+      this->pass_count_ = 1;
+      return true;
+    }
+
+    // the rest of this logic is for OverlapStrat::sequential
+
+    g.sync();  // this is important! we need to be sure all threads across all thread-blocks are done
+               // completing any previous "passes" through the data to make sure all threads agree that
+               // another pass is required.
+
+    // if there are no particles with pending feedback, we are done! We can exit now
+    if (0 == *(this->any_pending_particles_)) return false;
+
+    // otherwise we need to continue on. Let's synchronize since we will be resetting the value of
+    // this->any_pending_particles_ (is this a place where we can use a memory fence?)
+    g.sync();
+    if (g.thread_rank() == 0) *(this->any_pending_particles_) = 0;
+    // in the above operation, it shouldn't logically matter whether one or more threads modify
+    // any_pending_particles_'s contents (but I suspect that it may affect performance)
+
+    // increment the total pass-count
+    this->pass_count_++;
+
+    // Finally prepare the masks for the next loop
+    this->curPass_mask_.swap(this->nextPass_mask_);
+    OverlapScheduler::clear_mask(this->nextPass_mask_.get(), this->mask_size_);
+
+    g.sync();  // this last sync is required to make sure all threads across the grid are done clearing
+               // the mask (before we start mutating the mask)
+    return true;
+  }
+
+  /* External users of OverlapScheduler must call this during initial setup for each particle that will
+   * undergo feedback during the upcoming cycle.
+   *
+   * \note
+   * This is also used internally.
+   */
+  template <typename Prescription>
+  __device__ void Register_Pending_Particle_Feedback(Prescription p, long long int particle_id,
+                                                     hydro_utilities::VectorXYZ<Real> pos_indU, int ng_x, int ng_y)
+  {
+    if (strat_ == OverlapStrat::ignore) return;
+
+    // record that a pass is necessary
+    atomicMax(this->any_pending_particles_.get(), 1);
+
+    static_assert(sizeof(long long int) == sizeof(part_int_t));
+    long long int* mask = (long long int*)(this->nextPass_mask_.get());
+    p.for_each_possible_overlap(
+        pos_indU[0], pos_indU[1], pos_indU[2], ng_x, ng_y,
+        [mask, particle_id](Real dummy_arg, int ind3d) -> void { atomicMin(mask + ind3d, particle_id); });
+  }
+
+  /* Checks whether a particle that applies feedback during the current cycle is scheduled to apply feedback right now
+   * (in the current "pass"). This function MAY also update the internal state to prepare the scheduler for future
+   * "passes".
+   *
+   * This will return `false` for particles that already applied feedback during the current simulation-cycle (but in a
+   * prior "pass"). It will also return `false` if the specified particle's feedback is scheduled for a future pass.
+   */
+  template <typename Prescription>
+  __device__ bool Is_Scheduled_And_Update(Prescription p, part_int_t particle_id,
+                                          hydro_utilities::VectorXYZ<Real> pos_indU, int ng_x, int ng_y)
+  {
+    if (this->strat_ == OverlapStrat::ignore) return true;
+
+    // retrieve the minimum particle_id in the current zone
+    part_int_t min_id = OverlapScheduler::DFLT_VAL;
+    part_int_t* mask  = this->curPass_mask_.get();
+
+    p.for_each_possible_overlap(
+        pos_indU[0], pos_indU[1], pos_indU[2], ng_x, ng_y,
+        [mask, &min_id](Real dummy_arg, int ind3d) -> void { min_id = min(min_id, mask[ind3d]); });
+
+    if (particle_id < min_id) {  // feedback was applied in prior "pass"
+      return false;
+    } else if (particle_id == min_id) {  // feedback is scheduled for current "pass"
+      return true;
+    } else {  // particle feedback scheduled for future "pass"
+      // record that another pass is necessary & prepare the mask for the next pass
+      Register_Pending_Particle_Feedback(p, particle_id, pos_indU, ng_x, ng_y);
+      return false;
+    }
+  }
+
+ private:
+  /* Fills every entry of a mask with DFLT_VAL. To be called across all threads and blocks at once
+   *
+   * \note
+   * Assumes that blockDim.y, blockDim.z, gridDim.y, gridDim.z are all 1.
+   * We can fix this!
+   */
+  static __device__ void clear_mask(part_int_t* ptr, std::size_t len)
+  {
+    len *= std::size_t(ptr != nullptr);  // a null mask is treated as an empty mask
+    const std::size_t start       = std::size_t(blockIdx.x) * blockDim.x + threadIdx.x;
+    const std::size_t loop_stride = std::size_t(blockDim.x) * gridDim.x;
+    for (std::size_t i = start; i < len; i += loop_stride) {
+      ptr[i] = OverlapScheduler::DFLT_VAL;
+    }
+  }
+};
+
+__device__ __forceinline__ hydro_utilities::VectorXYZ<Real> Calc_Pos_IndU(
+    int i, const feedback_details::ParticleProps& particle_props,
+    const feedback_details::FieldSpatialProps& spatial_props)
+{
+  const Real x = (particle_props.pos_x_dev[i] - spatial_props.xMin) / spatial_props.dx + int(spatial_props.n_ghost);
+  const Real y = (particle_props.pos_y_dev[i] - spatial_props.yMin) / spatial_props.dy + int(spatial_props.n_ghost);
+  const Real z = (particle_props.pos_z_dev[i] - spatial_props.zMin) / spatial_props.dz + int(spatial_props.n_ghost);
+  return {x, y, z};  // position of particle ``i`` in index-units (the n_ghost shift accounts for the ghost zone)
+}
+
+/* Applies cluster feedback.
+ *
+ * \tparam FeedbackModel type that encapsulates the actual feedback prescription
+ * \tparam BdryStrat specifies the policy used for handling feedback stencils that overlap with boundaries of the active
+ *     zone
+ *
+ * \param[in,out] particle_props Encodes the actual particle data needed for feedback. If there is any feedback, the
+ *     relevant particle properties (like particle mass) will be updated during this call.
+ * \param[in]     spatial_props Encodes spatial information about the local domain and the fields
+ * \param[in]     cycle_props Encodes details about the simulation's current (global) iteration cycle
+ * \param[out]    info An array that is intended to accumulate summary details about the feedback during the course of
+ *     this kernel call. This function assumes it has FBInfoLUT::LEN entries that are all initialized to 0.
+ * \param[out]    conserved_dev pointer to the fluid-fields that will be updated during this function call.
+ * \param[in]     num_SN_dev An array of ``particle_props.n_local`` non-negative integers that specify the number of
+ *     supernovae that are scheduled to occur during the current cycle (for each particle).
+ * \param[in]     ov_scheduler helps schedule feedback of particles with overlapping stencils.
+ */
+template <typename FeedbackModel, BoundaryStrategy BdryStrat>
+__global__ void Cluster_Feedback_Kernel(const feedback_details::ParticleProps particle_props,
+                                        const feedback_details::FieldSpatialProps spatial_props,
+                                        const feedback_details::CycleProps cycle_props, Real* info, Real* conserved_dev,
+                                        int* num_SN_dev, OverlapScheduler ov_scheduler)
+{
+  const int tid                    = threadIdx.x;
+  cooperative_groups::grid_group g = cooperative_groups::this_grid();
+
+  // initialize fb_model - this doesn't carry any state. It's just here to help with inference of
+  // the FeedbackModel types in all of the helper functions
+  FeedbackModel fb_model{};
+
+  // prologue: setup buffer for collecting SN feedback information (each thread owns FBInfoLUT::LEN entries)
+  __shared__ Real s_info[FBInfoLUT::LEN * TPB_FEEDBACK];
+  for (unsigned int cur_ind = 0; cur_ind < FBInfoLUT::LEN; cur_ind++) {
+    s_info[FBInfoLUT::LEN * tid + cur_ind] = 0;
+  }
+
+  // this lambda func returns true if particle is in-bounds and has at least 1 SNe
+  // - based on the value of the BdryStrat template-parameter, it may also modify the position
+  //   that should be used when applying the feedback.
+  auto checkDontSkip_and_maybeRevisePos = [&spatial_props, num_SN_dev](int i,
+                                                                       hydro_utilities::VectorXYZ<Real>& pos_indU) {
+    const int n_ghost = spatial_props.n_ghost;
+
+    bool ignore = (((pos_indU[0] < n_ghost) or (pos_indU[0] >= (spatial_props.nx_g - n_ghost))) or
+                   ((pos_indU[1] < n_ghost) or (pos_indU[1] >= (spatial_props.ny_g - n_ghost))) or
+                   ((pos_indU[2] < n_ghost) or (pos_indU[2] >= (spatial_props.nz_g - n_ghost))));
+
+    // the branch-condition is determined at compile-time (since BdryStrat is a template parameter)
+    if (BdryStrat == BoundaryStrategy::excludeGhostParticle_snapActiveStencil) {
+      // overwrite pos_indU with the closest position, where stencil only includes active zones
+      // - if the stencil already just overlaps with active zone this should do nothing
+      // - it doesn't really matter if we alter the position of a particle outside of the active
+      //   zone since we will always ignore that particle.
+      pos_indU = FeedbackModel::nearest_noGhostOverlap_pos(pos_indU, spatial_props.nx_g, spatial_props.ny_g,
+                                                           spatial_props.nz_g, n_ghost);
+    }
+
+    return (not ignore) and (num_SN_dev[i] > 0);
+  };
+
+  // Prepare to iterate over the list of particles
+  // - this is grid-strided loop. This is a common idiom that makes the kernel more flexible
+  // - If there are more local particles than threads, some threads will visit more than 1 particle
+  const int start       = blockIdx.x * blockDim.x + threadIdx.x;
+  const int loop_stride = blockDim.x * gridDim.x;
+
+  // get ov_scheduler set up properly
+  ov_scheduler.Reset_State(g);
+  for (int i = start; i < particle_props.n_local; i += loop_stride) {
+    // compute the position in index-units (appropriate for a field with a ghost-zone)
+    // - an integer value corresponds to the left edge of a cell
+    hydro_utilities::VectorXYZ<Real> pos_indU = Calc_Pos_IndU(i, particle_props, spatial_props);
+
+    if (checkDontSkip_and_maybeRevisePos(i, pos_indU)) {
+      ov_scheduler.Register_Pending_Particle_Feedback(fb_model, (long long int)(particle_props.id_dev[i]), pos_indU,
+                                                      spatial_props.nx_g, spatial_props.ny_g);
+    }
+  }
+
+  // do the main work.
+  while (ov_scheduler.Prepare_Next_Pass(g)) {
+    // if (g.thread_rank() == 0) kernel_printf("entered loop!\n");
+
+    for (int i = start; i < particle_props.n_local; i += loop_stride) {
+      // compute the position in index-units (appropriate for a field with a ghost-zone)
+      // - an integer value corresponds to the left edge of a cell
+      hydro_utilities::VectorXYZ<Real> pos_indU = Calc_Pos_IndU(i, particle_props, spatial_props);
+
+      if (checkDontSkip_and_maybeRevisePos(i, pos_indU)) {
+        bool is_scheduled = ov_scheduler.Is_Scheduled_And_Update(fb_model, particle_props.id_dev[i], pos_indU,
+                                                                 spatial_props.nx_g, spatial_props.ny_g);
+
+        if (is_scheduled) {
+          // note age_dev is actually the time of birth
+          const Real age = cycle_props.t - particle_props.age_dev[i];
+
+          // holds a reference to the particle's mass (this will be updated after feedback is handled)
+          Real& mass_ref = particle_props.mass_dev[i];
+
+#if FEEDBACK_LOG_INDIVIDUAL
+          // explicitly use json formatting to make log-parsing easier:
+          kernel_printf(
+              "...feedback-log-individual:\n"
+              "   {\"block\": %d, \"thread\":%d, \"cycle\":%d,\n"
+              "    \"index\": %d, \"id\": %lld, \"age\": %g,\n"
+              "    \"mass (pre-feedback)\": %g, \"num_SN\": %d,\n"
+              "    \"position (code units)\": [%g, %g, %g],\n"
+              "    \"position (index-units)\": [%g, %g, %g],\n"
+              "    \"vel (code-units)\": [%g, %g, %g]}\n",
+              blockIdx.x, threadIdx.x, cycle_props.n_step, i, (long long int)(particle_props.id_dev[i]), age, mass_ref,
+              num_SN_dev[i], particle_props.pos_x_dev[i], particle_props.pos_y_dev[i], particle_props.pos_z_dev[i],
+              pos_indU[0], pos_indU[1], pos_indU[2], particle_props.vel_x_dev[i], particle_props.vel_y_dev[i],
+              particle_props.vel_z_dev[i]);
+          int pre_countResolved = s_info[FBInfoLUT::LEN * tid + FBInfoLUT::countResolved];  // this thread's own slot
+#endif /* FEEDBACK_LOG_INDIVIDUAL */
+
+          fb_model.apply_feedback(pos_indU[0], pos_indU[1], pos_indU[2], particle_props.vel_x_dev[i],
+                                  particle_props.vel_y_dev[i], particle_props.vel_z_dev[i], age, mass_ref,
+                                  particle_props.id_dev[i], spatial_props.dx, spatial_props.dy, spatial_props.dz,
+                                  spatial_props.nx_g, spatial_props.ny_g, spatial_props.nz_g, spatial_props.n_ghost,
+                                  num_SN_dev[i], cycle_props.n_step, s_info, conserved_dev);
+
+#if FEEDBACK_LOG_INDIVIDUAL
+          // explicitly use json formatting to make log-parsing easier:
+          kernel_printf(
+              "...feedback-log-individual-extra: {\"block\": %d, \"thread\":%d, \"cycle\":%d, \"id\": %lld, "
+              "\"isResolved\": %d}\n",
+              blockIdx.x, threadIdx.x, cycle_props.n_step, (long long int)(particle_props.id_dev[i]),
+              int(s_info[FBInfoLUT::LEN * tid + FBInfoLUT::countResolved] > pre_countResolved));
+#endif
+        }
+      }
+    }
+  }
+
+  // epilogue: sum the info from all threads (in all blocks) and add it into info
+  __syncthreads();  // synchronize all threads in the current block. It's important to do this before
+                    // the next function call because we accumulate values on the local block first
+  reduction_utilities::blockAccumulateIntoNReals<FBInfoLUT::LEN, TPB_FEEDBACK>(info, s_info);
+}
+
+struct KernelAndLaunchConf {  // bundles a kernel with the launch configuration that should be used for it
+  void* kernel_ptr;  // type-erased address of the kernel
+  dim3 dim_block;    // block dimensions passed to the launch
+  dim3 dim_grid;     // grid dimensions passed to the launch
+};
+
+/* Helper function that is used to fetch the appropriate feedback-kernel variant and compute the launch parameters
+ *
+ * The launch parameters are chosen in order to maximize parallelism; they are based on how many blocks can fit
+ * simultaneously on a SM (streaming multiprocessor), given the specified variant of the kernel, the number of
+ * threads per block, and the intended, per-block, shared dynamic memory usage.
+ *
+ * \note
+ * This has been factored out of Exec_Cluster_Feedback_Kernel() to allow \c BdryStrat to be specified as a runtime
+ * argument, while only having a single switch statement responsible for mapping the runtime argument to a template
+ * parameter.
+ */
+template <typename FeedbackModel, BoundaryStrategy BdryStrat>
+KernelAndLaunchConf fetch_kernel_and_launch_conf_(int threads_per_block, int max_num_threadblocks,
+                                                  std::size_t dynamic_shared_mem_per_block)
+{
+  // do some work to configure the grid-size (i.e. the number of thread-blocks per grid)
+  // - since the kernel uses grid-wide synchronizations, some care needs to be taken to ensure
+  //   co-residency of the thread blocks on the GPU (if you have too many thread-blocks, then they
+  //   won't all be executed on the gpu at the same time)
+  // - we use static variables so we don't have to repeat these calculations (they are only redone
+  //   when an argument differs from the previous call). Each template instantiation of this
+  //   function has its own copy of these static variables.
+  const dim3 dimBlock(threads_per_block, 1, 1);
+
+  static dim3 dimGrid;
+  static int last_max_num_threadblocks                 = 0;
+  static int last_threads_per_block                    = 0;
+  static std::size_t last_dynamic_shared_mem_per_block = 0;
+
+  CHOLLA_ASSERT(max_num_threadblocks > 0, "max_num_threadblocks must be positive!");
+  if ((last_max_num_threadblocks != max_num_threadblocks) or (last_threads_per_block != threads_per_block) or
+      (last_dynamic_shared_mem_per_block != dynamic_shared_mem_per_block)) {
+    last_max_num_threadblocks         = max_num_threadblocks;
+    last_threads_per_block            = threads_per_block;
+    last_dynamic_shared_mem_per_block = dynamic_shared_mem_per_block;
+
+    int dev                = 0;  // NOTE(review): hard-coded - confirm that this is always the active device
+    int supportsCoopLaunch = 0;
+    cudaError err          = cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev);
+    CHOLLA_ASSERT(cudaSuccess == err,
+                  "Error encountered within cudaDeviceGetAttribute while querying whether the "
+                  "system supports cooperative kernels");
+    CHOLLA_ASSERT(supportsCoopLaunch != 0, "System is unable to launch cooperative kernels");
+
+    cudaDeviceProp deviceProp;
+    GPU_Error_Check(cudaGetDeviceProperties(&deviceProp, dev));  // never read deviceProp after a failed query
+    int numBlocksPerSm = 0;  // this will be updated to hold the max number of blocks on the GPU
+    err                = cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm,
+                                                                       Cluster_Feedback_Kernel<FeedbackModel, BdryStrat>,
+                                                                       threads_per_block, dynamic_shared_mem_per_block);
+    CHOLLA_ASSERT(cudaSuccess == err,
+                  "Error encountered within cudaOccupancyMaxActiveBlocksPerMultiprocessor while "
+                  "querying whether the max active blocks per SM");
+    CHOLLA_ASSERT(numBlocksPerSm > 0, "Something is wrong! The number of blocks per SM should be positive");
+
+    dimGrid = dim3(std::min(deviceProp.multiProcessorCount * numBlocksPerSm, max_num_threadblocks), 1, 1);
+  }
+
+  return KernelAndLaunchConf{(void*)Cluster_Feedback_Kernel<FeedbackModel, BdryStrat>, dimBlock, dimGrid};
+}
+
+/* Launches the Kernel for ClusterFeedback
+ *
+ * \tparam FeedbackModel type that encapsulates the actual feedback prescription
+ *
+ * \param[in,out] particle_props Encodes the actual particle data needed for feedback. If there is any feedback, the
+ *     relevant particle properties (like particle mass) will be updated during this call.
+ * \param[in]     spatial_props Encodes spatial information about the local domain and the fields
+ * \param[in]     cycle_props Encodes details about the simulation's current (global) iteration cycle
+ * \param[out]    info An array on the host that is intended to accumulate summary details about the feedback during
+ *     this call. When it isn't a nullptr, it must have room for FBInfoLUT::LEN entries (they are overwritten).
+ * \param[out]    conserved_dev pointer to the fluid-fields that will be updated during this function call.
+ * \param[in]     num_SN_dev An array of ``particle_props.n_local`` non-negative integers that specify the number of
+ *     supernovae that are scheduled to occur during the current cycle (for each particle).
+ * \param[in]     ov_scheduler helps schedule feedback of particles with overlapping stencils.
+ * \param[in]     bdry_strat policy for handling feedback stencils that overlap with boundaries of the active zone
+ * \param[in]     max_num_threadblocks An arbitrary upper limit on the number of thread-blocks (this is for debugging
+ *     purposes). Only positive values are allowed.
+ */
+template <typename FeedbackModel>
+void Exec_Cluster_Feedback_Kernel(const feedback_details::ParticleProps& particle_props,
+                                  const feedback_details::FieldSpatialProps& spatial_props,
+                                  const feedback_details::CycleProps& cycle_props, Real* info, Real* conserved_dev,
+                                  int* num_SN_dev, OverlapScheduler& ov_scheduler, BoundaryStrategy bdry_strat,
+                                  int max_num_threadblocks = INT_MAX)
+{
+  // Declare/allocate device buffer for accumulating summary information about feedback
+  cuda_utilities::DeviceVector<Real> d_info(FBInfoLUT::LEN, true);  // initialized to 0
+
+  // fetch the kernel and launch parameters (some care is taken to ensure that all thread-blocks are co-resident)
+  const std::size_t dynamic_shared_mem_per_block = 0;
+  const int threads_per_block                    = TPB_FEEDBACK;
+
+  KernelAndLaunchConf tmp;
+  switch (bdry_strat) {
+    case BoundaryStrategy::excludeGhostParticle_ignoreStencilIssues:
+      tmp = fetch_kernel_and_launch_conf_<FeedbackModel, BoundaryStrategy::excludeGhostParticle_ignoreStencilIssues>(
+          threads_per_block, max_num_threadblocks, dynamic_shared_mem_per_block);
+      break;
+    case BoundaryStrategy::excludeGhostParticle_snapActiveStencil:
+      tmp = fetch_kernel_and_launch_conf_<FeedbackModel, BoundaryStrategy::excludeGhostParticle_snapActiveStencil>(
+          threads_per_block, max_num_threadblocks, dynamic_shared_mem_per_block);
+      break;
+    default:
+      CHOLLA_ERROR(
+          "Unable to handle specified bdry_strat. This probably means a new stategy "
+          "was introduced without modifying the switch-statement this error occurs in.");
+  }
+
+  // actually launch the kernel (each entry of kernelArgs holds the address of the corresponding kernel argument)
+  Real* d_info_ptr   = d_info.data();
+  void* kernelArgs[] = {(void*)(&particle_props), (void*)(&spatial_props), (void*)(&cycle_props), (void*)(&d_info_ptr),
+                        (void*)(&conserved_dev),  (void*)(&num_SN_dev),    (void*)(&ov_scheduler)};
+
+  GPU_Error_Check(cudaLaunchCooperativeKernel((void*)tmp.kernel_ptr, tmp.dim_grid, tmp.dim_block, kernelArgs,
+                                              dynamic_shared_mem_per_block, 0));
+
+  /*
+  // compute the grid-size or the number of thread-blocks per grid. The number of threads in a block is
+  // given by TPB_FEEDBACK
+  const int blocks_per_grid = (particle_props.n_local - 1) / TPB_FEEDBACK + 1;
+  hipLaunchKernelGGL(feedback_details::Cluster_Feedback_Kernel, blocks_per_grid, TPB_FEEDBACK, 0, 0,
+                     particle_props, spatial_props, cycle_props, d_info.data(), conserved_dev, num_SN_dev,
+  feedback_model);
+  */
+
+  if (info != nullptr) {
+    // copy summary data back to the host
+    GPU_Error_Check(cudaMemcpy(info, d_info.data(), FBInfoLUT::LEN * sizeof(Real), cudaMemcpyDeviceToHost));
+  } else {
+    GPU_Error_Check(cudaDeviceSynchronize());
+  }
+}
+
+}  // namespace feedback_details
diff --git a/src/feedback/prescription.h b/src/feedback/prescription.h
new file mode 100644
index 000000000..e9ac82264
--- /dev/null
+++ b/src/feedback/prescription.h
@@ -0,0 +1,515 @@
+/* This file defines feedback prescriptions that can actually be used within Cholla
+ *
+ * Some (not all) of these prescriptions are defined in terms of factored out stencils. Those stencils
+ * are defined separately in a different header file (feedback/stencil.h).
+ */
+
+#pragma once
+
+#include "../feedback/feedback.h"
+#include "../feedback/ratecalc.h"
+#include "../feedback/stencil.h"
+#include "../global/global.h"
+#include "../utils/basic_structs.h"
+
+namespace fb_prescription
+{
+
+inline __device__ void log_fb(int cycle_num, int num_SN, bool is_resolved, Real pos_x_indU, Real pos_y_indU,
+                              Real pos_z_indU, Real vel_x, Real vel_y, Real vel_z, part_int_t particle_id,
+                              Real n_0_cgs = -1.23456789)
+{
+  // Print one JSON-formatted record per feedback event (JSON keeps the logs easy to parse).
+  const long long int id_ll = static_cast<long long int>(particle_id);  // matches the %lld specifier
+  kernel_printf(
+      "..fb: { \"cycle\":%d, \"id\": %lld, \"num_SN\": %d, \"resolved\": %d, \"n0\": %g, \"pos_indU\": [%g, %g, %g], "
+      "\"vel\": [%g, %g, %g]}\n",
+      cycle_num, id_ll, num_SN, int(is_resolved), n_0_cgs, pos_x_indU, pos_y_indU, pos_z_indU, vel_x, vel_y, vel_z);
+}
+
+/** Compute the shell-formation radius (in kpc) of a supernova
+ *
+ *  Some feedback prescriptions use this radius to switch between the resolved and
+ *  unresolved feedback strategies.
+ *
+ *  \param ndens_cgs the average ambient number density
+ *  \param num_SN the number of supernovae; only its magnitude enters the result (via fabs)
+ */
+inline __device__ __host__ Real radius_shell_formation_kpc(Real ndens_cgs, int num_SN)
+{
+  // Originally the normalization came from eq.(31) of Kim & Ostriker (2015), 30.2 pc (the fit to
+  // R_sh for a multiphase ambient medium), with the analytic scaling for a uniform medium, eq.(8).
+  // A later comparison against Figure 6 of the TIGRESS paper (Kim & Ostriker 2017), using CIE
+  // cooling with ambient temperatures fixed at 1e4 K, made it clear that we, like TIGRESS, should
+  // use the analytic choice for a single-phase ambient medium (we then transition to unresolved
+  // feedback more readily, which decreases the chance of over-cooling).
+  const auto density_scaling = pow(ndens_cgs, -0.46);
+  const auto count_scaling   = pow(fabs(Real(num_SN)), 0.29);
+  return 0.0226 * density_scaling * count_scaling;
+}
+
+/* Feedback prescription that always injects supernova mass and thermal energy through `Stencil`
+ * (it has no unresolved branch). `Stencil` supplies `nearest_noGhostOverlap_pos` and `for_each`. */
+template <typename Stencil>
+struct ResolvedSNPrescription {
+  /* the following 2 attributes exist for testing purposes */
+  static constexpr bool has_resolved_prescription   = true;
+  static constexpr bool has_unresolved_prescription = false;
+
+  static __device__ hydro_utilities::VectorXYZ<Real> nearest_noGhostOverlap_pos(
+      hydro_utilities::VectorXYZ<Real> pos_indU, int ng_x, int ng_y, int ng_z, int n_ghost)
+  {
+    return Stencil::nearest_noGhostOverlap_pos(pos_indU, ng_x, ng_y, ng_z, n_ghost);
+  }
+
+  // ToDo: refactor to make use of Stencil::for_each_overlap_zone
+  template <typename Function>
+  static __device__ void for_each_possible_overlap(Real pos_x_indU, Real pos_y_indU, Real pos_z_indU, int nx_g,
+                                                   int ny_g, Function&& f)
+  {
+    Stencil::for_each(hydro_utilities::VectorXYZ<Real>{pos_x_indU, pos_y_indU, pos_z_indU}, nx_g, ny_g,
+                      std::forward<Function>(f));
+  }
+
+  static __device__ void apply_feedback(Real pos_x_indU, Real pos_y_indU, Real pos_z_indU, Real vel_x, Real vel_y,
+                                        Real vel_z, Real age, Real& mass_ref, part_int_t particle_id, Real dx, Real dy,
+                                        Real dz, int nx_g, int ny_g, int nz_g, int n_ghost, int num_SN, int cycle_num,
+                                        Real* s_info, Real* conserved_dev)
+  {
+    int tid = threadIdx.x;
+
+    // log summary statistics; like the injected energy, the logged energy scales with num_SN
+    s_info[FBInfoLUT::LEN * tid + FBInfoLUT::countSN] += num_SN;
+    s_info[FBInfoLUT::LEN * tid + FBInfoLUT::countResolved] += num_SN;
+    s_info[FBInfoLUT::LEN * tid + FBInfoLUT::totalEnergy] += num_SN * feedback::ENERGY_PER_SN;
+
+    Real dV               = dx * dy * dz;
+    Real feedback_energy  = num_SN * feedback::ENERGY_PER_SN / dV;
+    Real feedback_density = num_SN * feedback::MASS_PER_SN / dV;
+
+    mass_ref = max(0.0, mass_ref - num_SN * feedback::MASS_PER_SN);  // update the cluster mass
+    log_fb(cycle_num, num_SN, true, pos_x_indU, pos_y_indU, pos_z_indU, vel_x, vel_y, vel_z, particle_id);
+
+    ResolvedSNPrescription::apply(hydro_utilities::VectorXYZ<Real>{pos_x_indU, pos_y_indU, pos_z_indU}, vel_x, vel_y,
+                                  vel_z, nx_g, ny_g, nx_g * ny_g * nz_g, conserved_dev, feedback_density,
+                                  feedback_energy);
+  }
+
+  /* apply the resolved feedback prescription (feedback_density & feedback_energy are per-volume quantities) */
+  static __device__ void apply(hydro_utilities::VectorXYZ<Real> pos_indU, Real vel_x, Real vel_y, Real vel_z, int nx_g,
+                               int ny_g, int n_cells, Real* conserved_device, Real feedback_density,
+                               Real feedback_energy)
+  {
+    Real* density    = conserved_device;
+    Real* momentum_x = &conserved_device[n_cells * grid_enum::momentum_x];
+    Real* momentum_y = &conserved_device[n_cells * grid_enum::momentum_y];
+    Real* momentum_z = &conserved_device[n_cells * grid_enum::momentum_z];
+    Real* energy     = &conserved_device[n_cells * grid_enum::Energy];
+#ifdef DE
+    Real* gasEnergy = &conserved_device[n_cells * grid_enum::GasEnergy];
+#endif
+
+    Stencil::for_each(pos_indU, nx_g, ny_g, [=](double stencil_vol_frac, int idx3D) {
+      // stencil_vol_frac is the fraction of the total stencil volume enclosed by the given cell
+      // idx3D can be used to index the conserved fields (it assumes ghost-zones are present)
+
+      // Step 1: subtract off the kinetic-energy-density from total energy density.
+      //  - While we aren't going to inject any of the supernova energy directly as kinetic energy,
+      //    the kinetic energy density will change to some degree because the gas density and gas
+      //    momentum will be changed
+      Real intial_ke_density = 0.5 *
+                               (momentum_x[idx3D] * momentum_x[idx3D] + momentum_y[idx3D] * momentum_y[idx3D] +
+                                momentum_z[idx3D] * momentum_z[idx3D]) /
+                               density[idx3D];
+      energy[idx3D] -= intial_ke_density;
+
+      // Step 2: convert the momentum-density into the star's reference frame, update the density,
+      //  and then update the momentum-density back into the initial reference-frame
+      //  - since we aren't explicitly injecting the supernova-energy as kinetic energy, this is
+      //    equivalent to adding momentum in the original frame as is done below
+      double injected_density = stencil_vol_frac * feedback_density;
+
+      momentum_x[idx3D] += vel_x * injected_density;
+      momentum_y[idx3D] += vel_y * injected_density;
+      momentum_z[idx3D] += vel_z * injected_density;
+
+      // Step 2b: actually update the density
+      density[idx3D] += injected_density;
+
+      // Step 3: inject thermal energy
+#ifdef DE
+      gasEnergy[idx3D] += stencil_vol_frac * feedback_energy;
+#endif
+      energy[idx3D] += stencil_vol_frac * feedback_energy;
+
+      // Step 4: reintroduce the kinetic energy density back to the total energy field
+      energy[idx3D] += 0.5 *
+                       (momentum_x[idx3D] * momentum_x[idx3D] + momentum_y[idx3D] * momentum_y[idx3D] +
+                        momentum_z[idx3D] * momentum_z[idx3D]) /
+                       density[idx3D];
+    });
+  }
+};
+
+// Define some type aliases to make it more convenient to refer to the
+// different flavors of Resolved Feedback Prescriptions
+using CiCResolvedSNPrescription      = ResolvedSNPrescription<fb_stencil::CIC>;
+using Sphere27ResolvedSNPrescription = ResolvedSNPrescription<fb_stencil::Sphere27<2>>;
+// the following case is mostly experimental
+using SphereBinaryResolvedSNPrescription = ResolvedSNPrescription<fb_stencil::SphereBinary<3>>;
+
+/* Overwrite the stencil region with the average density specified by `overwrite_density`
+ *
+ * Each component of the momentum density is overwritten with its average over the stencil region,
+ * while the thermal-energy density is held constant.
+ *
+ * \note
+ * We were a little torn about how to handle the momentum. There were essentially 3 options when
+ * overwriting the density:
+ *   1. Holding the velocity constant at each location.
+ *      - Pro: If you had a bunch of cells with common velocity (e.g. because all of the gas rotates
+ *        in a disk) but had a varying density, this would ensure that the gas remains comoving.
+ *      - Con: Imagine that you had a bunch of cells that varied in density and velocity and you
+ *        had 1 cell with a particularly high velocity, but below-average density. This approach
+ *        would give the gas in that cell a lot of additional inertia, which could cause problems
+ *   2. Holding the momentum constant at each location
+ *      - Pro: You would ALWAYS avoid converting a very fast-moving underdense cell into a
+ *        fast-moving average-density cell (This avoids the CON of holding velocity constant)
+ *      - Con: In the case where all cells are co-rotating around the disk, but have differing
+ *        densities, the initially under-dense (over-dense) cells would move slower (faster) after
+ *        the overwrite operation. (This loses the PRO of holding velocity constant)
+ *   3. Overwriting the momentum of each cell with the average momentum
+ *      - Pro: this has the advantages and none of the disadvantages of the other options
+ *      - Con: this involves more averaging (Ideally, we wouldn't average any field)
+ * Since we are already overwriting the density with the average value anyways, we decided that it
+ * made the most sense to adopt option #3
+ */
+template <typename Stencil>
+__device__ void Overwrite_Average(hydro_utilities::VectorXYZ<Real> stencil_pos_indU, int nx_g, int ny_g, int nz_g,
+                                  Real* conserved_device, Real overwrite_density)
+{
+  // step 1: set up pointers to the relevant fields
+  // - Note: even if we held the momentum density constant in each cell, we would still need to
+  //   modify the total energy to reflect changes in the kinetic energy density
+  const int n_cells = nx_g * ny_g * nz_g;
+  Real* density     = conserved_device;
+  Real* momentum_x  = conserved_device + n_cells * grid_enum::momentum_x;
+  Real* momentum_y  = conserved_device + n_cells * grid_enum::momentum_y;
+  Real* momentum_z  = conserved_device + n_cells * grid_enum::momentum_z;
+  Real* energy      = conserved_device + n_cells * grid_enum::Energy;
+
+  // step 2: determine the average momentum density over the zones visited by the stencil
+  // - Note: overwrite_density serves as the average density since the caller already needed to
+  //   compute that value anyways
+  Real sum_px = 0.0;
+  Real sum_py = 0.0;
+  Real sum_pz = 0.0;
+  int n_zones = 0;
+  Stencil::for_each_overlap_zone(stencil_pos_indU, nx_g, ny_g, [&](int zone) {
+    sum_px += momentum_x[zone];
+    sum_py += momentum_y[zone];
+    sum_pz += momentum_z[zone];
+    n_zones++;
+  });
+  const Real mean_px = sum_px / n_zones;
+  const Real mean_py = sum_py / n_zones;
+  const Real mean_pz = sum_pz / n_zones;
+
+  // step 3: actually overwrite the fields
+  // - every overwritten zone ends up with the same density and momentum, so the new kinetic
+  //   energy density is the same everywhere and is computed once up front
+  const Real new_ke_density =
+      0.5 * (mean_px * mean_px + mean_py * mean_py + mean_pz * mean_pz) / overwrite_density;
+  Stencil::for_each_overlap_zone(stencil_pos_indU, nx_g, ny_g, [=](int zone) {
+    // precompute 1/initial_density (the TINY_NUMBER term avoids a divide by 0)
+    const Real inv_old_density = 1.0 / (density[zone] + TINY_NUMBER * (density[zone] == 0.0));
+    const Real old_ke_density  = 0.5 * inv_old_density *
+                                (momentum_x[zone] * momentum_x[zone] + momentum_y[zone] * momentum_y[zone] +
+                                 momentum_z[zone] * momentum_z[zone]);
+    density[zone]    = overwrite_density;
+    momentum_x[zone] = mean_px;
+    momentum_y[zone] = mean_py;
+    momentum_z[zone] = mean_pz;
+    energy[zone] += new_ke_density - old_ke_density;
+  });
+}
+
+/* \brief Deposit density, momentum, and thermal energy from an unresolved
+ * supernova or from a stellar wind
+ *
+ * \note
+ * Previously there were 2 separate functions defined to perform this operation. They were
+ * functionally the same. The only differences were the names of variables.
+ *
+ * \par
+ * There are currently issues with the internals of this function:
+ * - this requires the codebase to be compiled with the dual energy formalism
+ *   (NOTE(review): the gas-energy accesses are wrapped in `#ifdef DE`; confirm this still holds)
+ * - momentum and total energy are not updated self-consistently
+ */
+template <typename Stencil>
+inline __device__ void Apply_Energy_Momentum_Deposition(Real pos_x_indU, Real pos_y_indU, Real pos_z_indU, Real vel_x,
+                                                        Real vel_y, Real vel_z, int nx_g, int ny_g, int n_ghost,
+                                                        int n_cells, Real* conserved_device, Real feedback_density,
+                                                        Real feedback_momentum, Real feedback_energy)
+{
+  Real* density    = conserved_device;
+  Real* momentum_x = &conserved_device[n_cells * grid_enum::momentum_x];
+  Real* momentum_y = &conserved_device[n_cells * grid_enum::momentum_y];
+  Real* momentum_z = &conserved_device[n_cells * grid_enum::momentum_z];
+  Real* energy     = &conserved_device[n_cells * grid_enum::Energy];
+#ifdef DE
+  Real* gas_energy = &conserved_device[n_cells * grid_enum::GasEnergy];
+#endif
+  // the callback receives a scalar weight, per-component momentum weights, and a cell index
+  Stencil::for_each_vecflavor(
+      {pos_x_indU, pos_y_indU, pos_z_indU}, nx_g, ny_g,
+      [=](Real scalar_weight, hydro_utilities::VectorXYZ<Real> momentum_weights, int idx3D) {
+        // precompute 1/initial_density (take care to avoid divide by 0)
+        const Real inv_initial_density = 1.0 / (density[idx3D] + TINY_NUMBER * (density[idx3D] == 0.0));
+
+        // Step 1: subtract off the kinetic-energy-density from total energy density.
+        //  - Regardless of whether we inject thermal energy, the kinetic energy density will change to
+        //    some degree because the gas density and gas momentum will be changed
+
+        const Real intial_ke_density = 0.5 * inv_initial_density *
+                                       (momentum_x[idx3D] * momentum_x[idx3D] + momentum_y[idx3D] * momentum_y[idx3D] +
+                                        momentum_z[idx3D] * momentum_z[idx3D]);
+        energy[idx3D] -= intial_ke_density;
+
+        // Step 2: convert the gas's momentum density to its value in the particle's reference frame
+        //  - This must be done after subtracting off KE
+        //  - This could probably be written more concisely (momentum_x[idx3D] -= density[idx3D] * vel_x),
+        //    but before we do that, we should leave the 3 lines of algebra used to derive that in the
+        //    comments since the abbreviated form "looks wrong" at a quick glance
+        {
+          // compute the local velocity
+          Real gas_vx = inv_initial_density * momentum_x[idx3D];
+          Real gas_vy = inv_initial_density * momentum_y[idx3D];
+          Real gas_vz = inv_initial_density * momentum_z[idx3D];
+
+          // adjust the velocity so it's in the new frame
+          gas_vx -= vel_x;
+          gas_vy -= vel_y;
+          gas_vz -= vel_z;
+
+          // update the momentum
+          momentum_x[idx3D] = density[idx3D] * gas_vx;
+          momentum_y[idx3D] = density[idx3D] * gas_vy;
+          momentum_z[idx3D] = density[idx3D] * gas_vz;
+        }
+
+        // Step 3a: inject density and momentum
+        density[idx3D] += scalar_weight * feedback_density;
+        momentum_x[idx3D] += momentum_weights[0] * feedback_momentum;
+        momentum_y[idx3D] += momentum_weights[1] * feedback_momentum;
+        momentum_z[idx3D] += momentum_weights[2] * feedback_momentum;
+
+        // Step 3b: inject any thermal energy
+        // - Note: it's weird to be injecting a fixed amount of thermal energy and momentum. This means we
+        //   are injecting a variable amount of total energy...
+
+        energy[idx3D] += scalar_weight * feedback_energy;
+#ifdef DE
+        gas_energy[idx3D] += scalar_weight * feedback_energy;
+#endif
+
+        // precompute 1/final_density (take care to avoid divide by 0)
+        const Real inv_final_density = 1.0 / (density[idx3D] + TINY_NUMBER * (density[idx3D] == 0.0));
+
+        // Step 4: convert the momentum back to the starting reference frame.
+        //  - again, this could certainly be done more concisely
+        {
+          // compute the local velocity
+          Real gas_vx = inv_final_density * momentum_x[idx3D];
+          Real gas_vy = inv_final_density * momentum_y[idx3D];
+          Real gas_vz = inv_final_density * momentum_z[idx3D];
+
+          // adjust the velocity so that it's in the original frame (it's no longer in the particle's frame)
+          gas_vx += vel_x;
+          gas_vy += vel_y;
+          gas_vz += vel_z;
+
+          // update the momentum
+          momentum_x[idx3D] = density[idx3D] * gas_vx;
+          momentum_y[idx3D] = density[idx3D] * gas_vy;
+          momentum_z[idx3D] = density[idx3D] * gas_vz;
+        }
+
+        // Step 5: add the new kinetic energy density to the total_energy density field
+        //  - at this point the total_energy density field just holds the non-kinetic energy density
+        //  - this needs to happen after changing reference frames (since KE is reference frame dependent)
+        energy[idx3D] += 0.5 * inv_final_density *
+                         (momentum_x[idx3D] * momentum_x[idx3D] + momentum_y[idx3D] * momentum_y[idx3D] +
+                          momentum_z[idx3D] * momentum_z[idx3D]);
+      });
+}
+
+/* Legacy SNe prescription that combines resolved and unresolved */
+template <typename ResolvedPrescriptionT, typename UnresolvedStencil>
+struct ResolvedAndUnresolvedSNe {
+  /* the following 2 attributes exists for testing purposes */
+  static constexpr bool has_resolved_prescription   = true;
+  static constexpr bool has_unresolved_prescription = true;
+
+  static __device__ hydro_utilities::VectorXYZ<Real> nearest_noGhostOverlap_pos(
+      hydro_utilities::VectorXYZ<Real> pos_indU, int ng_x, int ng_y, int ng_z, int n_ghost)
+  {
+    // for right now, we are assuming that the stencil of the unresolved feedback is the same size or
+    // bigger than the stencil used for the resolved feedback
+    return UnresolvedStencil::nearest_noGhostOverlap_pos(pos_indU, ng_x, ng_y, ng_z, n_ghost);
+  }
+
+  // ToDo: refactor to make use of UnresolvedStencil::for_each_overlap_zone
+  template <typename Function>
+  static __device__ void for_each_possible_overlap(Real pos_x_indU, Real pos_y_indU, Real pos_z_indU, int nx_g,
+                                                   int ny_g, Function&& f)
+  {
+    UnresolvedStencil::for_each(hydro_utilities::VectorXYZ<Real>{pos_x_indU, pos_y_indU, pos_z_indU}, nx_g, ny_g,
+                                std::forward<Function>(f));
+  }
+
+  static __device__ void apply_feedback(Real pos_x_indU, Real pos_y_indU, Real pos_z_indU, Real vel_x, Real vel_y,
+                                        Real vel_z, Real age, Real& mass_ref, part_int_t particle_id, Real dx, Real dy,
+                                        Real dz, int nx_g, int ny_g, int nz_g, int n_ghost, int num_SN, int cycle_num,
+                                        Real* s_info, Real* conserved_dev)
+  {
+    int tid = threadIdx.x;
+
+    Real dV     = dx * dy * dz;
+    int n_cells = nx_g * ny_g * nz_g;
+
+    hydro_utilities::VectorXYZ<Real> pos_indU{pos_x_indU, pos_y_indU, pos_z_indU};
+
+    Real* density = conserved_dev;
+
+    // compute the average mass density
+    Real avg_mass_dens;
+    {
+      Real dtot = 0.0;
+      int num   = 0;
+      UnresolvedStencil::for_each_overlap_zone(pos_indU, nx_g, ny_g, [&dtot, &num, density](int idx3) {
+        dtot += density[idx3];
+        num++;
+      });
+      avg_mass_dens = dtot / num;
+    }
+    Real n_0_cgs = avg_mass_dens * DENSITY_UNIT / (MU * MP);  // average number density in cgs
+
+    s_info[FBInfoLUT::LEN * tid + FBInfoLUT::countSN] += num_SN;
+
+    Real shell_radius = radius_shell_formation_kpc(n_0_cgs, num_SN);
+
+    const bool is_resolved = (3 * max(dx, max(dy, dz)) <= shell_radius);
+
+    mass_ref              = max(0.0, mass_ref - num_SN * feedback::MASS_PER_SN);  // update the cluster mass
+    Real feedback_density = num_SN * feedback::MASS_PER_SN / dV;
+
+    log_fb(cycle_num, num_SN, is_resolved, pos_x_indU, pos_y_indU, pos_z_indU, vel_x, vel_y, vel_z, particle_id,
+           n_0_cgs);
+
+    if (is_resolved) {
+      // inject energy and density
+      Real feedback_energy = num_SN * feedback::ENERGY_PER_SN / dV;
+
+      s_info[FBInfoLUT::LEN * tid + FBInfoLUT::countResolved] += num_SN;
+      s_info[FBInfoLUT::LEN * tid + FBInfoLUT::totalEnergy] += feedback_energy * dV;
+
+      ResolvedPrescriptionT::apply(pos_indU, vel_x, vel_y, vel_z, nx_g, ny_g, n_cells, conserved_dev, feedback_density,
+                                   feedback_energy);
+    } else {
+      // only unresolved SN feedback involves averaging the densities.
+      // -> we decided that if we are averaging the densities, it probably also
+      //    makes sense to average the momentum
+      Overwrite_Average<UnresolvedStencil>(pos_indU, nx_g, ny_g, nz_g, conserved_dev, avg_mass_dens);
+
+      // inject momentum and density
+
+      // the calculation of momentum was inherited from Orlando's older implementation
+      // -> `feedback::FINAL_MOMENTUM * pow(n_0_cgs, -0.17)` comes directly from eqn 34 of Kim &
+      //    Ostriker (2015). This is equation is also cited in Kim & Ostriker (2017) -- the TIGRESS
+      //    paper.
+      // -> the factor of pow(Real(num_SN), 0.93) has a less clear origin
+      //    - my speculation is that it comes from the E_{51}^{0.93} term in equation 17 of Kim &
+      //      Ostriker (2015).
+      //    - Note: in that other equation, the coefficient is slightly different AND n_0_cgs's
+      //      exponent is also slightly different
+      //    - It's not clear to me whether we should include this term.
+      // -> Earlier versions of the code divided by sqrt(3).
+      //    - I didn't totally understand this, but I'm confident that this is due to the fact that
+      //      we weren't explicitly normalizing the momentum based on the full normalized stencil
+      //      (effectively momentum normalization was computed a prior). Now we do normalize by the
+      //      the total magnitude, so it's definitely unnecessary!
+      //    - if we ever want to reintroduce this extra factor of sqrt(3), based on the current
+      //      organization of the code, we should now do it in the stencil.
+      Real feedback_momentum         = feedback::FINAL_MOMENTUM * pow(n_0_cgs, -0.17) * pow(Real(num_SN), 0.93);
+      Real feedback_momentum_density = feedback_momentum / dV;
+      Real feedback_energy           = 0.0;  // for now, don't inject any energy
+
+      s_info[FBInfoLUT::LEN * tid + FBInfoLUT::countUnresolved] += num_SN;
+      s_info[FBInfoLUT::LEN * tid + FBInfoLUT::totalMomentum] += feedback_momentum;
+      s_info[FBInfoLUT::LEN * tid + FBInfoLUT::totalUnresEnergy] += feedback_energy * dV;
+      Apply_Energy_Momentum_Deposition<UnresolvedStencil>(pos_x_indU, pos_y_indU, pos_z_indU, vel_x, vel_y, vel_z, nx_g,
+                                                          ny_g, n_ghost, n_cells, conserved_dev, feedback_density,
+                                                          feedback_momentum_density, feedback_energy);
+    }
+  }
+};
+
+// the next line defines a type alias to make it easier to refer to different hybrid prescriptions
+// - this uses Orlando's unresolved feedback deposition
+using CiCLegacyResolvedAndUnresolvedPrescription =
+    ResolvedAndUnresolvedSNe<CiCResolvedSNPrescription, fb_stencil::LegacyCIC27>;
+
+// the next line defines a shorthand for describing a prescription that is mostly used for testing purposes
+// - in this case, we adopt a slightly different strategy for momentum-feedback. We still use 27 cells,
+//   but I think some of the choices make more sense (and are easier to understand). With that said, testing
+//   doesn't seem to reveal much of a difference from the prior case. So we primarily stick with the prior case
+//   (since we used it to run simulations already)
+using HybridResolvedAndUnresolvedPrescription =
+    ResolvedAndUnresolvedSNe<CiCResolvedSNPrescription, fb_stencil::Sphere27<2>>;
+
+// the following code is left over from a much earlier version:
+
+/*
+inline __device__ void Wind_Feedback(Real pos_x, Real pos_y, Real pos_z, Real age, Real& mass_ref, part_int_t
+particle_id, Real xMin, Real yMin, Real zMin, Real xMax, Real yMax, Real zMax, Real dx, Real dy, Real dz, int nx_g, int
+ny_g, int nz_g, int n_ghost, int n_step, Real t, Real dt, const feedback::SWRateCalc sw_calc, Real* s_info, Real*
+conserved_dev, Real gamma, int indx_x, int indx_y, int indx_z)
+{
+  int tid  = threadIdx.x;
+
+  Real dV = dx * dy * dz;
+  int n_cells    = nx_g * ny_g * nz_g;
+
+  if ((age < 0) or not sw_calc.is_active(age)) return;
+  Real feedback_momentum = sw_calc.Get_Wind_Flux(age);
+  // no sense in proceeding if there is no feedback.
+  if (feedback_momentum == 0) return;
+  Real feedback_energy  = sw_calc.Get_Wind_Power(age);
+  Real feedback_density = sw_calc.Get_Wind_Mass(feedback_momentum, feedback_energy);
+
+  // feedback_momentum now becomes momentum component along one direction.
+  feedback_momentum *= mass_ref * dt / dV / sqrt(3.0);
+  feedback_density *= mass_ref * dt / dV;
+  feedback_energy *= mass_ref * dt / dV;
+
+  mass_ref   -= feedback_density * dV;
+
+  // we log net momentum, not momentum density, and magnitude (not the
+  // component along a direction)
+  s_info[FBInfoLUT::LEN * tid + FBInfoLUT::totalWindMomentum] += feedback_momentum * dV * sqrt(3.0);
+  s_info[FBInfoLUT::LEN * tid + FBInfoLUT::totalWindEnergy]   += feedback_energy * dV;
+
+
+  const double pos_x_indU = (pos_x - xMin) / dx + n_ghost;
+  const double pos_y_indU = (pos_y - yMin) / dy + n_ghost;
+  const double pos_z_indU = (pos_z - zMin) / dz + n_ghost;
+
+  Apply_Energy_Momentum_Deposition(pos_x_indU, pos_y_indU, pos_z_indU, nx_g, ny_g, n_ghost,
+                                   n_cells, conserved_dev, feedback_density,
+                                   feedback_momentum, feedback_energy);
+}
+*/
+
+}  // namespace fb_prescription
\ No newline at end of file
diff --git a/src/feedback/ratecalc.cu b/src/feedback/ratecalc.cu
new file mode 100644
index 000000000..f5023b998
--- /dev/null
+++ b/src/feedback/ratecalc.cu
@@ -0,0 +1,98 @@
+#include <string>
+#include <vector>
+
+#include "../feedback/ratecalc.h"
+#include "../feedback/s99table.h"
+#include "../io/ParameterMap.h"
+#include "../io/io.h"
+
+// Construct the SN rate calculator. When "feedback.snr_filename" is given, the Starburst99 SN table
+// is read and the rates are uploaded to the device; otherwise the default constant rate is kept.
+feedback::SNRateCalc::SNRateCalc(ParameterMap& pmap)
+    : SNRateCalc()  // the default constructor sets up some sensible defaults
+{
+  chprintf("feedback::Init_State start\n");
+
+  std::string snr_filename(pmap.value_or("feedback.snr_filename", ""));
+  if (snr_filename.empty()) {
+    // the constant-rate configuration is handled by the default constructor for us
+    chprintf("No SN rate file specified.  Using constant rate\n");
+  } else {
+    chprintf("Specified a SNR filename %s.\n", snr_filename.data());
+
+    feedback::S99Table tab     = parse_s99_table(snr_filename, feedback::S99TabKind::supernova);
+    const std::size_t time_col = tab.col_index("TIME");
+    const std::size_t rate_col = tab.col_index("ALL SUPERNOVAE: TOTAL RATE");
+    const std::size_t nrows    = tab.nrows();
+    if (nrows < 2) CHOLLA_ERROR("The SN rate table must hold at least 2 rows to define the time spacing.");
+
+    // read in array of supernova rate values.
+    std::vector<Real> snr_time(nrows);
+    std::vector<Real> snr(nrows);
+    for (std::size_t i = 0; i < nrows; i++) {
+      // in the following divide by # years per kyr (1000)
+      snr_time[i] = tab(time_col, i) / 1000;
+      snr[i]      = pow(10, tab(rate_col, i)) * 1000 / S_99_TOTAL_MASS;
+    }
+
+    time_sn_end_   = snr_time[snr_time.size() - 1];
+    time_sn_start_ = snr_time[0];
+    // the following is the time interval between data points (i.e. assumes regular temporal spacing)
+    snr_dt_ = (time_sn_end_ - time_sn_start_) / (snr.size() - 1);
+    GPU_Error_Check(cudaMalloc((void**)&dev_snr_, snr.size() * sizeof(Real)));
+    GPU_Error_Check(cudaMemcpy(dev_snr_, snr.data(), snr.size() * sizeof(Real), cudaMemcpyHostToDevice));
+  }
+}
+
+/* Read in stellar wind data from Starburst 99 and upload the momentum-flux and power tables to
+ * the device.
+ *
+ * When FEEDBACK is undefined or NO_WIND_FEEDBACK is defined, no table is read and the members keep
+ * the zero/nullptr values from the initializer list.
+ * @param P parameter map; "feedback.sw_filename" is required when wind feedback is compiled in
+ */
+feedback::SWRateCalc::SWRateCalc(ParameterMap& P)
+    : dev_sw_p_(nullptr), dev_sw_e_(nullptr), sw_dt_(0.0), time_sw_start_(0.0), time_sw_end_(0.0)
+{
+#if (!defined(FEEDBACK)) || defined(NO_WIND_FEEDBACK)
+  return;
+#else
+  chprintf("Init_Wind_State start\n");
+  // this will produce a nicely formatted error if we forget to specify the required parameter
+  std::string sw_filename = P.value<std::string>("feedback.sw_filename");
+
+  feedback::S99Table tab = parse_s99_table(sw_filename, feedback::S99TabKind::stellar_wind);
+
+  const std::size_t COL_TIME       = tab.col_index("TIME");
+  const std::size_t COL_POWER      = tab.col_index("POWER: ALL");
+  const std::size_t COL_ALL_P_FLUX = tab.col_index("MOMENTUM FLUX: ALL");
+  const std::size_t nrows          = tab.nrows();
+
+  std::vector<Real> sw_time(nrows);
+  std::vector<Real> sw_p(nrows);
+  std::vector<Real> sw_e(nrows);
+
+  for (std::size_t i = 0; i < nrows; i++) {
+    sw_time[i] = tab(COL_TIME, i) / 1000;  // divide by # years per kyr (1000)
+    sw_e[i]    = tab(COL_POWER, i);
+    sw_p[i]    = tab(COL_ALL_P_FLUX, i);
+  }
+
+  time_sw_end_   = sw_time[sw_time.size() - 1];
+  time_sw_start_ = sw_time[0];
+  // the following is the time interval between data points
+  // (i.e. assumes regular temporal spacing)
+  sw_dt_ = (time_sw_end_ - time_sw_start_) / (sw_p.size() - 1);
+  chprintf("wind t_s %.5e, t_e %.5e, delta T %0.5e\n", time_sw_start_, time_sw_end_, sw_dt_);
+
+  GPU_Error_Check(cudaMalloc((void**)&dev_sw_p_, sw_p.size() * sizeof(Real)));
+  GPU_Error_Check(cudaMemcpy(dev_sw_p_, sw_p.data(), sw_p.size() * sizeof(Real), cudaMemcpyHostToDevice));
+
+  GPU_Error_Check(cudaMalloc((void**)&dev_sw_e_, sw_e.size() * sizeof(Real)));
+  GPU_Error_Check(cudaMemcpy(dev_sw_e_, sw_e.data(), sw_e.size() * sizeof(Real), cudaMemcpyHostToDevice));
+  // print (at most) the first 40 rows; the bound guards `at` against tables with fewer rows
+  chprintf("first 40 stellar wind momentum values:\n");
+  for (std::size_t i = 0; i < nrows && i < 40; i++) {
+    chprintf("%0.5e  %5f %5f \n", sw_time.at(i), sw_e.at(i), sw_p.at(i));
+  }
+#endif
+}
diff --git a/src/feedback/ratecalc.h b/src/feedback/ratecalc.h
new file mode 100644
index 000000000..1c62fb9ac
--- /dev/null
+++ b/src/feedback/ratecalc.h
@@ -0,0 +1,217 @@
+#ifndef FEEDBACK_RATECALC_H
+#define FEEDBACK_RATECALC_H
+
+#ifdef O_HIP
+  #include <hiprand.h>
+  #include <hiprand_kernel.h>
+#else
+  #include <curand.h>
+  #include <curand_kernel.h>
+#endif  // O_HIP
+
+#include <string>
+
+#include "../global/global.h"
+#include "../io/ParameterMap.h"
+#include "../utils/gpu.hpp"
+
+typedef curandStateMRG32k3a_t feedback_prng_t;
+
+// This header declares classes that encapsulate calculations of SN rates and the rate of SW
+// deposition
+//
+// Currently, they don't have destructors that deallocate the data. This is fine in the short term,
+// since we only construct up to a single instance of each class on the host during the entire
+// simulation. With that said, I do have a strategy in mind for resolving this.
+
+// seed for poisson random number generator
+#define FEEDBACK_SEED 42
+
+// the starburst 99 total stellar mass input
+// stellar wind momentum fluxes and SN rates
+// must be divided by this to get per solar
+// mass values.
+#define S_99_TOTAL_MASS 1e6
+
+namespace feedback
+{
+/* Fallback values used when no supernova-rate table is loaded. (The following should really be macros) */
+// supernova rate: 1SN / 100 solar masses per 36 Myr (expressed as SNe per kyr per solar mass)
+static const Real DEFAULT_SNR = 2.8e-7;
+// default value for the cluster age when SNe stop (40 Myr, expressed in kyr)
+static const Real DEFAULT_SN_END = 40000;
+// default value for the cluster age when SNe start (4 Myr, expressed in kyr)
+static const Real DEFAULT_SN_START = 4000;
+
+/* Encapsulate Supernova Rate Calculation that is primarily intended to interpolate the data from
+ * starburst99 tables.
+ *
+ * @note
+ * The destructor doesn't currently deallocate the device heap-data. That's okay for the moment
+ * because the only way to allocate that data at the moment is to call the table-reader constructor,
+ * and that table-reader particular constructor is only called once during the entire duration of
+ * the simulation. With that said, I do have plans to address this issue in the future.
+ */
+struct SNRateCalc {
+ public:
+  /* Default constructor. Ensures this object is always in a usable state
+   *
+   * This assumes a constant supernova rate given by feedback::DEFAULT_SNR
+   */
+  __host__ __device__ SNRateCalc()
+      : dev_snr_(nullptr),
+        snr_dt_(feedback::DEFAULT_SN_END - feedback::DEFAULT_SN_START),
+        time_sn_start_(feedback::DEFAULT_SN_START),
+        time_sn_end_(feedback::DEFAULT_SN_END)
+  {
+  }
+
+  /* The "table-reader" constructor.
+   *
+   * Reads data from the specified file and allocates heapdata. If no file was specified, fall back
+   * to configuration assumed in default constructor.
+   *
+   * @param pmap reference to the parameter map. Passes in starburst 99 filename.
+   */
+  __host__ SNRateCalc(ParameterMap &pmap);
+
+  /* returns supernova rate from starburst 99 (or default analytical rate).
+   *
+   * Linearly interpolates the S'99 table values (table rows are indexed relative to time_sn_start_).
+   *
+   * @param t   The cluster age.
+   * @return number of SNe per kyr per solar mass (0 when t lies outside [time_sn_start_, time_sn_end_))
+   *
+   * @note
+   * It's important to retain the inline annotation to maximize the chance of inlining.
+   */
+  inline __device__ Real Get_SN_Rate(Real t) const
+  {
+    if ((t < time_sn_start_) or (t >= time_sn_end_)) return 0;
+    if (dev_snr_ == nullptr) return feedback::DEFAULT_SNR;
+    const Real t_rel = t - time_sn_start_;  // the row index AND the interpolation offset are relative to the start
+    const int index  = (int)(t_rel / snr_dt_);
+    return dev_snr_[index] + (t_rel - index * snr_dt_) * (dev_snr_[index + 1] - dev_snr_[index]) / snr_dt_;
+  }
+
+  /* Get an actual number of SNe given the expected number. Both the simulation step number
+   * and cluster ID is used to set the state of the random number generator in a unique and
+   * deterministic way.
+   *
+   * @param ave_num_sn expected number of SN, based on cluster age, mass and time step.
+   * @param n_step sim step number
+   * @param cluster_id
+   * @return number of supernovae
+   *
+   * @note
+   * It's important to retain the inline annotation to maximize the chance of inlining
+   */
+  static inline __device__ int Get_Number_Of_SNe_In_Cluster(Real ave_num_sn, int n_step, part_int_t cluster_id)
+  {
+    feedback_prng_t state;
+    // Note: in the C++ spec, wrap-around behavior is well-defined for unsigned types during integer
+    //       overflow (overflow for signed types invokes undefined behavior)
+    unsigned long long seed = (cluster_id < 0)
+                                  ? (unsigned long long)(FEEDBACK_SEED) - (unsigned long long)(-1 * cluster_id)
+                                  : (unsigned long long)(FEEDBACK_SEED) + (unsigned long long)(cluster_id);
+    curand_init(seed, 0, 0, &state);
+    skipahead((unsigned long long)(n_step), &state);  // provided by curand
+    return (int)curand_poisson(&state, ave_num_sn);
+  }
+
+  inline __device__ bool nonzero_sn_probability(Real age) const
+  {
+    return (time_sn_start_ <= age) and (age <= time_sn_end_);  // both bounds are inclusive in this check
+  }
+
+ private:  // attributes
+  /* device array with rate info */
+  Real *dev_snr_;
+  /* time interval between table data. Assumed to be constant. */
+  Real snr_dt_;
+  /* cluster age when SNR is first greater than zero. */
+  Real time_sn_start_;
+  /* cluster age when SNR drops to zero. */
+  Real time_sn_end_;
+};
+
+/* Class responsible for computing stellar-wind rates
+ *
+ * @note
+ * These were pretty much extracted directly from feedback.cu. There's a chance that there are some
+ * logical errors in these functions. In particular, the way we have been using the Wind_Flux and
+ * Wind_Power to update gas-momentum and gas-energy is inconsistent.
+ *
+ * @note
+ * The destructor doesn't currently deallocate the device heap-data. That's okay for the moment
+ * because the only way to allocate that data at the moment is to call the table-reader constructor,
+ * and that table-reader particular constructor is only called once during the entire duration of
+ * the simulation. With that said, I do have plans to address this issue in the future.
+ */
+struct SWRateCalc {
+  __host__ SWRateCalc(ParameterMap &P);
+
+  __host__ __device__ SWRateCalc(Real *dev_sw_p, Real *dev_sw_e, Real dt, Real t_start, Real t_end)
+      : dev_sw_p_(dev_sw_p), dev_sw_e_(dev_sw_e), sw_dt_(dt), time_sw_start_(t_start), time_sw_end_(t_end)
+  {
+  }
+
+  /* Get the Starburst 99 stellar wind momentum flux per solar mass.
+   * (linearly interpolates the tabulated log10 values; gives 0 outside [time_sw_start_, time_sw_end_))
+   * @param t cluster age in kyr
+   * @return flux (in Cholla force units) per solar mass.
+   */
+  inline __device__ Real Get_Wind_Flux(Real t) const
+  {
+    if ((t < time_sw_start_) or (t >= time_sw_end_)) return 0;
+    const Real t_rel = t - time_sw_start_;  // the row index AND the interpolation offset are relative to the start
+    int index        = (int)(t_rel / sw_dt_);
+    Real log_p_dynes = (dev_sw_p_[index] + (t_rel - index * sw_dt_) * (dev_sw_p_[index + 1] - dev_sw_p_[index]) / sw_dt_);
+    return pow(10, log_p_dynes) / FORCE_UNIT / S_99_TOTAL_MASS;
+  }
+
+  /* Get the Starburst 99 stellar wind emitted power per solar mass.
+   * (linearly interpolates the tabulated log10 values; gives 0 outside [time_sw_start_, time_sw_end_))
+   * @param t cluster age in kyr
+   * @return power (in Cholla units) per solar mass.
+   */
+  inline __device__ Real Get_Wind_Power(Real t) const
+  {
+    if ((t < time_sw_start_) or (t >= time_sw_end_)) return 0;
+    const Real t_rel = t - time_sw_start_;  // the row index AND the interpolation offset are relative to the start
+    int index        = (int)(t_rel / sw_dt_);
+    Real log_e = (dev_sw_e_[index] + (t_rel - index * sw_dt_) * (dev_sw_e_[index + 1] - dev_sw_e_[index]) / sw_dt_);
+    Real e     = pow(10, log_e) / (MASS_UNIT * VELOCITY_UNIT * VELOCITY_UNIT) * TIME_UNIT / S_99_TOTAL_MASS;
+    return e;
+  }
+
+  /* Get the mass flux associated with stellar wind momentum flux and stellar wind power scaled per
+   * cluster mass. It is computed as flux^2 / (2 * power).
+   *
+   * @param flux, power the wind momentum flux and the wind power (0 is returned unless both are positive)
+   * @return mass flux in g/s per solar mass
+   */
+  static __device__ Real Get_Wind_Mass(Real flux, Real power)
+  {
+    if ((flux <= 0) or (power <= 0)) return 0;
+    return flux * flux / power / 2;
+  }
+
+  inline __device__ bool is_active(Real age) const { return (time_sw_start_ <= age) and (age <= time_sw_end_); }
+
+ private:  // attributes
+  /* device array of log base 10 momentum flux values in dynes. */
+  Real *dev_sw_p_ = nullptr;
+  /* device array of log base 10 power (erg/s) */
+  Real *dev_sw_e_ = nullptr;
+  /* time interval between table data points in kyr. */
+  Real sw_dt_ = 0.0;
+  /* cluster age when flux becomes non-negligible (kyr). */
+  Real time_sw_start_ = 0.0;
+  /* cluster age when stellar winds turn off (kyr). */
+  Real time_sw_end_ = 0.0;
+};
+
+}  // namespace feedback
+
+#endif /* FEEDBACK_RATECALC_H */
diff --git a/src/feedback/s99table.cpp b/src/feedback/s99table.cpp
new file mode 100644
index 000000000..916b994c3
--- /dev/null
+++ b/src/feedback/s99table.cpp
@@ -0,0 +1,223 @@
+#include "s99table.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <string_view>
+
+namespace
+{  // anonymous namespace
+
+// Gives a view of ``s`` that excludes its trailing ' ' characters (empty when nothing else is left)
+std::string_view rstripped_view_(std::string_view s)
+{
+  const std::size_t last_kept = s.find_last_not_of(' ');
+  // npos signals that s is empty or that s consists solely of spaces
+  if (last_kept == std::string_view::npos) return {};
+  const std::size_t kept_length = last_kept + 1;
+  return s.substr(0, kept_length);
+}
+
+// Gives a view of ``s`` that excludes the ' ' characters found at both of its ends
+std::string_view trimmed_view_(std::string_view s)
+{
+  const std::size_t first_kept = s.find_first_not_of(' ');
+  const bool nothing_to_keep   = (first_kept == std::string_view::npos);  // s is empty or only holds spaces
+  return nothing_to_keep ? std::string_view{} : rstripped_view_(s.substr(first_kept));
+}
+
+/* Reports how many consecutive ' ' characters begin the null-character-terminated string ``s``. The
+ * scan can't run past the terminating null character (since that character isn't a space) */
+std::size_t leading_space_count_(const char* s)
+{
+  const char* cursor = s;
+  while (*cursor == ' ') cursor++;
+  return static_cast<std::size_t>(cursor - s);
+}
+
+/* Splits ``line`` into tokens that are separated by runs of at least ``min_contig_space_count``
+ * ' ' characters. Only a reference to ``line`` is held (the views returned by next() point into it). */
+template <std::size_t min_contig_space_count>
+struct TokenFinder {
+  const std::string& line;
+  std::size_t next_token_start = 0;  // first index of next token
+  /* gets the next token (or an empty view once no tokens remain). Runs of spaces shorter than the
+   * template argument don't delimit tokens (relevant for parsing col names) */
+  std::string_view next()
+  {
+    std::size_t start = (next_token_start == 0) ? line.find_first_not_of(' ') : next_token_start;
+    // also true when start is npos (i.e. line is empty or only holds spaces, so it has no tokens)
+    if (start >= line.size()) return std::string_view{};
+    for (std::size_t i = start; i < line.size(); /* update i within loop */) {
+      const std::size_t space_count = leading_space_count_(line.data() + i);
+      if (space_count >= min_contig_space_count) {
+        this->next_token_start = space_count + i;
+        return {line.data() + start, i - start};
+      }
+      i += (space_count > 0) ? space_count : 1;
+    }
+    this->next_token_start = line.size();
+    return {line.data() + start, line.size() - start};
+  }
+};
+
+// Gives the description text that identifies a starburst99 table of the specified kind
+std::string descr_string_(feedback::S99TabKind kind)
+{
+  if (kind == feedback::S99TabKind::supernova) {
+    return "RESULTS FOR THE SUPERNOVA RATE";
+  }
+  if (kind == feedback::S99TabKind::stellar_wind) {
+    return "RESULTS FOR THE **STELLAR** WIND POWER AND ENERGY";
+  }
+  CHOLLA_ERROR("there is an unhandled s99 table kind");
+}
+
+/* Validates the column names parsed from a starburst99 table of the specified kind and returns the
+ * unique "full" name of each column (a parsed name only holds the part of a full name after ": ").
+ *
+ * An error is reported via CHOLLA_ERROR when the column count or any parsed name is unexpected. */
+std::vector<std::string> better_col_names(const std::vector<std::string>& parsed_names, feedback::S99TabKind kind)
+{
+  std::string kind_name;
+  std::vector<std::string> full_names{};
+  switch (kind) {
+    case feedback::S99TabKind::supernova:
+      kind_name  = "supernova";
+      full_names = {"TIME",
+                    "ALL SUPERNOVAE: TOTAL RATE",
+                    "ALL SUPERNOVAE: POWER",
+                    "ALL SUPERNOVAE: ENERGY",
+                    "TYPE IB SUPERNOVAE: TOTAL RATE",
+                    "TYPE IB SUPERNOVAE: POWER",
+                    "TYPE IB SUPERNOVAE: ENERGY",
+                    "ALL SUPERNOVAE: TYPICAL MASS",
+                    "ALL SUPERNOVAE: LOWEST PROG. MASS",
+                    "STARS + SUPERNOVAE: POWER",
+                    "STARS + SUPERNOVAE: ENERGY"};
+      break;
+    case feedback::S99TabKind::stellar_wind:
+      kind_name  = "stellar wind";
+      full_names = {"TIME",
+                    "POWER: ALL",
+                    "POWER: OB",
+                    "POWER: RSG",
+                    "POWER: LBV",
+                    "POWER: WR",
+                    "ENERGY: ALL",
+                    "MOMENTUM FLUX: ALL",
+                    "MOMENTUM FLUX: OB",
+                    "MOMENTUM FLUX: RSG",
+                    "MOMENTUM FLUX: LBV",
+                    "MOMENTUM FLUX: WR"};
+      break;
+    default:
+      CHOLLA_ERROR("there is an unhandled kind of starburst99 table");
+  }
+
+  if (full_names.size() != parsed_names.size()) {
+    CHOLLA_ERROR("A \"%s\" starburst99 table should have %zu cols, not %zu", kind_name.c_str(), full_names.size(),
+                 parsed_names.size());
+  }
+
+  for (std::size_t i = 0; i < parsed_names.size(); i++) {
+    const std::string& full_name = full_names[i];
+
+    // parsed_names should match everything after the ": " substring if its
+    // present (if the ": " substring isn't present, match the full string)
+    const std::size_t separator_pos = full_name.find(": ");
+    const std::string expected_name =
+        (separator_pos == std::string::npos) ? full_name : full_name.substr(separator_pos + 2);
+
+    if (expected_name != parsed_names[i]) {
+      CHOLLA_ERROR(
+          "column %zu is expected to be labelled %s in a \"%s\" "
+          "starburst99 table, not \"%s\"",
+          i, expected_name.c_str(), kind_name.c_str(), parsed_names[i].c_str());
+    }
+  }
+  return full_names;
+}
+
+}  // namespace
+
+/* Parse the starburst99 table of the specified kind from the file fname. Any problem (unopenable
+ * file, unexpected description or column names, malformed numbers) is reported via CHOLLA_ERROR. */
+feedback::S99Table parse_s99_table(const std::string& fname, feedback::S99TabKind kind)
+{
+  // read in the file
+  std::ifstream stream{fname};
+  if (!stream.is_open()) {
+    CHOLLA_ERROR("problem opening %s", fname.c_str());
+  }
+
+  // parse the first 6 lines of the header
+  for (int i = 0; i < 6; i++) {
+    std::string line;
+    std::getline(stream, line);
+    if (i == 3) {  // the fourth line holds the description that identifies the kind of table
+      const std::string expected_description_str = descr_string_(kind);
+      const std::string trimmed_line             = std::string(trimmed_view_(line));
+      if (trimmed_line != expected_description_str) {
+        CHOLLA_ERROR(
+            "Expected the description string: \"%s\" on the fourth "
+            "line of the table. Instead, the line reads: \"%s\"",
+            expected_description_str.c_str(), trimmed_line.c_str());
+      }
+    }
+  }
+
+  // finally, read the column names:
+  std::vector<std::string> parsed_col_names;
+  {
+    std::string line;
+    std::getline(stream, line);
+    TokenFinder<2> f{line, 0};
+
+    for (std::string_view tok = f.next(); not tok.empty(); tok = f.next()) {
+      parsed_col_names.emplace_back(tok);
+    }
+  }
+
+  // confirm that the column names are consistent with our expectations (for
+  // the specified table-kind) and retrieve better names for the columns
+  // - these improved column names reflect the fact that file is structured to
+  //   have nested columns. The upper level is tricky to parse so we just parse
+  //   the bottom layer. As a consequence, our parsed_col_names doesn't contain
+  //   unique vals. The better names are unique
+  std::vector<std::string> col_names = better_col_names(parsed_col_names, kind);
+  const std::size_t ncols            = col_names.size();
+
+  // finally, read the actual data:
+  std::vector<double> data = {};
+  while (stream.good()) {
+    std::string line;
+    std::getline(stream, line);
+    if (trimmed_view_(line).size() == 0) break;
+
+    TokenFinder<2> f{line, 0};
+
+    for (std::string_view tok = f.next(); not tok.empty(); tok = f.next()) {
+      char* ptr_end{};
+      errno = 0;
+      // note the string of characters in tok is guaranteed to be followed
+      // (eventually) by a null character since tok is a view of line.data()
+      double val = std::strtod(tok.data(), &ptr_end);
+
+      if ((errno != 0) or (static_cast<std::size_t>(ptr_end - tok.data()) != tok.size())) {
+        CHOLLA_ERROR("error parsing floating-point val in %s on line: \"%s\"", fname.c_str(), line.c_str());
+      }
+      data.push_back(val);
+    }
+
+    // sanity check:
+    if ((data.size() % ncols) != 0) CHOLLA_ERROR("parsed wrong # of cols");
+  }
+
+  if (data.size() == 0) {
+    CHOLLA_ERROR("No data parsed from %s. Is the table empty?", fname.c_str());
+  }
+  const std::size_t nrows = data.size() / ncols;
+
+  return {std::move(col_names), std::move(data), ncols, nrows};
+}
diff --git a/src/feedback/s99table.h b/src/feedback/s99table.h
new file mode 100644
index 000000000..c3487e0a7
--- /dev/null
+++ b/src/feedback/s99table.h
@@ -0,0 +1,77 @@
+#ifndef S99TABLE_H
+#define S99TABLE_H
+
+#include <string>
+#include <utility>  // std::move
+#include <vector>
+
+#include "../utils/error_handling.h"
+
+namespace feedback
+{
+
+/* kinds of starburst99 datatables used by cholla (the kind is specified when parsing a table) */
+enum class S99TabKind { supernova, stellar_wind };
+
+/* Class that represents a parsed starburst99 table. This is a little over the top, but it's
+ * probably fine. We just use this when reading the data out of the table
+ */
+class S99Table
+{
+ public:
+  S99Table(std::vector<std::string> col_names, std::vector<double> data, std::size_t ncols, std::size_t nrows)
+      : col_names_(std::move(col_names)), data_(std::move(data)), ncols_(ncols), nrows_(nrows)
+  {
+  }
+
+  /* the table's row count */
+  std::size_t nrows() const noexcept { return nrows_; }
+
+  /* the table's column count */
+  std::size_t ncols() const noexcept { return ncols_; }
+
+  /* look up the entry held in the specified column and row (invalid indices produce an error) */
+  double operator()(std::size_t col_ind, std::size_t row_ind) const noexcept
+  {
+    const bool in_bounds = (col_ind < ncols_) and (row_ind < nrows_);
+    if (not in_bounds) {
+      CHOLLA_ERROR(
+          "invalid index col_ind, %zu, must be less than %zu and row_ind, %zu, must be "
+          "less than %zu",
+          col_ind, ncols_, row_ind, nrows_);
+    }
+    return data_[(row_ind * ncols_) + col_ind];  // the entries of a given row are adjacent in data_
+  }
+
+  /* look up the name of the specified column (an empty string is returned for an invalid index) */
+  std::string col_name(std::size_t col_ind) const noexcept
+  {
+    const bool known_column = (col_ind < ncols_);
+    return known_column ? col_names_[col_ind] : std::string("");
+  }
+
+  /* look up the index of the column with the specified name (an unknown name produces an error) */
+  std::size_t col_index(const std::string& col_name) const noexcept
+  {
+    std::size_t match = 0;
+    while ((match < ncols_) and (col_names_[match] != col_name)) match++;
+    if (match < ncols_) return match;
+    CHOLLA_ERROR("the table doesn't hold a column called: \"%s\"", col_name.c_str());
+  }
+
+ private:  // attributes
+  std::vector<std::string> col_names_;
+  std::vector<double> data_;
+  std::size_t ncols_;
+  std::size_t nrows_;
+};
+
+}  // namespace feedback
+
+/* Parse the Starburst99 table of the specified kind that is stored in the file called fname.
+ *
+ * TODO: put this back into the feedback namespace
+ */
+feedback::S99Table parse_s99_table(const std::string& fname, feedback::S99TabKind kind);
+
+#endif /* S99TABLE_H */
\ No newline at end of file
diff --git a/src/feedback/stencil.h b/src/feedback/stencil.h
new file mode 100644
index 000000000..a9df730a5
--- /dev/null
+++ b/src/feedback/stencil.h
@@ -0,0 +1,937 @@
+// this file defines stencils that are used within a prescription
+//
+// the goal here is to be somewhat modular to make it easy to support multiple versions
+// of a prescription
+
+#pragma once
+
+#include <cstdint>
+
+#include "../feedback/feedback.h"
+#include "../global/global.h"
+#include "../utils/basic_structs.h"
+#include "../utils/math_utilities.h"  // math_utils::clamp
+
+enum struct StencilEvalKind {  // selects what a stencil-evaluation routine passes to its callback
+  enclosed_stencil_vol_frac, /*!< compute the fraction of the total stencil volume enclosed by each cell */
+  enclosed_cell_vol_frac,    /*!< compute the fraction of each cell's volume that is enclosed by the stencil */
+  for_each_overlap_zone      /*!< execute function at each cell with any non-zero amount of overlap (but
+                              *!< without specifying the amount of overlap)*/
+};
+
+// maybe this should be called feedback_stencil
+namespace fb_stencil
+{
+
+/* Shared helper for the stencils: computes the location nearest to ``pos_indU`` that a stencil's center
+ * can be shifted to so that the stencil avoids overlapping with the ghost zone. When the specified
+ * location already avoids overlap with the ghost zone, that location is the returned value.
+ *
+ * \param min_stencil_offset The minimum distance a stencil must be from a cell-edge such that the stencil does not
+ * extend past the edge.
+ *
+ * \note
+ * It's okay for this to be a static function and live in a header since this header should only be included in a
+ * single source file (2 if running unit-tests).
+ */
+static inline __device__ hydro_utilities::VectorXYZ<Real> nearest_noGhostOverlap_pos_(
+    Real min_stencil_offset, hydro_utilities::VectorXYZ<Real> pos_indU, int ng_x, int ng_y, int ng_z, int n_ghost)
+{
+  const Real closest_approach = n_ghost + min_stencil_offset;  // measured from either end of an axis
+  const Real shifted_x        = math_utils::clamp(pos_indU[0], closest_approach, ng_x - closest_approach);
+  const Real shifted_y        = math_utils::clamp(pos_indU[1], closest_approach, ng_y - closest_approach);
+  const Real shifted_z        = math_utils::clamp(pos_indU[2], closest_approach, ng_z - closest_approach);
+  return {shifted_x, shifted_y, shifted_z};
+}
+
+/* Represents the stencil for cloud-in-cell deposition */
+struct CIC {
+  /* along any axis, gives the max number of neighboring cells that may be enclosed by the stencil,
+   * that are on one side of the cell containing the stencil's center.
+   *
+   * \note
+   * this primarily exists for testing purposes
+   */
+  inline static constexpr int max_enclosed_neighbors = 1;
+
+  /* execute f at each location included in the stencil centered at ``pos_indU``.
+   *
+   * The function should expect 2 arguments:
+   *   1. ``stencil_enclosed_frac``: the fraction of the stencil enclosed by the cell
+   *   2. ``indx3x``: the index used to index a 3D array (that has ghost zones)
+   */
+  template <typename Function>
+  static __device__ void for_each(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g, Function f)
+  {
+    // Step 1: along each axis, find the integer-index of the leftmost cell covered by the stencil.
+    //  - Take the cell that contains the stencil-center. When the stencil-center sits anywhere left of
+    //    that cell's center, the stencil overlaps with that cell and its neighbor on the left
+    //  - otherwise, the stencil overlaps with that cell and its neighbor on the right
+    const int ix = int(pos_indU[0] - 0.5);
+    const int iy = int(pos_indU[1] - 0.5);
+    const int iz = int(pos_indU[2] - 0.5);
+
+    // Step 2: along each axis, get the distance from the center of the leftmost cell to the stencil-center
+    //  - an integer index, ``indx``, gives the position of a cell's left edge (the cell's reference
+    //    point). Thus, the center of the cell specified by ``indx`` is located at ``indx+0.5``
+    const Real delta_x = pos_indU[0] - (ix + 0.5);
+    const Real delta_y = pos_indU[1] - (iy + 0.5);
+    const Real delta_z = pos_indU[2] - (iz + 0.5);
+
+    // along each axis, the leftmost cell is weighted by (1 - delta) and its right neighbor by delta
+    const Real rem_x = 1 - delta_x;
+    const Real rem_y = 1 - delta_y;
+    const Real rem_z = 1 - delta_z;
+
+    // converts offsets (i, j, k) from the leftmost cell into a 1d index (for a field with ghost zones)
+    auto to_idx3D = [=](int i, int j, int k) -> int { return (ix + i) + nx_g * ((iy + j) + ny_g * (iz + k)); };
+
+    // Step 3: invoke f at every cell overlapping the stencil, passing the fraction of the total stencil
+    // volume enclosed by the cell and the cell's 1d index.
+    // note: it's not exactly clear to me how we go from delta_x,delta_y,delta_z to volume-frac, (I just
+    //       refactored the code I inherited and get consistent and sensible results)
+    f(rem_x * rem_y * rem_z, to_idx3D(0, 0, 0));        // (i=0, j = 0, k = 0)
+    f(rem_x * rem_y * delta_z, to_idx3D(0, 0, 1));      // (i=0, j = 0, k = 1)
+    f(rem_x * delta_y * rem_z, to_idx3D(0, 1, 0));      // (i=0, j = 1, k = 0)
+    f(rem_x * delta_y * delta_z, to_idx3D(0, 1, 1));    // (i=0, j = 1, k = 1)
+    f(delta_x * rem_y * rem_z, to_idx3D(1, 0, 0));      // (i=1, j = 0, k = 0)
+    f(delta_x * rem_y * delta_z, to_idx3D(1, 0, 1));    // (i=1, j = 0, k = 1)
+    f(delta_x * delta_y * rem_z, to_idx3D(1, 1, 0));    // (i=1, j = 1, k = 0)
+    f(delta_x * delta_y * delta_z, to_idx3D(1, 1, 1));  // (i=1, j = 1, k = 1)
+  }
+
+  /* identical to for_each (provided for compatibility with interfaces of other stencils). */
+  template <typename Function>
+  static __device__ void for_each_enclosedCellVol(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g,
+                                                  Function f)
+  {
+    for_each(pos_indU, nx_g, ny_g, f);
+  }
+
+  /* calls the unary function f at every location where there probably is non-zero overlap with
+   * the stencil.
+   *
+   * \note
+   * This is intended to be conservative (it's okay for this to call the function on a cell with
+   * zero overlap). The reason this exists (rather than just calling for_each), is that it
+   * may be significantly cheaper for some stencils
+   */
+  template <typename UnaryFunction>
+  static __device__ void for_each_overlap_zone(hydro_utilities::VectorXYZ<Real> pos_indU, int ng_x, int ng_y,
+                                               UnaryFunction f)
+  {
+    // crude approach: run the full calculation and then discard the computed fraction
+    for_each(pos_indU, ng_x, ng_y, [f](double enclosed_frac, int cell_idx) {
+      if (enclosed_frac > 0) f(cell_idx);
+    });
+  }
+
+  /* returns the nearest location to ``pos_indU`` that the stencil's center can be shifted to
+   * in order to avoid overlapping with the ghost zone.
+   *
+   * If the specified location already does not overlap with the ghost zone, that is the returned
+   * value.
+   */
+  static __device__ hydro_utilities::VectorXYZ<Real> nearest_noGhostOverlap_pos(
+      hydro_utilities::VectorXYZ<Real> pos_indU, int ng_x, int ng_y, int ng_z, int n_ghost)
+  {
+    // the 0.5 is the min_stencil_offset of this stencil (see nearest_noGhostOverlap_pos_)
+    return nearest_noGhostOverlap_pos_(0.5, pos_indU, ng_x, ng_y, ng_z, n_ghost);
+  }
+};
+
+/* implements a stencil for depositing scalar quantities into a cube-shaped region that is centered
+ * on `pos_indU` and has a length of 2 cell-widths along each direction (a volume of 8 cell-volumes).
+ * `flavor` selects what gets passed to `f` at each of the 3x3x3 cells visited around `pos_indU`.
+ */
+template <typename Function, StencilEvalKind flavor>
+static __device__ void for_each_cic27_(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g, Function f)
+{
+  // this visits a 3x3x3 cells region
+
+  int leftmost_indx_x = int(pos_indU[0]) - 1;
+  int leftmost_indx_y = int(pos_indU[1]) - 1;
+  int leftmost_indx_z = int(pos_indU[2]) - 1;
+
+  // compute the distance between the left edge of the cell containing pos_indU and pos_indU
+  Real offset_x = pos_indU[0] - int(pos_indU[0]);
+  Real offset_y = pos_indU[1] - int(pos_indU[1]);
+  Real offset_z = pos_indU[2] - int(pos_indU[2]);
+
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 3; j++) {
+      for (int k = 0; k < 3; k++) {
+        const int ind3D = (leftmost_indx_x + i) + nx_g * ((leftmost_indx_y + j) + ny_g * (leftmost_indx_z + k));
+
+        // compute the length (in units of cellwidths) of the stencil that overlaps with the current
+        // cell along each axis. Along the x-axis this is given by
+        //       1 - offset_x: when i == 0
+        //                  1: when i == 1
+        //           offset_x: when i == 2
+        Real x_len = (i < 2) + (i - 1) * offset_x;
+        Real y_len = (j < 2) + (j - 1) * offset_y;
+        Real z_len = (k < 2) + (k - 1) * offset_z;
+
+        // The volume enclosed by the current cell is
+        Real volEnclosed = x_len * y_len * z_len;
+
+        if constexpr (flavor == StencilEvalKind::enclosed_stencil_vol_frac) {
+          // the fraction of the stencil that is enclosed is volEnclosed/volStencil and volStencil is
+          // (2 cell_widths)^3 = 8 cell_widths^3. The lengths sum to 2 per axis, so the fractions sum to 1
+          f(0.125 * volEnclosed, ind3D);
+        } else if constexpr (flavor == StencilEvalKind::enclosed_cell_vol_frac) {
+          f(volEnclosed, ind3D);
+        } else if constexpr (flavor == StencilEvalKind::for_each_overlap_zone) {
+          if (volEnclosed > 0) f(ind3D);
+        }
+      }
+    }
+  }
+}
+
+// Define the legacy stencil previously used for feedback with momentum deposition
+//
+// I don't totally understand the logic (other than the fact that it uses 27-Cell CIC stencil).
+// It has some quirks. Including the fact that the amount of scalar deposition is directly related
+// to the magnitude of the vector.
+
+/** the prescription for dividing a scalar quantity between 3x3x3 cells is done
+   by imagining a 2x2x2 cell volume around the SN.  These fractions, then,
+   represent the linear extent of this volume into the cell in question. For i=0
+   this evaluates to 1*1/2. For i=-1 this evaluates to (1-dx)*1/2. For i=+1 this
+   evaluates to dx*1/2. In the above, the 1/2 factor normalizes over 2
+   cells/direction (so the values for i=-1,0,+1 sum to 1).
+  */
+static inline __device__ Real Frac(int i, Real dx) { return (-0.5 * i * i - 0.5 * i + 1 + i * dx) * 0.5; }
+
+static inline __device__ Real D_Frac(int i, Real dx)
+{
+  // This is a piecewise function. Evaluating the expression below for i = -1, 0, 1 gives:
+  //    (i == -1, dx <= 0.5): -1.0
+  //    (i == -1, dx  > 0.5): 2*dx - 2
+  //    (i ==  0,    any dx): 1 - 2*dx
+  //    (i ==  1, dx <= 0.5): 2*dx
+  //    (i ==  1, dx  > 0.5): 1.0
+  //  - in other words, for i == 1 the result is: 2*dx + (1 - 2*dx)*(dx > 0.5)
+  return (dx > 0.5) * i * (1 - 2 * dx) + ((i + 1) * dx + 0.5 * (i - 1)) - 3 * (i - 1) * (i + 1) * (0.5 - dx);
+}
+
+struct LegacyCIC27 {
+  /* along any axis, gives the max number of neighboring cells that may be enclosed by the stencil,
+   * that are on one side of the cell containing the stencil's center.
+   *
+   * \note
+   * this primarily exists for testing purposes!
+   */
+  inline static constexpr int max_enclosed_neighbors = 1;
+
+  /* execute f at each location included in the stencil centered at ``pos_indU``.
+   * (this forwards to for_each_cic27_ with StencilEvalKind::enclosed_stencil_vol_frac)
+   * The function should expect 2 arguments:
+   *   1. ``stencil_enclosed_frac``: the fraction of the stencil enclosed by the cell
+   *   2. ``indx3x``: the index used to index a 3D array (that has ghost zones)
+   */
+  template <typename Function>
+  static __device__ void for_each(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g, Function &&f)
+  {
+    for_each_cic27_<Function, StencilEvalKind::enclosed_stencil_vol_frac>(pos_indU, nx_g, ny_g,
+                                                                          std::forward<Function>(f));
+  }
+
+  /* execute ``f`` at each location included in the stencil centered at ``pos_indU``.
+   *
+   * This is just like for_each, except that it passes the fraction of the cell-volume that is enclosed
+   * by the stencil to ``f`` (instead of passing fraction of the stencil-volume enclosed by the cell).
+   *
+   * \note
+   * This is primarily intended for testing purposes.
+   */
+  template <typename Function>
+  static __device__ void for_each_enclosedCellVol(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g,
+                                                  Function &&f)
+  {
+    for_each_cic27_<Function, StencilEvalKind::enclosed_cell_vol_frac>(pos_indU, nx_g, ny_g, std::forward<Function>(f));
+  }
+
+  /* execute f at each location included in the stencil centered at ``pos_indU``.
+   *
+   * The function should expect 3 arguments (it's not totally clear what the first 2 arguments truly "mean",
+   * but they are used similarly to the corresponding arguments passed by other kernels' for_each_vecflavor):
+   *   1. ``scalar_weight``: multiply this by the scalar to determine how much scalar to inject
+   *   2. ``momentum_weights``: multiply each component by a momentum to get the amount of momentum to inject.
+   *   3. ``indx3x``: the index used to index a 3D array (that has ghost zones)
+   */
+  template <typename Function>
+  static __device__ void for_each_vecflavor(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g, Function f)
+  {
+    const Real pos_x_indU = pos_indU[0];
+    const Real pos_y_indU = pos_indU[1];
+    const Real pos_z_indU = pos_indU[2];
+
+    int indx_x = (int)floor(pos_x_indU);
+    int indx_y = (int)floor(pos_y_indU);
+    int indx_z = (int)floor(pos_z_indU);
+
+    Real delta_x = pos_x_indU - indx_x;
+    Real delta_y = pos_y_indU - indx_y;
+    Real delta_z = pos_z_indU - indx_z;
+
+    // first pass: loop over the 27 cells to add up all the allocated feedback
+    // momentum magnitudes.  For each cell allocate density and
+    // energy based on the ratio of allocated momentum to this overall sum.
+    Real mag = 0;
+    for (int i = -1; i < 2; i++) {
+      for (int j = -1; j < 2; j++) {
+        for (int k = -1; k < 2; k++) {
+          Real x_frac = D_Frac(i, delta_x) * Frac(j, delta_y) * Frac(k, delta_z);
+          Real y_frac = Frac(i, delta_x) * D_Frac(j, delta_y) * Frac(k, delta_z);
+          Real z_frac = Frac(i, delta_x) * Frac(j, delta_y) * D_Frac(k, delta_z);
+
+          mag += sqrt(x_frac * x_frac + y_frac * y_frac + z_frac * z_frac);
+        }
+      }
+    }
+
+    Real inv_mag = 1.0 / mag;
+
+    for (int i = -1; i < 2; i++) {
+      for (int j = -1; j < 2; j++) {
+        for (int k = -1; k < 2; k++) {
+          // index in array of conserved quantities
+          int indx = (indx_x + i) + (indx_y + j) * nx_g + (indx_z + k) * nx_g * ny_g;
+
+          Real x_frac        = D_Frac(i, delta_x) * Frac(j, delta_y) * Frac(k, delta_z);
+          Real y_frac        = Frac(i, delta_x) * D_Frac(j, delta_y) * Frac(k, delta_z);
+          Real z_frac        = Frac(i, delta_x) * Frac(j, delta_y) * D_Frac(k, delta_z);
+          Real scalar_weight = sqrt(x_frac * x_frac + y_frac * y_frac + z_frac * z_frac) * inv_mag;
+          hydro_utilities::VectorXYZ<Real> momentum_weights{x_frac * inv_mag, y_frac * inv_mag, z_frac * inv_mag};
+
+          f(scalar_weight, momentum_weights, indx);
+
+        }  // k loop
+      }    // j loop
+    }      // i loop
+  }
+
+  /* Calls the unary function ``f`` at every location where there probably is non-zero overlap with
+   * the stencil. ``f`` is forwarded to the shared 27-cell implementation.
+   *
+   * \note
+   * This is intended to be conservative (presumably, it is acceptable for ``f`` to be called on a cell
+   * that turns out to have no overlap). The reason this exists (rather than just calling for_each) is
+   * that it may be significantly cheaper for some stencils.
+   */
+  template <typename UnaryFunction>
+  static __device__ void for_each_overlap_zone(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g,
+                                               UnaryFunction f)
+  {
+    constexpr StencilEvalKind eval_kind = StencilEvalKind::for_each_overlap_zone;
+    for_each_cic27_<UnaryFunction, eval_kind>(pos_indU, nx_g, ny_g, std::forward<UnaryFunction>(f));
+  }
+
+  /* Returns the location nearest to ``pos_indU`` that the stencil's center can be shifted to so that
+   * the stencil avoids overlapping with the ghost zone.
+   *
+   * When the specified location already does not overlap with the ghost zone, that location is the
+   * returned value.
+   *
+   * All of the work is delegated to ``nearest_noGhostOverlap_pos_``; this function just supplies the
+   * minimum offset of the stencil's center.
+   */
+  static __device__ hydro_utilities::VectorXYZ<Real> nearest_noGhostOverlap_pos(
+      hydro_utilities::VectorXYZ<Real> pos_indU, int ng_x, int ng_y, int ng_z, int n_ghost)
+  {
+    // I think a value of 1.0 is right, but I'm a little fuzzy on the precise value
+    const Real min_offset = 1.0;
+    return nearest_noGhostOverlap_pos_(min_offset, pos_indU, ng_x, ng_y, ng_z, n_ghost);
+  }
+};
+
+/* Represents a sphere. This is used to help implement stencils. */
+struct SphereObj {
+  // attributes
+  double center_indU[3]; /*!< center of the sphere (in index units). An integer value corresponds to a cell-edge.
+                          *!< Integer-values plus 0.5 correspond to cell-centers*/
+  int raidus2_indU;      /*!< squared radius of the sphere (in units of squared cell-widths). The misspelled
+                          *!< name is retained so that existing users of this member keep compiling */
+
+  // interface
+
+  /* queries whether the sphere encloses a given point (a point lying exactly on the surface is NOT enclosed) */
+  __forceinline__ __device__ bool encloses_point(double pos_x_indU, double pos_y_indU, double pos_z_indU) const
+  {
+    double delta_x = pos_x_indU - center_indU[0];
+    double delta_y = pos_y_indU - center_indU[1];
+    double delta_z = pos_z_indU - center_indU[2];
+
+    return (delta_x * delta_x + delta_y * delta_y + delta_z * delta_z) < raidus2_indU;
+  }
+
+  /* queries whether the sphere encloses any of the super-sampled points within the cell that corresponds to
+   * integer indices of (cell_idx_x, cell_idx_y, cell_idx_z).
+   *
+   * The result is intended to agree with ``Count_Super_Samples<Log2DivsionsPerAx>(...) > 0`` (up to floating-point
+   * round-off when the center is equidistant from 2 neighboring super-samples), but it is much cheaper to
+   * evaluate. Since the squared distance is a sum of independent per-axis terms, the super-sample nearest to
+   * the sphere's center is found by picking the nearest super-sample coordinate independently along each axis.
+   *
+   *  \tparam Log2DivsionsPerAx parameterizes the amount of super-sampling. There are ``2^Log2DivsionsPerAx``
+   *       equidistant points along each axis of the cell.
+   */
+  template <int Log2DivsionsPerAx>
+  __device__ bool Encloses_Any_Supersample(int cell_idx_x, int cell_idx_y, int cell_idx_z) const
+  {
+    static_assert((0 <= Log2DivsionsPerAx) and ((Log2DivsionsPerAx * 3) <= (8 * sizeof(unsigned int))),
+                  "Log2DivsionsPerAx must be a non-negative integer AND 2^(Log2DivsionsPerAx*3), the total "
+                  "number of super-samples in a given cell, must be representable by an unsigned int");
+
+    // basic information about the super-sampling grid (all of this is evaluated at compile-time)
+    // - a bit-shift is used (rather than std::pow) so that the number of subdivisions is exact
+    constexpr int num_subdivisions_per_ax    = 1 << Log2DivsionsPerAx;
+    constexpr double subgrid_width           = 1.0 / num_subdivisions_per_ax;
+    constexpr double leftmost_subgrid_offset = 0.5 * subgrid_width;
+
+    const int cell_idx[3] = {cell_idx_x, cell_idx_y, cell_idx_z};
+
+    // accumulate the squared distance between the sphere's center and the nearest super-sample.
+    // -> it is NOT sufficient to only consider the leftmost & rightmost super-samples along each axis. When
+    //    the center lies between them, an interior super-sample is closer (only checking the outermost
+    //    super-samples would overestimate the distance and could miss cells holding enclosed super-samples)
+    double min_squared_dist = 0.0;
+    for (int ax = 0; ax < 3; ax++) {
+      // (fractional) index of the super-sample that would coincide with the center along this axis
+      const double frac_sample_idx = (center_indU[ax] - (cell_idx[ax] + leftmost_subgrid_offset)) / subgrid_width;
+
+      // clamp to the range of valid indices and then round to the nearest integer (since the clamped value
+      // is non-negative, adding 0.5 and truncating is equivalent to rounding)
+      const double clamped_idx = fmin(fmax(frac_sample_idx, 0.0), double(num_subdivisions_per_ax - 1));
+      const int nearest_idx    = int(clamped_idx + 0.5);
+
+      // the super-sample's position is computed in exactly the same way as in Count_Super_Samples in order
+      // to have consistent rounding
+      const double sample_pos = cell_idx[ax] + (leftmost_subgrid_offset + nearest_idx * subgrid_width);
+      const double delta      = sample_pos - center_indU[ax];
+      min_squared_dist += delta * delta;
+    }
+
+    return min_squared_dist < raidus2_indU;
+  }
+
+  /* returns the count of the number of super-sampled points, within the cell that corresponds to integer
+   * indices of (cell_idx_x, cell_idx_y, cell_idx_z), that are enclosed by the sphere.
+   *
+   *  \tparam Log2DivsionsPerAx parameterizes the amount of super-sampling. The super-sampling algorithm checks
+   *       ``2^Log2DivsionsPerAx`` equidistant points along each axis of the cell. In other words, this
+   *       can return a max value of ``2^(Log2DivsionsPerAx*3)``.
+   *
+   * \note
+   * In the context of this function, integer indices specify the left edge of a cell. An integer index + 0.5
+   * specifies the center of a cell.
+   *
+   * \note
+   * None of the super-samples are placed on the edges of the cells.
+   */
+  template <int Log2DivsionsPerAx>
+  __device__ unsigned int Count_Super_Samples(int cell_idx_x, int cell_idx_y, int cell_idx_z) const
+  {
+    static_assert((0 <= Log2DivsionsPerAx) and ((Log2DivsionsPerAx * 3) <= (8 * sizeof(unsigned int))),
+                  "Log2DivsionsPerAx must be a non-negative integer AND 2^(Log2DivsionsPerAx*3), the total "
+                  "number of super-samples in a given cell, must be representable by an unsigned int");
+
+    // basic information about the super-sampling grid (all of this is evaluated at compile-time)
+    // - a bit-shift is used (rather than std::pow) so that the number of subdivisions is exact
+    constexpr int num_subdivisions_per_ax    = 1 << Log2DivsionsPerAx;
+    constexpr double subgrid_width           = 1.0 / num_subdivisions_per_ax;
+    constexpr double leftmost_subgrid_offset = 0.5 * subgrid_width;
+
+    unsigned int count = 0;
+    for (int ix = 0; ix < num_subdivisions_per_ax; ix++) {
+      for (int iy = 0; iy < num_subdivisions_per_ax; iy++) {
+        for (int iz = 0; iz < num_subdivisions_per_ax; iz++) {
+          // since cell_idx_x, cell_idx_y, cell_idx_z are all integers, they specify
+          // the position of the left edge of the cell
+          double x = cell_idx_x + (leftmost_subgrid_offset + ix * subgrid_width);
+          double y = cell_idx_y + (leftmost_subgrid_offset + iy * subgrid_width);
+          double z = cell_idx_z + (leftmost_subgrid_offset + iz * subgrid_width);
+
+          count += encloses_point(x, y, z);
+        }
+      }
+    }
+
+    return count;
+  }
+
+  /* Estimates the volume integral over the overlapping region of a cell of the radial unit-vector measured from
+   * ``ref_pos_IndU``. Specifically, the cell corresponds to integer indices of (cell_idx_x, cell_idx_y, cell_idx_z)
+   * and the integral makes use of super-sampling.
+   *
+   * The result has units of subcell-volume. To convert it to units of cell-volume multiply by
+   * `pow(2,-3*Log2DivsionsPerAx)`
+   *
+   * In more detail, The evaluated integral looks like:
+   * \f[
+   *   \hat{x}\int (\hat{x} \cdot\hat{r})\  dV_{\rm cell} + \hat{y}\int (\hat{y} \cdot\hat{r})\  dV_{\rm cell} +
+   *   \hat{z}\int (\hat{z} \cdot\hat{r})\  dV_{\rm cell}
+   * \f]
+   * Where the bounds of the integral are understood to only include the portion of the specified cell that
+   * overlaps with the sphere. Additionally, \f$ \hat{r} \f$ is the radial unit vector measured after transforming
+   * the coordinate-system so that the origin coincides with the ``ref_pos_IndU`` argument.
+   *
+   * The calculation makes 2 assumptions:
+   *   1. We assume that subcells are either entirely enclosed by the sphere or aren't enclosed at all
+   *   2. Throughout a given subcell, \f$ \hat{r} \f$ is constant; it's equal to the value at the center of the subcell.
+   *      (Note: it would be possible to avoid this assumption. There is an exact analytic solution, it's just
+   *      very involved).
+   *
+   * Under these assumptions the evaluated integral becomes:
+   * \f[
+   *   V_{\rm subcell}\sum_{ijk}^{\rm subcells} \frac{W_{ijk} (x_i \hat{x} + y_j \hat{y} + z_k\hat{z})}{r_{ijk}}
+   * \f]
+   * where the subscripted variables are computed at the center of each subcell (in the translated
+   * coordinate-system). \f$ W_{ijk} \f$ has a value of 1 for subcells whose centers lie within the sphere and
+   * are zero in other cases. A subcell whose center exactly coincides with ``ref_pos_IndU`` contributes nothing.
+   *
+   *  \tparam Log2DivsionsPerAx parameterizes the amount of super-sampling. The super-sampling algorithm checks
+   *       ``2^Log2DivsionsPerAx`` equidistant points along each axis of the cell (i.e. there are
+   *       ``2^(Log2DivsionsPerAx*3)`` subcells per cell).
+   *
+   * \note
+   * In the context of this function, integer indices specify the left edge of a cell. An integer index + 0.5
+   * specifies the center of a cell.
+   *
+   * \note
+   * None of the super-samples are placed on the edges of the cells.
+   */
+  template <int Log2DivsionsPerAx>
+  __device__ hydro_utilities::VectorXYZ<Real> Super_Sampled_RadialUnitVec_VolIntegral(
+      int cell_idx_x, int cell_idx_y, int cell_idx_z, const hydro_utilities::VectorXYZ<Real> ref_pos_IndU) const
+  {
+    static_assert((0 <= Log2DivsionsPerAx) and ((Log2DivsionsPerAx * 3) <= (8 * sizeof(unsigned int))),
+                  "Log2DivsionsPerAx must be a non-negative integer AND 2^(Log2DivsionsPerAx*3), the total "
+                  "number of super-samples in a given cell, must be representable by an unsigned int");
+
+    // basic information about the super-sampling grid (all of this is evaluated at compile-time)
+    // - a bit-shift is used (rather than std::pow) so that the number of subdivisions is exact
+    constexpr int num_subdivisions_per_ax    = 1 << Log2DivsionsPerAx;
+    constexpr double subgrid_width           = 1.0 / num_subdivisions_per_ax;
+    constexpr double leftmost_subgrid_offset = 0.5 * subgrid_width;
+
+    hydro_utilities::VectorXYZ<Real> out{0.0, 0.0, 0.0};
+
+    for (int ix = 0; ix < num_subdivisions_per_ax; ix++) {
+      for (int iy = 0; iy < num_subdivisions_per_ax; iy++) {
+        for (int iz = 0; iz < num_subdivisions_per_ax; iz++) {
+          // since cell_idx_x, cell_idx_y, cell_idx_z are all integers, they specify
+          // the position of the left edge of the cell
+
+          // compute the center of the subcell
+          const double orig_frame_x = cell_idx_x + (leftmost_subgrid_offset + ix * subgrid_width);
+          const double orig_frame_y = cell_idx_y + (leftmost_subgrid_offset + iy * subgrid_width);
+          const double orig_frame_z = cell_idx_z + (leftmost_subgrid_offset + iz * subgrid_width);
+
+          const bool subcell_enclosed_by_sphere = encloses_point(orig_frame_x, orig_frame_y, orig_frame_z);
+
+          // compute the x, y, and z components in the coordinate system that has been translated so that
+          // ref_pos_IndU coincides with the origin
+          const double x = orig_frame_x - ref_pos_IndU[0];
+          const double y = orig_frame_y - ref_pos_IndU[1];
+          const double z = orig_frame_z - ref_pos_IndU[2];
+
+          const bool coincides_with_origin = ((x == 0.0) && (y == 0.0) && (z == 0.0));
+
+          // for r = sqrt((x*x) + (y*y) + (z*z)), we need to compute x/r, y/r, z/r.
+          // - we add coincides_with_origin here to make sure we don't divide by zero if (x,y,z) = (0,0,0)
+          const double inv_r_mag = 1.0 / (coincides_with_origin + sqrt((x * x) + (y * y) + (z * z)));
+
+          // multiplying by the bool (0 or 1) skips the contributions from subcells outside of the sphere
+          // without introducing a branch
+          out[0] += subcell_enclosed_by_sphere * x * inv_r_mag;
+          out[1] += subcell_enclosed_by_sphere * y * inv_r_mag;
+          out[2] += subcell_enclosed_by_sphere * z * inv_r_mag;
+        }
+      }
+    }
+
+    return out;
+  }
+};
+
+/* Implements the logic shared by the stencils that deposit quantities into a sphere centered on ``pos_indU``.
+ *
+ * At present, the sphere always has a radius of 1 cell-width (``CellsPerDiameter`` must be 2). The function
+ * loops over the 3x3x3 block of cells that the sphere may overlap with, and invokes ``f`` in a manner that
+ * depends on ``flavor``:
+ *   - ``StencilEvalKind::enclosed_stencil_vol_frac``: calls ``f(frac, ind3D)`` for every cell in the block,
+ *     where ``frac`` is the number of super-samples of the cell enclosed by the sphere divided by the total
+ *     number of enclosed super-samples (summed over the whole block)
+ *   - ``StencilEvalKind::enclosed_cell_vol_frac``: calls ``f(frac, ind3D)`` for every cell in the block, where
+ *     ``frac`` is the fraction of the cell's super-samples that are enclosed by the sphere
+ *   - ``StencilEvalKind::for_each_overlap_zone``: calls ``f(ind3D)`` for each cell in the block where
+ *     ``SphereObj::Encloses_Any_Supersample`` reports overlap
+ * In each case, ``ind3D`` is the 1D index ``indx_x + nx_g * (indx_y + ny_g * indx_z)``.
+ *
+ * \tparam Function the type of the callback ``f``
+ * \tparam flavor selects the arguments passed to ``f`` (see above)
+ * \tparam CellsPerDiameter diameter of the sphere in cell-widths (only a value of 2 is currently supported)
+ * \tparam Log2DivsionsPerAx_PerCell parameterizes the amount of super-sampling; there are
+ *    ``2^Log2DivsionsPerAx_PerCell`` super-samples along each axis of a cell.
+ */
+template <typename Function, StencilEvalKind flavor, int CellsPerDiameter, int Log2DivsionsPerAx_PerCell>
+static __forceinline__ __device__ void for_each_sphere_(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g,
+                                                        Function f)
+{
+  const SphereObj sphere{/* center = */ {pos_indU[0], pos_indU[1], pos_indU[2]},
+                         /* squared_radius = */ 1 * 1};
+  // we are intentionally using integer division to speed up the next line
+  // NOLINTNEXTLINE(bugprone-integer-division)
+  const Real l_offset = ((CellsPerDiameter % 2) == 0) ? CellsPerDiameter / 2 : 0.5 * CellsPerDiameter;
+
+  // Step 1: along each axis, identify the integer-index of the leftmost cell covered by the stencil.
+  // NOTE(review): int(...) truncates toward zero, so this only matches floor for non-negative values
+  //   (presumably that's guaranteed by the presence of ghost zones -- confirm)
+  const int leftmost_indx_x = int(pos_indU[0] - l_offset);
+  const int leftmost_indx_y = int(pos_indU[1] - l_offset);
+  const int leftmost_indx_z = int(pos_indU[2] - l_offset);
+
+  static_assert(CellsPerDiameter == 2);  // this is temporary!
+
+  // Step 2: get the number of super-samples within each of the 27 possible cells (This is not
+  //         actually necessary for some stencil evaluation-flavors)
+
+  // Step 2a: declare variables used to accumulate the total count and to act as a cache
+  //          for tracking the number of super-sample per cell
+  // -> we label these with [[maybe_unused]] attribute to suppress warnings for the flavors where
+  //    these variables aren't used.
+  // -> for applicable "flavors", the compiler should optimize out the unused variables.
+  // -> we want to keep the array-element size small for the cached_counts variable in order to
+  //    reduce memory-pressure on the stack (especially since every thread will be allocating this
+  //    much stack-space at the same time)
+  [[maybe_unused]] unsigned long total_count = 0;
+  [[maybe_unused]] uint_least16_t cached_counts[3][3][3];
+
+  // Step 2b: actually get the number of supersamples
+  if constexpr (flavor == StencilEvalKind::enclosed_stencil_vol_frac) {
+    for (int i = 0; i < 3; i++) {
+      for (int j = 0; j < 3; j++) {
+        for (int k = 0; k < 3; k++) {
+          unsigned int count = sphere.Count_Super_Samples<Log2DivsionsPerAx_PerCell>(
+              leftmost_indx_x + i, leftmost_indx_y + j, leftmost_indx_z + k);
+          cached_counts[i][j][k] = std::uint_least16_t(count);
+          total_count += count;
+        }
+      }
+    }
+  }
+
+  // kernel_printf("ref: %g, %g, %g\n", pos_indU[0], pos_indU[1], pos_indU[2]);
+
+  // Step 3: actually invoke f at each cell-location that overlaps with the stencil location
+  //    (for flavors where we specify some kind of enclosed volume as a function argument,
+  //    its okay to specify the function at a location without any overlap)
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 3; j++) {
+      for (int k = 0; k < 3; k++) {
+        const int indx_x = leftmost_indx_x + i;
+        const int indx_y = leftmost_indx_y + j;
+        const int indx_z = leftmost_indx_z + k;
+        const int ind3D  = indx_x + nx_g * (indx_y + ny_g * indx_z);
+
+        // kernel_printf("%d, %d, %d: %g\n", leftmost_indx_x + i, (leftmost_indx_y + j), (leftmost_indx_z + k),
+        //               double(counts[i][j][k])/total_count);
+
+        if constexpr (flavor == StencilEvalKind::enclosed_stencil_vol_frac) {
+          // pass both of the following to the function
+          //  1. fraction of the total stencil volume enclosed by the given cell
+          //  2. the 1d index specifying cell-location (for a field with ghost zones)
+          f(double(cached_counts[i][j][k]) / total_count, ind3D);
+
+        } else if constexpr (flavor == StencilEvalKind::enclosed_cell_vol_frac) {
+          // pass both of the following to the function:
+          // 1. pass the fraction of the cell-volume that is enclosed by the stencil
+          // 2. the 1d index specifying cell-location
+
+          // the reciprocal of the number of super-samples per cell, 2^(3*Log2DivsionsPerAx_PerCell), is a
+          // compile-time constant (a bit-shift is exact and avoids calling std::pow in every iteration)
+          constexpr double inverse_max_counts_per_cell = 1.0 / double(1ULL << (Log2DivsionsPerAx_PerCell * 3));
+
+          // in this case, we have not precomputed the count
+          unsigned int count = sphere.Count_Super_Samples<Log2DivsionsPerAx_PerCell>(indx_x, indx_y, indx_z);
+          f(count * inverse_max_counts_per_cell, ind3D);
+
+        } else if constexpr (flavor == StencilEvalKind::for_each_overlap_zone) {
+          bool is_enclosed = sphere.Encloses_Any_Supersample<Log2DivsionsPerAx_PerCell>(indx_x, indx_y, indx_z);
+          if (is_enclosed) f(ind3D);
+        }
+      }
+    }
+  }
+}
+
+/* A 27-cell deposition stencil for a sphere that has a radius of 1 cell-width. The stencil determines the
+ * fraction of the stencil that is enclosed in each cell; the overlap between the stencil and a given cell
+ * is estimated with super-sampling.
+ *
+ * \tparam Log2DivsionsPerAx_PerCell parameterizes the amount of super-sampling. For a given cell, the
+ *    super-sampling algorithm counts the number of subgrid-points enclosed by the stencil; there are
+ *    ``2^Log2DivsionsPerAx_PerCell`` sub-grid points along each axis. In other words, there are
+ *    ``2^(Log2DivsionsPerAx_PerCell*3)`` subgrid-points per cell.
+ */
+template <int Log2DivsionsPerAx_PerCell = 2>
+struct Sphere27 {
+  static_assert((Log2DivsionsPerAx_PerCell >= 0) and (Log2DivsionsPerAx_PerCell <= 5),
+                "Log2DivsionsPerAx_PerCell must be a non-negative integer. It also can't exceed 5 "
+                "so that 2^(Log2DivsionsPerAx_PerCell*3) can be represented by uint16_t");
+
+  /* along any axis, the max number of neighboring cells that may be enclosed by the stencil, that are on
+   * one side of the cell containing the stencil's center.
+   *
+   * \note
+   * this primarily exists for testing purposes!
+   */
+  inline static constexpr int max_enclosed_neighbors = 1;
+
+  /* Invokes ``f`` at each location included in the stencil centered at ``pos_indU``.
+   *
+   * ``f`` should expect 2 arguments:
+   *   1. ``stencil_enclosed_frac``: the fraction of the total stencil volume enclosed by the cell
+   *   2. ``indx3x``: the index used to index a 3D array (that has ghost zones)
+   */
+  template <typename Function>
+  static __device__ void for_each(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g, Function &&f)
+  {
+    constexpr StencilEvalKind eval_kind = StencilEvalKind::enclosed_stencil_vol_frac;
+    for_each_sphere_<Function, eval_kind, 2, Log2DivsionsPerAx_PerCell>(pos_indU, nx_g, ny_g,
+                                                                        std::forward<Function>(f));
+  }
+
+  /* Invokes ``f`` at each of the 27 cells of the stencil centered at ``pos_indU``.
+   *
+   * ``f`` should expect 3 arguments:
+   *   1. ``stencil_enclosed_frac``: the fraction of the total stencil volume enclosed by the cell. In other
+   *      words, it's the volume of the cell enclosed by the stencil divided by the total stencil volume.
+   *   2. ``vec_comp_factor``: a `hydro_utilities::VectorXYZ<Real>` where the elements represent math-vector
+   *      components (x, y, z). Essentially, this stores the volume integral (of the region enclosed by the
+   *      stencil) over the radial-unit vector (originating from the stencil center) divided by a
+   *      normalization constant.
+   *      - The normalization constant is the sum of the magnitudes of the volume-integrated radial-unit
+   *        vectors computed at each cell of the stencil
+   *      - The alternative would be to just normalize by the total volume. The problem with this alternative
+   *        is if you are trying to inject a constant amount of radial momentum per unit-volume, then
+   *        cancellation in the most-central cell may cause you to underinject momentum (primarily in the
+   *        case where the stencil is near the center of a cell)
+   *   3. ``indx3x``: the index used to index a 3D array (that has ghost zones)
+   */
+  template <typename Function>
+  static __device__ void for_each_vecflavor(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g, Function f)
+  {
+    // the sphere (with a radius of 1 cell-width) that represents the stencil
+    const SphereObj sphere{{pos_indU[0], pos_indU[1], pos_indU[2]}, 1 * 1};
+
+    // along each axis: the integer-index of the leftmost cell covered by the stencil.
+    const int start_x = int(pos_indU[0] - 1);
+    const int start_y = int(pos_indU[1] - 1);
+    const int start_z = int(pos_indU[2] - 1);
+
+    // First pass: for each of the 27 cells, count the enclosed super-samples & accumulate the normalization
+    // -> the element-size of the cache is intentionally kept small to reduce memory pressure on the stack
+    //    (especially since every thread will be allocating this much stack-space at the same time)
+    // -> If we weren't concerned about memory-pressure (e.g. we used cooperative_groups), we could save
+    //    time and consolidate the calculation of integrated vector components and the enclosed volume
+    //    into a single operation
+    uint_least16_t cached_counts[3][3][3];
+    unsigned long total_count = 0;
+    Real vector_norm          = 0.0;
+
+    for (int i = 0; i < 3; i++) {
+      const int cell_x = start_x + i;
+      for (int j = 0; j < 3; j++) {
+        const int cell_y = start_y + j;
+        for (int k = 0; k < 3; k++) {
+          const int cell_z = start_z + k;
+
+          const unsigned int cur_count =
+              sphere.Count_Super_Samples<Log2DivsionsPerAx_PerCell>(cell_x, cell_y, cell_z);
+          total_count += cur_count;
+          cached_counts[i][j][k] = std::uint_least16_t(cur_count);
+
+          // the integrated vector is NOT cached... That would put a LOT of strain on registers
+          const hydro_utilities::VectorXYZ<Real> integrated_vec =
+              sphere.Super_Sampled_RadialUnitVec_VolIntegral<Log2DivsionsPerAx_PerCell>(cell_x, cell_y, cell_z,
+                                                                                       pos_indU);
+          vector_norm += norm3d(integrated_vec[0], integrated_vec[1], integrated_vec[2]);
+        }
+      }
+    }
+
+    const Real vec_factor = 1.0 / vector_norm;
+
+    // Second pass: invoke f at each cell-location, passing:
+    //  1. fraction of the total stencil volume enclosed by the given cell
+    //  2. the volume integral (of the region enclosed by the stencil) over the radial-unit vector
+    //     (originating from the stencil center), rescaled by vec_factor
+    //  3. the 1d index specifying cell-location (for a field with ghost zones)
+    for (int i = 0; i < 3; i++) {
+      const int cell_x = start_x + i;
+      for (int j = 0; j < 3; j++) {
+        const int cell_y = start_y + j;
+        for (int k = 0; k < 3; k++) {
+          const int cell_z = start_z + k;
+          const int ind3D  = cell_x + nx_g * (cell_y + ny_g * cell_z);
+
+          // recompute the integrated vector (it has units of subcell volume before it is rescaled)
+          const hydro_utilities::VectorXYZ<Real> integrated_vec =
+              sphere.Super_Sampled_RadialUnitVec_VolIntegral<Log2DivsionsPerAx_PerCell>(cell_x, cell_y, cell_z,
+                                                                                       pos_indU);
+
+          const hydro_utilities::VectorXYZ<Real> scaled_vec{
+              integrated_vec[0] * vec_factor, integrated_vec[1] * vec_factor, integrated_vec[2] * vec_factor};
+
+          f(double(cached_counts[i][j][k]) / total_count, scaled_vec, ind3D);
+        }
+      }
+    }
+  }
+
+  /* Invokes ``f`` at each location included in the stencil centered at ``pos_indU``.
+   *
+   * This mirrors for_each. The difference is that ``f`` receives the fraction of the cell-volume that is
+   * enclosed by the stencil (for_each instead passes the fraction of the stencil-volume enclosed by the cell).
+   *
+   * \note
+   * This is primarily intended for testing purposes.
+   */
+  template <typename Function>
+  static __device__ void for_each_enclosedCellVol(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g,
+                                                  Function f)
+  {
+    constexpr StencilEvalKind eval_kind = StencilEvalKind::enclosed_cell_vol_frac;
+    for_each_sphere_<Function, eval_kind, 2, Log2DivsionsPerAx_PerCell>(pos_indU, nx_g, ny_g,
+                                                                        std::forward<Function>(f));
+  }
+
+  /* Calls the unary function ``f`` at every location where there probably is non-zero overlap with
+   * the stencil.
+   *
+   * \note
+   * This is significantly cheaper than calling for_each.
+   */
+  template <typename UnaryFunction>
+  static __device__ void for_each_overlap_zone(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g,
+                                               UnaryFunction &&f)
+  {
+    constexpr StencilEvalKind eval_kind = StencilEvalKind::for_each_overlap_zone;
+    for_each_sphere_<UnaryFunction, eval_kind, 2, Log2DivsionsPerAx_PerCell>(pos_indU, nx_g, ny_g,
+                                                                             std::forward<UnaryFunction>(f));
+  }
+
+  /* Returns the location nearest to ``pos_indU`` that the stencil's center can be shifted to so that
+   * the stencil avoids overlapping with the ghost zone.
+   *
+   * When the specified location already does not overlap with the ghost zone, that location is the
+   * returned value.
+   */
+  static __device__ hydro_utilities::VectorXYZ<Real> nearest_noGhostOverlap_pos(
+      hydro_utilities::VectorXYZ<Real> pos_indU, int ng_x, int ng_y, int ng_z, int n_ghost)
+  {
+    // a more clever alternative is possible: technically, the stencil just can't overlap with the nearest
+    // super-sampled point inside of the ghost zone. Any other amount of overlap is fair game!
+    constexpr Real min_offset = 1.0;
+    return nearest_noGhostOverlap_pos_(min_offset, pos_indU, ng_x, ng_y, ng_z, n_ghost);
+  }
+};
+
+/* A spherical stencil with a radius of ``CellsPerRadius`` cells (3 by default), where the inclusion of
+ * a cell in the sphere is a binary choice.
+ *
+ * Specifically, a cell is included if the cell-center lies within the sphere.
+ */
+template <int CellsPerRadius = 3>
+struct SphereBinary {
+  static_assert(CellsPerRadius > 0);
+
+  /* along any axis, the max number of neighboring cells that may be enclosed by the stencil, that are on
+   * one side of the cell containing the stencil's center.
+   *
+   * \note
+   * this primarily exists for testing purposes
+   */
+  inline static constexpr int max_enclosed_neighbors = CellsPerRadius;
+
+  /* Invokes ``f`` at each cell whose center is enclosed by the stencil centered at ``pos_indU``.
+   *
+   * ``f`` is passed 2 arguments:
+   *   1. the fraction of the stencil attributed to the cell (1 divided by the number of enclosed cells)
+   *   2. the 1D index ``indx_x + nx_g * (indx_y + ny_g * indx_z)`` of the cell
+   */
+  template <typename Function>
+  static __device__ void for_each(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g, Function f)
+  {
+    const SphereObj sphere{{pos_indU[0], pos_indU[1], pos_indU[2]}, CellsPerRadius * CellsPerRadius};
+
+    // along each axis: the integer-index of the leftmost cell that may be covered by the stencil
+    const int start_x = int(pos_indU[0]) - CellsPerRadius;
+    const int start_y = int(pos_indU[1]) - CellsPerRadius;
+    const int start_z = int(pos_indU[2]) - CellsPerRadius;
+
+    // number of candidate cells along each axis
+    const int width = (2 * CellsPerRadius) + 1;
+
+    // First pass: count the cells whose centers are enclosed by the sphere
+    int num_enclosed = 0;
+    for (int i = 0; i < width; i++) {
+      const int cell_x = start_x + i;
+      for (int j = 0; j < width; j++) {
+        const int cell_y = start_y + j;
+        for (int k = 0; k < width; k++) {
+          const int cell_z = start_z + k;
+          if (sphere.encloses_point(cell_x + 0.5, cell_y + 0.5, cell_z + 0.5)) num_enclosed++;
+        }
+      }
+    }
+
+    // each enclosed cell encloses this fraction of the sphere
+    const double frac_per_cell = 1.0 / num_enclosed;
+
+    // Second pass: invoke f at each enclosed cell, passing both:
+    //  1. fraction of the total stencil volume enclosed by the given cell
+    //  2. the 1d index specifying cell-location (for a field with ghost zones)
+    for (int i = 0; i < width; i++) {
+      const int cell_x = start_x + i;
+      for (int j = 0; j < width; j++) {
+        const int cell_y = start_y + j;
+        for (int k = 0; k < width; k++) {
+          const int cell_z = start_z + k;
+          if (not sphere.encloses_point(cell_x + 0.5, cell_y + 0.5, cell_z + 0.5)) continue;
+
+          const int ind3D = cell_x + nx_g * (cell_y + ny_g * cell_z);
+          f(frac_per_cell, ind3D);
+        }
+      }
+    }
+  }
+
+  /* Invokes ``f`` at every candidate cell of the stencil centered at ``pos_indU``.
+   *
+   * This mirrors for_each. The differences are that ``f`` receives the fraction of the cell-volume that is
+   * enclosed by the stencil (1.0 if the cell-center is enclosed, 0.0 otherwise), and that ``f`` is also
+   * invoked for the candidate cells that aren't enclosed.
+   *
+   * \note
+   * This is primarily intended for testing purposes.
+   */
+  template <typename Function>
+  static __device__ void for_each_enclosedCellVol(hydro_utilities::VectorXYZ<Real> pos_indU, int nx_g, int ny_g,
+                                                  Function f)
+  {
+    const SphereObj sphere{{pos_indU[0], pos_indU[1], pos_indU[2]}, CellsPerRadius * CellsPerRadius};
+
+    // along each axis: the integer-index of the leftmost cell that may be covered by the stencil
+    const int start_x = int(pos_indU[0]) - CellsPerRadius;
+    const int start_y = int(pos_indU[1]) - CellsPerRadius;
+    const int start_z = int(pos_indU[2]) - CellsPerRadius;
+
+    const int width = (2 * CellsPerRadius) + 1;
+    for (int i = 0; i < width; i++) {
+      const int cell_x = start_x + i;
+      for (int j = 0; j < width; j++) {
+        const int cell_y = start_y + j;
+        for (int k = 0; k < width; k++) {
+          const int cell_z = start_z + k;
+
+          const bool enclosed = sphere.encloses_point(cell_x + 0.5, cell_y + 0.5, cell_z + 0.5);
+          const int ind3D     = cell_x + nx_g * (cell_y + ny_g * cell_z);
+          f(double(enclosed), ind3D);
+        }
+      }
+    }
+  }
+
+  /* Calls the unary function ``f`` with the 1D index ``indx_x + ng_x * (indx_y + ng_y * indx_z)`` of each
+   * cell whose center is enclosed by the stencil centered at ``pos_indU``.
+   */
+  template <typename UnaryFunction>
+  static __device__ void for_each_overlap_zone(hydro_utilities::VectorXYZ<Real> pos_indU, int ng_x, int ng_y,
+                                               UnaryFunction f)
+  {
+    const SphereObj sphere{/* center = */ {pos_indU[0], pos_indU[1], pos_indU[2]},
+                           /* squared_radius = */ CellsPerRadius * CellsPerRadius};
+
+    // along each axis: the integer-index of the leftmost cell that may be covered by the stencil
+    const int start_x = int(pos_indU[0]) - CellsPerRadius;
+    const int start_y = int(pos_indU[1]) - CellsPerRadius;
+    const int start_z = int(pos_indU[2]) - CellsPerRadius;
+
+    const int width = (2 * CellsPerRadius) + 1;
+    for (int i = 0; i < width; i++) {
+      const int cell_x = start_x + i;
+      for (int j = 0; j < width; j++) {
+        const int cell_y = start_y + j;
+        for (int k = 0; k < width; k++) {
+          const int cell_z = start_z + k;
+
+          if (sphere.encloses_point(cell_x + 0.5, cell_y + 0.5, cell_z + 0.5)) {
+            f(cell_x + ng_x * (cell_y + ng_y * cell_z));
+          }
+        }
+      }
+    }
+  }
+
+  /* Returns the location nearest to ``pos_indU`` that the stencil's center can be shifted to so that
+   * the stencil avoids overlapping with the ghost zone.
+   *
+   * When the specified location already does not overlap with the ghost zone, that location is the
+   * returned value.
+   */
+  static __device__ hydro_utilities::VectorXYZ<Real> nearest_noGhostOverlap_pos(
+      hydro_utilities::VectorXYZ<Real> pos_indU, int ng_x, int ng_y, int ng_z, int n_ghost)
+  {
+    // a more clever implementation is possible: technically, the stencil just can't overlap with the
+    // center of a cell in the ghost-zone. Any other amount of overlap is fair game!
+    constexpr Real min_offset = CellsPerRadius;
+    return nearest_noGhostOverlap_pos_(min_offset, pos_indU, ng_x, ng_y, ng_z, n_ghost);
+  }
+};
+
+}  // namespace fb_stencil
diff --git a/src/global/global.cpp b/src/global/global.cpp
index 63094637c..a203473ca 100644
--- a/src/global/global.cpp
+++ b/src/global/global.cpp
@@ -399,15 +399,29 @@ void Init_Param_Struct_Members(ParameterMap &pmap, struct Parameters *parms)
   Load_String_Param_Into_Char_Buffer(pmap, "outdir", parms->outdir, "");
   Load_String_Param_Into_Char_Buffer(pmap, "indir", parms->indir, "");
 
-  // in the future, the feedback module will read in its own parameters (the global Parameter struct won't
-  // know anything about it)
-#ifdef FEEDBACK
-  #ifndef NO_SN_FEEDBACK
-  Load_String_Param_Into_Char_Buffer(pmap, "snr_filename", parms->snr_filename, "");
-  #endif
-  #ifndef NO_WIND_FEEDBACK
-  Load_String_Param_Into_Char_Buffer(pmap, "sw_filename", parms->sw_filename, "");
-  #endif
+  // Deal with the gravity.gas_only_use_static_grav parameter
+  // - it would be great to move reading of this parameter to the Gravity class (that would probably
+  //   require us to unify STATIC_GRAV and GRAVITY)
+  // - the following flag is only meaningful when GRAVITY and GRAVITY_ANALYTIC_COMP
+  //   are defined.
+  // - In other cases, we raise an error if specified without a sensible value.
+#if defined(GRAVITY) && defined(GRAVITY_ANALYTIC_COMP)
+  parms->gas_only_use_static_grav = pmap.value_or("gravity.gas_only_use_static_grav", false);
+#elif defined(GRAVITY)
+  parms->gas_only_use_static_grav = pmap.value_or("gravity.gas_only_use_static_grav", false);
+  CHOLLA_ASSERT(parms->gas_only_use_static_grav == false,
+                "It is an error to set gravity.gas_only_use_static_grav to `true` when Cholla is compiled with "
+                "GRAVITY but not GRAVITY_ANALYTIC_COMP");
+#elif defined(STATIC_GRAV)
+  parms->gas_only_use_static_grav = pmap.value_or("gravity.gas_only_use_static_grav", true);
+  CHOLLA_ASSERT(
+      parms->gas_only_use_static_grav == true,
+      "It is an error to set gravity.gas_only_use_static_grav to `false` when Cholla is compiled with STATIC_GRAV");
+#else
+  CHOLLA_ASSERT(not pmap.has_param("gravity.gas_only_use_static_grav"),
+                "it doesn't make sense to specify gravity.gas_only_use_static_grav when cholla isn't compiled "
+                "with gravity");
+  parms->gas_only_use_static_grav = false;
 #endif
 
   // in the future, it would probably be good to move this logic into Cosmology::Initialize (or somewhere similar)
diff --git a/src/global/global.h b/src/global/global.h
index ef9665c27..ac9da10ab 100644
--- a/src/global/global.h
+++ b/src/global/global.h
@@ -121,16 +121,17 @@ typedef double Real;
 typedef long int grav_int_t;
 #endif
 
-#ifdef PARTICLES
-  #ifdef PARTICLES_LONG_INTS
+#ifdef PARTICLES_LONG_INTS
 typedef long int part_int_t;
-  #else
+#else
 typedef int part_int_t;
-  #endif  // PARTICLES_LONG_INTS
+#endif  // PARTICLES_LONG_INTS
 
-  #include <vector>
+#include <vector>
 typedef std::vector<Real> real_vector_t;
 typedef std::vector<part_int_t> int_vector_t;
+
+#ifdef PARTICLES
   #ifdef MPI_CHOLLA
 // Constants for the inital size of the buffers for particles transfer
 // and the number of data transferred for each particle
@@ -208,6 +209,9 @@ struct Parameters {
   bool output_always      = false;
   bool legacy_flat_outdir = false;
   int n_steps_limit       = -1;  // Note that negative values indicate that there is no limit
+  // At the moment, the following flag is only meaningful when GRAVITY and GRAVITY_ANALYTIC_COMP
+  // are defined. In other cases, we force this to initialize to a sensible value
+  bool gas_only_use_static_grav;
 #ifdef STATIC_GRAV
   int custom_grav = 0;  // flag to set specific static gravity field
 #endif
@@ -284,14 +288,6 @@ struct Parameters {
   // machine dependent seed will be generated.
   std::uint_fast64_t prng_seed = 0;
 #endif  // PARTICLES
-#ifdef FEEDBACK
-  #ifndef NO_SN_FEEDBACK
-  char snr_filename[MAXLEN];
-  #endif
-  #ifndef NO_WIND_FEEDBACK
-  char sw_filename[MAXLEN];
-  #endif
-#endif
 #ifdef ROTATED_PROJECTION
   // initialize rotation parameters to zero
   int nxr;
diff --git a/src/gravity/grav3D.cpp b/src/gravity/grav3D.cpp
index 8f29c51a1..5faf46304 100644
--- a/src/gravity/grav3D.cpp
+++ b/src/gravity/grav3D.cpp
@@ -149,6 +149,8 @@ void Grav3D::AllocateMemory_CPU(void)
 
   #ifdef GRAVITY_ANALYTIC_COMP
   F.analytic_potential_h = (Real *)malloc(n_cells_potential * sizeof(Real));
+  #else
+  F.analytic_potential_h = nullptr;
   #endif
 }
 
diff --git a/src/gravity/grav3D.h b/src/gravity/grav3D.h
index dc9d85579..e97142c6e 100644
--- a/src/gravity/grav3D.h
+++ b/src/gravity/grav3D.h
@@ -130,9 +130,10 @@ class Grav3D
      * grid at the previous time step */
     Real *potential_1_h;
 
-#ifdef GRAVITY_ANALYTIC_COMP
+    /*! \var analytic_potential_h
+     *  \brief Array containing the gravitational potential of each cell from the static, analytic potential.
+     */
     Real *analytic_potential_h;
-#endif
 
 #ifdef GRAVITY_GPU
 
@@ -150,9 +151,10 @@ class Grav3D
      * in the grid at the previous time step */
     Real *potential_1_d;
 
-  #ifdef GRAVITY_ANALYTIC_COMP
+    /*! \var analytic_potential_d
+     *  \brief Device Array containing the gravitational potential of each cell from the static, analytic potential.
+     */
     Real *analytic_potential_d;
-  #endif
 
 #endif  // GRAVITY_GPU
 
diff --git a/src/gravity/gravity_functions_gpu.cu b/src/gravity/gravity_functions_gpu.cu
index b92d19084..0244c869e 100644
--- a/src/gravity/gravity_functions_gpu.cu
+++ b/src/gravity/gravity_functions_gpu.cu
@@ -17,6 +17,8 @@ void Grav3D::AllocateMemory_GPU()
 
     #ifdef GRAVITY_ANALYTIC_COMP
   GPU_Error_Check(cudaMalloc((void **)&F.analytic_potential_d, n_cells_potential * sizeof(Real)));
+    #else
+  F.analytic_potential_d = nullptr;
     #endif
 
     #ifdef GRAV_ISOLATED_BOUNDARY_X
diff --git a/src/grid/grid3D.cpp b/src/grid/grid3D.cpp
index 34b30a4ed..272fe4993 100644
--- a/src/grid/grid3D.cpp
+++ b/src/grid/grid3D.cpp
@@ -142,6 +142,8 @@ void Grid3D::Initialize(struct Parameters *P)
   }
 #endif
 
+  H.gas_only_use_static_grav = P->gas_only_use_static_grav;
+
   // Set the CFL coefficient (a global variable)
   C_cfl = 0.3;
 
@@ -415,6 +417,24 @@ void Grid3D::Execute_Hydro_Integrator(void)
   Timer.Hydro_Integrator.Start();
 #endif  // CPU_TIME
 
+  [[maybe_unused]] Real *d_Grav_potential = nullptr;
+  if (H.gas_only_use_static_grav) {
+    // this supports a crude-workaround for when we run cholla with
+    // - particles that are influenced by their own self-gravity, the gravity of the gas, and a static
+    //   analytic potential
+    // - AND we only want the gas to be influenced by the static analytic potential
+    //
+    // Be aware, STATIC_GRAV won't directly use this pointer (in fact, when STATIC_GRAV is defined, this
+    // pointer should be NULL). We should probably refactor to unify STATIC_GRAV and GRAVITY
+#ifdef GRAVITY
+    d_Grav_potential = Grav.F.analytic_potential_d;
+#else
+    CHOLLA_ERROR("this should be unreachable when GRAVITY isn't defined");
+#endif
+  } else {
+    d_Grav_potential = C.d_Grav_potential;
+  }
+
   // this buffer holds 1 element that is initialized to 0
   cuda_utilities::DeviceVector<int> error_code_buffer(1, true);
 
@@ -440,13 +460,13 @@ void Grid3D::Execute_Hydro_Integrator(void)
   } else if (H.nx > 1 && H.ny > 1 && H.nz > 1)  // 3D
   {
 #ifdef VL
-    VL_Algorithm_3D_CUDA(C.device, C.d_Grav_potential, H.nx, H.ny, H.nz, x_off, y_off, z_off, H.n_ghost, H.dx, H.dy,
-                         H.dz, H.xbound, H.ybound, H.zbound, H.dt, H.n_fields, H.custom_grav, H.density_floor,
+    VL_Algorithm_3D_CUDA(C.device, d_Grav_potential, H.nx, H.ny, H.nz, x_off, y_off, z_off, H.n_ghost, H.dx, H.dy, H.dz,
+                         H.xbound, H.ybound, H.zbound, H.dt, H.n_fields, H.custom_grav, H.density_floor,
                          C.Grav_potential, SlowCellConditionChecker(1.0 / H.min_dt_slow, H.dx, H.dy, H.dz),
                          error_code_buffer.data());
 #endif  // VL
 #ifdef SIMPLE
-    Simple_Algorithm_3D_CUDA(C.device, C.d_Grav_potential, H.nx, H.ny, H.nz, x_off, y_off, z_off, H.n_ghost, H.dx, H.dy,
+    Simple_Algorithm_3D_CUDA(C.device, d_Grav_potential, H.nx, H.ny, H.nz, x_off, y_off, z_off, H.n_ghost, H.dx, H.dy,
                              H.dz, H.xbound, H.ybound, H.zbound, H.dt, H.n_fields, H.custom_grav, H.density_floor,
                              C.Grav_potential, SlowCellConditionChecker(1.0 / H.min_dt_slow, H.dx, H.dy, H.dz),
                              error_code_buffer.data());
@@ -465,7 +485,8 @@ void Grid3D::Execute_Hydro_Integrator(void)
 #endif  // CPU_TIME
 }
 
-Real Grid3D::Update_Hydro_Grid(std::function<void(Grid3D &)> &chemistry_callback)
+Real Grid3D::Update_Hydro_Grid(std::function<void(Grid3D &)> &feedback_callback,
+                               std::function<void(Grid3D &)> &chemistry_callback)
 {
 #ifdef ONLY_PARTICLES
   // Don't integrate the Hydro when only solving for particles
@@ -482,8 +503,13 @@ Real Grid3D::Update_Hydro_Grid(std::function<void(Grid3D &)> &chemistry_callback
   Extrapolate_Grav_Potential();
 #endif  // GRAVITY
 
+  // Evolve the hydrodynamical quantities
   Execute_Hydro_Integrator();
 
+  // apply the floors
+  // ================
+  // -> we need to do this before we handle source terms because it is necessary for chemistry/cooling
+
 #ifdef TEMPERATURE_FLOOR
   // Set the lower limit temperature (Internal Energy)
   Real U_floor;
@@ -502,6 +528,12 @@ Real Grid3D::Update_Hydro_Grid(std::function<void(Grid3D &)> &chemistry_callback
   #endif
 #endif  // SCALAR_FLOOR
 
+  // apply source terms
+  // ==================
+  if (feedback_callback) {
+    feedback_callback(*this);
+  }
+
   // == Perform chemistry/cooling (there are a few different cases) ==
 
   if (chemistry_callback) {
@@ -565,6 +597,11 @@ Real Grid3D::Update_Hydro_Grid(std::function<void(Grid3D &)> &chemistry_callback
   #endif  // CPU_TIME
 #endif    // COOLING_GRACKLE
 
+  // Finally, it is time to handle calculation of the timestep for the next cycle
+  // ============================================================================
+  // -> first, we perform certain modifications to the fields that are partially
+  //    motivated by the impact that they have on the size of the timestep
+
   // Temperature Ceiling
 #ifdef TEMPERATURE_CEILING
   // 1e51 ergs / (m_p * (pc/cm)^3) = 45000 km/s
diff --git a/src/grid/grid3D.h b/src/grid/grid3D.h
index 1cdaaab39..8869be7ac 100644
--- a/src/grid/grid3D.h
+++ b/src/grid/grid3D.h
@@ -252,6 +252,9 @@ struct Header {
   Real sphere_center_y;
   Real sphere_center_z;
 
+  // only meaningful when GRAVITY and GRAVITY_ANALYTIC_COMP are defined
+  bool gas_only_use_static_grav;
+
 #ifdef GRAVITY
   /*! \var n_ghost_potential_offset
    *  \brief Number of offset betewen hydro_ghost_cells and
@@ -421,7 +424,9 @@ class Grid3D
     Real *d_density, *d_momentum_x, *d_momentum_y, *d_momentum_z, *d_Energy, *d_scalar, *d_basic_scalar,
         *d_dust_density, *d_magnetic_x, *d_magnetic_y, *d_magnetic_z, *d_GasEnergy;
 
-    /*! pointer to gravitational potential on device */
+    /*! pointer to gravitational potential on device
+     *  - This is primarily used to hold the extrapolated potential
+     */
     Real *d_Grav_potential;
   } C;
 
@@ -469,11 +474,15 @@ class Grid3D
   /*! \fn void Update_Hydro_Grid(void)
    *  \brief Do all steps to update the hydro.
    *
+   *  \param feedback_callback is a crude way to optionally provide a feedback
+   *  function that is invoked after the hydro-integrator, but before
+   *  heating/cooling/chemistry
    *  \param chemistry_callback is a crude way to optionally provide a cooling
    *  function that is invoked after the hydro-integrator. At the moment,
    *  this does not support chemistry.
    */
-  Real Update_Hydro_Grid(std::function<void(Grid3D &)> &chemistry_callback);
+  Real Update_Hydro_Grid(std::function<void(Grid3D &)> &feedback_callback,
+                         std::function<void(Grid3D &)> &chemistry_callback);
 
   void Update_Time();
   /*! \fn void Write_Header_Text(FILE *fp)
diff --git a/src/main.cpp b/src/main.cpp
index edb0472a1..5d59453c4 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -177,14 +177,11 @@ int main(int argc, char *argv[])
   }
 #endif
 
+  std::function<void(Grid3D &)> feedback_callback;
+
 #if defined(FEEDBACK) && defined(PARTICLE_AGE)
   FeedbackAnalysis sn_analysis(G, &P);
-  #ifndef NO_SN_FEEDBACK
-  feedback::Init_State(&P);
-  #endif  // NO_SN_FEEDBACK
-  #ifndef NO_WIND_FEEDBACK
-  feedback::Init_Wind_State(&P);
-  #endif
+  feedback_callback = feedback::configure_feedback_callback(P, pmap, sn_analysis);
 #endif  // FEEDBACK && PARTICLE_AGE
 
 #ifdef STAR_FORMATION
@@ -279,10 +276,6 @@ int main(int argc, char *argv[])
       G.H.dt = next_scheduled_time - G.H.t;
     }
 
-#if defined(FEEDBACK) && defined(PARTICLE_AGE)
-    feedback::Cluster_Feedback(G, sn_analysis);
-#endif  // FEEDBACK && PARTICLE_AGE
-
 #ifdef PARTICLES
     // Advance the particles KDK( first step ): Velocities are updated by 0.5*dt
     // and positions are updated by dt
@@ -292,7 +285,7 @@ int main(int argc, char *argv[])
 #endif
 
     // Advance the grid by one timestep
-    dti = G.Update_Hydro_Grid(chemistry_callback);
+    dti = G.Update_Hydro_Grid(feedback_callback, chemistry_callback);
 
     // update the simulation time ( t += dt )
     G.Update_Time();
diff --git a/src/model/disk_ICs.cu b/src/model/disk_ICs.cu
index 6bcd9012b..024e1bfc8 100644
--- a/src/model/disk_ICs.cu
+++ b/src/model/disk_ICs.cu
@@ -916,10 +916,7 @@ void Grid3D::Disk_3D(Parameters p)
   //   thermal-energy-density field in the total-energy-density field (we need to
   //   add the kinetic energy contribution afterwards)
 
-  bool self_gravity = false;
-#ifdef GRAVITY
-  self_gravity = true;
-#endif
+  bool self_gravity = not p.gas_only_use_static_grav;
 
   // since we are adding contributions from the halo across the entire domain, let's initialize it
   // first (we will need to account for its influence on the radial pressure gradients when
@@ -942,9 +939,6 @@ void Grid3D::Disk_3D(Parameters p)
       //                 `(gamma - 1) * specific_internal_energy`
       Real isoth_term                     = hdp.cs * hdp.cs;  // <- square of the isothermal sound speed
       Real initial_gas_scale_height_guess = gas_disk.H_d;
-
-      // we always pass H.n_ghost into SelfGravHydroStaticColMaker because the class computes the total
-      // length of the column as `H.n_ghost * 2 + p.nz`
       SelfGravHydroStaticColMaker col_maker(H.n_ghost, ZGridProps(p.zmin, p.zlen, p.nz), isoth_term, nongas_phi_fn,
                                             initial_gas_scale_height_guess);
 
@@ -955,8 +949,6 @@ void Grid3D::Disk_3D(Parameters p)
       };
       partial_initialize_isothermal_disk(p, this->H, *this, this->C, hdp, col_maker, vrot2_from_phi_fn);
     } else {
-      // we always pass H.n_ghost into IsothermalStaticGravHydroStaticColMaker because the class computes the
-      // total length of the column as `H.n_ghost * 2 + p.nz`
       IsothermalStaticGravHydroStaticColMaker col_maker(p.zlen / ((Real)p.nz), p.nz, H.n_ghost, hdp);
       // the following function is used to compute the rotational velocity for a collisionless particle
       // (this includes an estimate for the potential of the gas disk)
@@ -1131,18 +1123,11 @@ void partial_initialize_isothermal_disk(const Parameters& p, const Header& H, co
                                         const HydroStaticColMaker& col_maker,
                                         const Vrot2FromPotential& vrot2_from_phi_fn)
 {
-  // Step 0: allocate buffers & determine the loop bounds
+  // Step 0: allocate buffers
   // -> this buffer tracks the midplane mass density
   std::vector<Real> rho_midplane_2Dbuffer((H.ny * H.nx), 0.0);
   // -> this buffer tracks the locations where we have contributed mass from the disk
   std::vector<Real> rho_disk((H.nz * H.ny * H.nx), 0.0);
-  // -> to apply the same pressure-derivative stencil everywhere (when assigning velocities), we
-  //    must make sure to initialize density & pressure slightly outside of the the active zone
-  CHOLLA_ASSERT(H.n_ghost >= 1, "Ghost zone depth must be 1 or larger");
-  const int index_start = H.n_ghost - 1;
-  const int k_stop      = H.nz - (H.n_ghost - 1);
-  const int j_stop      = H.ny - (H.n_ghost - 1);
-  const int i_stop      = H.nx - (H.n_ghost - 1);
 
   // Step 1: add the gas-disk density and thermal energy to the density and energy arrays
   // -> At each (x,y) pair, we use col_maker to loop over all z-values and compute "hydrostatic column"
@@ -1151,8 +1136,8 @@ void partial_initialize_isothermal_disk(const Parameters& p, const Header& H, co
   // -> then we compute the disk density & thermal energy based on values in that buffer
   std::vector<Real> rho_buffer(col_maker.buffer_len(), 0.0);
   bool any_density_error = false;
-  for (int j = index_start; j < j_stop; j++) {
-    for (int i = index_start; i < i_stop; i++) {
+  for (int j = H.n_ghost; j < H.ny - H.n_ghost; j++) {
+    for (int i = H.n_ghost; i < H.nx - H.n_ghost; i++) {
       // get the centered x & y positions (the way the function is written, we also get a z position)
       const int dummy_k = H.n_ghost + H.ny;
       Real x_pos, y_pos, dummy_z_pos;
@@ -1170,7 +1155,7 @@ void partial_initialize_isothermal_disk(const Parameters& p, const Header& H, co
       rho_midplane_2Dbuffer[i + j * H.nx] = rho_midplane;
 
       // store densities (from the column)
-      for (int k = index_start; k < k_stop; k++) {
+      for (int k = H.n_ghost; k < H.nz - H.n_ghost; k++) {
         int id = i + j * H.nx + k * H.nx * H.ny;
 
         // get density from hydrostatic column computation
diff --git a/src/model/disk_galaxy.cu b/src/model/disk_galaxy.cu
index 6cf2ac64e..c5c60b7cb 100644
--- a/src/model/disk_galaxy.cu
+++ b/src/model/disk_galaxy.cu
@@ -5,13 +5,23 @@
 
 // all masses in M_sun and all distances in kpc
 
+// Original Philosophy
+// -------------------
 // For the MilkyWay model, we adopt radial scale lengths of 2.5 kpc and 3.5 kpc for
 // the stellar and gas disks, respectively. If the newly formed stars follow the
 // Kennicut-Schmidt law with a power of 1.4, the newly formed stars will organize
-// into a disk with scale-length of 2.5 kpc
-const ClusteredDiskGalaxy galaxies::MW(ClusterMassDistribution{1e2, 5e5, 2.0},
-                                       MiyamotoNagaiPotential{6.5e10, 2.5, 0.7},                // stellar_disk
-                                       GasDiskProps{0.15 * 6.5e10, 3.5, 0.7, 1e4, true, 0.02},  // gas_disk
+// into a disk with scale-length of 2.5 kpc.
+// We also liked an upper cluster-mass limit of 5e5 Msun
+//
+// Actual Choice
+// -------------
+// For consistency with the CGOLs style model:
+//  -> stellar disk scale-length of 2.7 kpc
+//  -> gas disk scale-length of 5.4 kpc
+//  -> upper cluster-mass limit of 2e5 Msun
+const ClusteredDiskGalaxy galaxies::MW(ClusterMassDistribution{1e2, 2e5, 2.0},
+                                       MiyamotoNagaiPotential{6.5e10, 2.7, 0.7},                // stellar_disk
+                                       GasDiskProps{0.15 * 6.5e10, 5.4, 0.7, 1e4, true, 0.02},  // gas_disk
                                        1.077e12, 261, 18, 157.0);
 const DiskGalaxy galaxies::M82(MiyamotoNagaiPotential{1.0e10, 0.8, 0.15},                       // stellar_disk
                                GasDiskProps{0.25 * 1.0e10, 2 * 0.8, 0.15, 1e4, true, 2 * 0.8},  // gas_disk
diff --git a/src/model/disk_galaxy.h b/src/model/disk_galaxy.h
index 23e0d0b95..9a773a73f 100644
--- a/src/model/disk_galaxy.h
+++ b/src/model/disk_galaxy.h
@@ -205,13 +205,13 @@ class ClusteredDiskGalaxy : public DiskGalaxy
 inline Real Get_StarCluster_Truncation_Radius(const Parameters& p)
 {
   if ((20.4 < p.xlen) and (p.xlen < 20.5)) return 9.5;
-  return p.xlen / 2.0 - 0.2;
+  return p.xlen / 2.0 - 0.5;
 }
 
 inline Real Get_Gas_Truncation_Radius(const Parameters& p)
 {
   if ((20.4 < p.xlen) and (p.xlen < 20.5)) return 9.9;
-  return p.xlen / 2.0 - 0.1;
+  return p.xlen / 2.0 - 0.3;
 }
 
 // Forward declare galaxy instances. These are defined in disk_galaxy.cu
diff --git a/src/utils/basic_structs.h b/src/utils/basic_structs.h
index 42d7176b3..60ae81168 100644
--- a/src/utils/basic_structs.h
+++ b/src/utils/basic_structs.h
@@ -19,6 +19,14 @@ namespace hydro_utilities
 /*!
  * \brief A data only struct that acts as a simple 3 element vector.
  *
+ * Because this is an aggregate type, you can construct it as:
+ * \code{.cpp}
+ *    VectorXYZ<double> my_var{1.0, 2.0, 3.0};
+ *    VectorXYZ<float> my_var2 = {1.0, 2.0, 3.0};
+ *    VectorXYZ<int> my_var3;  // Using the default constructor. Based on a note from the docs of
+ *                             // std::array, the values for a non-class type (like float/double/int)
+ *                             // may be indeterminate in this case.
+ * \endcode
  */
 template <typename T>
 struct VectorXYZ {
diff --git a/src/utils/gpu.hpp b/src/utils/gpu.hpp
index 196db01e4..33cf8858c 100644
--- a/src/utils/gpu.hpp
+++ b/src/utils/gpu.hpp
@@ -12,12 +12,29 @@
 
   #include <hip/hip_runtime.h>
 
+  #include <hip/hip_cooperative_groups.h>
+
   #if defined(PARIS) || defined(PARIS_GALACTIC)
 
     #include <hipfft.h>
 
   #endif  // CUFFT PARIS PARIS_GALACTIC
 
+  #if !defined(HIP_VERSION) || (HIP_VERSION < 40200000)
+  // here, we are enforcing the requirement that HIP is version 4.2 or newer
+  // -> this check picks 4.2 because that's when the `HIP_VERSION` macro was first provided.
+  //    This fact and the format of `HIP_VERSION` are described here:
+  //    https://rocm.docs.amd.com/projects/HIP/en/docs-5.7.1/user_guide/faq.html#how-can-i-know-the-version-of-hip
+  // -> if we really want to add support for older HIP versions, we may need to compile test
+  //    programs to query version numbers as part of the build-systems. But that seems
+  //    unnecessary since, as of Nov 2025, AMD doesn't seem to document versions before 5.0
+  // -> in practice, I suspect we probably use some features that require versions of HIP
+  //    released some time after 4.2 (ideally, we would update the HIP_VERSION requirement
+  //    to reflect that)
+
+    #error "The current version of HIP is too old"
+  #endif
+
   #define WARPSIZE 64
 static constexpr int maxWarpsPerBlock = 1024 / WARPSIZE;
 
@@ -29,46 +46,50 @@ static constexpr int maxWarpsPerBlock = 1024 / WARPSIZE;
   #define CUFFT_SUCCESS HIPFFT_SUCCESS
   #define cufftResult_t hipfftResult_t
 
-  #define cudaDeviceSynchronize              hipDeviceSynchronize
-  #define cudaError                          hipError_t
-  #define cudaError_t                        hipError_t
-  #define cudaErrorInsufficientDriver        hipErrorInsufficientDriver
-  #define cudaErrorNoDevice                  hipErrorNoDevice
-  #define cudaEvent_t                        hipEvent_t
-  #define cudaEventCreate                    hipEventCreate
-  #define cudaEventElapsedTime               hipEventElapsedTime
-  #define cudaEventRecord                    hipEventRecord
-  #define cudaEventSynchronize               hipEventSynchronize
-  #define cudaFree                           hipFree
-  #define cudaFreeHost                       hipHostFree
-  #define cudaGetDevice                      hipGetDevice
-  #define cudaGetDeviceCount                 hipGetDeviceCount
-  #define cudaGetErrorString                 hipGetErrorString
-  #define cudaGetLastError                   hipGetLastError
-  #define cudaHostAlloc                      hipHostMalloc
-  #define cudaHostAllocDefault               hipHostMallocDefault
-  #define cudaMalloc                         hipMalloc
-  #define cudaMemcpy                         hipMemcpy
-  #define cudaMemcpyAsync                    hipMemcpyAsync
-  #define cudaMemcpyPeer                     hipMemcpyPeer
-  #define cudaMemcpyDeviceToHost             hipMemcpyDeviceToHost
-  #define cudaMemcpyDeviceToDevice           hipMemcpyDeviceToDevice
-  #define cudaMemcpyHostToDevice             hipMemcpyHostToDevice
-  #define cudaMemGetInfo                     hipMemGetInfo
-  #define cudaMemset                         hipMemset
-  #define cudaReadModeElementType            hipReadModeElementType
-  #define cudaSetDevice                      hipSetDevice
-  #define cudaSuccess                        hipSuccess
-  #define cudaDeviceProp                     hipDeviceProp_t
-  #define cudaGetDeviceProperties            hipGetDeviceProperties
-  #define cudaPointerAttributes              hipPointerAttribute_t
-  #define cudaPointerGetAttributes           hipPointerGetAttributes
-  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
-  #define cudaMemGetInfo                     hipMemGetInfo
-  #define cudaDeviceGetPCIBusId              hipDeviceGetPCIBusId
-  #define cudaPeekAtLastError                hipPeekAtLastError
-  #define cudaFuncAttributes                 hipFuncAttributes
-  #define cudaFuncGetAttributes              hipFuncGetAttributes
+  #define cudaDeviceSynchronize                         hipDeviceSynchronize
+  #define cudaError                                     hipError_t
+  #define cudaError_t                                   hipError_t
+  #define cudaErrorInsufficientDriver                   hipErrorInsufficientDriver
+  #define cudaErrorNoDevice                             hipErrorNoDevice
+  #define cudaEvent_t                                   hipEvent_t
+  #define cudaEventCreate                               hipEventCreate
+  #define cudaEventElapsedTime                          hipEventElapsedTime
+  #define cudaEventRecord                               hipEventRecord
+  #define cudaEventSynchronize                          hipEventSynchronize
+  #define cudaFree                                      hipFree
+  #define cudaFreeHost                                  hipHostFree
+  #define cudaGetDevice                                 hipGetDevice
+  #define cudaGetDeviceCount                            hipGetDeviceCount
+  #define cudaGetErrorString                            hipGetErrorString
+  #define cudaGetLastError                              hipGetLastError
+  #define cudaHostAlloc                                 hipHostMalloc
+  #define cudaHostAllocDefault                          hipHostMallocDefault
+  #define cudaMalloc                                    hipMalloc
+  #define cudaMemcpy                                    hipMemcpy
+  #define cudaMemcpyAsync                               hipMemcpyAsync
+  #define cudaMemcpyPeer                                hipMemcpyPeer
+  #define cudaMemcpyDeviceToHost                        hipMemcpyDeviceToHost
+  #define cudaMemcpyDeviceToDevice                      hipMemcpyDeviceToDevice
+  #define cudaMemcpyHostToDevice                        hipMemcpyHostToDevice
+  #define cudaMemGetInfo                                hipMemGetInfo
+  #define cudaMemset                                    hipMemset
+  #define cudaReadModeElementType                       hipReadModeElementType
+  #define cudaSetDevice                                 hipSetDevice
+  #define cudaSuccess                                   hipSuccess
+  #define cudaDeviceProp                                hipDeviceProp_t
+  #define cudaGetDeviceProperties                       hipGetDeviceProperties
+  #define cudaPointerAttributes                         hipPointerAttribute_t
+  #define cudaPointerGetAttributes                      hipPointerGetAttributes
+  #define cudaOccupancyMaxPotentialBlockSize            hipOccupancyMaxPotentialBlockSize
+  #define cudaDeviceGetAttribute                        hipDeviceGetAttribute
+  #define cudaDevAttrCooperativeLaunch                  hipDeviceAttributeCooperativeLaunch
+  #define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
+  #define cudaLaunchCooperativeKernel                   hipLaunchCooperativeKernel
+  #define cudaMemGetInfo                                hipMemGetInfo
+  #define cudaDeviceGetPCIBusId                         hipDeviceGetPCIBusId
+  #define cudaPeekAtLastError                           hipPeekAtLastError
+  #define cudaFuncAttributes                            hipFuncAttributes
+  #define cudaFuncGetAttributes                         hipFuncGetAttributes
 
   // Texture definitions
   #define cudaArray           hipArray
@@ -113,6 +134,7 @@ static constexpr int maxWarpsPerBlock = 1024 / WARPSIZE;
 
 #else  // not O_HIP
 
+  #include <cooperative_groups.h>
   #include <cuda_runtime.h>
 
   #if defined(PARIS) || defined(PARIS_GALACTIC)
diff --git a/src/utils/math_utilities.h b/src/utils/math_utilities.h
index 39bf43091..2647302c3 100644
--- a/src/utils/math_utilities.h
+++ b/src/utils/math_utilities.h
@@ -129,4 +129,22 @@ inline __device__ __host__ void Cyclic_Permute_Twice(hydro_utilities::VectorXYZ<
 }
 // =====================================================================================================================
 
+/*!
+ * \brief When `val` lies within the inclusive range `[lo, hi]`, returns `val`. Otherwise, return the closest value in
+ * the range.
+ *
+ * \warning
+ * The behavior is undefined when `lo>hi` or any of the values are NaN.
+ *
+ * \note
+ * Instead of using this implementation, we may be able to use `std::clamp` (unclear if supported by CUDA/Rocm). We
+ * could also consider using CUDA/Rocm intrinsics to speed this up
+ */
+template <typename T>
+__device__ __host__ T clamp(T val, T lo, T hi)
+{
+  const T tmp = val < lo ? lo : val;
+  return tmp > hi ? hi : tmp;
+}
+
 }  // namespace math_utils
diff --git a/src/utils/reduction_utilities.h b/src/utils/reduction_utilities.h
index 99191d8c5..7b4412c7b 100644
--- a/src/utils/reduction_utilities.h
+++ b/src/utils/reduction_utilities.h
@@ -8,6 +8,7 @@
 #pragma once
 
 // STL Includes
+#include <climits>
 #include <cstdint>
 
 // External Includes
@@ -17,6 +18,62 @@
 #include "../global/global_cuda.h"
 #include "../utils/gpu.hpp"
 
+namespace reduction_utilities::backport
+{
+/*!
+ * \brief Do a device side bit cast
+ *
+ * \tparam To The output type
+ * \tparam From The input type
+ * \param from The input value
+ * \return To The bit cast version of From as type To
+ */
+template <class To, class From>
+__device__ constexpr To bit_cast(const From& from) noexcept
+{
+  // TODO: replace with `std::bit_cast` once we adopt C++20 or libcu++ adds it
+  To to{};
+  static_assert(sizeof(To) == sizeof(From));
+  memcpy(&to, &from, sizeof(To));
+  return to;
+}
+
+/*!
+ * \brief Atomically store `min(*address, val)` at `address` (backport built on top of `atomicCAS`)
+ *
+ * \param[in,out] address Pointer to the value, in device memory, that is atomically reduced
+ * \param[in] val The thread local value to compare against the stored value
+ * \return The value that was stored at `address` immediately before this call's update
+ */
+inline long long __device__ atomicMin(long long* address, long long val)
+{
+  // this uses the pattern recommended by CUDA docs for implementing atomics in terms of CAS
+  unsigned long long* address_as_ull = reinterpret_cast<unsigned long long*>(address);
+  unsigned long long old             = *address_as_ull;
+  unsigned long long assumed;
+
+  do {
+    assumed              = old;
+    long long assumed_LL = bit_cast<long long>(assumed);
+    long long newval_LL  = (assumed_LL < val) ? assumed_LL : val;
+    old                  = atomicCAS(address_as_ull, assumed, bit_cast<unsigned long long>(newval_LL));
+
+  } while (assumed != old);
+
+  return bit_cast<long long>(old);
+}
+
+}  // namespace reduction_utilities::backport
+
+#if defined(O_HIP) && (HIP_VERSION < 50700000)
+// HIP versions before 5.7 did not implement atomicMin (or atomicMax) for `long long`, so we
+// backport the function
+
+// expose atomicMin as part of the global namespace
+using ::reduction_utilities::backport::atomicMin;
+
+#endif  // defined(O_HIP) && (HIP_VERSION < 50700000)
+
 /*!
  * \brief Namespace to contain device resident reduction functions. Includes
  * functions and kernels for array reduction, warp level, block level, and
@@ -88,23 +145,6 @@ __inline__ __device__ Real blockReduceMax(Real val)
 // https://github.com/rapidsai/cuml/blob/dc14361ba11c41f7a4e1e6a3625bbadd0f52daf7/cpp/src_prims/stats/minmax.cuh
 // with slight tweaks for our use case.
 // =====================================================================
-/*!
- * \brief Do a device side bit cast
- *
- * \tparam To The output type
- * \tparam From The input type
- * \param from The input value
- * \return To The bit cast version of From as type To
- */
-template <class To, class From>
-__device__ constexpr To bit_cast(const From& from) noexcept
-{
-  // TODO: replace with `std::bitcast` once we adopt C++20 or libcu++ adds it
-  To to{};
-  static_assert(sizeof(To) == sizeof(From));
-  memcpy(&to, &from, sizeof(To));
-  return to;
-}
 
 /*!
  * \brief Encode a float as an int
@@ -114,7 +154,7 @@ __device__ constexpr To bit_cast(const From& from) noexcept
  */
 inline __device__ int encode(float val)
 {
-  int i = bit_cast<int>(val);
+  int i = backport::bit_cast<int>(val);
   return i >= 0 ? i : (1 << 31) | ~i;  // NOLINT(hicpp-signed-bitwise)
 }
 
@@ -126,7 +166,7 @@ inline __device__ int encode(float val)
  */
 inline __device__ long long encode(double val)
 {
-  auto i = bit_cast<std::int64_t>(val);
+  auto i = backport::bit_cast<std::int64_t>(val);
   return i >= 0 ? i : (1ULL << 63) | ~i;  // NOLINT(hicpp-signed-bitwise)
 }
 
@@ -141,7 +181,7 @@ inline __device__ float decode(int val)
   if (val < 0) {
     val = (1 << 31) | ~val;  // NOLINT(hicpp-signed-bitwise)
   }
-  return bit_cast<float>(val);
+  return backport::bit_cast<float>(val);
 }
 
 /*!
@@ -155,7 +195,7 @@ inline __device__ double decode(long long val)
   if (val < 0) {
     val = (1ULL << 63) | ~val;  // NOLINT(hicpp-signed-bitwise)
   }
-  return bit_cast<double>(val);
+  return backport::bit_cast<double>(val);
 }
 #endif  // O_HIP
 /*!
@@ -303,4 +343,43 @@ __inline__ __device__ void gridReduceMax(Real val, Real* out)
  */
 __global__ void kernelReduceMax(Real* in, Real* out, size_t N);
 // =====================================================================
+
+// =====================================================================
+/*! \brief Performs N sum-reductions on an input shared memory array within the block and adds the results
+ * to an N-element array.
+ *
+ * \tparam N the length of the output array (i.e. the number of values contributed by each thread)
+ * \tparam Blocksize the number of threads per block (NOTE(review): unused by the body, which reads blockDim.x)
+ *
+ * \param[out] dest destination memory address containing N entries. This is NOT ALLOWED to overlap with
+ *    src_shared
+ * \param[in,out] src_shared Source memory address. This MUST refer to a __shared__ memory region containing
+ *    N times Blocksize elements, overwritten in place. Entry k of thread t lives at src_shared[N * t + k].
+ *
+ * \note
+ * If passing src_shared to a function causes problems (since we are potentially obfuscating the __shared__
+ * descriptor), we can always convert this into a macro.
+ */
+template <std::size_t N, std::size_t Blocksize>
+__device__ void blockAccumulateIntoNReals(Real* __restrict__ dest, Real* __restrict__ src_shared)
+{
+  // tree reduction: each pass adds the N entries of thread (threadIdx.x + s) into those of threadIdx.x.
+  // NOTE(review): no barrier precedes the first pass, and halving only covers all threads if blockDim.x is 2^k
+  for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
+    if (threadIdx.x < s) {
+      for (unsigned int cur_ind = 0; cur_ind < N; cur_ind++) {
+        src_shared[N * threadIdx.x + cur_ind] += src_shared[N * (threadIdx.x + s) + cur_ind];
+      }
+    }
+    __syncthreads();
+  }
+
+  // src_shared[0 .. N-1] now holds this block's N sums; atomicAdd reduces them across all blocks
+  if (threadIdx.x == 0) {
+    for (unsigned int cur_ind = 0; cur_ind < N; cur_ind++) {
+      atomicAdd(dest + cur_ind,  // <- pointer arithmetic
+                src_shared[cur_ind]);
+    }
+  }
+}
 }  // namespace reduction_utilities
diff --git a/src/utils/reduction_utilities_tests.cu b/src/utils/reduction_utilities_tests.cu
index 528de22a7..f6e516bbc 100644
--- a/src/utils/reduction_utilities_tests.cu
+++ b/src/utils/reduction_utilities_tests.cu
@@ -7,7 +7,10 @@
  */
 
 // STL Includes
+#include <algorithm>
+#include <cstdio>
 #include <iostream>
+#include <limits>
 #include <random>
 #include <string>
 #include <vector>
@@ -22,6 +25,49 @@
 #include "../utils/reduction_utilities.h"
 #include "../utils/testing_utilities.h"
 
+long long perform_atomic_min(const std::vector<long long>& host_vals)
+{
+  // NOTE: the device lambda below apparently has to be defined outside of a googletest test case, which
+  // is the reason this helper exists as a free function
+
+  // single-element device buffer for the result; assign() is handed the largest long long
+  cuda_utilities::DeviceVector<long long> dev_result(1);
+  dev_result.assign(std::numeric_limits<long long>::max());
+
+  // device-side copy of every host value
+  cuda_utilities::DeviceVector<long long> dev_inputs(host_vals.size());
+  dev_inputs.cpyHostToDevice(host_vals);
+
+  // grab the raw pointers; the lambda captures them by value
+  long long* result_ptr       = dev_result.data();
+  const long long* inputs_ptr = dev_inputs.data();
+
+  // each index folds one input into the result slot through the function under test
+  auto fold_one = [inputs_ptr, result_ptr] __device__(int i) {
+    reduction_utilities::backport::atomicMin(result_ptr, inputs_ptr[i]);
+  };
+  gpuFor(host_vals.size(), fold_one);
+  return dev_result[0];
+}
+
+TEST(tALLBackports, AtomicMinLL)
+{
+  // inputs: 0..63, then overwrite a few slots with a duplicate and the two extreme long long values
+  std::vector<long long> inputs(64);
+  for (std::size_t idx = 0; idx < inputs.size(); ++idx) {
+    inputs[idx] = static_cast<long long>(idx);
+  }
+  inputs[1] = inputs[0];
+  inputs[2] = std::numeric_limits<long long>::min();
+  inputs[3] = std::numeric_limits<long long>::max();
+
+  // answer from the device-side backport vs. the reference answer from the host-side STL
+  const long long actual   = perform_atomic_min(inputs);
+  const long long expected = *(std::min_element(inputs.begin(), inputs.end()));
+
+  ASSERT_EQ(expected, actual) << "reduction_utilities::backport::atomicMin produced an unexpected result";
+}
+
 // =============================================================================
 // Tests for divergence max reduction
 // =============================================================================