diff --git a/src/include/OSL/platform.h b/src/include/OSL/platform.h
index c133ef89f..cf0ed36f1 100644
--- a/src/include/OSL/platform.h
+++ b/src/include/OSL/platform.h
@@ -488,7 +488,11 @@
 /// to use regular assert() for this purpose if you need to eliminate the
 /// dependency on this header from a particular place (and don't mind that
 /// assert won't format identically on all platforms).
-#ifndef NDEBUG
+///
+/// These macros are no-ops when compiling for CUDA because they were found
+/// to cause strange issues in device code (e.g., function bodies being
+/// eliminated when OSL_DASSERT is used).
+#if !defined(NDEBUG) && !defined(__CUDACC__)
 #    define OSL_DASSERT OSL_ASSERT
 #    define OSL_DASSERT_MSG OSL_ASSERT_MSG
 #else
diff --git a/src/testrender/CMakeLists.txt b/src/testrender/CMakeLists.txt
index 5f72bfbdb..7e89188e2 100644
--- a/src/testrender/CMakeLists.txt
+++ b/src/testrender/CMakeLists.txt
@@ -22,13 +22,23 @@ if (OSL_USE_OPTIX)
         )
 
     # We need to make sure that the PTX files are regenerated whenever these
-    # headers change.
+    # files change.
     set (testrender_cuda_headers
-        cuda/rend_lib.h)
+        cuda/rend_lib.h
+        background.h
+        optics.h
+        render_params.h
+        raytracer.h
+        sampling.h
+        shading.h
+        shading.cpp
+        simpleraytracer.cpp
+        cuda/vec_math.h
+        )
 
     # Generate PTX for all of the CUDA files
     foreach (cudasrc ${testrender_cuda_srcs})
-        NVCC_COMPILE ( ${cudasrc} "" ptx_generated "" )
+        NVCC_COMPILE ( ${cudasrc} "${testrender_cuda_headers}" ptx_generated "" )
         list (APPEND ptx_list ${ptx_generated})
     endforeach ()
 
@@ -48,7 +58,7 @@ if (OSL_USE_OPTIX)
     list (APPEND ptx_list ${rend_lib_ptx})
 
     add_custom_target (testrender_ptx ALL
-        DEPENDS ${ptx_list}
+        DEPENDS ${ptx_list} ${testrender_cuda_headers}
         SOURCES ${testrender_cuda_srcs} )
 
     # Install the PTX files in a fixed location so that they can be
diff --git a/src/testrender/background.h b/src/testrender/background.h
index ac196cf27..3c79dd927 100644
--- a/src/testrender/background.h
+++ b/src/testrender/background.h
@@ -10,17 +10,51 @@
 
 OSL_NAMESPACE_ENTER
 
+
+#ifdef __CUDACC__
+// std::upper_bound is not supported in device code, so define a version of it here.
+// Adapted from the LLVM Project, see https://llvm.org/LICENSE.txt for license information.
+template<typename T>
+inline OSL_HOSTDEVICE const T*
+upper_bound_cuda(const T* data, int count, const T value)
+{
+    const T* first = data;
+    const T value_ = value;
+    int len        = count;
+    while (len != 0) {
+        int l2     = len / 2;
+        const T* m = first;
+        m += l2;
+        if (value_ < *m)
+            len = l2;
+        else {
+            first = ++m;
+            len -= l2 + 1;
+        }
+    }
+    return first;
+}
+#endif
+
+
 struct Background {
+    OSL_HOSTDEVICE
     Background() : values(0), rows(0), cols(0) {}
+
+    OSL_HOSTDEVICE
     ~Background()
     {
+#ifndef __CUDACC__
         delete[] values;
         delete[] rows;
         delete[] cols;
+#endif
     }
 
-    template<typename F, typename T> void prepare(int resolution, F cb, T* data)
+    template<typename F, typename T>
+    void prepare(int resolution, F cb, T* data)
     {
+        // These values are set via set_variables() in CUDA
         res = resolution;
         if (res < 32)
             res = 32;  // validate
@@ -29,6 +63,7 @@ struct Background {
         values      = new Vec3[res * res];
         rows        = new float[res];
         cols        = new float[res * res];
+
         for (int y = 0, i = 0; y < res; y++) {
             for (int x = 0; x < res; x++, i++) {
                 values[i] = cb(map(x + 0.5f, y + 0.5f), data);
@@ -43,8 +78,9 @@ struct Background {
                     cols[i - res + x] /= cols[i - 1];
         }
         // normalize the pdf across all scanlines
-        for (int y = 0; y < res; y++)
+        for (int y = 0; y < res; y++) {
             rows[y] /= rows[res - 1];
+        }
 
         // both eval and sample below return a "weight" that is
         // value[i] / row*col_pdf, so might as well bake it into the table
@@ -65,6 +101,7 @@ struct Background {
 #endif
     }
 
+    OSL_HOSTDEVICE
     Vec3 eval(const Vec3& dir, float& pdf) const
     {
         // map from sphere to unit-square
@@ -90,6 +127,7 @@ struct Background {
         return values[i];
     }
 
+    OSL_HOSTDEVICE
     Vec3 sample(float rx, float ry, Dual2<Vec3>& dir, float& pdf) const
     {
         float row_pdf, col_pdf;
@@ -101,8 +139,96 @@ struct Background {
         return values[y * res + x];
     }
 
+#ifdef __CUDACC__
+    OSL_HOSTDEVICE
+    void set_variables(Vec3* values_in, float* rows_in, float* cols_in,
+                       int res_in)
+    {
+        values      = values_in;
+        rows        = rows_in;
+        cols        = cols_in;
+        res         = res_in;
+        invres      = __frcp_rn(res);
+        invjacobian = __fdiv_rn(res * res, float(4 * M_PI));
+        assert(res >= 32);
+    }
+
+    template<typename F>
+    OSL_HOSTDEVICE void prepare_cuda(int stride, int idx, F cb)
+    {
+        prepare_cuda_01(stride, idx, cb);
+        if (idx == 0)
+            prepare_cuda_02();
+        prepare_cuda_03(stride, idx);
+    }
+
+    // Pre-compute the 'values' table in parallel
+    template<typename F>
+    OSL_HOSTDEVICE void prepare_cuda_01(int stride, int idx, F cb)
+    {
+        for (int y = 0; y < res; y++) {
+            const int row_start = y * res;
+            const int row_end   = row_start + res;
+            int i               = row_start + idx;
+            for (int x = idx; x < res; x += stride, i += stride) {
+                if (i >= row_end)
+                    continue;
+                values[i] = cb(map(x + 0.5f, y + 0.5f));
+            }
+        }
+    }
+
+    // Compute 'cols' and 'rows' using a single thread
+    OSL_HOSTDEVICE void prepare_cuda_02()
+    {
+        for (int y = 0, i = 0; y < res; y++) {
+            for (int x = 0; x < res; x++, i++) {
+                cols[i] = std::max(std::max(values[i].x, values[i].y),
+                                   values[i].z)
+                          + ((x > 0) ? cols[i - 1] : 0.0f);
+            }
+            rows[y] = cols[i - 1] + ((y > 0) ? rows[y - 1] : 0.0f);
+            // normalize the pdf for this scanline (if it was non-zero)
+            if (cols[i - 1] > 0) {
+                for (int x = 0; x < res; x++) {
+                    cols[i - res + x] = __fdiv_rn(cols[i - res + x],
+                                                  cols[i - 1]);
+                }
+            }
+        }
+    }
+
+    // Normalize the row PDFs and finalize the 'values' table
+    OSL_HOSTDEVICE void prepare_cuda_03(int stride, int idx)
+    {
+        // normalize the pdf across all scanlines
+        for (int y = idx; y < res; y += stride) {
+            rows[y] = __fdiv_rn(rows[y], rows[res - 1]);
+        }
+
+        // both eval and sample below return a "weight" that is
+        // value[i] / row*col_pdf, so might as well bake it into the table
+        for (int y = 0; y < res; y++) {
+            float row_pdf       = rows[y] - (y > 0 ? rows[y - 1] : 0.0f);
+            const int row_start = y * res;
+            const int row_end   = row_start + res;
+            int i               = row_start + idx;
+            for (int x = idx; x < res; x += stride, i += stride) {
+                if (i >= row_end)
+                    continue;
+                float col_pdf       = cols[i] - (x > 0 ? cols[i - 1] : 0.0f);
+                const float divisor = __fmul_rn(__fmul_rn(row_pdf, col_pdf),
+                                                invjacobian);
+                values[i].x         = __fdiv_rn(values[i].x, divisor);
+                values[i].y         = __fdiv_rn(values[i].y, divisor);
+                values[i].z         = __fdiv_rn(values[i].z, divisor);
+            }
+        }
+    }
+#endif
+
 private:
-    Dual2<Vec3> map(float x, float y) const
+    OSL_HOSTDEVICE Dual2<Vec3> map(float x, float y) const
     {
         // pixel coordinates of entry (x,y)
         Dual2<float> u     = Dual2<float>(x, 1, 0) * invres;
@@ -115,14 +241,20 @@ struct Background {
         return make_Vec3(sin_phi * ct, sin_phi * st, cos_phi);
     }
 
-    static float sample_cdf(const float* data, unsigned int n, float x,
-                            unsigned int* idx, float* pdf)
+    static OSL_HOSTDEVICE float sample_cdf(const float* data, unsigned int n,
+                                           float x, unsigned int* idx,
+                                           float* pdf)
     {
-        OSL_DASSERT(x >= 0);
-        OSL_DASSERT(x < 1);
+        OSL_DASSERT(x >= 0.0f);
+        OSL_DASSERT(x < 1.0f);
+#ifndef __CUDACC__
         *idx = std::upper_bound(data, data + n, x) - data;
+#else
+        *idx = upper_bound_cuda(data, n, x) - data;
+#endif
         OSL_DASSERT(*idx < n);
         OSL_DASSERT(x < data[*idx]);
+
         float scaled_sample;
         if (*idx == 0) {
             *pdf          = data[0];
@@ -137,12 +269,12 @@ struct Background {
         return std::min(scaled_sample, 0.99999994f);
     }
 
-    Vec3* values;  // actual map
-    float* rows;   // probability of choosing a given row 'y'
-    float* cols;  // probability of choosing a given column 'x', given that we've chosen row 'y'
-    int res;       // resolution in pixels of the precomputed table
-    float invres;  // 1 / resolution
-    float invjacobian;
+    Vec3* values = nullptr;    // actual map
+    float* rows  = nullptr;    // probability of choosing a given row 'y'
+    float* cols  = nullptr;    // probability of choosing a given column 'x', given that we've chosen row 'y'
+    int res      = -1;         // resolution in pixels of the precomputed table
+    float invres = 0.0f;       // 1 / resolution
+    float invjacobian = 0.0f;
 };
 
 OSL_NAMESPACE_EXIT
diff --git a/src/testrender/cuda/optix_raytracer.cu b/src/testrender/cuda/optix_raytracer.cu
index 43e910db7..67acc3eb8 100644
--- a/src/testrender/cuda/optix_raytracer.cu
+++ b/src/testrender/cuda/optix_raytracer.cu
@@ -4,15 +4,26 @@
 
 
 #include <optix.h>
-
-#include "util.h"
-
 #include <optix_device.h>
 
 #include <OSL/hashes.h>
 
+#include "optix_raytracer.h"
 #include "rend_lib.h"
-#include "render_params.h"
+#include "vec_math.h"
+
+#include "../background.h"
+#include "../raytracer.h"
+#include "../render_params.h"
+#include "../sampling.h"
+
+// clang-format off
+// These files must be included in this specific order
+#include "../shading.h"
+#include "../shading.cpp"
+// clang-format on
+
+#include <cstdint>
 
 
 OSL_NAMESPACE_ENTER
@@ -34,6 +45,208 @@ __device__ __constant__ RenderParams render_params;
 }
 
 
+static __device__ void
+globals_from_hit(ShaderGlobalsType& sg, float radius = 0.0f, float spread = 0.0f,
+                 Ray::RayType raytype = Ray::RayType::CAMERA)
+{
+    ShaderGlobalsType local_sg;
+    // hit-kind 0: quad hit
+    //          1: sphere hit
+    optixDirectCall<void, unsigned int, float, float3, float3, ShaderGlobalsType*>(
+        optixGetHitKind(), optixGetPrimitiveIndex(), optixGetRayTmax(),
+        optixGetWorldRayOrigin(), optixGetWorldRayDirection(), &local_sg);
+    // Setup the ShaderGlobals
+    const float3 ray_direction = optixGetWorldRayDirection();
+    const float3 ray_origin    = optixGetWorldRayOrigin();
+    const float t_hit          = optixGetRayTmax();
+
+    // Construct a Ray in order to calculate P and its derivatives
+    Ray ray(F3_TO_V3(ray_origin), F3_TO_V3(ray_direction), radius, spread,
+            Ray::RayType::CAMERA);
+    Dual2<float> t(t_hit);
+    Dual2<Vec3> P = ray.point(t);
+
+    sg.I  = ray_direction;
+    sg.N  = normalize(optixTransformNormalFromObjectToWorldSpace(V3_TO_F3(local_sg.N)));
+    sg.Ng = normalize(optixTransformNormalFromObjectToWorldSpace(V3_TO_F3(local_sg.Ng)));
+    sg.P  = V3_TO_F3(P.val());
+    sg.dPdx        = V3_TO_F3(P.dx());
+    sg.dPdy        = V3_TO_F3(P.dy());
+    sg.dPdu        = local_sg.dPdu;
+    sg.dPdv        = local_sg.dPdv;
+    sg.u           = local_sg.u;
+    sg.v           = local_sg.v;
+    sg.Ci          = nullptr;
+    sg.surfacearea = local_sg.surfacearea;
+    sg.backfacing  = dot(V3_TO_F3(sg.N), V3_TO_F3(sg.I)) > 0.0f;
+    sg.shaderID    = local_sg.shaderID;
+
+    if (sg.backfacing) {
+        sg.N  = -sg.N;
+        sg.Ng = -sg.Ng;
+    }
+
+    sg.raytype        = raytype;
+    sg.flipHandedness = dot(V3_TO_F3(sg.N), cross(V3_TO_F3(sg.dPdx), V3_TO_F3(sg.dPdy))) < 0.0f;
+}
+
+
+static inline __device__ void
+execute_shader(ShaderGlobalsType& sg, char* closure_pool)
+{
+    if (sg.shaderID < 0) {
+        // TODO: should probably never get here ...
+        return;
+    }
+
+    // Pack the "closure pool" into one of the ShaderGlobals pointers
+    *(int*)&closure_pool[0] = 0;
+    sg.renderstate          = &closure_pool[0];
+
+    // Create some run-time options structs. The OSL shader fills in the structs
+    // as it executes, based on the options specified in the shader source.
+    NoiseOptCUDA noiseopt;
+    TextureOptCUDA textureopt;
+    TraceOptCUDA traceopt;
+
+    // Pack the pointers to the options structs in a faux "context",
+    // which is a rough stand-in for the host ShadingContext.
+    ShadingContextCUDA shading_context = { &noiseopt, &textureopt, &traceopt };
+    sg.context                         = &shading_context;
+
+    // Run the OSL callable
+    void* interactive_ptr = reinterpret_cast<void**>(
+        render_params.interactive_params)[sg.shaderID];
+    const unsigned int shaderIdx = 2u + sg.shaderID + 0u;
+    optixDirectCall<void, ShaderGlobalsType*, void*, void*, void*, int, void*>(
+        shaderIdx, &sg /*shaderglobals_ptr*/,
+        nullptr /*groupdata_ptr*/,
+        nullptr /*userdata_base_ptr*/,
+        nullptr /*output_base_ptr*/,
+        0 /*shadeindex - unused*/,
+        interactive_ptr /*interactive_params_ptr*/
+    );
+}
+
+
+static inline __device__ void
+trace_ray(OptixTraversableHandle handle, const Payload& payload, const float3& origin,
+          const float3& direction)
+{
+    uint32_t p0 = payload.raw[0];
+    uint32_t p1 = payload.raw[1];
+    uint32_t p2 = __float_as_uint(payload.radius);
+    uint32_t p3 = __float_as_uint(payload.spread);
+    uint32_t p4 = payload.raytype;
+
+    optixTrace(handle,                         // handle
+               origin,                         // origin
+               direction,                      // direction
+               1e-3f,                          // tmin
+               1e13f,                          // tmax
+               0,                              // ray time
+               OptixVisibilityMask(1),         // visibility mask
+               OPTIX_RAY_FLAG_DISABLE_ANYHIT,  // ray flags
+               0,                              // SBT offset
+               1,                              // SBT stride
+               0,                              // miss SBT offset
+               p0, p1, p2, p3, p4);
+};
+
+//
+// CudaScene
+//
+
+OSL_HOSTDEVICE bool
+CudaScene::intersect(const Ray& r, Dual2<float>& t, int& primID, void* sg) const
+{
+    Payload payload;
+    payload.sg_ptr  = reinterpret_cast<ShaderGlobalsType*>(sg);
+    payload.radius  = r.radius;
+    payload.spread  = r.spread;
+    payload.raytype = *reinterpret_cast<const Ray::RayType*>(&r.raytype);
+    TraceData tracedata(*payload.sg_ptr, primID);
+    trace_ray(handle, payload, V3_TO_F3(r.origin), V3_TO_F3(r.direction));
+    primID = tracedata.hit_id;
+    t      = tracedata.hit_t;
+    return (payload.sg_ptr->shaderID >= 0);
+}
+
+
+OSL_HOSTDEVICE float
+CudaScene::shapepdf(int primID, const Vec3& x, const Vec3& p) const
+{
+    SphereParams* spheres = (SphereParams*)spheres_buffer;
+    QuadParams* quads     = (QuadParams*)quads_buffer;
+    if (primID < num_spheres) {
+        const SphereParams& params = spheres[primID];
+        const OSL::Sphere sphere(F3_TO_V3(params.c), params.r, 0, false);
+        return sphere.shapepdf(x, p);
+    } else {
+        const QuadParams& params = quads[primID - num_spheres];
+        const OSL::Quad quad(F3_TO_V3(params.p), F3_TO_V3(params.ex),
+                             F3_TO_V3(params.ey), 0, false);
+        return quad.shapepdf(x, p);
+    }
+}
+
+
+OSL_HOSTDEVICE bool
+CudaScene::islight(int primID) const
+{
+    SphereParams* spheres = (SphereParams*)spheres_buffer;
+    QuadParams* quads     = (QuadParams*)quads_buffer;
+
+    if (primID < num_spheres)
+        return spheres[primID].isLight;
+    return quads[primID - num_spheres].isLight;
+}
+
+
+OSL_HOSTDEVICE Vec3
+CudaScene::sample(int primID, const Vec3& x, float xi, float yi,
+                  float& pdf) const
+{
+    SphereParams* spheres = (SphereParams*)spheres_buffer;
+    QuadParams* quads     = (QuadParams*)quads_buffer;
+    if (primID < num_spheres) {
+        const SphereParams& params = spheres[primID];
+        const OSL::Sphere sphere(F3_TO_V3(params.c), params.r, 0, false);
+        return sphere.sample(x, xi, yi, pdf);
+    } else {
+        const QuadParams& params = quads[primID - num_spheres];
+        const OSL::Quad quad(F3_TO_V3(params.p), F3_TO_V3(params.ex),
+                             F3_TO_V3(params.ey), 0, false);
+        return quad.sample(x, xi, yi, pdf);
+    }
+}
+
+
+OSL_HOSTDEVICE int
+CudaScene::num_prims() const
+{
+    return num_spheres + num_quads;
+}
+
+//------------------------------------------------------------------------------
+
+// Because clang++ 9.0 seems to have trouble with some of the texturing "intrinsics"
+// let's do the texture look-ups in this file.
+extern "C" __device__ float4
+osl_tex2DLookup(void* handle, float s, float t, float dsdx, float dtdx, float dsdy, float dtdy)
+{
+    const float2 dx = {dsdx, dtdx};
+    const float2 dy = {dsdy, dtdy};
+    cudaTextureObject_t texID = cudaTextureObject_t(handle);
+    return tex2DGrad<float4>(texID, s, t, dx, dy);
+}
+
+
+//
+// OptiX Programs
+//
+
+
 extern "C" __global__ void
 __miss__()
 {
@@ -51,12 +264,45 @@ __miss__()
 extern "C" __global__ void
 __raygen__setglobals()
 {
+    uint3 launch_dims  = optixGetLaunchDimensions();
+    uint3 launch_index = optixGetLaunchIndex();
+
     // Set global variables
-    OSL::pvt::osl_printf_buffer_start = render_params.osl_printf_buffer_start;
-    OSL::pvt::osl_printf_buffer_end   = render_params.osl_printf_buffer_end;
-    OSL::pvt::s_color_system          = render_params.color_system;
-    OSL::pvt::test_str_1              = render_params.test_str_1;
-    OSL::pvt::test_str_2              = render_params.test_str_2;
+    if (launch_index.x == 0 && launch_index.y == 0) {
+        OSL::pvt::osl_printf_buffer_start
+            = render_params.osl_printf_buffer_start;
+        OSL::pvt::osl_printf_buffer_end = render_params.osl_printf_buffer_end;
+        OSL::pvt::s_color_system        = render_params.color_system;
+        OSL::pvt::test_str_1            = render_params.test_str_1;
+        OSL::pvt::test_str_2            = render_params.test_str_2;
+    }
+
+    Background background;
+    background.set_variables((Vec3*)render_params.bg_values,
+                             (float*)render_params.bg_rows,
+                             (float*)render_params.bg_cols,
+                             render_params.bg_res);
+
+    SimpleRaytracer raytracer;
+    raytracer.background           = background;
+    raytracer.backgroundResolution = render_params.bg_id >= 0
+        ? render_params.bg_res
+        : 0;
+    raytracer.backgroundShaderID   = render_params.bg_id;
+    raytracer.max_bounces          = render_params.max_bounces;
+    raytracer.rr_depth             = 5;
+    raytracer.show_albedo_scale    = render_params.show_albedo_scale;
+
+    if (render_params.bg_id < 0)
+        return;
+
+    auto evaler = [&](const Dual2<Vec3>& dir) {
+        return raytracer.eval_background(dir, nullptr);
+    };
+
+    // Background::prepare_cuda must run on a single warp
+    assert(launch_index.x < 32 && launch_index.y == 0);
+    background.prepare_cuda(launch_dims.x, launch_index.x, evaler);
 }
 
 
@@ -67,37 +313,102 @@ __miss__setglobals()
 
 
 extern "C" __global__ void
-__raygen__()
+__closesthit__deferred()
 {
-    uint3 launch_dims  = optixGetLaunchDimensions();
-    uint3 launch_index = optixGetLaunchIndex();
-    const float3 eye   = render_params.eye;
-    const float3 dir   = render_params.dir;
-    const float3 cx    = render_params.cx;
-    const float3 cy    = render_params.cy;
-    const float invw   = render_params.invw;
-    const float invh   = render_params.invh;
-
-    // Compute the pixel coordinates
-    const float2 d = make_float2(static_cast<float>(launch_index.x) + 0.5f,
-                                 static_cast<float>(launch_index.y) + 0.5f);
-
-    // Make the ray for the current pixel
-    RayGeometry r;
-    r.origin    = eye;
-    r.direction = normalize(cx * (d.x * invw - 0.5f) + cy * (0.5f - d.y * invh)
-                            + dir);
-    optixTrace(render_params.traversal_handle, r.origin, r.direction, 1e-3f,
-               1e13f, 0, OptixVisibilityMask(1), OPTIX_RAY_FLAG_DISABLE_ANYHIT,
-               0, 1, 0);
+    Payload payload;
+    payload.get();
+    ShaderGlobalsType* sg_ptr = payload.sg_ptr;
+    TraceData* tracedata      = reinterpret_cast<TraceData*>(sg_ptr->tracedata);
+    globals_from_hit(*sg_ptr, payload.radius, payload.spread, payload.raytype);
+
+    const unsigned int hit_idx  = optixGetPrimitiveIndex();
+    const unsigned int hit_kind = optixGetHitKind();
+    if (hit_kind == 0) {
+        const QuadParams* quads = reinterpret_cast<const QuadParams*>(
+            render_params.quads_buffer);
+        tracedata->hit_id = quads[hit_idx].objID;
+    } else if (hit_kind == 1) {
+        const SphereParams* spheres = reinterpret_cast<const SphereParams*>(
+            render_params.spheres_buffer);
+        tracedata->hit_id = spheres[hit_idx].objID;
+    }
+    const float hit_t = optixGetRayTmax();
+    tracedata->hit_t  = *(uint32_t*)&hit_t;
 }
 
 
-// Because clang++ 9.0 seems to have trouble with some of the texturing "intrinsics"
-// let's do the texture look-ups in this file.
-extern "C" __device__ float4
-osl_tex2DLookup(void* handle, float s, float t)
+extern "C" __global__ void
+__raygen__deferred()
 {
-    cudaTextureObject_t texID = cudaTextureObject_t(handle);
-    return tex2D<float4>(texID, s, t);
+    Background background;
+    background.set_variables((Vec3*)render_params.bg_values,
+                             (float*)render_params.bg_rows,
+                             (float*)render_params.bg_cols,
+                             render_params.bg_res);
+
+    Color3 result(0, 0, 0);
+    const int aa = render_params.aa;
+    for (int si = 0, n = aa * aa; si < n; si++) {
+        uint3 launch_index = optixGetLaunchIndex();
+        Sampler sampler(launch_index.x, launch_index.y, si);
+        Vec3 j = sampler.get();
+        // warp distribution to approximate a tent filter [-1,+1)^2
+        j.x *= 2;
+        j.x = j.x < 1 ? sqrtf(j.x) - 1 : 1 - sqrtf(2 - j.x);
+        j.y *= 2;
+        j.y = j.y < 1 ? sqrtf(j.y) - 1 : 1 - sqrtf(2 - j.y);
+
+        if (render_params.no_jitter) {
+            j *= 0.0f;
+        }
+
+        // Compute the pixel coordinates
+        const float2 d
+            = make_float2(static_cast<float>(launch_index.x) + 0.5f + j.x,
+                          static_cast<float>(launch_index.y) + 0.5f + j.y);
+
+        SimpleRaytracer raytracer;
+        raytracer.background           = background;
+        raytracer.backgroundResolution = render_params.bg_id >= 0
+                                             ? render_params.bg_res
+                                             : 0;
+        raytracer.backgroundShaderID   = render_params.bg_id;
+        raytracer.max_bounces          = render_params.max_bounces;
+        raytracer.rr_depth             = 5;
+        raytracer.show_albedo_scale    = render_params.show_albedo_scale;
+
+        const Vec3 eye  = F3_TO_V3(render_params.eye);
+        const Vec3 dir  = F3_TO_V3(render_params.dir);
+        const Vec3 up   = F3_TO_V3(render_params.up);
+        const float fov = render_params.fov;
+
+        uint3 launch_dims = optixGetLaunchDimensions();
+        raytracer.camera.resolution(launch_dims.x, launch_dims.y);
+        raytracer.camera.lookat(eye, dir, up, fov);
+        raytracer.camera.finalize();
+
+        raytracer.scene = { render_params.num_spheres, render_params.num_quads,
+                            render_params.spheres_buffer,
+                            render_params.quads_buffer,
+                            render_params.traversal_handle };
+
+        Color3 r = raytracer.subpixel_radiance(d.x, d.y, sampler, nullptr);
+
+        result = OIIO::lerp(result, r, 1.0f / (si + 1));
+    }
+
+    uint3 launch_dims     = optixGetLaunchDimensions();
+    uint3 launch_index    = optixGetLaunchIndex();
+    float3* output_buffer = reinterpret_cast<float3*>(
+        render_params.output_buffer);
+    int pixel            = launch_index.y * launch_dims.x + launch_index.x;
+    output_buffer[pixel] = C3_TO_F3(result);
 }
+
+//------------------------------------------------------------------------------
+
+// We need to pull in the definition of SimpleRaytracer::subpixel_radiance(),
+// which is shared between the host and CUDA renderers.
+#include "../simpleraytracer.cpp"
+
+//------------------------------------------------------------------------------
diff --git a/src/testrender/cuda/optix_raytracer.h b/src/testrender/cuda/optix_raytracer.h
new file mode 100644
index 000000000..76d1f691d
--- /dev/null
+++ b/src/testrender/cuda/optix_raytracer.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include <OSL/oslconfig.h>
+#include <optix.h>
+
+#include "rend_lib.h"
+#include "../background.h"
+#include "../raytracer.h"
+#include "../sampling.h"
+
+#include <cstdint>
+
+#ifdef __CUDACC__
+
+struct Payload {
+    union {
+        uint32_t raw[2];
+        OSL_CUDA::ShaderGlobals* sg_ptr;
+    };
+    float radius;
+    float spread;
+    OSL::Ray::RayType raytype;
+
+    __forceinline__ __device__ void set()
+    {
+        optixSetPayload_0(raw[0]);
+        optixSetPayload_1(raw[1]);
+        optixSetPayload_2(__float_as_uint(radius));
+        optixSetPayload_3(__float_as_uint(spread));
+        optixSetPayload_4((uint32_t)raytype);
+    }
+
+    __forceinline__ __device__ void get()
+    {
+        raw[0]  = optixGetPayload_0();
+        raw[1]  = optixGetPayload_1();
+        radius  = __uint_as_float(optixGetPayload_2());
+        spread  = __uint_as_float(optixGetPayload_3());
+        raytype = (OSL::Ray::RayType)optixGetPayload_4();
+    }
+};
+
+OSL_NAMESPACE_ENTER
+
+struct CudaScene {
+    OSL_HOSTDEVICE bool intersect(const Ray& r, Dual2<float>& t, int& primID,
+                                  void* sg = nullptr) const;
+    OSL_HOSTDEVICE float shapepdf(int primID, const Vec3& x,
+                                  const Vec3& p) const;
+    OSL_HOSTDEVICE bool islight(int primID) const;
+    OSL_HOSTDEVICE Vec3 sample(int primID, const Vec3& x, float xi, float yi,
+                               float& pdf) const;
+    OSL_HOSTDEVICE int num_prims() const;
+
+    uint64_t num_spheres;
+    uint64_t num_quads;
+    CUdeviceptr spheres_buffer;
+    CUdeviceptr quads_buffer;
+    OptixTraversableHandle handle;
+};
+
+struct SimpleRaytracer {
+    using ShadingContext = ShadingContextCUDA;
+
+    Background background;
+    Camera camera;
+    CudaScene scene;
+    int aa                   = 1;
+    int backgroundResolution = 1024;
+    int backgroundShaderID   = -1;
+    int max_bounces          = 1000000;
+    int rr_depth             = 5;
+    float show_albedo_scale  = 0.0f;
+
+    OSL_HOSTDEVICE Vec3 eval_background(const Dual2<Vec3>& dir,
+                                        ShadingContext* ctx, int bounce = -1);
+    OSL_HOSTDEVICE Color3 subpixel_radiance(float x, float y, Sampler& sampler,
+                                            ShadingContext* ctx = nullptr);
+};
+
+OSL_NAMESPACE_EXIT
+
+#endif  // #ifdef __CUDACC__
diff --git a/src/testrender/cuda/quad.cu b/src/testrender/cuda/quad.cu
index ce6ad4e30..8d1ad2fa5 100644
--- a/src/testrender/cuda/quad.cu
+++ b/src/testrender/cuda/quad.cu
@@ -5,33 +5,45 @@
 
 #include <optix.h>
 
+#include "../raytracer.h"
+#include "optix_raytracer.h"
 #include "rend_lib.h"
-#include "render_params.h"
-#include "wrapper.h"
+#include "vec_math.h"
 
 
 extern "C" __device__ void
 __direct_callable__quad_shaderglobals(const unsigned int idx, const float t_hit,
                                       const float3 ray_origin,
                                       const float3 ray_direction,
-                                      ShaderGlobals* sg)
+                                      OSL_CUDA::ShaderGlobals* sg)
 {
     const GenericData* g_data = reinterpret_cast<const GenericData*>(
         optixGetSbtDataPointer());
     const QuadParams* g_quads = reinterpret_cast<const QuadParams*>(
         g_data->data);
-    const QuadParams& quad = g_quads[idx];
-    const float3 P         = ray_origin + t_hit * ray_direction;
-
-    float3 h = P - quad.p;
-
-    sg->N = sg->Ng  = quad.n;
-    sg->u           = dot(h, quad.ex) * quad.eu;
-    sg->v           = dot(h, quad.ey) * quad.ev;
-    sg->dPdu        = quad.ey;
-    sg->dPdv        = quad.ex;
-    sg->surfacearea = quad.a;
-    sg->shaderID    = quad.shaderID;
+    const QuadParams& params = g_quads[idx];
+    const float3 P           = ray_origin + t_hit * ray_direction;
+
+    sg->I = ray_direction;
+    sg->N = sg->Ng  = params.n;
+    sg->surfacearea = params.a;
+    sg->shaderID    = params.shaderID;
+    sg->backfacing  = dot(V3_TO_F3(sg->N), V3_TO_F3(sg->I)) > 0.0f;
+
+    if (sg->backfacing) {
+        sg->N  = -sg->N;
+        sg->Ng = -sg->Ng;
+    }
+
+    const OSL::Quad quad(F3_TO_V3(params.p), F3_TO_V3(params.ex),
+                         F3_TO_V3(params.ey), 0, false);
+    OSL::Vec3 dPdu, dPdv;
+    OSL::Dual2<OSL::Vec2> uv = quad.uv(F3_TO_V3(P), F3_TO_V3(sg->N), dPdu,
+                                       dPdv);
+    sg->u                    = uv.val().x;
+    sg->v                    = uv.val().y;
+    sg->dPdu                 = V3_TO_F3(dPdu);
+    sg->dPdv                 = V3_TO_F3(dPdv);
 }
 
 
@@ -42,21 +54,29 @@ __intersection__quad()
         optixGetSbtDataPointer());
     const QuadParams* g_quads = reinterpret_cast<const QuadParams*>(
         g_data->data);
+
+    Payload payload;
+    payload.get();
+    const OSL_CUDA::ShaderGlobals* sg_ptr
+        = reinterpret_cast<OSL_CUDA::ShaderGlobals*>(payload.sg_ptr);
+    const TraceData* tracedata = reinterpret_cast<TraceData*>(
+        sg_ptr->tracedata);
+    const int obj_id           = tracedata->obj_id;
     const unsigned int idx     = optixGetPrimitiveIndex();
-    const QuadParams& quad     = g_quads[idx];
+    const QuadParams& params   = g_quads[idx];
     const float3 ray_origin    = optixGetObjectRayOrigin();
     const float3 ray_direction = optixGetObjectRayDirection();
+    const bool self            = obj_id == params.objID;
 
-    float dn = dot(ray_direction, quad.n);
-    float en = dot(quad.p - ray_origin, quad.n);
-    if (dn * en > 0) {
-        float t  = en / dn;
-        float3 h = (ray_origin + ray_direction * t) - quad.p;
-        float dx = dot(h, quad.ex) * quad.eu;
-        float dy = dot(h, quad.ey) * quad.ev;
-
-        if (dx >= 0 && dx < 1.0f && dy >= 0 && dy < 1.0f
-            && t < optixGetRayTmax())
-            optixReportIntersection(t, RAYTRACER_HIT_QUAD);
-    }
+    if (self)
+        return;
+
+    const OSL::Quad quad(F3_TO_V3(params.p), F3_TO_V3(params.ex),
+                         F3_TO_V3(params.ey), 0, false);
+    const OSL::Ray ray(F3_TO_V3(ray_origin), F3_TO_V3(ray_direction),
+                       payload.radius, payload.spread, payload.raytype);
+    const OSL::Dual2<float> t = quad.intersect(ray, self);
+
+    if (t.val() != 0.0f && t.val() < optixGetRayTmax())
+        optixReportIntersection(t.val(), RAYTRACER_HIT_QUAD);
 }
diff --git a/src/testrender/cuda/rend_lib.cu b/src/testrender/cuda/rend_lib.cu
index 9d59345e7..165a1f9d2 100644
--- a/src/testrender/cuda/rend_lib.cu
+++ b/src/testrender/cuda/rend_lib.cu
@@ -58,7 +58,7 @@ closure_component_allot(void* pool, int id, size_t prim_size,
     ((OSL::ClosureComponent*)pool)->id = id;
     ((OSL::ClosureComponent*)pool)->w  = w;
 
-    size_t needed = (sizeof(OSL::ClosureComponent) - sizeof(void*) + prim_size
+    size_t needed = (sizeof(OSL::ClosureComponent) + prim_size
                      + (alignof(OSL::ClosureComponent) - 1))
                     & ~(alignof(OSL::ClosureComponent) - 1);
     char* char_ptr = (char*)pool;
@@ -120,7 +120,7 @@ closure_add_allot(void* pool, OSL::ClosureColor* a, OSL::ClosureColor* b)
 __device__ void*
 osl_allocate_closure_component(void* sg_, int id, int size)
 {
-    ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_;
+    OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_;
 
     OSL::Color3 w = OSL::Color3(1, 1, 1);
     // Fix up the alignment
@@ -141,7 +141,7 @@ __device__ void*
 osl_allocate_weighted_closure_component(void* sg_, int id, int size,
                                         const void* w)
 {
-    ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_;
+    OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_;
 
     const OSL::Color3* wc
         = (const OSL::Color3*)__builtin_assume_aligned(w, alignof(float));
@@ -166,7 +166,7 @@ osl_allocate_weighted_closure_component(void* sg_, int id, int size,
 __device__ void*
 osl_mul_closure_color(void* sg_, void* a, const void* w)
 {
-    ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_;
+    OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_;
     const OSL::Color3* wc
         = (const OSL::Color3*)__builtin_assume_aligned(w, alignof(float));
 
@@ -198,7 +198,7 @@ osl_mul_closure_float(void* sg_, void* a, float w)
 {
     a = __builtin_assume_aligned(a, alignof(float));
 
-    ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_;
+    OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_;
 
     if (a == NULL || w == 0.0f) {
         return NULL;
@@ -226,7 +226,7 @@ osl_add_closure_closure(void* sg_, void* a, void* b)
     a = __builtin_assume_aligned(a, alignof(float));
     b = __builtin_assume_aligned(b, alignof(float));
 
-    ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_;
+    OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_;
 
     if (a == NULL) {
         return b;
@@ -359,7 +359,7 @@ osl_printf(void* sg_, OSL::ustringhash_pod fmt_str_hash, void* args)
 __device__ void*
 osl_get_noise_options(void* sg_)
 {
-    ShaderGlobals* sg = ((ShaderGlobals*)sg_);
+    OSL_CUDA::ShaderGlobals* sg = ((OSL_CUDA::ShaderGlobals*)sg_);
     NoiseOptCUDA* opt
         = (NoiseOptCUDA*)((ShadingContextCUDA*)sg->context)->noise_options_ptr();
     new (opt) NoiseOptCUDA;
@@ -403,7 +403,7 @@ make_float3(const float4& a)
 // optix_raytrace.cu).
 // (clang++ 9.0 error 'undefined __nv_tex_surf_handler')
 extern __device__ float4
-osl_tex2DLookup(void* handle, float s, float t);
+osl_tex2DLookup(void* handle, float s, float t, float dsdx, float dtdx, float dsdy, float dtdy);
 
 __device__ int
 osl_texture(void* sg_, OSL::ustringhash_pod name, void* handle, void* opt_,
@@ -415,7 +415,7 @@ osl_texture(void* sg_, OSL::ustringhash_pod name, void* handle, void* opt_,
     if (!handle)
         return 0;
     // cudaTextureObject_t texID = cudaTextureObject_t(handle);
-    float4 fromTexture = osl_tex2DLookup(handle, s, t);
+    float4 fromTexture = osl_tex2DLookup(handle, s, t, dsdx, dtdx, dsdy, dtdy);
     // see note above
     // float4 fromTexture = tex2D<float4>(texID, s, t);
     *((float3*)result) = make_float3(fromTexture.x, fromTexture.y,
@@ -444,9 +444,9 @@ osl_range_check_err(int indexvalue, int length, OSL::ustringhash_pod symname,
 __device__ int
 osl_get_matrix(void* sg_, void* r, OSL::ustringhash_pod from_)
 {
-    r                     = __builtin_assume_aligned(r, alignof(float));
-    OSL::ustringhash from = OSL::ustringhash_from(from_);
-    ShaderGlobals* sg     = (ShaderGlobals*)sg_;
+    r                           = __builtin_assume_aligned(r, alignof(float));
+    OSL::ustringhash from       = OSL::ustringhash_from(from_);
+    OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_;
     if (from == OSL::Hashes::common) {
         MAT(r).makeIdentity();
         return true;
@@ -489,9 +489,9 @@ osl_get_matrix(void* sg_, void* r, OSL::ustringhash_pod from_)
 __device__ int
 osl_get_inverse_matrix(void* sg_, void* r, OSL::ustringhash_pod to_)
 {
-    r                   = __builtin_assume_aligned(r, alignof(float));
-    OSL::ustringhash to = OSL::ustringhash_from(to_);
-    ShaderGlobals* sg   = (ShaderGlobals*)sg_;
+    r                           = __builtin_assume_aligned(r, alignof(float));
+    OSL::ustringhash to         = OSL::ustringhash_from(to_);
+    OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_;
     if (to == OSL::Hashes::common) {
         MAT(r).makeIdentity();
         return true;
diff --git a/src/testrender/cuda/rend_lib.h b/src/testrender/cuda/rend_lib.h
index 0fe9a027d..4be40a15b 100644
--- a/src/testrender/cuda/rend_lib.h
+++ b/src/testrender/cuda/rend_lib.h
@@ -17,6 +17,12 @@
 #include <OSL/hashes.h>
 #include <OSL/oslexec.h>
 
+#include "../raytracer.h"
+
+
+#define RAYTRACER_HIT_QUAD   0
+#define RAYTRACER_HIT_SPHERE 1
+
 
 OSL_NAMESPACE_ENTER
 
@@ -87,19 +93,20 @@ struct ShadingContextCUDA {
 };
 
 
+namespace OSL_CUDA {
 struct ShaderGlobals {
-    float3 P, dPdx, dPdy;
-    float3 dPdz;
-    float3 I, dIdx, dIdy;
-    float3 N;
-    float3 Ng;
+    OSL::Vec3 P, dPdx, dPdy;
+    OSL::Vec3 dPdz;
+    OSL::Vec3 I, dIdx, dIdy;
+    OSL::Vec3 N;
+    OSL::Vec3 Ng;
     float u, dudx, dudy;
     float v, dvdx, dvdy;
-    float3 dPdu, dPdv;
+    OSL::Vec3 dPdu, dPdv;
     float time;
     float dtime;
-    float3 dPdtime;
-    float3 Ps, dPsdx, dPsdy;
+    OSL::Vec3 dPdtime;
+    OSL::Vec3 Ps, dPsdx, dPsdy;
     void* renderstate;
     void* tracedata;
     void* objdata;
@@ -117,86 +124,25 @@ struct ShaderGlobals {
     int backfacing;
     int shaderID;
 };
-
-
-enum RayType {
-    CAMERA       = 1,
-    SHADOW       = 2,
-    REFLECTION   = 4,
-    REFRACTION   = 8,
-    DIFFUSE      = 16,
-    GLOSSY       = 32,
-    SUBSURFACE   = 64,
-    DISPLACEMENT = 128
-};
-
-
-// Closures supported by the OSL sample renderer.  This list is mostly aspirational.
-enum ClosureIDs {
-    EMISSION_ID = 1,
-    BACKGROUND_ID,
-    DIFFUSE_ID,
-    OREN_NAYAR_ID,
-    TRANSLUCENT_ID,
-    PHONG_ID,
-    WARD_ID,
-    MICROFACET_ID,
-    REFLECTION_ID,
-    FRESNEL_REFLECTION_ID,
-    REFRACTION_ID,
-    TRANSPARENT_ID,
-    DEBUG_ID,
-    HOLDOUT_ID,
-};
-
-
-// ========================================
-//
-// Some helper vector functions
-//
-static __forceinline__ __device__ float3
-operator*(const float a, const float3& b)
-{
-    return make_float3(a * b.x, a * b.y, a * b.z);
-}
-
-static __forceinline__ __device__ float3
-operator*(const float3& a, const float b)
-{
-    return make_float3(a.x * b, a.y * b, a.z * b);
 }
 
-static __forceinline__ __device__ float3
-operator+(const float3& a, const float3& b)
-{
-    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-}
-
-static __forceinline__ __device__ float3
-operator-(const float3& a, const float3& b)
-{
-    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-}
-
-static __forceinline__ __device__ float3
-operator-(const float3& a)
-{
-    return make_float3(-a.x, -a.y, -a.z);
-}
-
-static __forceinline__ __device__ float
-dot(const float3& a, const float3& b)
-{
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-
-static __forceinline__ __device__ float3
-normalize(const float3& v)
-{
-    float invLen = 1.0f / sqrtf(dot(v, v));
-    return invLen * v;
-}
-//
-// ========================================
+struct TraceData {
+    // OUT: The ID of the object that was hit
+    int32_t hit_id;
+    union {
+        // IN:  Set before calling trace to avoid self-intersection
+        int32_t obj_id;
+        // OUT: The hit distance
+        float hit_t;
+    };
+
+    OSL_HOSTDEVICE TraceData(OSL_CUDA::ShaderGlobals& sg, int id)
+    {
+        hit_id       = -1;
+        obj_id       = id;
+        sg.shaderID  = -1;  // Set in the CH program
+        sg.tracedata = (void*)this;
+    }
+};
 
 }  // anonymous namespace
diff --git a/src/testrender/cuda/sphere.cu b/src/testrender/cuda/sphere.cu
index c09a7ae32..fdefd366b 100644
--- a/src/testrender/cuda/sphere.cu
+++ b/src/testrender/cuda/sphere.cu
@@ -6,42 +6,10 @@
 #include <optix.h>
 
 
+#include "../raytracer.h"
+#include "optix_raytracer.h"
 #include "rend_lib.h"
-#include "render_params.h"
-#include "wrapper.h"
-
-
-static __device__ __inline__ void
-calc_uv(float3 shading_normal, float& u, float& v, float3& dPdu, float3& dPdv)
-{
-    const float3 n = shading_normal;
-
-    const float nx = n.x;
-    const float ny = n.y;
-    const float nz = n.z;
-
-    u = (atan2(nx, nz) + M_PI) * 0.5f * float(M_1_PI);
-    v = acos(ny) * float(M_1_PI);
-
-    float xz2 = nx * nx + nz * nz;
-    if (xz2 > 0.0f) {
-        const float PI    = float(M_PI);
-        const float TWOPI = float(2 * M_PI);
-        float xz          = sqrtf(xz2);
-        float inv         = 1.0f / xz;
-        dPdu              = make_float3(-TWOPI * nx, TWOPI * nz, 0.0f);
-        dPdv = make_float3(-PI * nz * inv * ny, -PI * nx * inv * ny, PI * xz);
-    } else {
-        // pick arbitrary axes for poles to avoid division by 0
-        if (ny > 0.0f) {
-            dPdu = make_float3(0.0f, 0.0f, 1.0f);
-            dPdv = make_float3(1.0f, 0.0f, 0.0f);
-        } else {
-            dPdu = make_float3(0.0f, 0.0f, 1.0f);
-            dPdv = make_float3(-1.0f, 0.0f, 0.0f);
-        }
-    }
-}
+#include "vec_math.h"
 
 
 extern "C" __device__ void
@@ -49,20 +17,27 @@ __direct_callable__sphere_shaderglobals(const unsigned int idx,
                                         const float t_hit,
                                         const float3 ray_origin,
                                         const float3 ray_direction,
-                                        ShaderGlobals* sg)
+                                        OSL_CUDA::ShaderGlobals* sg)
 {
     const GenericData* g_data = reinterpret_cast<const GenericData*>(
         optixGetSbtDataPointer());
     const SphereParams* g_spheres = reinterpret_cast<const SphereParams*>(
         g_data->data);
-    const SphereParams& sphere = g_spheres[idx];
+    const SphereParams& params = g_spheres[idx];
     const float3 P             = ray_origin + t_hit * ray_direction;
 
-    sg->N = sg->Ng  = normalize(P - sphere.c);
-    sg->surfacearea = sphere.a;
-    sg->shaderID    = sphere.shaderID;
-
-    calc_uv(sg->N, sg->u, sg->v, sg->dPdu, sg->dPdv);
+    sg->N = sg->Ng  = normalize(P - params.c);
+    sg->surfacearea = params.a;
+    sg->shaderID    = params.shaderID;
+
+    const OSL::Sphere sphere(F3_TO_V3(params.c), params.r, 0, false);
+    OSL::Vec3 dPdu, dPdv;
+    OSL::Dual2<OSL::Vec2> uv = sphere.uv(F3_TO_V3(P), F3_TO_V3(sg->N), dPdu,
+                                         dPdv);
+    sg->u                    = uv.val().x;
+    sg->v                    = uv.val().y;
+    sg->dPdu                 = V3_TO_F3(dPdu);
+    sg->dPdv                 = V3_TO_F3(dPdv);
 }
 
 
@@ -73,24 +48,25 @@ __intersection__sphere()
         optixGetSbtDataPointer());
     const SphereParams* g_spheres = reinterpret_cast<const SphereParams*>(
         g_data->data);
+
+    Payload payload;
+    payload.get();
+    const OSL_CUDA::ShaderGlobals* sg_ptr
+        = reinterpret_cast<OSL_CUDA::ShaderGlobals*>(payload.sg_ptr);
+    const TraceData* tracedata = reinterpret_cast<TraceData*>(
+        sg_ptr->tracedata);
+    const int obj_id           = tracedata->obj_id;
     const unsigned int idx     = optixGetPrimitiveIndex();
-    const SphereParams& sphere = g_spheres[idx];
+    const SphereParams& params = g_spheres[idx];
     const float3 ray_origin    = optixGetObjectRayOrigin();
     const float3 ray_direction = optixGetObjectRayDirection();
+    const bool self            = obj_id == params.objID;
 
-    float3 oc = sphere.c - ray_origin;
-    float b   = dot(oc, ray_direction);
-    float det = b * b - dot(oc, oc) + sphere.r2;
-    if (det >= 0.0f) {
-        det     = sqrtf(det);
-        float x = b - det;
-        float y = b + det;
-
-        // NB: this does not included the 'self' check from
-        // the testrender sphere intersection
-        float t = (x > 0) ? x : ((y > 0) ? y : 0);
+    const OSL::Sphere sphere(F3_TO_V3(params.c), params.r, 0, false);
+    const OSL::Ray ray(F3_TO_V3(ray_origin), F3_TO_V3(ray_direction),
+                       payload.radius, payload.spread, payload.raytype);
+    const OSL::Dual2<float> t = sphere.Sphere::intersect(ray, self);
 
-        if (t < optixGetRayTmax())
-            optixReportIntersection(t, RAYTRACER_HIT_SPHERE);
-    }
+    if (t.val() != 0.0f && t.val() < optixGetRayTmax())
+        optixReportIntersection(t.val(), RAYTRACER_HIT_SPHERE);
 }
diff --git a/src/testrender/cuda/util.h b/src/testrender/cuda/util.h
deleted file mode 100644
index 2d6a18b24..000000000
--- a/src/testrender/cuda/util.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright Contributors to the Open Shading Language project.
-// SPDX-License-Identifier: BSD-3-Clause
-// https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
-
-
-#pragma once
-
-#include <optix.h>
-
-#include <stdint.h>
-
-
-struct PRD_radiance {
-    float3 result;
-};
-
-
-struct RayGeometry {
-    float3 origin;
-    float3 direction;
-};
-
-
-static __device__ __inline__ uchar4
-make_color(const float3& c)
-{
-    return make_uchar4(
-        static_cast<unsigned char>(__saturatef(c.z) * 255.99f), /* B */
-        static_cast<unsigned char>(__saturatef(c.y) * 255.99f), /* G */
-        static_cast<unsigned char>(__saturatef(c.x) * 255.99f), /* R */
-        255u);                                                  /* A */
-}
diff --git a/src/testrender/cuda/vec_math.h b/src/testrender/cuda/vec_math.h
new file mode 100644
index 000000000..ca8f5e157
--- /dev/null
+++ b/src/testrender/cuda/vec_math.h
@@ -0,0 +1,95 @@
+// Copyright Contributors to the Open Shading Language project.
+// SPDX-License-Identifier: BSD-3-Clause
+// https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
+
+
+#pragma once
+
+#include <OSL/dual_vec.h>
+
+#include <vector_functions.h>
+#include <vector_types.h>
+
+#if !defined(__CUDACC_RTC__)
+#include <cmath>
+#include <cstdlib>
+#endif
+
+namespace {  // anonymous namespace
+
+// ========================================
+//
+// Some helper vector functions
+//
+static __forceinline__ __device__ float3
+operator*(const float a, const float3& b)
+{
+    return make_float3(a * b.x, a * b.y, a * b.z);
+}
+
+static __forceinline__ __device__ float3
+operator*(const float3& a, const float b)
+{
+    return make_float3(a.x * b, a.y * b, a.z * b);
+}
+
+static __forceinline__ __device__ float3
+operator+(const float3& a, const float3& b)
+{
+    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+
+static __forceinline__ __device__ float3
+operator-(const float3& a, const float3& b)
+{
+    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+
+static __forceinline__ __device__ float3
+operator-(const float3& a)
+{
+    return make_float3(-a.x, -a.y, -a.z);
+}
+
+static __forceinline__ __device__ float
+dot(const float3& a, const float3& b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+static __forceinline__ __device__ float3
+normalize(const float3& v)
+{
+    float invLen = 1.0f / sqrtf(dot(v, v));
+    return invLen * v;
+}
+
+static __forceinline__ __device__
+float3 cross(const float3& a, const float3& b)
+{
+  return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
+}
+
+
+static __forceinline__ __device__
+float length(const float3& v)
+{
+    return __fsqrt_rn((v.x * v.x) + (v.y * v.y) + (v.z * v.z));
+}
+
+
+static __forceinline__ __device__
+void ortho(const float3& n, float3& x, float3& y)
+{
+    x = normalize(fabsf(n.x) > .01f ? make_float3(n.z, 0, -n.x) : make_float3(0, -n.z, n.y));
+    y = cross(n, x);
+}
+
+
+}  // anonymous namespace
+
+// Conversion macros for casting between vector types
+#define F3_TO_V3(f3) (*reinterpret_cast<const OSL::Vec3*>(&f3))
+#define F3_TO_C3(f3) (*reinterpret_cast<const OSL::Color3*>(&f3))
+#define V3_TO_F3(v3) (*reinterpret_cast<const float3*>(&v3))
+#define C3_TO_F3(c3) (*reinterpret_cast<const float3*>(&c3))
diff --git a/src/testrender/cuda/wrapper.cu b/src/testrender/cuda/wrapper.cu
index 79c1dbce9..c14105c9e 100644
--- a/src/testrender/cuda/wrapper.cu
+++ b/src/testrender/cuda/wrapper.cu
@@ -10,10 +10,14 @@
 
 #include <OSL/oslclosure.h>
 
+#include "optix_raytracer.h"
 #include "rend_lib.h"
-#include "util.h"
+#include "vec_math.h"
 
 #include "../render_params.h"
+#include "../shading.h"
+
+using OSL_CUDA::ShaderGlobals;
 
 
 extern "C" {
@@ -30,25 +34,25 @@ __anyhit__any_hit_shadow()
 
 
 static __device__ void
-globals_from_hit(ShaderGlobals& sg)
+globals_from_hit(OSL_CUDA::ShaderGlobals& sg)
 {
     const GenericRecord* record = reinterpret_cast<GenericRecord*>(
         optixGetSbtDataPointer());
 
-    ShaderGlobals local_sg;
+    OSL_CUDA::ShaderGlobals local_sg;
     // hit-kind 0: quad hit
     //          1: sphere hit
-    optixDirectCall<void, unsigned int, float, float3, float3, ShaderGlobals*>(
+    optixDirectCall<void, unsigned int, float, float3, float3, OSL_CUDA::ShaderGlobals*>(
         optixGetHitKind(), optixGetPrimitiveIndex(), optixGetRayTmax(),
         optixGetWorldRayOrigin(), optixGetWorldRayDirection(), &local_sg);
     // Setup the ShaderGlobals
     const float3 ray_direction = optixGetWorldRayDirection();
     const float3 ray_origin    = optixGetWorldRayOrigin();
-    const float t_hit          = optixGetRayTmin();
+    const float t_hit          = optixGetRayTmax();
 
     sg.I  = ray_direction;
-    sg.N  = normalize(optixTransformNormalFromObjectToWorldSpace(local_sg.N));
-    sg.Ng = normalize(optixTransformNormalFromObjectToWorldSpace(local_sg.Ng));
+    sg.N  = normalize(optixTransformNormalFromObjectToWorldSpace(V3_TO_F3(local_sg.N)));
+    sg.Ng = normalize(optixTransformNormalFromObjectToWorldSpace(V3_TO_F3(local_sg.Ng)));
     sg.P  = ray_origin + t_hit * ray_direction;
     sg.dPdu        = local_sg.dPdu;
     sg.dPdv        = local_sg.dPdv;
@@ -56,7 +60,7 @@ globals_from_hit(ShaderGlobals& sg)
     sg.v           = local_sg.v;
     sg.Ci          = NULL;
     sg.surfacearea = local_sg.surfacearea;
-    sg.backfacing  = dot(sg.N, sg.I) > 0.0f;
+    sg.backfacing  = dot(V3_TO_F3(sg.N), V3_TO_F3(sg.I)) > 0.0f;
     sg.shaderID    = local_sg.shaderID;
 
     if (sg.backfacing) {
@@ -65,7 +69,7 @@ globals_from_hit(ShaderGlobals& sg)
     }
 
     // NB: These variables are not used in the current iteration of the sample
-    sg.raytype        = CAMERA;
+    sg.raytype        = OSL::Ray::CAMERA;
     sg.flipHandedness = 0;
 }
 
@@ -95,38 +99,39 @@ process_closure(const OSL::ClosureColor* closure_tree)
 
     const void* cur = closure_tree;
     while (cur) {
-        switch (((OSL::ClosureColor*)cur)->id) {
-        case OSL::ClosureColor::ADD: {
+        ClosureIDs id = static_cast<ClosureIDs>(((OSL::ClosureColor*)cur)->id);
+        switch (id) {
+        case ClosureIDs::ADD: {
             ptr_stack[stack_idx]      = ((OSL::ClosureAdd*)cur)->closureB;
             weight_stack[stack_idx++] = weight;
             cur                       = ((OSL::ClosureAdd*)cur)->closureA;
             break;
         }
 
-        case OSL::ClosureColor::MUL: {
+        case ClosureIDs::MUL: {
             weight *= ((OSL::ClosureMul*)cur)->weight;
             cur = ((OSL::ClosureMul*)cur)->closure;
             break;
         }
 
-        case EMISSION_ID: {
+        case ClosureIDs::EMISSION_ID: {
             cur = NULL;
             break;
         }
 
-        case DIFFUSE_ID:
-        case OREN_NAYAR_ID:
-        case PHONG_ID:
-        case WARD_ID:
-        case REFLECTION_ID:
-        case REFRACTION_ID:
-        case FRESNEL_REFLECTION_ID: {
+        case ClosureIDs::DIFFUSE_ID:
+        case ClosureIDs::OREN_NAYAR_ID:
+        case ClosureIDs::PHONG_ID:
+        case ClosureIDs::WARD_ID:
+        case ClosureIDs::REFLECTION_ID:
+        case ClosureIDs::REFRACTION_ID:
+        case ClosureIDs::FRESNEL_REFLECTION_ID: {
             result += ((OSL::ClosureComponent*)cur)->w * weight;
             cur = NULL;
             break;
         }
 
-        case MICROFACET_ID: {
+        case ClosureIDs::MICROFACET_ID: {
             const char* mem = (const char*)((OSL::ClosureComponent*)cur)->data();
             OSL::ustringhash dist_uh = *(OSL::ustringhash*)&mem[0];
 
@@ -161,7 +166,7 @@ __closesthit__closest_hit_osl()
     //       exceeded.
     alignas(8) char closure_pool[256];
 
-    ShaderGlobals sg;
+    OSL_CUDA::ShaderGlobals sg;
     globals_from_hit(sg);
 
     // Pack the "closure pool" into one of the ShaderGlobals pointers
@@ -184,7 +189,7 @@ __closesthit__closest_hit_osl()
     void* interactive_ptr = reinterpret_cast<void**>(
         render_params.interactive_params)[sg.shaderID];
     const unsigned int shaderIdx = 2u + sg.shaderID + 0u;
-    optixDirectCall<void, ShaderGlobals*, void*, void*, void*, int, void*>(
+    optixDirectCall<void, OSL_CUDA::ShaderGlobals*, void*, void*, void*, int, void*>(
         shaderIdx, &sg /*shaderglobals_ptr*/, nullptr /*groupdata_ptr*/,
         nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/,
         0 /*shadeindex - unused*/, interactive_ptr /*interactive_params_ptr*/);
diff --git a/src/testrender/cuda/wrapper.h b/src/testrender/cuda/wrapper.h
deleted file mode 100644
index c939f6064..000000000
--- a/src/testrender/cuda/wrapper.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#pragma once
-
-#define RAYTRACER_HIT_QUAD   0
-#define RAYTRACER_HIT_SPHERE 1
diff --git a/src/testrender/optics.h b/src/testrender/optics.h
index 5a709a065..c8f6ae166 100644
--- a/src/testrender/optics.h
+++ b/src/testrender/optics.h
@@ -9,7 +9,8 @@
 
 OSL_NAMESPACE_ENTER
 
-inline float
+
+inline OSL_HOSTDEVICE float
 fresnel_dielectric(float cosi, float eta)
 {
     // special case: ignore fresnel
@@ -30,7 +31,7 @@ fresnel_dielectric(float cosi, float eta)
     return 1.0f;  // TIR (no refracted component)
 }
 
-inline float
+inline OSL_HOSTDEVICE float
 fresnel_refraction(const Vec3& I, const Vec3& N, float eta, Vec3& T)
 {
     // compute refracted direction and fresnel term
@@ -63,7 +64,7 @@ fresnel_refraction(const Vec3& I, const Vec3& N, float eta, Vec3& T)
     return 0;
 }
 
-Color3
+OSL_HOSTDEVICE Color3
 fresnel_conductor(float cos_theta, Color3 n, Color3 k)
 {
     cos_theta       = OIIO::clamp(cos_theta, 0.0f, 1.0f);
@@ -89,7 +90,7 @@ fresnel_conductor(float cos_theta, Color3 n, Color3 k)
     return 0.5f * (rp + rs);
 }
 
-inline float
+inline OSL_HOSTDEVICE float
 fresnel_schlick(float cos_theta, float F0, float F90)
 {
     float x  = OIIO::clamp(1.0f - cos_theta, 0.0f, 1.0f);
@@ -99,7 +100,7 @@ fresnel_schlick(float cos_theta, float F0, float F90)
     return OIIO::lerp(F0, F90, x5);
 }
 
-inline Color3
+inline OSL_HOSTDEVICE Color3
 fresnel_generalized_schlick(float cos_theta, Color3 F0, Color3 F90,
                             float exponent)
 {
diff --git a/src/testrender/optixraytracer.cpp b/src/testrender/optixraytracer.cpp
index 98c6ff191..59d7f7da0 100644
--- a/src/testrender/optixraytracer.cpp
+++ b/src/testrender/optixraytracer.cpp
@@ -11,9 +11,11 @@
 
 #include "optixraytracer.h"
 
+#include "cuda/optix_raytracer.h"
 #include "render_params.h"
 
 #include <cuda.h>
+#include <cuda_runtime.h>
 #include <optix_function_table_definition.h>
 #include <optix_stack_size.h>
 #include <optix_stubs.h>
@@ -115,6 +117,10 @@ OptixRaytracer::~OptixRaytracer()
 {
     if (m_optix_ctx)
         OPTIX_CHECK(optixDeviceContextDestroy(m_optix_ctx));
+    for (void* ptr : device_ptrs)
+        cudaFree(ptr);
+    for (void* arr : array_ptrs)
+        cudaFreeArray(reinterpret_cast<cudaArray_t>(arr));
 }
 
 
@@ -237,11 +243,13 @@ OptixRaytracer::synch_attributes()
                        podDataSize + sizeof(ustringhash_pod) * numStrings));
         CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_color_system), colorSys,
                               podDataSize, cudaMemcpyHostToDevice));
+        device_ptrs.push_back(reinterpret_cast<void*>(d_color_system));
 
         CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_osl_printf_buffer),
                               OSL_PRINTF_BUFFER_SIZE));
         CUDA_CHECK(cudaMemset(reinterpret_cast<void*>(d_osl_printf_buffer), 0,
                               OSL_PRINTF_BUFFER_SIZE));
+        device_ptrs.push_back(reinterpret_cast<void*>(d_osl_printf_buffer));
 
         // then copy the device string to the end, first strings starting at dataPtr - (numStrings)
         // FIXME -- Should probably handle alignment better.
@@ -309,72 +317,48 @@ OptixRaytracer::create_optix_pg(const OptixProgramGroupDesc* pg_desc,
 
 
 
-bool
-OptixRaytracer::make_optix_materials()
+void
+OptixRaytracer::create_modules(State& state)
 {
-    // Stand-in: names of shader outputs to preserve
-    std::vector<const char*> outputs { "Cout" };
-
-    std::vector<OptixModule> modules;
-
-    // Space for message logging
     char msg_log[8192];
     size_t sizeof_msg_log;
 
-    // Make module that contains programs we'll use in this scene
-    OptixModuleCompileOptions module_compile_options = {};
+    // Set the pipeline compile options
+    state.pipeline_compile_options.traversableGraphFlags
+        = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
+    state.pipeline_compile_options.usesMotionBlur     = false;
+    state.pipeline_compile_options.numPayloadValues   = 5;
+    state.pipeline_compile_options.numAttributeValues = 3;
+    state.pipeline_compile_options.exceptionFlags
+        = OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW;
+    state.pipeline_compile_options.pipelineLaunchParamsVariableName
+        = "render_params";
 
-    module_compile_options.maxRegisterCount
+    // Set the module compile options
+    state.module_compile_options.maxRegisterCount
         = OPTIX_COMPILE_DEFAULT_MAX_REGISTER_COUNT;
-    module_compile_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_DEFAULT;
+    state.module_compile_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_DEFAULT;
 #if OPTIX_VERSION >= 70400
-    module_compile_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_MINIMAL;
+    state.module_compile_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_MINIMAL;
 #else
-    module_compile_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+    state.module_compile_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
 #endif
 
-    OptixPipelineCompileOptions pipeline_compile_options = {};
-
-    pipeline_compile_options.traversableGraphFlags
-        = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
-    pipeline_compile_options.usesMotionBlur     = false;
-    pipeline_compile_options.numPayloadValues   = 3;
-    pipeline_compile_options.numAttributeValues = 3;
-    pipeline_compile_options.exceptionFlags
-        = OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW;
-    pipeline_compile_options.pipelineLaunchParamsVariableName = "render_params";
-
-    // Create 'raygen' program
-
-    // Load the renderer CUDA source and generate PTX for it
-    OptixModule program_module;
-    load_optix_module("optix_raytracer.ptx", &module_compile_options,
-                      &pipeline_compile_options, &program_module);
-
-    // Record it so we can destroy it later
-    modules.push_back(program_module);
-
-    OptixModule quad_module;
-    load_optix_module("quad.ptx", &module_compile_options,
-                      &pipeline_compile_options, &quad_module);
-
-    OptixModule sphere_module;
-    load_optix_module("sphere.ptx", &module_compile_options,
-                      &pipeline_compile_options, &sphere_module);
-
-    OptixModule wrapper_module;
-    load_optix_module("wrapper.ptx", &module_compile_options,
-                      &pipeline_compile_options, &wrapper_module);
-
-    OptixModule rend_lib_module;
-    load_optix_module("rend_lib_testrender.ptx", &module_compile_options,
-                      &pipeline_compile_options, &rend_lib_module);
+    load_optix_module("optix_raytracer.ptx", &state.module_compile_options,
+                      &state.pipeline_compile_options, &state.program_module);
+    load_optix_module("quad.ptx", &state.module_compile_options,
+                      &state.pipeline_compile_options, &state.quad_module);
+    load_optix_module("sphere.ptx", &state.module_compile_options,
+                      &state.pipeline_compile_options, &state.sphere_module);
+    load_optix_module("wrapper.ptx", &state.module_compile_options,
+                      &state.pipeline_compile_options, &state.wrapper_module);
+    load_optix_module("rend_lib_testrender.ptx", &state.module_compile_options,
+                      &state.pipeline_compile_options, &state.rend_lib_module);
 
     // Retrieve the compiled shadeops PTX
     const char* shadeops_ptx = nullptr;
     shadingsys->getattribute("shadeops_cuda_ptx", OSL::TypeDesc::PTR,
                              &shadeops_ptx);
-
     int shadeops_ptx_size = 0;
     shadingsys->getattribute("shadeops_cuda_ptx_size", OSL::TypeDesc::INT,
                              &shadeops_ptx_size);
@@ -382,143 +366,166 @@ OptixRaytracer::make_optix_materials()
     if (shadeops_ptx == nullptr || shadeops_ptx_size == 0) {
         errhandler().severefmt(
             "Could not retrieve PTX for the shadeops library");
-        return false;
+        exit(EXIT_FAILURE);
     }
 
-    // Create the shadeops library program group
-    OptixModule shadeops_module;
+    // Create the shadeops module
     sizeof_msg_log = sizeof(msg_log);
-    OPTIX_CHECK_MSG(optixModuleCreateFn(m_optix_ctx, &module_compile_options,
-                                        &pipeline_compile_options, shadeops_ptx,
-                                        shadeops_ptx_size, msg_log,
-                                        &sizeof_msg_log, &shadeops_module),
-                    fmtformat("Creating module for shadeops library: {}",
-                              msg_log));
-    modules.push_back(shadeops_module);
+    OPTIX_CHECK_MSG(
+        optixModuleCreateFn(m_optix_ctx, &state.module_compile_options,
+                            &state.pipeline_compile_options, shadeops_ptx,
+                            shadeops_ptx_size, msg_log, &sizeof_msg_log,
+                            &state.shadeops_module),
+        fmtformat("Creating module for shadeops library: {}", msg_log));
+}
 
-    OptixProgramGroupOptions program_options = {};
-    std::vector<OptixProgramGroup> shader_groups;
 
-    // Raygen group
-    OptixProgramGroupDesc raygen_desc    = {};
-    raygen_desc.kind                     = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
-    raygen_desc.raygen.module            = program_module;
-    raygen_desc.raygen.entryFunctionName = "__raygen__";
 
-    OptixProgramGroup raygen_group;
-    create_optix_pg(&raygen_desc, 1, &program_options, &raygen_group);
+void
+OptixRaytracer::create_programs(State& state)
+{
+    // Raygen group
+    {
+        OptixProgramGroupDesc raygen_desc    = {};
+        raygen_desc.kind                     = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+        raygen_desc.raygen.module            = state.program_module;
+        raygen_desc.raygen.entryFunctionName = "__raygen__deferred";
+        create_optix_pg(&raygen_desc, 1, &state.program_options,
+                        &state.raygen_group);
+    }
 
     // Set Globals Raygen group
-    OptixProgramGroupDesc setglobals_raygen_desc = {};
-    setglobals_raygen_desc.kind          = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
-    setglobals_raygen_desc.raygen.module = program_module;
-    setglobals_raygen_desc.raygen.entryFunctionName = "__raygen__setglobals";
-
-    OptixProgramGroup setglobals_raygen_group;
-    sizeof_msg_log = sizeof(msg_log);
-    OPTIX_CHECK_MSG(
-        optixProgramGroupCreate(m_optix_ctx, &setglobals_raygen_desc,
-                                1,                 // number of program groups
-                                &program_options,  // program options
-                                msg_log, &sizeof_msg_log,
-                                &setglobals_raygen_group),
-        fmtformat("Creating set-globals 'ray-gen' program group: {}", msg_log));
+    {
+        OptixProgramGroupDesc raygen_desc    = {};
+        raygen_desc.kind                     = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+        raygen_desc.raygen.module            = state.program_module;
+        raygen_desc.raygen.entryFunctionName = "__raygen__setglobals";
+        create_optix_pg(&raygen_desc, 1, &state.program_options,
+                        &state.setglobals_raygen_group);
+    }
 
     // Miss group
-    OptixProgramGroupDesc miss_desc = {};
-    miss_desc.kind                  = OPTIX_PROGRAM_GROUP_KIND_MISS;
-    miss_desc.miss.module
-        = program_module;  // raygen file/module contains miss program
-    miss_desc.miss.entryFunctionName = "__miss__";
-
-    OptixProgramGroup miss_group;
-    create_optix_pg(&miss_desc, 1, &program_options, &miss_group);
+    {
+        OptixProgramGroupDesc miss_desc = {};
+        miss_desc.kind                  = OPTIX_PROGRAM_GROUP_KIND_MISS;
+        miss_desc.miss.module
+            = state.program_module;  // raygen file/module contains miss program
+        miss_desc.miss.entryFunctionName = "__miss__";
+        create_optix_pg(&miss_desc, 1, &state.program_options,
+                        &state.miss_group);
+    }
 
     // Set Globals Miss group
-    OptixProgramGroupDesc setglobals_miss_desc  = {};
-    setglobals_miss_desc.kind                   = OPTIX_PROGRAM_GROUP_KIND_MISS;
-    setglobals_miss_desc.miss.module            = program_module;
-    setglobals_miss_desc.miss.entryFunctionName = "__miss__setglobals";
-    OptixProgramGroup setglobals_miss_group;
-    create_optix_pg(&setglobals_miss_desc, 1, &program_options,
-                    &setglobals_miss_group);
+    {
+        OptixProgramGroupDesc setglobals_miss_desc = {};
+        setglobals_miss_desc.kind        = OPTIX_PROGRAM_GROUP_KIND_MISS;
+        setglobals_miss_desc.miss.module = state.program_module;
+        setglobals_miss_desc.miss.entryFunctionName = "__miss__setglobals";
+        create_optix_pg(&setglobals_miss_desc, 1, &state.program_options,
+                        &state.setglobals_miss_group);
+    }
 
     // Hitgroup -- quads
-    OptixProgramGroupDesc quad_hitgroup_desc = {};
-    quad_hitgroup_desc.kind              = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
-    quad_hitgroup_desc.hitgroup.moduleCH = wrapper_module;
-    quad_hitgroup_desc.hitgroup.entryFunctionNameCH
-        = "__closesthit__closest_hit_osl";
-    quad_hitgroup_desc.hitgroup.moduleAH = wrapper_module;
-    quad_hitgroup_desc.hitgroup.entryFunctionNameAH = "__anyhit__any_hit_shadow";
-    quad_hitgroup_desc.hitgroup.moduleIS            = quad_module;
-    quad_hitgroup_desc.hitgroup.entryFunctionNameIS = "__intersection__quad";
-    OptixProgramGroup quad_hitgroup;
-    create_optix_pg(&quad_hitgroup_desc, 1, &program_options, &quad_hitgroup);
+    {
+        OptixProgramGroupDesc quad_hitgroup_desc = {};
+        quad_hitgroup_desc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+        quad_hitgroup_desc.hitgroup.moduleCH = state.program_module;
+        quad_hitgroup_desc.hitgroup.entryFunctionNameCH
+            = "__closesthit__deferred";
+        quad_hitgroup_desc.hitgroup.moduleAH = state.wrapper_module;
+        quad_hitgroup_desc.hitgroup.entryFunctionNameAH
+            = "__anyhit__any_hit_shadow";
+        quad_hitgroup_desc.hitgroup.moduleIS            = state.quad_module;
+        quad_hitgroup_desc.hitgroup.entryFunctionNameIS = "__intersection__quad";
+        create_optix_pg(&quad_hitgroup_desc, 1, &state.program_options,
+                        &state.quad_hit_group);
+    }
 
     // Direct-callable -- renderer-specific support functions for OSL on the device
-    OptixProgramGroupDesc rend_lib_desc = {};
-    rend_lib_desc.kind                  = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
-    rend_lib_desc.callables.moduleDC    = rend_lib_module;
-    rend_lib_desc.callables.entryFunctionNameDC
-        = "__direct_callable__dummy_rend_lib";
-    rend_lib_desc.callables.moduleCC            = 0;
-    rend_lib_desc.callables.entryFunctionNameCC = nullptr;
-    OptixProgramGroup rend_lib_group;
-    create_optix_pg(&rend_lib_desc, 1, &program_options, &rend_lib_group);
+    {
+        OptixProgramGroupDesc rend_lib_desc = {};
+        rend_lib_desc.kind               = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+        rend_lib_desc.callables.moduleDC = state.rend_lib_module;
+        rend_lib_desc.callables.entryFunctionNameDC
+            = "__direct_callable__dummy_rend_lib";
+        rend_lib_desc.callables.moduleCC            = 0;
+        rend_lib_desc.callables.entryFunctionNameCC = nullptr;
+        create_optix_pg(&rend_lib_desc, 1, &state.program_options,
+                        &state.rend_lib_group);
+    }
 
     // Direct-callable -- built-in support functions for OSL on the device
-    OptixProgramGroupDesc shadeops_desc = {};
-    shadeops_desc.kind                  = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
-    shadeops_desc.callables.moduleDC    = shadeops_module;
-    shadeops_desc.callables.entryFunctionNameDC
-        = "__direct_callable__dummy_shadeops";
-    shadeops_desc.callables.moduleCC            = 0;
-    shadeops_desc.callables.entryFunctionNameCC = nullptr;
-    OptixProgramGroup shadeops_group;
-    create_optix_pg(&shadeops_desc, 1, &program_options, &shadeops_group);
+    {
+        OptixProgramGroupDesc shadeops_desc = {};
+        shadeops_desc.kind               = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+        shadeops_desc.callables.moduleDC = state.shadeops_module;
+        shadeops_desc.callables.entryFunctionNameDC
+            = "__direct_callable__dummy_shadeops";
+        shadeops_desc.callables.moduleCC            = 0;
+        shadeops_desc.callables.entryFunctionNameCC = nullptr;
+        create_optix_pg(&shadeops_desc, 1, &state.program_options,
+                        &state.shadeops_group);
+    }
 
     // Direct-callable -- fills in ShaderGlobals for Quads
-    OptixProgramGroupDesc quad_fillSG_desc = {};
-    quad_fillSG_desc.kind                  = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
-    quad_fillSG_desc.callables.moduleDC    = quad_module;
-    quad_fillSG_desc.callables.entryFunctionNameDC
-        = "__direct_callable__quad_shaderglobals";
-    quad_fillSG_desc.callables.moduleCC            = 0;
-    quad_fillSG_desc.callables.entryFunctionNameCC = nullptr;
-    OptixProgramGroup quad_fillSG_dc;
-    create_optix_pg(&quad_fillSG_desc, 1, &program_options, &quad_fillSG_dc);
+    {
+        OptixProgramGroupDesc quad_fillSG_desc = {};
+        quad_fillSG_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+        quad_fillSG_desc.callables.moduleDC = state.quad_module;
+        quad_fillSG_desc.callables.entryFunctionNameDC
+            = "__direct_callable__quad_shaderglobals";
+        quad_fillSG_desc.callables.moduleCC            = 0;
+        quad_fillSG_desc.callables.entryFunctionNameCC = nullptr;
+        create_optix_pg(&quad_fillSG_desc, 1, &state.program_options,
+                        &state.quad_fillSG_dc_group);
+    }
 
     // Hitgroup -- sphere
-    OptixProgramGroupDesc sphere_hitgroup_desc = {};
-    sphere_hitgroup_desc.kind              = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
-    sphere_hitgroup_desc.hitgroup.moduleCH = wrapper_module;
-    sphere_hitgroup_desc.hitgroup.entryFunctionNameCH
-        = "__closesthit__closest_hit_osl";
-    sphere_hitgroup_desc.hitgroup.moduleAH = wrapper_module;
-    sphere_hitgroup_desc.hitgroup.entryFunctionNameAH
-        = "__anyhit__any_hit_shadow";
-    sphere_hitgroup_desc.hitgroup.moduleIS = sphere_module;
-    sphere_hitgroup_desc.hitgroup.entryFunctionNameIS = "__intersection__sphere";
-    OptixProgramGroup sphere_hitgroup;
-    create_optix_pg(&sphere_hitgroup_desc, 1, &program_options,
-                    &sphere_hitgroup);
+    {
+        OptixProgramGroupDesc sphere_hitgroup_desc = {};
+        sphere_hitgroup_desc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+        sphere_hitgroup_desc.hitgroup.moduleCH = state.program_module;
+        sphere_hitgroup_desc.hitgroup.entryFunctionNameCH
+            = "__closesthit__deferred";
+        sphere_hitgroup_desc.hitgroup.moduleAH = state.wrapper_module;
+        sphere_hitgroup_desc.hitgroup.entryFunctionNameAH
+            = "__anyhit__any_hit_shadow";
+        sphere_hitgroup_desc.hitgroup.moduleIS = state.sphere_module;
+        sphere_hitgroup_desc.hitgroup.entryFunctionNameIS
+            = "__intersection__sphere";
+        create_optix_pg(&sphere_hitgroup_desc, 1, &state.program_options,
+                        &state.sphere_hit_group);
+    }
 
     // Direct-callable -- fills in ShaderGlobals for Sphere
-    OptixProgramGroupDesc sphere_fillSG_desc = {};
-    sphere_fillSG_desc.kind               = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
-    sphere_fillSG_desc.callables.moduleDC = sphere_module;
-    sphere_fillSG_desc.callables.entryFunctionNameDC
-        = "__direct_callable__sphere_shaderglobals";
-    sphere_fillSG_desc.callables.moduleCC            = 0;
-    sphere_fillSG_desc.callables.entryFunctionNameCC = nullptr;
-    OptixProgramGroup sphere_fillSG_dc;
-    create_optix_pg(&sphere_fillSG_desc, 1, &program_options,
-                    &sphere_fillSG_dc);
-
-    // Create materials
+    {
+        OptixProgramGroupDesc sphere_fillSG_desc = {};
+        sphere_fillSG_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+        sphere_fillSG_desc.callables.moduleDC = state.sphere_module;
+        sphere_fillSG_desc.callables.entryFunctionNameDC
+            = "__direct_callable__sphere_shaderglobals";
+        sphere_fillSG_desc.callables.moduleCC            = 0;
+        sphere_fillSG_desc.callables.entryFunctionNameCC = nullptr;
+        create_optix_pg(&sphere_fillSG_desc, 1, &state.program_options,
+                        &state.sphere_fillSG_dc_group);
+    }
+}
+
+
+
+void
+OptixRaytracer::create_shaders(State& state)
+{
+    // Space for message logging
+    char msg_log[8192];
+    size_t sizeof_msg_log;
+
+    // Stand-in: names of shader outputs to preserve
+    std::vector<const char*> outputs { "Cout" };
     int mtl_id = 0;
+
     std::vector<void*> material_interactive_params;
+
     for (const auto& groupref : shaders()) {
         std::string group_name, fused_name;
         shadingsys->getattribute(groupref.get(), "groupname", group_name);
@@ -546,7 +553,7 @@ OptixRaytracer::make_optix_materials()
         if (osl_ptx.empty()) {
             errhandler().errorfmt("Failed to generate PTX for ShaderGroup {}",
                                   group_name);
-            return false;
+            exit(EXIT_FAILURE);
         }
 
         if (options.get_int("saveptx")) {
@@ -566,14 +573,14 @@ OptixRaytracer::make_optix_materials()
         // can be executed by the closest hit program in the wrapper
         sizeof_msg_log = sizeof(msg_log);
         OPTIX_CHECK_MSG(optixModuleCreateFn(m_optix_ctx,
-                                            &module_compile_options,
-                                            &pipeline_compile_options,
+                                            &state.module_compile_options,
+                                            &state.pipeline_compile_options,
                                             osl_ptx.c_str(), osl_ptx.size(),
                                             msg_log, &sizeof_msg_log,
                                             &optix_module),
                         fmtformat("Creating module for PTX group {}: {}",
                                   group_name, msg_log));
-        modules.push_back(optix_module);
+        state.shader_modules.push_back(optix_module);
 
         // Create program groups (for direct callables)
         OptixProgramGroupDesc pgDesc[1] = {};
@@ -583,60 +590,70 @@ OptixRaytracer::make_optix_materials()
         pgDesc[0].callables.moduleCC            = 0;
         pgDesc[0].callables.entryFunctionNameCC = nullptr;
 
-        shader_groups.resize(shader_groups.size() + 1);
+        state.shader_groups.resize(state.shader_groups.size() + 1);
         sizeof_msg_log = sizeof(msg_log);
         OPTIX_CHECK_MSG(
             optixProgramGroupCreate(m_optix_ctx, &pgDesc[0], 1,
-                                    &program_options, msg_log, &sizeof_msg_log,
-                                    &shader_groups[shader_groups.size() - 1]),
+                                    &state.program_options, msg_log, &sizeof_msg_log,
+                                    &state.shader_groups[state.shader_groups.size() - 1]),
             fmtformat("Creating 'shader' group for group {}: {}", group_name,
                       msg_log));
     }
 
-    OptixPipelineLinkOptions pipeline_link_options;
-    pipeline_link_options.maxTraceDepth = 1;
-#if (OPTIX_VERSION < 70700)
-    pipeline_link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
-#endif
-#if (OPTIX_VERSION < 70100)
-    pipeline_link_options.overrideUsesMotionBlur = false;
-#endif
-
-    // Set up OptiX pipeline
-    std::vector<OptixProgramGroup> final_groups = { rend_lib_group,
-                                                    raygen_group, miss_group };
+    // Upload per-material interactive buffer table
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_interactive_params),
+                          sizeof(void*) * material_interactive_params.size()));
+    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_interactive_params),
+                          material_interactive_params.data(),
+                          sizeof(void*) * material_interactive_params.size(),
+                          cudaMemcpyHostToDevice));
+    device_ptrs.push_back(reinterpret_cast<void*>(d_interactive_params));
+}
 
-    if (scene.quads.size() > 0)
-        final_groups.push_back(quad_hitgroup);
-    if (scene.spheres.size() > 0)
-        final_groups.push_back(sphere_hitgroup);
 
-    final_groups.push_back(quad_fillSG_dc);
-    final_groups.push_back(sphere_fillSG_dc);
 
-    // append the shader groups to our "official" list of program groups
-    // size_t shader_groups_start_index = final_groups.size();
-    final_groups.insert(final_groups.end(), shader_groups.begin(),
-                        shader_groups.end());
+void
+OptixRaytracer::create_pipeline(State& state)
+{
+    char msg_log[8192];
+    size_t sizeof_msg_log;
 
-    // append the program group for the built-in shadeops module
-    final_groups.push_back(shadeops_group);
+    // Set the pipeline link options
+    state.pipeline_link_options.maxTraceDepth = 1;
+#if (OPTIX_VERSION < 70700)
+    state.pipeline_link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+#endif
+#if (OPTIX_VERSION < 70100)
+    state.pipeline_link_options.overrideUsesMotionBlur = false;
+#endif
 
-    // append set-globals groups
-    final_groups.push_back(setglobals_raygen_group);
-    final_groups.push_back(setglobals_miss_group);
+    // Gather all of the program groups
+    state.final_groups.push_back(state.raygen_group);
+    state.final_groups.push_back(state.miss_group);
+    state.final_groups.push_back(state.quad_hit_group);
+    state.final_groups.push_back(state.sphere_hit_group);
+    state.final_groups.push_back(state.quad_fillSG_dc_group);
+    state.final_groups.push_back(state.sphere_fillSG_dc_group);
+    state.final_groups.push_back(state.rend_lib_group);
+    state.final_groups.push_back(state.shadeops_group);
+    state.final_groups.push_back(state.setglobals_raygen_group);
+    state.final_groups.push_back(state.setglobals_miss_group);
+    state.final_groups.insert(state.final_groups.end(),
+                              state.shader_groups.begin(),
+                              state.shader_groups.end());
 
     sizeof_msg_log = sizeof(msg_log);
-    OPTIX_CHECK_MSG(optixPipelineCreate(m_optix_ctx, &pipeline_compile_options,
-                                        &pipeline_link_options,
-                                        final_groups.data(),
-                                        int(final_groups.size()), msg_log,
+    OPTIX_CHECK_MSG(optixPipelineCreate(m_optix_ctx,
+                                        &state.pipeline_compile_options,
+                                        &state.pipeline_link_options,
+                                        state.final_groups.data(),
+                                        int(state.final_groups.size()), msg_log,
                                         &sizeof_msg_log, &m_optix_pipeline),
                     fmtformat("Creating optix pipeline: {}", msg_log));
 
     // Set the pipeline stack size
     OptixStackSizes stack_sizes = {};
-    for (OptixProgramGroup& program_group : final_groups) {
+    for (OptixProgramGroup& program_group : state.final_groups) {
 #if (OPTIX_VERSION < 70700)
         OPTIX_CHECK(optixUtilAccumulateStackSizes(program_group, &stack_sizes));
 #else
@@ -671,129 +688,207 @@ OptixRaytracer::make_optix_materials()
         direct_callable_stack_size_from_state, continuation_stack_size,
         max_traversal_depth));
 
-    // Build OptiX Shader Binding Table (SBT)
+}
+
+
+
+void
+OptixRaytracer::create_sbt(State& state)
+{
+    // Raygen
+    {
+        GenericRecord raygen_record;
+        CUdeviceptr d_raygen_record;
+        OPTIX_CHECK(
+            optixSbtRecordPackHeader(state.raygen_group, &raygen_record));
+        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_raygen_record),
+                              sizeof(GenericRecord)));
+        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_raygen_record),
+                              &raygen_record, sizeof(GenericRecord),
+                              cudaMemcpyHostToDevice));
+        device_ptrs.push_back(reinterpret_cast<void*>(d_raygen_record));
+
+        m_optix_sbt.raygenRecord = d_raygen_record;
+    }
 
-    std::vector<GenericRecord> sbt_records(final_groups.size());
+    // Miss
+    {
+        GenericRecord miss_record;
+        CUdeviceptr d_miss_record;
 
-    CUdeviceptr d_raygen_record;
-    CUdeviceptr d_miss_record;
-    CUdeviceptr d_hitgroup_records;
-    CUdeviceptr d_callable_records;
-    CUdeviceptr d_setglobals_raygen_record;
-    CUdeviceptr d_setglobals_miss_record;
+        OPTIX_CHECK(optixSbtRecordPackHeader(state.miss_group, &miss_record));
 
-    std::vector<CUdeviceptr> d_sbt_records(final_groups.size());
+        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_miss_record),
+                              sizeof(GenericRecord)));
+        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_miss_record),
+                              &miss_record, sizeof(GenericRecord),
+                              cudaMemcpyHostToDevice));
+        device_ptrs.push_back(reinterpret_cast<void*>(d_miss_record));
 
-    for (size_t i = 0; i < final_groups.size(); i++) {
-        OPTIX_CHECK(optixSbtRecordPackHeader(final_groups[i], &sbt_records[i]));
+        m_optix_sbt.missRecordBase          = d_miss_record;
+        m_optix_sbt.missRecordStrideInBytes = sizeof(GenericRecord);
+        m_optix_sbt.missRecordCount         = 1;
     }
 
-    int sbtIndex             = 3;
-    const int hitRecordStart = sbtIndex;
-    size_t setglobals_start  = final_groups.size() - 2;
+    // Hitgroups
+    {
+        const bool have_quads    = scene.quads.size() > 0;
+        const bool have_spheres  = scene.spheres.size() > 0;
+        const int num_geom_types = have_quads + have_spheres;
+        const int num_hit_groups = num_geom_types;
+
+        std::vector<GenericRecord> hitgroup_records;
+        CUdeviceptr d_hitgroup_records;
+        hitgroup_records.reserve(num_hit_groups);
+
+        size_t hg_idx = 0;
+        if (have_quads) {
+            OPTIX_CHECK(optixSbtRecordPackHeader(state.quad_hit_group,
+                                                 &hitgroup_records[hg_idx]));
+            hitgroup_records[hg_idx].data = reinterpret_cast<void*>(
+                d_quads_list);
+            hitgroup_records[hg_idx].sbtGeoIndex = 0;
+            ++hg_idx;
+        }
+
+        if (have_spheres) {
+            OPTIX_CHECK(optixSbtRecordPackHeader(state.sphere_hit_group,
+                                                 &hitgroup_records[hg_idx]));
+            hitgroup_records[hg_idx].data = reinterpret_cast<void*>(d_spheres_list);
+            hitgroup_records[hg_idx].sbtGeoIndex = 1;
+        }
 
-    // Copy geometry data to appropriate SBT records
-    if (scene.quads.size() > 0) {
-        sbt_records[sbtIndex].data = reinterpret_cast<void*>(d_quads_list);
-        sbt_records[sbtIndex].sbtGeoIndex
-            = 0;  // DC index for filling in Quad ShaderGlobals
-        ++sbtIndex;
+        // copy to device
+        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_hitgroup_records),
+                              num_hit_groups * sizeof(GenericRecord)));
+        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_hitgroup_records),
+                              hitgroup_records.data(),
+                              num_hit_groups * sizeof(GenericRecord),
+                              cudaMemcpyHostToDevice));
+        device_ptrs.push_back(reinterpret_cast<void*>(d_hitgroup_records));
+
+        m_optix_sbt.hitgroupRecordBase          = d_hitgroup_records;
+        m_optix_sbt.hitgroupRecordStrideInBytes = sizeof(GenericRecord);
+        m_optix_sbt.hitgroupRecordCount         = num_hit_groups;
     }
 
-    if (scene.spheres.size() > 0) {
-        sbt_records[sbtIndex].data = reinterpret_cast<void*>(d_spheres_list);
-        sbt_records[sbtIndex].sbtGeoIndex
-            = 1;  // DC index for filling in Sphere ShaderGlobals
-        ++sbtIndex;
+    // Callable programs
+    {
+        const int ncallables = 2;  // ShaderGlobals setup for quad & sphere
+        const int nshaders   = int(state.shader_groups.size());
+
+        std::vector<GenericRecord> callable_records(ncallables + nshaders);
+        CUdeviceptr d_callable_records;
+        OPTIX_CHECK(optixSbtRecordPackHeader(state.quad_fillSG_dc_group,
+                                             &callable_records[0]));
+        callable_records[0].data        = reinterpret_cast<void*>(d_quads_list);
+        callable_records[0].sbtGeoIndex = 0;
+
+        OPTIX_CHECK(optixSbtRecordPackHeader(state.sphere_fillSG_dc_group,
+                                             &callable_records[1]));
+        callable_records[1].data = reinterpret_cast<void*>(d_spheres_list);
+        callable_records[1].sbtGeoIndex = 1;
+
+        for (size_t idx = 0; idx < state.shader_groups.size(); ++idx) {
+            OPTIX_CHECK(
+                optixSbtRecordPackHeader(state.shader_groups[idx],
+                                         &callable_records[ncallables + idx]));
+        }
+
+        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_callable_records),
+                              (ncallables + nshaders) * sizeof(GenericRecord)));
+        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_callable_records),
+                              callable_records.data(),
+                              (ncallables + nshaders) * sizeof(GenericRecord),
+                              cudaMemcpyHostToDevice));
+        device_ptrs.push_back(reinterpret_cast<void*>(d_callable_records));
+
+        m_optix_sbt.callablesRecordBase          = d_callable_records;
+        m_optix_sbt.callablesRecordStrideInBytes = sizeof(GenericRecord);
+        m_optix_sbt.callablesRecordCount         = ncallables + nshaders;
+
+        m_setglobals_optix_sbt.callablesRecordBase          = d_callable_records;
+        m_setglobals_optix_sbt.callablesRecordStrideInBytes = sizeof(GenericRecord);
+        m_setglobals_optix_sbt.callablesRecordCount         = ncallables + nshaders;
     }
 
-    const int callableRecordStart = sbtIndex;
-
-    // Copy geometry data to our DC (direct-callable) funcs that fill ShaderGlobals
-    sbt_records[sbtIndex++].data = reinterpret_cast<void*>(d_quads_list);
-    sbt_records[sbtIndex++].data = reinterpret_cast<void*>(d_spheres_list);
-
-    const int nshaders   = int(shader_groups.size());
-    const int nhitgroups = (scene.quads.size() > 0)
-                           + (scene.spheres.size() > 0);
-
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_raygen_record),
-                          sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_miss_record),
-                          sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_hitgroup_records),
-                          nhitgroups * sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_callable_records),
-                          (2 + nshaders) * sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_setglobals_raygen_record),
-                          sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_setglobals_miss_record),
-                          sizeof(GenericRecord)));
-
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_raygen_record),
-                          &sbt_records[1], sizeof(GenericRecord),
-                          cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_miss_record),
-                          &sbt_records[2], sizeof(GenericRecord),
-                          cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_hitgroup_records),
-                          &sbt_records[hitRecordStart],
-                          nhitgroups * sizeof(GenericRecord),
-                          cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_callable_records),
-                          &sbt_records[callableRecordStart],
-                          (2 + nshaders) * sizeof(GenericRecord),
-                          cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_setglobals_raygen_record),
-                          &sbt_records[setglobals_start + 0],
-                          sizeof(GenericRecord), cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_setglobals_miss_record),
-                          &sbt_records[setglobals_start + 1],
-                          sizeof(GenericRecord), cudaMemcpyHostToDevice));
-
-    // Looks like OptixShadingTable needs to be filled out completely
-    m_optix_sbt.raygenRecord                 = d_raygen_record;
-    m_optix_sbt.missRecordBase               = d_miss_record;
-    m_optix_sbt.missRecordStrideInBytes      = sizeof(GenericRecord);
-    m_optix_sbt.missRecordCount              = 1;
-    m_optix_sbt.hitgroupRecordBase           = d_hitgroup_records;
-    m_optix_sbt.hitgroupRecordStrideInBytes  = sizeof(GenericRecord);
-    m_optix_sbt.hitgroupRecordCount          = nhitgroups;
-    m_optix_sbt.callablesRecordBase          = d_callable_records;
-    m_optix_sbt.callablesRecordStrideInBytes = sizeof(GenericRecord);
-    m_optix_sbt.callablesRecordCount         = 2 + nshaders;
-
-    // Shader binding table for SetGlobals stage
-    m_setglobals_optix_sbt                         = {};
-    m_setglobals_optix_sbt.raygenRecord            = d_setglobals_raygen_record;
-    m_setglobals_optix_sbt.missRecordBase          = d_setglobals_miss_record;
-    m_setglobals_optix_sbt.missRecordStrideInBytes = sizeof(GenericRecord);
-    m_setglobals_optix_sbt.missRecordCount         = 1;
+    // SetGlobals raygen
+    {
+        GenericRecord record;
+        CUdeviceptr d_setglobals_raygen_record;
+        OPTIX_CHECK(
+            optixSbtRecordPackHeader(state.setglobals_raygen_group, &record));
+        CUDA_CHECK(
+            cudaMalloc(reinterpret_cast<void**>(&d_setglobals_raygen_record),
+                       sizeof(GenericRecord)));
+        CUDA_CHECK(
+            cudaMemcpy(reinterpret_cast<void*>(d_setglobals_raygen_record),
+                       &record, sizeof(GenericRecord), cudaMemcpyHostToDevice));
+        device_ptrs.push_back(reinterpret_cast<void*>(d_setglobals_raygen_record));
+
+        m_setglobals_optix_sbt.raygenRecord = d_setglobals_raygen_record;
+    }
+
+    // SetGlobals miss
+    {
+        GenericRecord record;
+        CUdeviceptr d_setglobals_miss_record;
+        OPTIX_CHECK(
+            optixSbtRecordPackHeader(state.setglobals_miss_group, &record));
+        CUDA_CHECK(
+            cudaMalloc(reinterpret_cast<void**>(&d_setglobals_miss_record),
+                       sizeof(GenericRecord)));
+        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_setglobals_miss_record),
+                              &record, sizeof(GenericRecord),
+                              cudaMemcpyHostToDevice));
+        device_ptrs.push_back(reinterpret_cast<void*>(d_setglobals_miss_record));
+
+        m_setglobals_optix_sbt.missRecordBase = d_setglobals_miss_record;
+        m_setglobals_optix_sbt.missRecordStrideInBytes = sizeof(GenericRecord);
+        m_setglobals_optix_sbt.missRecordCount         = 1;
+    }
+}
+
 
-    // Upload per-material interactive buffer table
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_interactive_params),
-                          sizeof(void*) * material_interactive_params.size()));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_interactive_params),
-                          material_interactive_params.data(),
-                          sizeof(void*) * material_interactive_params.size(),
-                          cudaMemcpyHostToDevice));
 
-    // Pipeline has been created so we can clean some things up
-    for (auto&& i : final_groups) {
+void
+OptixRaytracer::cleanup_programs(State& state)
+{
+    for (auto&& i : state.final_groups) {
         optixProgramGroupDestroy(i);
     }
-    for (auto&& i : modules) {
+    for (auto&& i : state.shader_modules) {
         optixModuleDestroy(i);
     }
-    modules.clear();
+    state.shader_modules.clear();
+
+    optixModuleDestroy(state.program_module);
+    optixModuleDestroy(state.quad_module);
+    optixModuleDestroy(state.sphere_module);
+    optixModuleDestroy(state.wrapper_module);
+    optixModuleDestroy(state.rend_lib_module);
+    optixModuleDestroy(state.shadeops_module);
+}
 
+
+
+bool
+OptixRaytracer::make_optix_materials()
+{
+    State state;
+    create_modules(state);
+    create_programs(state);
+    create_shaders(state);
+    create_pipeline(state);
+    create_sbt(state);
+    cleanup_programs(state);
     return true;
 }
 
 
 
-bool
-OptixRaytracer::finalize_scene()
+void
+OptixRaytracer::build_accel()
 {
     // Build acceleration structures
     OptixAccelBuildOptions accelOptions;
@@ -813,13 +908,14 @@ OptixRaytracer::finalize_scene()
     quadsParams.reserve(scene.quads.size());
     std::vector<int> quadShaders;
     quadShaders.reserve(scene.quads.size());
+    int objID = static_cast<int>(scene.spheres.size());
     for (const auto& quad : scene.quads) {
         OptixAabb aabb;
         quad.getBounds(aabb.minX, aabb.minY, aabb.minZ, aabb.maxX, aabb.maxY,
                        aabb.maxZ);
         quadsAabb.push_back(aabb);
         QuadParams quad_params;
-        quad.setOptixVariables(&quad_params);
+        quad.setOptixVariables(&quad_params, objID++);
         quadsParams.push_back(quad_params);
     }
     // Copy Quads bounding boxes to cuda device
@@ -828,6 +924,7 @@ OptixRaytracer::finalize_scene()
     CUDA_CHECK(cudaMemcpy(d_quadsAabb, quadsAabb.data(),
                           sizeof(OptixAabb) * scene.quads.size(),
                           cudaMemcpyHostToDevice));
+    device_ptrs.push_back(reinterpret_cast<void*>(d_quadsAabb));
 
     // Copy Quads to cuda device
     CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_quads_list),
@@ -836,12 +933,9 @@ OptixRaytracer::finalize_scene()
                           quadsParams.data(),
                           sizeof(QuadParams) * scene.quads.size(),
                           cudaMemcpyHostToDevice));
+    device_ptrs.push_back(reinterpret_cast<void*>(d_quads_list));
 
     // Fill in Quad shaders
-    CUdeviceptr d_quadsIndexOffsetBuffer;
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_quadsIndexOffsetBuffer),
-                          scene.quads.size() * sizeof(int)));
-
     int numBuildInputs = 0;
 
     unsigned int quadSbtRecord;
@@ -862,7 +956,7 @@ OptixRaytracer::finalize_scene()
         quadsInput.numSbtRecords = 1;
         quadsInput.sbtIndexOffsetSizeInBytes   = sizeof(int);
         quadsInput.sbtIndexOffsetStrideInBytes = sizeof(int);
-        quadsInput.sbtIndexOffsetBuffer = 0;  // d_quadsIndexOffsetBuffer;
+        quadsInput.sbtIndexOffsetBuffer = 0;
         ++numBuildInputs;
     }
 
@@ -874,6 +968,7 @@ OptixRaytracer::finalize_scene()
     spheresParams.reserve(scene.spheres.size());
     std::vector<int> sphereShaders;
     sphereShaders.reserve(scene.spheres.size());
+    objID = 0;
     for (const auto& sphere : scene.spheres) {
         OptixAabb aabb;
         sphere.getBounds(aabb.minX, aabb.minY, aabb.minZ, aabb.maxX, aabb.maxY,
@@ -881,7 +976,7 @@ OptixRaytracer::finalize_scene()
         spheresAabb.push_back(aabb);
 
         SphereParams sphere_params;
-        sphere.setOptixVariables(&sphere_params);
+        sphere.setOptixVariables(&sphere_params, objID++);
         spheresParams.push_back(sphere_params);
     }
     // Copy Spheres bounding boxes to cuda device
@@ -890,6 +985,7 @@ OptixRaytracer::finalize_scene()
     CUDA_CHECK(cudaMemcpy(d_spheresAabb, spheresAabb.data(),
                           sizeof(OptixAabb) * scene.spheres.size(),
                           cudaMemcpyHostToDevice));
+    device_ptrs.push_back(reinterpret_cast<void*>(d_spheresAabb));
 
     // Copy Spheres to cuda device
     CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_spheres_list),
@@ -898,12 +994,9 @@ OptixRaytracer::finalize_scene()
                           spheresParams.data(),
                           sizeof(SphereParams) * scene.spheres.size(),
                           cudaMemcpyHostToDevice));
+    device_ptrs.push_back(reinterpret_cast<void*>(d_spheres_list));
 
     // Fill in Sphere shaders
-    CUdeviceptr d_spheresIndexOffsetBuffer;
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_spheresIndexOffsetBuffer),
-                          scene.spheres.size() * sizeof(int)));
-
     unsigned int sphereSbtRecord;
     sphereSbtRecord = OPTIX_GEOMETRY_FLAG_NONE;
     if (scene.spheres.size() > 0) {
@@ -923,7 +1016,7 @@ OptixRaytracer::finalize_scene()
         spheresInput.numSbtRecords               = 1;
         spheresInput.sbtIndexOffsetSizeInBytes   = sizeof(int);
         spheresInput.sbtIndexOffsetStrideInBytes = sizeof(int);
-        spheresInput.sbtIndexOffsetBuffer = 0;  // d_spheresIndexOffsetBuffer;
+        spheresInput.sbtIndexOffsetBuffer = 0;
         ++numBuildInputs;
     }
 
@@ -954,13 +1047,42 @@ OptixRaytracer::finalize_scene()
     CUDA_CHECK(cudaMemcpy((void*)&h_aabb, reinterpret_cast<void*>(d_aabb),
                           sizeof(OptixAabb), cudaMemcpyDeviceToHost));
     cudaFree(d_aabb);
+    cudaFree(d_temp);
+
+    // We need to free the output buffer after rendering
+    device_ptrs.push_back(d_output);
 
     // Sanity check the AS bounds
     // printf ("AABB min: [%0.6f, %0.6f, %0.6f], max: [%0.6f, %0.6f, %0.6f]\n",
     //         h_aabb.minX, h_aabb.minY, h_aabb.minZ,
     //         h_aabb.maxX, h_aabb.maxY, h_aabb.maxZ );
+}
+
+
+
+void
+OptixRaytracer::prepare_background()
+{
+    if (getBackgroundShaderID() >= 0) {
+        const int bg_res = std::max<int>(32, getBackgroundResolution());
+        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_bg_values), 3 * sizeof(float) * bg_res * bg_res));
+        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_bg_rows), sizeof(float) * bg_res));
+        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_bg_cols), sizeof(float) * bg_res * bg_res));
+
+        device_ptrs.push_back(reinterpret_cast<void*>(d_bg_values));
+        device_ptrs.push_back(reinterpret_cast<void*>(d_bg_rows));
+        device_ptrs.push_back(reinterpret_cast<void*>(d_bg_cols));
+    }
+}
+
+
 
+bool
+OptixRaytracer::finalize_scene()
+{
+    build_accel();
     make_optix_materials();
+    prepare_background();
     return true;
 }
 
@@ -986,7 +1108,6 @@ OptixRaytracer::get_texture_handle(ustring filename,
 {
     auto itr = m_samplers.find(filename);
     if (itr == m_samplers.end()) {
-        // Open image
         OIIO::ImageBuf image;
         if (!image.init_spec(filename, 0, 0)) {
             errhandler().errorfmt("Could not load: {} (hash {})", filename,
@@ -994,42 +1115,75 @@ OptixRaytracer::get_texture_handle(ustring filename,
             return (TextureHandle*)nullptr;
         }
 
-        OIIO::ROI roi = OIIO::get_roi_full(image.spec());
-        int32_t width = roi.width(), height = roi.height();
-        std::vector<float> pixels(width * height * 4);
-
-        for (int j = 0; j < height; j++) {
-            for (int i = 0; i < width; i++) {
-                image.getpixel(i, j, 0, &pixels[((j * width) + i) * 4 + 0]);
-            }
-        }
-        cudaResourceDesc res_desc = {};
+        int32_t nmiplevels = image.nmiplevels();
+        int32_t img_width  = image.xmax() + 1;
+        int32_t img_height = image.ymax() + 1;
 
         // hard-code textures to 4 channels
-        int32_t pitch = width * 4 * sizeof(float);
         cudaChannelFormatDesc channel_desc
             = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
 
-        cudaArray_t pixelArray;
-        CUDA_CHECK(cudaMallocArray(&pixelArray, &channel_desc, width, height));
+        cudaMipmappedArray_t mipmapArray;
+        cudaExtent extent = make_cudaExtent(img_width, img_height, 0);
+        CUDA_CHECK(cudaMallocMipmappedArray(&mipmapArray, &channel_desc, extent,
+                                            nmiplevels));
+
+        // Copy the pixel data for each mip level
+        std::vector<std::vector<float>> level_pixels(nmiplevels);
+        for (int32_t level = 0; level < nmiplevels; ++level) {
+            image.reset(filename, 0, level);
+            OIIO::ROI roi = OIIO::get_roi_full(image.spec());
+            if (!roi.defined()) {
+                errhandler().errorfmt(
+                    "Could not load mip level {}: {} (hash {})", level,
+                    filename, filename);
+                return (TextureHandle*)nullptr;
+            }
 
-        CUDA_CHECK(cudaMemcpy2DToArray(pixelArray, 0, 0, pixels.data(), pitch,
-                                       pitch, height, cudaMemcpyHostToDevice));
+            int32_t width = roi.width(), height = roi.height();
+            level_pixels[level].resize(width * height * 4);
+            for (int j = 0; j < height; j++) {
+                for (int i = 0; i < width; i++) {
+                    image.getpixel(i, j, 0,
+                                   &level_pixels[level][((j * width) + i) * 4]);
+                }
+            }
 
-        res_desc.resType         = cudaResourceTypeArray;
-        res_desc.res.array.array = pixelArray;
+            cudaArray_t miplevelArray;
+            CUDA_CHECK(
+                cudaGetMipmappedArrayLevel(&miplevelArray, mipmapArray, level));
 
-        cudaTextureDesc tex_desc = {};
-        tex_desc.addressMode[0]  = cudaAddressModeWrap;
-        tex_desc.addressMode[1]  = cudaAddressModeWrap;
-        tex_desc.filterMode      = cudaFilterModeLinear;
-        tex_desc.readMode
-            = cudaReadModeElementType;  //cudaReadModeNormalizedFloat;
+            // Copy the texel data into the miplevel array
+            int32_t pitch = width * 4 * sizeof(float);
+            CUDA_CHECK(cudaMemcpy2DToArray(miplevelArray, 0, 0,
+                                           level_pixels[level].data(), pitch,
+                                           pitch, height,
+                                           cudaMemcpyHostToDevice));
+        }
+
+        int32_t pitch = img_width * 4 * sizeof(float);
+        cudaArray_t pixelArray;
+        CUDA_CHECK(
+            cudaMallocArray(&pixelArray, &channel_desc, img_width, img_height));
+        CUDA_CHECK(cudaMemcpy2DToArray(pixelArray, 0, 0, level_pixels[0].data(),
+                                       pitch, pitch, img_height,
+                                       cudaMemcpyHostToDevice));
+        array_ptrs.push_back(reinterpret_cast<void*>(pixelArray));
+
+        cudaResourceDesc res_desc  = {};
+        res_desc.resType           = cudaResourceTypeMipmappedArray;
+        res_desc.res.mipmap.mipmap = mipmapArray;
+
+        cudaTextureDesc tex_desc     = {};
+        tex_desc.addressMode[0]      = cudaAddressModeWrap;
+        tex_desc.addressMode[1]      = cudaAddressModeWrap;
+        tex_desc.filterMode          = cudaFilterModeLinear;
+        tex_desc.readMode            = cudaReadModeElementType;
         tex_desc.normalizedCoords    = 1;
         tex_desc.maxAnisotropy       = 1;
-        tex_desc.maxMipmapLevelClamp = 99;
+        tex_desc.maxMipmapLevelClamp = float(nmiplevels - 1);
         tex_desc.minMipmapLevelClamp = 0;
-        tex_desc.mipmapFilterMode    = cudaFilterModePoint;
+        tex_desc.mipmapFilterMode    = cudaFilterModeLinear;
         tex_desc.borderColor[0]      = 1.0f;
         tex_desc.sRGB                = 0;
 
@@ -1076,10 +1230,17 @@ OptixRaytracer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED)
                           xres * yres * 4 * sizeof(float)));
     CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_launch_params),
                           sizeof(RenderParams)));
+    device_ptrs.push_back(reinterpret_cast<void*>(d_output_buffer));
+    device_ptrs.push_back(reinterpret_cast<void*>(d_launch_params));
 
     m_xres = xres;
     m_yres = yres;
 
+    const int aa                  = std::max(1, options.get_int("aa"));
+    const int max_bounces         = options.get_int("max_bounces");
+    const float show_albedo_scale = options.get_float("show_albedo_scale");
+    const bool no_jitter          = options.get_int("no_jitter");
+
     RenderParams params;
     params.eye.x                   = camera.eye.x;
     params.eye.y                   = camera.eye.y;
@@ -1087,14 +1248,14 @@ OptixRaytracer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED)
     params.dir.x                   = camera.dir.x;
     params.dir.y                   = camera.dir.y;
     params.dir.z                   = camera.dir.z;
-    params.cx.x                    = camera.cx.x;
-    params.cx.y                    = camera.cx.y;
-    params.cx.z                    = camera.cx.z;
-    params.cy.x                    = camera.cy.x;
-    params.cy.y                    = camera.cy.y;
-    params.cy.z                    = camera.cy.z;
-    params.invw                    = 1.0f / m_xres;
-    params.invh                    = 1.0f / m_yres;
+    params.up.x                    = camera.up.x;
+    params.up.y                    = camera.up.y;
+    params.up.z                    = camera.up.z;
+    params.fov                     = camera.fov;
+    params.aa                      = aa;
+    params.max_bounces             = max_bounces;
+    params.show_albedo_scale       = show_albedo_scale;
+    params.no_jitter               = no_jitter;
     params.interactive_params      = d_interactive_params;
     params.output_buffer           = d_output_buffer;
     params.traversal_handle        = m_travHandle;
@@ -1104,13 +1265,24 @@ OptixRaytracer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED)
     params.color_system          = d_color_system;
     params.test_str_1            = test_str_1;
     params.test_str_2            = test_str_2;
+    params.num_quads             = scene.quads.size();
+    params.quads_buffer          = d_quads_list;
+    params.num_spheres           = scene.spheres.size();
+    params.spheres_buffer        = d_spheres_list;
+
+    // For the background shader
+    params.bg_res    = std::max<int>(32, getBackgroundResolution());
+    params.bg_id     = getBackgroundShaderID();
+    params.bg_values = d_bg_values;
+    params.bg_rows   = d_bg_rows;
+    params.bg_cols   = d_bg_cols;
 
     CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_launch_params), &params,
                           sizeof(RenderParams), cudaMemcpyHostToDevice));
 
     // Set up global variables
     OPTIX_CHECK(optixLaunch(m_optix_pipeline, m_cuda_stream, d_launch_params,
-                            sizeof(RenderParams), &m_setglobals_optix_sbt, 1, 1,
+                            sizeof(RenderParams), &m_setglobals_optix_sbt, 32, 1,
                             1));
     CUDA_SYNC_CHECK();
 
@@ -1254,8 +1426,14 @@ void
 OptixRaytracer::clear()
 {
     SimpleRaytracer::clear();
-    OPTIX_CHECK(optixDeviceContextDestroy(m_optix_ctx));
-    m_optix_ctx = 0;
+    if (m_optix_pipeline) {
+        OPTIX_CHECK(optixPipelineDestroy(m_optix_pipeline));
+        m_optix_pipeline = 0;
+    }
+    if (m_optix_ctx) {
+        OPTIX_CHECK(optixDeviceContextDestroy(m_optix_ctx));
+        m_optix_ctx = 0;
+    }
 }
 
 OSL_NAMESPACE_EXIT
diff --git a/src/testrender/optixraytracer.h b/src/testrender/optixraytracer.h
index b331a8422..4c73f762b 100644
--- a/src/testrender/optixraytracer.h
+++ b/src/testrender/optixraytracer.h
@@ -14,7 +14,37 @@
 #include "render_params.h"
 #include "simpleraytracer.h"
 
-OSL_NAMESPACE_ENTER
+OSL_NAMESPACE_ENTER;
+
+
+struct State {
+    OptixModuleCompileOptions module_compile_options     = {};
+    OptixPipelineCompileOptions pipeline_compile_options = {};
+    OptixPipelineLinkOptions pipeline_link_options       = {};
+    OptixProgramGroupOptions program_options             = {};
+
+    OptixModule program_module;
+    OptixModule quad_module;
+    OptixModule sphere_module;
+    OptixModule wrapper_module;
+    OptixModule rend_lib_module;
+    OptixModule shadeops_module;
+
+    OptixProgramGroup raygen_group;
+    OptixProgramGroup miss_group;
+    OptixProgramGroup rend_lib_group;
+    OptixProgramGroup shadeops_group;
+    OptixProgramGroup setglobals_raygen_group;
+    OptixProgramGroup setglobals_miss_group;
+    OptixProgramGroup quad_hit_group;
+    OptixProgramGroup sphere_hit_group;
+    OptixProgramGroup quad_fillSG_dc_group;
+    OptixProgramGroup sphere_fillSG_dc_group;
+
+    std::vector<OptixModule> shader_modules;
+    std::vector<OptixProgramGroup> shader_groups;
+    std::vector<OptixProgramGroup> final_groups;
+};
 
 
 class OptixRaytracer final : public SimpleRaytracer {
@@ -37,6 +67,8 @@ class OptixRaytracer final : public SimpleRaytracer {
 
     bool init_optix_context(int xres, int yres);
     bool make_optix_materials();
+    void build_accel();
+    void prepare_background();
     bool finalize_scene();
     void prepare_render() override;
     void warmup() override;
@@ -44,6 +76,13 @@ class OptixRaytracer final : public SimpleRaytracer {
     void finalize_pixel_buffer() override;
     void clear() override;
 
+    void create_modules(State& state);
+    void create_programs(State& state);
+    void create_shaders(State& state);
+    void create_pipeline(State& state);
+    void create_sbt(State& state);
+    void cleanup_programs(State& state);
+
     /// Return true if the texture handle (previously returned by
     /// get_texture_handle()) is a valid texture that can be subsequently
     /// read or sampled.
@@ -80,7 +119,11 @@ class OptixRaytracer final : public SimpleRaytracer {
     CUdeviceptr d_quads_list         = 0;
     CUdeviceptr d_spheres_list       = 0;
     CUdeviceptr d_interactive_params = 0;
-    int m_xres, m_yres;
+    CUdeviceptr d_bg_values          = 0;
+    CUdeviceptr d_bg_rows            = 0;
+    CUdeviceptr d_bg_cols            = 0;
+    int m_xres = 0;
+    int m_yres = 0;
     CUdeviceptr d_osl_printf_buffer;
     CUdeviceptr d_color_system;
     uint64_t test_str_1;
@@ -98,6 +141,9 @@ class OptixRaytracer final : public SimpleRaytracer {
 
     std::string m_materials_ptx;
     std::unordered_map<ustringhash, optix::TextureSampler> m_samplers;
+
+    std::vector<void*> device_ptrs;
+    std::vector<void*> array_ptrs;
 };
 
 
diff --git a/src/testrender/raytracer.h b/src/testrender/raytracer.h
index 5308f1870..1d912fadc 100644
--- a/src/testrender/raytracer.h
+++ b/src/testrender/raytracer.h
@@ -20,6 +20,11 @@
 #    include <vector_functions.h>  // from CUDA
 #endif
 
+#ifdef __CUDACC__
+#    include "cuda/rend_lib.h"
+#    include "cuda/vec_math.h"
+#endif
+
 // The primitives don't included the intersection routines, etc., from the
 // versions in testrender, since those operations are performed on the GPU.
 //
@@ -32,7 +37,7 @@ class OptixRenderer;  // FIXME -- should not be here
 
 
 // build two vectors orthogonal to the first, assumes n is normalized
-inline void
+inline OSL_HOSTDEVICE void
 ortho(const Vec3& n, Vec3& x, Vec3& y)
 {
     x = (fabsf(n.x) > .01f ? Vec3(n.z, 0, -n.x) : Vec3(0, -n.z, n.y))
@@ -41,7 +46,6 @@ ortho(const Vec3& n, Vec3& x, Vec3& y)
 }
 
 
-// Note: not used in OptiX mode
 struct Ray {
     enum RayType {
         CAMERA       = 1,
@@ -54,6 +58,7 @@ struct Ray {
         DISPLACEMENT = 128
     };
 
+    OSL_HOSTDEVICE
     Ray(const Vec3& o, const Vec3& d, float radius, float spread,
         RayType raytype)
         : origin(o)
@@ -64,7 +69,9 @@ struct Ray {
     {
     }
 
+    OSL_HOSTDEVICE
     Vec3 point(float t) const { return origin + direction * t; }
+
     Dual2<Vec3> dual_direction() const
     {
         Dual2<Vec3> v;
@@ -75,6 +82,7 @@ struct Ray {
         return v;
     }
 
+    OSL_HOSTDEVICE
     Dual2<Vec3> point(Dual2<float> t) const
     {
         const float r = radius + spread * t.val();
@@ -94,9 +102,10 @@ struct Ray {
 
 
 struct Camera {
-    Camera() {}
+    OSL_HOSTDEVICE Camera() {}
 
     // Set where the camera sits and looks at.
+    OSL_HOSTDEVICE
     void lookat(const Vec3& eye, const Vec3& dir, const Vec3& up, float fov)
     {
         this->eye = eye;
@@ -107,6 +116,7 @@ struct Camera {
     }
 
     // Set resolution
+    OSL_HOSTDEVICE
     void resolution(int w, int h)
     {
         xres = w;
@@ -117,6 +127,7 @@ struct Camera {
     }
 
     // Compute all derived values based on camera parameters.
+    OSL_HOSTDEVICE
     void finalize()
     {
         float k    = OIIO::fast_tan(fov * float(M_PI / 360));
@@ -126,10 +137,21 @@ struct Camera {
     }
 
     // Get a ray for the given screen coordinates.
+    OSL_HOSTDEVICE
     Ray get(float x, float y) const
     {
+        // TODO: On CUDA devices, the normalize() operation can result in vector
+        // components with magnitudes slightly greater than 1.0, which can cause
+        // downstream computations to blow up and produce NaNs. Normalizing the
+        // vector again avoids this issue.
         const Vec3 v = (cx * (x * invw - 0.5f) + cy * (0.5f - y * invh) + dir)
+#ifndef __CUDACC__
                            .normalize();
+#else
+                           .normalize()
+                           .normalized();
+#endif
+
         const float cos_a = dir.dot(v);
         const float spread
             = sqrtf(invw * invh * cx.length() * cy.length() * cos_a) * cos_a;
@@ -157,18 +179,22 @@ struct Camera {
 
 
 struct Primitive {
+    OSL_HOSTDEVICE
     Primitive(int shaderID, bool isLight) : shaderID(shaderID), isLight(isLight)
     {
     }
+
+#ifndef __CUDACC__
     virtual ~Primitive() {}
+#endif
 
     int shaderid() const { return shaderID; }
     bool islight() const { return isLight; }
     void getBounds(float& minx, float& miny, float& minz, float& maxx,
                    float& maxy, float& maxz) const;
 
-#if OSL_USE_OPTIX
-    virtual void setOptixVariables(void* data) const = 0;
+#if defined(OSL_USE_OPTIX) && !defined(__CUDACC__)
+    virtual void setOptixVariables(void* data, int objID) const = 0;
 #endif
 
 private:
@@ -178,6 +204,7 @@ struct Primitive {
 
 
 struct Sphere final : public Primitive {
+    OSL_HOSTDEVICE
     Sphere(Vec3 c, float r, int shaderID, bool isLight)
         : Primitive(shaderID, isLight), c(c), r(r), r2(r * r)
     {
@@ -196,6 +223,7 @@ struct Sphere final : public Primitive {
     }
 
     // returns distance to nearest hit or 0
+    OSL_HOSTDEVICE
     Dual2<float> intersect(const Ray& r, bool self) const
     {
         Dual2<Vec3> oc   = c - r.origin;
@@ -217,6 +245,7 @@ struct Sphere final : public Primitive {
 
     Dual2<Vec3> normal(const Dual2<Vec3>& p) const { return normalize(p - c); }
 
+    OSL_HOSTDEVICE
     Dual2<Vec2> uv(const Dual2<Vec3>& /*p*/, const Dual2<Vec3>& n, Vec3& dPdu,
                    Vec3& dPdv) const
     {
@@ -254,6 +283,7 @@ struct Sphere final : public Primitive {
     }
 
     // return a direction towards a point on the sphere
+    OSL_HOSTDEVICE
     Vec3 sample(const Vec3& x, float xi, float yi, float& pdf) const
     {
         const float TWOPI = float(2 * M_PI);
@@ -270,6 +300,7 @@ struct Sphere final : public Primitive {
         return (su * (cp * sin_a) + sv * (sp * sin_a) + sw * cos_a).normalize();
     }
 
+    OSL_HOSTDEVICE
     float shapepdf(const Vec3& x, const Vec3& /*p*/) const
     {
         const float TWOPI = float(2 * M_PI);
@@ -278,14 +309,17 @@ struct Sphere final : public Primitive {
         return 1 / (TWOPI * (1 - cmax));
     }
 
-#if OSL_USE_OPTIX
-    virtual void setOptixVariables(void* data) const
+#if defined(OSL_USE_OPTIX) && !defined(__CUDACC__)
+    virtual void setOptixVariables(void* data, int objID) const
     {
         SphereParams* sphere_data = reinterpret_cast<SphereParams*>(data);
         sphere_data->c            = make_float3(c.x, c.y, c.z);
+        sphere_data->r            = r;
         sphere_data->r2           = r2;
-        sphere_data->a            = M_PI * (r2 * r2);
+        sphere_data->a            = M_PI * r2;
         sphere_data->shaderID     = shaderid();
+        sphere_data->isLight      = islight();
+        sphere_data->objID        = objID;
     }
 #endif
 
@@ -297,6 +331,7 @@ struct Sphere final : public Primitive {
 
 
 struct Quad final : public Primitive {
+    OSL_HOSTDEVICE
     Quad(const Vec3& p, const Vec3& ex, const Vec3& ey, int shaderID,
          bool isLight)
         : Primitive(shaderID, isLight), p(p), ex(ex), ey(ey)
@@ -324,6 +359,7 @@ struct Quad final : public Primitive {
     }
 
     // returns distance to nearest hit or 0
+    OSL_HOSTDEVICE
     Dual2<float> intersect(const Ray& r, bool self) const
     {
         if (self)
@@ -348,6 +384,7 @@ struct Quad final : public Primitive {
         return Dual2<Vec3>(n, Vec3(0, 0, 0), Vec3(0, 0, 0));
     }
 
+    OSL_HOSTDEVICE
     Dual2<Vec2> uv(const Dual2<Vec3>& p, const Dual2<Vec3>& /*n*/, Vec3& dPdu,
                    Vec3& dPdv) const
     {
@@ -360,6 +397,7 @@ struct Quad final : public Primitive {
     }
 
     // return a direction towards a point on the sphere
+    OSL_HOSTDEVICE
     Vec3 sample(const Vec3& x, float xi, float yi, float& pdf) const
     {
         Vec3 l   = (p + xi * ex + yi * ey) - x;
@@ -369,6 +407,7 @@ struct Quad final : public Primitive {
         return dir;
     }
 
+    OSL_HOSTDEVICE
     float shapepdf(const Vec3& x, const Vec3& p) const
     {
         Vec3 l   = p - x;
@@ -377,8 +416,8 @@ struct Quad final : public Primitive {
         return d2 / (a * fabsf(dir.dot(n)));
     }
 
-#if OSL_USE_OPTIX
-    virtual void setOptixVariables(void* data) const
+#if defined(OSL_USE_OPTIX) && !defined(__CUDACC__)
+    virtual void setOptixVariables(void* data, int objID) const
     {
         QuadParams* quad_data = reinterpret_cast<QuadParams*>(data);
         quad_data->p          = make_float3(p.x, p.y, p.z);
@@ -389,6 +428,8 @@ struct Quad final : public Primitive {
         quad_data->ev         = ev;
         quad_data->a          = a;
         quad_data->shaderID   = shaderid();
+        quad_data->isLight    = islight();
+        quad_data->objID      = objID;
     }
 #endif
 
@@ -406,7 +447,8 @@ struct Scene {
 
     int num_prims() const { return spheres.size() + quads.size(); }
 
-    bool intersect(const Ray& r, Dual2<float>& t, int& primID) const
+    bool intersect(const Ray& r, Dual2<float>& t, int& primID,
+                   const void* sg = nullptr) const
     {
         const int ns   = spheres.size();
         const int nq   = quads.size();
diff --git a/src/testrender/render_params.h b/src/testrender/render_params.h
index faba736c2..85e1fb159 100644
--- a/src/testrender/render_params.h
+++ b/src/testrender/render_params.h
@@ -15,11 +15,12 @@ struct RenderParams {
 
     float3 eye;
     float3 dir;
-    float3 cx;
-    float3 cy;
-
-    float invw;
-    float invh;
+    float3 up;
+    float fov;
+    int aa;
+    int max_bounces;
+    float show_albedo_scale;
+    bool no_jitter;
 
     CUdeviceptr traversal_handle;
     CUdeviceptr output_buffer;
@@ -38,19 +39,34 @@ struct RenderParams {
     // for used-data tests
     uint64_t test_str_1;
     uint64_t test_str_2;
+
+    uint64_t num_spheres;
+    uint64_t num_quads;
+    CUdeviceptr spheres_buffer;
+    CUdeviceptr quads_buffer;
+
+    // for the background
+    int bg_res;
+    int bg_id;
+    CUdeviceptr bg_values;
+    CUdeviceptr bg_rows;
+    CUdeviceptr bg_cols;
 };
 
 
 
 struct PrimitiveParams {
     float a;  // area
-    unsigned int shaderID;
+    int32_t shaderID;
+    int32_t objID;
+    bool isLight;
 };
 
 
 
 struct SphereParams : PrimitiveParams {
     float3 c;  // center
+    float r;   // radius
     float r2;  // radius ^2
 };
 
diff --git a/src/testrender/sampling.h b/src/testrender/sampling.h
index 3266e72d9..5f1a7bc47 100644
--- a/src/testrender/sampling.h
+++ b/src/testrender/sampling.h
@@ -14,7 +14,7 @@ OSL_NAMESPACE_ENTER
 
 struct TangentFrame {
     // build frame from unit normal
-    TangentFrame(const Vec3& n) : w(n)
+    OSL_HOSTDEVICE TangentFrame(const Vec3& n) : w(n)
     {
         u = (fabsf(w.x) > .01f ? Vec3(w.z, 0, -w.x) : Vec3(0, -w.z, w.y))
                 .normalize();
@@ -22,25 +22,25 @@ struct TangentFrame {
     }
 
     // build frame from unit normal and unit tangent
-    TangentFrame(const Vec3& n, const Vec3& t) : w(n)
+    OSL_HOSTDEVICE TangentFrame(const Vec3& n, const Vec3& t) : w(n)
     {
         v = w.cross(t);
         u = v.cross(w);
     }
 
     // transform vector
-    Vec3 get(float x, float y, float z) const { return x * u + y * v + z * w; }
+    OSL_HOSTDEVICE Vec3 get(float x, float y, float z) const { return x * u + y * v + z * w; }
 
     // untransform vector
-    float getx(const Vec3& a) const { return a.dot(u); }
-    float gety(const Vec3& a) const { return a.dot(v); }
-    float getz(const Vec3& a) const { return a.dot(w); }
+    OSL_HOSTDEVICE float getx(const Vec3& a) const { return a.dot(u); }
+    OSL_HOSTDEVICE float gety(const Vec3& a) const { return a.dot(v); }
+    OSL_HOSTDEVICE float getz(const Vec3& a) const { return a.dot(w); }
 
-    Vec3 tolocal(const Vec3& a) const
+    OSL_HOSTDEVICE Vec3 tolocal(const Vec3& a) const
     {
         return Vec3(a.dot(u), a.dot(v), a.dot(w));
     }
-    Vec3 toworld(const Vec3& a) const { return get(a.x, a.y, a.z); }
+    OSL_HOSTDEVICE Vec3 toworld(const Vec3& a) const { return get(a.x, a.y, a.z); }
 
 private:
     Vec3 u, v, w;
@@ -49,7 +49,8 @@ struct TangentFrame {
 struct Sampling {
     /// Warp the unit disk onto the unit sphere
     /// http://psgraphics.blogspot.com/2011/01/improved-code-for-concentric-map.html
-    static void to_unit_disk(float& x, float& y)
+    static OSL_HOSTDEVICE void
+    to_unit_disk(float& x, float& y)
     {
         const float PI_OVER_4 = float(M_PI_4);
         const float PI_OVER_2 = float(M_PI_2);
@@ -71,8 +72,9 @@ struct Sampling {
         y *= r;
     }
 
-    static void sample_cosine_hemisphere(const Vec3& N, float rndx, float rndy,
-                                         Vec3& out, float& pdf)
+    static OSL_HOSTDEVICE void
+    sample_cosine_hemisphere(const Vec3& N, float rndx, float rndy,
+                             Vec3& out, float& pdf)
     {
         to_unit_disk(rndx, rndy);
         float cos_theta = sqrtf(std::max(1 - rndx * rndx - rndy * rndy, 0.0f));
@@ -81,8 +83,9 @@ struct Sampling {
         pdf = cos_theta * float(M_1_PI);
     }
 
-    static void sample_uniform_hemisphere(const Vec3& N, float rndx, float rndy,
-                                          Vec3& out, float& pdf)
+    static OSL_HOSTDEVICE void
+    sample_uniform_hemisphere(const Vec3& N, float rndx, float rndy,
+                              Vec3& out, float& pdf)
     {
         float phi       = float(2 * M_PI) * rndx;
         float cos_theta = rndy;
@@ -107,7 +110,8 @@ struct MIS {
     // Centralizing the handling of the pdfs this way ensures that all numerical
     // cases can be enumerated and handled robustly without arbitrary epsilons.
     template<MISMode mode>
-    static inline float power_heuristic(float sampled_pdf, float other_pdf)
+    static inline OSL_HOSTDEVICE float
+    power_heuristic(float sampled_pdf, float other_pdf)
     {
         // NOTE: inf is ok!
         assert(sampled_pdf >= 0);
@@ -148,14 +152,25 @@ struct MIS {
     // such as a BRDF mixture. This updates a (weight, pdf) pair with a new one
     // to represent the sum of both. b is the probability of choosing the provided
     // weight. A running sum should be started with a weight and pdf of 0.
-    static inline void update_eval(Color3* w, float* pdf, Color3 ow, float opdf,
-                                   float b)
+    static inline OSL_HOSTDEVICE void
+    update_eval(Color3* w, float* pdf, Color3 ow, float opdf,
+                float b)
     {
+        // TODO: Need to handle these cases for CUDA. Clamping might be okay if the
+        // values are only slightly out of range
+
+#ifdef __CUDACC__
+        // Check for those pesky NaNs
+        assert(*pdf == *pdf);
+        assert(b == b);
+        assert(opdf == opdf);
+#endif
+
         // NOTE: inf is ok!
-        assert(*pdf >= 0);
-        assert(opdf >= 0);
-        assert(b >= 0);
-        assert(b <= 1);
+        assert(*pdf >= 0.0f);
+        assert(opdf >= 0.0f);
+        assert(b >= 0.0f);
+        assert(b <= 1.0f);
 
         // make sure 1 / b is not inf
         // note that if the weight has components > 1 ow can still overflow, but
@@ -182,6 +197,7 @@ struct MIS {
 // "Practical Hash-based Owen Scrambling" - Brent Burley - JCGT 2020
 //    https://jcgt.org/published/0009/04/01/
 struct Sampler {
+    OSL_HOSTDEVICE
     Sampler(int px, int py, int si)
         : seed(((px & 2047) << 22) | ((py & 2047) << 11))
         , index(reversebits(si))
@@ -189,6 +205,7 @@ struct Sampler {
         assert(si < (1 << 24));
     }
 
+    OSL_HOSTDEVICE
     Vec3 get()
     {
         static const uint32_t zmatrix[24] = {
@@ -223,7 +240,7 @@ struct Sampler {
 private:
     uint32_t seed, index;
 
-    static uint32_t hash(uint32_t s)
+    static OSL_HOSTDEVICE uint32_t hash(uint32_t s)
     {
         // https://github.com/skeeto/hash-prospector
         s ^= s >> 16;
@@ -234,7 +251,7 @@ struct Sampler {
         return s;
     }
 
-    static uint32_t reversebits(uint32_t x)
+    static OSL_HOSTDEVICE uint32_t reversebits(uint32_t x)
     {
 #if defined(__clang__)
         return __builtin_bitreverse32(x);
@@ -248,7 +265,7 @@ struct Sampler {
 #endif
     }
 
-    static uint32_t owen_scramble(uint32_t p, uint32_t s)
+    static OSL_HOSTDEVICE uint32_t owen_scramble(uint32_t p, uint32_t s)
     {
         // https://psychopath.io/post/2021_01_30_building_a_better_lk_hash
         // assumes reversed input
diff --git a/src/testrender/shading.cpp b/src/testrender/shading.cpp
index 73821a73c..154f50e61 100644
--- a/src/testrender/shading.cpp
+++ b/src/testrender/shading.cpp
@@ -8,238 +8,79 @@
 #include "optics.h"
 #include "sampling.h"
 
+#ifdef __CUDACC__
+#    include "cuda/vec_math.h"
+#endif
+
+
 using namespace OSL;
 
+
+// TODO: This is a little clumsy. Is it necessary?
+#ifndef __CUDACC__
+using ShaderGlobalsType = OSL::ShaderGlobals;
+#else
+using ShaderGlobalsType = OSL_CUDA::ShaderGlobals;
+#endif
+
+
 namespace {  // anonymous namespace
 
 using OIIO::clamp;
+using OSL::dot;
 
-Color3
+OSL_HOSTDEVICE Color3
 clamp(const Color3& c, float min, float max)
 {
     return Color3(clamp(c.x, min, max), clamp(c.y, min, max),
                   clamp(c.z, min, max));
 }
 
-bool
+OSL_HOSTDEVICE bool
 is_black(const Color3& c)
 {
     return c.x == 0 && c.y == 0 && c.z == 0;
 }
 
 
+OSL_HOSTDEVICE Color3
+MxDielectricParams::evalR(float cos_theta) const
+{
+    return reflection_tint * fresnel_dielectric(cos_theta, ior);
+}
 
-// unique identifier for each closure supported by testrender
-enum ClosureIDs {
-    EMISSION_ID = 1,
-    BACKGROUND_ID,
-    DIFFUSE_ID,
-    OREN_NAYAR_ID,
-    TRANSLUCENT_ID,
-    PHONG_ID,
-    WARD_ID,
-    MICROFACET_ID,
-    REFLECTION_ID,
-    FRESNEL_REFLECTION_ID,
-    REFRACTION_ID,
-    TRANSPARENT_ID,
-    // See MATERIALX_CLOSURES in stdosl.h
-    MX_OREN_NAYAR_DIFFUSE_ID,
-    MX_BURLEY_DIFFUSE_ID,
-    MX_DIELECTRIC_ID,
-    MX_CONDUCTOR_ID,
-    MX_GENERALIZED_SCHLICK_ID,
-    MX_TRANSLUCENT_ID,
-    MX_TRANSPARENT_ID,
-    MX_SUBSURFACE_ID,
-    MX_SHEEN_ID,
-    MX_UNIFORM_EDF_ID,
-    MX_ANISOTROPIC_VDF_ID,
-    MX_MEDIUM_VDF_ID,
-    MX_LAYER_ID,
-    // TODO: adding vdfs would require extending testrender with volume support ...
-};
-
-// these structures hold the parameters of each closure type
-// they will be contained inside ClosureComponent
-struct EmptyParams {};
-struct DiffuseParams {
-    Vec3 N;
-};
-struct OrenNayarParams {
-    Vec3 N;
-    float sigma;
-};
-struct PhongParams {
-    Vec3 N;
-    float exponent;
-};
-struct WardParams {
-    Vec3 N, T;
-    float ax, ay;
-};
-struct ReflectionParams {
-    Vec3 N;
-    float eta;
-};
-struct RefractionParams {
-    Vec3 N;
-    float eta;
-};
-struct MicrofacetParams {
-    ustringhash dist;
-    Vec3 N, U;
-    float xalpha, yalpha, eta;
-    int refract;
-};
-
-// MATERIALX_CLOSURES
-
-struct MxOrenNayarDiffuseParams {
-    Vec3 N;
-    Color3 albedo;
-    float roughness;
-    // optional
-    ustringhash label;
-    int energy_compensation;
-};
-
-struct MxBurleyDiffuseParams {
-    Vec3 N;
-    Color3 albedo;
-    float roughness;
-    // optional
-    ustringhash label;
-};
-
-// common to all MaterialX microfacet closures
-struct MxMicrofacetBaseParams {
-    Vec3 N, U;
-    float roughness_x;
-    float roughness_y;
-    ustringhash distribution;
-    // optional
-    ustringhash label;
-};
-
-struct MxDielectricParams : public MxMicrofacetBaseParams {
-    Color3 reflection_tint;
-    Color3 transmission_tint;
-    float ior;
-    // optional
-    float thinfilm_thickness;
-    float thinfilm_ior;
-
-    Color3 evalR(float cos_theta) const
-    {
-        return reflection_tint * fresnel_dielectric(cos_theta, ior);
-    }
-
-    Color3 evalT(float cos_theta) const
-    {
-        return transmission_tint * (1.0f - fresnel_dielectric(cos_theta, ior));
-    }
-};
-
-struct MxConductorParams : public MxMicrofacetBaseParams {
-    Color3 ior;
-    Color3 extinction;
-    // optional
-    float thinfilm_thickness;
-    float thinfilm_ior;
-
-    Color3 evalR(float cos_theta) const
-    {
-        return fresnel_conductor(cos_theta, ior, extinction);
-    }
-
-    Color3 evalT(float cos_theta) const { return Color3(0.0f); }
-
-    // Avoid function was declared but never referenced
-    // float get_ior() const
-    // {
-    //     return 0;  // no transmission possible
-    // }
-};
-
-struct MxGeneralizedSchlickParams : public MxMicrofacetBaseParams {
-    Color3 reflection_tint;
-    Color3 transmission_tint;
-    Color3 f0;
-    Color3 f90;
-    float exponent;
-    // optional
-    float thinfilm_thickness;
-    float thinfilm_ior;
-
-    Color3 evalR(float cos_theta) const
-    {
-        return reflection_tint
-               * fresnel_generalized_schlick(cos_theta, f0, f90, exponent);
-    }
-
-    Color3 evalT(float cos_theta) const
-    {
-        return transmission_tint
-               * (Color3(1.0f)
-                  - fresnel_generalized_schlick(cos_theta, f0, f90, exponent));
-    }
-};
-
-struct MxTranslucentParams {
-    Vec3 N;
-    Color3 albedo;
-    // optional
-    ustringhash label;
-};
-
-struct MxSubsurfaceParams {
-    Vec3 N;
-    Color3 albedo;
-    float transmission_depth;
-    Color3 transmission_color;
-    float anisotropy;
-    // optional
-    ustringhash label;
-};
-
-struct MxSheenParams {
-    Vec3 N;
-    Color3 albedo;
-    float roughness;
-    // optional
-    ustringhash label;
-};
+OSL_HOSTDEVICE Color3
+MxDielectricParams::evalT(float cos_theta) const
+{
+    return transmission_tint * (1.0f - fresnel_dielectric(cos_theta, ior));
+}
 
-struct MxUniformEdfParams {
-    Color3 emittance;
-    // optional
-    ustringhash label;
-};
+OSL_HOSTDEVICE Color3
+MxConductorParams::evalR(float cos_theta) const
+{
+    return fresnel_conductor(cos_theta, ior, extinction);
+}
 
-struct MxLayerParams {
-    OSL::ClosureColor* top;
-    OSL::ClosureColor* base;
-};
+OSL_HOSTDEVICE Color3
+MxConductorParams::evalT(float cos_theta) const
+{
+    return Color3(0.0f);
+}
 
-struct MxAnisotropicVdfParams {
-    Color3 albedo;
-    Color3 extinction;
-    float anisotropy;
-    // optional
-    ustringhash label;
-};
+OSL_HOSTDEVICE Color3
+MxGeneralizedSchlickParams::evalR(float cos_theta) const
+{
+    return reflection_tint
+           * fresnel_generalized_schlick(cos_theta, f0, f90, exponent);
+}
 
-struct MxMediumVdfParams {
-    Color3 albedo;
-    float transmission_depth;
-    Color3 transmission_color;
-    float anisotropy;
-    float ior;
-    int priority;
-    // optional
-    ustringhash label;
-};
+OSL_HOSTDEVICE Color3
+MxGeneralizedSchlickParams::evalT(float cos_theta) const
+{
+    return transmission_tint
+           * (Color3(1.0f)
+              - fresnel_generalized_schlick(cos_theta, f0, f90, exponent));
+}
 
 }  // anonymous namespace
 
@@ -247,6 +88,7 @@ struct MxMediumVdfParams {
 OSL_NAMESPACE_ENTER
 
 
+#if !defined(__CUDACC__)
 void
 register_closures(OSL::ShadingSystem* shadingsys)
 {
@@ -433,24 +275,25 @@ register_closures(OSL::ShadingSystem* shadingsys)
     for (const BuiltinClosures& b : builtins)
         shadingsys->register_closure(b.name, b.id, b.params, nullptr, nullptr);
 }
+#endif // !defined(__CUDACC__)
 
 OSL_NAMESPACE_EXIT
 
 namespace {  // anonymous namespace
 
 template<int trans> struct Diffuse final : public BSDF, DiffuseParams {
-    Diffuse(const DiffuseParams& params) : BSDF(), DiffuseParams(params)
+    OSL_HOSTDEVICE Diffuse(const DiffuseParams& params) : BSDF(DIFFUSE_ID), DiffuseParams(params)
     {
         if (trans)
             N = -N;
     }
-    Sample eval(const Vec3& /*wo*/, const OSL::Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& /*wo*/, const OSL::Vec3& wi) const
     {
         const float pdf = std::max(N.dot(wi), 0.0f) * float(M_1_PI);
         return { wi, Color3(1.0f), pdf, 1.0f };
     }
-    Sample sample(const Vec3& /*wo*/, float rx, float ry,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& /*wo*/, float rx, float ry,
+                  float /*rz*/) const
     {
         Vec3 out_dir;
         float pdf;
@@ -460,10 +303,10 @@ template<int trans> struct Diffuse final : public BSDF, DiffuseParams {
 };
 
 struct OrenNayar final : public BSDF, OrenNayarParams {
-    OrenNayar(const OrenNayarParams& params) : BSDF(), OrenNayarParams(params)
+    OSL_HOSTDEVICE OrenNayar(const OrenNayarParams& params) : BSDF(OREN_NAYAR_ID), OrenNayarParams(params)
     {
     }
-    Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const
     {
         float NL = N.dot(wi);
         float NV = N.dot(wo);
@@ -485,8 +328,8 @@ struct OrenNayar final : public BSDF, OrenNayarParams {
         }
         return {};
     }
-    Sample sample(const Vec3& wo, float rx, float ry,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float /*rz*/) const
     {
         Vec3 out_dir;
         float pdf;
@@ -496,11 +339,14 @@ struct OrenNayar final : public BSDF, OrenNayarParams {
 };
 
 struct EnergyCompensatedOrenNayar : public BSDF, MxOrenNayarDiffuseParams {
+    OSL_HOSTDEVICE
     EnergyCompensatedOrenNayar(const MxOrenNayarDiffuseParams& params)
-        : BSDF(), MxOrenNayarDiffuseParams(params)
+        : BSDF(MX_OREN_NAYAR_DIFFUSE_ID), MxOrenNayarDiffuseParams(params)
     {
     }
-    Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override
+
+    OSL_HOSTDEVICE
+    Sample eval(const Vec3& wo, const OSL::Vec3& wi) const
     {
         float NL = N.dot(wi);
         float NV = N.dot(wo);
@@ -537,8 +383,9 @@ struct EnergyCompensatedOrenNayar : public BSDF, MxOrenNayarDiffuseParams {
         return {};
     }
 
+    OSL_HOSTDEVICE
     Sample sample(const Vec3& wo, float rx, float ry,
-                  float /*rz*/) const override
+                  float /*rz*/) const
     {
         Vec3 out_dir;
         float pdf;
@@ -551,6 +398,7 @@ struct EnergyCompensatedOrenNayar : public BSDF, MxOrenNayarDiffuseParams {
     static constexpr float constant2_FON = float(2.0 / 3.0
                                                  - 28.0 / (15.0 * M_PI));
 
+    OSL_HOSTDEVICE
     float E_FON_analytic(float mu) const
     {
         const float sigma = roughness;
@@ -567,8 +415,8 @@ struct EnergyCompensatedOrenNayar : public BSDF, MxOrenNayarDiffuseParams {
 };
 
 struct Phong final : public BSDF, PhongParams {
-    Phong(const PhongParams& params) : BSDF(), PhongParams(params) {}
-    Sample eval(const Vec3& wo, const Vec3& wi) const override
+    OSL_HOSTDEVICE Phong(const PhongParams& params) : BSDF(PHONG_ID), PhongParams(params) {}
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const
     {
         float cosNI = N.dot(wi);
         float cosNO = N.dot(wo);
@@ -585,8 +433,8 @@ struct Phong final : public BSDF, PhongParams {
         }
         return {};
     }
-    Sample sample(const Vec3& wo, float rx, float ry,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                  float /*rz*/) const
     {
         float cosNO = N.dot(wo);
         if (cosNO > 0) {
@@ -607,8 +455,8 @@ struct Phong final : public BSDF, PhongParams {
 };
 
 struct Ward final : public BSDF, WardParams {
-    Ward(const WardParams& params) : BSDF(), WardParams(params) {}
-    Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override
+    OSL_HOSTDEVICE Ward(const WardParams& params) : BSDF(WARD_ID), WardParams(params) {}
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const
     {
         float cosNO = N.dot(wo);
         float cosNI = N.dot(wi);
@@ -632,8 +480,8 @@ struct Ward final : public BSDF, WardParams {
         }
         return {};
     }
-    Sample sample(const Vec3& wo, float rx, float ry,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float /*rz*/) const
     {
         float cosNO = N.dot(wo);
         if (cosNO > 0) {
@@ -710,17 +558,17 @@ struct Ward final : public BSDF, WardParams {
  * is sufficient).
  */
 struct GGXDist {
-    static float F(const float tan_m2)
+    static OSL_HOSTDEVICE float F(const float tan_m2)
     {
         return 1 / (float(M_PI) * (1 + tan_m2) * (1 + tan_m2));
     }
 
-    static float Lambda(const float a2)
+    static OSL_HOSTDEVICE float Lambda(const float a2)
     {
         return 0.5f * (-1.0f + sqrtf(1.0f + 1.0f / a2));
     }
 
-    static Vec2 sampleSlope(float cos_theta, float randu, float randv)
+    static OSL_HOSTDEVICE Vec2 sampleSlope(float cos_theta, float randu, float randv)
     {
         // GGX
         Vec2 slope;
@@ -747,12 +595,12 @@ struct GGXDist {
 };
 
 struct BeckmannDist {
-    static float F(const float tan_m2)
+    static OSL_HOSTDEVICE float F(const float tan_m2)
     {
         return float(1 / M_PI) * OIIO::fast_exp(-tan_m2);
     }
 
-    static float Lambda(const float a2)
+    static OSL_HOSTDEVICE float Lambda(const float a2)
     {
         const float a = sqrtf(a2);
         return a < 1.6f ? (1.0f - 1.259f * a + 0.396f * a2)
@@ -760,7 +608,7 @@ struct BeckmannDist {
                         : 0.0f;
     }
 
-    static Vec2 sampleSlope(float cos_theta, float randu, float randv)
+    static OSL_HOSTDEVICE Vec2 sampleSlope(float cos_theta, float randu, float randv)
     {
         const float SQRT_PI_INV = 1 / sqrtf(float(M_PI));
         float ct                = cos_theta < 1e-6f ? 1e-6f : cos_theta;
@@ -808,14 +656,14 @@ struct BeckmannDist {
 
 template<typename Distribution, int Refract>
 struct Microfacet final : public BSDF, MicrofacetParams {
-    Microfacet(const MicrofacetParams& params)
-        : BSDF()
+    OSL_HOSTDEVICE Microfacet(const MicrofacetParams& params)
+        : BSDF(MICROFACET_ID)
         , MicrofacetParams(params)
         , tf(U == Vec3(0) || xalpha == yalpha ? TangentFrame(N)
                                               : TangentFrame(N, U))
     {
     }
-    Color3 get_albedo(const Vec3& wo) const override
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const
     {
         if (Refract == 2)
             return Color3(1.0f);
@@ -824,7 +672,7 @@ struct Microfacet final : public BSDF, MicrofacetParams {
         float fr = fresnel_dielectric(N.dot(wo), eta);
         return Color3(Refract ? 1 - fr : fr);
     }
-    Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const
     {
         const Vec3 wo_l = tf.tolocal(wo);
         const Vec3 wi_l = tf.tolocal(wi);
@@ -891,7 +739,7 @@ struct Microfacet final : public BSDF, MicrofacetParams {
         return {};
     }
 
-    Sample sample(const Vec3& wo, float rx, float ry, float rz) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, float rz) const
     {
         const Vec3 wo_l   = tf.tolocal(wo);
         const float cosNO = wo_l.z;
@@ -917,8 +765,9 @@ struct Microfacet final : public BSDF, MicrofacetParams {
             if (Refract == 2) {
                 pdf *= F;
                 return { wi, Color3(out), pdf, std::max(xalpha, yalpha) };
-            } else
+            } else {
                 return { wi, Color3(F * out), pdf, std::max(xalpha, yalpha) };
+            }
         } else {
             const Vec3 M = tf.toworld(m);
             Vec3 wi;
@@ -938,6 +787,7 @@ struct Microfacet final : public BSDF, MicrofacetParams {
 
             float pdf = (fabsf(cosHI * cosHO) * (eta * eta) * (G1 * D) * invHt2)
                         / fabsf(wo_l.z);
+
             float out = G2 / G1;
             if (Refract == 2) {
                 pdf *= Ft;
@@ -949,9 +799,9 @@ struct Microfacet final : public BSDF, MicrofacetParams {
     }
 
 private:
-    static float SQR(float x) { return x * x; }
+    static OSL_HOSTDEVICE float SQR(float x) { return x * x; }
 
-    float evalLambda(const Vec3 w) const
+    OSL_HOSTDEVICE float evalLambda(const Vec3 w) const
     {
         float cosTheta2 = SQR(w.z);
         /* Have these two multiplied by sinTheta^2 for convenience */
@@ -960,15 +810,15 @@ struct Microfacet final : public BSDF, MicrofacetParams {
         return Distribution::Lambda(cosTheta2 / (cosPhi2st2 + sinPhi2st2));
     }
 
-    static float evalG2(float Lambda_i, float Lambda_o)
+    static OSL_HOSTDEVICE float evalG2(float Lambda_i, float Lambda_o)
     {
         // correlated masking-shadowing
         return 1 / (Lambda_i + Lambda_o + 1);
     }
 
-    static float evalG1(float Lambda_v) { return 1 / (Lambda_v + 1); }
+    static OSL_HOSTDEVICE float evalG1(float Lambda_v) { return 1 / (Lambda_v + 1); }
 
-    float evalD(const Vec3 Hr) const
+    OSL_HOSTDEVICE float evalD(const Vec3 Hr) const
     {
         float cosThetaM = Hr.z;
         if (cosThetaM > 0) {
@@ -985,7 +835,7 @@ struct Microfacet final : public BSDF, MicrofacetParams {
         return 0;
     }
 
-    Vec3 sampleMicronormal(const Vec3 wo, float randu, float randv) const
+    OSL_HOSTDEVICE Vec3 sampleMicronormal(const Vec3 wo, float randu, float randv) const
     {
         /* Project wo and stretch by alpha values */
         Vec3 swo = wo;
@@ -993,6 +843,14 @@ struct Microfacet final : public BSDF, MicrofacetParams {
         swo.y *= yalpha;
         swo = swo.normalize();
 
+#ifdef __CUDACC__
+        // TODO: For some reason, after being normalized, swo can end up with components
+        // with magnitudes slightly greater than 1.0, which makes subsequent operations
+        // start producing NaNs. Normalizing this vector again fixes the issue, but it's
+        // a pretty ugly hack...
+        swo = swo.normalize();
+#endif
+
         // figure out angles for the incoming vector
         float cos_theta = std::max(swo.z, 0.0f);
         float cos_phi   = 1;
@@ -1015,6 +873,7 @@ struct Microfacet final : public BSDF, MicrofacetParams {
         float mlen = sqrtf(s.x * s.x + s.y * s.y + 1);
         Vec3 m(fabsf(s.x) < mlen ? -s.x / mlen : 1.0f,
                fabsf(s.y) < mlen ? -s.y / mlen : 1.0f, 1.0f / mlen);
+
         return m;
     }
 
@@ -1030,11 +889,12 @@ typedef Microfacet<BeckmannDist, 2> MicrofacetBeckmannBoth;
 
 
 // We use the CRTP to inherit the parameters because each MaterialX closure uses a different set of parameters
-template<typename MxMicrofacetParams, typename Distribution,
+template<typename MxMicrofacetParams, typename Distribution, ClosureIDs ID,
          bool EnableTransmissionLobe>
 struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
-    MxMicrofacet(const MxMicrofacetParams& params, float refraction_ior)
-        : BSDF()
+    OSL_HOSTDEVICE MxMicrofacet(const MxMicrofacetParams& params,
+                                float refraction_ior)
+        : BSDF(ID)
         , MxMicrofacetParams(params)
         , tf(MxMicrofacetParams::U == Vec3(0)
                      || MxMicrofacetParams::roughness_x
@@ -1045,7 +905,7 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
     {
     }
 
-    float get_fresnel_angle(float cos_theta) const
+    OSL_HOSTDEVICE float get_fresnel_angle(float cos_theta) const
     {
         if (EnableTransmissionLobe && refraction_ior < 1) {
             // handle TIR if we are on the backside
@@ -1059,7 +919,7 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
         return cos_theta;
     }
 
-    Color3 get_albedo(const Vec3& wo) const override
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const
     {
         // if transmission is enabled, punt on
         if (EnableTransmissionLobe)
@@ -1071,7 +931,7 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
             get_fresnel_angle(MxMicrofacetParams::N.dot(wo)));
     }
 
-    Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const
     {
         const Vec3 wo_l = tf.tolocal(wo);
         const Vec3 wi_l = tf.tolocal(wi);
@@ -1150,7 +1010,7 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
     }
 
 
-    Sample sample(const Vec3& wo, float rx, float ry, float rz) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, float rz) const
     {
         const Vec3 wo_l   = tf.tolocal(wo);
         const float cosNO = wo_l.z;
@@ -1225,9 +1085,9 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
     }
 
 private:
-    static float SQR(float x) { return x * x; }
+    static OSL_HOSTDEVICE float SQR(float x) { return x * x; }
 
-    float evalLambda(const Vec3 w) const
+    OSL_HOSTDEVICE float evalLambda(const Vec3 w) const
     {
         float cosTheta2 = SQR(w.z);
         /* Have these two multiplied by sinTheta^2 for convenience */
@@ -1236,15 +1096,15 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
         return Distribution::Lambda(cosTheta2 / (cosPhi2st2 + sinPhi2st2));
     }
 
-    static float evalG2(float Lambda_i, float Lambda_o)
+    static OSL_HOSTDEVICE float evalG2(float Lambda_i, float Lambda_o)
     {
         // correlated masking-shadowing
         return 1 / (Lambda_i + Lambda_o + 1);
     }
 
-    static float evalG1(float Lambda_v) { return 1 / (Lambda_v + 1); }
+    static OSL_HOSTDEVICE float evalG1(float Lambda_v) { return 1 / (Lambda_v + 1); }
 
-    float evalD(const Vec3 Hr) const
+    OSL_HOSTDEVICE float evalD(const Vec3 Hr) const
     {
         float cosThetaM = Hr.z;
         if (cosThetaM > 0) {
@@ -1256,14 +1116,26 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
 
             float tanThetaM2 = (cosPhi2st2 + sinPhi2st2) / cosThetaM2;
 
+#ifndef __CUDACC__
             return Distribution::F(tanThetaM2)
                    / (MxMicrofacetParams::roughness_x
                       * MxMicrofacetParams::roughness_y * cosThetaM4);
+#else
+            // Division can result in NaN if both inputs are 0.0, so only
+            // perform the divide if the numerator is not 0.0.
+            const float dist_val = Distribution::F(tanThetaM2);
+            if (dist_val == 0.0f)
+                return 0.0f;
+            else
+                return dist_val
+                       / (MxMicrofacetParams::roughness_x
+                          * MxMicrofacetParams::roughness_y * cosThetaM4);
+#endif
         }
         return 0;
     }
 
-    Vec3 sampleMicronormal(const Vec3 wo, float randu, float randv) const
+    OSL_HOSTDEVICE Vec3 sampleMicronormal(const Vec3 wo, float randu, float randv) const
     {
         /* Project wo and stretch by alpha values */
         Vec3 swo = wo;
@@ -1301,23 +1173,23 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
 };
 
 struct Reflection final : public BSDF, ReflectionParams {
-    Reflection(const ReflectionParams& params)
-        : BSDF(), ReflectionParams(params)
+    OSL_HOSTDEVICE Reflection(const ReflectionParams& params)
+        : BSDF(REFLECTION_ID), ReflectionParams(params)
     {
     }
-    Color3 get_albedo(const Vec3& wo) const override
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const
     {
         float cosNO = N.dot(wo);
         if (cosNO > 0)
             return Color3(fresnel_dielectric(cosNO, eta));
         return Color3(1);
     }
-    Sample eval(const Vec3& /*wo*/, const OSL::Vec3& /*wi*/) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& /*wo*/, const OSL::Vec3& /*wi*/) const
     {
         return {};
     }
-    Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/,
+                  float /*rz*/) const
     {
         // only one direction is possible
         float cosNO = dot(N, wo);
@@ -1331,21 +1203,21 @@ struct Reflection final : public BSDF, ReflectionParams {
 };
 
 struct Refraction final : public BSDF, RefractionParams {
-    Refraction(const RefractionParams& params)
-        : BSDF(), RefractionParams(params)
+    OSL_HOSTDEVICE Refraction(const RefractionParams& params)
+        : BSDF(REFRACTION_ID), RefractionParams(params)
     {
     }
-    Color3 get_albedo(const Vec3& wo) const override
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const
     {
         float cosNO = N.dot(wo);
         return Color3(1 - fresnel_dielectric(cosNO, eta));
     }
-    Sample eval(const Vec3& /*wo*/, const OSL::Vec3& /*wi*/) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& /*wo*/, const OSL::Vec3& /*wi*/) const
     {
         return {};
     }
-    Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/,
+                  float /*rz*/) const
     {
         float pdf = std::numeric_limits<float>::infinity();
         Vec3 wi;
@@ -1355,13 +1227,13 @@ struct Refraction final : public BSDF, RefractionParams {
 };
 
 struct Transparent final : public BSDF {
-    Transparent() : BSDF() {}
-    Sample eval(const Vec3& /*wo*/, const Vec3& /*wi*/) const override
+    OSL_HOSTDEVICE Transparent() : BSDF(TRANSPARENT_ID) {}
+    OSL_HOSTDEVICE Sample eval(const Vec3& /*wo*/, const Vec3& /*wi*/) const
     {
         return {};
     }
-    Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/,
+                  float /*rz*/) const
     {
         Vec3 wi   = -wo;
         float pdf = std::numeric_limits<float>::infinity();
@@ -1370,14 +1242,14 @@ struct Transparent final : public BSDF {
 };
 
 struct MxBurleyDiffuse final : public BSDF, MxBurleyDiffuseParams {
-    MxBurleyDiffuse(const MxBurleyDiffuseParams& params)
-        : BSDF(), MxBurleyDiffuseParams(params)
+    OSL_HOSTDEVICE MxBurleyDiffuse(const MxBurleyDiffuseParams& params)
+        : BSDF(MX_BURLEY_DIFFUSE_ID), MxBurleyDiffuseParams(params)
     {
     }
 
-    Color3 get_albedo(const Vec3& wo) const override { return albedo; }
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const { return albedo; }
 
-    Sample eval(const Vec3& wo, const Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const
     {
         const Vec3 L = wi, V = wo;
         const Vec3 H = (L + V).normalize();
@@ -1391,7 +1263,7 @@ struct MxBurleyDiffuse final : public BSDF, MxBurleyDiffuseParams {
         return { wi, albedo * refL * refV, pdf, 1.0f };
     }
 
-    Sample sample(const Vec3& wo, float rx, float ry, float rz) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, float rz) const
     {
         Vec3 out_dir;
         float pdf;
@@ -1401,9 +1273,12 @@ struct MxBurleyDiffuse final : public BSDF, MxBurleyDiffuseParams {
 };
 
 struct MxSheen final : public BSDF, MxSheenParams {
-    MxSheen(const MxSheenParams& params) : BSDF(), MxSheenParams(params) {}
+    OSL_HOSTDEVICE MxSheen(const MxSheenParams& params)
+        : BSDF(MX_SHEEN_ID), MxSheenParams(params)
+    {
+    }
 
-    Color3 get_albedo(const Vec3& wo) const override
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const
     {
         const float NdotV = clamp(N.dot(wo), 0.0f, 1.0f);
         // Rational fit from the Material X project
@@ -1417,7 +1292,7 @@ struct MxSheen final : public BSDF, MxSheenParams {
         return clamp(albedo * (r.x / r.y), 0.0f, 1.0f);
     }
 
-    Sample eval(const Vec3& wo, const Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const
     {
         const Vec3 L = wi, V = wo;
         const Vec3 H       = (L + V).normalize();
@@ -1437,7 +1312,7 @@ struct MxSheen final : public BSDF, MxSheenParams {
                  pdf, 1.0f };
     }
 
-    Sample sample(const Vec3& wo, float rx, float ry, float rz) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, float rz) const
     {
         Vec3 out_dir;
         float pdf;
@@ -1446,393 +1321,800 @@ struct MxSheen final : public BSDF, MxSheenParams {
     }
 };
 
-Color3
-evaluate_layer_opacity(const OSL::ShaderGlobals& sg,
+
+OSL_HOSTDEVICE Color3
+evaluate_layer_opacity(const ShaderGlobalsType& sg,
                        const ClosureColor* closure)
 {
     // Null closure, the layer is fully transparent
     if (closure == nullptr)
         return Color3(0);
 
+    // Non-recursive traversal stack
+    const int STACK_SIZE = 16;
+    int stack_idx        = 0;
+    const ClosureColor* ptr_stack[STACK_SIZE];
+    Color3 weight_stack[STACK_SIZE];
+    Color3 weight = Color3(1.0f);
+
+    while (closure) {
     switch (closure->id) {
-    case ClosureColor::MUL:
-        return closure->as_mul()->weight
-               * evaluate_layer_opacity(sg, closure->as_mul()->closure);
-    case ClosureColor::ADD:
-        return evaluate_layer_opacity(sg, closure->as_add()->closureA)
-               + evaluate_layer_opacity(sg, closure->as_add()->closureB);
-    default: {
-        const ClosureComponent* comp = closure->as_comp();
-        Color3 w                     = comp->w;
-        switch (comp->id) {
+        case ClosureColor::MUL:
+            weight *= closure->as_mul()->weight;
+            closure = closure->as_mul()->closure;
+            break;
+        case ClosureColor::ADD:
+            ptr_stack[stack_idx]      = closure->as_add()->closureB;
+            weight_stack[stack_idx++] = weight;
+            closure                   = closure->as_add()->closureA;
+            break;
+        default: {
+            const ClosureComponent* comp = closure->as_comp();
+            Color3 w                     = comp->w;
+            switch (comp->id) {
+            case MX_LAYER_ID: {
+                const MxLayerParams* srcparams = comp->as<MxLayerParams>();
+                closure                        = srcparams->top;
+                ptr_stack[stack_idx]           = srcparams->base;
+                weight_stack[stack_idx++]      = weight * w;
+                break;
+            }
+            case REFLECTION_ID:
+            case FRESNEL_REFLECTION_ID: {
+                Reflection bsdf(*comp->as<ReflectionParams>());
+                const Vec3& I = *reinterpret_cast<const Vec3*>(&sg.I);
+                weight *= w * bsdf.get_albedo(-I);
+                closure = nullptr;
+                break;
+            }
+            case MX_DIELECTRIC_ID: {
+                const MxDielectricParams& params = *comp->as<MxDielectricParams>();
+                // Transmissive dielectrics are opaque
+                if (!is_black(params.transmission_tint)) {
+                    // return Color3(1);
+                    closure = nullptr;
+                    break;
+                }
+                MxMicrofacet<MxDielectricParams, GGXDist, MX_DIELECTRIC_ID, false>
+                    mf(params, 1.0f);
+                const Vec3& I = *reinterpret_cast<const Vec3*>(&sg.I);
+                weight *= w * mf.get_albedo(-I);
+                closure = nullptr;
+                break;
+            }
+            case MX_GENERALIZED_SCHLICK_ID: {
+                const MxGeneralizedSchlickParams& params
+                    = *comp->as<MxGeneralizedSchlickParams>();
+                // Transmissive dielectrics are opaque
+                if (!is_black(params.transmission_tint)) {
+                    // return Color3(1);
+                    closure = nullptr;
+                    break;
+                }
+                MxMicrofacet<MxGeneralizedSchlickParams, GGXDist,
+                             MX_GENERALIZED_SCHLICK_ID, false>
+                    mf(params, 1.0f);
+                const Vec3& I = *reinterpret_cast<const Vec3*>(&sg.I);
+                weight *= w * mf.get_albedo(-I);
+                closure = nullptr;
+                break;
+            }
+            case MX_SHEEN_ID: {
+                MxSheen bsdf(*comp->as<MxSheenParams>());
+                const Vec3& I = *reinterpret_cast<const Vec3*>(&sg.I);
+                weight *= w * bsdf.get_albedo(-I);
+                closure = nullptr;
+                break;
+            }
+            default:  // Assume unhandled BSDFs are opaque
+                closure = nullptr;
+                break;
+            }
+        }
+        }
+        if (closure == nullptr && stack_idx > 0) {
+            closure = ptr_stack[--stack_idx];
+            weight  = weight_stack[stack_idx];
+        }
+    }
+    return weight;
+}
+
+
+OSL_HOSTDEVICE void
+process_medium_closure(const ShaderGlobalsType& sg, ShadingResult& result,
+                       const ClosureColor* closure, const Color3& w_in)
+{
+    if (!closure)
+        return;
+
+    // Non-recursive traversal stack
+    const int STACK_SIZE = 16;
+    int stack_idx        = 0;
+    const ClosureColor* ptr_stack[STACK_SIZE];
+    Color3 weight_stack[STACK_SIZE];
+    Color3 weight = w_in;
+
+    while (closure) {
+        switch (closure->id) {
+        case ClosureColor::MUL: {
+            weight *= closure->as_mul()->weight;
+            closure = closure->as_mul()->closure;
+            break;
+        }
+        case ClosureColor::ADD: {
+            weight_stack[stack_idx] = weight;
+            ptr_stack[stack_idx++]  = closure->as_add()->closureB;
+            closure                 = closure->as_add()->closureA;
+            break;
+        }
         case MX_LAYER_ID: {
-            const MxLayerParams* srcparams = comp->as<MxLayerParams>();
-            return w
-                   * (evaluate_layer_opacity(sg, srcparams->top)
-                      + evaluate_layer_opacity(sg, srcparams->base));
+            const ClosureComponent* comp = closure->as_comp();
+            const MxLayerParams* params  = comp->as<MxLayerParams>();
+            Color3 base_w
+                = weight
+                  * (Color3(1)
+                     - clamp(evaluate_layer_opacity(sg, params->top), 0.f, 1.f));
+            closure                   = params->top;
+            ptr_stack[stack_idx]      = params->base;
+            weight_stack[stack_idx++] = weight * base_w;
+            break;
         }
-        case REFLECTION_ID:
-        case FRESNEL_REFLECTION_ID: {
-            Reflection bsdf(*comp->as<ReflectionParams>());
-            return w * bsdf.get_albedo(-sg.I);
+        case MX_ANISOTROPIC_VDF_ID: {
+            const ClosureComponent* comp = closure->as_comp();
+            Color3 cw                    = weight * comp->w;
+            const auto& params           = *comp->as<MxAnisotropicVdfParams>();
+            result.sigma_t               = cw * params.extinction;
+            result.sigma_s               = params.albedo * result.sigma_t;
+            result.medium_g              = params.anisotropy;
+            result.refraction_ior        = 1.0f;
+            result.priority = 0;  // TODO: should this closure have a priority?
+            closure = nullptr;
+            break;
+        }
+        case MX_MEDIUM_VDF_ID: {
+            const ClosureComponent* comp = closure->as_comp();
+            Color3 cw                    = weight * comp->w;
+            const auto& params           = *comp->as<MxMediumVdfParams>();
+            result.sigma_t = { -OIIO::fast_log(params.transmission_color.x),
+                               -OIIO::fast_log(params.transmission_color.y),
+                               -OIIO::fast_log(params.transmission_color.z) };
+            // NOTE: closure weight scales the extinction parameter
+            result.sigma_t *= cw / params.transmission_depth;
+            result.sigma_s  = params.albedo * result.sigma_t;
+            result.medium_g = params.anisotropy;
+            // TODO: properly track a medium stack here ...
+            result.refraction_ior = sg.backfacing ? 1.0f / params.ior : params.ior;
+            result.priority       = params.priority;
+            closure = nullptr;
+            break;
         }
         case MX_DIELECTRIC_ID: {
-            const MxDielectricParams& params = *comp->as<MxDielectricParams>();
-            // Transmissive dielectrics are opaque
-            if (!is_black(params.transmission_tint))
-                return Color3(1);
-            MxMicrofacet<MxDielectricParams, GGXDist, false> mf(params, 1.0f);
-            return w * mf.get_albedo(-sg.I);
+            const ClosureComponent* comp = closure->as_comp();
+            const auto& params           = *comp->as<MxDielectricParams>();
+            if (!is_black(weight * comp->w * params.transmission_tint)) {
+                // TODO: properly track a medium stack here ...
+                result.refraction_ior = sg.backfacing ? 1.0f / params.ior
+                                                      : params.ior;
+            }
+            closure = nullptr;
+            break;
         }
         case MX_GENERALIZED_SCHLICK_ID: {
-            const MxGeneralizedSchlickParams& params
-                = *comp->as<MxGeneralizedSchlickParams>();
-            // Transmissive dielectrics are opaque
-            if (!is_black(params.transmission_tint))
-                return Color3(1);
-            MxMicrofacet<MxGeneralizedSchlickParams, GGXDist, false> mf(params,
-                                                                        1.0f);
-            return w * mf.get_albedo(-sg.I);
+            const ClosureComponent* comp = closure->as_comp();
+            const auto& params           = *comp->as<MxGeneralizedSchlickParams>();
+            if (!is_black(weight * comp->w * params.transmission_tint)) {
+                // TODO: properly track a medium stack here ...
+                float avg_F0  = clamp((params.f0.x + params.f0.y + params.f0.z)
+                                          / 3.0f,
+                                      0.0f, 0.99f);
+                float sqrt_F0 = sqrtf(avg_F0);
+                float ior     = (1 + sqrt_F0) / (1 - sqrt_F0);
+                result.refraction_ior = sg.backfacing ? 1.0f / ior : ior;
+            }
+            closure = nullptr;
+            break;
         }
-        case MX_SHEEN_ID: {
-            MxSheen bsdf(*comp->as<MxSheenParams>());
-            return w * bsdf.get_albedo(-sg.I);
+        default:
+            closure = nullptr;
+            break;
         }
-        default:  // Assume unhandled BSDFs are opaque
-            return Color3(1);
+        if (closure == nullptr && stack_idx > 0) {
+            closure = ptr_stack[--stack_idx];
+            weight  = weight_stack[stack_idx];
         }
     }
-    }
-    OSL_ASSERT(false && "Layer opacity evaluation failed");
-    return Color3(0);
 }
 
-void
-process_medium_closure(const OSL::ShaderGlobals& sg, ShadingResult& result,
-                       const ClosureColor* closure, const Color3& w)
+// walk through the closure tree, creating bsdfs as we go
+OSL_HOSTDEVICE void
+process_bsdf_closure(const ShaderGlobalsType& sg, ShadingResult& result,
+                     const ClosureColor* closure, const Color3& w_in,
+                     bool light_only)
 {
+    static const ustringhash uh_ggx("ggx");
+    static const ustringhash uh_beckmann("beckmann");
+    static const ustringhash uh_default("default");
     if (!closure)
         return;
-    switch (closure->id) {
-    case ClosureColor::MUL: {
-        process_medium_closure(sg, result, closure->as_mul()->closure,
-                               w * closure->as_mul()->weight);
-        break;
+
+    // Non-recursive traversal stack
+    const int STACK_SIZE = 16;
+    int stack_idx        = 0;
+    const ClosureColor* ptr_stack[STACK_SIZE];
+    Color3 weight_stack[STACK_SIZE];
+    Color3 weight = w_in;
+
+    while (closure) {
+        switch (closure->id) {
+        case ClosureColor::MUL: {
+            weight *= closure->as_mul()->weight;
+            closure = closure->as_mul()->closure;
+            break;
+        }
+        case ClosureColor::ADD: {
+            ptr_stack[stack_idx]      = closure->as_add()->closureB;
+            weight_stack[stack_idx++] = weight;
+            closure                   = closure->as_add()->closureA;
+            break;
+        }
+        default: {
+            const ClosureComponent* comp = closure->as_comp();
+            Color3 cw                    = weight * comp->w;
+            closure = nullptr;
+            if (comp->id == EMISSION_ID) {
+                result.Le += cw;
+            }
+            else if (comp->id == MX_UNIFORM_EDF_ID) {
+                result.Le += cw * comp->as<MxUniformEdfParams>()->emittance;
+            }
+            else if (!light_only) {
+                bool ok = false;
+                switch (comp->id) {
+                case DIFFUSE_ID:
+                    ok = result.bsdf.add_bsdf<Diffuse<0>>(
+                        cw, *comp->as<DiffuseParams>());
+                    break;
+                case OREN_NAYAR_ID:
+                    ok = result.bsdf.add_bsdf<OrenNayar>(
+                        cw, *comp->as<OrenNayarParams>());
+                    break;
+                case TRANSLUCENT_ID:
+                    ok = result.bsdf.add_bsdf<Diffuse<1>>(
+                        cw, *comp->as<DiffuseParams>());
+                    break;
+                case PHONG_ID:
+                    ok = result.bsdf.add_bsdf<Phong>(cw, *comp->as<PhongParams>());
+                    break;
+                case WARD_ID:
+                    ok = result.bsdf.add_bsdf<Ward>(cw, *comp->as<WardParams>());
+                    break;
+                case MICROFACET_ID: {
+                    const MicrofacetParams* mp = comp->as<MicrofacetParams>();
+                    if (mp->dist == uh_ggx) {
+                        switch (mp->refract) {
+                        case 0:
+                            ok = result.bsdf.add_bsdf<MicrofacetGGXRefl>(cw, *mp);
+                            break;
+                        case 1:
+                            ok = result.bsdf.add_bsdf<MicrofacetGGXRefr>(cw, *mp);
+                            break;
+                        case 2:
+                            ok = result.bsdf.add_bsdf<MicrofacetGGXBoth>(cw, *mp);
+                            break;
+                        }
+                    } else if (mp->dist == uh_beckmann || mp->dist == uh_default) {
+                        switch (mp->refract) {
+                        case 0:
+                            ok = result.bsdf.add_bsdf<MicrofacetBeckmannRefl>(cw,
+                                                                              *mp);
+                            break;
+                        case 1:
+                            ok = result.bsdf.add_bsdf<MicrofacetBeckmannRefr>(cw,
+                                                                              *mp);
+                            break;
+                        case 2:
+                            ok = result.bsdf.add_bsdf<MicrofacetBeckmannBoth>(cw,
+                                                                              *mp);
+                            break;
+                        }
+                    }
+                    break;
+                }
+                case REFLECTION_ID:
+                case FRESNEL_REFLECTION_ID:
+                    ok = result.bsdf.add_bsdf<Reflection>(
+                        cw, *comp->as<ReflectionParams>());
+                    break;
+                case REFRACTION_ID:
+                    ok = result.bsdf.add_bsdf<Refraction>(
+                        cw, *comp->as<RefractionParams>());
+                    break;
+                case TRANSPARENT_ID:
+                    ok = result.bsdf.add_bsdf<Transparent>(cw);
+                    break;
+                case MX_OREN_NAYAR_DIFFUSE_ID: {
+                    const MxOrenNayarDiffuseParams* srcparams
+                        = comp->as<MxOrenNayarDiffuseParams>();
+                    if (srcparams->energy_compensation) {
+                        // energy compensation handled by its own BSDF
+                        ok = result.bsdf.add_bsdf<EnergyCompensatedOrenNayar>(
+                            cw, *srcparams);
+                    } else {
+                        // translate MaterialX parameters into existing closure
+                        OrenNayarParams params = {};
+                        params.N               = srcparams->N;
+                        params.sigma           = srcparams->roughness;
+                        ok = result.bsdf.add_bsdf<OrenNayar>(cw * srcparams->albedo,
+                                                             params);
+                    }
+                    break;
+                }
+                case MX_BURLEY_DIFFUSE_ID: {
+                    const MxBurleyDiffuseParams& params
+                        = *comp->as<MxBurleyDiffuseParams>();
+                    ok = result.bsdf.add_bsdf<MxBurleyDiffuse>(cw, params);
+                    break;
+                }
+                case MX_DIELECTRIC_ID: {
+                    const MxDielectricParams& params
+                        = *comp->as<MxDielectricParams>();
+                    if (is_black(params.transmission_tint))
+                        ok = result.bsdf.add_bsdf<MxMicrofacet<
+                            MxDielectricParams, GGXDist, MX_DIELECTRIC_ID, false>>(
+                            cw, params, 1.0f);
+                    else
+                        ok = result.bsdf.add_bsdf<MxMicrofacet<
+                            MxDielectricParams, GGXDist, MX_DIELECTRIC_ID, true>>(
+                            cw, params, result.refraction_ior);
+                    break;
+                }
+                case MX_CONDUCTOR_ID: {
+                    const MxConductorParams& params = *comp->as<MxConductorParams>();
+                    ok = result.bsdf.add_bsdf<MxMicrofacet<
+                        MxConductorParams, GGXDist, MX_CONDUCTOR_ID, false>>(
+                        cw, params, 1.0f);
+                    break;
+                };
+                case MX_GENERALIZED_SCHLICK_ID: {
+                    const MxGeneralizedSchlickParams& params
+                        = *comp->as<MxGeneralizedSchlickParams>();
+                    if (is_black(params.transmission_tint))
+                        ok = result.bsdf.add_bsdf<
+                            MxMicrofacet<MxGeneralizedSchlickParams, GGXDist,
+                                         MX_GENERALIZED_SCHLICK_ID, false>>(cw,
+                                                                            params,
+                                                                            1.0f);
+                    else
+                        ok = result.bsdf.add_bsdf<
+                            MxMicrofacet<MxGeneralizedSchlickParams, GGXDist,
+                                         MX_GENERALIZED_SCHLICK_ID, true>>(
+                            cw, params, result.refraction_ior);
+                    break;
+                };
+                case MX_TRANSLUCENT_ID: {
+                    const MxTranslucentParams* srcparams
+                        = comp->as<MxTranslucentParams>();
+                    DiffuseParams params = {};
+                    params.N             = srcparams->N;
+                    ok = result.bsdf.add_bsdf<Diffuse<1>>(cw * srcparams->albedo,
+                                                          params);
+                    break;
+                }
+                case MX_TRANSPARENT_ID: {
+                    ok = result.bsdf.add_bsdf<Transparent>(cw);
+                    break;
+                }
+                case MX_SUBSURFACE_ID: {
+                    // TODO: implement BSSRDF support?
+                    const MxSubsurfaceParams* srcparams
+                        = comp->as<MxSubsurfaceParams>();
+                    DiffuseParams params = {};
+                    params.N             = srcparams->N;
+                    ok = result.bsdf.add_bsdf<Diffuse<0>>(cw * srcparams->albedo,
+                                                          params);
+                    break;
+                }
+                case MX_SHEEN_ID: {
+                    const MxSheenParams& params = *comp->as<MxSheenParams>();
+                    ok = result.bsdf.add_bsdf<MxSheen>(cw, params);
+                    break;
+                }
+                case MX_LAYER_ID: {
+                    const MxLayerParams* srcparams = comp->as<MxLayerParams>();
+                    Color3 base_w
+                        = weight
+                          * (Color3(1, 1, 1)
+                             - clamp(evaluate_layer_opacity(sg, srcparams->top),
+                                     0.f, 1.f));
+                    closure = srcparams->top;
+                    weight  = cw;
+                    if (!is_black(base_w)) {
+                        ptr_stack[stack_idx]      = srcparams->base;
+                        weight_stack[stack_idx++] = base_w;
+                    }
+                    ok = true;
+                    break;
+                }
+                case MX_ANISOTROPIC_VDF_ID:
+                case MX_MEDIUM_VDF_ID: {
+                    // already processed by process_medium_closure
+                    ok = true;
+                    break;
+                default:
+                    // TODO: Add a diagnostic message?
+                    break;
+                }
+                }
+#ifndef __CUDACC__
+                OSL_ASSERT(ok && "Invalid closure invoked in surface shader");
+#else
+                if (!ok) {
+                    printf("Invalid closure invoked in surface shader\n");
+                }
+#endif
+            }
+            break;
+        }
+        }
+        if (closure == nullptr && stack_idx > 0) {
+            closure = ptr_stack[--stack_idx];
+            weight  = weight_stack[stack_idx];
+        }
     }
-    case ClosureColor::ADD: {
-        process_medium_closure(sg, result, closure->as_add()->closureA, w);
-        process_medium_closure(sg, result, closure->as_add()->closureB, w);
-        break;
+}
+
+}  // anonymous namespace
+
+OSL_NAMESPACE_ENTER
+
+OSL_HOSTDEVICE void
+process_closure(const ShaderGlobalsType& sg, ShadingResult& result,
+                const ClosureColor* Ci, bool light_only)
+{
+    if (!light_only)
+        process_medium_closure(sg, result, Ci, Color3(1));
+    process_bsdf_closure(sg, result, Ci, Color3(1), light_only);
+}
+
+OSL_HOSTDEVICE Vec3
+process_background_closure(const ClosureColor* closure)
+{
+    if (!closure)
+        return Vec3(0, 0, 0);
+
+    // Non-recursive traversal stack
+    const int STACK_SIZE = 16;
+    int stack_idx        = 0;
+    const ClosureColor* ptr_stack[STACK_SIZE];
+    Color3 weight_stack[STACK_SIZE];
+    Color3 weight = Color3(1.0f);
+
+    while (closure) {
+        switch (closure->id) {
+        case ClosureColor::MUL: {
+            weight *= closure->as_mul()->weight;
+            closure = closure->as_mul()->closure;
+            break;
+        }
+        case ClosureColor::ADD: {
+            ptr_stack[stack_idx]      = closure->as_add()->closureB;
+            weight_stack[stack_idx++] = weight;
+            closure                   = closure->as_add()->closureA;
+            break;
+        }
+        case BACKGROUND_ID: {
+            weight *= closure->as_comp()->w;
+            closure = nullptr;
+            break;
+        }
+        }
+        if (closure == nullptr && stack_idx > 0) {
+            closure = ptr_stack[--stack_idx];
+            weight  = weight_stack[stack_idx];
+        }
     }
-    case MX_LAYER_ID: {
-        const ClosureComponent* comp = closure->as_comp();
-        const MxLayerParams* params  = comp->as<MxLayerParams>();
-        Color3 base_w
-            = w
-              * (Color3(1)
-                 - clamp(evaluate_layer_opacity(sg, params->top), 0.f, 1.f));
-        process_medium_closure(sg, result, params->top, w);
-        process_medium_closure(sg, result, params->base, base_w);
+    return weight;
+}
+
+
+typedef MxMicrofacet<MxConductorParams, GGXDist, MX_CONDUCTOR_ID, false>
+    MxConductor;
+typedef MxMicrofacet<MxDielectricParams, GGXDist, MX_DIELECTRIC_ID, true>
+    MxDielectric;
+typedef MxMicrofacet<MxDielectricParams, GGXDist, MX_DIELECTRIC_ID, false>
+    MxDielectricOpaque;
+typedef MxMicrofacet<MxGeneralizedSchlickParams, GGXDist,
+                     MX_GENERALIZED_SCHLICK_ID, true>
+    MxGeneralizedSchlick;
+typedef MxMicrofacet<MxGeneralizedSchlickParams, GGXDist,
+                     MX_GENERALIZED_SCHLICK_ID, false>
+    MxGeneralizedSchlickOpaque;
+
+
+OSL_HOSTDEVICE Color3
+CompositeBSDF::get_albedo(const BSDF* bsdf, const Vec3& wo) const
+{
+    static const ustringhash uh_ggx("ggx");
+    static const ustringhash uh_beckmann("beckmann");
+    static const ustringhash uh_default("default");
+
+    Color3 albedo(0);
+    switch (bsdf->id) {
+    case DIFFUSE_ID:
+        albedo = BSDF_CAST(Diffuse<0>, bsdf)->get_albedo(wo);
         break;
-    }
-    case MX_ANISOTROPIC_VDF_ID: {
-        const ClosureComponent* comp = closure->as_comp();
-        Color3 cw                    = w * comp->w;
-        const auto& params           = *comp->as<MxAnisotropicVdfParams>();
-        result.sigma_t               = cw * params.extinction;
-        result.sigma_s               = params.albedo * result.sigma_t;
-        result.medium_g              = params.anisotropy;
-        result.refraction_ior        = 1.0f;
-        result.priority = 0;  // TODO: should this closure have a priority?
+    case TRANSPARENT_ID:
+    case MX_TRANSPARENT_ID:
+        albedo = BSDF_CAST(Transparent, bsdf)->get_albedo(wo);
         break;
-    }
-    case MX_MEDIUM_VDF_ID: {
-        const ClosureComponent* comp = closure->as_comp();
-        Color3 cw                    = w * comp->w;
-        const auto& params           = *comp->as<MxMediumVdfParams>();
-        result.sigma_t = { -OIIO::fast_log(params.transmission_color.x),
-                           -OIIO::fast_log(params.transmission_color.y),
-                           -OIIO::fast_log(params.transmission_color.z) };
-        // NOTE: closure weight scales the extinction parameter
-        result.sigma_t *= cw / params.transmission_depth;
-        result.sigma_s  = params.albedo * result.sigma_t;
-        result.medium_g = params.anisotropy;
-        // TODO: properly track a medium stack here ...
-        result.refraction_ior = sg.backfacing ? 1.0f / params.ior : params.ior;
-        result.priority       = params.priority;
+    case OREN_NAYAR_ID:
+        albedo = BSDF_CAST(OrenNayar, bsdf)->get_albedo(wo);
         break;
-    }
-    case MX_DIELECTRIC_ID: {
-        const ClosureComponent* comp = closure->as_comp();
-        const auto& params           = *comp->as<MxDielectricParams>();
-        if (!is_black(w * comp->w * params.transmission_tint)) {
-            // TODO: properly track a medium stack here ...
-            result.refraction_ior = sg.backfacing ? 1.0f / params.ior
-                                                  : params.ior;
+    case TRANSLUCENT_ID:
+        albedo = BSDF_CAST(Diffuse<1>, bsdf)->get_albedo(wo);
+        break;
+    case PHONG_ID: albedo = BSDF_CAST(Phong, bsdf)->get_albedo(wo); break;
+    case WARD_ID: albedo = BSDF_CAST(Ward, bsdf)->get_albedo(wo); break;
+    case REFLECTION_ID:
+    case FRESNEL_REFLECTION_ID:
+        albedo = BSDF_CAST(Reflection, bsdf)->get_albedo(wo);
+        break;
+    case REFRACTION_ID:
+        albedo = BSDF_CAST(Refraction, bsdf)->get_albedo(wo);
+        break;
+    case MICROFACET_ID: {
+        const int refract      = ((MicrofacetBeckmannRefl*)bsdf)->refract;
+        const ustringhash dist = ((MicrofacetBeckmannRefl*)bsdf)->dist;
+        if (dist == uh_default || dist == uh_beckmann) {
+            switch (refract) {
+            case 0:
+                albedo = BSDF_CAST(MicrofacetBeckmannRefl, bsdf)->get_albedo(wo);
+                break;
+            case 1:
+                albedo = BSDF_CAST(MicrofacetBeckmannRefr, bsdf)->get_albedo(wo);
+                break;
+            case 2:
+                albedo = BSDF_CAST(MicrofacetBeckmannBoth, bsdf)->get_albedo(wo);
+                break;
+            }
+        } else if (dist == uh_ggx) {
+            switch (refract) {
+            case 0:
+                albedo = BSDF_CAST(MicrofacetGGXRefl, bsdf)->get_albedo(wo);
+                break;
+            case 1:
+                albedo = BSDF_CAST(MicrofacetGGXRefr, bsdf)->get_albedo(wo);
+                break;
+            case 2:
+                albedo = BSDF_CAST(MicrofacetGGXBoth, bsdf)->get_albedo(wo);
+                break;
+            }
         }
         break;
     }
+    case MX_CONDUCTOR_ID:
+        albedo = BSDF_CAST(MxConductor, bsdf)->get_albedo(wo);
+        break;
+    case MX_DIELECTRIC_ID:
+        if (is_black(((MxDielectricOpaque*)bsdf)->transmission_tint))
+            albedo = BSDF_CAST(MxDielectricOpaque, bsdf)->get_albedo(wo);
+        else
+            albedo = BSDF_CAST(MxDielectric, bsdf)->get_albedo(wo);
+        break;
+    case MX_OREN_NAYAR_DIFFUSE_ID:
+        albedo = BSDF_CAST(EnergyCompensatedOrenNayar, bsdf)->get_albedo(wo);
+        break;
+    case MX_BURLEY_DIFFUSE_ID:
+        albedo = BSDF_CAST(MxBurleyDiffuse, bsdf)->get_albedo(wo);
+        break;
+    case MX_SHEEN_ID: albedo = BSDF_CAST(MxSheen, bsdf)->get_albedo(wo); break;
     case MX_GENERALIZED_SCHLICK_ID: {
-        const ClosureComponent* comp = closure->as_comp();
-        const auto& params           = *comp->as<MxGeneralizedSchlickParams>();
-        if (!is_black(w * comp->w * params.transmission_tint)) {
-            // TODO: properly track a medium stack here ...
-            float avg_F0  = clamp((params.f0.x + params.f0.y + params.f0.z)
-                                      / 3.0f,
-                                  0.0f, 0.99f);
-            float sqrt_F0 = sqrtf(avg_F0);
-            float ior     = (1 + sqrt_F0) / (1 - sqrt_F0);
-            result.refraction_ior = sg.backfacing ? 1.0f / ior : ior;
-        }
+        const Color3& tint = ((MxGeneralizedSchlick*)bsdf)->transmission_tint;
+        if (is_black(tint))
+            albedo = BSDF_CAST(MxGeneralizedSchlickOpaque, bsdf)->get_albedo(wo);
+        else
+            albedo = BSDF_CAST(MxGeneralizedSchlick, bsdf)->get_albedo(wo);
         break;
     }
+    default: break;
     }
+    return albedo;
 }
 
-// recursively walk through the closure tree, creating bsdfs as we go
-void
-process_bsdf_closure(const OSL::ShaderGlobals& sg, ShadingResult& result,
-                     const ClosureColor* closure, const Color3& w,
-                     bool light_only)
+
+OSL_HOSTDEVICE BSDF::Sample
+CompositeBSDF::sample(const BSDF* bsdf, const Vec3& wo, float rx, float ry,
+                      float rz) const
 {
     static const ustringhash uh_ggx("ggx");
     static const ustringhash uh_beckmann("beckmann");
     static const ustringhash uh_default("default");
-    if (!closure)
-        return;
-    switch (closure->id) {
-    case ClosureColor::MUL: {
-        Color3 cw = w * closure->as_mul()->weight;
-        process_bsdf_closure(sg, result, closure->as_mul()->closure, cw,
-                             light_only);
+
+    BSDF::Sample sample = {};
+    switch (bsdf->id) {
+    case DIFFUSE_ID:
+        sample = BSDF_CAST(Diffuse<0>, bsdf)->sample(wo, rx, ry, rz);
         break;
-    }
-    case ClosureColor::ADD: {
-        process_bsdf_closure(sg, result, closure->as_add()->closureA, w,
-                             light_only);
-        process_bsdf_closure(sg, result, closure->as_add()->closureB, w,
-                             light_only);
+    case TRANSPARENT_ID:
+    case MX_TRANSPARENT_ID:
+        sample = BSDF_CAST(Transparent, bsdf)->sample(wo, rx, ry, rz);
         break;
-    }
-    default: {
-        const ClosureComponent* comp = closure->as_comp();
-        Color3 cw                    = w * comp->w;
-        if (comp->id == EMISSION_ID)
-            result.Le += cw;
-        else if (comp->id == MX_UNIFORM_EDF_ID)
-            result.Le += cw * comp->as<MxUniformEdfParams>()->emittance;
-        else if (!light_only) {
-            bool ok = false;
-            switch (comp->id) {
-            case DIFFUSE_ID:
-                ok = result.bsdf.add_bsdf<Diffuse<0>>(
-                    cw, *comp->as<DiffuseParams>());
-                break;
-            case OREN_NAYAR_ID:
-                ok = result.bsdf.add_bsdf<OrenNayar>(
-                    cw, *comp->as<OrenNayarParams>());
-                break;
-            case TRANSLUCENT_ID:
-                ok = result.bsdf.add_bsdf<Diffuse<1>>(
-                    cw, *comp->as<DiffuseParams>());
-                break;
-            case PHONG_ID:
-                ok = result.bsdf.add_bsdf<Phong>(cw, *comp->as<PhongParams>());
+    case OREN_NAYAR_ID:
+        sample = BSDF_CAST(OrenNayar, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case TRANSLUCENT_ID:
+        sample = BSDF_CAST(Diffuse<1>, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case PHONG_ID:
+        sample = BSDF_CAST(Phong, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case WARD_ID: sample = BSDF_CAST(Ward, bsdf)->sample(wo, rx, ry, rz); break;
+    case REFLECTION_ID:
+    case FRESNEL_REFLECTION_ID:
+        sample = BSDF_CAST(Reflection, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case REFRACTION_ID:
+        sample = BSDF_CAST(Refraction, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case MICROFACET_ID: {
+        const int refract      = ((MicrofacetBeckmannRefl*)bsdf)->refract;
+        const ustringhash dist = ((MicrofacetBeckmannRefl*)bsdf)->dist;
+        if (dist == uh_default || dist == uh_beckmann) {
+            switch (refract) {
+            case 0:
+                sample = BSDF_CAST(MicrofacetBeckmannRefl, bsdf)
+                             ->sample(wo, rx, ry, rz);
                 break;
-            case WARD_ID:
-                ok = result.bsdf.add_bsdf<Ward>(cw, *comp->as<WardParams>());
+            case 1:
+                sample = BSDF_CAST(MicrofacetBeckmannRefr, bsdf)
+                             ->sample(wo, rx, ry, rz);
                 break;
-            case MICROFACET_ID: {
-                const MicrofacetParams* mp = comp->as<MicrofacetParams>();
-                if (mp->dist == uh_ggx) {
-                    switch (mp->refract) {
-                    case 0:
-                        ok = result.bsdf.add_bsdf<MicrofacetGGXRefl>(cw, *mp);
-                        break;
-                    case 1:
-                        ok = result.bsdf.add_bsdf<MicrofacetGGXRefr>(cw, *mp);
-                        break;
-                    case 2:
-                        ok = result.bsdf.add_bsdf<MicrofacetGGXBoth>(cw, *mp);
-                        break;
-                    }
-                } else if (mp->dist == uh_beckmann || mp->dist == uh_default) {
-                    switch (mp->refract) {
-                    case 0:
-                        ok = result.bsdf.add_bsdf<MicrofacetBeckmannRefl>(cw,
-                                                                          *mp);
-                        break;
-                    case 1:
-                        ok = result.bsdf.add_bsdf<MicrofacetBeckmannRefr>(cw,
-                                                                          *mp);
-                        break;
-                    case 2:
-                        ok = result.bsdf.add_bsdf<MicrofacetBeckmannBoth>(cw,
-                                                                          *mp);
-                        break;
-                    }
-                }
+            case 2:
+                sample = BSDF_CAST(MicrofacetBeckmannBoth, bsdf)
+                             ->sample(wo, rx, ry, rz);
                 break;
             }
-            case REFLECTION_ID:
-            case FRESNEL_REFLECTION_ID:
-                ok = result.bsdf.add_bsdf<Reflection>(
-                    cw, *comp->as<ReflectionParams>());
-                break;
-            case REFRACTION_ID:
-                ok = result.bsdf.add_bsdf<Refraction>(
-                    cw, *comp->as<RefractionParams>());
-                break;
-            case TRANSPARENT_ID:
-                ok = result.bsdf.add_bsdf<Transparent>(cw);
-                break;
-            case MX_OREN_NAYAR_DIFFUSE_ID: {
-                const MxOrenNayarDiffuseParams* srcparams
-                    = comp->as<MxOrenNayarDiffuseParams>();
-                if (srcparams->energy_compensation) {
-                    // energy compensation handled by its own BSDF
-                    ok = result.bsdf.add_bsdf<EnergyCompensatedOrenNayar>(
-                        cw, *srcparams);
-                } else {
-                    // translate MaterialX parameters into existing closure
-                    OrenNayarParams params = {};
-                    params.N               = srcparams->N;
-                    params.sigma           = srcparams->roughness;
-                    ok = result.bsdf.add_bsdf<OrenNayar>(cw * srcparams->albedo,
-                                                         params);
-                }
+        } else if (dist == uh_ggx) {
+            switch (refract) {
+            case 0:
+                sample
+                    = BSDF_CAST(MicrofacetGGXRefl, bsdf)->sample(wo, rx, ry, rz);
                 break;
-            }
-            case MX_BURLEY_DIFFUSE_ID: {
-                const MxBurleyDiffuseParams& params
-                    = *comp->as<MxBurleyDiffuseParams>();
-                ok = result.bsdf.add_bsdf<MxBurleyDiffuse>(cw, params);
+            case 1:
+                sample
+                    = BSDF_CAST(MicrofacetGGXRefr, bsdf)->sample(wo, rx, ry, rz);
                 break;
-            }
-            case MX_DIELECTRIC_ID: {
-                const MxDielectricParams& params
-                    = *comp->as<MxDielectricParams>();
-                if (is_black(params.transmission_tint))
-                    ok = result.bsdf.add_bsdf<
-                        MxMicrofacet<MxDielectricParams, GGXDist, false>>(
-                        cw, params, 1.0f);
-                else
-                    ok = result.bsdf.add_bsdf<
-                        MxMicrofacet<MxDielectricParams, GGXDist, true>>(
-                        cw, params, result.refraction_ior);
+            case 2:
+                sample
+                    = BSDF_CAST(MicrofacetGGXBoth, bsdf)->sample(wo, rx, ry, rz);
                 break;
             }
-            case MX_CONDUCTOR_ID: {
-                const MxConductorParams& params = *comp->as<MxConductorParams>();
-                ok = result.bsdf.add_bsdf<
-                    MxMicrofacet<MxConductorParams, GGXDist, false>>(cw, params,
-                                                                     1.0f);
-                break;
-            };
-            case MX_GENERALIZED_SCHLICK_ID: {
-                const MxGeneralizedSchlickParams& params
-                    = *comp->as<MxGeneralizedSchlickParams>();
-                if (is_black(params.transmission_tint))
-                    ok = result.bsdf.add_bsdf<MxMicrofacet<
-                        MxGeneralizedSchlickParams, GGXDist, false>>(cw, params,
-                                                                     1.0f);
-                else
-                    ok = result.bsdf.add_bsdf<
-                        MxMicrofacet<MxGeneralizedSchlickParams, GGXDist, true>>(
-                        cw, params, result.refraction_ior);
-                break;
-            };
-            case MX_TRANSLUCENT_ID: {
-                const MxTranslucentParams* srcparams
-                    = comp->as<MxTranslucentParams>();
-                DiffuseParams params = {};
-                params.N             = srcparams->N;
-                ok = result.bsdf.add_bsdf<Diffuse<1>>(cw * srcparams->albedo,
-                                                      params);
+        }
+        break;
+    }
+    case MX_CONDUCTOR_ID:
+        sample = BSDF_CAST(MxConductor, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case MX_DIELECTRIC_ID:
+        if (is_black(((MxDielectricOpaque*)bsdf)->transmission_tint))
+            sample = BSDF_CAST(MxDielectricOpaque, bsdf)->sample(wo, rx, ry, rz);
+        else
+            sample = BSDF_CAST(MxDielectric, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case MX_BURLEY_DIFFUSE_ID:
+        sample = BSDF_CAST(MxBurleyDiffuse, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case MX_OREN_NAYAR_DIFFUSE_ID:
+        sample = BSDF_CAST(EnergyCompensatedOrenNayar, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case MX_SHEEN_ID:
+        sample = BSDF_CAST(MxSheen, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case MX_GENERALIZED_SCHLICK_ID: {
+        const Color3& tint = ((MxGeneralizedSchlick*)bsdf)->transmission_tint;
+        if (is_black(tint)) {
+            sample = BSDF_CAST(MxGeneralizedSchlickOpaque, bsdf)
+                         ->sample(wo, rx, ry, rz);
+        } else {
+            sample
+                = BSDF_CAST(MxGeneralizedSchlick, bsdf)->sample(wo, rx, ry, rz);
+        }
+        break;
+    }
+    default: break;
+    }
+    return sample;
+}
+
+
+OSL_HOSTDEVICE BSDF::Sample
+CompositeBSDF::eval(const BSDF* bsdf, const Vec3& wo, const Vec3& wi) const
+{
+    static const ustringhash uh_ggx("ggx");
+    static const ustringhash uh_beckmann("beckmann");
+    static const ustringhash uh_default("default");
+
+    BSDF::Sample sample = {};
+    switch (bsdf->id) {
+    case DIFFUSE_ID: sample = BSDF_CAST(Diffuse<0>, bsdf)->eval(wo, wi); break;
+    case TRANSPARENT_ID:
+    case MX_TRANSPARENT_ID:
+        sample = BSDF_CAST(Transparent, bsdf)->eval(wo, wi);
+        break;
+    case OREN_NAYAR_ID:
+        sample = BSDF_CAST(OrenNayar, bsdf)->eval(wo, wi);
+        break;
+    case TRANSLUCENT_ID:
+        sample = BSDF_CAST(Diffuse<1>, bsdf)->eval(wo, wi);
+        break;
+    case PHONG_ID: sample = BSDF_CAST(Phong, bsdf)->eval(wo, wi); break;
+    case WARD_ID: sample = BSDF_CAST(Ward, bsdf)->eval(wo, wi); break;
+    case REFLECTION_ID:
+    case FRESNEL_REFLECTION_ID:
+        sample = BSDF_CAST(Reflection, bsdf)->eval(wo, wi);
+        break;
+    case REFRACTION_ID:
+        sample = BSDF_CAST(Refraction, bsdf)->eval(wo, wi);
+        break;
+    case MICROFACET_ID: {
+        const int refract      = ((MicrofacetBeckmannRefl*)bsdf)->refract;
+        const ustringhash dist = ((MicrofacetBeckmannRefl*)bsdf)->dist;
+        if (dist == uh_default || dist == uh_beckmann) {
+            switch (refract) {
+            case 0:
+                sample = BSDF_CAST(MicrofacetBeckmannRefl, bsdf)->eval(wo, wi);
                 break;
-            }
-            case MX_TRANSPARENT_ID: {
-                ok = result.bsdf.add_bsdf<Transparent>(cw);
+            case 1:
+                sample = BSDF_CAST(MicrofacetBeckmannRefr, bsdf)->eval(wo, wi);
                 break;
-            }
-            case MX_SUBSURFACE_ID: {
-                // TODO: implement BSSRDF support?
-                const MxSubsurfaceParams* srcparams
-                    = comp->as<MxSubsurfaceParams>();
-                DiffuseParams params = {};
-                params.N             = srcparams->N;
-                ok = result.bsdf.add_bsdf<Diffuse<0>>(cw * srcparams->albedo,
-                                                      params);
+            case 2:
+                sample = BSDF_CAST(MicrofacetBeckmannBoth, bsdf)->eval(wo, wi);
                 break;
             }
-            case MX_SHEEN_ID: {
-                const MxSheenParams& params = *comp->as<MxSheenParams>();
-                ok = result.bsdf.add_bsdf<MxSheen>(cw, params);
+        } else if (dist == uh_ggx) {
+            switch (refract) {
+            case 0:
+                sample = BSDF_CAST(MicrofacetGGXRefl, bsdf)->eval(wo, wi);
                 break;
-            }
-            case MX_LAYER_ID: {
-                const MxLayerParams* srcparams = comp->as<MxLayerParams>();
-                Color3 base_w
-                    = w
-                      * (Color3(1, 1, 1)
-                         - clamp(evaluate_layer_opacity(sg, srcparams->top),
-                                 0.f, 1.f));
-                process_bsdf_closure(sg, result, srcparams->top, w, light_only);
-                if (!is_black(base_w))
-                    process_bsdf_closure(sg, result, srcparams->base, base_w,
-                                         light_only);
-                ok = true;
+            case 1:
+                sample = BSDF_CAST(MicrofacetGGXRefr, bsdf)->eval(wo, wi);
                 break;
-            }
-            case MX_ANISOTROPIC_VDF_ID:
-            case MX_MEDIUM_VDF_ID: {
-                // already processed by process_medium_closure
-                ok = true;
+            case 2:
+                sample = BSDF_CAST(MicrofacetGGXBoth, bsdf)->eval(wo, wi);
                 break;
             }
-            }
-            OSL_ASSERT(ok && "Invalid closure invoked in surface shader");
         }
         break;
     }
+    case MX_CONDUCTOR_ID:
+        sample = BSDF_CAST(MxConductor, bsdf)->eval(wo, wi);
+        break;
+    case MX_DIELECTRIC_ID:
+        if (is_black(((MxDielectricOpaque*)bsdf)->transmission_tint))
+            sample = BSDF_CAST(MxDielectricOpaque, bsdf)->eval(wo, wi);
+        else
+            sample = BSDF_CAST(MxDielectric, bsdf)->eval(wo, wi);
+        break;
+    case MX_BURLEY_DIFFUSE_ID:
+        sample = BSDF_CAST(MxBurleyDiffuse, bsdf)->eval(wo, wi);
+        break;
+    case MX_OREN_NAYAR_DIFFUSE_ID:
+        sample = BSDF_CAST(EnergyCompensatedOrenNayar, bsdf)->eval(wo, wi);
+        break;
+    case MX_SHEEN_ID: sample = ((MxSheen*)bsdf)->MxSheen::eval(wo, wi); break;
+    case MX_GENERALIZED_SCHLICK_ID: {
+        const Color3& tint = ((MxGeneralizedSchlick*)bsdf)->transmission_tint;
+        if (is_black(tint)) {
+            sample = BSDF_CAST(MxGeneralizedSchlickOpaque, bsdf)->eval(wo, wi);
+        } else {
+            sample = BSDF_CAST(MxGeneralizedSchlick, bsdf)->eval(wo, wi);
+        }
+        break;
     }
-}
-
-}  // anonymous namespace
-
-OSL_NAMESPACE_ENTER
-
-void
-process_closure(const OSL::ShaderGlobals& sg, ShadingResult& result,
-                const ClosureColor* Ci, bool light_only)
-{
-    if (!light_only)
-        process_medium_closure(sg, result, Ci, Color3(1));
-    process_bsdf_closure(sg, result, Ci, Color3(1), light_only);
-}
-
-Vec3
-process_background_closure(const ClosureColor* closure)
-{
-    if (!closure)
-        return Vec3(0, 0, 0);
-    switch (closure->id) {
-    case ClosureColor::MUL: {
-        return closure->as_mul()->weight
-               * process_background_closure(closure->as_mul()->closure);
-    }
-    case ClosureColor::ADD: {
-        return process_background_closure(closure->as_add()->closureA)
-               + process_background_closure(closure->as_add()->closureB);
-    }
-    case BACKGROUND_ID: {
-        return closure->as_comp()->w;
-    }
+    default: break;
     }
-    // should never happen
-    OSL_ASSERT(false && "Invalid closure invoked in background shader");
-    return Vec3(0, 0, 0);
+    return sample;
 }
 
-
 OSL_NAMESPACE_EXIT
diff --git a/src/testrender/shading.h b/src/testrender/shading.h
index d7fdb190d..c7e6c67dc 100644
--- a/src/testrender/shading.h
+++ b/src/testrender/shading.h
@@ -6,24 +6,234 @@
 #pragma once
 
 #include <OSL/dual_vec.h>
+#include <OSL/hashes.h>
 #include <OSL/oslclosure.h>
 #include <OSL/oslconfig.h>
 #include <OSL/oslexec.h>
 #include "sampling.h"
 
+// TODO: This used to be in the anonymous namespace ...
+// unique identifier for each closure supported by testrender
+enum ClosureIDs {
+    COMPONENT_BASE_ID = 0, MUL = -1, ADD = -2,
+    EMISSION_ID = 1,
+    BACKGROUND_ID,
+    DIFFUSE_ID,
+    OREN_NAYAR_ID,
+    TRANSLUCENT_ID,
+    PHONG_ID,
+    WARD_ID,
+    MICROFACET_ID,
+    REFLECTION_ID,
+    FRESNEL_REFLECTION_ID,
+    REFRACTION_ID,
+    TRANSPARENT_ID,
+    DEBUG_ID,
+    HOLDOUT_ID,
+    // See MATERIALX_CLOSURES in stdosl.h
+    MX_OREN_NAYAR_DIFFUSE_ID,
+    MX_BURLEY_DIFFUSE_ID,
+    MX_DIELECTRIC_ID,
+    MX_CONDUCTOR_ID,
+    MX_GENERALIZED_SCHLICK_ID,
+    MX_TRANSLUCENT_ID,
+    MX_TRANSPARENT_ID,
+    MX_SUBSURFACE_ID,
+    MX_SHEEN_ID,
+    MX_UNIFORM_EDF_ID,
+    MX_ANISOTROPIC_VDF_ID,
+    MX_MEDIUM_VDF_ID,
+    MX_LAYER_ID,
+    // TODO: adding vdfs would require extending testrender with volume support ...
+    EMPTY_ID
+};
+
+
+// Closure params
+namespace {
+
+// these structures hold the parameters of each closure type
+// they will be contained inside ClosureComponent
+struct EmptyParams {
+};
+struct DiffuseParams {
+    OSL::Vec3 N;
+};
+struct OrenNayarParams {
+    OSL::Vec3 N;
+    float sigma;
+};
+struct PhongParams {
+    OSL::Vec3 N;
+    float exponent;
+};
+struct WardParams {
+    OSL::Vec3 N, T;
+    float ax, ay;
+};
+struct ReflectionParams {
+    OSL::Vec3 N;
+    float eta;
+};
+struct RefractionParams {
+    OSL::Vec3 N;
+    float eta;
+};
+struct MicrofacetParams {
+    OSL::ustringhash dist;
+    OSL::Vec3 N, U;
+    float xalpha, yalpha, eta;
+    int refract;
+};
+
+// MATERIALX_CLOSURES
+
+struct MxOrenNayarDiffuseParams {
+    OSL::Vec3 N;
+    OSL::Color3 albedo;
+    float roughness;
+    // optional
+    OSL::ustringhash label;
+    int energy_compensation;
+};
+
+struct MxBurleyDiffuseParams {
+    OSL::Vec3 N;
+    OSL::Color3 albedo;
+    float roughness;
+    // optional
+    OSL::ustringhash label;
+};
+
+// common to all MaterialX microfacet closures
+struct MxMicrofacetBaseParams {
+    OSL::Vec3 N, U;
+    float roughness_x;
+    float roughness_y;
+    OSL::ustringhash distribution;
+    // optional
+    OSL::ustringhash label;
+};
+
+struct MxDielectricParams : public MxMicrofacetBaseParams {
+    OSL::Color3 reflection_tint;
+    OSL::Color3 transmission_tint;
+    float ior;
+    // optional
+    float thinfilm_thickness;
+    float thinfilm_ior;
+
+    OSL_HOSTDEVICE OSL::Color3 evalR(float cos_theta) const;
+    OSL_HOSTDEVICE OSL::Color3 evalT(float cos_theta) const;
+};
+
+struct MxConductorParams : public MxMicrofacetBaseParams {
+    OSL::Color3 ior;
+    OSL::Color3 extinction;
+    // optional
+    float thinfilm_thickness;
+    float thinfilm_ior;
+
+    OSL_HOSTDEVICE OSL::Color3 evalR(float cos_theta) const;
+    OSL_HOSTDEVICE OSL::Color3 evalT(float cos_theta) const;
+
+    // Avoid function was declared but never referenced
+    // float get_ior() const
+    // {
+    //     return 0;  // no transmission possible
+    // }
+};
+
+struct MxGeneralizedSchlickParams : public MxMicrofacetBaseParams {
+    OSL::Color3 reflection_tint;
+    OSL::Color3 transmission_tint;
+    OSL::Color3 f0;
+    OSL::Color3 f90;
+    float exponent;
+    // optional
+    float thinfilm_thickness;
+    float thinfilm_ior;
+
+    OSL_HOSTDEVICE OSL::Color3 evalR(float cos_theta) const;
+    OSL_HOSTDEVICE OSL::Color3 evalT(float cos_theta) const;
+};
+
+struct MxTranslucentParams {
+    OSL::Vec3 N;
+    OSL::Color3 albedo;
+    // optional
+    OSL::ustringhash label;
+};
+
+struct MxSubsurfaceParams {
+    OSL::Vec3 N;
+    OSL::Color3 albedo;
+    float transmission_depth;
+    OSL::Color3 transmission_color;
+    float anisotropy;
+    // optional
+    OSL::ustringhash label;
+};
+
+struct MxSheenParams {
+    OSL::Vec3 N;
+    OSL::Color3 albedo;
+    float roughness;
+    // optional
+    OSL::ustringhash label;
+};
+
+struct MxUniformEdfParams {
+    OSL::Color3 emittance;
+    // optional
+    OSL::ustringhash label;
+};
+
+struct MxLayerParams {
+    OSL::ClosureColor* top;
+    OSL::ClosureColor* base;
+};
+
+struct MxAnisotropicVdfParams {
+    OSL::Color3 albedo;
+    OSL::Color3 extinction;
+    float anisotropy;
+    // optional
+    OSL::ustringhash label;
+};
+
+struct MxMediumVdfParams {
+    OSL::Color3 albedo;
+    float transmission_depth;
+    OSL::Color3 transmission_color;
+    float anisotropy;
+    float ior;
+    int priority;
+    // optional
+    OSL::ustringhash label;
+};
+
+}
+
 
 OSL_NAMESPACE_ENTER
 
+
+struct ShadingResult;
+
+// Cast a BSDF* to the specified sub-type
+#define BSDF_CAST(BSDF_TYPE, bsdf) reinterpret_cast<const BSDF_TYPE*>(bsdf)
+
 /// Individual BSDF (diffuse, phong, refraction, etc ...)
 /// Actual implementations of this class are private
 struct BSDF {
     struct Sample {
-        Sample() : wi(0.0f), weight(0.0f), pdf(0.0f), roughness(0.0f) {}
-        Sample(const Sample& o)
+        OSL_HOSTDEVICE Sample() : wi(0.0f), weight(0.0f), pdf(0.0f), roughness(0.0f) {}
+        OSL_HOSTDEVICE Sample(const Sample& o)
             : wi(o.wi), weight(o.weight), pdf(o.pdf), roughness(o.roughness)
         {
         }
-        Sample(Vec3 wi, Color3 w, float pdf, float r)
+        OSL_HOSTDEVICE Sample(Vec3 wi, Color3 w, float pdf, float r)
             : wi(wi), weight(w), pdf(pdf), roughness(r)
         {
         }
@@ -32,48 +242,82 @@ struct BSDF {
         float pdf;
         float roughness;
     };
-    BSDF() {}
-    virtual Color3 get_albedo(const Vec3& /*wo*/) const { return Color3(1); }
-    virtual Sample eval(const Vec3& wo, const Vec3& wi) const = 0;
-    virtual Sample sample(const Vec3& wo, float rx, float ry, float rz) const
-        = 0;
+    OSL_HOSTDEVICE BSDF(ClosureIDs id=EMPTY_ID) : id(id) {}
+
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& /*wo*/) const
+    {
+        return Color3(1);
+    }
+
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const
+    {
+        return {};
+    }
+
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float rz) const
+    {
+        return {};
+    }
+
+    ClosureIDs id;
 };
 
+
 /// Represents a weighted sum of BSDFS
 /// NOTE: no need to inherit from BSDF here because we use a "flattened" representation and therefore never nest these
 ///
 struct CompositeBSDF {
-    CompositeBSDF() : num_bsdfs(0), num_bytes(0) {}
+    OSL_HOSTDEVICE CompositeBSDF() : num_bsdfs(0), num_bytes(0) {}
 
+    OSL_HOSTDEVICE
     void prepare(const Vec3& wo, const Color3& path_weight, bool absorb)
     {
         float total = 0;
         for (int i = 0; i < num_bsdfs; i++) {
-            pdfs[i] = weights[i].dot(path_weight * bsdfs[i]->get_albedo(wo))
+            pdfs[i] = weights[i].dot(path_weight * get_albedo(bsdfs[i], wo))
                       / (path_weight.x + path_weight.y + path_weight.z);
+
+#ifndef __CUDACC__
+            // TODO: Figure out what to do with weights/albedos with negative
+            //       components (e.g., as might happen when bipolar noise is
+            //       used as a color).
+
+            // The PDF is out-of-range in some test scenes on the CPU path, but
+            // these asserts are no-ops in release builds. The asserts are active
+            // on the CUDA path, so we need to skip them.
             assert(pdfs[i] >= 0);
             assert(pdfs[i] <= 1);
+#endif
             total += pdfs[i];
         }
         if ((!absorb && total > 0) || total > 1) {
-            for (int i = 0; i < num_bsdfs; i++)
+            for (int i = 0; i < num_bsdfs; i++) {
+#ifndef __CUDACC__
                 pdfs[i] /= total;
+#else
+                pdfs[i] = __fdiv_rn(pdfs[i], total);
+#endif
+            }
         }
     }
 
+    OSL_HOSTDEVICE
     Color3 get_albedo(const Vec3& wo) const
     {
         Color3 result(0, 0, 0);
-        for (int i = 0; i < num_bsdfs; i++)
-            result += weights[i] * bsdfs[i]->get_albedo(wo);
+        for (int i = 0; i < num_bsdfs; i++) {
+            result += weights[i] * get_albedo(bsdfs[i], wo);
+        }
         return result;
     }
 
+    OSL_HOSTDEVICE
     BSDF::Sample eval(const Vec3& wo, const Vec3& wi) const
     {
         BSDF::Sample s = {};
         for (int i = 0; i < num_bsdfs; i++) {
-            BSDF::Sample b = bsdfs[i]->eval(wo, wi);
+            BSDF::Sample b = eval(bsdfs[i], wo, wi);
             b.weight *= weights[i];
             MIS::update_eval(&s.weight, &s.pdf, b.weight, b.pdf, pdfs[i]);
             s.roughness += b.roughness * pdfs[i];
@@ -81,6 +325,7 @@ struct CompositeBSDF {
         return s;
     }
 
+    OSL_HOSTDEVICE
     BSDF::Sample sample(const Vec3& wo, float rx, float ry, float rz) const
     {
         float accum = 0;
@@ -88,7 +333,7 @@ struct CompositeBSDF {
             if (rx < (pdfs[i] + accum)) {
                 rx = (rx - accum) / pdfs[i];
                 rx = std::min(rx, 0.99999994f);  // keep result in [0,1)
-                BSDF::Sample s = bsdfs[i]->sample(wo, rx, ry, rz);
+                BSDF::Sample s = sample(bsdfs[i], wo, rx, ry, rz);
                 s.weight *= weights[i] * (1 / pdfs[i]);
                 s.pdf *= pdfs[i];
                 if (s.pdf == 0.0f)
@@ -96,7 +341,7 @@ struct CompositeBSDF {
                 // we sampled PDF i, now figure out how much the other bsdfs contribute to the chosen direction
                 for (int j = 0; j < num_bsdfs; j++) {
                     if (i != j) {
-                        BSDF::Sample b = bsdfs[j]->eval(wo, s.wi);
+                        BSDF::Sample b = eval(bsdfs[j], wo, s.wi);
                         b.weight *= weights[j];
                         MIS::update_eval(&s.weight, &s.pdf, b.weight, b.pdf,
                                          pdfs[j]);
@@ -110,9 +355,8 @@ struct CompositeBSDF {
     }
 
     template<typename BSDF_Type, typename... BSDF_Args>
-    bool add_bsdf(const Color3& w, BSDF_Args&&... args)
+    OSL_HOSTDEVICE bool add_bsdf(const Color3& w, BSDF_Args&&... args)
     {
-        // make sure we have enough space
         if (num_bsdfs >= MaxEntries)
             return false;
         if (num_bytes + sizeof(BSDF_Type) > MaxSize)
@@ -127,8 +371,14 @@ struct CompositeBSDF {
 
 private:
     /// Never try to copy this struct because it would invalidate the bsdf pointers
-    CompositeBSDF(const CompositeBSDF& c);
-    CompositeBSDF& operator=(const CompositeBSDF& c);
+    OSL_HOSTDEVICE CompositeBSDF(const CompositeBSDF& c);
+    OSL_HOSTDEVICE CompositeBSDF& operator=(const CompositeBSDF& c);
+
+    OSL_HOSTDEVICE Color3 get_albedo(const BSDF* bsdf, const Vec3& wo) const;
+    OSL_HOSTDEVICE BSDF::Sample eval(const BSDF* bsdf, const Vec3& wo,
+                                     const Vec3& wi) const;
+    OSL_HOSTDEVICE BSDF::Sample sample(const BSDF* bsdf, const Vec3& wo,
+                                       float rx, float ry, float rz) const;
 
     enum { MaxEntries = 8 };
     enum { MaxSize = 256 * sizeof(float) };
@@ -151,12 +401,15 @@ struct ShadingResult {
     int priority         = 0;
 };
 
+#ifndef __CUDACC__
 void
 register_closures(ShadingSystem* shadingsys);
-void
+#endif
+
+OSL_HOSTDEVICE void
 process_closure(const OSL::ShaderGlobals& sg, ShadingResult& result,
                 const ClosureColor* Ci, bool light_only);
-Vec3
+OSL_HOSTDEVICE Vec3
 process_background_closure(const ClosureColor* Ci);
 
 OSL_NAMESPACE_EXIT
diff --git a/src/testrender/simpleraytracer.cpp b/src/testrender/simpleraytracer.cpp
index 13c92b88f..477bc4a05 100644
--- a/src/testrender/simpleraytracer.cpp
+++ b/src/testrender/simpleraytracer.cpp
@@ -2,19 +2,21 @@
 // SPDX-License-Identifier: BSD-3-Clause
 // https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
 
-#include <OpenImageIO/filesystem.h>
-#include <OpenImageIO/parallel.h>
+#ifndef __CUDACC__
+#    include <OpenImageIO/filesystem.h>
+#    include <OpenImageIO/parallel.h>
 
-#include <pugixml.hpp>
+#    include <pugixml.hpp>
 
-#ifdef USING_OIIO_PUGI
+#    ifdef USING_OIIO_PUGI
 namespace pugi = OIIO::pugi;
-#endif
+#    endif
 
-#include <OSL/hashes.h>
-#include "raytracer.h"
-#include "shading.h"
-#include "simpleraytracer.h"
+#    include <OSL/hashes.h>
+#    include "raytracer.h"
+#    include "shading.h"
+#    include "simpleraytracer.h"
+#endif
 
 // Create ustrings for all strings used by the free function renderer services.
 // Required to allow the reverse mapping of hash->string to work when processing messages
@@ -41,6 +43,7 @@ using namespace OSL;
 
 OSL_NAMESPACE_ENTER
 
+#ifndef __CUDACC__
 static TypeDesc TypeFloatArray2(TypeDesc::FLOAT, 2);
 static TypeDesc TypeFloatArray4(TypeDesc::FLOAT, 4);
 static TypeDesc TypeIntArray2(TypeDesc::INT, 2);
@@ -846,36 +849,60 @@ SimpleRaytracer::globals_from_hit(ShaderGlobals& sg, const Ray& r,
     sg.renderstate = &sg;
 }
 
-Vec3
+#endif // #ifndef __CUDACC__
+
+
+#ifndef __CUDACC__
+    using ShaderGlobalsType = OSL::ShaderGlobals;
+#else
+    using ShaderGlobalsType = OSL_CUDA::ShaderGlobals;
+#endif
+
+
+OSL_HOSTDEVICE Vec3
 SimpleRaytracer::eval_background(const Dual2<Vec3>& dir, ShadingContext* ctx,
                                  int bounce)
 {
-    ShaderGlobals sg;
-    memset((char*)&sg, 0, sizeof(ShaderGlobals));
+    ShaderGlobalsType sg;
+    memset((char*)&sg, 0, sizeof(ShaderGlobalsType));
     sg.I    = dir.val();
     sg.dIdx = dir.dx();
     sg.dIdy = dir.dy();
     if (bounce >= 0)
         sg.raytype = bounce > 0 ? Ray::DIFFUSE : Ray::CAMERA;
+#ifndef __CUDACC__
     shadingsys->execute(*ctx, *m_shaders[backgroundShaderID], sg);
-    return process_background_closure(sg.Ci);
+#else
+    alignas(8) char closure_pool[256];
+    sg.shaderID = render_params.bg_id;
+    execute_shader(sg, closure_pool);
+#endif
+    return process_background_closure((const ClosureColor*) sg.Ci);
 }
 
-Color3
-SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
-                                   ShadingContext* ctx)
+
+OSL_HOSTDEVICE Color3
+SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler, ShadingContext* ctx)
 {
+#ifdef __CUDACC__
+    // Scratch space for the output closures
+    alignas(8) char closure_pool[256];
+    alignas(8) char light_closure_pool[256];
+#endif
+
     Ray r = camera.get(x, y);
     Color3 path_weight(1, 1, 1);
     Color3 path_radiance(0, 0, 0);
     int prev_id    = -1;
     float bsdf_pdf = std::numeric_limits<
         float>::infinity();  // camera ray has only one possible direction
+
     for (int b = 0; b <= max_bounces; b++) {
         // trace the ray against the scene
         Dual2<float> t;
-        int id = prev_id;
-        if (!scene.intersect(r, t, id)) {
+        int id       = prev_id;
+        ShaderGlobalsType sg;
+        if (!scene.intersect(r, t, id, &sg)) {
             // we hit nothing? check background shader
             if (backgroundShaderID >= 0) {
                 if (b > 0 && backgroundResolution > 0) {
@@ -894,8 +921,8 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
             break;
         }
 
+#ifndef __CUDACC__
         // construct a shader globals for the hit point
-        ShaderGlobals sg;
         globals_from_hit(sg, r, t, id);
         const float radius = r.radius + r.spread * t.val();
         int shaderID       = scene.shaderid(id);
@@ -904,9 +931,16 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
 
         // execute shader and process the resulting list of closures
         shadingsys->execute(*ctx, *m_shaders[shaderID], sg);
+#else
+        // The ShaderGlobals are populated in the closest-hit program,
+        // so we don't need to do that here.
+        execute_shader(sg, closure_pool);
+        const float radius = r.radius + r.spread * t.val();
+#endif
+
         ShadingResult result;
         bool last_bounce = b == max_bounces;
-        process_closure(sg, result, sg.Ci, last_bounce);
+        process_closure(sg, result, (const ClosureColor*)sg.Ci, last_bounce);
 
         // add self-emission
         float k = 1;
@@ -953,8 +987,10 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
                 Ray shadow_ray = Ray(sg.P, bg_dir.val(), radius, 0,
                                      Ray::SHADOW);
                 Dual2<float> shadow_dist;
-                if (!scene.intersect(shadow_ray, shadow_dist,
-                                     shadow_id))  // ray reached the background?
+                ShaderGlobalsType shadow_sg;
+
+                if (!scene.intersect(shadow_ray, shadow_dist, shadow_id,
+                                     &shadow_sg))  // ray reached the background?
                     path_radiance += contrib;
             }
         }
@@ -965,9 +1001,12 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
                 continue;  // skip self
             if (!scene.islight(lid))
                 continue;  // doesn't want to be sampled as a light
+
+#ifndef __CUDACC__
             int shaderID = scene.shaderid(lid);
             if (shaderID < 0 || !m_shaders[shaderID])
                 continue;  // no shader attached to this light
+#endif
             // sample a random direction towards the object
             float light_pdf;
             Vec3 ldir      = scene.sample(lid, sg.P, xi, yi, light_pdf);
@@ -975,21 +1014,26 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
             Color3 contrib = path_weight * b.weight
                              * MIS::power_heuristic<MIS::EVAL_WEIGHT>(light_pdf,
                                                                       b.pdf);
+
             if ((contrib.x + contrib.y + contrib.z) > 0) {
                 Ray shadow_ray = Ray(sg.P, ldir, radius, 0, Ray::SHADOW);
                 // trace a shadow ray and see if we actually hit the target
                 // in this tiny renderer, tracing a ray is probably cheaper than evaluating the light shader
                 int shadow_id = id;  // ignore self hit
                 Dual2<float> shadow_dist;
-                if (scene.intersect(shadow_ray, shadow_dist, shadow_id)
+                ShaderGlobalsType light_sg;
+                if (scene.intersect(shadow_ray, shadow_dist, shadow_id, &light_sg)
                     && shadow_id == lid) {
+#ifndef __CUDACC__
                     // setup a shader global for the point on the light
-                    ShaderGlobals light_sg;
                     globals_from_hit(light_sg, shadow_ray, shadow_dist, lid);
                     // execute the light shader (for emissive closures only)
                     shadingsys->execute(*ctx, *m_shaders[shaderID], light_sg);
+#else
+                    execute_shader(light_sg, light_closure_pool);
+#endif
                     ShadingResult light_result;
-                    process_closure(light_sg, light_result, light_sg.Ci, true);
+                    process_closure(light_sg, light_result, (const ClosureColor*) light_sg.Ci, true);
                     // accumulate contribution
                     path_radiance += contrib * light_result.Le;
                 }
@@ -1008,12 +1052,14 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
         if (!(path_weight.x > 0) && !(path_weight.y > 0)
             && !(path_weight.z > 0))
             break;  // filter out all 0's or NaNs
-        prev_id  = id;
-        r.origin = sg.P;
+        prev_id   = id;
+        r.origin  = sg.P;
     }
     return path_radiance;
 }
 
+#ifndef __CUDACC__
+
 Color3
 SimpleRaytracer::antialias_pixel(int x, int y, ShadingContext* ctx)
 {
@@ -1107,4 +1153,6 @@ SimpleRaytracer::clear()
     shaders().clear();
 }
 
+#endif // #ifndef __CUDACC__
+
 OSL_NAMESPACE_EXIT
diff --git a/src/testrender/simpleraytracer.h b/src/testrender/simpleraytracer.h
index e9cae486b..ea70e950f 100644
--- a/src/testrender/simpleraytracer.h
+++ b/src/testrender/simpleraytracer.h
@@ -99,6 +99,9 @@ class SimpleRaytracer : public RendererServices {
     OIIO::ParamValueList options;
     OIIO::ImageBuf pixelbuf;
 
+    int getBackgroundShaderID() const { return backgroundShaderID; }
+    int getBackgroundResolution() const { return backgroundResolution; }
+
 private:
     // Camera parameters
     Matrix44 m_world_to_camera;
diff --git a/src/testrender/testrender.cpp b/src/testrender/testrender.cpp
index aa807953e..d5ef062cd 100644
--- a/src/testrender/testrender.cpp
+++ b/src/testrender/testrender.cpp
@@ -51,6 +51,7 @@ static std::string texoptions;
 static int xres = 640, yres = 480;
 static int aa = 1, max_bounces = 1000000, rr_depth = 5;
 static float show_albedo_scale = 0.0f;
+static bool no_jitter          = false;
 static int num_threads         = 0;
 static int iters               = 1;
 static std::string scenefile, imagefile;
@@ -174,6 +175,8 @@ getargs(int argc, const char* argv[])
       .help("Trace NxN rays per pixel");
     ap.arg("-albedo %f:SCALE", &show_albedo_scale)
       .help("Visualize the albedo of each pixel instead of path tracing");
+    ap.arg("--no-jitter", &no_jitter)
+      .help("Disable AA pixel jitter (OptiX mode only)");
     ap.arg("--iters %d:N", &iters)
       .help("Number of iterations");
     ap.arg("-O0", &O0)
@@ -281,6 +284,7 @@ main(int argc, const char* argv[])
     rend->attribute("rr_depth", rr_depth);
     rend->attribute("aa", aa);
     rend->attribute("show_albedo_scale", show_albedo_scale);
+    rend->attribute("no_jitter", (int)no_jitter);
     OIIO::attribute("threads", num_threads);
 
 #if OSL_USE_OPTIX
@@ -368,6 +372,8 @@ main(int argc, const char* argv[])
 
     // We're done with the shading system now, destroy it
     rend->clear();
+    // We need to destroy the shading system before the renderer, because the
+    // shading system destructor may call into the renderer.
     delete shadingsys;
     delete rend;
     return EXIT_SUCCESS;
diff --git a/src/testshade/CMakeLists.txt b/src/testshade/CMakeLists.txt
index 88d710166..da7471620 100644
--- a/src/testshade/CMakeLists.txt
+++ b/src/testshade/CMakeLists.txt
@@ -24,7 +24,9 @@ if (OSL_USE_OPTIX)
         )
 
     set ( testshade_cuda_headers
-        ../testrender/cuda/rend_lib.h )
+        ../testrender/cuda/rend_lib.h
+        ../testrender/raytracer.h
+        )
 
     # We need to make sure that the PTX files are regenerated whenever these
     # headers change.
diff --git a/src/testshade/cuda/optix_grid_renderer.cu b/src/testshade/cuda/optix_grid_renderer.cu
index 1304e9bfa..fdc44fc4b 100644
--- a/src/testshade/cuda/optix_grid_renderer.cu
+++ b/src/testshade/cuda/optix_grid_renderer.cu
@@ -29,7 +29,7 @@ OSL_NAMESPACE_EXIT
 
 
 extern "C" {
-__device__ __constant__ RenderParams render_params;
+__device__ __constant__ testshade::RenderParams render_params;
 }
 
 extern "C" __global__ void
@@ -101,7 +101,7 @@ __raygen__()
     alignas(8) char closure_pool[256];
     alignas(8) char params[256];
 
-    ShaderGlobals sg;
+    OSL_CUDA::ShaderGlobals sg;
     // Setup the ShaderGlobals
     sg.I  = make_float3(0, 0, 1);
     sg.N  = make_float3(0, 0, 1);
@@ -130,7 +130,7 @@ __raygen__()
     sg.backfacing  = 0;
 
     // NB: These variables are not used in the current iteration of the sample
-    sg.raytype        = CAMERA;
+    sg.raytype        = OSL::Ray::CAMERA;
     sg.flipHandedness = 0;
 
     sg.shader2common = reinterpret_cast<void*>(render_params.shader2common);
@@ -143,20 +143,20 @@ __raygen__()
     // Run the OSL group and init functions
     if (render_params.fused_callable)
         // call osl_init_func
-        optixDirectCall<void, ShaderGlobals*, void*, void*, void*, int, void*>(
+        optixDirectCall<void, OSL_CUDA::ShaderGlobals*, void*, void*, void*, int, void*>(
             0u, &sg /*shaderglobals_ptr*/, params /*groupdata_ptr*/,
             nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/,
             0 /*shadeindex - unused*/,
             sbtdata->data /*interactive_params_ptr*/);
     else {
         // call osl_init_func
-        optixDirectCall<void, ShaderGlobals*, void*, void*, void*, int, void*>(
+        optixDirectCall<void, OSL_CUDA::ShaderGlobals*, void*, void*, void*, int, void*>(
             0u, &sg /*shaderglobals_ptr*/, params /*groupdata_ptr*/,
             nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/,
             0 /*shadeindex - unused*/,
             sbtdata->data /*interactive_params_ptr*/);
         // call osl_group_func
-        optixDirectCall<void, ShaderGlobals*, void*, void*, void*, int, void*>(
+        optixDirectCall<void, OSL_CUDA::ShaderGlobals*, void*, void*, void*, int, void*>(
             1u, &sg /*shaderglobals_ptr*/, params /*groupdata_ptr*/,
             nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/,
             0 /*shadeindex - unused*/,
@@ -173,8 +173,10 @@ __raygen__()
 // Because clang++ 9.0 seems to have trouble with some of the texturing "intrinsics"
 // let's do the texture look-ups in this file.
 extern "C" __device__ float4
-osl_tex2DLookup(void* handle, float s, float t)
+osl_tex2DLookup(void* handle, float s, float t, float dsdx, float dtdx, float dsdy, float dtdy)
 {
+    const float2 dx = {dsdx, dtdx};
+    const float2 dy = {dsdy, dtdy};
     cudaTextureObject_t texID = cudaTextureObject_t(handle);
-    return tex2D<float4>(texID, s, t);
+    return tex2DGrad<float4>(texID, s, t, dx, dy);
 }
diff --git a/src/testshade/optixgridrender.cpp b/src/testshade/optixgridrender.cpp
index ff64029cc..d066c777f 100644
--- a/src/testshade/optixgridrender.cpp
+++ b/src/testshade/optixgridrender.cpp
@@ -14,7 +14,7 @@
 #include "render_params.h"
 
 #include <cuda.h>
-#include <nvrtc.h>
+#include <cuda_runtime.h>
 #include <optix_function_table_definition.h>
 #include <optix_stack_size.h>
 #include <optix_stubs.h>
@@ -33,6 +33,9 @@ const auto optixModuleCreateFn = optixModuleCreate;
 #endif
 
 
+using namespace testshade;
+
+
 OSL_NAMESPACE_ENTER
 
 
@@ -179,6 +182,8 @@ OptixGridRenderer::~OptixGridRenderer()
 {
     for (void* p : m_ptrs_to_free)
         cudaFree(p);
+    for (void* p : m_arrays_to_free)
+        cudaFreeArray(reinterpret_cast<cudaArray_t>(p));
     if (m_optix_ctx)
         OPTIX_CHECK(optixDeviceContextDestroy(m_optix_ctx));
 }
@@ -817,53 +822,82 @@ OptixGridRenderer::get_texture_handle(ustring filename,
 {
     auto itr = m_samplers.find(filename);
     if (itr == m_samplers.end()) {
-        // Open image
+        // Open image to check the number of mip levels
         OIIO::ImageBuf image;
         if (!image.init_spec(filename, 0, 0)) {
             errhandler().errorfmt("Could not load: {} (hash {})", filename,
                                   filename);
             return (TextureHandle*)nullptr;
         }
-
-        OIIO::ROI roi = OIIO::get_roi_full(image.spec());
-        int32_t width = roi.width(), height = roi.height();
-        std::vector<float> pixels(width * height * 4);
-
-        for (int j = 0; j < height; j++) {
-            for (int i = 0; i < width; i++) {
-                image.getpixel(i, j, 0, &pixels[((j * width) + i) * 4 + 0]);
-            }
-        }
-        cudaResourceDesc res_desc = {};
+        int32_t nmiplevels = image.nmiplevels();
+        int32_t img_width  = image.xmax() + 1;
+        int32_t img_height = image.ymax() + 1;
 
         // hard-code textures to 4 channels
-        int32_t pitch = width * 4 * sizeof(float);
         cudaChannelFormatDesc channel_desc
             = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
 
-        cudaArray_t pixelArray;
-        CUDA_CHECK(cudaMallocArray(&pixelArray, &channel_desc, width, height));
+        cudaMipmappedArray_t mipmapArray;
+        cudaExtent extent = make_cudaExtent(img_width, img_height, 0);
+        CUDA_CHECK(cudaMallocMipmappedArray(&mipmapArray, &channel_desc, extent,
+                                            nmiplevels));
+
+        // Copy the pixel data for each mip level
+        std::vector<std::vector<float>> level_pixels(nmiplevels);
+        for (int32_t level = 0; level < nmiplevels; ++level) {
+            image.reset(filename, 0, level);
+            OIIO::ROI roi = OIIO::get_roi_full(image.spec());
+            if (!roi.defined()) {
+                errhandler().errorfmt(
+                    "Could not load mip level {}: {} (hash {})", level,
+                    filename, filename);
+                return (TextureHandle*)nullptr;
+            }
 
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(pixelArray));
+            int32_t width = roi.width(), height = roi.height();
+            level_pixels[level].resize(width * height * 4);
+            for (int j = 0; j < height; j++) {
+                for (int i = 0; i < width; i++) {
+                    image.getpixel(i, j, 0,
+                                   &level_pixels[level][((j * width) + i) * 4]);
+                }
+            }
 
-        CUDA_CHECK(cudaMemcpy2DToArray(pixelArray,
-                                       /* offset */ 0, 0, pixels.data(), pitch,
-                                       pitch, height, cudaMemcpyHostToDevice));
+            cudaArray_t miplevelArray;
+            CUDA_CHECK(
+                cudaGetMipmappedArrayLevel(&miplevelArray, mipmapArray, level));
 
-        res_desc.resType         = cudaResourceTypeArray;
-        res_desc.res.array.array = pixelArray;
+            // Copy the texel data into the miplevel array
+            int32_t pitch = width * 4 * sizeof(float);
+            CUDA_CHECK(cudaMemcpy2DToArray(miplevelArray, 0, 0,
+                                           level_pixels[level].data(), pitch,
+                                           pitch, height,
+                                           cudaMemcpyHostToDevice));
+        }
 
-        cudaTextureDesc tex_desc = {};
-        tex_desc.addressMode[0]  = cudaAddressModeWrap;
-        tex_desc.addressMode[1]  = cudaAddressModeWrap;
-        tex_desc.filterMode      = cudaFilterModeLinear;
-        tex_desc.readMode
-            = cudaReadModeElementType;  //cudaReadModeNormalizedFloat;
+        int32_t pitch = img_width * 4 * sizeof(float);
+        cudaArray_t pixelArray;
+        CUDA_CHECK(
+            cudaMallocArray(&pixelArray, &channel_desc, img_width, img_height));
+        CUDA_CHECK(cudaMemcpy2DToArray(pixelArray, 0, 0, level_pixels[0].data(),
+                                       pitch, pitch, img_height,
+                                       cudaMemcpyHostToDevice));
+        m_arrays_to_free.push_back(reinterpret_cast<void*>(pixelArray));
+
+        cudaResourceDesc res_desc  = {};
+        res_desc.resType           = cudaResourceTypeMipmappedArray;
+        res_desc.res.mipmap.mipmap = mipmapArray;
+
+        cudaTextureDesc tex_desc     = {};
+        tex_desc.addressMode[0]      = cudaAddressModeWrap;
+        tex_desc.addressMode[1]      = cudaAddressModeWrap;
+        tex_desc.filterMode          = cudaFilterModeLinear;
+        tex_desc.readMode            = cudaReadModeElementType;
         tex_desc.normalizedCoords    = 1;
         tex_desc.maxAnisotropy       = 1;
-        tex_desc.maxMipmapLevelClamp = 99;
+        tex_desc.maxMipmapLevelClamp = float(nmiplevels - 1);
         tex_desc.minMipmapLevelClamp = 0;
-        tex_desc.mipmapFilterMode    = cudaFilterModePoint;
+        tex_desc.mipmapFilterMode    = cudaFilterModeLinear;
         tex_desc.borderColor[0]      = 1.0f;
         tex_desc.sRGB                = 0;
 
diff --git a/src/testshade/optixgridrender.h b/src/testshade/optixgridrender.h
index 5f630f922..f689217d2 100644
--- a/src/testshade/optixgridrender.h
+++ b/src/testshade/optixgridrender.h
@@ -105,6 +105,7 @@ class OptixGridRenderer final : public SimpleRenderer {
 
     // CUdeviceptrs that need to be freed after we are done
     std::vector<void*> m_ptrs_to_free;
+    std::vector<void*> m_arrays_to_free;
 };
 
 
diff --git a/src/testshade/render_params.h b/src/testshade/render_params.h
index c2e9c26c9..6670fe099 100644
--- a/src/testshade/render_params.h
+++ b/src/testshade/render_params.h
@@ -4,6 +4,8 @@
 
 #pragma once
 
+namespace testshade {
+
 struct RenderParams {
     float invw;
     float invh;
@@ -40,3 +42,5 @@ struct GenericRecord {
     // What follows should duplicate GenericData
     void* data;
 };
+
+}  // namespace testshade
diff --git a/testsuite/testoptix-noise/ref/out.exr b/testsuite/testoptix-noise/ref/out.exr
index a1a3cefe1..3c38e37e1 100644
Binary files a/testsuite/testoptix-noise/ref/out.exr and b/testsuite/testoptix-noise/ref/out.exr differ
diff --git a/testsuite/testoptix-noise/ref/out_02.exr b/testsuite/testoptix-noise/ref/out_02.exr
index 7f29b6185..c97803b2a 100644
Binary files a/testsuite/testoptix-noise/ref/out_02.exr and b/testsuite/testoptix-noise/ref/out_02.exr differ
diff --git a/testsuite/testoptix-noise/run.py b/testsuite/testoptix-noise/run.py
index 865b73183..3a787e3dd 100755
--- a/testsuite/testoptix-noise/run.py
+++ b/testsuite/testoptix-noise/run.py
@@ -7,5 +7,5 @@
 failthresh = 0.03   # allow a little more LSB noise between platforms
 failpercent = .5
 outputs  = [ "out.exr", "out_02.exr" ]
-command  = testrender("-optix -res 320 240 scene.xml out.exr")
-command += testrender("-optix -res 320 240 scene_02.xml out_02.exr")
+command  = testrender("-optix -res 320 240 -no-jitter -albedo 1.0 scene.xml out.exr")
+command += testrender("-optix -res 320 240 -no-jitter -albedo 1.0 scene_02.xml out_02.exr")
diff --git a/testsuite/testoptix-reparam/ref/out.exr b/testsuite/testoptix-reparam/ref/out.exr
index 895e33842..deb4c66a2 100644
Binary files a/testsuite/testoptix-reparam/ref/out.exr and b/testsuite/testoptix-reparam/ref/out.exr differ
diff --git a/testsuite/testoptix-reparam/run.py b/testsuite/testoptix-reparam/run.py
index ef900eae8..b196355b0 100755
--- a/testsuite/testoptix-reparam/run.py
+++ b/testsuite/testoptix-reparam/run.py
@@ -7,4 +7,4 @@
 failthresh = 0.03   # allow a little more LSB noise between platforms
 failpercent = .5
 outputs  = [ "out.exr" ]
-command  = testrender("-optix -res 320 240 scene.xml out.exr")
+command  = testrender("-optix -res 320 240 -no-jitter -albedo 1.0 scene.xml out.exr")
diff --git a/testsuite/testoptix/ref/out.exr b/testsuite/testoptix/ref/out.exr
index 9ab3ac64b..b3f22b78a 100644
Binary files a/testsuite/testoptix/ref/out.exr and b/testsuite/testoptix/ref/out.exr differ
diff --git a/testsuite/testoptix/ref/out.txt b/testsuite/testoptix/ref/out.txt
index e8fc1c646..c545d4b61 100644
--- a/testsuite/testoptix/ref/out.txt
+++ b/testsuite/testoptix/ref/out.txt
@@ -16,8 +16,6 @@ temp:  abracadabra
 temp2: open sesame
 str:             default
 hash(str):       3d6b922d
-strlen(str):     7
-getchar(str, 0): 100
 str: userdata string
 
 Output Cout to test_spline.exr
diff --git a/testsuite/testoptix/ref/test_microfacet_dist.exr b/testsuite/testoptix/ref/test_microfacet_dist.exr
index 34317b655..350f9c3fc 100644
Binary files a/testsuite/testoptix/ref/test_microfacet_dist.exr and b/testsuite/testoptix/ref/test_microfacet_dist.exr differ
diff --git a/testsuite/testoptix/ref/test_spline.exr b/testsuite/testoptix/ref/test_spline.exr
index 6d19261e9..636870be2 100644
Binary files a/testsuite/testoptix/ref/test_spline.exr and b/testsuite/testoptix/ref/test_spline.exr differ
diff --git a/testsuite/testoptix/ref/test_texture.exr b/testsuite/testoptix/ref/test_texture.exr
index cc780678b..0026f1084 100644
Binary files a/testsuite/testoptix/ref/test_texture.exr and b/testsuite/testoptix/ref/test_texture.exr differ
diff --git a/testsuite/testoptix/run.py b/testsuite/testoptix/run.py
index 833915c27..660e7e867 100755
--- a/testsuite/testoptix/run.py
+++ b/testsuite/testoptix/run.py
@@ -7,14 +7,14 @@
 failthresh = 0.03   # allow a little more LSB noise between platforms
 failpercent = .5
 outputs  = [ "out.exr", "test_microfacet_dist.exr", "test_texture.exr", "test_spline.exr", "out.txt" ]
-command  = testrender("-optix -res 320 240 scene.xml out.exr")
-command += testrender("-optix -res 320 240 test_microfacet_dist.xml test_microfacet_dist.exr")
-command += testrender("-optix -res 1 1 test_print.xml dummy.exr")
-command += testrender("-optix -res 1 1 test_compare.xml dummy.exr")
-command += testrender("-optix -res 1 1 test_assign.xml dummy.exr")
-command += testrender("-optix -res 1 1 test_assign_02.xml dummy.exr")
-command += testrender("-optix -res 1 1 test_str_ops.xml dummy.exr")
-command += testrender("-optix -res 1 1 test_userdata_string.xml dummy.exr")
+command  = testrender("-optix -res 320 240 -no-jitter -albedo 1.0 scene.xml out.exr")
+command += testrender("-optix -res 320 240 -no-jitter -albedo 1.0 test_microfacet_dist.xml test_microfacet_dist.exr")
+command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_print.xml dummy.exr")
+command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_compare.xml dummy.exr")
+command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_assign.xml dummy.exr")
+command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_assign_02.xml dummy.exr")
+command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_str_ops.xml dummy.exr")
+command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_userdata_string.xml dummy.exr")
 
 command += testshade("-optix -res 256 256 test_spline -o Cout test_spline.exr")
 command += testshade("-optix -res 512 512 test_texture -o Cout test_texture.exr")
diff --git a/testsuite/testoptix/test_str_ops.osl b/testsuite/testoptix/test_str_ops.osl
index b23c79576..e7c72463e 100644
--- a/testsuite/testoptix/test_str_ops.osl
+++ b/testsuite/testoptix/test_str_ops.osl
@@ -6,7 +6,9 @@ surface test_str_ops (string str = "default", color Cs = 1)
 {
     printf("str:             %s\n", str);
     printf("hash(str):       %x\n", hash(str));
-    printf("strlen(str):     %d\n", strlen(str));
-    printf("getchar(str, 0): %d\n", getchar(str, 0));
+    // NB: These string operations don't work with the current
+    //     device string implementation.
+    // printf("strlen(str):     %d\n", strlen(str));
+    // printf("getchar(str, 0): %d\n", getchar(str, 0));
     Ci = Cs * diffuse (N);
 }