From ae76938cfc0b0dc8f6a72f4b20851268c2653e36 Mon Sep 17 00:00:00 2001 From: Tuomas Tonteri Date: Thu, 4 Jul 2024 20:18:54 +0300 Subject: [PATCH 1/3] Add support for b4_SSE2 batched mode. Signed-off-by: Tuomas Tonteri --- .github/workflows/ci.yml | 11 + INSTALL.md | 6 +- src/cmake/compiler.cmake | 2 +- src/include/OSL/batched_texture.h | 9 +- src/include/OSL/llvm_util.h | 3 + src/include/OSL/rendererservices.h | 1 + src/liboslexec/CMakeLists.txt | 4 + src/liboslexec/batched_analysis.cpp | 8 +- src/liboslexec/batched_backendllvm.cpp | 1 + src/liboslexec/batched_llvm_instance.cpp | 44 +++ src/liboslexec/batched_rendservices.cpp | 1 + src/liboslexec/context.cpp | 1 + src/liboslexec/llvm_passes.h | 2 + src/liboslexec/llvm_util.cpp | 370 ++++++++++++++---- src/liboslexec/rendservices.cpp | 7 + src/liboslexec/shadingsys.cpp | 31 +- src/testshade/batched_simplerend.cpp | 1 + src/testshade/simplerend.cpp | 4 +- src/testshade/simplerend.h | 5 + src/testshade/testshade.cpp | 19 +- .../oslbatcheddeformer.cpp | 16 +- 21 files changed, 463 insertions(+), 83 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0ff552ad3..4a00e9814 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,6 +74,17 @@ jobs: pybind11_ver: v2.5.0 simd: sse4.2 setenvs: export CONAN_LLVM_VERSION=10.0.1 + - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 sse2 batch-b4sse2 + nametag: linux-vfx2021 + runner: ubuntu-latest + container: aswftesting/ci-osl:2021-clang11 + vfxyear: 2021 + cxx_std: 17 + openimageio_ver: v2.4.13.0 + python_ver: 3.7 + pybind11_ver: v2.7.0 + simd: sse2 + batched: b4_SSE2 - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 avx2 batch-b8avx2 nametag: linux-vfx2021 runner: ubuntu-latest diff --git a/INSTALL.md b/INSTALL.md index 25ec63d52..58ba76337 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -9,9 +9,9 @@ and aarch64), and Windows (x86_64). It may build and run on other platforms as well, but we don't officially support or test other than these platforms. Shader execution is supported on the native architectures of those x86_64 and -aarch64 platforms, a special batched 8- or 16-wide SIMD execution mode -requiring x86_64 with AVX2 or AVX-512 instructions, as well as on NVIDIA GPUs -using Cuda+OptiX. +aarch64 platforms, a special batched 4-, 8- or 16-wide SIMD execution mode +requiring x86_64 with SSE2, AVX/AVX2 or AVX-512 instructions, as well as on +NVIDIA GPUs using Cuda+OptiX. 
Dependencies ------------ diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake index 81ac2092f..4e86cc055 100644 --- a/src/cmake/compiler.cmake +++ b/src/cmake/compiler.cmake @@ -329,7 +329,7 @@ endif () # # The USE_BATCHED option may be set to indicate that support for batched # SIMD shader execution be compiled along with targe specific libraries -set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)") +set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)") option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF) set (BATCHED_SUPPORT_DEFINES "") set (BATCHED_TARGET_LIBS "") diff --git a/src/include/OSL/batched_texture.h b/src/include/OSL/batched_texture.h index c720e9bed..787664472 100644 --- a/src/include/OSL/batched_texture.h +++ b/src/include/OSL/batched_texture.h @@ -49,6 +49,9 @@ static_assert(std::alignment_of>::value static_assert(std::alignment_of>::value == VecReg<8>::alignment, "Expect alignment of data member to set alignment of struct"); +static_assert(std::alignment_of>::value + == VecReg<4>::alignment, + "Expect alignment of data member to set alignment of struct"); template struct BatchedTextureOptions { VaryingTextureOptions varying; @@ -90,11 +93,15 @@ static_assert(std::alignment_of>::value static_assert(std::alignment_of>::value == VecReg<8>::alignment, "Expect alignment of data member to set alignment of struct"); +static_assert(std::alignment_of>::value + == VecReg<4>::alignment, + "Expect alignment of data member to set alignment of struct"); #ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH // Code here is to validate our OSL BatchedTextureOptions is binary compatible // and safe to reinterpret_cast -static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8), +static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8) + || (OIIO::Tex::BatchWidth == 4), "This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH=16"); namespace validate_offsets { diff --git a/src/include/OSL/llvm_util.h b/src/include/OSL/llvm_util.h index 7f112ccf5..49df62891 100644 --- a/src/include/OSL/llvm_util.h +++ b/src/include/OSL/llvm_util.h @@ -693,6 +693,8 @@ class OSLEXECPUBLIC LLVM_Util { llvm::Constant* constant(uint32_t i); /// Return an llvm::Constant holding the given integer constant. + llvm::Constant* constant4(int8_t i); + llvm::Constant* constant4(uint8_t i); llvm::Constant* constant8(int8_t i); llvm::Constant* constant8(uint8_t i); llvm::Constant* constant16(int16_t i); @@ -1229,6 +1231,7 @@ class OSLEXECPUBLIC LLVM_Util { llvm::Value* op_linearize_16x_indices(llvm::Value* wide_index); llvm::Value* op_linearize_8x_indices(llvm::Value* wide_index); + llvm::Value* op_linearize_4x_indices(llvm::Value* wide_index); std::array op_split_16x(llvm::Value* vector_val); std::array op_split_8x(llvm::Value* vector_val); std::array op_quarter_16x(llvm::Value* vector_val); diff --git a/src/include/OSL/rendererservices.h b/src/include/OSL/rendererservices.h index 04e5269ae..62a6b6179 100644 --- a/src/include/OSL/rendererservices.h +++ b/src/include/OSL/rendererservices.h @@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices { /// Unless overridden, a nullptr is returned. 
virtual BatchedRendererServices<16>* batched(WidthOf<16>); virtual BatchedRendererServices<8>* batched(WidthOf<8>); + virtual BatchedRendererServices<4>* batched(WidthOf<4>); protected: TextureSystem* m_texturesys; // A place to hold a TextureSystem diff --git a/src/liboslexec/CMakeLists.txt b/src/liboslexec/CMakeLists.txt index 328565af6..2a2ea5c1b 100644 --- a/src/liboslexec/CMakeLists.txt +++ b/src/liboslexec/CMakeLists.txt @@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST}) list (APPEND TARGET_CXX_OPTS "-march=core-avx2") elseif (${TARGET_OPT_ISA} STREQUAL "AVX") list (APPEND TARGET_CXX_OPTS "-march=corei7-avx") + elseif (${TARGET_OPT_ISA} STREQUAL "SSE2") + list (APPEND TARGET_CXX_OPTS "-march=x86-64") else () message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}") endif () @@ -455,6 +457,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST}) list (APPEND TARGET_CXX_OPTS "-march=haswell") elseif (${TARGET_OPT_ISA} STREQUAL "AVX") list (APPEND TARGET_CXX_OPTS "-march=sandybridge") + elseif (${TARGET_OPT_ISA} STREQUAL "SSE2") + list (APPEND TARGET_CXX_OPTS "-march=x86-64") else () message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}") endif () diff --git a/src/liboslexec/batched_analysis.cpp b/src/liboslexec/batched_analysis.cpp index 888f19874..9f76c1acf 100644 --- a/src/liboslexec/batched_analysis.cpp +++ b/src/liboslexec/batched_analysis.cpp @@ -1813,10 +1813,16 @@ struct Analyzer { // specific BatchedRendererServices. // Right here we don't know which width will be used, // so we will just require all widths provide the same answer + auto rs4 = m_ba.renderer()->batched(WidthOf<4>()); auto rs8 = m_ba.renderer()->batched(WidthOf<8>()); auto rs16 = m_ba.renderer()->batched(WidthOf<16>()); - if (rs8 || rs16) { + if (rs4 || rs8 || rs16) { get_attr_is_uniform = true; + if (rs4) { + get_attr_is_uniform + &= rs4->is_attribute_uniform(obj_name, + attr_name); + } if (rs8) { get_attr_is_uniform &= rs8->is_attribute_uniform(obj_name, diff --git a/src/liboslexec/batched_backendllvm.cpp b/src/liboslexec/batched_backendllvm.cpp index e94122ef4..79f87ca90 100644 --- a/src/liboslexec/batched_backendllvm.cpp +++ b/src/liboslexec/batched_backendllvm.cpp @@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys, switch (vector_width()) { case 16: m_true_mask_value = Mask<16>(true).value(); break; case 8: m_true_mask_value = Mask<8>(true).value(); break; + case 4: m_true_mask_value = Mask<4>(true).value(); break; default: OSL_ASSERT(0 && "unsupported vector width"); } ll.dumpasm(shadingsys.m_llvm_dumpasm); diff --git a/src/liboslexec/batched_llvm_instance.cpp b/src/liboslexec/batched_llvm_instance.cpp index 8e6ff0a76..218063786 100644 --- a/src/liboslexec/batched_llvm_instance.cpp +++ b/src/liboslexec/batched_llvm_instance.cpp @@ -537,6 +537,33 @@ const char* = "b8_AVX_"; #endif +#ifdef __OSL_SUPPORTS_b4_SSE2 +template<> +const NameAndSignature + ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[] + = { +# define DECL_INDIRECT(name, signature) \ + NameAndSignature { #name, signature }, +# define DECL(name, signature) DECL_INDIRECT(name, signature) +# define __OSL_WIDTH 4 +# define __OSL_TARGET_ISA SSE2 +// Don't allow order of xmacro includes be rearranged +// clang-format off +# include "wide/define_opname_macros.h" +# include "builtindecl_wide_xmacro.h" +# include "wide/undef_opname_macros.h" +// clang-format on +# undef __OSL_TARGET_ISA +# undef 
__OSL_WIDTH +# undef DECL +# undef DECL_INDIRECT + }; +template<> +const char* + ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string + = "b4_SSE2_"; +#endif + std::unique_ptr @@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context, default: break; } break; + case 4: + switch (target_isa) { +#ifdef __OSL_SUPPORTS_b4_SSE2 + case TargetISA::x64: + return RetType( + new ConcreteTargetLibraryHelper<4, TargetISA::x64>()); +#endif + default: break; + } + break; + default: OSL_ASSERT(0 && "unsupported vector width"); } std::cerr << "Build is not configured to support TargetISA of " @@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options() { std::vector offset_by_index; switch (m_width) { + case 4: + build_offsets_of_BatchedTextureOptions<4>(offset_by_index); + break; case 8: build_offsets_of_BatchedTextureOptions<8>(offset_by_index); break; @@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run() { std::vector offset_by_index; switch (m_width) { + case 4: + build_offsets_of_BatchedShaderGlobals<4>(offset_by_index); + break; case 8: build_offsets_of_BatchedShaderGlobals<8>(offset_by_index); break; diff --git a/src/liboslexec/batched_rendservices.cpp b/src/liboslexec/batched_rendservices.cpp index fbff377b2..1c5fcaa4a 100644 --- a/src/liboslexec/batched_rendservices.cpp +++ b/src/liboslexec/batched_rendservices.cpp @@ -328,5 +328,6 @@ BatchedRendererServices::getmessage(BatchedShaderGlobals* bsg, // Explicitly instantiate BatchedRendererServices template template class OSLEXECPUBLIC BatchedRendererServices<16>; template class OSLEXECPUBLIC BatchedRendererServices<8>; +template class OSLEXECPUBLIC BatchedRendererServices<4>; OSL_NAMESPACE_EXIT diff --git a/src/liboslexec/context.cpp b/src/liboslexec/context.cpp index a97b427e1..b001315a8 100644 --- a/src/liboslexec/context.cpp +++ b/src/liboslexec/context.cpp @@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg) // Explicit template instantiation for supported batch sizes template class ShadingContext::Batched<16>; template class ShadingContext::Batched<8>; +template class ShadingContext::Batched<4>; #endif diff --git a/src/liboslexec/llvm_passes.h b/src/liboslexec/llvm_passes.h index 852ec82f9..43c7a7289 100644 --- a/src/liboslexec/llvm_passes.h +++ b/src/liboslexec/llvm_passes.h @@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final // including this file will need its own static members defined. LLVM will // assign IDs when they get registered, so this initialization value is not // important. 
+template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0; + template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0; template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0; diff --git a/src/liboslexec/llvm_util.cpp b/src/liboslexec/llvm_util.cpp index 3dd888cab..2a434a0b8 100644 --- a/src/liboslexec/llvm_util.cpp +++ b/src/liboslexec/llvm_util.cpp @@ -619,6 +619,12 @@ LLVM_Util::SetupLLVM() #ifndef OSL_LLVM_NEW_PASS_MANAGER // LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks + static llvm::RegisterPass< + LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>> + sRegCustomPass2( + "PreventBitMasksFromBeingLiveinsToBasicBlocks<4>", + "Prevent Bit Masks <4xi1> From Being Liveins To Basic Blocks Pass", + false /* Only looks at CFG */, false /* Analysis Pass */); static llvm::RegisterPass< LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>> sRegCustomPass0( @@ -2305,7 +2311,11 @@ LLVM_Util::setup_new_optimization_passes(int optlevel, bool target_host) break; } case 4: - // We don't use masking or SIMD shading for 4-wide + // MUST BE THE FINAL PASS! + m_new_pass_manager->module_pass_manager.addPass( + createModuleToFunctionPassAdaptor( + NewPreventBitMasksFromBeingLiveinsToBasicBlocks<4>( + context()))); break; default: std::cout << "m_vector_width = " << m_vector_width << "\n"; @@ -2618,7 +2628,9 @@ LLVM_Util::setup_legacy_optimization_passes(int optlevel, bool target_host) new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>()); break; case 4: - // We don't use masking or SIMD shading for 4-wide + // MUST BE THE FINAL PASS! + mpm.add( + new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>()); break; default: std::cout << "m_vector_width = " << m_vector_width << "\n"; @@ -3393,6 +3405,19 @@ LLVM_Util::constant(uint32_t i) return llvm::ConstantInt::get(context(), llvm::APInt(32, i)); } +llvm::Constant* +LLVM_Util::constant4(int8_t i) +{ + return llvm::ConstantInt::get(context(), + llvm::APInt(4, i, true /*signed*/)); +} + +llvm::Constant* +LLVM_Util::constant4(uint8_t i) +{ + return llvm::ConstantInt::get(context(), llvm::APInt(4, i)); +} + llvm::Constant* LLVM_Util::constant8(int8_t i) { @@ -3592,6 +3617,11 @@ LLVM_Util::mask_as_int(llvm::Value* mask) // and all types are happy intMaskType = type_int8(); break; + case 4: + // We can just reinterpret cast a 4 bit mask to a 8 bit integer + // and all types are happy + intMaskType = type_int8(); + break; default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -3659,6 +3689,25 @@ LLVM_Util::mask_as_int(llvm::Value* mask) int8_mask = builder().CreateCall(func, toArrayRef(args)); return int8_mask; } + case 4: { + // We need to do more than a simple cast to an int. Since we + // know vectorized comparison for SSE2 ends up setting 4 + // 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to + // do more than a simple cast to an int. + + // Convert <4 x i1> -> <4 x i32> + llvm::Value* w4_int_mask = builder().CreateSExt(mask, + type_wide_int()); + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. 
+ llvm::Function* func = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); + + llvm::Value* args[1] = { w4_int_mask }; + llvm::Value* int8_mask; + int8_mask = builder().CreateCall(func, toArrayRef(args)); + return int8_mask; + } default: { OSL_ASSERT(0 && "unsupported native bit mask width"); return mask; @@ -3678,28 +3727,18 @@ LLVM_Util::mask_as_int(llvm::Value* mask) auto w4_int_masks = op_quarter_16x(wide_int_mask); // Now we will use the horizontal sign extraction intrinsic - // to build a 32 bit mask value. However the only 128bit - // version works on floats, so we will cast from int32 to - // float beforehand - llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); - std::array w4_float_masks = { - { builder().CreateBitCast(w4_int_masks[0], w4_float_type), - builder().CreateBitCast(w4_int_masks[1], w4_float_type), - builder().CreateBitCast(w4_int_masks[2], w4_float_type), - builder().CreateBitCast(w4_int_masks[3], w4_float_type) } - }; - + // to build a 32 bit mask value. llvm::Function* func = llvm::Intrinsic::getDeclaration( - module(), llvm::Intrinsic::x86_sse_movmsk_ps); + module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); - llvm::Value* args[1] = { w4_float_masks[0] }; + llvm::Value* args[1] = { w4_int_masks[0] }; std::array int4_masks; int4_masks[0] = builder().CreateCall(func, toArrayRef(args)); - args[0] = w4_float_masks[1]; + args[0] = w4_int_masks[1]; int4_masks[1] = builder().CreateCall(func, toArrayRef(args)); - args[0] = w4_float_masks[2]; + args[0] = w4_int_masks[2]; int4_masks[2] = builder().CreateCall(func, toArrayRef(args)); - args[0] = w4_float_masks[3]; + args[0] = w4_int_masks[3]; int4_masks[3] = builder().CreateCall(func, toArrayRef(args)); llvm::Value* bits12_15 = op_shl(int4_masks[3], constant(12)); @@ -3720,22 +3759,14 @@ LLVM_Util::mask_as_int(llvm::Value* mask) auto w4_int_masks = op_split_8x(wide_int_mask); // Now we will use the horizontal sign extraction intrinsic - // to build a 32 bit mask value. However the only 128bit - // version works on floats, so we will cast from int32 to - // float beforehand - llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); - std::array w4_float_masks = { - { builder().CreateBitCast(w4_int_masks[0], w4_float_type), - builder().CreateBitCast(w4_int_masks[1], w4_float_type) } - }; - + // to build a 32 bit mask value. llvm::Function* func = llvm::Intrinsic::getDeclaration( - module(), llvm::Intrinsic::x86_sse_movmsk_ps); + module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); - llvm::Value* args[1] = { w4_float_masks[0] }; + llvm::Value* args[1] = { w4_int_masks[0] }; std::array int4_masks; int4_masks[0] = builder().CreateCall(func, toArrayRef(args)); - args[0] = w4_float_masks[1]; + args[0] = w4_int_masks[1]; int4_masks[1] = builder().CreateCall(func, toArrayRef(args)); llvm::Value* bits4_7 = op_shl(int4_masks[1], constant(4)); @@ -3748,21 +3779,15 @@ LLVM_Util::mask_as_int(llvm::Value* mask) // do more than a simple cast to an int. // Convert <4 x i1> -> <4 x i32> - llvm::Value* wide_int_mask = builder().CreateSExt(mask, - type_wide_int()); + llvm::Value* w4_int_mask = builder().CreateSExt(mask, + type_wide_int()); // Now we will use the horizontal sign extraction intrinsic - // to build a 32 bit mask value. 
However the only 128bit - // version works on floats, so we will cast from int32 to - // float beforehand - llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); - llvm::Value* w4_float_mask = builder().CreateBitCast(wide_int_mask, - w4_float_type); - + // to build a 32 bit mask value. llvm::Function* func = llvm::Intrinsic::getDeclaration( - module(), llvm::Intrinsic::x86_sse_movmsk_ps); + module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); - llvm::Value* args[1] = { w4_float_mask }; + llvm::Value* args[1] = { w4_int_mask }; llvm::Value* int4_mask = builder().CreateCall(func, toArrayRef(args)); @@ -3797,13 +3822,28 @@ LLVM_Util::mask_as_int8(llvm::Value* mask) llvm::Value* LLVM_Util::mask4_as_int8(llvm::Value* mask) { - OSL_ASSERT(m_supports_llvm_bit_masks_natively); - // combine <4xi1> mask with <4xi1> zero init to get <8xi1> and cast it - // to i8 - llvm::Value* zero_mask4 - = llvm::ConstantDataVector::getSplat(4, constant_bool(false)); - return builder().CreateBitCast(op_combine_4x_vectors(mask, zero_mask4), - type_int8()); + if (m_supports_llvm_bit_masks_natively) { + // combine <4xi1> mask with <4xi1> zero init to get <8xi1> and cast it + // to i8 + llvm::Value* zero_mask4 + = llvm::ConstantDataVector::getSplat(4, constant_bool(false)); + return builder().CreateBitCast(op_combine_4x_vectors(mask, zero_mask4), + type_int8()); + } else { + // Convert <4 x i1> -> <4 x i32> + llvm::Value* w4_int_mask = builder().CreateSExt(mask, type_wide_int()); + + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. + llvm::Function* func = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); + + llvm::Value* args[1] = { w4_int_mask }; + llvm::Value* int32 = builder().CreateCall(func, toArrayRef(args)); + llvm::Value* i8 = builder().CreateIntCast(int32, type_int8(), true); + + return i8; + } } @@ -3828,14 +3868,19 @@ LLVM_Util::int_as_mask(llvm::Value* value) // and all types are happy intMaskType = type_int8(); break; + case 4: + // We can just reinterpret cast a 8 bit integer to a 4 bit mask + // and all types are happy + intMaskType = type_int8(); + break; default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; llvm::Value* intMask = builder().CreateTrunc(value, intMaskType); result = builder().CreateBitCast(intMask, type_wide_bool()); } else { - // Since we know vectorized comparisons for AVX&AVX2 end up setting - // 8 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to do more + // Since we know vectorized comparisons for SSE2&AVX&AVX2 end up setting + // 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to do more // than a simple cast to an int. 
// Broadcast out the int32 mask to all data lanes @@ -3950,23 +3995,25 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask) // and all types are happy intMaskType = type_int8(); break; -#if 0 // WIP - case 4: - { - // We can just reinterpret cast a 8 bit mask to a 8 bit integer - // and all types are happy - intMaskType = type_int8(); + case 4: { + intMaskType = type_int8(); -// extended_int_vector_type = (llvm::Type *) llvm::VectorType::get(llvm::Type::getInt32Ty (*m_llvm_context), m_vector_width); -// llvm::Value * wide_int_mask = builder().CreateSExt(mask, extended_int_vector_type); -// -// int_reinterpret_cast_vector_type = (llvm::Type *) llvm::Type::getInt128Ty (*m_llvm_context); -// zeroConstant = constant128(0); -// -// llvm::Value * mask_as_int = builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type); - break; - } -#endif + llvm::Value* mask_as_int = mask4_as_int8(mask); + + // Count trailing zeros, least significant + llvm::Type* types[] = { intMaskType }; + llvm::Function* func_cttz + = llvm::Intrinsic::getDeclaration(module(), llvm::Intrinsic::cttz, + toArrayRef(types)); + + llvm::Value* args[2] = { mask_as_int, constant_bool(true) }; + + llvm::Value* firstNonZeroIndex = builder().CreateCall(func_cttz, + toArrayRef(args)); + return firstNonZeroIndex; + + break; + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -4455,6 +4502,19 @@ LLVM_Util::op_linearize_8x_indices(llvm::Value* wide_index) } +llvm::Value* +LLVM_Util::op_linearize_4x_indices(llvm::Value* wide_index) +{ + llvm::Value* strided_indices = op_mul(wide_index, wide_constant(4, 4)); + llvm::Constant* offsets_to_lane[4] = { constant(0), constant(1), + constant(2), constant(3) }; + llvm::Value* const_vec_offsets = llvm::ConstantVector::get( + llvm::ArrayRef(&offsets_to_lane[0], 4)); + + return op_add(strided_indices, const_vec_offsets); +} + + std::array LLVM_Util::op_split_16x(llvm::Value* vector_val) { @@ -4613,6 +4673,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, module(), llvm::Intrinsic::x86_avx512_gather_dpi_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_gather_pi = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_gather3siv8_si); @@ -4663,6 +4724,16 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + llvm::Value* args[] = { avx2_unmasked_value, void_ptr(src_ptr), + wide_index, wide_int_mask, + constant4((uint8_t)4) }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_pi, + toArrayRef(args)); + return gather_result; + } + default: OSL_ASSERT(0 && "unsupported width"); }; } else { @@ -4680,6 +4751,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, module(), llvm::Intrinsic::x86_avx512_gather_dps_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_gather_ps = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_gather3siv8_sf); @@ -4739,6 +4811,17 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather; } + case 4: { + llvm::Value* args[] = { + avx2_unmasked_value, void_ptr(src_ptr), wide_index, + builder().CreateBitCast(wide_int_mask, + llvm_vector_type(type_float(), 4)), + constant4((uint8_t)4) + }; + llvm::Value* gather = builder().CreateCall(func_avx2_gather_ps, + toArrayRef(args)); + return gather; + } } } else { return clamped_gather_from_uniform(type_wide_float()); @@ -4805,6 +4888,29 
@@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, gather2), type_wide_ustring()); } + case 4: { + // Gather 64bit integer, as that is binary compatible with 64bit pointers of ustring + llvm::Function* func_avx512_gather_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv4_di); + OSL_ASSERT(func_avx512_gather_dpq); + + auto w4_bit_masks = current_mask(); + auto w4_int_indices = wide_index; + + llvm::Value* unmasked_value + = builder().CreateVectorSplat(4, constant64((uint64_t)0)); + llvm::Value* args[] + = { unmasked_value, void_ptr(src_ptr), w4_int_indices, + mask4_as_int8(w4_bit_masks), constant(4) }; + llvm::Value* gather1 + = builder().CreateCall(func_avx512_gather_dpq, + toArrayRef(args)); + args[2] = w4_int_indices; + args[3] = mask4_as_int8(w4_bit_masks); + + return builder().CreateIntToPtr(gather1, type_wide_ustring()); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); } } else { @@ -4841,6 +4947,20 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, return builder().CreateCall(func_avx512_gather_ps, toArrayRef(args)); } + case 4: { + llvm::Function* func_avx512_gather_ps + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv8_sf); + OSL_ASSERT(func_avx512_gather_ps); + + llvm::Value* unmasked_value = wide_constant(0.0f); + llvm::Value* args[] = { unmasked_value, void_ptr(src_ptr), + op_linearize_4x_indices(wide_index), + mask_as_int8(current_mask()), + constant(4) }; + return builder().CreateCall(func_avx512_gather_ps, + toArrayRef(args)); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -4889,6 +5009,19 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + auto int_indices = op_linearize_4x_indices(wide_index); + llvm::Value* args[] = { + avx2_unmasked_value, void_ptr(src_ptr), int_indices, + builder().CreateBitCast(wide_int_mask, + llvm_vector_type(type_float(), 4)), + constant8((uint8_t)4) + }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_ps, + toArrayRef(args)); + return gather_result; + } default: OSL_ASSERT(0 && "unsupported vector width for avx2 gather"); } @@ -4926,6 +5059,20 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, return builder().CreateCall(func_avx512_gather_pi, toArrayRef(args)); } + case 4: { + llvm::Function* func_avx512_gather_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv8_si); + OSL_ASSERT(func_avx512_gather_pi); + + llvm::Value* unmasked_value = wide_constant(0); + llvm::Value* args[] = { unmasked_value, void_ptr(src_ptr), + op_linearize_4x_indices(wide_index), + mask_as_int8(current_mask()), + constant(4) }; + return builder().CreateCall(func_avx512_gather_pi, + toArrayRef(args)); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); } } else if (m_supports_avx2) { @@ -4975,6 +5122,26 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + llvm::Function* func_avx2_gather_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx2_gather_d_d_256); + OSL_ASSERT(func_avx2_gather_pi); + + llvm::Constant* avx2_unmasked_value = wide_constant(8, 0); + + // Convert <16 x i1> -> <16 x i32> -> to <2 x< 8 x i32>> + llvm::Value* wide_int_mask + = builder().CreateSExt(current_mask(), type_wide_int()); + auto int_indices = op_linearize_4x_indices(wide_index); + 
llvm::Value* args[] = { avx2_unmasked_value, void_ptr(src_ptr), + int_indices, wide_int_mask, + constant8((uint8_t)4) }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_pi, + toArrayRef(args)); + return gather_result; + } default: OSL_ASSERT(0 && "unsupported vector width for avx2 gather"); } @@ -5017,7 +5184,8 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, gather2), type_wide_ustring()); } - case 8: { + case 8: + case 4: { // Gather 64bit integer, as that is binary compatible with 64bit pointers of ustring llvm::Function* func_avx512_gather_dpq = llvm::Intrinsic::getDeclaration( @@ -5093,6 +5261,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, linear_indices = op_linearize_16x_indices(wide_index); break; case 8: linear_indices = op_linearize_8x_indices(wide_index); break; + case 4: linear_indices = op_linearize_4x_indices(wide_index); break; default: OSL_ASSERT(0 && "unsupported vector width for scatter"); }; } else { @@ -5150,6 +5319,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, module(), llvm::Intrinsic::x86_avx512_scatter_dps_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_scatter_ps = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_scattersiv8_sf); @@ -5182,6 +5352,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, module(), llvm::Intrinsic::x86_avx512_scatter_dpi_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_scatter_pi = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_scattersiv8_si); @@ -5256,6 +5427,25 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); return; } + case 4: { + llvm::Value* linear_indices = wide_index; + + llvm::Function* func_avx512_scatter_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scatter_dpq_512); + OSL_ASSERT(func_avx512_scatter_dpq); + + llvm::Type* wide_address_int_type + = llvm_vector_type(type_addrint(), 4); + llvm::Value* address_int_val + = builder().CreatePtrToInt(wide_val, wide_address_int_type); + + llvm::Value* args[] + = { void_ptr(src_ptr), mask_as_int8(current_mask()), + linear_indices, address_int_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5295,6 +5485,19 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_ps, toArrayRef(args)); return; } + case 4: { + llvm::Function* func_avx512_scatter_ps + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scattersiv8_sf); + OSL_ASSERT(func_avx512_scatter_ps); + + llvm::Value* args[] = { void_ptr(src_ptr), + mask_as_int8(current_mask()), + op_linearize_4x_indices(wide_index), + wide_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_ps, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5338,6 +5541,19 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_pi, toArrayRef(args)); return; } + case 4: { + llvm::Function* func_avx512_scatter_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scattersiv8_si); + OSL_ASSERT(func_avx512_scatter_pi); + + llvm::Value* args[] = { void_ptr(src_ptr), + 
mask_as_int8(current_mask()), + op_linearize_4x_indices(wide_index), + wide_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_pi, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5407,6 +5623,26 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); return; } + case 4: { + llvm::Value* linear_indices = op_linearize_4x_indices( + wide_index); + + llvm::Function* func_avx512_scatter_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scatter_dpq_512); + OSL_ASSERT(func_avx512_scatter_dpq); + + llvm::Type* wide_address_int_type + = llvm_vector_type(type_addrint(), 4); + llvm::Value* address_int_val + = builder().CreatePtrToInt(wide_val, wide_address_int_type); + + llvm::Value* args[] + = { void_ptr(src_ptr), mask_as_int8(current_mask()), + linear_indices, address_int_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } diff --git a/src/liboslexec/rendservices.cpp b/src/liboslexec/rendservices.cpp index c0c84b03d..b3bd5c898 100644 --- a/src/liboslexec/rendservices.cpp +++ b/src/liboslexec/rendservices.cpp @@ -524,4 +524,11 @@ RendererServices::batched(WidthOf<8>) return nullptr; } +BatchedRendererServices<4>* +RendererServices::batched(WidthOf<4>) +{ + // No default implementation for batched services + return nullptr; +} + OSL_NAMESPACE_EXIT diff --git a/src/liboslexec/shadingsys.cpp b/src/liboslexec/shadingsys.cpp index 7efc5b13d..c0afc98f1 100644 --- a/src/liboslexec/shadingsys.cpp +++ b/src/liboslexec/shadingsys.cpp @@ -618,6 +618,29 @@ ShadingSystem::configure_batch_execution_at(int width) m_impl->attribute("llvm_jit_fma", 0); return true; } +# endif + if (target_requested) { + break; + } + // fallthrough + default: return false; + }; + return false; + case 4: + switch (requestedISA) { + case TargetISA::UNKNOWN: + // fallthrough + case TargetISA::x64: +# ifdef __OSL_SUPPORTS_b4_SSE2 + if (LLVM_Util::supports_isa(TargetISA::x64)) { + if (!target_requested) + m_impl->attribute("llvm_jit_target", + LLVM_Util::target_isa_name( + TargetISA::x64)); + // SSE2 doesn't support FMA + m_impl->attribute("llvm_jit_fma", 0); + return true; + } # endif if (target_requested) { break; @@ -885,6 +908,7 @@ ShadingSystem::BatchedExecutor::jit_all_groups(int nthreads) // Explicitly instantiate template class ShadingSystem::BatchedExecutor<16>; template class ShadingSystem::BatchedExecutor<8>; +template class ShadingSystem::BatchedExecutor<4>; #endif @@ -1079,7 +1103,8 @@ ShadingSystemImpl::ShadingSystemImpl(RendererServices* renderer, , m_opt_groupdata(true) #if OSL_USE_BATCHED , m_opt_batched_analysis((renderer->batched(WidthOf<16>()) != nullptr) - || (renderer->batched(WidthOf<8>()) != nullptr)) + || (renderer->batched(WidthOf<8>()) != nullptr) + || (renderer->batched(WidthOf<4>()) != nullptr)) #else , m_opt_batched_analysis(false) #endif @@ -3794,7 +3819,8 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx, // the batch jit has already happened, // as it requires the ops so we can't delete them yet! 
if (((renderer()->batched(WidthOf<16>()) == nullptr) - && (renderer()->batched(WidthOf<8>()) == nullptr)) + && (renderer()->batched(WidthOf<8>()) == nullptr) + && (renderer()->batched(WidthOf<4>()) == nullptr)) || group.batch_jitted()) { group_post_jit_cleanup(group); } @@ -4015,6 +4041,7 @@ ShadingSystemImpl::Batched::jit_all_groups(int nthreads, int mythread, // machine as well, start with just the batch size template class pvt::ShadingSystemImpl::Batched<16>; template class pvt::ShadingSystemImpl::Batched<8>; +template class pvt::ShadingSystemImpl::Batched<4>; #endif int diff --git a/src/testshade/batched_simplerend.cpp b/src/testshade/batched_simplerend.cpp index 937655af4..ea2acbdf9 100644 --- a/src/testshade/batched_simplerend.cpp +++ b/src/testshade/batched_simplerend.cpp @@ -1001,6 +1001,7 @@ BatchedSimpleRenderer::get_camera_screen_window(ustringhash /*object*/, // Explicitly instantiate BatchedSimpleRenderer template template class BatchedSimpleRenderer<16>; template class BatchedSimpleRenderer<8>; +template class BatchedSimpleRenderer<4>; OSL_NAMESPACE_EXIT diff --git a/src/testshade/simplerend.cpp b/src/testshade/simplerend.cpp index 65862c2db..3582c9cc4 100644 --- a/src/testshade/simplerend.cpp +++ b/src/testshade/simplerend.cpp @@ -218,7 +218,9 @@ register_closures(OSL::ShadingSystem* shadingsys) SimpleRenderer::SimpleRenderer() #if OSL_USE_BATCHED - : m_batch_16_simple_renderer(*this), m_batch_8_simple_renderer(*this) + : m_batch_16_simple_renderer(*this) + , m_batch_8_simple_renderer(*this) + , m_batch_4_simple_renderer(*this) #endif { Matrix44 M; diff --git a/src/testshade/simplerend.h b/src/testshade/simplerend.h index 87d0b96dd..8ebe1c1fc 100644 --- a/src/testshade/simplerend.h +++ b/src/testshade/simplerend.h @@ -177,12 +177,17 @@ class SimpleRenderer : public RendererServices { { return &m_batch_8_simple_renderer; } + BatchedRendererServices<4>* batched(WidthOf<4>) override + { + return &m_batch_4_simple_renderer; + } #endif protected: #if OSL_USE_BATCHED BatchedSimpleRenderer<16> m_batch_16_simple_renderer; BatchedSimpleRenderer<8> m_batch_8_simple_renderer; + BatchedSimpleRenderer<4> m_batch_4_simple_renderer; #endif // Camera parameters diff --git a/src/testshade/testshade.cpp b/src/testshade/testshade.cpp index db5bac164..d8b6f749c 100644 --- a/src/testshade/testshade.cpp +++ b/src/testshade/testshade.cpp @@ -306,6 +306,9 @@ set_shadingsys_options() } else if ((!batch_size_requested || batch_size == 8) && shadingsys->configure_batch_execution_at(8)) { batch_size = 8; + } else if ((!batch_size_requested || batch_size == 4) + && shadingsys->configure_batch_execution_at(4)) { + batch_size = 4; } else { OSL::print( "WARNING: Hardware or library requirements to utilize batched execution"); @@ -1194,9 +1197,11 @@ setup_output_images(SimpleRenderer* rend, ShadingSystem* shadingsys, // jit_group will optimize the group if necesssary if (batch_size == 16) { shadingsys->batched<16>().jit_group(shadergroup.get(), ctx); - } else { - ASSERT((batch_size == 8) && "Unsupported batch size"); + } else if (batch_size == 8) { shadingsys->batched<8>().jit_group(shadergroup.get(), ctx); + } else { + ASSERT((batch_size == 4) && "Unsupported batch size"); + shadingsys->batched<4>().jit_group(shadergroup.get(), ctx); } } else #endif @@ -2195,13 +2200,19 @@ test_shade(int argc, const char* argv[]) batched_shade_region<16>(rend, shadergroup.get(), sub_roi, save); }); - } else { - ASSERT((batch_size == 8) && "Unsupported batch size"); + } else if (batch_size == 8) { 
OIIO::ImageBufAlgo::parallel_image( roi, num_threads, [&](OIIO::ROI sub_roi) -> void { batched_shade_region<8>(rend, shadergroup.get(), sub_roi, save); }); + } else { + ASSERT((batch_size == 4) && "Unsupported batch size"); + OIIO::ImageBufAlgo::parallel_image( + roi, num_threads, [&](OIIO::ROI sub_roi) -> void { + batched_shade_region<4>(rend, shadergroup.get(), + sub_roi, save); + }); } } else # endif diff --git a/testsuite/example-batched-deformer/oslbatcheddeformer.cpp b/testsuite/example-batched-deformer/oslbatcheddeformer.cpp index 0b7af16e4..449f06f59 100644 --- a/testsuite/example-batched-deformer/oslbatcheddeformer.cpp +++ b/testsuite/example-batched-deformer/oslbatcheddeformer.cpp @@ -182,10 +182,15 @@ class MyRendererServices final : public OSL::RendererServices { { return &m_batch_8_rs; } + OSL::BatchedRendererServices<4>* batched(OSL::WidthOf<4>) override + { + return &m_batch_4_rs; + } private: MyBatchedRendererServices<16> m_batch_16_rs; MyBatchedRendererServices<8> m_batch_8_rs; + MyBatchedRendererServices<4> m_batch_4_rs; }; @@ -232,11 +237,13 @@ main(int argc, char* argv[]) batch_width = 16; } else if (shadsys->configure_batch_execution_at(8)) { batch_width = 8; + } else if (shadsys->configure_batch_execution_at(4)) { + batch_width = 4; } else { std::cout - << "Error: Hardware doesn't support 8 or 16 wide SIMD or the OSL has not been configured and built with a proper USE_BATCHED." + << "Error: Hardware doesn't support 4, 8 or 16 wide SIMD or the OSL has not been configured and built with a proper USE_BATCHED." << std::endl; - std::cout << "Error: e.g.: USE_BATCHED=b8_AVX2,b8_AVX512,b16_AVX512" + std::cout << "Error: e.g.: USE_BATCHED=b4_SSE2,b8_AVX2,b8_AVX512,b16_AVX512" << std::endl; return -1; } @@ -432,8 +439,11 @@ main(int argc, char* argv[]) if (batch_width == 16) { batched_shadepoints(std::integral_constant {}); - } else { + } + else if (batch_width == 8) { batched_shadepoints(std::integral_constant {}); + } else { + batched_shadepoints(std::integral_constant {}); } // Print some results to prove that we generated an expected Pout. From 9e5c6749f0faed043a912485ad68fea890a3bea6 Mon Sep 17 00:00:00 2001 From: Tuomas Tonteri Date: Fri, 16 Aug 2024 13:24:36 +0300 Subject: [PATCH 2/3] Fix regression in batched SSE2 patch. Signed-off-by: Tuomas Tonteri --- src/liboslexec/llvm_util.cpp | 75 ++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 16 deletions(-) diff --git a/src/liboslexec/llvm_util.cpp b/src/liboslexec/llvm_util.cpp index 2a434a0b8..400cef535 100644 --- a/src/liboslexec/llvm_util.cpp +++ b/src/liboslexec/llvm_util.cpp @@ -3698,12 +3698,21 @@ LLVM_Util::mask_as_int(llvm::Value* mask) // Convert <4 x i1> -> <4 x i32> llvm::Value* w4_int_mask = builder().CreateSExt(mask, type_wide_int()); + + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. However the only 256bit + // version works on floats, so we will cast from int32 to + // float beforehand + llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); + llvm::Value* w4_float_mask = builder().CreateBitCast(w4_int_mask, + w4_float_type); + // Now we will use the horizontal sign extraction intrinsic // to build a 32 bit mask value. 
llvm::Function* func = llvm::Intrinsic::getDeclaration( - module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); + module(), llvm::Intrinsic::x86_sse_movmsk_ps); - llvm::Value* args[1] = { w4_int_mask }; + llvm::Value* args[1] = { w4_float_mask }; llvm::Value* int8_mask; int8_mask = builder().CreateCall(func, toArrayRef(args)); return int8_mask; @@ -3727,18 +3736,28 @@ LLVM_Util::mask_as_int(llvm::Value* mask) auto w4_int_masks = op_quarter_16x(wide_int_mask); // Now we will use the horizontal sign extraction intrinsic - // to build a 32 bit mask value. + // to build a 32 bit mask value. However the only 128bit + // version works on floats, so we will cast from int32 to + // float beforehand + llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); + std::array w4_float_masks = { + { builder().CreateBitCast(w4_int_masks[0], w4_float_type), + builder().CreateBitCast(w4_int_masks[1], w4_float_type), + builder().CreateBitCast(w4_int_masks[2], w4_float_type), + builder().CreateBitCast(w4_int_masks[3], w4_float_type) } + }; + llvm::Function* func = llvm::Intrinsic::getDeclaration( - module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); + module(), llvm::Intrinsic::x86_sse_movmsk_ps); - llvm::Value* args[1] = { w4_int_masks[0] }; + llvm::Value* args[1] = { w4_float_masks[0] }; std::array int4_masks; int4_masks[0] = builder().CreateCall(func, toArrayRef(args)); - args[0] = w4_int_masks[1]; + args[0] = w4_float_masks[1]; int4_masks[1] = builder().CreateCall(func, toArrayRef(args)); - args[0] = w4_int_masks[2]; + args[0] = w4_float_masks[2]; int4_masks[2] = builder().CreateCall(func, toArrayRef(args)); - args[0] = w4_int_masks[3]; + args[0] = w4_float_masks[3]; int4_masks[3] = builder().CreateCall(func, toArrayRef(args)); llvm::Value* bits12_15 = op_shl(int4_masks[3], constant(12)); @@ -3759,14 +3778,22 @@ LLVM_Util::mask_as_int(llvm::Value* mask) auto w4_int_masks = op_split_8x(wide_int_mask); // Now we will use the horizontal sign extraction intrinsic - // to build a 32 bit mask value. + // to build a 32 bit mask value. However the only 128bit + // version works on floats, so we will cast from int32 to + // float beforehand + llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); + std::array w4_float_masks = { + { builder().CreateBitCast(w4_int_masks[0], w4_float_type), + builder().CreateBitCast(w4_int_masks[1], w4_float_type) } + }; + llvm::Function* func = llvm::Intrinsic::getDeclaration( - module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); + module(), llvm::Intrinsic::x86_sse_movmsk_ps); - llvm::Value* args[1] = { w4_int_masks[0] }; + llvm::Value* args[1] = { w4_float_masks[0] }; std::array int4_masks; int4_masks[0] = builder().CreateCall(func, toArrayRef(args)); - args[0] = w4_int_masks[1]; + args[0] = w4_float_masks[1]; int4_masks[1] = builder().CreateCall(func, toArrayRef(args)); llvm::Value* bits4_7 = op_shl(int4_masks[1], constant(4)); @@ -3782,12 +3809,20 @@ LLVM_Util::mask_as_int(llvm::Value* mask) llvm::Value* w4_int_mask = builder().CreateSExt(mask, type_wide_int()); + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. However the only 256bit + // version works on floats, so we will cast from int32 to + // float beforehand + llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); + llvm::Value* w4_float_mask = builder().CreateBitCast(w4_int_mask, + w4_float_type); + // Now we will use the horizontal sign extraction intrinsic // to build a 32 bit mask value. 
llvm::Function* func = llvm::Intrinsic::getDeclaration( - module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); + module(), llvm::Intrinsic::x86_sse_movmsk_ps); - llvm::Value* args[1] = { w4_int_mask }; + llvm::Value* args[1] = { w4_float_mask }; llvm::Value* int4_mask = builder().CreateCall(func, toArrayRef(args)); @@ -3833,12 +3868,20 @@ LLVM_Util::mask4_as_int8(llvm::Value* mask) // Convert <4 x i1> -> <4 x i32> llvm::Value* w4_int_mask = builder().CreateSExt(mask, type_wide_int()); + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. However the only 256bit + // version works on floats, so we will cast from int32 to + // float beforehand + llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); + llvm::Value* w4_float_mask = builder().CreateBitCast(w4_int_mask, + w4_float_type); + // Now we will use the horizontal sign extraction intrinsic // to build a 32 bit mask value. llvm::Function* func = llvm::Intrinsic::getDeclaration( - module(), llvm::Intrinsic::x86_sse2_pmovmskb_128); + module(), llvm::Intrinsic::x86_sse_movmsk_ps); - llvm::Value* args[1] = { w4_int_mask }; + llvm::Value* args[1] = { w4_float_mask }; llvm::Value* int32 = builder().CreateCall(func, toArrayRef(args)); llvm::Value* i8 = builder().CreateIntCast(int32, type_int8(), true); From c6d796abdb0c351258b5e566349d93815b1e0448 Mon Sep 17 00:00:00 2001 From: Tuomas Tonteri Date: Sat, 17 Aug 2024 12:14:52 +0300 Subject: [PATCH 3/3] Fix unnecessary code repetition in SSE2 patch. Signed-off-by: Tuomas Tonteri --- src/liboslexec/llvm_util.cpp | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/src/liboslexec/llvm_util.cpp b/src/liboslexec/llvm_util.cpp index 400cef535..de41e217f 100644 --- a/src/liboslexec/llvm_util.cpp +++ b/src/liboslexec/llvm_util.cpp @@ -4027,34 +4027,23 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask) // Assumes mask is not empty llvm::Type* intMaskType = nullptr; + llvm::Value* int_mask = nullptr; switch (m_vector_width) { case 16: // We can just reinterpret cast a 16 bit mask to a 16 bit integer // and all types are happy intMaskType = type_int16(); + int_mask = builder().CreateBitCast(mask, intMaskType); break; case 8: // We can just reinterpret cast a 8 bit mask to a 8 bit integer // and all types are happy intMaskType = type_int8(); + int_mask = builder().CreateBitCast(mask, intMaskType); break; case 4: { intMaskType = type_int8(); - - llvm::Value* mask_as_int = mask4_as_int8(mask); - - // Count trailing zeros, least significant - llvm::Type* types[] = { intMaskType }; - llvm::Function* func_cttz - = llvm::Intrinsic::getDeclaration(module(), llvm::Intrinsic::cttz, - toArrayRef(types)); - - llvm::Value* args[2] = { mask_as_int, constant_bool(true) }; - - llvm::Value* firstNonZeroIndex = builder().CreateCall(func_cttz, - toArrayRef(args)); - return firstNonZeroIndex; - + int_mask = mask4_as_int8(mask); break; } default: OSL_ASSERT(0 && "unsupported native bit mask width"); @@ -4066,8 +4055,7 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask) = llvm::Intrinsic::getDeclaration(module(), llvm::Intrinsic::cttz, toArrayRef(types)); - llvm::Value* int_mask = builder().CreateBitCast(mask, intMaskType); - llvm::Value* args[2] = { int_mask, constant_bool(true) }; + llvm::Value* args[2] = { int_mask, constant_bool(true) }; llvm::Value* firstNonZeroIndex = builder().CreateCall(func_cttz, toArrayRef(args));
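For reference, this is how a host renderer picks up the new 4-wide mode once OSL is built with USE_BATCHED=b4_SSE2 -- the same pattern patch 1/3 applies to testshade and the batched-deformer testsuite example. A minimal sketch only: MyBatchedRendererServices<W> stands in for the renderer's own BatchedRendererServices subclass (as in oslbatcheddeformer.cpp), and shadsys is assumed to be the renderer's OSL::ShadingSystem instance.

    // Sketch mirroring testsuite/example-batched-deformer/oslbatcheddeformer.cpp.
    // MyBatchedRendererServices<W> is the renderer's own subclass of
    // OSL::BatchedRendererServices<W> (an assumption here, not provided by OSL).
    class MyRendererServices final : public OSL::RendererServices {
    public:
        OSL::BatchedRendererServices<4>* batched(OSL::WidthOf<4>) override
        {
            // Returning a non-null 4-wide service enables SSE2 batched shading.
            return &m_batch_4_rs;
        }
    private:
        MyBatchedRendererServices<4> m_batch_4_rs;
    };

    // At startup, prefer the widest batch the hardware and build support,
    // falling back to 4-wide SSE2 before giving up on batched execution.
    int batch_width = -1;
    if (shadsys->configure_batch_execution_at(16))
        batch_width = 16;
    else if (shadsys->configure_batch_execution_at(8))
        batch_width = 8;
    else if (shadsys->configure_batch_execution_at(4))
        batch_width = 4;
    // otherwise: batched execution unavailable, use the scalar path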